zoukankan      html  css  js  c++  java
  • OpenACC 绘制曼德勃罗集

    ▶ 书上第四章,用一系列步骤优化曼德勃罗集的计算过程。

    ● 代码

     1 // constants.h
     2 const unsigned int WIDTH=16384;
     3 const unsigned int HEIGHT=16384;
     4 const unsigned int MAX_ITERS=50;
     5 const unsigned int MAX_COLOR=255;
     6 const double xmin=-1.7;
     7 const double xmax=.5;
     8 const double ymin=-1.2;
     9 const double ymax=1.2;
    10 const double dx = (xmax - xmin) / WIDTH;
    11 const double dy = (ymax - ymin) / HEIGHT;
    1 // mandelbrot.h
    2 #pragma acc routine seq
    3 unsigned char mandelbrot(int Px, int Py);
     1 // mandelbrot.cpp
     2 #include <cstdio>
     3 #include <cstdlib>
     4 #include <fstream>
     5 #include "mandelbrot.h"
     6 #include "constants.h"
     7 
     8 using namespace std;
     9 
    10 unsigned char mandelbrot(int Px, int Py)
    11 {
    12     const double x0 = xmin + Px * dx, y0 = ymin + Py * dy;
    13     double x = 0.0, y = 0.0;
    14     int i;
    15     for(i=0; x * x + y * y < 4.0 && i < MAX_ITERS; i++)
    16     {
    17         double xtemp = x * x - y * y + x0;
    18         y = 2 * x * y + y0;
    19         x = xtemp;
    20     }
    21     return (double)MAX_COLOR * i / MAX_ITERS;
    22 }
     1 // main.cpp
     2 #include <cstdio>
     3 #include <cstdlib>
     4 #include <fstream>
     5 #include <cstring>
     6 #include <omp.h>
     7 #include <openacc.h>
     8 
     9 #include "mandelbrot.h"
    10 #include "constants.h"
    11 
    12 using namespace std;
    13 
    14 int main()
    15 {
    16     unsigned char *image = (unsigned char*)malloc(sizeof(unsigned int) * WIDTH * HEIGHT);
    17     FILE *fp=fopen("image.pgm","wb");
    18     fprintf(fp,"P5
    "#comment"
    %d %d
    %d
    ",WIDTH, HEIGHT, MAX_COLOR);
    19     
    20     acc_init(acc_device_nvidia);
    21 #pragma acc parallel num_gangs(1)
    22     {
    23         image[0] = 0;
    24     }        
    25     double st = omp_get_wtime();
    26 #pragma acc parallel loop
    27     for(int y = 0; y < HEIGHT; y++)
    28     {
    29         for(int x = 0; x < WIDTH; x++)
    30             image[y * WIDTH + x] = mandelbrot(x, y);
    31     }  
    32     double et = omp_get_wtime();
    33     printf("Time: %lf seconds.
    ", (et-st));
    34     fwrite(image,sizeof(unsigned char),WIDTH * HEIGHT, fp);
    35     fclose(fp);
    36     free(image);
    37     return 0;
    38 }

    ● 输出结果

    // Ubuntu:
    cuan@CUAN:/media/cuan/02FCDA52FCDA4019/Code/ParallelProgrammingWithOpenACC-master/Chapter04/cpp$ pgc++ -std=c++11 -acc -mp -fast -Minfo -c mandelbrot.cpp
    mandelbrot(int, int):
          9, Generating acc routine seq
             Generating Tesla code
         10, FMA (fused multiply-add) instruction(s) generated
         15, Loop not vectorized/parallelized: potential early exits
         16, FMA (fused multiply-add) instruction(s) generated
    cuan@CUAN:/media/cuan/02FCDA52FCDA4019/Code/ParallelProgrammingWithOpenACC-master/Chapter04/cpp$ pgc++ -std=c++11 -acc -mp -fast -Minfo main.cpp mandelbrot.o -o acc1.exe
    main.cpp:
    main:
         24, Accelerator kernel generated
             Generating Tesla code
             Generating implicit copyout(image[0])
         27, Accelerator kernel generated
             Generating Tesla code
             30, #pragma acc loop gang /* blockIdx.x */
             31, #pragma acc loop vector(128) /* threadIdx.x */
         27, Generating implicit copy(image[:268435456])
         31, Loop is parallelizable
             Loop not vectorized/parallelized: contains call
    cuan@CUAN:/media/cuan/02FCDA52FCDA4019/Code/ParallelProgrammingWithOpenACC-master/Chapter04/cpp$ ./acc1.exe
    Time: 0.646578 seconds.

    ● 优化 03,变化仅在 main.cpp 中

     1 // main.cpp
     2 #include <cstdio>
     3 #include <cstdlib>
     4 #include <fstream>
     5 #include <cstring>
     6 #include <omp.h>
     7 #include <openacc.h>
     8 #include "mandelbrot.h"
     9 #include "constants.h"
    10 
    11 using namespace std;
    12 
    13 int main()
    14 {
    15     const int num_blocks = 16, block_size = HEIGHT / num_blocks * WIDTH;
    16     unsigned char *image=(unsigned char*)malloc(sizeof(unsigned int) * WIDTH * HEIGHT);
    17     FILE *fp=fopen("image.pgm","wb");
    18     fprintf(fp,"P5
    "#comment"
    %d %d
    %d
    ",WIDTH, HEIGHT, MAX_COLOR);
    19 
    20     acc_init(acc_device_nvidia);
    21 #pragma acc parallel num_gangs(1)
    22     {
    23         image[0] = 0;
    24     }
    25     double st = omp_get_wtime();
    26 #pragma acc data create(image[WIDTH*HEIGHT])
    27     {
    28         for(int block = 0; block < num_blocks; block++)
    29         {
    30             const int start = block * (HEIGHT/num_blocks), end   = start + (HEIGHT/num_blocks);
    31 #pragma acc parallel loop async(block)
    32             for(int y=start;y<end;y++)
    33             {
    34                 for(int x=0;x<WIDTH;x++)
    35                     image[y*WIDTH+x]=mandelbrot(x,y);
    36             }
    37 #pragma acc update self(image[block*block_size:block_size]) async(block)
    38         }
    39     }
    40 #pragma acc wait
    41   
    42     double et = omp_get_wtime();
    43     printf("Time: %lf seconds.
    ", (et-st));
    44     fwrite(image,sizeof(unsigned char), WIDTH * HEIGHT, fp);
    45     fclose(fp);
    46     free(image);
    47     return 0;
    48 }

    ● 输出结果

    // Ubuntu:
    cuan@CUAN:/media/cuan/02FCDA52FCDA4019/Code/ParallelProgrammingWithOpenACC-master/Chapter04/cpp/task3$ pgc++ -std=c++11 -acc -mp -fast -Minfo -c mandelbrot.cpp
    mandelbrot(int, int):
         11, Generating acc routine seq
             Generating Tesla code
         12, FMA (fused multiply-add) instruction(s) generated
         15, Loop not vectorized/parallelized: potential early exits
         17, FMA (fused multiply-add) instruction(s) generated
    cuan@CUAN:/media/cuan/02FCDA52FCDA4019/Code/ParallelProgrammingWithOpenACC-master/Chapter04/cpp/task3$ pgc++ -std=c++11 -acc -mp -fast -Minfo main.cpp mandelbrot.o -o acc2.exe
    main.cpp:
    main:
         22, Accelerator kernel generated
             Generating Tesla code
             Generating implicit copyout(image[0])
         27, Generating create(image[:268435456])
         30, Accelerator kernel generated
             Generating Tesla code
             32, #pragma acc loop gang /* blockIdx.x */
             34, #pragma acc loop vector(128) /* threadIdx.x */
         34, Loop is parallelizable
             Loop not vectorized/parallelized: contains call
         38, Generating update self(image[block*16777216:16777216])
    cuan@CUAN:/media/cuan/02FCDA52FCDA4019/Code/ParallelProgrammingWithOpenACC-master/Chapter04/cpp/task3$ ./acc2.exe
    Time: 0.577263 seconds.

    ● 优化 05,添加异步计算

     1 // main.cpp
     2 #include <cstdio>
     3 #include <cstdlib>
     4 #include <fstream>
     5 #include <cstring>
     6 #include <omp.h>
     7 #include <openacc.h>
     8 #include "mandelbrot.h"
     9 #include "constants.h"
    10 
    11 using namespace std;
    12 
    13 int main()
    14 {
    15   const int num_blocks=64, block_size = HEIGHT / num_blocks * WIDTH;  
    16   unsigned char *image=(unsigned char*)malloc(sizeof(unsigned int) * WIDTH * HEIGHT);
    17   FILE *fp = fopen("image.pgm", "wb");
    18   fprintf(fp,"P5
    "#comment"
    %d %d
    %d
    ",WIDTH, HEIGHT, MAX_COLOR);
    19 
    20   const int num_gpus = acc_get_num_devices(acc_device_nvidia);
    21 
    22 #pragma omp parallel num_threads(num_gpus)
    23     {
    24         acc_init(acc_device_nvidia);
    25         acc_set_device_num(omp_get_thread_num(),acc_device_nvidia);
    26     }
    27         printf("Found %d NVIDIA GPUs.
    ", num_gpus);
    28 
    29     double st = omp_get_wtime();
    30 #pragma omp parallel num_threads(num_gpus)
    31     {
    32         int queue = 1;
    33         int my_gpu = omp_get_thread_num();
    34         acc_set_device_num(my_gpu,acc_device_nvidia);
    35         printf("Thread %d is using GPU %d
    ", my_gpu, acc_get_device_num(acc_device_nvidia));
    36 #pragma acc data create(image[WIDTH*HEIGHT])
    37         {
    38 #pragma omp for schedule(static, 1)
    39             for(int block = 0; block < num_blocks; block++) 
    40             {
    41                 const int start = block * (HEIGHT/num_blocks), end   = start + (HEIGHT/num_blocks);
    42 #pragma acc parallel loop async(queue)
    43                 for(int y=start;y<end;y++)
    44                 {
    45                     for(int x=0;x<WIDTH;x++)
    46                         image[y*WIDTH+x]=mandelbrot(x,y);
    47                 }
    48 
    49 #pragma acc update self(image[block*block_size:block_size]) async(queue)
    50         queue = (queue + 1) % 2; 
    51             }
    52         }
    53 #pragma acc wait
    54     } 
    55   
    56     double et = omp_get_wtime();
    57     printf("Time: %lf seconds.
    ", (et-st));
    58     fwrite(image,sizeof(unsigned char), WIDTH * HEIGHT, fp);
    59     fclose(fp);
    60     free(image);
    61     return 0;
    62 }

    ● 输出结果

    // Ubuntu:
    cuan@CUAN:/media/cuan/02FCDA52FCDA4019/Code/ParallelProgrammingWithOpenACC-master/Chapter04/cpp/task5.multithread$ pgc++ -std=c++11 -acc -mp -fast -Minfo -c mandelbrot.cpp
    mandelbrot(int, int):
         11, Generating acc routine seq
             Generating Tesla code
         12, FMA (fused multiply-add) instruction(s) generated
         15, Loop not vectorized/parallelized: potential early exits
         17, FMA (fused multiply-add) instruction(s) generated
    cuan@CUAN:/media/cuan/02FCDA52FCDA4019/Code/ParallelProgrammingWithOpenACC-master/Chapter04/cpp/task5.multithread$ pgc++ -std=c++11 -acc -mp -fast -Minfo main.cpp mandelbrot.o -o acc3.exe
    main.cpp:
    main:
         23, Parallel region activated
         26, Parallel region terminated
         31, Parallel region activated
         37, Generating create(image[:268435456])
         39, Parallel loop activated with static cyclic schedule
         41, Accelerator kernel generated
             Generating Tesla code
             43, #pragma acc loop gang /* blockIdx.x */
             45, #pragma acc loop vector(128) /* threadIdx.x */
         45, Loop is parallelizable
             Loop not vectorized/parallelized: contains call
         50, Generating update self(image[block*4194304:4194304])
         51, Barrier
         54, Parallel region terminated
    cuan@CUAN:/media/cuan/02FCDA52FCDA4019/Code/ParallelProgrammingWithOpenACC-master/Chapter04/cpp/task5.multithread$ ./acc3.exe
    Found 1 NVIDIA GPUs.
    Thread 0 is using GPU 0
    Time: 0.497450 seconds.

    ● nvprof 的结果汇总,三张图分别为 “并行和数据优化”,“优化 03(分块分流)” 和 “优化 05(分块调度)”

  • 相关阅读:
    ElasticSearch关闭重启命令
    解决使用驱动器中的光盘之前需要将其格式化
    mac利用Synergy操作多台电脑
    一次真实的蓝屏分析 ntkrnlmp.exe
    JS字符串false转boolean
    启明星会议室预定系统更新日志-通用版
    利用Visual Studio 2013 开发微软云Windows Azure配置指南(针对中国大陆)
    利用Bootstrap+Avalonjs+EntityFramework 开发ASP.NET WebForm应用程序(上)
    启明星会议室系统与Office365集成说明
    jQuery中attr和prop方法的区别说明
  • 原文地址:https://www.cnblogs.com/cuancuancuanhao/p/9494516.html
Copyright © 2011-2022 走看看