  • Optimizing Matrix Multiplication with OpenACC

    ▶ Following the steps in the book, optimize matrix multiplication with different OpenACC directives

    ● The optimized code

      1 #include <iostream>
      2 #include <cstdlib>
      3 #include <chrono>
      4 
      5 #define SIZE 1024
      6 
      7 using namespace std;
      8 using namespace std::chrono;
      9 
      10 double a[SIZE][SIZE], b[SIZE][SIZE], c[SIZE][SIZE], d[SIZE][SIZE]; // declaring these four arrays inside main overflows the stack: Segmentation fault (core dumped)
     11 
     12 int main()
     13 {
      14     //int i, j, k;                          // declaring i, j, k and tmp only at their point of use inside the loops made the runtime longer
     15     double tmp;
     16 
     17 #pragma acc enter data create(a, b, c)
     18 #pragma acc kernels present(a, b, c)
     19     {
      20         for (int i = 0; i < SIZE; i++)      // initialize a
     21         {
     22             for (int j = 0; j < SIZE; j++)
     23                 a[i][j] = (double)(i + j);
     24         }
      25         for (int i = 0; i < SIZE; i++)      // initialize b
     26         {
     27             for (int j = 0; j < SIZE; j++)
     28                 b[i][j] = (double)(i - j);
     29         }
      30         for (int i = 0; i < SIZE; i++)      // c must be cleared again before each method
     31         {
     32             for (int j = 0; j < SIZE; j++)
     33                 c[i][j] = 0.0;
     34         }
     35     }
     36 
     37     high_resolution_clock::time_point t1 = high_resolution_clock::now();
     38 
      39 #pragma acc kernels present(a, b, c)       // Method 1: auto on every loop level
     40     {        
     41 #pragma acc loop auto                       
     42         for (int i = 0; i < SIZE; i++)
     43         {
     44 #pragma acc loop auto
     45             for (int j = 0; j < SIZE; j++)
     46             {
     47 #pragma acc loop auto
     48                 for (int k = 0; k < SIZE; k++)
     49                     c[i][j] += a[i][k] * b[k][j];
     50             }
     51         }
     52     }
     53 
     54     high_resolution_clock::time_point t2 = high_resolution_clock::now();
     55     duration<double> time = duration_cast<duration<double>>(t2 - t1);    
      56     printf("Time OpenACC - Auto: %.6lf s.\n\n", time.count());
     57 
     58 #pragma acc kernels present(c)
     59     for (int i = 0; i < SIZE; i++)
     60     {
     61         for (int j = 0; j < SIZE; j++)
     62             c[i][j] = 0.0;
     63     }
     64 
     65     t1 = high_resolution_clock::now();
     66 
      67 #pragma acc kernels present(a, b, c)        // Method 2: independent forced on all three loop levels
     68     {
     69 #pragma acc loop independent 
     70         for (int i = 0; i < SIZE; i++)
     71         {
     72 #pragma acc loop independent
     73             for (int j = 0; j < SIZE; j++)
     74             {
     75 #pragma acc loop independent
     76                 for (int k = 0; k < SIZE; k++)
     77                     c[i][j] += a[i][k] * b[k][j];
     78             }
     79         }
     80     }
     81 
     82     t2 = high_resolution_clock::now();
     83     time = duration_cast<duration<double>>(t2 - t1);
      84     printf("Time OpenACC - Independent Seq: %.6lf s.\n\n", time.count());
     85 
     86 #pragma acc kernels present(c)
     87     for (int i = 0; i < SIZE; i++)
     88     {
     89         for (int j = 0; j < SIZE; j++)
     90             c[i][j] = 0.0;
     91     }
     92         
     93     t1 = high_resolution_clock::now();
     94 
      95 #pragma acc kernels present(a, b, c)        // Method 3: independent on the outer two levels, reduction on the innermost
     96     {
     97 #pragma acc loop independent
     98         for (int i = 0; i < SIZE; i++)
     99         {
    100 #pragma acc loop independent
    101             for (int j = 0; j < SIZE; j++)
    102             {
     103                 tmp = 0.0;
    104 #pragma acc loop reduction(+: tmp)
    105                 for (int k = 0; k < SIZE; k++)
    106                     tmp += a[i][k] * b[k][j];
    107                 c[i][j] = tmp;
    108             }
    109         }
    110     }
    111 
    112     t2 = high_resolution_clock::now();
    113     time = duration_cast<duration<double>>(t2 - t1);
     114     printf("Time OpenACC - Independent Reduction: %.6lf s.\n\n", time.count());
    115 
    116 #pragma acc kernels present(c)
    117     for (int i = 0; i < SIZE; i++)
    118     {
    119         for (int j = 0; j < SIZE; j++)
    120             c[i][j] = 0.0;
    121     }
    122     
    123     t1 = high_resolution_clock::now();
    124 
     125 #pragma acc kernels present(a, b, c)        // Method 4: manually specify gang and vector sizes
    126     {
    127 #pragma acc loop gang(32)
    128         for (int i = 0; i < SIZE; i++)
    129         {
    130 #pragma acc loop vector(16)
    131             for (int j = 0; j < SIZE; j++)
    132             {
     133                 tmp = 0.0;
    134 #pragma acc loop reduction(+: tmp)
    135                 for (int k = 0; k < SIZE; k++)
    136                     tmp += a[i][k] * b[k][j];
    137                 c[i][j] = tmp;
    138             }
    139         }
    140     }
    141 
    142     t2 = high_resolution_clock::now();
    143     time = duration_cast<duration<double>>(t2 - t1);
     144     printf("Time OpenACC - Gang Vector: %.6lf s.\n\n", time.count());
    145 
    146 #pragma acc kernels present(c)
    147     for (int i = 0; i < SIZE; i++)
    148     {
    149         for (int j = 0; j < SIZE; j++)
    150             c[i][j] = 0.0;
    151     }
    152 
    153     t1 = high_resolution_clock::now();
    154 
     155 #pragma acc kernels present(a, b, c)        // Method 5: tiling (blocked loop reordering)
    156     {
    157 #pragma acc loop tile(32, 32) 
    158         for (int i = 0; i < SIZE; i++)
    159         {
    160             for (int j = 0; j < SIZE; j++)
    161             {
     162                 tmp = 0.0;
     163 #pragma acc loop reduction(+: tmp)
     164
    165                 for (int k = 0; k < SIZE; ++k)
    166                     tmp += a[i][k] * b[k][j];
    167                 c[i][j] = tmp;
    168             }
    169         }
    170     }
    171 
    172     t2 = high_resolution_clock::now();
    173     time = duration_cast<duration<double>>(t2 - t1);
     174     printf("Time OpenACC - tile: %.6lf s.\n\n", time.count());
    175 
    176 #pragma acc kernels present(c)
    177     for (int i = 0; i < SIZE; i++)
    178     {
    179         for (int j = 0; j < SIZE; j++)
    180             c[i][j] = 0.0;
    181     }
    182 
    183     t1 = high_resolution_clock::now();
    184 
     185 #pragma acc kernels present(a, b, c)        // Method 6: collapse the two outer loops
    186     {
    187 #pragma acc loop collapse(2) independent
    188         for (int i = 0; i < SIZE; i++)
    189         {
    190             for (int j = 0; j < SIZE; j++)
    191             {
     192                 tmp = 0.0;
    193 #pragma acc loop reduction(+: tmp)
    194                 for (int k = 0; k < SIZE; k++)
    195                     tmp += a[i][k] * b[k][j];
    196                 c[i][j] = tmp;
    197             }
    198         }
    199     }
    200 
    201     t2 = high_resolution_clock::now();
    202     time = duration_cast<duration<double>>(t2 - t1);
     203     printf("Time OpenACC - Collapse: %.6lf s.\n\n", time.count());
    204 
    205 #pragma acc exit data copyout(a, b, c)
    206 
    207 #pragma omp parallel for shared(d)
    208     for (int i = 0; i < SIZE; i++)
    209     {
    210         for (int j = 0; j < SIZE; j++)
    211             d[i][j] = 0.0;
    212     }
    213 
    214     t1 = high_resolution_clock::now();
    215 
     216 #pragma omp parallel for default(none) shared(a, b, d)  // OpenMP version for comparison
    217     for (int i = 0; i < SIZE; i++)
    218     {
    219         for (int j = 0; j < SIZE; j++)
    220         {
    221             for (int k = 0; k < SIZE; k++)
    222                 d[i][j] += a[i][k] * b[k][j];
    223         }
    224     }
    225     t2 = high_resolution_clock::now();
    226     time = duration_cast<duration<double>>(t2 - t1);
     227     printf("Time OpenMP: %.6lf s.\n\n", time.count());
    228 
     229     for (int i = 0; i < SIZE; i++)                      // verify the OpenACC result against the OpenMP result
    230     {
    231         for (int j = 0; j < SIZE; j++)
    232         {
    233             if (c[i][j] != d[i][j])
     234                 printf("\nError at [%d, %d], c = %f, d = %f\n", i, j, c[i][j], d[i][j]);
    235         }
    236     }
    237     return 0;
    238 }
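
    ● Building and running

    A minimal sketch of how these results can be reproduced, assuming the PGI/NVHPC toolchain. The compile line is the one recorded in the log below; the two environment variables are an assumption — setting them is what makes the PGI runtime print the "launch CUDA kernel" lines and the trailing "Accelerator Kernel Timing data" summary:

    pgc++ main.cpp -std=c++11 -acc -mp -Minfo -o main.exe
    PGI_ACC_NOTIFY=1 PGI_ACC_TIME=1 ./main.exe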

    ● Output (before the data-management optimization)

    cuan@CUAN:/mnt/d/Code/OpenACC/OpenACCProject/OpenACCProject$ ./acc.exe
    
    Time OpenACC - Auto: 4.589736 s.
    
    Time OpenACC - Independent Seq: 4.823721 s.
    
    Time OpenACC - Independent Reduction: 3.669336 s.
    
    Time OpenACC - Gang Vector: 3.611391 s.
    
    Time OpenACC - tile: 3.609573 s.
    
    Time OpenACC - Collapse: 3.605792 s.
    
    Time OpenMP: 4.345018 s.
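
    Per the section headings, the large gap between these times and the ones below comes from the data-management directives: without enter data create / present, each kernels region implicitly copies a, b, and c between host and device on every launch. The unstructured enter data / exit data pair in the code can also be expressed as a single structured data region; a minimal sketch of the equivalent form (not the author's code):

    #pragma acc data copyout(a, b, c)   // allocate on the device at entry, copy back to the host at exit
    {
        // initialization, the c-clearing loops, and all six timed methods go here;
        // each kernels region inside keeps its present(a, b, c) clause
    }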

    ● Output (after the data-management optimization)

    cuan@CUAN:~/acc$ pgc++ main.cpp -std=c++11 -acc -mp -Minfo -o main.exe
    main:
          3, include "chrono"
              31, include "chrono"
                  208, Parallel region activated
                  212, Parallel region terminated
                  217, Parallel region activated
                  224, Parallel region terminated
         19, Generating enter data create(b[:][:],c[:][:],a[:][:])
             Generating present(a[:][:],b[:][:],c[:][:])
         20, Loop is parallelizable
         22, Loop is parallelizable
             Generating Tesla code
             20, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
             22, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
         25, Loop is parallelizable
         27, Loop is parallelizable
             Generating Tesla code
             25, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
             27, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
         30, Loop is parallelizable
         32, Loop is parallelizable
             Generating Tesla code
             30, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
             32, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
         32, Memory zero idiom, loop replaced by call to __c_mzero8
         40, Generating present(a[:][:],c[:][:],b[:][:])
         42, Loop is parallelizable
         45, Loop is parallelizable
         48, Complex loop carried dependence of c prevents parallelization
             Loop carried dependence of c prevents parallelization
             Loop carried backward dependence of c prevents vectorization
             Inner sequential loop scheduled on accelerator
             Generating Tesla code
             42, #pragma acc loop gang /* blockIdx.y */
             45, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
             48, #pragma acc loop seq
         48, Complex loop carried dependence of c prevents parallelization
             Loop carried backward dependence of c prevents vectorization
         56, Generating present(c[:][:])
         59, Loop is parallelizable
         61, Loop is parallelizable
             Generating Tesla code
             59, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
             61, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
         61, Memory zero idiom, loop replaced by call to __c_mzero8
         68, Generating present(a[:][:],c[:][:],b[:][:])
         70, Loop is parallelizable
         73, Loop is parallelizable
         76, Loop is parallelizable
             Generating Tesla code
             70, #pragma acc loop gang /* blockIdx.z */
             73, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
             76, #pragma acc loop gang /* blockIdx.y */
         84, Generating present(c[:][:])
         87, Loop is parallelizable
         89, Loop is parallelizable
             Generating Tesla code
             87, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
             89, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
         89, Memory zero idiom, loop replaced by call to __c_mzero8
         96, Generating present(a[:][:],c[:][:],b[:][:])
         98, Loop is parallelizable
        101, Loop is parallelizable
             Generating Tesla code
             98, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
            101, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
            105, #pragma acc loop seq
        101, FMA (fused multiply-add) instruction(s) generated
        105, Loop is parallelizable
        114, Generating present(c[:][:])
        117, Loop is parallelizable
        119, Loop is parallelizable
             Generating Tesla code
            117, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
            119, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
        119, Memory zero idiom, loop replaced by call to __c_mzero8
        126, Generating present(a[:][:],c[:][:],b[:][:])
        128, Loop is parallelizable
        131, Loop is parallelizable
             Generating Tesla code
            128, #pragma acc loop gang(32), vector(8) /* blockIdx.y threadIdx.y */
            131, #pragma acc loop gang, vector(16) /* blockIdx.x threadIdx.x */
            135, #pragma acc loop seq
        135, Loop is parallelizable
        144, Generating present(c[:][:])
        147, Loop is parallelizable
        149, Loop is parallelizable
             Generating Tesla code
            147, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
            149, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
        149, Memory zero idiom, loop replaced by call to __c_mzero8
        156, Generating present(a[:][:],c[:][:],b[:][:])
        158, Loop is parallelizable
        160, Loop is parallelizable
             Generating Tesla code
            158, #pragma acc loop gang, vector tile(32,32) /* blockIdx.x threadIdx.x */
            160,   /* blockIdx.x threadIdx.x tiled */
            165, #pragma acc loop seq
        165, Loop is parallelizable
        174, Generating present(c[:][:])
        177, Loop is parallelizable
        179, Loop is parallelizable
             Generating Tesla code
            177, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
            179, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
        179, Memory zero idiom, loop replaced by call to __c_mzero8
        186, Generating present(a[:][:],c[:][:],b[:][:])
        188, Loop is parallelizable
        190, Loop is parallelizable
             Generating Tesla code
            188, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */
            190,   /* blockIdx.x threadIdx.x collapsed */
            194, #pragma acc loop seq
        194, Loop is parallelizable
        208, Generating exit data copyout(c[:][:],b[:][:],a[:][:])
             Parallel loop activated with static block schedule
        210, Memory zero idiom, loop replaced by call to __c_mzero8
        212, Barrier
        217, Parallel loop activated with static block schedule
             FMA (fused multiply-add) instruction(s) generated
        224, Barrier
    cuan@CUAN:~/acc$ ./main.exe 
    launch CUDA kernel  file=/home/cuan/acc/main.cpp function=main line=22 device=0 threadid=1 num_gangs=8192 num_workers=4 vector_length=32 grid=32x256 block=32x4
    launch CUDA kernel  file=/home/cuan/acc/main.cpp function=main line=27 device=0 threadid=1 num_gangs=8192 num_workers=4 vector_length=32 grid=32x256 block=32x4
    launch CUDA kernel  file=/home/cuan/acc/main.cpp function=main line=32 device=0 threadid=1 num_gangs=8192 num_workers=4 vector_length=32 grid=32x256 block=32x4
    launch CUDA kernel  file=/home/cuan/acc/main.cpp function=main line=48 device=0 threadid=1 num_gangs=8192 num_workers=1 vector_length=128 grid=8x1024 block=128
    Time OpenACC - Auto: 0.018726 s.
    
    launch CUDA kernel  file=/home/cuan/acc/main.cpp function=main line=61 device=0 threadid=1 num_gangs=8192 num_workers=4 vector_length=32 grid=32x256 block=32x4
    launch CUDA kernel  file=/home/cuan/acc/main.cpp function=main line=76 device=0 threadid=1 num_gangs=32768 num_workers=1 vector_length=128 grid=8x1024x4 block=128
    Time OpenACC - Independent Seq: 0.040719 s.
    
    launch CUDA kernel  file=/home/cuan/acc/main.cpp function=main line=89 device=0 threadid=1 num_gangs=8192 num_workers=4 vector_length=32 grid=32x256 block=32x4
    launch CUDA kernel  file=/home/cuan/acc/main.cpp function=main line=101 device=0 threadid=1 num_gangs=8192 num_workers=4 vector_length=32 grid=32x256 block=32x4
    Time OpenACC - Independent Reduction: 0.012491 s.
    
    launch CUDA kernel  file=/home/cuan/acc/main.cpp function=main line=119 device=0 threadid=1 num_gangs=8192 num_workers=4 vector_length=32 grid=32x256 block=32x4
    launch CUDA kernel  file=/home/cuan/acc/main.cpp function=main line=131 device=0 threadid=1 num_gangs=2048 num_workers=8 vector_length=16 grid=64x32 block=16x8
    Time OpenACC - Gang Vector: 0.012314 s.
    
    launch CUDA kernel  file=/home/cuan/acc/main.cpp function=main line=149 device=0 threadid=1 num_gangs=8192 num_workers=4 vector_length=32 grid=32x256 block=32x4
    launch CUDA kernel  file=/home/cuan/acc/main.cpp function=main line=160 device=0 threadid=1 num_gangs=1024 num_workers=1 vector_length=1024 grid=1024 block=1024
    Time OpenACC - tile: 0.013609 s.
    
    launch CUDA kernel  file=/home/cuan/acc/main.cpp function=main line=179 device=0 threadid=1 num_gangs=8192 num_workers=4 vector_length=32 grid=32x256 block=32x4
    launch CUDA kernel  file=/home/cuan/acc/main.cpp function=main line=190 device=0 threadid=1 num_gangs=8192 num_workers=1 vector_length=128 grid=8192 block=128
    Time OpenACC - Collapse: 0.012676 s.
    
    Time OpenMP: 0.504436 s.
    
    
    Accelerator Kernel Timing data
    /home/cuan/acc/main.cpp
      main  NVIDIA  devicenum=0
        time(us): 112,420
        19: compute region reached 1 time
            22: kernel launched 1 time
                grid: [32x256]  block: [32x4]
                 device time(us): total=38 max=38 min=38 avg=38
                elapsed time(us): total=317 max=317 min=317 avg=317
            27: kernel launched 1 time
                grid: [32x256]  block: [32x4]
                 device time(us): total=39 max=39 min=39 avg=39
                elapsed time(us): total=50 max=50 min=50 avg=50
            32: kernel launched 1 time
                grid: [32x256]  block: [32x4]
                 device time(us): total=39 max=39 min=39 avg=39
                elapsed time(us): total=50 max=50 min=50 avg=50
        19: data region reached 3 times
        40: compute region reached 1 time
            48: kernel launched 1 time
                grid: [8x1024]  block: [128]
                 device time(us): total=18,705 max=18,705 min=18,705 avg=18,705
                elapsed time(us): total=18,717 max=18,717 min=18,717 avg=18,717
        40: data region reached 2 times
        56: compute region reached 1 time
            61: kernel launched 1 time
                grid: [32x256]  block: [32x4]
                 device time(us): total=43 max=43 min=43 avg=43
                elapsed time(us): total=176 max=176 min=176 avg=176
        56: data region reached 2 times
        68: compute region reached 1 time
            76: kernel launched 1 time
                grid: [8x1024x4]  block: [128]
                 device time(us): total=40,585 max=40,585 min=40,585 avg=40,585
                elapsed time(us): total=40,709 max=40,709 min=40,709 avg=40,709
        68: data region reached 2 times
        84: compute region reached 1 time
            89: kernel launched 1 time
                grid: [32x256]  block: [32x4]
                 device time(us): total=39 max=39 min=39 avg=39
                elapsed time(us): total=71 max=71 min=71 avg=71
        84: data region reached 2 times
        96: compute region reached 1 time
            101: kernel launched 1 time
                grid: [32x256]  block: [32x4]
                 device time(us): total=12,456 max=12,456 min=12,456 avg=12,456
                elapsed time(us): total=12,467 max=12,467 min=12,467 avg=12,467
        96: data region reached 2 times
        114: compute region reached 1 time
            119: kernel launched 1 time
                grid: [32x256]  block: [32x4]
                 device time(us): total=37 max=37 min=37 avg=37
                elapsed time(us): total=63 max=63 min=63 avg=63
        114: data region reached 2 times
        126: compute region reached 1 time
            131: kernel launched 1 time
                grid: [64x32]  block: [16x8]
                 device time(us): total=12,295 max=12,295 min=12,295 avg=12,295
                elapsed time(us): total=12,306 max=12,306 min=12,306 avg=12,306
        126: data region reached 2 times
        144: compute region reached 1 time
            149: kernel launched 1 time
                grid: [32x256]  block: [32x4]
                 device time(us): total=42 max=42 min=42 avg=42
                elapsed time(us): total=187 max=187 min=187 avg=187
        144: data region reached 2 times
        156: compute region reached 1 time
            160: kernel launched 1 time
                grid: [1024]  block: [1024]
                 device time(us): total=13,447 max=13,447 min=13,447 avg=13,447
                elapsed time(us): total=13,599 max=13,599 min=13,599 avg=13,599
        156: data region reached 2 times
        174: compute region reached 1 time
            179: kernel launched 1 time
                grid: [32x256]  block: [32x4]
                 device time(us): total=41 max=41 min=41 avg=41
                elapsed time(us): total=173 max=173 min=173 avg=173
        174: data region reached 2 times
        186: compute region reached 1 time
            190: kernel launched 1 time
                grid: [8192]  block: [128]
                 device time(us): total=12,651 max=12,651 min=12,651 avg=12,651
                elapsed time(us): total=12,669 max=12,669 min=12,669 avg=12,669
        186: data region reached 2 times
        208: data region reached 1 time
            208: data copyout transfers: 3
                 device time(us): total=1,963 max=674 min=644 avg=654
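
    One caveat on the verification loop: it compares doubles for exact equality. The -Minfo log shows FMA instructions generated for both the GPU kernels and the OpenMP loop, and the reduction variants may accumulate the k-loop in a different order, so bit-exact agreement between c and d is not guaranteed in general. A tolerance-based check is more robust; a minimal sketch (the relative threshold 1e-9 is an arbitrary choice, not from the original code; fabs needs <cmath>):

    double diff = fabs(c[i][j] - d[i][j]);
    if (diff > 1e-9 * fabs(d[i][j]))    // relative tolerance instead of operator!=
        printf("\nError at [%d, %d], c = %f, d = %f\n", i, j, c[i][j], d[i][j]);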
  • Original post: https://www.cnblogs.com/cuancuancuanhao/p/9459007.html