zoukankan      html  css  js  c++  java
  • OpenACC kernels

    ▶ 使用 kernels 导语并行化 for 循环

    ● 一重循环

     1 #include <stdio.h>
     2 #include <time.h>
     3 #include <openacc.h>
     4 
     5 const int row = 128 * 256 * 512;                 // elements per array: 16,777,216
     6 
     7 int main()                                       // times c = a + b over one flat loop, with or without OpenACC
     8 {
     9     int a[row], b[row], c[row];                  // NOTE(review): three very large automatic arrays; may exceed the stack limit on some toolchains - confirm
    10     for (int i = 0; i < row; ++i)                // fill a and b
    11         a[i] = b[i] = i;
    12 
    13     clock_t time = clock();                      // start tick; clock() counts in units of 1/CLOCKS_PER_SEC second
    14 #ifdef _OPENACC                                  // this branch is compiled when OpenACC is enabled
    15 #pragma acc kernels
    16     for (int i = 0; i < row; ++i)                // c = a + b
    17         c[i] = a[i] + b[i];
    18     time = clock() - time;
    19     printf("\nTime with acc:%ld ms\n", (long)time);      // clock_t is not int: cast to long for %ld; value is raw ticks (ms only if CLOCKS_PER_SEC == 1000)
    20 #else                                            // this branch is compiled when OpenACC is not enabled
    21     for (int i = 0; i < row; i++)
    22         c[i] = a[i] + b[i];
    23     time = clock() - time;
    24     printf("\nTime without acc:%ld ms\n", (long)time);   // same unit caveat as the OpenACC branch
    25 #endif
    26     getchar();                                   // wait for a key press before exiting
    27     return 0;
    28 }

    ● 输出结果

    D:\Code\OpenACC>pgcc main.c -o main-no-acc.exe -Minfo                                 // 编译,-Minfo 要求输出编译优化信息,没有额外输出
    
    D:\Code\OpenACC>pgcc main.c -o main-acc.exe -Minfo -acc                               // 编译,-acc 要求使用 OpenACC
    main:
         15, Generating implicit copyin(b[:row])                                            // 数据管理控制
             Generating implicit copyout(c[:row])
             Generating implicit copyin(a[:row])
         16, Loop is parallelizable                                                         // 并行优化 
             Generating Tesla code
             16, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */            // 使用默认 vector 尺寸
    
    D:\Code\OpenACC>main-no-acc.exe
    
    Time without acc:22 ms
    
    D:\Code\OpenACC>main-acc.exe
    launch CUDA kernel  file=D:\Code\OpenACC\main.c function=main line=16 device=0 threadid=1 num_gangs=65535 num_workers=1 vector_length=128 grid=65536 block=128
                                                                                            // 对代码第 16 行的 for 进行了并行优化,
    Time with acc:223 ms                                                                    // 使用第 0 号设备(GPU)
                                                                                            // 线程编号 1,使用 gang 65536 个,worker 1 个,vector 宽度 128
                                                                                            // CUDA 配置为 gridDim.x = 65536,blockDim.x = 128
                                                                                            // 每单元计算负载 = row / grid / block = 2

    ● 二重循环

     1 #include <stdio.h>
     2 #include <time.h>
     3 #include <openacc.h>
     4 
     5 const int row = 128 * 256, col = 512;            // 32768 rows x 512 columns per array
     6 
     7 int main()                                       // times c = a + b over a doubly nested loop, with or without OpenACC
     8 {
     9     int a[row][col], b[row][col], c[row][col];   // NOTE(review): three very large automatic arrays; may exceed the stack limit on some toolchains - confirm
    10     for (int i = 0; i < row; i++)                // fill a and b
    11     {
    12         for (int j = 0; j < col; j++)
    13             a[i][j] = b[i][j] = i * j;
    14     }
    15     
    16     clock_t time = clock();                      // start tick; clock() counts in units of 1/CLOCKS_PER_SEC second
    17 #ifdef _OPENACC                                  // this branch is compiled when OpenACC is enabled
    18 #pragma acc kernels
    19     for (int i = 0; i < row; i++)                // c = a + b
    20     {
    21         for (int j = 0; j < col; j++)
    22             c[i][j] = a[i][j] + b[i][j];
    23     }
    24     time = clock() - time;
    25     printf("\nTime with acc:%ld ms\n", (long)time);      // clock_t is not int: cast to long for %ld; value is raw ticks (ms only if CLOCKS_PER_SEC == 1000)
    26 #else                                            // this branch is compiled when OpenACC is not enabled
    27     for (int i = 0; i < row; i++)
    28     {
    29         for (int j = 0; j < col; j++)
    30             c[i][j] = a[i][j] + b[i][j];
    31     }
    32     time = clock() - time;
    33     printf("\nTime without acc:%ld ms\n", (long)time);   // same unit caveat as the OpenACC branch
    34 #endif
    35     getchar();                                   // wait for a key press before exiting
    36     return 0;
    37 }

    ● 输出结果

    D:\Code\OpenACC>pgcc main.c -o main-no-acc.exe -Minfo
    
    D:\Code\OpenACC>pgcc main.c -o main-acc.exe -Minfo -acc
    main:
         18, Generating implicit copyin(a[:row][:col])
             Generating implicit copyout(c[:row][:col])
             Generating implicit copyin(b[:row][:col])
         19, Loop is parallelizable
         21, Loop is parallelizable
             Generating Tesla code
             19, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */      // 高一层的循环使用的是 worker
             21, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
    
    D:\Code\OpenACC>main-no-acc.exe
    
    Time without acc:35 ms
    
    
    D:\Code\OpenACC>main-acc.exe
    launch CUDA kernel  file=D:\Code\OpenACC\main.c function=main line=21 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=16x2048 block=32x4
                                                                                    // 注意参数变化,仍有 num_gangs = grid,num_workers * vector_length = block
    Time with acc:221 ms                                                            // 每单元计算负载 = row * col / grid / block = 4

    ● 三重循环

     1 #include <stdio.h>
     2 #include <time.h>
     3 #include <openacc.h>
     4 
     5 const int row = 128, col = 256, page = 512;      // 128 x 256 x 512 elements per array
     6 
     7 int main()                                       // times c = a + b over a triply nested loop, with or without OpenACC
     8 {
     9     int a[row][col][page], b[row][col][page], c[row][col][page];     // NOTE(review): three very large automatic arrays; may exceed the stack limit on some toolchains - confirm
    10     for (int i = 0; i < row; i++)                // fill a and b
    11     {
    12         for (int j = 0; j < col; j++)
    13         {
    14             for (int k = 0; k < page; k++)
    15                 a[i][j][k] = b[i][j][k] = i * j + k;
    16         }
    17     }
    18     clock_t time = clock();                      // start tick; clock() counts in units of 1/CLOCKS_PER_SEC second
    19 #ifdef _OPENACC                                  // this branch is compiled when OpenACC is enabled
    20 #pragma acc kernels
    21     for (int i = 0; i < row; i++)                        // c = a + b
    22     {
    23         for (int j = 0; j < col; j++)
    24         {
    25             for (int k = 0; k < page; k++)
    26                 c[i][j][k] = a[i][j][k] + b[i][j][k];
    27         }
    28     }
    29     time = clock() - time;
    30     printf("\nTime with acc:%ld ms\n", (long)time);      // clock_t is not int: cast to long for %ld; value is raw ticks (ms only if CLOCKS_PER_SEC == 1000)
    31 #else                                            // this branch is compiled when OpenACC is not enabled
    32     for (int i = 0; i < row; i++)
    33     {
    34         for (int j = 0; j < col; j++)
    35         {
    36             for (int k = 0; k < page; k++)
    37                 c[i][j][k] = a[i][j][k] + b[i][j][k];
    38         }
    39     }
    40     time = clock() - time;
    41     printf("\nTime without acc:%ld ms\n", (long)time);   // same unit caveat as the OpenACC branch
    42 #endif
    43     getchar();                                   // wait for a key press before exiting
    44     return 0;
    45 }

    ● 输出结果

    D:\Code\OpenACC>pgcc main.c -o main-no-acc.exe -Minfo
    
    D:\Code\OpenACC>pgcc main.c -o main-acc.exe -Minfo -acc
    main:
         20, Generating implicit copyin(b[:row][:col][:page])
             Generating implicit copyout(c[:row][:col][:page])
             Generating implicit copyin(a[:row][:col][:page])
         21, Loop is parallelizable
         23, Loop is parallelizable
         25, Loop is parallelizable
             Generating Tesla code
             21, #pragma acc loop gang /* blockIdx.y */                             // 最高层循环尝试调整 grid
             23, #pragma acc loop gang, vector(4) /* blockIdx.z threadIdx.y */
             25, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
    
    D:\Code\OpenACC>main-no-acc.exe
    
    Time without acc:56 ms
    
    
    D:\Code\OpenACC>main-acc.exe
    launch CUDA kernel  file=D:\Code\OpenACC\main.c function=main line=25 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=16x128x16 block=32x4
                                                                                    // grid 变成了三维                                                                                
    Time with acc:231 ms                                                            // 每单元计算负载 = row *col * page / grid / block = 4
                                                                                    // row 改为 64,则 grid=16x128x16 block=32x4,计算负载 = 2
                                                                                    // col 改为 128,则 grid=16x128x16 block=32x4,计算负载 = 2
                                                                                    // page 改为 256,则 grid=8x128x32 block=32x4,计算负载 = 2
                                                                                    // row 改为 32,则 grid=16x32x64 block=32x4,计算负载 = 1

    ● 在 Ubuntu 上跑一重循环的代码,注意计时器单位是 μs(clock() 返回的是时钟滴答数,Linux 上 CLOCKS_PER_SEC 为 1000000,因此打印字符串中的单位相应改成了 us)

    cuan@CUAN:~/Temp$ pgcc -acc main.c -o main.exe
    cuan@CUAN:~/Temp$ pgcc main.c -o main-no-acc.exe
    cuan@CUAN:~/Temp$ ./main.exe 
    
    Time with acc:174795 us
    
    cuan@CUAN:~/Temp$ ./main-no-acc.exe 
    
    Time without acc:17170 us
  • 相关阅读:
    使用MVC模型的几个常见误区
    ModelViewControl
    真的简单,还是盲目乐观?
    Kernel Korner Why and How to Use Netlink Socket
    我们手机平台的几个基础模型
    彩信库(mmslib)设计备忘录
    消极状态集
    文摘《十三》
    文摘《十二》
    文摘《十一》
  • 原文地址:https://www.cnblogs.com/cuancuancuanhao/p/9000493.html
Copyright © 2011-2022 走看看