▶ 使用 kernels 导语并行化 for 循环
● 一重循环
#include <stdio.h>
#include <time.h>
#include <openacc.h>
/* Times the element-wise sum c = a + b over one flat loop; with OpenACC enabled the loop sits in a kernels region. */
enum { row = 128 * 256 * 512 };   /* element count; an enum constant is a true constant expression, usable as a static array size */
/* Comments sit on otherwise blank lines so the directive and loop keep the line numbers that compiler messages refer to. */
int main(void)
{
    static int a[row], b[row], c[row];   /* static storage: as automatic variables these arrays (64 MiB each with 4-byte int) exceed typical stack limits */
    for (int i = 0; i < row; ++i)        /* fill a and b */
        a[i] = b[i] = i;
    /* clock() counts ticks of 1/CLOCKS_PER_SEC seconds; the elapsed ticks are scaled to milliseconds when printed. */
    clock_t ticks = clock();
#ifdef _OPENACC                          /* this branch is compiled when OpenACC is enabled */
#pragma acc kernels
    for (int i = 0; i < row; ++i)        /* c = a + b */
        c[i] = a[i] + b[i];
    ticks = clock() - ticks;
    printf(" Time with acc:%ld ms ", (long)((double)ticks * 1000.0 / CLOCKS_PER_SEC));
#else                                    /* this branch is compiled when OpenACC is not enabled */
    for (int i = 0; i < row; i++)
        c[i] = a[i] + b[i];
    ticks = clock() - ticks;
    printf(" Time without acc:%ld ms ", (long)((double)ticks * 1000.0 / CLOCKS_PER_SEC));
#endif
    getchar();                           /* blocks until input arrives on stdin (presumably to keep the console window open) */
    return 0;
}
● 输出结果
D:\Code\OpenACC>pgcc main.c -o main-no-acc.exe -Minfo // 编译,-Minfo 要求输出编译优化信息,没有额外输出
D:\Code\OpenACC>pgcc main.c -o main-acc.exe -Minfo -acc // 编译,-acc 要求使用 OpenACC
main:
     15, Generating implicit copyin(b[:row]) // 数据管理控制
         Generating implicit copyout(c[:row])
         Generating implicit copyin(a[:row])
     16, Loop is parallelizable // 并行优化
         Generating Tesla code
         16, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ // 使用默认 vector 尺寸
D:\Code\OpenACC>main-no-acc.exe
Time without acc:22 ms
D:\Code\OpenACC>main-acc.exe
launch CUDA kernel file=D:\Code\OpenACC\main.c function=main line=16 device=0 threadid=1 num_gangs=65535 num_workers=1 vector_length=128 grid=65536 block=128 // 对代码第 16 行的 for 进行了并行优化,
Time with acc:223 ms // 使用第 0 号设备(GPU)
// 线程编号 1,使用 gang 65536 个,worker 1 个,vector 宽度 128
// CUDA 配置为 gridDim.x = 65536,blockDim.x = 128
// 每单元计算负载 = row / grid / block = 2
● 二重循环
#include <stdio.h>
#include <time.h>
#include <openacc.h>
/* Times the element-wise sum c = a + b over a two-level loop nest; with OpenACC enabled the nest sits in a kernels region. */
enum { row = 128 * 256, col = 512 };   /* array extents; enum constants are true constant expressions, usable as static array sizes */
/* Comments sit on otherwise blank lines so the directive and loops keep the line numbers that compiler messages refer to. */
int main(void)
{
    static int a[row][col], b[row][col], c[row][col];   /* static storage: as automatic variables these arrays (64 MiB each with 4-byte int) exceed typical stack limits */
    for (int i = 0; i < row; i++)                       /* fill a and b */
    {
        for (int j = 0; j < col; j++)
            a[i][j] = b[i][j] = i * j;
    }
    /* clock() counts ticks of 1/CLOCKS_PER_SEC seconds; the elapsed ticks are scaled to milliseconds when printed. */
    clock_t ticks = clock();
#ifdef _OPENACC                                         /* this branch is compiled when OpenACC is enabled */
#pragma acc kernels
    for (int i = 0; i < row; i++)                       /* c = a + b */
    {
        for (int j = 0; j < col; j++)
            c[i][j] = a[i][j] + b[i][j];
    }
    ticks = clock() - ticks;
    printf(" Time with acc:%ld ms ", (long)((double)ticks * 1000.0 / CLOCKS_PER_SEC));
#else                                                   /* this branch is compiled when OpenACC is not enabled */
    for (int i = 0; i < row; i++)
    {
        for (int j = 0; j < col; j++)
            c[i][j] = a[i][j] + b[i][j];
    }
    ticks = clock() - ticks;
    printf(" Time without acc:%ld ms ", (long)((double)ticks * 1000.0 / CLOCKS_PER_SEC));
#endif
    getchar();                                          /* blocks until input arrives on stdin (presumably to keep the console window open) */
    return 0;
}
● 输出结果
D:\Code\OpenACC>pgcc main.c -o main-no-acc.exe -Minfo
D:\Code\OpenACC>pgcc main.c -o main-acc.exe -Minfo -acc
main:
     18, Generating implicit copyin(a[:row][:col])
         Generating implicit copyout(c[:row][:col])
         Generating implicit copyin(b[:row][:col])
     19, Loop is parallelizable
     21, Loop is parallelizable
         Generating Tesla code
         19, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */ // 高一层的循环使用的是 worker
         21, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
D:\Code\OpenACC>main-no-acc.exe
Time without acc:35 ms
D:\Code\OpenACC>main-acc.exe
launch CUDA kernel file=D:\Code\OpenACC\main.c function=main line=21 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=16x2048 block=32x4 // 注意参数变化,仍有 num_gangs = grid,num_workers * vector_length = block
Time with acc:221 ms // 每单元计算负载 = row * col / grid / block = 4
● 三重循环
#include <stdio.h>
#include <time.h>
#include <openacc.h>
/* Times the element-wise sum c = a + b over a three-level loop nest; with OpenACC enabled the nest sits in a kernels region. */
enum { row = 128, col = 256, page = 512 };   /* array extents; enum constants are true constant expressions, usable as static array sizes */
/* Comments sit on otherwise blank lines so the directive and loops keep the line numbers that compiler messages refer to. */
int main(void)
{
    static int a[row][col][page], b[row][col][page], c[row][col][page];   /* static storage: as automatic variables these arrays (64 MiB each with 4-byte int) exceed typical stack limits */
    for (int i = 0; i < row; i++)                                         /* fill a and b */
    {
        for (int j = 0; j < col; j++)
        {
            for (int k = 0; k < page; k++)
                a[i][j][k] = b[i][j][k] = i * j + k;
        }
    }
    clock_t ticks = clock();   /* clock() counts ticks of 1/CLOCKS_PER_SEC seconds; scaled to milliseconds when printed */
#ifdef _OPENACC                /* this branch is compiled when OpenACC is enabled */
#pragma acc kernels
    for (int i = 0; i < row; i++)                                         /* c = a + b */
    {
        for (int j = 0; j < col; j++)
        {
            for (int k = 0; k < page; k++)
                c[i][j][k] = a[i][j][k] + b[i][j][k];
        }
    }
    ticks = clock() - ticks;
    printf(" Time with acc:%ld ms ", (long)((double)ticks * 1000.0 / CLOCKS_PER_SEC));
#else                          /* this branch is compiled when OpenACC is not enabled */
    for (int i = 0; i < row; i++)
    {
        for (int j = 0; j < col; j++)
        {
            for (int k = 0; k < page; k++)
                c[i][j][k] = a[i][j][k] + b[i][j][k];
        }
    }
    ticks = clock() - ticks;
    printf(" Time without acc:%ld ms ", (long)((double)ticks * 1000.0 / CLOCKS_PER_SEC));
#endif
    getchar();                 /* blocks until input arrives on stdin (presumably to keep the console window open) */
    return 0;
}
● 输出结果
D:\Code\OpenACC>pgcc main.c -o main-no-acc.exe -Minfo
D:\Code\OpenACC>pgcc main.c -o main-acc.exe -Minfo -acc
main:
     20, Generating implicit copyin(b[:row][:col][:page])
         Generating implicit copyout(c[:row][:col][:page])
         Generating implicit copyin(a[:row][:col][:page])
     21, Loop is parallelizable
     23, Loop is parallelizable
     25, Loop is parallelizable
         Generating Tesla code
         21, #pragma acc loop gang /* blockIdx.y */ // 最高层循环尝试调整 grid
         23, #pragma acc loop gang, vector(4) /* blockIdx.z threadIdx.y */
         25, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
D:\Code\OpenACC>main-no-acc.exe
Time without acc:56 ms
D:\Code\OpenACC>main-acc.exe
launch CUDA kernel file=D:\Code\OpenACC\main.c function=main line=25 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=16x128x16 block=32x4 // grid 变成了三维
Time with acc:231 ms // 每单元计算负载 = row * col * page / grid / block = 4
// row 改为 64,则 grid=16x128x16 block=32x4,计算负载 = 2
// col 改为 128,则 grid=16x128x16 block=32x4,计算负载 = 2
// page 改为 256,则 grid=8x128x32 block=32x4,计算负载 = 2
// row 改为 32,则 grid=16x32x64 block=32x4,计算负载 = 1
● 在 ubuntu 上跑一重循环的代码,注意计时器单位是 μs
cuan@CUAN:~/Temp$ pgcc -acc main.c -o main.exe
cuan@CUAN:~/Temp$ pgcc main.c -o main-no-acc.exe
cuan@CUAN:~/Temp$ ./main.exe
Time with acc:174795 us
cuan@CUAN:~/Temp$ ./main-no-acc.exe
Time without acc:17170 us