▶ 使用 OpenACC 的 parallel 构件来计算规约,win10 pgi 和 win10 WSL pgi 结果的不同,以及 for 循环的一个小坑
● 正常的代码
#include <stdio.h>
#include <openacc.h>
/* Demo: sum a[0..N-1] with an OpenACC loop reduction, and also read the reduction variable before the parallel region ends. */
const int N = 100;
/* Fills a[]; when _OPENACC is defined prints the device counts, sum and temp, otherwise a not-supported message. Then reads one character from stdin and returns 0. */
int main()
{
    int i, sum, temp, a[N];
    for (i = sum = temp = 0; i < N; i++) // the init clause also zeroes sum and temp
        a[i] = i;                        // a[i] = i, so the full sum is 0 + 1 + ... + 99 = 4950
    // Layout note: statements are kept on the same line numbers as the original listing (the compiler messages cite lines 14 and 17).
#ifdef _OPENACC
    printf("device:%d, device nvidia:%d", acc_get_num_devices(acc_device_default), acc_get_num_devices(acc_device_nvidia));// Report the number of default-type devices and of NVIDIA devices
#pragma acc parallel
    {
#pragma acc loop reduction(+:sum) // The reduction adds up 1+2+...+99
        for (i = 0; i < N; i++)
            sum += a[i];
        // Deliberate: the next statement is the behavior this demo examines; it is not meant to be "fixed".
        temp = sum; // Tries to use the reduction variable sum before leaving the parallel construct
    }
    printf(" Sum = %d, temp = %d ", sum, temp);
    // NOTE(review): the spaces at the ends of the format strings may be "\n" escapes lost when this listing was extracted -- confirm.
#else
    printf(" ACC not support. ");
#endif
    getchar();
    return 0;
}
● 输出结果(分别使用 win10 pgi 编译器和 win10 WSL pgi 编译器)
D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc main.c -o main.exe -acc -Minfo
main:
     14, Accelerator kernel generated
         Generating Tesla code
         17, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
             Generating reduction(+:sum)
     14, Generating implicit copy(sum)
         Generating implicit copyin(a[:N])

D:\Code\OpenACC\OpenACCProject\OpenACCProject>main.exe
device:1, device nvidia:1 // 只有一台计算设备,应该是独立显卡
launch CUDA kernel file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main line=14 device=0 threadid=1 num_gangs=1 num_workers=1 vector_length=128 grid=1 block=128 shared memory=1024
launch CUDA kernel file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main line=14 device=0 threadid=1 num_gangs=1 num_workers=1 vector_length=256 grid=1 block=256 shared memory=1024
Sum = 4950, temp = 0 // 正确地计算了 1+2+...+99,temp 不能获得正确的结果
cuan@CUAN:/mnt/d/Code/OpenACC/OpenACCProject/OpenACCProject$ pgcc main.c -o main-ubuntu.exe -acc -Minfo
main:
     14, Accelerator kernel generated
         Generating Tesla code
         17, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
             Generating reduction(+:sum)
     14, Generating implicit copy(sum)
         Generating implicit copyin(a[:N])
cuan@CUAN:/mnt/d/Code/OpenACC/OpenACCProject/OpenACCProject$ ./main-ubuntu.exe
device:1, device nvidia:0 // 只有一台计算设备,默认是 CPU ?而且没有输出 CUDA 内核的相关信息
Sum = 4950, temp = 4950 // temp 竟然是对的
● 大坑注意:
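■ 将上述代码中的第 9 ~ 10 行改成 for (i = sum = temp = 0; i < N; a[i] = i++); 后会出现以下结果,而改用 for (i = sum = temp = 0; i < N; a[i] = i, i++); 则是正确的。乍看像是 OpenACC 的所有 for 循环仅支持简单语句(不仅限于并行构件内的循环),但这个循环位于 parallel 构件之外,真正的原因在 C 语言本身:a[i] = i++ 在同一个表达式中既修改 i 又读取 i 作为下标,两者之间没有序列点,属于未定义行为,编译器不必给出诊断,结果可以随编译器和平台任意变化;而 a[i] = i, i++ 中的逗号运算符引入了序列点,先完成赋值再自增,行为是确定的。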
D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc main.c -o main.exe -Minfo -acc
main: // 编译没有出现错误提示
     13, Accelerator kernel generated
         Generating Tesla code
         16, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
             Generating reduction(+:sum)
     13, Generating implicit copy(sum)
         Generating implicit copyin(a[:N])

D:\Code\OpenACC\OpenACCProject\OpenACCProject>main.exe
device:1, device nvidia:1 // 每次执行结果不同
Sum = 5706819, temp = 5701968

D:\Code\OpenACC\OpenACCProject\OpenACCProject>main.exe
device:1, device nvidia:1
Sum = 5313603, temp = 5308752
cuan@CUAN:/mnt/d/Code/OpenACC/OpenACCProject/OpenACCProject$ pgcc main.c -o main-ubuntu.exe -acc -Minfo
main: // 编译没有出现错误提示
     13, Accelerator kernel generated
         Generating Tesla code
         16, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
             Generating reduction(+:sum)
     13, Generating implicit copy(sum)
         Generating implicit copyin(a[:N])
cuan@CUAN:/mnt/d/Code/OpenACC/OpenACCProject/OpenACCProject$ ./main-ubuntu.exe
device:1, device nvidia:0 // 每次结果相同,但答案不正确
Sum = 4851, temp = 4851
cuan@CUAN:/mnt/d/Code/OpenACC/OpenACCProject/OpenACCProject$ ./main-ubuntu.exe
device:1, device nvidia:0
Sum = 4851, temp = 4851