▶ 按照书上的代码完成了 OpenACC 与CUDA 的相互调用,以及 OpenACC 调用 cuBLAS。遇到了很多问题,如 CUDA 版本,代码版本,计算能力指定等,以后填坑。
● 代码,OpenACC 调用 CUDA
// kernel.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Device kernel: each thread computes id = blockIdx.x * blockDim.x + threadIdx.x
// and performs y[id] += a * x[id]. Threads whose id is >= n do nothing, so the
// grid may be larger than n.
__global__ void saxpy_kernel(const int n, const float a, float *x, float *y)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < n)
        y[id] += a * x[id];
}

// Host-side launcher with C linkage so that main.c can call it.
//   n    : number of elements; nothing is launched when n <= 0
//   a    : scalar multiplier
//   x, y : pointers handed to the kernel; the caller passes device addresses
//          (main.c obtains them through `host_data use_device`)
// Launch and execution errors are reported on stderr instead of being dropped.
extern "C" void saxpy(const int n, const float a, float *x, float *y)
{
    const int threads = 128;

    // (n + threads - 1) / threads would be 0 blocks for n <= 0, which is an
    // invalid launch configuration.
    if (n <= 0)
        return;

    saxpy_kernel<<<(n + threads - 1) / threads, threads>>>(n, a, x, y);

    // A kernel launch returns nothing: configuration errors show up in
    // cudaGetLastError(), execution errors at the next synchronizing call.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
        fprintf(stderr, "saxpy: CUDA error: %s\n", cudaGetErrorString(err));
}

// main.c
#include <stdio.h>
#include <stdlib.h>

#define N 1024

// saxpy() is an ordinary host function (it launches the CUDA kernel itself),
// so it is declared without `#pragma acc routine`; that directive is for
// functions that are compiled for and called on the device.
extern void saxpy(int n, float a, float *x, float *y);

int main()
{
    float *x = (float *)malloc(sizeof(float)*N);
    float *y = (float *)malloc(sizeof(float)*N);
    if (x == NULL || y == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        free(x);
        free(y);
        return EXIT_FAILURE;
    }

    // x lives only on the device; y is copied back to the host when the
    // data region ends.
    #pragma acc data create(x[0:N]) copyout(y[0:N])
    {
        #pragma acc kernels
        #pragma acc loop independent
        for (int i = 0; i < N; i++)
        {
            x[i] = 1.0f;
            y[i] = 4.0f;
        }
        // Inside host_data, x and y name the device copies, which is what the
        // CUDA launcher needs.
        #pragma acc host_data use_device(x, y)
        saxpy(N, 2.0f, x, y);
    }
    // The original `#pragma wait` lacked the `acc` sentinel and was therefore
    // not an OpenACC directive at all.
    #pragma acc wait

    printf(" y[0] = %f ", y[0]);
    free(x);
    free(y);
    //getchar();
    return 0;
}
● 输出结果,代码在 win10 上不能链接,报错:LINK : fatal error LNK1104: 无法打开文件“libcudapgi.lib”;在 WSL 上输出结果不正确,在 Ubuntu 中报链接错误。参考了 https://blog.csdn.net/wcj0626/article/details/12611689?locationNum=12&fps=1 和 https://stackoverflow.com/questions/31737024/openacc-calling-cuda-device-kernel-from-openacc-parallel-loop ,还是没有解决问题
WSL: cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ nvcc -c kernel.cu -rdc=true cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ pgcc -acc -c main.c cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ pgcc -ta=tesla:rdc,cuda9.1 -Mcuda -o acc.exe main.o kernel.o cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ ./acc.exe y[0] = 4.000000 Ubuntu: @E@nvlink fatal : elfLink fatbinary error pgacclnk: child process exit status 2: /usr/local/pgi/linux86-64/18.4/bin/pgnvd
● 代码,CUDA 调用 OpenACC
// fun.c
// Fills x[0..n) with the constant c. `deviceptr(x)` tells OpenACC that x is
// already a device address, so no data movement is generated for it.
void set(const int n, const float c, float *x)
{
#pragma acc kernels deviceptr(x)
    for (int i = 0; i < n; i++)
        x[i] = c;
}

// y[i] += a * x[i] for i in [0, n). Both pointers are declared as device
// addresses; `restrict` promises the compiler that x and y do not alias.
void saxpy(const int n, const float a, float *restrict x, float *restrict y)
{
#pragma acc kernels deviceptr(x, y)
    for (int i = 0; i < n; i++)
        y[i] += a * x[i];
}

// main.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define N 1024

// Abort with a readable message when a CUDA runtime call fails. Without this,
// a failed cudaMalloc leaves the pointer uninitialised and the OpenACC
// routines below are handed garbage.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// OpenACC routines from fun.c (compiled as C, hence the C linkage).
extern "C" void set(int, float, float *);
extern "C" void saxpy(int, float, float *, float *);

int main()
{
    float *x = NULL, *y = NULL, y0 = 0.0f;
    CUDA_CHECK(cudaMalloc((void**)&x, sizeof(float)*N));
    CUDA_CHECK(cudaMalloc((void**)&y, sizeof(float)*N));

    // x and y are device allocations; the OpenACC side consumes them through
    // its deviceptr clauses.
    set(N, 1.0f, x);
    set(N, 0.0f, y);
    saxpy(N, 2.0f, x, y);
    // Only the first element is copied back for the printout.
    CUDA_CHECK(cudaMemcpy(&y0, y, sizeof(float), cudaMemcpyDeviceToHost));

    printf(" y[0] = %f ", y0);
    CUDA_CHECK(cudaFree(x));
    CUDA_CHECK(cudaFree(y));
    //getchar();
    return 0;
}
● 输出结果,代码在 win10 上不能链接,在 WSL 上可以链接但运行时报段错误(Segmentation fault),在 Ubuntu 中报链接错误
WSL: cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ nvcc -c main.cu -rdc=true cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ pgcc -acc -c fun.c cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ pgcc -ta=tesla:rdc,cuda9.1 -Mcuda -o acc.exe main.o fun.o cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ ./acc.exe Segmentation fault (core dumped) Ubuntu: cuan@CUAN:~/Temp$ nvcc -c main.cu -rdc=true cuan@CUAN:~/Temp$ pgcc -acc -c fun.c cuan@CUAN:~/Temp$ pgcc -ta=tesla:rdc,cuda9.1 -Mcuda -o acc.exe main.o fun.o @E@nvlink fatal : elfLink fatbinary error pgacclnk: child process exit status 2: /usr/local/pgi/linux86-64/18.4/bin/pgnvd
● 代码,CUDA 调用 OpenACC,捆绑变量地址
// fun.c
#include <openacc.h>

// Associates the host address pHost with the existing device allocation
// pDevice (sizeByte bytes) via acc_map_data.
void map(float *restrict pHost, float *restrict pDevice, int sizeByte)
{
    acc_map_data(pHost, pDevice, sizeByte);
}

// Removes a mapping created by map(); call it before the host or device
// buffer is released.
void unmap(float *pHost)
{
    acc_unmap_data(pHost);
}

// Fills x[0..n) with c. x is a HOST address whose device copy must already be
// present (here: registered through map()).
void set(int n, float c, float *x)
{
#pragma acc kernels present(x)
    for (int i = 0; i < n; i++)
        x[i] = c;
}

// y[i] += a * x[i] for i in [0, n). x and y are DEVICE addresses (deviceptr),
// so callers must pass the device pointers, not the mapped host pointers.
void saxpy(int n, float a, float *restrict x, float *restrict y)
{
#pragma acc kernels deviceptr(x,y)
    for (int i = 0; i < n; i++)
        y[i] += a * x[i];
}

// main.cu
#include <stdio.h>
#include <stdlib.h>

#define N 1024

// Abort with a readable message when a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// OpenACC routines from fun.c (compiled as C, hence the C linkage).
extern "C" void map(float *, float *, int);
extern "C" void unmap(float *);
extern "C" void set(int, float, float *);
extern "C" void saxpy(int, float, float *, float *);

int main()
{
    float *x = (float *)malloc(sizeof(float)*N);
    float *y = (float *)malloc(sizeof(float)*N);
    float *dx = NULL, *dy = NULL, y0 = 0.0f;
    if (x == NULL || y == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        free(x);
        free(y);
        return EXIT_FAILURE;
    }
    CUDA_CHECK(cudaMalloc((void**)&dx, sizeof(float)*N));
    CUDA_CHECK(cudaMalloc((void**)&dy, sizeof(float)*N));

    // Bind each host buffer to its device buffer so that present(x) in set()
    // resolves the host address to dx / dy.
    map(x, dx, sizeof(float)*N);
    map(y, dy, sizeof(float)*N);
    set(N, 1.0f, x);
    set(N, 4.0f, y);
    // saxpy() uses deviceptr, so it takes the device addresses directly
    // (the original passed the host pointers x and y here).
    saxpy(N, 2.0f, dx, dy);
    // The source of a device-to-host copy is the device buffer dy
    // (the original read from the host pointer y).
    CUDA_CHECK(cudaMemcpy(&y0, dy, sizeof(float), cudaMemcpyDeviceToHost));

    printf(" y[0] = %f ",y0);

    // Tear down in reverse order: drop the mappings, then release the device
    // buffers with cudaFree and the host buffers with free. The original
    // called cudaFree on the already-freed host pointers and leaked dx / dy.
    unmap(x);
    unmap(y);
    CUDA_CHECK(cudaFree(dx));
    CUDA_CHECK(cudaFree(dy));
    free(x);
    free(y);
    //getchar();
    return 0;
}
● 输出结果,代码在 win10上不能链接,在 WSL 上结果正确,在 Ubuntu 中未尝试
WSL: cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ nvcc -c main.cu -rdc=true cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ pgcc -c fun.c -acc cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ pgcc -acc -Mcuda -o acc.exe main.o fun.o -ta=tesla:rdc,cuda9.1 cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ ./acc.exe y[0] = 6.000000
● 代码,OpenACC 调用 cuBLAS
1 #include <stdio.h> 2 #include <stdlib.h> 3 4 #define N 1024 5 6 extern void cublasSaxpy(int, float, float *, int, float *, int); 7 8 int main() 9 { 10 float *x = (float *)malloc(sizeof(float)*N); 11 float *y = (float *)malloc(sizeof(float)*N); 12 13 #pragma acc data create(x[0:N]) copyout(y[0:N]) 14 { 15 #pragma acc kernels 16 for (int i = 0; i < N; i++) 17 { 18 x[i] = 1.0f; 19 y[i] = 4.0f; 20 } 21 #pragma acc host_data use_device(x,y) 22 { 23 cublasSaxpy(N, 2.0f, x, 1, y, 1); 24 } 25 } 26 27 printf(" y[0] = %f ", y[0]); 28 free(x); 29 free(y); 30 //getchar(); 31 return 0; 32 }
● 输出结果,代码在 win10上不能链接,在 WSL 中结果错误,在 Ubuntu 中结果正确
WSL: cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ nvcc -c fun.c -rdc=true cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ pgcc -acc -c main.c cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ pgcc -acc -Mcuda -lcublas -o acc.exe main.o cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ ./acc.exe y[0] = 4.000000 Ubuntu: cuan@CUAN:~/Temp$ nvcc -c fun.c -rdc=true cuan@CUAN:~/Temp$ pgcc -acc -c main.c cuan@CUAN:~/Temp$ pgcc -acc -Mcuda -lcublas -o acc.exe main.o cuan@CUAN:~/Temp$ ./acc.exe y[0] = 6.000000