▶ 使用 CUDA Runtime API、运行时编译(NVRTC)、Driver API 三种接口分别计算向量加法
▶ 源代码,CUDA Runtime API
// Vector addition using the CUDA Runtime API.
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_cuda.h>

#define ELEMENT 50000

// Evaluates a CUDA Runtime call; on failure prints the file, line and the
// runtime's error string to stderr and terminates the process.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(err_));                               \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

// Element-wise sum: C[i] = A[i] + B[i] for every i < size.
// Indexing uses only the x dimension, so a 1D grid of 1D blocks is expected;
// threads whose index falls at or beyond `size` do nothing.
__global__ void vectorAdd(const float *A, const float *B, float *C, int size)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < size)
        C[i] = A[i] + B[i];
}

// Fills two host vectors with random values, adds them on the device and
// verifies the result on the host.
// Returns 0 when every element matches, 1 on a mismatch or host allocation
// failure. Any CUDA failure terminates the process through CUDA_CHECK.
int main()
{
    printf(" Start. ");
    size_t size = ELEMENT * sizeof(float);
    int status = 0;

    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);
    if (h_A == NULL || h_B == NULL || h_C == NULL)
    {
        fprintf(stderr, "Host allocation of %lu bytes failed\n", (unsigned long)size);
        free(h_A);  // free(NULL) is a no-op, so partial failures are handled too
        free(h_B);
        free(h_C);
        return 1;
    }

    float *d_A = NULL;
    float *d_B = NULL;
    float *d_C = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_A, size));
    CUDA_CHECK(cudaMalloc((void **)&d_B, size));
    CUDA_CHECK(cudaMalloc((void **)&d_C, size));

    for (int i = 0; i < ELEMENT; ++i)
    {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
    }
    CUDA_CHECK(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice));

    int threadsPerBlock = 256;
    // Ceiling division so the last, partially filled block is still launched.
    int blocksPerGrid = (ELEMENT + threadsPerBlock - 1) / threadsPerBlock;
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, ELEMENT);
    // A launch returns no status itself; configuration errors surface here.
    CUDA_CHECK(cudaGetLastError());
    // Blocking copy; errors raised while the kernel ran are reported by this call.
    CUDA_CHECK(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost));

    for (int i = 0; i < ELEMENT; ++i)
    {
        if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
        {
            printf(" Result error at i = %d, h_A[i] = %f, h_B[i] = %f, h_C[i] = %f ", i, h_A[i], h_B[i], h_C[i]);
            status = 1;
            break;  // fall through to the shared cleanup instead of leaking the buffers
        }
    }

    free(h_A);
    free(h_B);
    free(h_C);
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));

    if (status == 0)
        printf(" Finish. ");
    getchar();  // keep the console window open until a key is pressed
    return status;
}
● 输出结果:
Start.
Finish.
▶ 源代码,运行时编译
// vectorAdd_kernel.cu
// Element-wise vector addition kernel: C[idx] = A[idx] + B[idx] for idx in [0, size).
// Declared extern "C" so the symbol is not name-mangled and can be looked up
// by the plain string "vectorAdd" after the file is compiled to PTX.
// Indexing uses only the x dimension of the grid and block.
extern "C" __global__ void vectorAdd(const float *A, const float *B, float *C, int size)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // Threads past the end of the vectors have nothing to do.
    if (idx >= size)
        return;

    C[idx] = A[idx] + B[idx];
}
1 // vectorAdd.cpp 2 #include <stdio.h> 3 #include <cuda_runtime.h> 4 #include "device_launch_parameters.h" 5 #include <cuda.h> 6 #include <nvrtc_helper.h> 7 8 #define ELEMENT 50000 9 10 int main() 11 { 12 printf(" Start. "); 13 14 char *ptx, *kernel_file; 15 size_t ptxSize; 16 kernel_file = "D:\Program\CUDA9.0\Samples\0_Simple\vectorAdd_nvrtc\vectorAdd_kernel.cu"; 17 compileFileToPTX(kernel_file, 1, NULL, &ptx, &ptxSize, 0); 18 CUmodule module = loadPTX(ptx, 1, NULL); 19 CUfunction kernel_addr; 20 cuModuleGetFunction(&kernel_addr, module, "vectorAdd"); 21 22 size_t size = ELEMENT * sizeof(float); 23 24 float *h_A = (float *)malloc(size); 25 float *h_B = (float *)malloc(size); 26 float *h_C = (float *)malloc(size); 27 CUdeviceptr d_A, d_B, d_C; 28 cuMemAlloc(&d_A, size); 29 cuMemAlloc(&d_B, size); 30 cuMemAlloc(&d_C, size); 31 for (int i = 0; i < ELEMENT; ++i) 32 { 33 h_A[i] = rand()/(float)RAND_MAX; 34 h_B[i] = rand()/(float)RAND_MAX; 35 } 36 cuMemcpyHtoD(d_A, h_A, size); 37 cuMemcpyHtoD(d_B, h_B, size); 38 39 int threadsPerBlock = 256; 40 dim3 cudaBlockSize(threadsPerBlock,1,1); 41 dim3 cudaGridSize((ELEMENT + threadsPerBlock - 1) / threadsPerBlock, 1, 1); 42 int element = ELEMENT; 43 void *arr[] = { (void *)&d_A, (void *)&d_B, (void *)&d_C, (void *)&element}; 44 cuLaunchKernel(kernel_addr, cudaGridSize.x, cudaGridSize.y, cudaGridSize.z, cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z, 0, 0, &arr[0], 0); 45 cuCtxSynchronize(); 46 cuMemcpyDtoH(h_C, d_C, size); 47 48 for (int i = 0; i < ELEMENT; ++i) 49 { 50 if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) 51 { 52 printf(" Result error at i = %d, h_A[i] = %f, h_B[i] = %f, h_C[i] = %f ", i, h_A[i], h_B[i], h_C[i]); 53 getchar(); 54 return 1; 55 } 56 } 57 58 free(h_A); 59 free(h_B); 60 free(h_C); 61 cuMemFree(d_A); 62 cuMemFree(d_B); 63 cuMemFree(d_C); 64 printf(" Finish. "); 65 getchar(); 66 return 0; 67 }
● 输出结果:
Start. > Using CUDA Device [0]: GeForce GTX 1070 > GPU Device has SM 6.1 compute capability Finish.
▶ 源代码,Driver API。同样需要上面的 vectorAdd_kernel.cu。调用核函数有三种方式,其中第二种(中间那种)存在问题,计算结果不正确。
1 #include <stdio.h> 2 #include <helper_cuda.h> 3 #include <cuda.h> 4 #include <string> 5 #include <drvapi_error_string.h> 6 7 #define ELEMENT 50000 8 #define PATH "C:\ProgramData\NVIDIA Corporation\CUDA Samples\v9.1\0_Simple\vectorAddDrv\data\" 9 10 #if defined(_WIN64) || defined(__LP64__) 11 #define PTX_FILE "vectorAdd_kernel64.ptx" 12 #else 13 #define PTX_FILE "vectorAdd_kernel32.ptx" 14 #endif 15 16 using namespace std; 17 18 void RandomInit(float *data, int n) 19 { 20 for (int i = 0; i < n; ++i) 21 data[i] = rand() / (float)RAND_MAX; 22 } 23 24 int main(int argc, char **argv) 25 { 26 printf(" Start. "); 27 cuInit(0);// 相当于 runtime API 的 cudaSetDevice(0);,要先初始化设备才能创建上下文 28 CUcontext cuContext; 29 cuCtxCreate(&cuContext, 0, 0); 30 31 // 编译 32 string module_path, ptx_source; 33 module_path = PATH"vectorAdd_kernel64.ptx"; 34 FILE *fp = fopen(module_path.c_str(), "rb"); 35 fseek(fp, 0, SEEK_END); 36 int file_size = ftell(fp); 37 char *buf = new char[file_size + 1]; 38 fseek(fp, 0, SEEK_SET); 39 fread(buf, sizeof(char), file_size, fp); 40 fclose(fp); 41 buf[file_size] = '