▶ CPU - GPU 异步操作
▶ 源代码
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_cuda.h>
#include <helper_functions.h>

// Adds inc_value to one element of g_data per thread.
// Expected launch: 1-D grid of 1-D blocks. The kernel has no bounds guard,
// so gridDim.x * blockDim.x must not exceed the number of elements in g_data;
// main() enforces this by requiring n to be a multiple of the block size.
__global__ void increment_kernel(int *g_data, int inc_value)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    g_data[idx] = g_data[idx] + inc_value;
}

// Returns true when all n entries of data equal x; otherwise prints the first
// mismatching index together with the actual and reference values and
// returns false.
bool correct_output(int *data, const int n, const int x)
{
    for (int i = 0; i < n; i++)
    {
        if (data[i] != x)
        {
            printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);
            return false;
        }
    }
    return true;
}

// Queues an H2D copy, a kernel and a D2H copy on stream 0 without waiting for
// them, then spins on the CPU until the GPU reaches the `stop` event. Reports
// the GPU time (events), the CPU time spent issuing the work (host timer) and
// the number of spin iterations, and verifies the result.
// Returns EXIT_SUCCESS when verification passes, EXIT_FAILURE otherwise.
int main(int argc, char *argv[])
{
    printf("Start.\n");
    // Device is selected from the command-line arguments; they may be empty.
    int devID = findCudaDevice(argc, (const char **)argv);
    cudaDeviceProp deviceProps;
    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s]\n", deviceProps.name);

    const int n = 16 * 1024 * 1024;
    const size_t nbytes = n * sizeof(int);
    const int value = 26;

    dim3 threads = dim3(512, 1, 1);
    dim3 blocks = dim3(n / threads.x, 1, 1);
    // increment_kernel is unguarded: the grid must cover exactly n elements.
    if (n % threads.x != 0)
    {
        fprintf(stderr, "n (%d) must be a multiple of the block size (%u)\n", n, threads.x);
        return EXIT_FAILURE;
    }

    int *a = NULL, *d_a = NULL;
    // Host buffer comes from cudaMallocHost and is released with cudaFreeHost.
    checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
    checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
    memset(a, 0, nbytes);
    // Device buffer starts as 0xFF bytes, i.e. different from the zeroed host
    // data, so a missing host-to-device copy would fail verification.
    checkCudaErrors(cudaMemset(d_a, 255, nbytes));

    cudaEvent_t start, stop; // GPU-side timing
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    StopWatchInterface *timer = NULL; // CPU-side timing
    sdkCreateTimer(&timer);
    sdkResetTimer(&timer);

    // The GPU event pair sits inside the CPU timer. The host timer is stopped
    // right after the work has been queued, without waiting for the GPU.
    sdkStartTimer(&timer);
    checkCudaErrors(cudaEventRecord(start, 0));
    checkCudaErrors(cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0));
    increment_kernel<<<blocks, threads, 0, 0>>>(d_a, value);
    checkCudaErrors(cudaGetLastError()); // launch-configuration errors
    checkCudaErrors(cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0));
    checkCudaErrors(cudaEventRecord(stop, 0));
    sdkStopTimer(&timer);

    // Count how many loop iterations the CPU completes before the GPU reaches
    // the stop event.
    unsigned long int counter = 0;
    cudaError_t status;
    while ((status = cudaEventQuery(stop)) == cudaErrorNotReady)
        counter++;
    // Anything other than cudaSuccess here is a real failure, not "still busy".
    checkCudaErrors(status);

    // The stop event has completed, so the elapsed time can be read.
    float gpu_time = 0.0f;
    checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));

    printf("time spent by GPU: %.2f\n", gpu_time);
    printf("time spent by CPU: %.2f\n", sdkGetTimerValue(&timer));
    printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);
    const bool passed = correct_output(a, n, value);
    printf("Finish: %s.\n", passed ? "Pass" : "Fail");

    sdkDeleteTimer(&timer);
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    checkCudaErrors(cudaFreeHost(a));
    checkCudaErrors(cudaFree(d_a));
    getchar(); // keep the console window open until a key is pressed
    return passed ? EXIT_SUCCESS : EXIT_FAILURE;
}
● 输出结果:
GPU Device 0: "GeForce GTX 1070" with compute capability 6.1 CUDA device [GeForce GTX 1070] time spent by GPU: 11.50 time spent by CPU: 0.05 CPU executed 3026 iterations while waiting for GPU to finish Finish!
▶ 新姿势:
● 调用主函数时的第0个参数作为程序名字符串,可以用于输出。
1 int main(int argc, char *argv[]) 2 ... 3 printf("%s", argv[0]);
● 使用 cudaMallocHost 申请页锁定(pinned)主机内存(相当于不附加额外 flag 的 cudaHostAlloc)。cudaMemcpyAsync 只有在主机端使用页锁定内存时才能真正异步传输。注意必须使用 cudaFreeHost 释放,而不是 free。
1 int *a, nbytes = n * sizeof(int); 2 cudaMallocHost((void **)&a, nbytes); 3 ... 4 cudaFreeHost(a);
● 记录 CPU 调用 CUDA 所用的时间
1 StopWatchInterface *timer = NULL; 2 sdkCreateTimer(&timer); 3 sdkResetTimer(&timer); 4 sdkStartTimer(&timer); 5 6 ...// 核函数调用 7 8 sdkStopTimer(&timer); 9 printf("%.2f ms", sdkGetTimerValue(&timer));
● 查看GPU队列状态的函数
extern __host__ cudaError_t CUDARTAPI cudaEventQuery(cudaEvent_t event);
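■ stop为放置到流中的一个事件,cudaEventQuery(stop)返回该事件的状态:返回cudaSuccess(值等于0)表示该事件之前的所有操作已经完成;返回cudaErrorNotReady表示尚未完成。注意cudaErrorNotReady的数值随CUDA版本而变(旧版本中为34,CUDA 10.1及以后为600;数值35对应的其实是cudaErrorInsufficientDriver),因此代码中应与枚举名比较,而不要写死数值。源代码中利用这段时间让CPU空转,记录了迭代次数。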
while (cudaEventQuery(stop) == cudaErrorNotReady) counter++;
● stdlib.h 中关于返回成功和失败的宏
1 #define EXIT_SUCCESS 0 2 #define EXIT_FAILURE 1
● 示例文件中的错误检查宏 checkCudaErrors 及其调用的模板函数 check(定义在helper_cuda.h中):当返回值非零时,向 stderr 报告出错的文件、行号、错误码、错误枚举名以及被检查的调用表达式,然后调用 cudaDeviceReset 重置设备,并以 EXIT_FAILURE 退出程序。
// Resets the CUDA device; used by check() right before the process exits.
// It is defined ahead of check() because the preprocessor expands the macro
// at the point where check() is written, not where it is instantiated.
#ifndef DEVICE_RESET
#define DEVICE_RESET cudaDeviceReset();
#endif

// Wraps a CUDA call: passes its result to check() together with the
// stringified call expression and the file/line of the call site.
#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)

// Aborts the program when `result` is non-zero.
//   result - status value returned by the wrapped call (zero means success)
//   func   - text of the wrapped expression, as produced by #val
//   file   - source file of the call site
//   line   - source line of the call site
// On failure it prints the location, numeric code, enum name and expression
// to stderr, resets the device and exits with EXIT_FAILURE.
template <typename T>
void check(T result, char const *const func, const char *const file, int const line)
{
    if (result)
    {
        fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n",
                file, line, static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
        DEVICE_RESET // Make sure we call CUDA Device Reset before exiting
        exit(EXIT_FAILURE);
    }
}