实例:asyncAPI实例
/*
 * asyncAPI example.
 *
 * Issues a host-to-device copy, a kernel launch and a device-to-host copy
 * asynchronously on stream 0, bracketed by two CUDA events.  While the GPU
 * works, the CPU spins on cudaEventQuery() and counts iterations, which
 * shows that CPU and GPU execution overlap.
 */
#include <stdio.h>
#include <string.h>
#include <cutil_inline.h>

/*
 * Adds inc_value to every element of g_data, one element per thread.
 *
 * Launch layout: 1D grid of 1D blocks.  There is no bounds check, so the
 * caller must launch exactly as many threads as g_data has elements
 * (gridDim.x * blockDim.x == element count).
 */
__global__ void
increment_kernel(int* g_data, int inc_value)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    g_data[idx] = g_data[idx] + inc_value;
}

/*
 * Returns 1 when all n elements of the host array data equal x, 0 otherwise.
 * An empty array (n <= 0) is reported as correct.
 */
int correct_output(int* data, const int n, const int x)
{
    for (int i = 0; i < n; i++)
    {
        if (data[i] != x)
            return 0;
    }
    /* Only reached after every element has been compared. */
    return 1;
}

/*
 * Program entry point.  Returns 0 when the GPU result verifies, 1 otherwise.
 */
int main(int argc, char** argv)
{
    /* Use the device named on the command line, else the fastest one. */
    if (cutCheckCmdLineFlag(argc, (const char**)argv, "device"))
        cutilDeviceInit(argc, argv);
    else
        cutilSafeCall(cudaSetDevice(cutGetMaxGflopsDeviceId()));

    const int n = 16 * 1024 * 1024;
    const size_t nbytes = (size_t)n * sizeof(int);
    const int value = 26;

    /* Host buffer: page-locked so cudaMemcpyAsync can run asynchronously. */
    int* a = 0;
    cutilSafeCall(cudaMallocHost((void**)&a, nbytes));
    memset(a, 0, nbytes);

    /* Device buffer, pre-filled with 0xFF bytes. */
    int* d_a = 0;
    cutilSafeCall(cudaMalloc((void**)&d_a, nbytes));
    cutilSafeCall(cudaMemset(d_a, 255, nbytes));

    /* Launch configuration: n is an exact multiple of the block size, which
       the kernel relies on because it does not bounds-check its index. */
    dim3 threads = dim3(512, 1);
    dim3 blocks = dim3(n / threads.x, 1);

    /* Events used to time the GPU work. */
    cudaEvent_t start, stop;
    cutilSafeCall(cudaEventCreate(&start));
    cutilSafeCall(cudaEventCreate(&stop));

    /* Host-side timer measuring how long the CPU spends issuing the calls. */
    unsigned int timer;
    cutilCheckError(cutCreateTimer(&timer));
    cutilCheckError(cutResetTimer(timer));
    cutilSafeCall(cudaThreadSynchronize());
    float gpu_time = 0.0f;

    /* Asynchronously queue the work for the GPU (everything on stream 0). */
    cutilCheckError(cutStartTimer(timer));
    cutilSafeCall(cudaEventRecord(start, 0));
    cutilSafeCall(cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0));
    increment_kernel<<<blocks, threads, 0, 0>>>(d_a, value);
    /* Non-blocking check for launch-configuration errors. */
    cutilSafeCall(cudaGetLastError());
    cutilSafeCall(cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0));
    cutilSafeCall(cudaEventRecord(stop, 0));
    cutilCheckError(cutStopTimer(timer));

    /* The calls above only queue work.  While waiting for the stop event,
       the CPU does its own (trivial) work: counting loop iterations. */
    unsigned long int counter = 0;
    while (cudaEventQuery(stop) == cudaErrorNotReady)
    {
        counter++;
    }
    cutilSafeCall(cudaEventElapsedTime(&gpu_time, start, stop));

    /* Report host issue time, GPU execution time and the CPU work done. */
    printf("time spent by CPU in CUDA calls: %.2f ms\n", cutGetTimerValue(timer));
    printf("time spent executing by the GPU: %.2f ms\n", gpu_time);
    printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);

    /* Verify the result: every element started at 0 and was incremented by value. */
    const int passed = correct_output(a, n, value);
    if (passed)
        printf("passed\n");
    else
        printf("failed\n");

    /* Release resources. */
    cutilCheckError(cutDeleteTimer(timer));
    cutilSafeCall(cudaEventDestroy(start));
    cutilSafeCall(cudaEventDestroy(stop));
    cutilSafeCall(cudaFreeHost(a));
    cutilSafeCall(cudaFree(d_a));

    cudaThreadExit();

    return passed ? 0 : 1;
}