1:运行时API实现
事件管理可用于测量程序(例如某段kernel或数据拷贝)的运行时间,也可用于协调CPU与GPU的并发执行,即让CPU等待GPU完成到某个事件为止。
// Runtime API: time a region of GPU work with a pair of events.

// Create the two events that bracket the measured region.
cudaEvent_t start;
cudaEvent_t stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

// Mark the beginning of the region in stream 0.
cudaEventRecord(start, 0);

// do something

// Mark the end of the region, then block the host until that event
// has actually been reached on the device.
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);

// Query the time between the two recorded events.
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop);

// Release both events.
cudaEventDestroy(start);
cudaEventDestroy(stop);
2:驱动API实现
// Driver API: time a region of GPU work with a pair of events.
//
// Precondition: the driver API has been initialised with cuInit(0) and a
// context is current on the calling thread; every call below returns a
// CUresult that production code should compare against CUDA_SUCCESS.

// cuEventCreate takes a flags argument in addition to the output handle;
// CU_EVENT_DEFAULT requests a normal, timing-capable event.
CUevent start, stop;
cuEventCreate(&start, CU_EVENT_DEFAULT);
cuEventCreate(&stop, CU_EVENT_DEFAULT);

// Mark the beginning of the region in stream 0.
cuEventRecord(start, 0);

// do something

// Mark the end of the region and wait on the host until it is reached.
cuEventRecord(stop, 0);
cuEventSynchronize(stop);

// Query the time between the two recorded events.
float elapsedTime;
cuEventElapsedTime(&elapsedTime, start, stop);

// Release both events.
cuEventDestroy(start);
cuEventDestroy(stop);
3:simpleStreams实例
/*
 * simpleStreams-style example: times a host-to-device copy, a single kernel
 * launched in a stream, and a loop of kernel + blocking copy, using CUDA
 * events, then verifies the data produced by the loop.
 *
 * The legacy SDK helper header cutil_inline.h is not needed by this code and
 * is no longer shipped with the toolkit, so only the runtime header is used.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda_runtime.h>

/* Abort with file/line and the runtime's error string if a CUDA call fails. */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Adds *factor to this thread's element of g_data, num_iterations times.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per element. The signature
 * carries no element count, so there is no bounds guard: the caller must
 * launch exactly as many threads as g_data has elements.
 */
__global__ void
init_array(int *g_data, int *factor, int num_iterations)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (int i = 0; i < num_iterations; i++)
        g_data[idx] += *factor;
}

/*
 * Checks that all n elements of a equal c.
 *
 * Returns 1 when every element matches (including n == 0); otherwise prints
 * the first mismatching index, its value and the expected value, and
 * returns 0.
 */
int correct_data(int *a, const int n, const int c)
{
    for (int i = 0; i < n; i++)
    {
        if (a[i] != c)
        {
            printf("%d:%d %d\n", i, a[i], c);
            return 0;
        }
    }
    /* Only report success after the whole array has been inspected. */
    return 1;
}

/*
 * Usage: program [device_index]
 *
 * Returns 0 when the verification passes, 1 on a bad device selection or a
 * failed verification. Any failing CUDA call terminates the process.
 */
int main(int argc, char *argv[])
{
    int CUDA_device = 0;
    int nstreams = 4;
    int nreps = 10;                 /* repetitions of the non-streamed loop */
    int n = 16 * 1024 * 1024;       /* number of array elements */
    size_t nbytes = (size_t)n * sizeof(int);
    dim3 threads, blocks;
    float elapsed_time, time_memcpy, time_kernel;
    int niterations = 5;            /* loop count inside the kernel */

    if (argc > 1)
    {
        CUDA_device = atoi(argv[1]);
    }

    /* Query the available devices and validate the requested index. */
    int num_devices = 0;
    cudaError_t count_status = cudaGetDeviceCount(&num_devices);
    if (count_status != cudaSuccess || 0 == num_devices)
    {
        printf("no device\n");
        return 1;
    }
    if (CUDA_device < 0 || CUDA_device >= num_devices)
    {
        printf("CUDA_device between 0 and %d\n", num_devices - 1);
        return 1;
    }

    CUDA_CHECK(cudaSetDevice(CUDA_device));
    cudaDeviceProp device_properties;
    CUDA_CHECK(cudaGetDeviceProperties(&device_properties, CUDA_device));
    printf("running on:%s\n", device_properties.name);

    /* Pinned host buffer (required for a truly asynchronous copy), zeroed so
       the timed upload does not transfer uninitialised memory. */
    int c = 5;
    int *a = 0;
    CUDA_CHECK(cudaMallocHost((void **)&a, nbytes));
    memset(a, 0, nbytes);

    /* Device buffers: the data array and the single-int increment. */
    int *d_a = 0, *d_c = 0;
    CUDA_CHECK(cudaMalloc((void **)&d_a, nbytes));
    CUDA_CHECK(cudaMalloc((void **)&d_c, sizeof(int)));
    CUDA_CHECK(cudaMemcpy(d_c, &c, sizeof(int), cudaMemcpyHostToDevice));

    /* Create the streams. */
    cudaStream_t *streams = (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));
    if (streams == NULL)
    {
        fprintf(stderr, "out of host memory\n");
        return 1;
    }
    for (int i = 0; i < nstreams; i++)
    {
        CUDA_CHECK(cudaStreamCreate(&streams[i]));
    }

    /* Create the timing events. */
    cudaEvent_t start_event, stop_event;
    CUDA_CHECK(cudaEventCreate(&start_event));
    CUDA_CHECK(cudaEventCreate(&stop_event));

    /* Time the asynchronous host-to-device copy. The events are recorded in
       stream 0 so that, with the legacy default-stream behaviour, they are
       ordered after all previously issued CUDA work. */
    CUDA_CHECK(cudaEventRecord(start_event, 0));
    CUDA_CHECK(cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, streams[0]));
    CUDA_CHECK(cudaEventRecord(stop_event, 0));
    /* The stop event must have completed before its timestamp is read. */
    CUDA_CHECK(cudaEventSynchronize(stop_event));
    CUDA_CHECK(cudaEventElapsedTime(&time_memcpy, start_event, stop_event));
    printf("memcopy: %.2f ms\n", time_memcpy);

    /* One thread per element; n is a multiple of 512, so the grid covers the
       array exactly, which init_array requires. */
    threads = dim3(512, 1);
    blocks = dim3(n / threads.x, 1);

    /* Time a single kernel launched in a stream. */
    CUDA_CHECK(cudaEventRecord(start_event, 0));
    init_array<<<blocks, threads, 0, streams[0]>>>(d_a, d_c, niterations);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop_event, 0));
    CUDA_CHECK(cudaEventSynchronize(stop_event));
    CUDA_CHECK(cudaEventElapsedTime(&time_kernel, start_event, stop_event));
    printf("kernel: %.2f ms\n", time_kernel);

    /* Reset the device array so the verified loop starts from zero and the
       expected final value is exactly c * nreps * niterations. */
    CUDA_CHECK(cudaMemset(d_a, 0, nbytes));

    /* Time nreps rounds of kernel + blocking copy without streams. */
    CUDA_CHECK(cudaEventRecord(start_event, 0));
    for (int i = 0; i < nreps; i++)
    {
        init_array<<<blocks, threads>>>(d_a, d_c, niterations);
        CUDA_CHECK(cudaGetLastError());
        CUDA_CHECK(cudaMemcpy(a, d_a, nbytes, cudaMemcpyDeviceToHost));
    }
    CUDA_CHECK(cudaEventRecord(stop_event, 0));
    CUDA_CHECK(cudaEventSynchronize(stop_event));
    CUDA_CHECK(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
    printf("non-streamed: %.2f ms\n", elapsed_time / nreps);

    /* Verify the result of the non-streamed loop. */
    int passed = correct_data(a, n, c * nreps * niterations);
    if (passed)
        printf("passed\n");
    else
        printf("failed\n");

    /* Release resources. */
    for (int i = 0; i < nstreams; i++)
        CUDA_CHECK(cudaStreamDestroy(streams[i]));
    free(streams);
    CUDA_CHECK(cudaEventDestroy(start_event));
    CUDA_CHECK(cudaEventDestroy(stop_event));
    CUDA_CHECK(cudaFreeHost(a));
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_c));

    /* cudaDeviceReset replaces the deprecated cudaThreadExit. */
    CUDA_CHECK(cudaDeviceReset());
    return passed ? 0 : 1;
}