zoukankan      html  css  js  c++  java
  • cuda基础--流

    1:运行时API实现

      事件管理可以用于测量程序(例如 kernel 或内存拷贝)的运行时间,也可以用于协调、同步 CPU 与 GPU 的并行执行。

     1 // Create the two events that bracket the region to be timed
     2 cudaEvent_t start,stop;
     3 cudaEventCreate(&start);
     4 cudaEventCreate(&stop);
     5 
     6 cudaEventRecord(start,0); // record the start event (second argument 0 = stream 0)
     7 // do something (the work being timed goes here)
     8 cudaEventRecord(stop,0); // record the stop event after the timed work
     9 cudaEventSynchronize(stop); // wait on the host until the stop event has completed
    10 float elapsedTime; // receives the time elapsed between start and stop
    11 cudaEventElapsedTime(&elapsedTime,start,stop);
    12 
    13 cudaEventDestroy(start); // release both events once the measurement is read
    14 cudaEventDestroy(stop);

    2:驱动API实现

     // Driver-API version of the same timing pattern.
     // NOTE(review): assumes cuInit() has been called and a context is current.
     CUevent start,stop;
     // cuEventCreate takes a flags argument; CU_EVENT_DEFAULT creates a normal timing event
     cuEventCreate(&start,CU_EVENT_DEFAULT);
     cuEventCreate(&stop,CU_EVENT_DEFAULT);

     cuEventRecord(start,0); // record the start event in stream 0
     // do something (the work being timed goes here)
     cuEventRecord(stop,0); // record the stop event after the timed work
     cuEventSynchronize(stop); // wait on the host until the stop event has completed
     float elapsedTime; // receives the time elapsed between start and stop
     cuEventElapsedTime(&elapsedTime,start,stop);

     cuEventDestroy(start);
     cuEventDestroy(stop);

    3:simpleStream实例

      #include <stdio.h>
      #include <stdlib.h>
      #include <string.h>
      #include <cutil_inline.h>
      3 
      /*
       * Kernel: each thread adds *factor to its own element of g_data,
       * num_iterations times.
       *
       * g_data         - device array, one int per launched thread
       * factor         - device pointer to the value that is added
       * num_iterations - number of times the addition is repeated per element
       *
       * Launch layout: 1D grid of 1D blocks. There is no bounds check, so the
       * launch must cover the array exactly (gridDim.x * blockDim.x == number
       * of elements); any extra thread would write out of bounds.
       */
      __global__ void
      init_array(int *g_data,int* factor,int num_iterations)
      {
          int idx = blockIdx.x*blockDim.x+threadIdx.x;
          // Loop on the num_iterations parameter (the original used an undeclared 'n')
          for(int i = 0;i < num_iterations;i++)
              g_data[idx] += *factor;
      }
     11 
     /*
      * Host-side verification: checks that every one of the n elements of a
      * equals c.
      *
      * a - host array to verify
      * n - number of elements in a
      * c - expected value of every element
      *
      * Returns 1 when all elements match (including n == 0), otherwise prints
      * "index:actual expected" for the first mismatch and returns 0.
      */
     int correct_data(int *a,const int n,const int c)
     {
         for(int i = 0;i < n;i++)
         {
             if(a[i] != c)
             {
                 printf("%d:%d %d\n",i,a[i],c);
                 return 0;
             }
         }
         // Only reached after the whole array has been checked
         return 1;
     }
     24 
     25 
     /* Abort with a readable message when a CUDA runtime call fails. */
     #define CUDA_CHECK(call)                                                  \
         do {                                                                  \
             cudaError_t err_ = (call);                                        \
             if (err_ != cudaSuccess) {                                        \
                 fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                         cudaGetErrorString(err_));                            \
                 exit(EXIT_FAILURE);                                           \
             }                                                                 \
         } while (0)

     /*
      * simpleStreams-style timing demo.
      *
      * Usage: program [device_index]
      *
      * Times (1) a host-to-device copy, (2) a single kernel launch in
      * streams[0], and (3) nreps rounds of kernel + blocking device-to-host
      * copy in the default stream, then verifies the array contents.
      *
      * Returns 0 on success, 1 on a bad device index, a missing device, or a
      * failed verification.
      */
     int main(int argc,char* argv[])
     {
         int CUDA_device = 0;
         int nstreams = 4;                 // number of streams to create
         int nreps = 10;                   // repetitions of the outer timed loop
         int n = 16*1024*1024;             // number of array elements
         size_t nbytes = (size_t)n*sizeof(int);
         int niterations = 5;              // loop count inside the kernel
         dim3 threads,blocks;
         float elapsed_time = 0.0f,time_memcpy = 0.0f,time_kernel = 0.0f;

         if(argc > 1)
         {
             CUDA_device = atoi(argv[1]);
         }

         // Make sure the requested device exists
         int num_devices = 0;
         if(cudaGetDeviceCount(&num_devices) != cudaSuccess || 0 == num_devices)
         {
             printf("no device\n");
             return 1;
         }
         if(CUDA_device < 0 || CUDA_device >= num_devices)
         {
             printf("CUDA_device between 0 and %d\n", num_devices-1);
             return 1;
         }

         CUDA_CHECK(cudaSetDevice(CUDA_device));
         cudaDeviceProp device_properties;
         CUDA_CHECK(cudaGetDeviceProperties(&device_properties,CUDA_device));
         printf("running on:%s\n",device_properties.name);

         // Host allocation (pinned, as required for truly asynchronous copies)
         int c = 5;                        // value added by the kernel per iteration
         int *a = 0;
         CUDA_CHECK(cudaMallocHost((void**)&a,nbytes));
         memset(a,0,nbytes);               // do not upload uninitialized memory

         // Device allocation
         int *d_a = 0,*d_c = 0;
         CUDA_CHECK(cudaMalloc((void**)&d_a,nbytes));
         CUDA_CHECK(cudaMalloc((void**)&d_c,sizeof(int)));
         CUDA_CHECK(cudaMemcpy(d_c,&c,sizeof(int),cudaMemcpyHostToDevice));

         // Create the streams
         cudaStream_t* streams = (cudaStream_t*)malloc(nstreams*sizeof(cudaStream_t));
         if(streams == NULL)
         {
             fprintf(stderr,"out of host memory\n");
             return 1;
         }
         for(int i = 0;i < nstreams;i++)
         {
             CUDA_CHECK(cudaStreamCreate(&streams[i]));
         }

         // Create the timing events
         cudaEvent_t start_event,stop_event;
         CUDA_CHECK(cudaEventCreate(&start_event));
         CUDA_CHECK(cudaEventCreate(&stop_event));

         // Time the host-to-device copy. The events are recorded in stream 0;
         // NOTE(review): this brackets work in streams[0] only under the legacy
         // default-stream semantics, where stream 0 synchronizes with streams
         // created by cudaStreamCreate.
         CUDA_CHECK(cudaEventRecord(start_event,0));
         CUDA_CHECK(cudaMemcpyAsync(d_a,a,nbytes,cudaMemcpyHostToDevice,streams[0]));
         CUDA_CHECK(cudaEventRecord(stop_event,0));
         // The stop event must have completed before the elapsed time is read
         CUDA_CHECK(cudaEventSynchronize(stop_event));
         CUDA_CHECK(cudaEventElapsedTime(&time_memcpy,start_event,stop_event));
         printf("memcpy: %.2f ms\n",time_memcpy);

         // Time one kernel launch issued into a stream.
         // n is a multiple of threads.x, so the grid covers the array exactly
         // (the kernel has no bounds check).
         threads = dim3(512,1);
         blocks = dim3(n/threads.x,1);
         CUDA_CHECK(cudaEventRecord(start_event,0));
         init_array<<<blocks,threads,0,streams[0]>>>(d_a,d_c,niterations);
         CUDA_CHECK(cudaGetLastError());
         CUDA_CHECK(cudaEventRecord(stop_event,0));
         CUDA_CHECK(cudaEventSynchronize(stop_event));
         CUDA_CHECK(cudaEventElapsedTime(&time_kernel,start_event,stop_event));
         printf("kernel: %.2f ms\n",time_kernel);

         // Reset the device array so the final contents are exactly
         // c*nreps*niterations, which is what the verification expects.
         CUDA_CHECK(cudaMemset(d_a,0,nbytes));

         // Time nreps rounds of kernel + blocking copy without using streams
         threads = dim3(512,1);
         blocks = dim3(n/threads.x,1);
         CUDA_CHECK(cudaEventRecord(start_event,0));
         for(int i = 0;i < nreps;i++)
         {
             init_array<<<blocks,threads>>>(d_a,d_c,niterations);
             CUDA_CHECK(cudaGetLastError());
             CUDA_CHECK(cudaMemcpy(a,d_a,nbytes,cudaMemcpyDeviceToHost));
         }
         CUDA_CHECK(cudaEventRecord(stop_event,0));
         CUDA_CHECK(cudaEventSynchronize(stop_event));
         CUDA_CHECK(cudaEventElapsedTime(&elapsed_time,start_event,stop_event));
         printf("non-streamed: %.2f ms per repetition\n",elapsed_time/nreps);

         // Verify the result
         int passed = correct_data(a,n,c*nreps*niterations);
         if(passed)
             printf("passed\n");

         // Release resources
         for(int i = 0;i < nstreams;i++)
             CUDA_CHECK(cudaStreamDestroy(streams[i]));
         free(streams);
         CUDA_CHECK(cudaEventDestroy(start_event));
         CUDA_CHECK(cudaEventDestroy(stop_event));
         CUDA_CHECK(cudaFreeHost(a));
         CUDA_CHECK(cudaFree(d_a));
         CUDA_CHECK(cudaFree(d_c));

         // cudaThreadExit() is deprecated; cudaDeviceReset() is its replacement
         CUDA_CHECK(cudaDeviceReset());
         return passed ? 0 : 1;
     }
  • 相关阅读:
    java使用Websocket获取HttpSession出现的问题与解决
    java 静态导入 小结
    【编程思想笔记】内部类的初始化
    【搬运】Tea算法Java实现工具类
    OBS studio最新版配置鉴权推流
    debian 9 双显卡安装NVIDIA显卡驱动
    【学习笔记】Java finalize()的使用
    【学习笔记】js下拉刷新、上拉加载 mescroll框架的使用
    [随笔] 简单操作解决Google chrome颜色显示不正常的情况
    【自制工具类】struts返回json数据包装格式类
  • 原文地址:https://www.cnblogs.com/pengtangtang/p/13083913.html
Copyright © 2011-2022 走看看