zoukankan      html  css  js  c++  java
  • GPU和CPU耗时统计方法

    GPU端耗时统计

     // Measure the time of the GPU path (device allocation, host-to-device copy,
     // kernel launch, device-to-host copy) with a pair of CUDA events.
     // Relies on names defined outside this snippet: h_idata, mem_size,
     // num_threads, testKernel, checkCudaErrors, getLastCudaError.
     cudaEvent_t start, stop;
     checkCudaErrors(cudaEventCreate(&start));
     checkCudaErrors(cudaEventCreate(&stop));
     // Drain any previously issued work so it is not attributed to this measurement.
     checkCudaErrors(cudaDeviceSynchronize());

     float gpu_time = 0.0f;
     // Record the start event in stream 0; check the result like every other API call.
     checkCudaErrors(cudaEventRecord(start, 0));

     // Allocate device memory for the input.
     float *d_idata;
     checkCudaErrors(cudaMalloc((void **) &d_idata, mem_size));

     // Copy the host input into device memory.
     checkCudaErrors(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));

     // Allocate device memory for the result.
     float *d_odata;
     checkCudaErrors(cudaMalloc((void **) &d_odata, mem_size));

     // Execution configuration: one block of num_threads threads.
     dim3 grid(1, 1, 1);
     dim3 threads(num_threads, 1, 1);

     // Launch the kernel. The launch arguments are the grid dimensions, the block
     // dimensions, and the number of bytes of dynamic shared memory per block.
     testKernel<<< grid, threads, mem_size >>>(d_idata, d_odata);

     // Check the kernel launch status.
     getLastCudaError("Kernel execution failed");

     // Allocate host memory for the result; fail loudly instead of handing a
     // null destination pointer to cudaMemcpy.
     float *h_odata = (float *) malloc(mem_size);
     if (h_odata == NULL)
     {
         fprintf(stderr, "Failed to allocate host memory for h_odata\n");
         exit(EXIT_FAILURE);
     }

     // Copy the result from the device back to the host.
     // NOTE(review): the buffer holds mem_size bytes but the copy size is
     // sizeof(float) * num_threads; presumably the two are equal - confirm
     // against the definition of mem_size.
     checkCudaErrors(cudaMemcpy(h_odata, d_odata, sizeof(float) * num_threads,
                                cudaMemcpyDeviceToHost));

     checkCudaErrors(cudaEventRecord(stop, 0));

     // Poll until the stop event has completed, counting the iterations spent
     // waiting. The final status is kept so that a real error (anything other
     // than cudaErrorNotReady) is reported instead of silently ending the loop.
     unsigned long int counter = 0;
     cudaError_t query_status;
     while ((query_status = cudaEventQuery(stop)) == cudaErrorNotReady)
     {
         counter++;
     }
     checkCudaErrors(query_status);

     checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));
     printf("GPU执行耗时: %.2f (ms)\n", gpu_time);
     printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);

     // NOTE(review): start/stop, d_idata, d_odata and h_odata are not released
     // here; release them (cudaEventDestroy / cudaFree / free) once the code
     // that follows this snippet no longer needs them.

    CPU端耗时统计

     // Measure the time of the serial CPU reference computation with the
     // sdk*Timer stopwatch helpers. Relies on names defined outside this
     // snippet: h_idata, mem_size, num_threads, computeGold.
     StopWatchInterface *timer = NULL;
     sdkCreateTimer(&timer);
     sdkResetTimer(&timer);

     sdkStartTimer(&timer);
     // Compute the reference solution on the host. The allocation is kept
     // inside the timed region, as in the original snippet.
     float *reference = (float *) malloc(mem_size);
     if (reference == NULL)
     {
         // Fail loudly instead of passing a null output buffer to computeGold.
         fprintf(stderr, "Failed to allocate host memory for reference\n");
         exit(EXIT_FAILURE);
     }
     computeGold(reference, h_idata, num_threads);
     sdkStopTimer(&timer);
     printf("串行耗时:%f (ms)\n", sdkGetTimerValue(&timer));

     // NOTE(review): timer and reference are not released here; release them
     // once the code that follows this snippet no longer needs them.
  • 相关阅读:
    关于深浅克隆
    忙话codesmith
    SynchronizedDictionary线程安全的泛型版本
    自制定长的Queue
    白忙活
    发现一个不错的技术社区(水木清华)
    自已实现线程池
    Berkeley DB .net 进行添加和更新操作
    关于ThreadPool.RegisterWaitForSingleObject和WaitHandle的应用介绍
    1100内产生3个不重复的随机数
  • 原文地址:https://www.cnblogs.com/liangliangdetianxia/p/4198607.html
Copyright © 2011-2022 走看看