zoukankan      html  css  js  c++  java
  • 0_Simple__asyncAPI

    ▶ CPU - GPU 异步操作

    ▶ 源代码

     1 #include <stdio.h>
     2 #include <cuda_runtime.h>
     3 #include "device_launch_parameters.h"
     4 #include <helper_cuda.h>
     5 #include <helper_functions.h>
     6 
     // Adds inc_value to one element of g_data per thread.
     // Each thread updates the element at its flat global index
     // (blockIdx.x * blockDim.x + threadIdx.x). There is no bounds check, so the
     // launch must not create more threads than g_data has elements.
     __global__ void increment_kernel(int *g_data, int inc_value)
     {
         const int tid = blockIdx.x * blockDim.x + threadIdx.x;
         g_data[tid] += inc_value;
     }
    12 
    // Checks that each of the first n elements of data equals x.
    // On the first mismatch, prints its index, the value found and the expected
    // value, then returns false. Returns true when all n elements match (which
    // includes n <= 0, because the loop body never runs).
    bool correct_output(int *data, const int n, const int x)
    {
        for (int i = 0; i < n; i++)
        {
            if (data[i] != x)
            {
                // The format string must end in the "\n" escape; a raw line break
                // inside the literal does not compile.
                printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);
                return false;
            }
        }
        return true;
    }
    25 
    // Demonstrates CPU/GPU asynchrony: queues a host-to-device copy, a kernel and
    // a device-to-host copy on stream 0, then counts how many times the CPU can
    // poll a CUDA event before the GPU work finishes.
    // Returns 0 when the result verifies, 1 otherwise. Any failing CUDA call is
    // reported by checkCudaErrors (from helper_cuda.h), which exits the program.
    int main(int argc, char *argv[])
    {
        printf("Start.\n");
        int devID = findCudaDevice(argc, (const char **)argv);  // device is chosen from the command-line arguments, which may be empty
        cudaDeviceProp deviceProps;
        checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
        printf("CUDA device [%s]\n", deviceProps.name);

        const int n = 16 * 1024 * 1024;
        const int nbytes = n * sizeof(int);
        const int value = 26;

        int *a = NULL, *d_a = NULL;
        checkCudaErrors(cudaMallocHost((void **)&a, nbytes));  // host buffer; released with cudaFreeHost below
        checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
        memset(a, 0, nbytes);
        checkCudaErrors(cudaMemset(d_a, 255, nbytes));         // fill every device byte with 0xFF; the H2D copy below overwrites it

        cudaEvent_t start, stop;                // GPU-side timer
        checkCudaErrors(cudaEventCreate(&start));
        checkCudaErrors(cudaEventCreate(&stop));

        StopWatchInterface *timer = NULL;       // CPU-side timer
        sdkCreateTimer(&timer);
        sdkResetTimer(&timer);

        // n is a multiple of threads.x, so the grid covers exactly n elements;
        // increment_kernel itself has no bounds check.
        dim3 threads = dim3(512, 1, 1);
        dim3 blocks = dim3(n / threads.x, 1, 1);

        // The GPU event timer is nested inside the CPU timer. The CPU timer only
        // measures how long it takes to issue the calls below.
        sdkStartTimer(&timer);
        checkCudaErrors(cudaEventRecord(start, 0));
        checkCudaErrors(cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0));
        increment_kernel<<<blocks, threads, 0, 0>>>(d_a, value);
        checkCudaErrors(cudaGetLastError());    // a kernel launch returns nothing; launch errors are read here
        checkCudaErrors(cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0));
        checkCudaErrors(cudaEventRecord(stop, 0));
        sdkStopTimer(&timer);

        // Count how many loop iterations the CPU completes before the stop event
        // is reached.
        unsigned long int counter = 0;
        cudaError_t status;
        while ((status = cudaEventQuery(stop)) == cudaErrorNotReady)
            counter++;
        // The loop also ends on any real error; report it instead of timing and
        // verifying garbage.
        checkCudaErrors(status);

        // The stop event has completed, so the elapsed time can be read now.
        float gpu_time = 0.0f;
        checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));

        printf("time spent by GPU: %.2f\n", gpu_time);
        printf("time spent by CPU: %.2f\n", sdkGetTimerValue(&timer));
        printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);
        const bool passed = correct_output(a, n, value);
        printf("\n\tFinish: %s.", passed ? "Pass" : "Fail");

        checkCudaErrors(cudaEventDestroy(start));
        checkCudaErrors(cudaEventDestroy(stop));
        checkCudaErrors(cudaFreeHost(a));
        checkCudaErrors(cudaFree(d_a));
        sdkDeleteTimer(&timer);                 // release the timer created by sdkCreateTimer
        getchar();
        return passed ? 0 : 1;
    }

    ● 输出结果:

    GPU Device 0: "GeForce GTX 1070" with compute capability 6.1
    
    CUDA device [GeForce GTX 1070]
    time spent by GPU: 11.50
    time spent by CPU: 0.05
    CPU executed 3026 iterations while waiting for GPU to finish
    
            Finish!

    ▶ 新姿势:

    ● 调用主函数时的第0个参数作为程序名字符串,可以用于输出。

    1 int main(int argc, char *argv[])
    2 ...
    3 printf("%s", argv[0]);

    ● 在没有附加 flag 的情况下申请主机内存,注意使用cudaFreeHost释放

    1 int *a, nbytes = n * sizeof(int);
    2 cudaMallocHost((void **)&a, nbytes);
    3 ...
    4 cudaFreeHost(a);

    ● 记录 CPU 调用 CUDA 所用的时间

    1 StopWatchInterface *timer = NULL;
    2 sdkCreateTimer(&timer);
    3 sdkResetTimer(&timer);
    4 sdkStartTimer(&timer);
    5     
    6 ...// 核函数调用
    7     
    8 sdkStopTimer(&timer);
    9 printf("%.2f ms", sdkGetTimerValue(&timer));

    ● 查看GPU队列状态的函数

    extern __host__ cudaError_t CUDARTAPI cudaEventQuery(cudaEvent_t event);

    ■ stop 是放入流中的一个事件,cudaEventQuery(stop) 返回该事件的状态:返回 cudaSuccess(值为 0)表示该事件之前的工作已经全部完成;返回 cudaErrorNotReady 表示尚未完成(其数值在旧版 CUDA 中为 35,自 CUDA 10.1 起为 600,因此应与枚举名比较,而不要与具体数值比较)。源代码利用这段等待时间让 CPU 空转,并记录了循环的迭代次数。

    while (cudaEventQuery(stop) == cudaErrorNotReady) counter++;

    ● stdlib.h 中关于返回成功和失败的宏

    1 #define EXIT_SUCCESS 0
    2 #define EXIT_FAILURE  1

    ● 示例文件中的错误检查函数(定义在 helper_cuda.h 中):报告出错的文件、行号以及出错的调用表达式(由 #val 字符串化得到),然后调用 cudaDeviceReset 重置设备并退出程序。

     // Excerpt of the error-checking helper from helper_cuda.h.
     // DEVICE_RESET is defined before check() because macros are expanded
     // textually: a definition placed after the template would not be applied
     // inside it.
     #define DEVICE_RESET  cudaDeviceReset();

     // Reports a failed call and terminates the program.
     //   result - status returned by the call; any non-zero value is a failure
     //   func   - text of the checked expression (supplied by checkCudaErrors via #val)
     //   file   - source file of the call site
     //   line   - source line of the call site
     // On failure, prints a diagnostic to stderr, resets the CUDA device and
     // exits with EXIT_FAILURE. Does nothing when result is zero.
     template< typename T >
     void check(T result, char const *const func, const char *const file, int const line)
     {
         if (result)
         {
             // The quotes around the expression text must be escaped, and the
             // message ends with the "\n" escape.
             fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n",
                 file, line, static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
             DEVICE_RESET  // Make sure we call CUDA Device Reset before exiting
             exit(EXIT_FAILURE);
         }
     }

     // Wraps a call so that check() receives the call's text and its source location.
     #define checkCudaErrors(val)  check((val), #val, __FILE__, __LINE__)
  • 相关阅读:
    路飞学城Python-Day142
    路飞学城Python-Day141
    路飞学城Python-Day140
    路飞学城Python-Day136
    路飞学城Python-Day137
    路飞学城Python-Day117
    java基础知识总结
    Maven
    MySql实现分页查询
    js中的正则表达式入门
  • 原文地址:https://www.cnblogs.com/cuancuancuanhao/p/7723570.html
Copyright © 2011-2022 走看看