  • 0_Simple__matrixMulDrv

    Matrix multiplication computed through the CUDA Driver API.

    ▶ Source code:

      1 #include <stdio.h>
      2 
      3 #include <cuda.h>
      4 #include <builtin_types.h>
      5 #include <helper_cuda_drvapi.h>
      6 #include <helper_timer.h>
      7 #include "matrixMul.h"
      8 
      9 #define PTX_FILE "matrixMul_kernel64.ptx"
     10 #define CUBIN_FILE "matrixMul_kernel64.cubin"
     11 
     12 const bool use_64bit_memory_address = true;
     13 using namespace std;
     14 
     15 CUdevice cuDevice;
     16 CUcontext cuContext;
     17 CUmodule cuModule;
     18 size_t totalGlobalMem;
     19 
     20 void constantInit(float *data, int size, float val)
     21 {
     22     for (int i = 0; i < size; ++i)
     23         data[i] = val;
     24 }
     25 
     26 bool inline findModulePath(const char *module_file, string &module_path, char **argv, string &ptx_source)
     27 {
     28     char *actual_path = sdkFindFilePath(module_file, argv[0]);// search relative to the executable path in argv[0]
     29 
     30     if (actual_path)
     31         module_path = actual_path;
     32     else
     33     {
     34         printf("> findModulePath file not found: <%s>\n", module_file);
     35         return false;
     36     }
     37 
     38     if (module_path.empty())
     39     {
     40         printf("> findModulePath file not found: <%s>\n", module_file);
     41         return false;
     42     }
     43     printf("> findModulePath <%s>\n", module_path.c_str());
     44 
     45     if (module_path.rfind(".ptx") != string::npos)
     46     {
     47         FILE *fp = fopen(module_path.c_str(), "rb");
     48         fseek(fp, 0, SEEK_END);
     49         int file_size = ftell(fp);
     50         char *buf = new char[file_size + 1];
     51         fseek(fp, 0, SEEK_SET);
     52         fread(buf, sizeof(char), file_size, fp);
     53         fclose(fp);
     54         buf[file_size] = '\0';
     55         ptx_source = buf;
     56         delete[] buf;
     57     }
     58     return true;
     59 }
     60 
     61 static CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul)
     62 {
     63     CUfunction cuFunction = 0;// holds the kernel function fetched from the module
     64     CUresult status;          // status of each call; on failure, jump to the end of the function with goto and bail out
     65     int major = 0, minor = 0;
     66     char deviceName[256];     // sized to match the 256-byte length passed to cuDeviceGetName below
     67     string module_path, ptx_source;
     68 
     69     cuDevice = findCudaDeviceDRV(argc, (const char **)argv);// pick the device named on the command line, or the one with the highest compute capability
     70     cuDeviceComputeCapability(&major, &minor, cuDevice);
     71     cuDeviceGetName(deviceName, 256, cuDevice);
     72     printf("> GPU Device has SM %d.%d compute capability\n", major, minor);
     73     cuDeviceTotalMem(&totalGlobalMem, cuDevice);            // query the total amount of device memory
     74     printf("  Total amount of global memory:     %llu bytes\n", (unsigned long long)totalGlobalMem);
     75     printf("  64-bit Memory Address:             %s\n", (totalGlobalMem > (unsigned long long)4 * 1024 * 1024 * 1024L) ? "YES" : "NO");
     76 
     77     status = cuCtxCreate(&cuContext, 0, cuDevice);          // create the context
     78     if (CUDA_SUCCESS != status)
     79         goto Error;
     80 
     81     if (!findModulePath(PTX_FILE, module_path, argv, ptx_source))// look for the module "matrixMul_kernel64.ptx"
     82     {
     83         if (!findModulePath(CUBIN_FILE, module_path, argv, ptx_source))// fall back to the module "matrixMul_kernel64.cubin"
     84         {
     85             printf("> findModulePath could not find <matrixMul_kernel> ptx or cubin\n");
     86             status = CUDA_ERROR_NOT_FOUND;
     87             goto Error;
     88         }
     89     }
     90     else
     91         printf("> initCUDA loading module: <%s>\n", module_path.c_str());
     92 
     93     if (module_path.rfind("ptx") != string::npos)
     94     {
     95         // in this branch we use compilation with parameters
     96         const unsigned int jitNumOptions = 3;
     97         CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
     98         void **jitOptVals = new void *[jitNumOptions];
     99 
    100         // set up size of compilation log buffer
    101         jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
    102         int jitLogBufferSize = 1024;
    103         jitOptVals[0] = (void *)(size_t)jitLogBufferSize;
    104 
    105         // set up pointer to the compilation log buffer
    106         jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
    107         char *jitLogBuffer = new char[jitLogBufferSize];
    108         jitOptVals[1] = jitLogBuffer;
    109 
    110         // set up pointer to set the Maximum # of registers for a particular kernel
    111         jitOptions[2] = CU_JIT_MAX_REGISTERS;
    112         int jitRegCount = 32;
    113         jitOptVals[2] = (void *)(size_t)jitRegCount;
    114 
    115         // JIT-compile the module from the PTX source
    116         status = cuModuleLoadDataEx(&cuModule, ptx_source.c_str(), jitNumOptions, jitOptions, (void **)jitOptVals);
    117 
    118         printf("> PTX JIT log:\n%s\n", jitLogBuffer);
    119     }
    120     else
    121         status = cuModuleLoad(&cuModule, module_path.c_str());
    122 
    123     if (CUDA_SUCCESS != status)
    124         goto Error;
    125 
    126     // fetch the kernel function
    127     if (totalGlobalMem > (unsigned long long)4 * 1024 * 1024 * 1024L)
    128         status = cuModuleGetFunction(&cuFunction, cuModule, "matrixMul_bs32_64bit");
    129     else
    130         status = cuModuleGetFunction(&cuFunction, cuModule, "matrixMul_bs32_32bit");
    131 
    132     if (CUDA_SUCCESS != status)
    133         goto Error;
    134     *pMatrixMul = cuFunction;
    135     return CUDA_SUCCESS;
    136 
    137 Error:
    138     cuCtxDestroy(cuContext);
    139     return status;
    140 }
    141 
    142 void runTest(int argc, char **argv)
    143 {
    144     int block_size = 32;
    145 
    146     // obtain the kernel function
    147     CUfunction matrixMul = NULL;// CUDA function handle
    148     CUresult error_id = initCUDA(argc, argv, &matrixMul);// initialize the Driver API and fetch the kernel
    149 
    150     // prepare the host data
    151     unsigned int size_A = WA * HA;
    152     unsigned int mem_size_A = sizeof(float) * size_A;
    153     float *h_A = (float *) malloc(mem_size_A);
    154     unsigned int size_B = WB * HB;
    155     unsigned int mem_size_B = sizeof(float) * size_B;
    156     float *h_B = (float *) malloc(mem_size_B);
    157     size_t size_C = WC * HC;
    158     size_t mem_size_C = sizeof(float) * size_C;
    159     float *h_C = (float *)malloc(mem_size_C);
    160     constantInit(h_A, size_A, 1.0f);    // matrix of all 1.0
    161     constantInit(h_B, size_B, 0.01f);   // matrix of all 0.01
    162 
    163     // on a 64-bit build, allocate four 1 GB blocks of device memory just to occupy address space; they serve no other purpose
    164     CUdeviceptr d_Mem[4];
    165     if (use_64bit_memory_address)
    166     {
    167         unsigned int mem_size = 1024*1024*1024;
    168         cuMemAlloc(&d_Mem[0], mem_size);
    169         cuMemAlloc(&d_Mem[1], mem_size);
    170         cuMemAlloc(&d_Mem[2], mem_size);
    171         cuMemAlloc(&d_Mem[3], mem_size);
    172     }
    173 
    174     CUdeviceptr d_A;
    175     cuMemAlloc(&d_A, mem_size_A);
    176     CUdeviceptr d_B;
    177     cuMemAlloc(&d_B, mem_size_B);
    178     CUdeviceptr d_C;
    179     cuMemAlloc(&d_C, mem_size_C);
    180     cuMemcpyHtoD(d_A, h_A, mem_size_A);
    181     cuMemcpyHtoD(d_B, h_B, mem_size_B);
    182 
    183     // timing
    184     StopWatchInterface *timer = NULL;
    185     sdkCreateTimer(&timer);
    186     sdkStartTimer(&timer);
    187 
    188     dim3 block(block_size, block_size, 1);
    189     dim3 grid(WC / block_size, HC / block_size, 1);
    190 
    191     // two ways of passing kernel arguments through the Driver API
    192     if (1)
    193     {                                       
    194         // with 64-bit addressing and enough device memory, pass the matrix widths as size_t, otherwise as int; the launch call itself is identical
    195         if (use_64bit_memory_address && (totalGlobalMem > (unsigned long long)4*1024*1024*1024L))
    196         {
    197             size_t Matrix_Width_A = (size_t)WA;
    198             size_t Matrix_Width_B = (size_t)WB;
    199             void *args[5] = { &d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
    200             // CUDA 4.0 Driver API launch: arguments passed via the kernelParams argument (second to last)
    201             cuLaunchKernel(matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
    202                            2 * block_size*block_size * sizeof(float), NULL, args, NULL);
    203         }
    204         else
    205         {
    206             int Matrix_Width_A = WA;
    207             int Matrix_Width_B = WB;
    208             void *args[5] = { &d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
    209             cuLaunchKernel(matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
    210                            2 * block_size*block_size * sizeof(float), NULL, args, NULL);
    211         }
    212     }
    213     else
    214     {
    215         int offset = 0;
    216         char argBuffer[256];// pack the same arguments as args above, in order, using offset as the running byte offset
    217 
    218         *((CUdeviceptr *)&argBuffer[offset]) = d_C;
    219         offset += sizeof(d_C);
    220         *((CUdeviceptr *)&argBuffer[offset]) = d_A;
    221         offset += sizeof(d_A);
    222         *((CUdeviceptr *)&argBuffer[offset]) = d_B;
    223         offset += sizeof(d_B);
    224 
    225         if (use_64bit_memory_address && (totalGlobalMem > (unsigned long long)4*1024*1024*1024L))
    226         {
    227             size_t Matrix_Width_A = (size_t)WA;
    228             size_t Matrix_Width_B = (size_t)WB;
    229             *((CUdeviceptr *)&argBuffer[offset]) = Matrix_Width_A;
    230             offset += sizeof(Matrix_Width_A);
    231             *((CUdeviceptr *)&argBuffer[offset]) = Matrix_Width_B;
    232             offset += sizeof(Matrix_Width_B);
    233         }
    234         else
    235         {
    236             int Matrix_Width_A = WA;
    237             int Matrix_Width_B = WB;
    238             *((int *)&argBuffer[offset]) = Matrix_Width_A;
    239             offset += sizeof(Matrix_Width_A);
    240             *((int *)&argBuffer[offset]) = Matrix_Width_B;
    241             offset += sizeof(Matrix_Width_B);
    242         }
    243 
    244         // wrap the packed buffer in a void * config array together with its size and a terminating macro
    245         void *kernel_launch_config[5] =
    246         {
    247             CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
    248             CU_LAUNCH_PARAM_BUFFER_SIZE,    &offset,
    249             CU_LAUNCH_PARAM_END
    250         };
    251 
    252         // CUDA 4.0 Driver API launch: arguments passed via the extra argument (last parameter)
    253         cuLaunchKernel(matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
    254                        2 * block_size*block_size * sizeof(float), NULL, NULL, (void **)&kernel_launch_config);
    255     }
    256 
    257     cuMemcpyDtoH((void *) h_C, d_C, mem_size_C);
    258 
    259     sdkStopTimer(&timer);
    260     printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    261     sdkDeleteTimer(&timer);
    262 
    263     // check the result
    264     printf("Checking computed result for correctness: ");
    265     bool correct = true;
    266     for (int i = 0; i < (int)(WC * HC); i++)
    267     {
    268         if (fabs(h_C[i] - (WA * 0.01f)) > 1e-5)
    269         {
    270             printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-5\n", i, h_C[i], WA*0.01f);
    271             correct = false;
    272         }
    273     }
    274     printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
    275     printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n");
    276 
    277     if (use_64bit_memory_address)
    278     {
    279         cuMemFree(d_Mem[0]);
    280         cuMemFree(d_Mem[1]);
    281         cuMemFree(d_Mem[2]);
    282         cuMemFree(d_Mem[3]);
    283     }
    284     free(h_A);
    285     free(h_B);
    286     free(h_C);
    287     cuMemFree(d_A);
    288     cuMemFree(d_B);
    289     cuMemFree(d_C);
    290     cuCtxDestroy(cuContext);
    291 }
    292 
    293 int main(int argc, char **argv)
    294 {
    295     printf("[ matrixMulDrv(Driver API) ]\n");
    296     runTest(argc, argv);
    297 
    298     getchar();
    299     return 0;
    300 }

    ▶ Output:

    [ matrixMulDrv (Driver API) ]
    > Using CUDA Device [0]: GeForce GTX 1070
    > GPU Device has SM 6.1 compute capability
    Total amount of global memory:     8589934592 bytes
    64-bit Memory Address:             YES
    sdkFindFilePath <matrixMul_kernel64.ptx> in ./
    sdkFindFilePath <matrixMul_kernel64.ptx> in ./../../bin/win64/Debug/matrixMulDrv_data_files/
    sdkFindFilePath <matrixMul_kernel64.ptx> in ./common/
    sdkFindFilePath <matrixMul_kernel64.ptx> in ./common/data/
    sdkFindFilePath <matrixMul_kernel64.ptx> in ./data/
    > findModulePath <./data/matrixMul_kernel64.ptx>
    > initCUDA loading module: <./data/matrixMul_kernel64.ptx>
    > PTX JIT log:
    
    Processing time: 0.568077 (ms)
    Checking computed result for correctness: Result = PASS
    
    NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.

    ▶ Takeaways:

    ● Contents of the header matrixMul.h:

     1 #ifndef _MATRIXMUL_H_
     2 #define _MATRIXMUL_H_
     3 
     4 // matrix dimensions used in the computation
     5 #define WA        (4 * block_size)
     6 #define HA         (6 * block_size)
     7 #define WB        (4 * block_size)
     8 #define HB WA
     9 #define WC WB
    10 #define HC HA
    11 
    12 #endif // _MATRIXMUL_H_
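
    ● The kernel itself only appears in this sample as precompiled PTX / cubin (such a file is typically produced with something like nvcc -ptx matrixMul_kernel.cu -o matrixMul_kernel64.ptx). As a reference for what the host code is actually launching, below is a minimal sketch of a 32×32 shared-memory tiled multiply with the same argument list and the same dynamic shared-memory budget (2 * block_size * block_size * sizeof(float)); it is an assumption about the kernel's shape, not the shipped source.

    // Hypothetical sketch of matrixMul_bs32_64bit: a standard 32x32 tiled multiply
    // using dynamic shared memory. Not the shipped kernel source.
    #define BS 32

    extern "C" __global__ void matrixMul_bs32_64bit(float *C, float *A, float *B,
                                                    size_t wA, size_t wB)
    {
        // dynamic shared memory: one BS x BS tile of A followed by one tile of B,
        // matching the 2 * block_size * block_size * sizeof(float) passed at launch
        extern __shared__ float smem[];
        float *As = smem;
        float *Bs = smem + BS * BS;

        size_t tx = threadIdx.x, ty = threadIdx.y;
        size_t aBegin = wA * BS * blockIdx.y;   // first A tile handled by this block
        size_t aEnd   = aBegin + wA - 1;        // last A tile
        size_t bBegin = BS * blockIdx.x;        // first B tile handled by this block

        float Csub = 0.0f;
        for (size_t a = aBegin, b = bBegin; a <= aEnd; a += BS, b += BS * wB)
        {
            As[ty * BS + tx] = A[a + wA * ty + tx];   // stage one tile of A
            Bs[ty * BS + tx] = B[b + wB * ty + tx];   // stage one tile of B
            __syncthreads();

            for (int k = 0; k < BS; ++k)              // multiply the two tiles
                Csub += As[ty * BS + k] * Bs[k * BS + tx];
            __syncthreads();
        }

        // each thread writes one element of C
        C[wB * BS * blockIdx.y + BS * blockIdx.x + wB * ty + tx] = Csub;
    }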

    ● Basic usage of the C++ string class

    1 using namespace std;
    2 
    3 string buf2;
    4 int n = 8;
    5 char *buf = new char[n];// dynamically sized char array, similar to malloc
    6 buf[n - 1] = '\0';      // terminate it by hand
    7 buf2 = buf;             // assign straight into the string
    8 delete[] buf;           // release the array, similar to free
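
    ● As an aside (not what the sample does), the read-a-whole-file-into-a-string job that findModulePath performs with fopen / fread can also be written without the manual buffer; a small sketch using ifstream:

    // Sketch of an alternative to the fopen/fread path in findModulePath:
    // read an entire file (e.g. the .ptx) straight into a std::string.
    #include <fstream>
    #include <sstream>
    #include <string>

    std::string readFileToString(const std::string &path)
    {
        std::ifstream in(path, std::ios::binary);   // open in binary mode, like "rb"
        std::ostringstream ss;
        ss << in.rdbuf();                           // stream the whole file into the buffer
        return ss.str();
    }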

    ● class StopWatchInterface, a timing class defined in helper_timer.h. Only its usage is shown here; its internals are discussed in detail in a separate post on the helper headers.

    1 StopWatchInterface *timer = NULL;   // pointer to the timer object
    2 sdkCreateTimer(&timer);             // create the timer
    3 sdkStartTimer(&timer);              // start timing
    4 
    5 ...                                 // kernel execution
    6 
    7 sdkStopTimer(&timer);               // stop timing
    8 sdkGetTimerValue(&timer);           // read the elapsed time (a float, in milliseconds)
    9 sdkDeleteTimer(&timer);             // destroy the timer
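
    ● If helper_timer.h is not at hand, plain std::chrono gives an equivalent host-side measurement; a sketch. Note that cuLaunchKernel returns before the kernel finishes, so synchronize (e.g. with cuCtxSynchronize) before stopping the clock; in the sample the synchronous cuMemcpyDtoH plays that role.

    #include <chrono>

    auto t0 = std::chrono::high_resolution_clock::now();
    // ... launch the kernel with cuLaunchKernel ...
    cuCtxSynchronize();                              // wait for the kernel to finish
    auto t1 = std::chrono::high_resolution_clock::now();
    double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
    printf("Processing time: %f (ms)\n", ms);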

    ● Various definitions from cuda.h

    typedef int CUdevice;                      // CUDA device handle (an int identifying the device)
    typedef struct CUfunc_st *CUfunction;      // CUDA function handle
    typedef struct CUmod_st *CUmodule;         // CUDA module handle
    typedef struct CUctx_st *CUcontext;        // CUDA context handle
    typedef enum cudaError_enum {...}CUresult; // CUDA error codes
    typedef unsigned long long CUdeviceptr;    // device pointer (an unsigned long long)
    
    CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev);// get the device name
    
    CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev);   // get the device compute capability
    
    inline CUdevice findCudaDeviceDRV(int argc, const char **argv);     // pick the device given on the command line, otherwise the one with the highest compute capability; calls cuInit(0) internally
    
    #define cuDeviceTotalMem cuDeviceTotalMem_v2                        // query the total amount of device memory
    CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev);
    
    #define cuMemAlloc cuMemAlloc_v2                                    // allocate device memory
    CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, size_t bytesize);    
    
    #define cuMemFree cuMemFree_v2                                      // free device memory
    CUresult CUDAAPI cuMemFree(CUdeviceptr dptr);
    
    CUresult CUDAAPI cuInit(unsigned int Flags);                        // mandatory initialization; must be called before creating a context, Flags can be 0
    
    #define cuCtxCreate cuCtxCreate_v2                                  // create a context
    CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
    
    #define cuCtxDestroy cuCtxDestroy_v2                                // destroy a context
    CUresult CUDAAPI cuCtxDestroy(CUcontext ctx);                       
    
    #define cuMemcpyHtoD __CUDA_API_PTDS(cuMemcpyHtoD_v2)   // Driver API counterpart of cudaMemcpy(..., cudaMemcpyHostToDevice)
    #define cuMemcpyDtoH __CUDA_API_PTDS(cuMemcpyDtoH_v2)   // Driver API counterpart of cudaMemcpy(..., cudaMemcpyDeviceToHost)
    #define __CUDA_API_PTDS(api) api
    
    // build module from the PTX image in memory, with numOptions options whose names are in options and whose values are in optionValues
    CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
    
    // load module from the file at path fname
    CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname);
    
    // fetch the function named name from module hmod and store it in the function handle hfunc
    CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
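
    ● To see these declarations in action on their own, here is a minimal sketch that only initializes the Driver API and queries the device, with no module or kernel involved (error checking omitted; cuDeviceGet, not listed above, turns the ordinal 0 into a CUdevice handle):

    #include <cstdio>
    #include <cuda.h>

    int main()
    {
        cuInit(0);                                   // must precede any other Driver API call

        CUdevice dev;
        cuDeviceGet(&dev, 0);                        // take device ordinal 0

        char name[256];
        int major = 0, minor = 0;
        size_t bytes = 0;
        cuDeviceGetName(name, sizeof(name), dev);
        cuDeviceComputeCapability(&major, &minor, dev);
        cuDeviceTotalMem(&bytes, dev);
        printf("%s, SM %d.%d, %zu bytes of global memory\n", name, major, minor, bytes);

        CUcontext ctx;
        cuCtxCreate(&ctx, 0, dev);                   // a context is required before cuMemAlloc and friends
        cuCtxDestroy(ctx);
        return 0;
    }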

    ● The code uses goto for error handling, following the pattern below. The upside is that every failure path in initCUDA exits through a single cleanup point; the downside is jumps scattered throughout the function.

     1 int function()
     2 {
     3     CUresult status;
     4 
     5     status = cudaFunction();
     6     if (status != CUDA_SUCCESS)// cudaFunction did not complete successfully
     7         goto Error;
     8 
     9     ...                         // normal execution continues
    10 
    11     return 0;                   // normal exit, return 0
    12 Error:                          
    13     return status;              // abnormal exit, return the first error code
    14 }
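
    ● An alternative to the goto pattern is a small checking macro that prints the failing call and the error name; a sketch, assuming cuGetErrorName (available since CUDA 6.0) may be used:

    #include <cstdio>
    #include <cstdlib>
    #include <cuda.h>

    // Abort with the call text and the error name when a Driver API call fails.
    #define CHECK_CU(call)                                                        \
        do {                                                                      \
            CUresult err_ = (call);                                               \
            if (err_ != CUDA_SUCCESS) {                                           \
                const char *name_ = NULL;                                         \
                cuGetErrorName(err_, &name_);                                     \
                fprintf(stderr, "%s failed: %s (%s:%d)\n", #call,                 \
                        name_ ? name_ : "unknown", __FILE__, __LINE__);           \
                exit(EXIT_FAILURE);                                               \
            }                                                                     \
        } while (0)

    // usage: CHECK_CU(cuCtxCreate(&cuContext, 0, cuDevice));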

    ● A condensed view of the Driver API workflow. The source in this post is long, but it compresses to the following, which makes the sequence of API calls easier to follow.

      1 #include <stdio.h>
      2 #include <cuda.h>
      3 #include <builtin_types.h>
      4 #include <helper_cuda_drvapi.h>
      5 #include <helper_timer.h>
      6 
      7 int main()
      8 {
      9     // handles and state
     10     CUdevice cuDevice = 0;
     11     CUcontext cuContext;
     12     CUmodule cuModule;
     13     CUfunction matrixMul = NULL;
     14     CUresult status;
     15     char module_path[] = "./data/matrixMul_kernel64.ptx";
     16     char ptx_source[63894];
     17 
     18     // create the context
     19     cuInit(0);
     20     status = cuCtxCreate(&cuContext, 0, cuDevice);
     21 
     22     // read the PTX source and fetch the kernel function
     23     FILE *fp = fopen(module_path, "rb");
     24     fseek(fp, 0, SEEK_END);
     25     int file_size = ftell(fp);
     26     fseek(fp, 0, SEEK_SET);
     27     fread(ptx_source, sizeof(char), file_size, fp);
     28     ptx_source[file_size] = '\0';
     29 
     30     // set the JIT compile options
     31     const unsigned int jitNumOptions = 3;
     32     CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
     33     void **jitOptVals = new void *[jitNumOptions];
     34 
     35     // size of the JIT log buffer
     36     jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
     37     int jitLogBufferSize = 1024;
     38     jitOptVals[0] = (void *)(size_t)jitLogBufferSize;
     39 
     40     // pointer to the JIT log buffer
     41     jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
     42     char *jitLogBuffer = new char[jitLogBufferSize];
     43     jitOptVals[1] = jitLogBuffer;
     44 
     45     // maximum register count per kernel
     46     jitOptions[2] = CU_JIT_MAX_REGISTERS;
     47     int jitRegCount = 32;
     48     jitOptVals[2] = (void *)(size_t)jitRegCount;
     49 
     50     // JIT-compile the module
     51     status = cuModuleLoadDataEx(&cuModule, ptx_source, jitNumOptions, jitOptions, (void **)jitOptVals);
     52     printf("\nPTX JIT log:\n%s\n", jitLogBuffer);
     53     status = cuModuleGetFunction(&matrixMul, cuModule, "matrixMul_bs32_64bit");
     54 
     55     // prepare the host data
     56     int block_size = 32;
     57     int wa = 4 * block_size;
     58     int ha = 6 * block_size;
     59     int wb = 4 * block_size;
     60     int hb = wa;
     61     int wc = wb;
     62     int hc = ha;
     63 
     64     unsigned int size_A = wa * ha;
     65     unsigned int mem_size_A = sizeof(float) * size_A;
     66     float *h_A = (float *)malloc(mem_size_A);
     67     unsigned int size_B = wb * hb;
     68     unsigned int mem_size_B = sizeof(float) * size_B;
     69     float *h_B = (float *)malloc(mem_size_B);
     70     size_t size_C = wc * hc;
     71     size_t mem_size_C = sizeof(float) * size_C;
     72     float *h_C = (float *)malloc(mem_size_C);
     73 
     74     for (int i = 0; i < size_A; ++i)
     75         h_A[i] = 1.0f;
     76     for (int i = 0; i < size_B; ++i)
     77         h_B[i] = 0.01f;
     78 
     79     CUdeviceptr d_A;
     80     cuMemAlloc(&d_A, mem_size_A);
     81     CUdeviceptr d_B;
     82     cuMemAlloc(&d_B, mem_size_B);
     83     CUdeviceptr d_C;
     84     cuMemAlloc(&d_C, mem_size_C);
     85     cuMemcpyHtoD(d_A, h_A, mem_size_A);
     86     cuMemcpyHtoD(d_B, h_B, mem_size_B);
     87 
     88     dim3 block(block_size, block_size, 1);
     89     dim3 grid(wc / block_size, hc / block_size, 1);
     90 
     91     // two ways of passing kernel arguments through the Driver API
     92     if (1)
     93     {
     94         size_t Matrix_Width_A = (size_t)wa;
     95         size_t Matrix_Width_B = (size_t)wb;
     96         void *args[5] = { &d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B };
     97         // CUDA 4.0 Driver API launch: arguments passed via the kernelParams argument (second to last)
     98         cuLaunchKernel(matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
     99             2 * block_size*block_size * sizeof(float), NULL, args, NULL);
    100     }
    101     else
    102     {
    103         int offset = 0;
    104         char argBuffer[256];// pack the same arguments as args above, in order, using offset as the running byte offset
    105 
    106         *((CUdeviceptr *)&argBuffer[offset]) = d_C;
    107         offset += sizeof(d_C);
    108         *((CUdeviceptr *)&argBuffer[offset]) = d_A;
    109         offset += sizeof(d_A);
    110         *((CUdeviceptr *)&argBuffer[offset]) = d_B;
    111         offset += sizeof(d_B);
    112         size_t Matrix_Width_A = (size_t)wa;
    113         size_t Matrix_Width_B = (size_t)wb;
    114         *((CUdeviceptr *)&argBuffer[offset]) = Matrix_Width_A;
    115         offset += sizeof(Matrix_Width_A);
    116         *((CUdeviceptr *)&argBuffer[offset]) = Matrix_Width_B;
    117         offset += sizeof(Matrix_Width_B);
    118 
    119         // wrap the packed buffer in a void * config array together with its size and a terminating macro
    120         void *kernel_launch_config[5] =
    121         {
    122             CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
    123             CU_LAUNCH_PARAM_BUFFER_SIZE,    &offset,
    124             CU_LAUNCH_PARAM_END
    125         };
    126 
    127         // CUDA 4.0 Driver API launch: arguments passed via the extra argument (last parameter)
    128         cuLaunchKernel(matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
    129             2 * block_size*block_size * sizeof(float), NULL, NULL, (void **)&kernel_launch_config);
    130     }
    131 
    132     cuMemcpyDtoH((void *)h_C, d_C, mem_size_C);
    133 
    134     // check the result
    135     printf("Checking computed result for correctness: ");
    136     bool correct = true;
    137     for (int i = 0; i < (int)(wc * hc); i++)
    138     {
    139         if (fabs(h_C[i] - (wa * 0.01f)) > 1e-5)
    140         {
    141             printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-5\n", i, h_C[i], wa*0.01f);
    142             correct = false;
    143         }
    144     }
    145     printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
    146 
    147     free(h_A);
    148     free(h_B);
    149     free(h_C);
    150     cuMemFree(d_A);
    151     cuMemFree(d_B);
    152     cuMemFree(d_C);
    153     cuCtxDestroy(cuContext);
    154 
    155     getchar();
    156     return 0;
    157 }
  • Original post: https://www.cnblogs.com/cuancuancuanhao/p/7774220.html