zoukankan      html  css  js  c++  java
  • 0_Simple__matrixMul + 0_Simple__matrixMul_nvrtc

    矩阵乘法,使用一维线程块和共享内存。并且在静态代码和运行时编译两种条件下使用。

    ▶ 源代码:静态使用

      1 #include <stdio.h>
      2 #include <assert.h>
      3 #include <cuda_runtime.h>
      4 #include "device_launch_parameters.h"
      5 #include <helper_functions.h>
      6 #include <helper_cuda.h>
      7 
      8 template <int BLOCK_SIZE> __global__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
      9 {
     10     int bx = blockIdx.x;
     11     int by = blockIdx.y;
     12     int tx = threadIdx.x;
     13     int ty = threadIdx.y;
     14 
     15     int aBegin = wA * BLOCK_SIZE * by;  // A的行程起点
     16     int aEnd   = aBegin + wA - 1;       // A的行程终点
     17     int aStep  = BLOCK_SIZE;            // A的跨度(一个 block 为宽 BLOCK_SIZE 的一维条带,各线程分别对应其中的一个元素)
     18     int bBegin = BLOCK_SIZE * bx;       // B的行程起点
     19     int bStep  = BLOCK_SIZE * wB;       // B的跨度(一个 block 为高 BLOCK_SIZE 的一维条带,各线程分别对应其中的一个元素)
     20     float Csub = 0;
     21     
     22     for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
     23     {
     24         __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
     25         __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
     26         As[ty][tx] = A[a + wA * ty + tx];
     27         Bs[ty][tx] = B[b + wB * ty + tx];
     28         __syncthreads();
     29 
     30 #pragma unroll// 循环展开为 BLOCK_SIZE 个赋值语句,提高效率
     31         for (int k = 0; k < BLOCK_SIZE; ++k)
     32             Csub += As[ty][k] * Bs[k][tx];
     33         __syncthreads();
     34     }
     35 
     36     int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
     37     C[c + wB * ty + tx] = Csub;
     38 }
     39 
     40 void constantInit(float *data, int size, float val)
     41 {
     42     for (int i = 0; i < size; ++i)
     43         data[i] = val;
     44 }
     45 
     46 int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB)
     47 {
     48     unsigned int size_A = dimsA.x * dimsA.y;
     49     unsigned int mem_size_A = sizeof(float) * size_A;
     50     float *h_A = (float *)malloc(mem_size_A);
     51     unsigned int size_B = dimsB.x * dimsB.y;
     52     unsigned int mem_size_B = sizeof(float) * size_B;
     53     float *h_B = (float *)malloc(mem_size_B);
     54     constantInit(h_A, size_A, 1.0f);
     55     constantInit(h_B, size_B, 0.01f);
     56     dim3 dimsC(dimsB.x, dimsA.y, 1);
     57     unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
     58     float *h_C = (float *) malloc(mem_size_C);
     59     float *d_A, *d_B, *d_C;
     60     cudaMalloc((void **) &d_A, mem_size_A);
     61     cudaMalloc((void **) &d_B, mem_size_B);
     62     cudaMalloc((void **) &d_C, mem_size_C);
     63     cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
     64     cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
     65 
     66     // 热身
     67     dim3 threads(block_size, block_size);
     68     dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);
     69     if (block_size == 16)
     70         matrixMulCUDA<16><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
     71     else
     72         matrixMulCUDA<32><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
     73     printf("done
    ");
     74     cudaDeviceSynchronize();
     75 
     76     printf("Computing result using CUDA Kernel...
    ");
     77     cudaEvent_t start;
     78     cudaEventCreate(&start);
     79     cudaEvent_t stop;
     80     cudaEventCreate(&stop);
     81     cudaEventRecord(start, NULL);
     82 
     83     int nIter = 300;
     84     for (int j = 0; j < nIter; j++)
     85     {
     86         if (block_size == 16)
     87             matrixMulCUDA<16><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
     88         else
     89             matrixMulCUDA<32><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
     90     }
     91     cudaEventRecord(stop, NULL);
     92     cudaEventSynchronize(stop);
     93 
     94     float msecTotal = 0.0f;
     95     cudaEventElapsedTime(&msecTotal, start, stop);
     96     float msecPerMatrixMul = msecTotal / nIter;
     97     double flopsPerMatrixMul = 2.0 * (double)dimsA.x * (double)dimsA.y * (double)dimsB.x;
     98     double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
     99     printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops, WorkgroupSize= %u threads/block
    ",
    100         gigaFlops, msecPerMatrixMul, flopsPerMatrixMul, threads.x * threads.y);
    101     cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost);
    102     
    103     // 检查结果,要求相对误差:|<x, y>_cpu - <x,y>_gpu| / <|x|, |y|>  < eps    
    104     printf("Checking computed result for correctness: ");
    105     bool correct = true;
    106     double eps = 1.e-6 ; // machine zero
    107     for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++)
    108     {
    109         double abs_err = fabs(h_C[i] - (dimsA.x * valB));
    110         double dot_length = dimsA.x;
    111         double abs_val = fabs(h_C[i]);
    112         double rel_err = abs_err/abs_val/dot_length ;
    113         if (rel_err > eps)
    114         {
    115             printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E
    ", i, h_C[i], dimsA.x*valB, eps);
    116             correct = false;
    117         }
    118     }
    119     printf("%s
    ", correct ? "Result = PASS" : "Result = FAIL");
    120 
    121     free(h_A);
    122     free(h_B);
    123     free(h_C);
    124     cudaFree(d_A);
    125     cudaFree(d_B);
    126     cudaFree(d_C);
    127     printf("
    NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.
    ");
    128     if (correct)
    129         return EXIT_SUCCESS;
    130     else
    131         return EXIT_FAILURE;
    132 }
    133 
    134 int main(int argc, char **argv)
    135 {
    136     printf("[Matrix Multiply Using CUDA] - Starting...
    ");
    137 
    138     if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?"))
    139     {
    140         printf("Usage -device=n (n >= 0 for deviceID)
    ");
    141         printf("      -wA=WidthA -hA=HeightA (Width x Height of Matrix A)
    ");
    142         printf("      -wB=WidthB -hB=HeightB (Width x Height of Matrix B)
    ");
    143         printf("  Note: Outer matrix dimensions of A & B matrices must be equal.
    ");
    144         exit(EXIT_SUCCESS);
    145     }
    146 
    147     int devID = 0;// 指定设备,默认用0号设备
    148     if (checkCmdLineFlag(argc, (const char **)argv, "device"))
    149     {
    150         devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
    151         cudaSetDevice(devID);
    152     }
    153     cudaDeviceProp deviceProp;
    154     cudaGetDevice(&devID);
    155     cudaGetDeviceProperties(&deviceProp, devID);
    156 
    157     if (deviceProp.computeMode == cudaComputeModeProhibited)
    158     {
    159         fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().
    ");
    160         exit(EXIT_SUCCESS);
    161     }
    162 
    163     int block_size = (deviceProp.major < 2) ? 16 : 32;
    164 
    165     dim3 dimsA(5*2*block_size, 5*2*block_size, 1);
    166     dim3 dimsB(5*4*block_size, 5*2*block_size, 1);
    167 
    168     // 使用命令行指定的A、B的维度参数
    169     if (checkCmdLineFlag(argc, (const char **)argv, "wA"))
    170         dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
    171     if (checkCmdLineFlag(argc, (const char **)argv, "hA"))
    172         dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
    173     if (checkCmdLineFlag(argc, (const char **)argv, "wB"))
    174         dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
    175     if (checkCmdLineFlag(argc, (const char **)argv, "hB"))
    176         dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
    177     if (dimsA.x != dimsB.y)
    178     {
    179         printf("Error: outer matrix dimensions must be equal. (%d != %d)
    ",
    180                dimsA.x, dimsB.y);
    181         exit(EXIT_FAILURE);
    182     }
    183     printf("MatrixA(%d,%d), MatrixB(%d,%d)
    ", dimsA.x, dimsA.y, dimsB.x, dimsB.y);
    184 
    185     int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB);
    186 
    187     getchar();
    188     exit(matrix_result);
    189 }

    ▶ 源代码:运行时编译

     1 /*matrixMul_kernel.cu*/
     2 template <int BLOCK_SIZE> __device__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
     3 {
     4     int bx = blockIdx.x;
     5     int by = blockIdx.y;
     6     int tx = threadIdx.x;
     7     int ty = threadIdx.y;
     8     int aBegin = wA * BLOCK_SIZE * by;
     9     int aEnd   = aBegin + wA - 1;
    10     int aStep  = BLOCK_SIZE;
    11     int bBegin = BLOCK_SIZE * bx;
    12     int bStep = BLOCK_SIZE * wB;
    13     float Csub = 0;
    14     for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
    15     {
    16         __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
    17         __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
    18         As[ty][tx] = A[a + wA * ty + tx];
    19         Bs[ty][tx] = B[b + wB * ty + tx];
    20         __syncthreads();
    21 #pragma unroll
    22         for (int k = 0; k < BLOCK_SIZE; ++k)
    23             Csub += As[ty][k] * Bs[k][tx];
    24         __syncthreads();
    25     }
    26     int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    27     C[c + wB * ty + tx] = Csub;
    28 }
    29 
    30 extern "C" __global__ void  matrixMulCUDA_block16(float *C, float *A, float *B, int wA, int wB)
    31 {
    32     matrixMulCUDA<16>(C,A,B,wA,wB);
    33 }
    34 
    35 extern "C" __global__ void  matrixMulCUDA_block32(float *C, float *A, float *B, int wA, int wB)
    36 {
    37     matrixMulCUDA<32>(C,A,B,wA,wB);
    38 }
      1 /*matrixMul.cpp*/
      2 #include <stdio.h>
      3 #include <assert.h>
      4 #include <cuda_runtime.h>
      5 #include "device_launch_parameters.h"
      6 #include "nvrtc_helper.h"
      7 #include <helper_functions.h>
      8 
      9 void constantInit(float *data, int size, float val)
     10 {
     11     for (int i = 0; i < size; ++i)
     12         data[i] = val;
     13 }
     14 
     15 int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB)
     16 {
     17     // Allocate host memory for matrices A and B
     18     unsigned int size_A = dimsA.x * dimsA.y;
     19     unsigned int mem_size_A = sizeof(float) * size_A;
     20     float *h_A = (float *)malloc(mem_size_A);
     21     unsigned int size_B = dimsB.x * dimsB.y;
     22     unsigned int mem_size_B = sizeof(float) * size_B;
     23     float *h_B = (float *)malloc(mem_size_B);
     24     const float valB = 0.01f;
     25     constantInit(h_A, size_A, 1.0f);
     26     constantInit(h_B, size_B, valB);
     27     CUdeviceptr d_A, d_B, d_C;
     28 
     29     char *ptx, *kernel_file;
     30     size_t ptxSize;
     31     kernel_file = sdkFindFilePath("matrixMul_kernel.cu", argv[0]);
     32     compileFileToPTX(kernel_file, 0, NULL, &ptx, &ptxSize);
     33     CUmodule module = loadPTX(ptx, argc, argv);
     34 
     35     dim3 dimsC(dimsB.x, dimsA.y, 1);
     36     unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
     37     float *h_C = (float *) malloc(mem_size_C);
     38     cuMemAlloc(&d_A, mem_size_A);
     39     cuMemAlloc(&d_B, mem_size_B);
     40     cuMemAlloc(&d_C, mem_size_C);
     41     cuMemcpyHtoD(d_A, h_A, mem_size_A);
     42     cuMemcpyHtoD(d_B, h_B, mem_size_B);
     43 
     44     dim3 threads(block_size, block_size);
     45     dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);
     46 
     47     printf("Computing result using CUDA Kernel...
    ");
     48 
     49     CUfunction kernel_addr;
     50     if (block_size == 16)
     51       cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block16");
     52     else
     53       cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block32");
     54 
     55     void *arr[] = { (void *)&d_C, (void *)&d_A, (void *)&d_B, (void *)&dimsA.x, (void *)&dimsB.x };
     56 
     57     // Execute the kernel
     58     int nIter = 300;
     59 
     60     for (int j = 0; j < nIter; j++)
     61     {
     62         cuLaunchKernel(kernel_addr,
     63             grid.x, grid.y, grid.z,
     64             threads.x, threads.y, threads.z,
     65             0, 0, &arr[0], 0);
     66         cuCtxSynchronize();
     67     }
     68     cuMemcpyDtoH(h_C, d_C, mem_size_C);
     69 
     70     printf("Checking computed result for correctness: ");
     71     bool correct = true;
     72     double eps = 1.e-6 ;
     73     for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++)
     74     {
     75         double abs_err = fabs(h_C[i] - (dimsA.x * valB);
     76         double dot_length = dimsA.x;
     77         double abs_val = fabs(h_C[i]);
     78         double rel_err = abs_err/abs_val/dot_length ;
     79         if (rel_err > eps)
     80         {
     81             printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E
    ", i, h_C[i], dimsA.x*valB, eps);
     82             correct = false;
     83         }
     84     }
     85     printf("%s
    ", correct ? "Result = PASS" : "Result = FAIL");
     86 
     87     printf("
    NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.
    ");
     88     free(h_A);
     89     free(h_B);
     90     free(h_C);
     91     cuMemFree(d_A);
     92     cuMemFree(d_B);
     93     cuMemFree(d_C);
     94     if (correct)
     95         return EXIT_SUCCESS;
     96     else
     97         return EXIT_FAILURE;
     98 }
     99 
    100 int main(int argc, char **argv)
    101 {
    102     printf("[Matrix Multiply Using CUDA] - Starting...
    ");
    103 
    104     if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?"))
    105     {
    106         printf("Usage -device=n (n >= 0 for deviceID)
    ");
    107         printf("      -wA=WidthA -hA=HeightA (Width x Height of Matrix A)
    ");
    108         printf("      -wB=WidthB -hB=HeightB (Width x Height of Matrix B)
    ");
    109         printf("  Note: Outer matrix dimensions of A & B matrices must be equal.
    ");
    110         exit(EXIT_SUCCESS);
    111     }
    112 
    113     int block_size = 32;
    114     dim3 dimsA(5*2*block_size, 5*2*block_size, 1);
    115     dim3 dimsB(5*4*block_size, 5*2*block_size, 1);
    116 
    117     if (checkCmdLineFlag(argc, (const char **)argv, "wA"))
    118         dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
    119     if (checkCmdLineFlag(argc, (const char **)argv, "hA"))
    120         dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
    121     if (checkCmdLineFlag(argc, (const char **)argv, "wB"))
    122         dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
    123     if (checkCmdLineFlag(argc, (const char **)argv, "hB"))
    124         dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
    125     if (dimsA.x != dimsB.y)
    126     {
    127         printf("Error: outer matrix dimensions must be equal. (%d != %d)
    ", dimsA.x, dimsB.y);
    128     }   exit(EXIT_FAILURE);
    129     printf("MatrixA(%d,%d), MatrixB(%d,%d)
    ", dimsA.x, dimsA.y, dimsB.x, dimsB.y);
    130 
    131     int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB);
    132 
    133     getchar();
    134     exit(matrix_result);
    135 }

    ▶ 输出结果:

    [Matrix Multiply Using CUDA] - Starting...
    GPU Device 0: "GeForce GTX 1070" with compute capability 6.1
    
    MatrixA(320,320), MatrixB(640,320)
    Computing result using CUDA Kernel...
    done
    Performance= 22.95 GFlop/s, Time= 5.712 msec, Size= 131072000 Ops, WorkgroupSize= 1024 threads/block
    Checking computed result for correctness: Result = PASS
    
    NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.

    ▶ 涨姿势:

    ● 程序写得很烂,各种声明、初始化杂糅。

    ● 一个根据cuda错误种类返回错误描述的函数

    extern __host__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error);

    ● 预编译命令展开循环

    1 #pragma unroll
    2 for (i = 0; i < m; i++)
    3     c[i] = a[i] + b[i];

    等价于

    1 c[0] = a[0] + b[0];
    2 c[1] = a[1] + b[1];
    3 c[2] = a[2] + b[2];
    4 ...
    5 c[m-1] = a[m-1] + b[m-1];

     #pragma unroll 命令后面可接数字,表明展开前多少次迭代,例如 #pragma unroll 4 

    ● 核函数泛型编程。可以在调用核函数时传入一个常量参数,变相使用动态数组来规定共享内存等数组的大小。

    1 template <int BLOCK_SIZE> __global__ void functionName(void)
    2 {
    3     __shared__ int shareArray[BLOCK_SIZE];
    4     ...
    5 }    
    6 
    7 cunctionName<16> << < blocksize, threadsize >> >();

    ● 热身,在多次重复实验前提前算一次。对缓存有帮助,有效减小实验结果(计算耗时)的方差。

  • 相关阅读:
    树形DP
    区间DP
    洛谷P1462 通往奥格瑞玛的道路
    缓存--Redis
    Flack--SQLAlchemy
    Flask--WTForms
    Flask框架
    通过反射,获取linkedHashMap的最后一个键值对。对map按照值进行排序。
    Comparable和Comparator的使用
    构造函数,构造代码块,静态函数的执行顺序
  • 原文地址:https://www.cnblogs.com/cuancuancuanhao/p/7745509.html
Copyright © 2011-2022 走看看