  • 2.3 CUDA Matrix Multiplication

    CPU matrix multiplication

    Two matrices can be multiplied only if the number of columns of the first matrix equals the number of rows of the second.

    A(N*P) * B(P*M) = C(N*M), where N is the number of rows of A, P is the number of columns of A (and the number of rows of B), and M is the number of columns of B. In width/height terms: the width of A equals the height of B, and C has dimensions ha * wb (height of A by width of B).

    C(i,j) is the dot product of row i of A and column j of B.
    //C = A*B
    void MatrixMulCPU(float* _C, const float *_A, const float *_B, int _wa, int _ha, int _wb)
    {
        float sum = 0;
        for (int i = 0; i < _ha; ++i)
        {
            for (int j = 0; j < _wb; ++j)
            {
                sum = 0;
                for (int k = 0; k < _wa; ++k)
                {
                    // i*_wa selects row i of A; +k walks along that row. A's stride is _wa.
                    // j selects column j of B; k*_wb walks down that column (row 0's j-th
                    // element, row 1's j-th, ... row _wa-1's j-th). B's stride is _wb.
                    sum += (float)_A[i*_wa + k] * (float)_B[k*_wb + j];
                }
                _C[i*_wb + j] = (float)sum;
            }
        }
    }
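    As a quick sanity check, here is a minimal driver (a sketch of mine with hypothetical sizes, not part of the lab code) that multiplies a 2*3 matrix by a 3*2 matrix:

    #include <stdio.h>

    // Assumes MatrixMulCPU from above is in scope.
    int main(void)
    {
        float A[2*3] = {1, 2, 3,
                        4, 5, 6};         // _ha = 2, _wa = 3
        float B[3*2] = {7,  8,
                        9, 10,
                        11, 12};          // B is 3 rows by 2 columns, so _wb = 2
        float C[2*2];
        MatrixMulCPU(C, A, B, 3, 2, 2);   // _wa = 3, _ha = 2, _wb = 2
        printf("C(0,0) = %f\n", C[0]);    // expected 1*7 + 2*9 + 3*11 = 58
        return 0;
    }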

    Simple matrix multiplication

    C(i,j) = sum { A(i,k)*B(k,j) }, 0 <= k < _wa. The output elements are computed almost independently of one another, so we can partition the work into regions and let each thread handle one region.
    How do we partition? The first idea is to let each thread compute one C(i,j); a quick estimate shows this needs height_c*width_c, i.e. ha*wb, threads. Going further, we partition the matrix into Blocks: with a 16*16 Block, an 80*48 matrix can be viewed as 5(*16) * 3(*16), i.e. a Grid of 5*3 blocks, so the grid is naturally (height_c/16) * (width_c/16) blocks of threads.
    With the partition in place, the kernel code is below: it simply replaces the two outer loops with a row/column index derived from the block and thread IDs.

    Compute version 0:

    __global__ void matrix_kernel_0(float* _C, const float* _A, const float *_B, int _wa, int _wb)
    {
        float sum = 0;
        // Locate this thread's row and column: Y indexes the matrix row, X the column.
        int row = blockIdx.y*blockDim.y + threadIdx.y;
        int col = blockIdx.x*blockDim.x + threadIdx.x;
    
        // Thread (row,col) computes C(row,col). Note there is no bounds check, so the
        // matrix dimensions must be exact multiples of the block dimensions.
        for (int i = 0; i < _wa; ++i)
        {
            sum += _A[row*_wa + i]*_B[i*_wb + col];
        }
        _C[row*_wb + col] = sum;
    }
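    For completeness, a minimal host-side launch sketch (my addition; it assumes ha and wb are exact multiples of 16, since the kernel has no bounds check, and that dA/dB/dC are device buffers already allocated, with dA and dB filled):

    // Sketch: launch matrix_kernel_0 for C(ha x wb) = A(ha x wa) * B(wa x wb).
    void launchVersion0(float *dC, const float *dA, const float *dB, int wa, int ha, int wb)
    {
        dim3 block(16, 16);
        dim3 grid(wb / 16, ha / 16);  // grid.x covers columns of C, grid.y covers rows
        matrix_kernel_0<<<grid, block>>>(dC, dA, dB, wa, wb);
        cudaDeviceSynchronize();      // wait for the kernel so results can be copied back
    }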

    This is the code for Heterogeneous Parallel Programming lab3: Basic Matrix Matrix Multiplication:

    #include <wb.h>
    
    #define wbCheck(stmt)                                                          \
      do {                                                                         \
        cudaError_t err = stmt;                                                    \
        if (err != cudaSuccess) {                                                  \
          wbLog(ERROR, "Failed to run stmt ", #stmt);                              \
          wbLog(ERROR, "Got CUDA error ...  ", cudaGetErrorString(err));           \
          return -1;                                                               \
        }                                                                          \
      } while (0)
    
    
    #if 0 // This is the C version of matrix multiplication
    //C = A*B
    void MatrixMulCPU(float* _C,const float *_A,const float *_B,int _wa,int _ha,int _wb)
    {
        float sum = 0;
        for (int i = 0; i < _ha; ++i)
        {
            for (int j = 0; j < _wb; ++j)
            {
                sum = 0;
                for (int k = 0; k < _wa; ++k)
                {
                    sum += (float)_A[i*_wa+k]*(float)_B[k*_wb+ j];
                }
                _C[i*_wb+j] = (float)sum;
            }
        }
    }
    #endif
    
    
    // Compute C = A * B; C is hA x wB (rows of A by columns of B)
    __global__ void matrixMultiply(float *A, float *B, float *C, int numARows,
                                   int numAColumns, int numBRows, int numBColumns,
                                   int numCRows, int numCColumns) {
      //@@ Insert code to implement matrix multiplication here
         float sum = 0.0f;
    
        int row = blockIdx.y*blockDim.y + threadIdx.y;
        int col = blockIdx.x*blockDim.x + threadIdx.x;
    
    
        if(row < numCRows && col < numCColumns){
            for (int i = 0; i < numAColumns; ++i)
            {
                sum += A[row*numAColumns + i] * B[i*numBColumns + col];
            }
            C[row*numBColumns + col] = sum;
            // Debug print, kept inside the bounds check so out-of-range threads never
            // read C; remove it for timing runs, as printf from a kernel is very slow.
            printf("C = %f\n", C[row*numBColumns + col]);
        }
    }
    
    
    int main(int argc, char **argv) {
      wbArg_t args;
      float *hostA; // The A matrix
      float *hostB; // The B matrix
      float *hostC; // The output C matrix
      float *deviceA;
      float *deviceB;
      float *deviceC;
      int numARows;    // number of rows in the matrix A
      int numAColumns; // number of columns in the matrix A
      int numBRows;    // number of rows in the matrix B
      int numBColumns; // number of columns in the matrix B
      int numCRows;    // number of rows in the matrix C (you have to set this)
      int numCColumns; // number of columns in the matrix C (you have to set this)
    
      args = wbArg_read(argc, argv);
    
      wbTime_start(Generic, "Importing data and creating memory on host");
      hostA =
          ( float * )wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns);
      hostB =
          ( float * )wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns);
      //@@ Set numCRows and numCColumns
      numCRows = 0;
      numCColumns = 0;
      if(numAColumns != numBRows){
        wbLog(TRACE, "numAColumns != numBRows, Break ");
        return 1;
      }
      numCRows = numARows;
      numCColumns = numBColumns;
      unsigned int A_size = numARows * numAColumns * sizeof(float);
      unsigned int B_size = numBRows * numBColumns * sizeof(float);
      unsigned int C_size = numCRows * numCColumns * sizeof(float);
      //@@ Allocate the hostC matrix
      hostC = ( float * )malloc(C_size);
      wbTime_stop(Generic, "Importing data and creating memory on host");
    
      wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns);
      wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns);
    
      wbTime_start(GPU, "Allocating GPU memory.");
      //@@ Allocate GPU memory here
      wbCheck(cudaMalloc((void**)&deviceA, A_size)); 
      wbCheck(cudaMalloc((void**)&deviceB, B_size)); 
      wbCheck(cudaMalloc((void**)&deviceC, C_size)); 
      wbTime_stop(GPU, "Allocating GPU memory.");
    
      wbTime_start(GPU, "Copying input memory to the GPU.");
      //@@ Copy memory to the GPU here
      wbCheck(cudaMemcpy(deviceA, hostA, A_size, cudaMemcpyHostToDevice));
      wbCheck(cudaMemcpy(deviceB, hostB, B_size, cudaMemcpyHostToDevice));
      wbCheck(cudaMemcpy(deviceC, hostC, C_size, cudaMemcpyHostToDevice));
      wbTime_stop(GPU, "Copying input memory to the GPU.");
    
      //@@ Initialize the grid and block dimensions here
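      // (x-1)/16+1 is ceil(x/16) in integer arithmetic, so the grid covers all of C
      // even when its dimensions are not multiples of the 16*16 block.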
      dim3 DimGrid((numCColumns-1)/16+1, (numCRows-1)/16+1, 1);
      dim3 DimBlock(16, 16, 1);
    
      wbTime_start(Compute, "Performing CUDA computation");
      //@@ Launch the GPU Kernel here
      matrixMultiply<<< DimGrid, DimBlock >>>(deviceA, deviceB, deviceC, numARows, numAColumns, numBRows, numBColumns, numCRows, numCColumns);
      cudaDeviceSynchronize();
      wbTime_stop(Compute, "Performing CUDA computation");
    
      wbTime_start(Copy, "Copying output memory to the CPU");
      //@@ Copy the GPU memory back to the CPU here
      wbCheck(cudaMemcpy(hostC, deviceC, C_size, cudaMemcpyDeviceToHost));
    
      wbTime_stop(Copy, "Copying output memory to the CPU");
    
      wbTime_start(GPU, "Freeing GPU Memory");
      //@@ Free the GPU memory here
      wbCheck(cudaFree(deviceA));
      wbCheck(cudaFree(deviceB));
      wbCheck(cudaFree(deviceC));
    
      wbTime_stop(GPU, "Freeing GPU Memory");
    
      wbSolution(args, hostC, numCRows, numCColumns);
    
      free(hostA);
      free(hostB);
      free(hostC);
    
      return 0;
    }

    Using tiles to partition matrix multiplication

    Another approach: instead of having each thread compute a complete C(i,j), observe from C(i,j) = sum { A(i,k)*B(k,j) } that we can subdivide further:
    Csub(i,j) = sum { A(i,ksub+offsetA)*B(ksub+offsetB,j) }  0 <= ksub < blockSize
    C(i,j) = sum { Csub(i,j) }
    That is, split the matrices into large sub-blocks; each block computes the partial product of sub-block i of A and sub-block j of B, and the partial results are accumulated at the end. The main point is to introduce shared memory to improve efficiency.
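    To make the decomposition concrete, here is a minimal CPU sketch of the same tiling idea (my illustration, assuming square n*n matrices with n divisible by blockSize); the CUDA kernel below maps the two outer tile loops onto blocks and the per-element work onto threads:

    // Sketch: tiled C = A*B on the CPU; n x n matrices, n % blockSize == 0 assumed.
    void MatrixMulTiledCPU(float *C, const float *A, const float *B, int n, int blockSize)
    {
        for (int i = 0; i < n*n; ++i)
            C[i] = 0.0f;

        // Each (i0,j0) pair is one tile of C; each k0 step adds one Csub contribution.
        for (int i0 = 0; i0 < n; i0 += blockSize)
            for (int j0 = 0; j0 < n; j0 += blockSize)
                for (int k0 = 0; k0 < n; k0 += blockSize)       // k0 plays the role of offsetA/offsetB
                    for (int i = i0; i < i0 + blockSize; ++i)
                        for (int j = j0; j < j0 + blockSize; ++j)
                            for (int k = k0; k < k0 + blockSize; ++k)
                                C[i*n + j] += A[i*n + k] * B[k*n + j];
    }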

    The tiled kernel is as follows:

    __global__ void matrix_kernel_1(float* _C, const float* _A, const float *_B, int _wa, int _wb) // _wa: width of A, _wb: width of B; BLOCK_SIZE (e.g. 16) must be #defined
    {
        int bx = blockIdx.x; // this block's X position
        int by = blockIdx.y; // this block's Y position
        int tx = threadIdx.x;
        int ty = threadIdx.y;
    
        // The stripe of A this block reads runs along the X axis; the stripe of B runs along the Y axis.
        int aBegin = _wa*(by*BLOCK_SIZE); // A(0,by): start address of this block's row stripe in A
        int aEnd = aBegin + _wa - 1;      // end of that stripe
        int aStep = BLOCK_SIZE;           // A is read horizontally, so the step is BLOCK_SIZE
    
        int bBegin = BLOCK_SIZE*bx;       // B(bx,0): start address of this block's column stripe in B
        int bStep = BLOCK_SIZE*_wb;       // B is read vertically, so the step is BLOCK_SIZE*_wb
    
        float cSub = 0;
        // Each thread computes one output element, split into _wa/BLOCK_SIZE passes; each pass
        // computes one A(sub)*B(sub) segment, and the results are accumulated into C.
        // For n*n matrices, the basic kernel above issues 2n global-memory reads per thread;
        // with shared memory that drops to 2n/BLOCK_SIZE, a BLOCK_SIZE-fold reduction per thread.
        // Within a block, all threads then compute from shared memory, which is very fast.
        for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
        {
            __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
            __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
    
            // Each thread copies one element. Per block (BLOCK_SIZE=16 gives 16*16 threads),
            // the threads together fill As and Bs: one 16*16 tile of A and one of B end up
            // in shared memory.
            As[ty][tx] = _A[a + _wa*ty + tx];
            Bs[ty][tx] = _B[b + _wb*ty + tx];
            __syncthreads(); // ensure the current tiles of A and B are fully in As/Bs before computing
    
            // Each thread accumulates the partial product of sub-block i and sub-block j over
            // a width of BLOCK_SIZE; after _wa/BLOCK_SIZE passes the accumulated value is C's entry.
            for (int k = 0; k < BLOCK_SIZE; ++k)
            {
                cSub += As[ty][k]*Bs[k][tx];
            }
            __syncthreads(); // ensure the tiles are no longer needed before they are overwritten
        }
    
        // Compute the global address and write the result back to global memory:
        // one thread per element, one block per sub-block of C.
        int cIndex = (by*BLOCK_SIZE + ty)*_wb + (bx*BLOCK_SIZE + tx);
        _C[cIndex] = cSub;
    }
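    To put concrete numbers on the comment above (my arithmetic, following the source's 2n vs. 2n/blocksize claim): for n = 1024 and BLOCK_SIZE = 16, the basic kernel performs 2*1024 = 2048 global-memory reads per thread, while the tiled kernel performs 2*1024/16 = 128, with the remaining reads served from shared memory.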

    Indexing in a 2D matrix:

    Suppose we have a 32*48 matrix: x ranges over [0,47] and y over [0,31]. In code it is stored as a two-dimensional array in contiguous memory. We want to find the index of point(23,23):

    The flat (one-dimensional) offset is: 23*48 + 23.

    But we can also cover the matrix with a grid and blocks:

    grid(3,2) (3 is the number of columns, i.e. the X dimension; 2 is the number of rows, the Y dimension), block(16,16). Grid X ranges over [0,2] and Y over [0,1]; the same reasoning applies inside a block. We can then locate point(23,23) as follows:

    point(23,23) falls in grid cell (1,1) at block-local coordinates (7,7); the block width is 16.

    point.y = by*Block_size + ty = 1*16+7 = 23

    point.x = bx*Block_size + tx = 1*16+7 = 23

    The index of point(x,y) is: y * width + x
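    A minimal sketch of this index computation (flatIndex is a hypothetical helper of mine, not from the lab code):

    // Sketch: recover the flat offset of an element from its grid/block coordinates.
    // For point(23,23) in the 32x48 matrix: bx=by=1, tx=ty=7, blockSize=16, width=48.
    int flatIndex(int bx, int by, int tx, int ty, int blockSize, int width)
    {
        int x = bx*blockSize + tx;   // column: 1*16 + 7 = 23
        int y = by*blockSize + ty;   // row:    1*16 + 7 = 23
        return y*width + x;          // 23*48 + 23 = 1127
    }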

    This is the code for Heterogeneous Parallel Programming lab4: Tiled Matrix Matrix Multiplication:

    #include <wb.h>
    
    #define wbCheck(stmt)                                                          \
      do {                                                                         \
        cudaError_t err = stmt;                                                    \
        if (err != cudaSuccess) {                                                  \
          wbLog(ERROR, "Failed to run stmt ", #stmt);                              \
          wbLog(ERROR, "Got CUDA error ...  ", cudaGetErrorString(err));           \
          return -1;                                                               \
        }                                                                          \
      } while (0)
    
    #define TILE_WIDTH 32  // tile (block) size: each thread block computes one TILE_WIDTH x TILE_WIDTH tile of C
    
    // Compute C = A * B
    __global__ void matrixMultiplyShared(float *A, float *B, float *C, int numARows,
                                         int numAColumns, int numBRows,
                                         int numBColumns, int numCRows,
                                         int numCColumns) {
      //@@ Insert code to implement matrix multiplication here
      //@@ You have to use shared memory for this MP
        
        __shared__ float sharedM[TILE_WIDTH][TILE_WIDTH];
        __shared__ float sharedN[TILE_WIDTH][TILE_WIDTH];
        int bx = blockIdx.x;
        int by = blockIdx.y;
        int tx = threadIdx.x;
        int ty = threadIdx.y;
        int row = by*TILE_WIDTH + ty;
        int col = bx*TILE_WIDTH + tx;
        float v = 0.0;
    
        for (int i = 0; i < (int)(ceil((float)numAColumns/TILE_WIDTH)); i++)
        {
            if (i*TILE_WIDTH + tx < numAColumns && row < numARows)
                sharedM[ty][tx] = A[row*numAColumns + i*TILE_WIDTH + tx];
            else
                sharedM[ty][tx] = 0.0;
    
            if (i*TILE_WIDTH + ty < numBRows && col < numBColumns)
                sharedN[ty][tx] = B[(i*TILE_WIDTH + ty)*numBColumns + col];
            else
                sharedN[ty][tx] = 0.0;
            __syncthreads();
    
            for(int j = 0; j < TILE_WIDTH; j++)
                v += sharedM[ty][j] * sharedN[j][tx];
            __syncthreads();
        }
    
        if (row < numCRows && col < numCColumns)
            C[row*numCColumns + col] = v;
        
    }
    
    int main(int argc, char **argv) {
      wbArg_t args;
      float *hostA; // The A matrix
      float *hostB; // The B matrix
      float *hostC; // The output C matrix
      float *deviceA;
      float *deviceB;
      float *deviceC;
      int numARows;    // number of rows in the matrix A
      int numAColumns; // number of columns in the matrix A
      int numBRows;    // number of rows in the matrix B
      int numBColumns; // number of columns in the matrix B
      int numCRows;    // number of rows in the matrix C (you have to set this)
      int numCColumns; // number of columns in the matrix C (you have to set this)
    
      args = wbArg_read(argc, argv);
    
      wbTime_start(Generic, "Importing data and creating memory on host");
      hostA =
          ( float * )wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns);
      hostB =
          ( float * )wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns);
      //@@ Set numCRows and numCColumns
      numCRows = 0;
      numCColumns = 0;
      if(numAColumns != numBRows){
        wbLog(TRACE, "numAColumns != numBRows, Break ");
        return 1;
      }
      numCRows = numARows;
      numCColumns = numBColumns;
      unsigned int A_size = numARows * numAColumns * sizeof(float);
      unsigned int B_size = numBRows * numBColumns * sizeof(float);
      unsigned int C_size = numCRows * numCColumns * sizeof(float);
      //@@ Allocate the hostC matrix
      hostC = ( float * )malloc(C_size);
      wbTime_stop(Generic, "Importing data and creating memory on host");
    
      wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns);
      wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns);
    
      wbTime_start(GPU, "Allocating GPU memory.");
      //@@ Allocate GPU memory here
      wbCheck(cudaMalloc((void**)&deviceA, A_size)); 
      wbCheck(cudaMalloc((void**)&deviceB, B_size)); 
      wbCheck(cudaMalloc((void**)&deviceC, C_size)); 
      wbTime_stop(GPU, "Allocating GPU memory.");
    
      wbTime_start(GPU, "Copying input memory to the GPU.");
      //@@ Copy memory to the GPU here
      wbCheck(cudaMemcpy(deviceA, hostA, A_size, cudaMemcpyHostToDevice));
      wbCheck(cudaMemcpy(deviceB, hostB, B_size, cudaMemcpyHostToDevice));
      wbCheck(cudaMemcpy(deviceC, hostC, C_size, cudaMemcpyHostToDevice));
      wbTime_stop(GPU, "Copying input memory to the GPU.");
    
      //@@ Initialize the grid and block dimensions here
      dim3 DimGrid(ceil(numCColumns / (float)TILE_WIDTH), ceil(numCRows / (float)TILE_WIDTH), 1);
      dim3 DimBlock(TILE_WIDTH, TILE_WIDTH, 1);
    
      wbTime_start(Compute, "Performing CUDA computation");
      //@@ Launch the GPU Kernel here
      matrixMultiplyShared<<< DimGrid, DimBlock >>>(deviceA, deviceB, deviceC, numARows, numAColumns, numBRows, numBColumns, numCRows, numCColumns);
      cudaDeviceSynchronize();
      wbTime_stop(Compute, "Performing CUDA computation");
    
      wbTime_start(Copy, "Copying output memory to the CPU");
      //@@ Copy the GPU memory back to the CPU here
      wbCheck(cudaMemcpy(hostC, deviceC, C_size, cudaMemcpyDeviceToHost));
    
      wbTime_stop(Copy, "Copying output memory to the CPU");
    
      wbTime_start(GPU, "Freeing GPU Memory");
      //@@ Free the GPU memory here
      wbCheck(cudaFree(deviceA));
      wbCheck(cudaFree(deviceB));
      wbCheck(cudaFree(deviceC));
    
      wbTime_stop(GPU, "Freeing GPU Memory");
    
      wbSolution(args, hostC, numCRows, numCColumns);
    
      free(hostA);
      free(hostB);
      free(hostC);
    
      return 0;
    }
  • Original article: https://www.cnblogs.com/biglucky/p/4244187.html