zoukankan      html  css  js  c++  java
  • CUDA 矩阵相乘完整代码


    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>
    #include "cublas_v2.h"

    #define BLOCK_SIZE 16

    cudaError_t multiCuda(float *c, float *a, float *b, unsigned int aH, unsigned int aW, unsigned int bH, unsigned int bW);

    __global__ void multiKernel(float *c, float *a, float*b, unsigned int aW, unsigned int bW)
    {
      //saved in register
      int xBlock = blockIdx.x;
      int yBlock = blockIdx.y;

      int xThread = threadIdx.x;
      int yThread = threadIdx.y;

      unsigned int aWidth = aW;
      unsigned int bWidth = bW;

      float Cvalue= 0;

      for(int i=0; i< aWidth/BLOCK_SIZE; ++i)
      {
        __shared__ int aSub[BLOCK_SIZE][BLOCK_SIZE];
        __shared__ int bSub[BLOCK_SIZE][BLOCK_SIZE];

        aSub[yThread][xThread] = a[(yBlock*blockDim.y + yThread)*aWidth + i*blockDim.x + xThread];
        bSub[yThread][xThread] = b[(i*blockDim.y + yThread)*bWidth + xBlock*blockDim.x + xThread];

        __syncthreads();

        for(int e=0; e<BLOCK_SIZE; ++e)
        {
          Cvalue += aSub[yThread][e]*bSub[e][xThread];
        }
        __syncthreads();
      }

      int cIndex = (yBlock*blockDim.y + yThread)*bWidth + xBlock*blockDim.x + xThread;
      c[cIndex] = Cvalue;
    }

    __global__ void multiKernel_NoLoop(float *c, float *a, float*b, unsigned int aW, unsigned int bW)
    {
      int xBlock = blockIdx.x;
      int yBlock = blockIdx.y;

      int xThread = threadIdx.x;
      int yThread = threadIdx.y;

      unsigned int aWidth = aW;
      unsigned int bWidth = bW;

      float Cvalue= 0;

      for(int i=0; i< aWidth/BLOCK_SIZE; ++i)
      {
        __shared__ int aSub[BLOCK_SIZE][BLOCK_SIZE];
        __shared__ int bSub[BLOCK_SIZE][BLOCK_SIZE];

        aSub[yThread][xThread] = a[(yBlock*blockDim.y + yThread)*aWidth + i*blockDim.x + xThread];
        bSub[yThread][xThread] = b[(i*blockDim.y + yThread)*bWidth + xBlock*blockDim.x + xThread];

        __syncthreads();

        Cvalue += aSub[yThread][0]*bSub[0][xThread] + aSub[yThread][1]*bSub[1][xThread] +
              aSub[yThread][2]*bSub[2][xThread] + aSub[yThread][3]*bSub[3][xThread] +
              aSub[yThread][4]*bSub[4][xThread] + aSub[yThread][5]*bSub[5][xThread] +
              aSub[yThread][6]*bSub[6][xThread] + aSub[yThread][7]*bSub[7][xThread] +
              aSub[yThread][8]*bSub[8][xThread] + aSub[yThread][9]*bSub[9][xThread] +
              aSub[yThread][10]*bSub[10][xThread] + aSub[yThread][11]*bSub[11][xThread] +
              aSub[yThread][12]*bSub[12][xThread] + aSub[yThread][13]*bSub[13][xThread] +
              aSub[yThread][14]*bSub[14][xThread] + aSub[yThread][15]*bSub[15][xThread] ;

        __syncthreads();
      }

      int cIndex = (yBlock*blockDim.y + yThread)*bWidth + xBlock*blockDim.x + xThread;
      c[cIndex] = Cvalue;
    }

    cudaError_t multiWithcuBlase(float *c, float *a, float *b, unsigned int aH, unsigned int aW, unsigned int bH, unsigned int bW);

    void multiCPU(float *c, float *a, float *b, unsigned int aH, unsigned int aW, unsigned int bH, unsigned int bW);

    int main()
    {
      const unsigned int aH = 320;
      const unsigned int aW = 320;
      const unsigned int bW = 320;
      const unsigned int bH = aW;
      const unsigned int cH = aH;
      const unsigned int cW = bW;

      float *cpu_a, *cpu_b,*cpu_c;

      cpu_a = (float*)malloc(aH*aW*sizeof(float));
      cpu_b = (float*)malloc(bH*bW*sizeof(float));
      cpu_c = (float*)malloc(cH*cW*sizeof(float));

      for(int y=0; y<aH; ++y)
      {
        for(int x =0; x<aW; ++x)
        {
          int index = y*aW + x;
          cpu_a[index] = (float)(x<y?x:y);
        }
      }

      for(int y=0; y<bH; ++y)
      {
        for(int x =0; x<bW; ++x)
        {
          int index = y*bW + x;
          cpu_b[index] = (float)(x<y?x:y);
        }
      }

      cudaError_t cudaStatus = multiCuda(cpu_c, cpu_a, cpu_b, aH, aW, bH, bW);
      if (cudaStatus != cudaSuccess) {
      fprintf(stderr, "multiCuda failed!");
      return 1;
      }
      /*
      for(int y=0; y<cH; ++y)
      {
        for(int x =0; x<cW; ++x)
        {
          if(x==1&&y==1)
          {
            int index = y*cW + x;
            printf("c(1,1)=%.1f ",cpu_c[index]);
          }

        }
      }
    */
      cudaStatus = multiWithcuBlase(cpu_c, cpu_a, cpu_b, aH, aW, bH, bW);
      if (cudaStatus != cudaSuccess) {
      fprintf(stderr, "multiCuda failed!");
      return 1;
      }
      /*
      for(int y=0; y<cH; ++y)
      {
        for(int x =0; x<cW; ++x)
        {
          if(x==1&&y==1)
          {
            int index = y*cW + x;
            printf("c(1,1)=%.1f ",cpu_c[index]);
          }

        }
      }
    */

      // cudaDeviceReset must be called before exiting in order for profiling and
      // tracing tools such as Nsight and Visual Profiler to show complete traces.
      cudaStatus = cudaDeviceReset();
      if (cudaStatus != cudaSuccess) {
      fprintf(stderr, "cudaDeviceReset failed!");
      return 1;
      }

      float start,end;
      start = clock();
      multiCPU(cpu_c, cpu_a, cpu_b, aH, aW, bH, bW);
      end = clock();
      printf("CPU runtime is %f msec ",end - start);

      free(cpu_a);
      free(cpu_b);
      free(cpu_c);

      getchar();
      return 0;
    }

    cudaError_t multiCuda(float *c, float *a, float *b, unsigned int aH, unsigned int aW, unsigned int bH, unsigned int bW)
    {
      float *gpu_a = 0;
      float *gpu_b = 0;
      float *gpu_c = 0;
      cudaError_t cudaStatus;

      cudaStatus = cudaSetDevice(0);
      if (cudaStatus != cudaSuccess) {
      fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
      goto Error;
      }

      size_t size_a = aH*aW*sizeof(float);
      cudaStatus = cudaMalloc((void**)&gpu_a, size_a);
      if (cudaStatus != cudaSuccess) {
      fprintf(stderr, "cudaMalloc failed!");
      goto Error;
      }

      size_t size_b = bH*bW*sizeof(float);
      cudaStatus = cudaMalloc((void**)&gpu_b, size_b);
      if (cudaStatus != cudaSuccess) {
      fprintf(stderr, "cudaMalloc failed!");
      goto Error;
      }

      size_t size_c = aH*bW*sizeof(float);
      cudaStatus = cudaMalloc((void**)&gpu_c,size_c);
      if (cudaStatus != cudaSuccess) {
      fprintf(stderr, "cudaMalloc failed!");
      goto Error;
      }

      cudaStatus = cudaMemcpy(gpu_a, a, size_a, cudaMemcpyHostToDevice);
      if (cudaStatus != cudaSuccess) {
      fprintf(stderr, "cudaMemcpy failed!");
      goto Error;
      }

      cudaStatus = cudaMemcpy(gpu_b, b,size_b, cudaMemcpyHostToDevice);
      if (cudaStatus != cudaSuccess) {
      fprintf(stderr, "cudaMemcpy failed!");
      goto Error;
      }

      dim3 blocks(BLOCK_SIZE,BLOCK_SIZE);
      dim3 grids(bW/BLOCK_SIZE,aH/BLOCK_SIZE);

      cudaEvent_t start;
      cudaStatus = cudaEventCreate(&start);
      if (cudaStatus != cudaSuccess){
      fprintf(stderr, "Failed to create start event (error code %s)! ", cudaGetErrorString(cudaStatus));
      goto Error;
      }
      cudaEvent_t stop;
      cudaStatus = cudaEventCreate(&stop);
      if (cudaStatus != cudaSuccess){
      fprintf(stderr, "Failed to create stop event (error code %s)! ", cudaGetErrorString(cudaStatus));
      goto Error;
      }

      cudaStatus = cudaEventRecord(start, NULL);
      if (cudaStatus != cudaSuccess){
      fprintf(stderr, "Failed to record start event (error code %s)! ", cudaGetErrorString(cudaStatus));
      goto Error;
      }

      multiKernel<<<grids,blocks>>>(gpu_c,gpu_a,gpu_b,aW,bW);

      cudaStatus = cudaEventRecord(stop, NULL);
      if (cudaStatus != cudaSuccess){
      fprintf(stderr, "Failed to record stop event (error code %s)! ", cudaGetErrorString(cudaStatus));
      goto Error;
      }
      cudaStatus = cudaEventSynchronize(stop);
      if (cudaStatus != cudaSuccess){
      fprintf(stderr, "Failed to synchronize on the stop event (error code %s)! ", cudaGetErrorString(cudaStatus));
      goto Error;
      }

      float msecTotal = 0.0f;
      cudaStatus = cudaEventElapsedTime(&msecTotal, start, stop);
      if (cudaStatus != cudaSuccess){
      fprintf(stderr, "Failed to get time elapsed between events (error code %s)! ", cudaGetErrorString(cudaStatus));
      goto Error;
      }
      printf("HS__ GPU runtime is %f msec ",msecTotal);

    /*******************************************************/
      cudaEvent_t start1;
      cudaStatus = cudaEventCreate(&start1);
      if (cudaStatus != cudaSuccess){
      fprintf(stderr, "Failed to create start event (error code %s)! ", cudaGetErrorString(cudaStatus));
      goto Error;
      }
      cudaEvent_t stop1;
      cudaStatus = cudaEventCreate(&stop1);
      if (cudaStatus != cudaSuccess){
      fprintf(stderr, "Failed to create stop event (error code %s)! ", cudaGetErrorString(cudaStatus));
      goto Error;
      }

      cudaStatus = cudaEventRecord(start1, NULL);
      if (cudaStatus != cudaSuccess){
      fprintf(stderr, "Failed to record start event (error code %s)! ", cudaGetErrorString(cudaStatus));
      goto Error;
      }

      multiKernel_NoLoop<<<grids,blocks>>>(gpu_c,gpu_a,gpu_b,aW,bW);

      cudaStatus = cudaEventRecord(stop1, NULL);
      if (cudaStatus != cudaSuccess){
      fprintf(stderr, "Failed to record stop event (error code %s)! ", cudaGetErrorString(cudaStatus));
      goto Error;
      }
      cudaStatus = cudaEventSynchronize(stop1);
      if (cudaStatus != cudaSuccess){
      fprintf(stderr, "Failed to synchronize on the stop event (error code %s)! ", cudaGetErrorString(cudaStatus));
      goto Error;
      }

      float msecTotal1 = 0.0f;
      cudaStatus = cudaEventElapsedTime(&msecTotal1, start1, stop1);
      if (cudaStatus != cudaSuccess){
      fprintf(stderr, "Failed to get time elapsed between events (error code %s)! ", cudaGetErrorString(cudaStatus));
      goto Error;
      }
      printf("HS__NoLoop GPU runtime is %f msec ",msecTotal1);

    /***********************************************************/

      cudaStatus = cudaGetLastError();
      if (cudaStatus != cudaSuccess) {
      fprintf(stderr, "addKernel launch failed: %s ", cudaGetErrorString(cudaStatus));
      goto Error;
      }

      // cudaDeviceSynchronize waits for the kernel to finish, and returns
      // any errors encountered during the launch.
      cudaStatus = cudaDeviceSynchronize();
      if (cudaStatus != cudaSuccess) {
      fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel! ", cudaStatus);
      goto Error;
      }

      cudaStatus = cudaMemcpy(c, gpu_c, size_c, cudaMemcpyDeviceToHost);
      if (cudaStatus != cudaSuccess) {
      fprintf(stderr, "cudaMemcpy failed!");
      goto Error;
      }

    Error:
      cudaFree(gpu_a);
      cudaFree(gpu_b);
      cudaFree(gpu_c);

      return cudaStatus;
    }

    void inline checkError(cublasStatus_t status, const char *msg)
    {
      if (status != CUBLAS_STATUS_SUCCESS)
      {
      printf("%s", msg);
      exit(EXIT_FAILURE);
      }
    }

    cudaError_t multiWithcuBlase(float *c, float *a, float *b, unsigned int aH, unsigned int aW, unsigned int bH, unsigned int bW)
    {
      float *gpu_a = 0;
      float *gpu_b = 0;
      float *gpu_c = 0;
      cudaError_t cudaStatus;

      cudaStatus = cudaSetDevice(0);
      if (cudaStatus != cudaSuccess) {
      fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
      goto Error;
      }

      size_t size_a = aH*aW*sizeof(float);
      cudaStatus = cudaMalloc((void**)&gpu_a, size_a);
      if (cudaStatus != cudaSuccess) {
      fprintf(stderr, "cudaMalloc failed!");
      goto Error;
      }

      size_t size_b = bH*bW*sizeof(float);
      cudaStatus = cudaMalloc((void**)&gpu_b, size_b);
      if (cudaStatus != cudaSuccess) {
      fprintf(stderr, "cudaMalloc failed!");
      goto Error;
      }

      size_t size_c = aH*bW*sizeof(float);
      cudaStatus = cudaMalloc((void**)&gpu_c,size_c);
      if (cudaStatus != cudaSuccess) {
      fprintf(stderr, "cudaMalloc failed!");
      goto Error;
      }

      cudaStatus = cudaMemcpy(gpu_a, a, size_a, cudaMemcpyHostToDevice);
      if (cudaStatus != cudaSuccess) {
      fprintf(stderr, "cudaMemcpy failed!");
      goto Error;
      }

      cudaStatus = cudaMemcpy(gpu_b, b,size_b, cudaMemcpyHostToDevice);
      if (cudaStatus != cudaSuccess) {
      fprintf(stderr, "cudaMemcpy failed!");
      goto Error;
      }

      dim3 blocks(BLOCK_SIZE,BLOCK_SIZE);
      dim3 grids(bW/BLOCK_SIZE,aH/BLOCK_SIZE);

      //printf("Computing result using CUBLAS... ");

      cublasHandle_t handle;
      cublasStatus_t ret;
      ret = cublasCreate(&handle);
      if (ret != CUBLAS_STATUS_SUCCESS)
      {
      printf("cublasCreate returned error code %d, line(%d) ", ret, __LINE__);
      goto Error;
      }

      const float alpha = 1.0f;
      const float beta = 0.0f;

      cudaEvent_t start;
      cudaStatus = cudaEventCreate(&start);
      if (cudaStatus != cudaSuccess){
      fprintf(stderr, "Failed to create start event (error code %s)! ", cudaGetErrorString(cudaStatus));
      goto Error;
      }
      cudaEvent_t stop;
      cudaStatus = cudaEventCreate(&stop);
      if (cudaStatus != cudaSuccess){
      fprintf(stderr, "Failed to create stop event (error code %s)! ", cudaGetErrorString(cudaStatus));
      goto Error;
      }

      cudaStatus = cudaEventRecord(start, NULL);
      if (cudaStatus != cudaSuccess){
      fprintf(stderr, "Failed to record start event (error code %s)! ", cudaGetErrorString(cudaStatus));
      goto Error;
      }

      ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, aH, bW, aW, &alpha, gpu_a, aH, gpu_b, bH, &beta, gpu_c, aH);

      cudaStatus = cudaEventRecord(stop, NULL);
      if (cudaStatus != cudaSuccess){
      fprintf(stderr, "Failed to record stop event (error code %s)! ", cudaGetErrorString(cudaStatus));
      goto Error;
      }
      cudaStatus = cudaEventSynchronize(stop);
      if (cudaStatus != cudaSuccess){
      fprintf(stderr, "Failed to synchronize on the stop event (error code %s)! ", cudaGetErrorString(cudaStatus));
      goto Error;
      }

      float msecTotal = 0.0f;
      cudaStatus = cudaEventElapsedTime(&msecTotal, start, stop);
      if (cudaStatus != cudaSuccess){
      fprintf(stderr, "Failed to get time elapsed between events (error code %s)! ", cudaGetErrorString(cudaStatus));
      goto Error;
      }
      printf("cuBlas__ GPU runtime is %f msec ",msecTotal);

      cudaStatus = cudaMemcpy(c, gpu_c, size_c, cudaMemcpyDeviceToHost);
      if (cudaStatus != cudaSuccess) {
      fprintf(stderr, "cudaMemcpy failed!");
      goto Error;
      }
      checkError(cublasDestroy(handle), "cublasDestroy() error! ");

    Error:
      cudaFree(gpu_a);
      cudaFree(gpu_b);
      cudaFree(gpu_c);

      return cudaStatus;

    }


    void multiCPU(float *c, float *a, float *b, unsigned int aH, unsigned int aW, unsigned int bH, unsigned int bW)
    {
      for(int y=0; y<aH; ++y)
      {
        for(int x =0; x<bW; ++x)
        {
          int index = y*bW + x;
          c[index] = 0.0f;
          for(int i=0; i<aW; ++i)
          {
            c[index] += a[y*aW+i]*b[i*bW + x];
          }
        }
      }
    }

  • 相关阅读:
    atitit.按钮光标滑过高亮切换以及其他动态效果的实现css html js attilax总结
    atitit. 文件上传带进度条 atiUP 设计 java c# php
    atitit.新增编辑功能 跟orm的实现 attilax p31
    atitit. java jsoup html table的读取解析 总结
    atitit.设计文档操作日志的实现
    atitit.资源释放机制attilax总结
    (转)Android的消息机制,用Android线程间通信的Message机制,Android中Handler的使用方法——在子线程中更新界面,handler机制
    (转)Android笔记handler机制
    (转)数据存储
    (转)android连网详解
  • 原文地址:https://www.cnblogs.com/huangshan/p/3916918.html
Copyright © 2011-2022 走看看