zoukankan      html  css  js  c++  java
  • cuda(2) 矩阵乘法优化过程

    Created on 2013-8-5
    URL : http://blog.sina.com.cn/s/blog_a502f1a30101mjch.html
    @author: zhxfl
    转载请说明出处

      1 #include <stdio.h>
      2 #include <time.h>
      3 #include <cuda_runtime.h>
      4 __global__ void matrixMulCUDA(int *A,int *B,int * C, 
      5     dim3 dimsA,dim3 dimsB, dim3 dimsC)
      6 {
      7     int i = blockIdx.x;
      8     int j = threadIdx.x;
      9 
     10     for(int k = 0; k < dimsA.y; k++)
     11     {
     12         C[i * dimsC.y + j] += A[i * dimsA.y + k] * B[k * dimsB.y + j];
     13         //printf("id = %d %d %d A = %d B = %d C = %d 
    ", i,j,k, A[i * dimsA.y + k],
     14         //     B[k * dimsB.y + j],
     15         //     C[i * dimsC.y + j]);
     16     }
     17 }
     18 
     19 int* matrixMultiplyByGpu(int *h_A, int n1,int m1,int *h_B,int n2,int m2)
     20 {
     21     int *d_A, *d_B, *d_C;
     22     int *h_C;
     23 
     24     dim3 dimsA(n1,m1);
     25     dim3 dimsB(n2,m2);
     26     dim3 dimsC(n1,m2);
     27 
     28     int mem_size_A = dimsA.x * dimsA.y * sizeof(int);
     29     int mem_size_B = dimsB.x * dimsB.y * sizeof(int);
     30     int mem_size_C = dimsC.x * dimsC.y * sizeof(int);
     31 
     32     cudaMalloc((void**)&d_A, mem_size_A);
     33     cudaMalloc((void**)&d_B, mem_size_B);
     34     cudaMalloc((void**)&d_C, mem_size_C);
     35 
     36     cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
     37     cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
     38 
     39     h_C = (int*)malloc(sizeof(int)*mem_size_C);
     40     for(int i = 0; i<dimsC.x * dimsC.y;i++)h_C[i] = 0;
     41     cudaMemcpy(d_C, h_C, mem_size_C, cudaMemcpyHostToDevice);
     42     dim3 grid(dimsC.x,dimsC.y);
     43     matrixMulCUDA<<<dimsC.x,dimsC.y>>>(d_A,d_B,d_C,dimsA,dimsB,dimsC);
     44     cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost);
     45     cudaFree(d_A);
     46     cudaFree(d_B);
     47     cudaFree(d_C);
     48     return h_C;
     49 }
     50 
     51 int* matrixMultiplyByCpu(int *h_A, int n1,int m1,int *h_B,int n2,int m2)
     52 {
     53     int *h_C = new int [n1 * m2];
     54     for(int i = 0; i < n1 * m2; i++)h_C[i] = 0;
     55 
     56     for(int i = 0; i < n1; i ++)
     57     {
     58         for(int j = 0; j < m2; j++)
     59         {
     60             for(int k = 0; k < m1; k++)
     61             {
     62                 //h_C[i][j] = h_A[i][k] * h_B[k][j];
     63                 h_C[i * m2 + j] += h_A[i * m1 + k] * h_B[k * m2 + j];
     64             }
     65         }
     66     }
     67     return h_C;
     68 }
     69 
     70 void outPutMatrix(char c,int *g, int n,int m)
     71 {
     72     return;
     73     printf("matrix %c [%3d %3d]
    ", c, n, m);
     74     for(int i = 0; i < n * m;i++)
     75     {
     76         printf("%5d ", g[i]);
     77         if((i + 1) % m == 0)printf("
    ");
     78     }
     79 }
     80 
     81 const int base = 1000;
     82 const int large = 2;
     83 int main()
     84 {
     85     int n1 = base;
     86     int m1 = base + 1;
     87     int n2 = m1;
     88     int m2 = base;
     89     int *g1 = new int[n1 * m1];
     90     int *g2 = new int[n2 * m2];
     91     for(int i = 0; i < n1 * m1;i++)g1[i] = rand() % large;
     92     for(int i = 0; i < n2 * m2;i++)g2[i] = rand() % large;
     93     outPutMatrix('A',g1,n1,m1);
     94     outPutMatrix('B',g2,n2,m2);
     95     int *gg1,*gg2;
     96     
     97     clock_t start, finish;
     98 
     99     start = clock();
    100     gg1 = matrixMultiplyByGpu(g1,n1,m1,g2,n2,m2);
    101     finish = clock();
    102     printf("GPU time = %f
    ",(double)(finish - start) / CLOCKS_PER_SEC);
    103     
    104     start = clock();
    105     gg2 = matrixMultiplyByCpu(g1,n1,m1,g2,n2,m2);
    106     finish = clock();
    107     printf("CPU time = %f
    ",(double)(finish - start) / CLOCKS_PER_SEC);
    108 
    109     printf("check---");
    110     for(int i = 0; i< n1*m2;i++)
    111     {
    112         if(gg1[i] != gg2[i])
    113         {
    114             printf("wrong ans
    ");
    115             break;
    116         }
    117     }
    118     outPutMatrix('1',gg1,n1,m2);
    119     outPutMatrix('2',gg2,n1,m2);
    120 }
    版本一

    版本一分析:

    n 约等于 maxThreadsPerBlock

    这里我们的矩阵空间复杂度大概是o(n^2),两个这样矩阵的乘法复杂度大概是0(n^3),这里使用GPU优化的方案是开启n个block,每个block有n个thread。这样我们的并发量就是n^2,也就是计算复杂度大概是0(n)。

    版本一测试:

    n 约等于 maxThreadsPerBlock

    这里请注意,你的base + 1 < min(maxThreadsPerBlock,maxGridSize[0]),不然将超过cuda的最大计算量,会导致你的计算结果错误。

    根据我的机子的情况 n = 1000,运行时间如下,可以看出计算时间大概是13.87

      1 #include <stdio.h>
      2 #include <time.h>
      3 #include <cuda_runtime.h>
      4 __global__ void matrixMulCUDA(float *A,float *B,float * C, 
      5     dim3 dimsA,dim3 dimsB, dim3 dimsC)
      6 {
      7     int i = blockIdx.x;
      8     int j = threadIdx.x;
      9 
     10     for(int k = 0; k < dimsA.y; k++)
     11     {
     12         C[i * dimsC.y + j] += A[i * dimsA.y + k] * B[k * dimsB.y + j];
     13         //printf("id = %d %d %d A = %d B = %d C = %d 
    ", i,j,k, A[i * dimsA.y + k],
     14         //     B[k * dimsB.y + j],
     15         //     C[i * dimsC.y + j]);
     16     }
     17 }
     18 
     19 float* matrixMultiplyByGpu(float *h_A, int n1,int m1,float *h_B,int n2,int m2)
     20 {
     21     float *d_A, *d_B, *d_C;
     22     float *h_C;
     23 
     24     dim3 dimsA(n1,m1);
     25     dim3 dimsB(n2,m2);
     26     dim3 dimsC(n1,m2);
     27 
     28     int mem_size_A = dimsA.x * dimsA.y * sizeof(float);
     29     int mem_size_B = dimsB.x * dimsB.y * sizeof(float);
     30     int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
     31 
     32     cudaMalloc((void**)&d_A, mem_size_A);
     33     cudaMalloc((void**)&d_B, mem_size_B);
     34     cudaMalloc((void**)&d_C, mem_size_C);
     35 
     36     cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
     37     cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
     38 
     39     h_C = (float*)malloc(sizeof(float)*mem_size_C);
     40     for(int i = 0; i<dimsC.x * dimsC.y;i++)h_C[i] = 0;
     41     cudaMemcpy(d_C, h_C, mem_size_C, cudaMemcpyHostToDevice);
     42     dim3 grid(dimsC.x,dimsC.y);
     43     matrixMulCUDA<<<dimsC.x,dimsC.y>>>(d_A,d_B,d_C,dimsA,dimsB,dimsC);
     44     cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost);
     45     cudaFree(d_A);
     46     cudaFree(d_B);
     47     cudaFree(d_C);
     48     return h_C;
     49 }
     50 
     51 float* matrixMultiplyByCpu(float *h_A, int n1,int m1,float *h_B,int n2,int m2)
     52 {
     53     float *h_C = new float [n1 * m2];
     54     for(int i = 0; i < n1 * m2; i++)h_C[i] = 0;
     55 
     56     for(int i = 0; i < n1; i ++)
     57     {
     58         for(int j = 0; j < m2; j++)
     59         {
     60             for(int k = 0; k < m1; k++)
     61             {
     62                 //h_C[i][j] = h_A[i][k] * h_B[k][j];
     63                 h_C[i * m2 + j] += h_A[i * m1 + k] * h_B[k * m2 + j];
     64             }
     65         }
     66     }
     67     return h_C;
     68 }
     69 
     70 void outPutMatrix(char c,float *g, int n,int m)
     71 {
     72     return;
     73     printf("matrix %c [%3d %3d]
    ", c, n, m);
     74     for(int i = 0; i < n * m;i++)
     75     {
     76         printf("%5f ", g[i]);
     77         if((i + 1) % m == 0)printf("
    ");
     78     }
     79 }
     80 
     81 const int base = 1000;
     82 const int large = 2;
     83 int main()
     84 {
     85     int n1 = base;
     86     int m1 = base + 1;
     87     int n2 = m1;
     88     int m2 = base;
     89     float *g1 = new float[n1 * m1];
     90     float *g2 = new float[n2 * m2];
     91     for(int i = 0; i < n1 * m1;i++)g1[i] = rand() % large + 1.0f / 3.0f;
     92     for(int i = 0; i < n2 * m2;i++)g2[i] = rand() % large + 1.0f / 3.0f;
     93     outPutMatrix('A',g1,n1,m1);
     94     outPutMatrix('B',g2,n2,m2);
     95     float *gg1,*gg2;
     96     
     97     clock_t start, finish;
     98 
     99     start = clock();
    100     gg1 = matrixMultiplyByGpu(g1,n1,m1,g2,n2,m2);
    101     finish = clock();
    102     printf("GPU time = %f
    ",(double)(finish - start) / CLOCKS_PER_SEC);
    103     
    104     start = clock();
    105     gg2 = matrixMultiplyByCpu(g1,n1,m1,g2,n2,m2);
    106     finish = clock();
    107     printf("CPU time = %f
    ",(double)(finish - start) / CLOCKS_PER_SEC);
    108 
    109     printf("check---");
    110     for(int i = 0; i< n1*m2;i++)
    111     {
    112         if(fabs(gg1[i] - gg2[i]) > 0.01)
    113         {
    114             printf("%f
     %f
    wrong ans
    ",gg1[i],gg2[i]);
    115             break;
    116         }
    117     }
    118     outPutMatrix('1',gg1,n1,m2);
    119     outPutMatrix('2',gg2,n1,m2);
    120 }
    版本二

    版本一分析:

    在版本一的基础上改成float运算

    版本一测试:

    结果如下,没有太大区别,本来预期是GPU的浮点计算能力会比CPU好很多的,但这里看来,并没有很明显的区别。

  • 相关阅读:
    Python进阶(二)
    python基础(四)
    Python基础(三)
    python基础(二)
    Python基础(一)
    backbone之extend方法(刚明白了点)
    js中url相关
    bootstrap导航、导航条及导航翻页相关详解
    bootstrap中css组件(除导航条)
    bootstrap基础排版优化
  • 原文地址:https://www.cnblogs.com/zhxfl/p/3238277.html
Copyright © 2011-2022 走看看