Created on 2013-8-5
URL : http://blog.sina.com.cn/s/blog_a502f1a30101mjch.html
@author: zhxfl
转载请说明出处
1 #include <stdio.h> 2 #include <time.h> 3 #include <cuda_runtime.h> 4 __global__ void matrixMulCUDA(int *A,int *B,int * C, 5 dim3 dimsA,dim3 dimsB, dim3 dimsC) 6 { 7 int i = blockIdx.x; 8 int j = threadIdx.x; 9 10 for(int k = 0; k < dimsA.y; k++) 11 { 12 C[i * dimsC.y + j] += A[i * dimsA.y + k] * B[k * dimsB.y + j]; 13 //printf("id = %d %d %d A = %d B = %d C = %d ", i,j,k, A[i * dimsA.y + k], 14 // B[k * dimsB.y + j], 15 // C[i * dimsC.y + j]); 16 } 17 } 18 19 int* matrixMultiplyByGpu(int *h_A, int n1,int m1,int *h_B,int n2,int m2) 20 { 21 int *d_A, *d_B, *d_C; 22 int *h_C; 23 24 dim3 dimsA(n1,m1); 25 dim3 dimsB(n2,m2); 26 dim3 dimsC(n1,m2); 27 28 int mem_size_A = dimsA.x * dimsA.y * sizeof(int); 29 int mem_size_B = dimsB.x * dimsB.y * sizeof(int); 30 int mem_size_C = dimsC.x * dimsC.y * sizeof(int); 31 32 cudaMalloc((void**)&d_A, mem_size_A); 33 cudaMalloc((void**)&d_B, mem_size_B); 34 cudaMalloc((void**)&d_C, mem_size_C); 35 36 cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice); 37 cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice); 38 39 h_C = (int*)malloc(sizeof(int)*mem_size_C); 40 for(int i = 0; i<dimsC.x * dimsC.y;i++)h_C[i] = 0; 41 cudaMemcpy(d_C, h_C, mem_size_C, cudaMemcpyHostToDevice); 42 dim3 grid(dimsC.x,dimsC.y); 43 matrixMulCUDA<<<dimsC.x,dimsC.y>>>(d_A,d_B,d_C,dimsA,dimsB,dimsC); 44 cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost); 45 cudaFree(d_A); 46 cudaFree(d_B); 47 cudaFree(d_C); 48 return h_C; 49 } 50 51 int* matrixMultiplyByCpu(int *h_A, int n1,int m1,int *h_B,int n2,int m2) 52 { 53 int *h_C = new int [n1 * m2]; 54 for(int i = 0; i < n1 * m2; i++)h_C[i] = 0; 55 56 for(int i = 0; i < n1; i ++) 57 { 58 for(int j = 0; j < m2; j++) 59 { 60 for(int k = 0; k < m1; k++) 61 { 62 //h_C[i][j] = h_A[i][k] * h_B[k][j]; 63 h_C[i * m2 + j] += h_A[i * m1 + k] * h_B[k * m2 + j]; 64 } 65 } 66 } 67 return h_C; 68 } 69 70 void outPutMatrix(char c,int *g, int n,int m) 71 { 72 return; 73 printf("matrix %c [%3d %3d] ", c, n, m); 74 for(int i = 0; i < n * m;i++) 75 { 76 printf("%5d ", g[i]); 77 if((i + 1) % m == 0)printf(" "); 78 } 79 } 80 81 const int base = 1000; 82 const int large = 2; 83 int main() 84 { 85 int n1 = base; 86 int m1 = base + 1; 87 int n2 = m1; 88 int m2 = base; 89 int *g1 = new int[n1 * m1]; 90 int *g2 = new int[n2 * m2]; 91 for(int i = 0; i < n1 * m1;i++)g1[i] = rand() % large; 92 for(int i = 0; i < n2 * m2;i++)g2[i] = rand() % large; 93 outPutMatrix('A',g1,n1,m1); 94 outPutMatrix('B',g2,n2,m2); 95 int *gg1,*gg2; 96 97 clock_t start, finish; 98 99 start = clock(); 100 gg1 = matrixMultiplyByGpu(g1,n1,m1,g2,n2,m2); 101 finish = clock(); 102 printf("GPU time = %f ",(double)(finish - start) / CLOCKS_PER_SEC); 103 104 start = clock(); 105 gg2 = matrixMultiplyByCpu(g1,n1,m1,g2,n2,m2); 106 finish = clock(); 107 printf("CPU time = %f ",(double)(finish - start) / CLOCKS_PER_SEC); 108 109 printf("check---"); 110 for(int i = 0; i< n1*m2;i++) 111 { 112 if(gg1[i] != gg2[i]) 113 { 114 printf("wrong ans "); 115 break; 116 } 117 } 118 outPutMatrix('1',gg1,n1,m2); 119 outPutMatrix('2',gg2,n1,m2); 120 }
版本一分析:
n 约等于 maxThreadsPerBlock
这里我们的矩阵空间复杂度大概是o(n^2),两个这样矩阵的乘法复杂度大概是0(n^3),这里使用GPU优化的方案是开启n个block,每个block有n个thread。这样我们的并发量就是n^2,也就是计算复杂度大概是0(n)。
版本一测试:
n 约等于 maxThreadsPerBlock
这里请注意,你的base + 1 < min(maxThreadsPerBlock,maxGridSize[0]),不然将超过cuda的最大计算量,会导致你的计算结果错误。
根据我的机子的情况 n = 1000,运行时间如下,可以看出计算时间大概是13.87倍
1 #include <stdio.h> 2 #include <time.h> 3 #include <cuda_runtime.h> 4 __global__ void matrixMulCUDA(float *A,float *B,float * C, 5 dim3 dimsA,dim3 dimsB, dim3 dimsC) 6 { 7 int i = blockIdx.x; 8 int j = threadIdx.x; 9 10 for(int k = 0; k < dimsA.y; k++) 11 { 12 C[i * dimsC.y + j] += A[i * dimsA.y + k] * B[k * dimsB.y + j]; 13 //printf("id = %d %d %d A = %d B = %d C = %d ", i,j,k, A[i * dimsA.y + k], 14 // B[k * dimsB.y + j], 15 // C[i * dimsC.y + j]); 16 } 17 } 18 19 float* matrixMultiplyByGpu(float *h_A, int n1,int m1,float *h_B,int n2,int m2) 20 { 21 float *d_A, *d_B, *d_C; 22 float *h_C; 23 24 dim3 dimsA(n1,m1); 25 dim3 dimsB(n2,m2); 26 dim3 dimsC(n1,m2); 27 28 int mem_size_A = dimsA.x * dimsA.y * sizeof(float); 29 int mem_size_B = dimsB.x * dimsB.y * sizeof(float); 30 int mem_size_C = dimsC.x * dimsC.y * sizeof(float); 31 32 cudaMalloc((void**)&d_A, mem_size_A); 33 cudaMalloc((void**)&d_B, mem_size_B); 34 cudaMalloc((void**)&d_C, mem_size_C); 35 36 cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice); 37 cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice); 38 39 h_C = (float*)malloc(sizeof(float)*mem_size_C); 40 for(int i = 0; i<dimsC.x * dimsC.y;i++)h_C[i] = 0; 41 cudaMemcpy(d_C, h_C, mem_size_C, cudaMemcpyHostToDevice); 42 dim3 grid(dimsC.x,dimsC.y); 43 matrixMulCUDA<<<dimsC.x,dimsC.y>>>(d_A,d_B,d_C,dimsA,dimsB,dimsC); 44 cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost); 45 cudaFree(d_A); 46 cudaFree(d_B); 47 cudaFree(d_C); 48 return h_C; 49 } 50 51 float* matrixMultiplyByCpu(float *h_A, int n1,int m1,float *h_B,int n2,int m2) 52 { 53 float *h_C = new float [n1 * m2]; 54 for(int i = 0; i < n1 * m2; i++)h_C[i] = 0; 55 56 for(int i = 0; i < n1; i ++) 57 { 58 for(int j = 0; j < m2; j++) 59 { 60 for(int k = 0; k < m1; k++) 61 { 62 //h_C[i][j] = h_A[i][k] * h_B[k][j]; 63 h_C[i * m2 + j] += h_A[i * m1 + k] * h_B[k * m2 + j]; 64 } 65 } 66 } 67 return h_C; 68 } 69 70 void outPutMatrix(char c,float *g, int n,int m) 71 { 72 return; 73 printf("matrix %c [%3d %3d] ", c, n, m); 74 for(int i = 0; i < n * m;i++) 75 { 76 printf("%5f ", g[i]); 77 if((i + 1) % m == 0)printf(" "); 78 } 79 } 80 81 const int base = 1000; 82 const int large = 2; 83 int main() 84 { 85 int n1 = base; 86 int m1 = base + 1; 87 int n2 = m1; 88 int m2 = base; 89 float *g1 = new float[n1 * m1]; 90 float *g2 = new float[n2 * m2]; 91 for(int i = 0; i < n1 * m1;i++)g1[i] = rand() % large + 1.0f / 3.0f; 92 for(int i = 0; i < n2 * m2;i++)g2[i] = rand() % large + 1.0f / 3.0f; 93 outPutMatrix('A',g1,n1,m1); 94 outPutMatrix('B',g2,n2,m2); 95 float *gg1,*gg2; 96 97 clock_t start, finish; 98 99 start = clock(); 100 gg1 = matrixMultiplyByGpu(g1,n1,m1,g2,n2,m2); 101 finish = clock(); 102 printf("GPU time = %f ",(double)(finish - start) / CLOCKS_PER_SEC); 103 104 start = clock(); 105 gg2 = matrixMultiplyByCpu(g1,n1,m1,g2,n2,m2); 106 finish = clock(); 107 printf("CPU time = %f ",(double)(finish - start) / CLOCKS_PER_SEC); 108 109 printf("check---"); 110 for(int i = 0; i< n1*m2;i++) 111 { 112 if(fabs(gg1[i] - gg2[i]) > 0.01) 113 { 114 printf("%f %f wrong ans ",gg1[i],gg2[i]); 115 break; 116 } 117 } 118 outPutMatrix('1',gg1,n1,m2); 119 outPutMatrix('2',gg2,n1,m2); 120 }
版本一分析:
在版本一的基础上改成float运算
版本一测试:
结果如下,没有太大区别,本来预期是GPU的浮点计算能力会比CPU好很多的,但这里看来,并没有很明显的区别。