  • Matrix multiplication

    ▶ Compute the matrix-matrix product A(m×n) × B(n×p) = C(m×p).

    ▶ Naive matrix multiplication: each thread computes one element of the result matrix.
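
    For reference, every kernel below computes the standard element-wise definition; with all matrices stored row-major in flat arrays (as in the code), element (i, j) of C is

        c[i * P + j] = sum over k = 0 .. N-1 of  a[i * N + k] * b[k * P + j]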

      #include <stdio.h>
      #include <stdlib.h>
      #include <malloc.h>
      #include <time.h>
      #include "cuda_runtime.h"
      #include "device_launch_parameters.h"

      #define M           (1024 + 13)
      #define N           (1024 + 31)
      #define P           (1024 + 7)
      #define THREAD      16
      #define SEED        (unsigned int)clock()
      #define CEIL(x,y)   (( x - 1 ) /  y + 1)// integer ceiling division, e.g. CEIL(1031, 16) == 65

      void checkNULL(void *input)
      {
          if (input == NULL)
          {
              printf("\n\tfind a NULL!");
              exit(1);
          }
          return;
      }

      void checkCudaError(cudaError input)
      {
          if (input != cudaSuccess)
          {
              printf("\n\tfind a cudaError!");
              exit(1);
          }
          return;
      }

      __global__ void times(float *a, float *b, float *c)// A(m×n) × B(n×p) = C(m×p)
      {
          int k;
          float temp;
          int idx = blockIdx.x * blockDim.x + threadIdx.x;// this thread's column index in B and C
          int idy = blockIdx.y * blockDim.y + threadIdx.y;// this thread's row index in A and C

          for (; idy < M; idy += gridDim.y * blockDim.y)
          {
              // grid-stride loop over the columns; restart from idx for every row handled by this thread
              for (int col = idx; col < P; col += gridDim.x * blockDim.x)
              {
                  for (k = 0, temp = 0.0f; k < N; k++)
                      temp += a[idy * N + k] * b[k * P + col];
                  c[idy * P + col] = temp;
              }
          }
          return;
      }

      int main()
      {
          int i, j, k, flag;
          float *a, *b, *c, *dev_a, *dev_b, *dev_c, temp, elapsedTime;
          clock_t time, cputime;
          cudaEvent_t start, stop;
          cudaEventCreate(&start);
          cudaEventCreate(&stop);

          printf("\n\tMatrix size: A(%d, %d) × B(%d, %d) == C(%d, %d)", M, N, N, P, M, P);
          printf("\n\tStart!");

          a = (float *)malloc(sizeof(float) * M * N);
          b = (float *)malloc(sizeof(float) * N * P);
          c = (float *)malloc(sizeof(float) * M * P);
          checkNULL(a);
          checkNULL(b);
          checkNULL(c);
          checkCudaError(cudaMalloc((float **)&dev_a, sizeof(float) * M * N));
          checkCudaError(cudaMalloc((float **)&dev_b, sizeof(float) * N * P));
          checkCudaError(cudaMalloc((float **)&dev_c, sizeof(float) * M * P));

          time = clock();
          srand(SEED);
          for (i = 0; i < M * N; a[i] = (float)rand() / RAND_MAX, i++);
          for (i = 0; i < N * P; b[i] = (float)rand() / RAND_MAX, i++);
          printf("\n\tInitialized! Time:\t%6.2f ms", (float)(clock() - time));

          // compute on the CPU
          time = clock();
          for (i = 0; i < M; i++)
          {
              for (j = 0; j < P; j++)
              {
                  // accumulate in temp (c comes from malloc and is not zero-initialized)
                  for (k = 0, temp = 0.0f; k < N; k++)
                      temp += a[i * N + k] * b[k * P + j];
                  c[i * P + j] = temp;
              }
          }
          cputime = clock() - time;
          printf("\n\tCPU cost time:\t%6.2f ms", (float)cputime);

          // compute on the GPU
          time = clock();
          cudaEventRecord(start, 0);

          cudaMemcpy(dev_a, a, sizeof(float) * M * N, cudaMemcpyHostToDevice);
          cudaMemcpy(dev_b, b, sizeof(float) * N * P, cudaMemcpyHostToDevice);

          times <<< dim3(CEIL(P, THREAD), CEIL(M, THREAD), 1), dim3(THREAD, THREAD) >>> (dev_a, dev_b, dev_c);

          cudaMemcpy(c, dev_c, sizeof(float) * M * P, cudaMemcpyDeviceToHost);
          cudaDeviceSynchronize();

          time = clock() - time;
          cudaEventRecord(stop, 0);

          cudaEventSynchronize(stop);
          cudaEventElapsedTime(&elapsedTime, start, stop);
          printf("\n\tGPU cost time by CPU API:\t%6.2f ms\n\tGPU cost time by GPU API:\t%6.2f ms\n\n", (float)time, elapsedTime);
          printf("\n\tAccRatio =\t%3.1f\n\n", (float)cputime / time);

          // Verify the result (the earlier CPU result in c was overwritten by the copy back
          // from the GPU, so recompute here and check element by element, using exact float equality)
          for (i = 0, flag = 1; i < M; i++)
          {
              for (j = 0; j < P; j++)
              {
                  for (k = 0, temp = 0.0f; k < N; k++)
                      temp += a[i * N + k] * b[k * P + j];
                  if (temp != c[i * P + j])
                  {
                      printf("\n\tCalculate error at (%d,%d),\n\t\tc[i] == %f, CPU == %f", i, j, c[i * P + j], temp);
                      flag = 0;
                  }
              }
          }
          if (flag == 1)
              printf("\n\tCalculate correctly!");

          cudaEventDestroy(start);
          cudaEventDestroy(stop);
          cudaFree(dev_a);
          cudaFree(dev_b);
          cudaFree(dev_c);
          getchar();
          return 0;
      }

     ● Output

            Matrix size: A(1037, 1055) × B(1055, 1031) == C(1037, 1031)
            Start!
            Initialized! Time:      127.00 ms
            CPU cost time:  4049.00 ms
            GPU cost time by CPU API:       103.00 ms
            GPU cost time by GPU API:       103.02 ms
    
    
            AccRatio =      39.3
    
    
            Calculate correctly!

    ● Matrix initialization time: for very large matrices the initialization itself takes noticeable time, because the elements are filled in one by one on the CPU.

    ● Comparing the CPU-side and GPU-side timings of the GPU run: in the code above (the clock starts before the host-to-device copies and stops after the copy back to the host finishes), the two timing methods differ very little and can be treated as equal.

    ● The output also lists the time for the same matrix multiplication on the CPU; the GPU is currently about 39 times faster than the CPU.

    ● If the memory-copy time is excluded, the GPU time drops to roughly 100 ms. Note that to measure this with CPU-side timing, a cudaDeviceSynchronize(); must be inserted before the second timestamp: a kernel launch is asynchronous, and the host code keeps running until it reaches a synchronization statement or something that needs the GPU result. In the example above, without the synchronization the measured time would always be 0 ms (see the sketch after this list).

    ● The thread granularity THREAD can be tuned further; the run time varies within a small range.
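
    ● A minimal sketch of the kernel-only timing mentioned above, reusing the names from the full listing (dev_a and dev_b are assumed to have been copied to the device already; as in the original code, clock() ticks are treated as milliseconds, which holds on Windows where CLOCKS_PER_SEC is 1000):

      clock_t t0 = clock();
      times <<< dim3(CEIL(P, THREAD), CEIL(M, THREAD), 1), dim3(THREAD, THREAD) >>> (dev_a, dev_b, dev_c);
      cudaDeviceSynchronize();// wait for the kernel to finish; without this the launch returns immediately and the measured time is always 0 ms
      printf("\n\tKernel-only time:\t%6.2f ms", (float)(clock() - t0));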

    ▶ Improvement 1: use shared memory to reduce global-memory traffic

      __global__ void times(float *a, float *b, float *c)// A(m×n) × B(n×p) = C(m×p)
      {
          __shared__ float shareA[THREAD * THREAD];
          __shared__ float shareB[THREAD * THREAD];

          int k, t;
          float temp;
          int idx = blockIdx.x * blockDim.x + threadIdx.x;// this thread's column index in B and C
          int idy = blockIdx.y * blockDim.y + threadIdx.y;// this thread's row index in A and C

          for (k = 0, temp = 0.0f; k < CEIL(N, THREAD); k++)// walk along the N dimension in tiles of width THREAD; k is the tile index
          {
              shareA[threadIdx.y * THREAD + threadIdx.x] = (idy < M && k * THREAD + threadIdx.x < N) ? a[idy * N + k * THREAD + threadIdx.x] : 0.0f;
              shareB[threadIdx.y * THREAD + threadIdx.x] = (idx < P && k * THREAD + threadIdx.y < N) ? b[(k * THREAD + threadIdx.y) * P + idx] : 0.0f;
              __syncthreads();

              for (t = 0; t < THREAD; t++)// accumulate the partial dot product over the current tile
                  temp += shareA[threadIdx.y * THREAD + t] * shareB[t * THREAD + threadIdx.x];
              __syncthreads();
          }
          if (idx < P && idy < M)
              c[idy * P + idx] = temp;
          return;
      }
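
    ● The improved kernel keeps the same signature as the naive one, so the launch in main is unchanged; the only constraint is that the block must be exactly THREAD × THREAD threads, because each block fills a THREAD × THREAD shared tile of A and of B:

      times <<< dim3(CEIL(P, THREAD), CEIL(M, THREAD), 1), dim3(THREAD, THREAD) >>> (dev_a, dev_b, dev_c);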

    ● Output

            Matrix size: A(1037, 1055) × B(1055, 1031) == C(1037, 1031)
            Start!
            Initialized! Time:      123.00 ms
            CPU cost time:  4044.00 ms
            GPU cost time by CPU API:       147.00 ms
            GPU cost time by GPU API:       146.75 ms
    
    
            AccRatio =      27.5
    
    
            Calculate correctly!

    ▶ Improvement 2: adjust the shared-memory storage layout (store the A tile transposed)

      __global__ void times(float *a, float *b, float *c)// A(m×n) × B(n×p) = C(m×p)
      {
          __shared__ float shareA[THREAD * THREAD];
          __shared__ float shareB[THREAD * THREAD];

          int k, t;
          float temp;
          int idx = blockIdx.x * blockDim.x + threadIdx.x;// this thread's column index in B and C
          int idy = blockIdx.y * blockDim.y + threadIdx.y;// this thread's row index in A and C

          for (k = 0, temp = 0.0f; k < CEIL(N, THREAD); k++)// walk along the N dimension in tiles of width THREAD; k is the tile index
          {
              // the A tile is now stored transposed: a is read exactly as before, but written to the transposed position in shared memory
              shareA[threadIdx.x * THREAD + threadIdx.y] = (idy < M && k * THREAD + threadIdx.x < N) ? a[idy * N + k * THREAD + threadIdx.x] : 0.0f;
              shareB[threadIdx.y * THREAD + threadIdx.x] = (idx < P && k * THREAD + threadIdx.y < N) ? b[(k * THREAD + threadIdx.y) * P + idx] : 0.0f;
              __syncthreads();

              for (t = 0; t < THREAD; t++)// note that the row and column indices of the A tile are swapped when reading it back
                  temp += shareA[t * THREAD + threadIdx.y] * shareB[t * THREAD + threadIdx.x];
              __syncthreads();
          }
          if (idx < P && idy < M)
              c[idy * P + idx] = temp;
          return;
      }
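
    ● Only two lines differ from Improvement 1, both concerning the A tile (excerpted from the two kernels above):

      // Improvement 1: A tile stored row-major, read back row by row
      shareA[threadIdx.y * THREAD + threadIdx.x] = ... ;
      temp += shareA[threadIdx.y * THREAD + t] * shareB[t * THREAD + threadIdx.x];

      // Improvement 2: A tile stored transposed, read back with the A-tile indices swapped
      shareA[threadIdx.x * THREAD + threadIdx.y] = ... ;
      temp += shareA[t * THREAD + threadIdx.y] * shareB[t * THREAD + threadIdx.x];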

    ● Output

            Matrix size: A(1037, 1055) × B(1055, 1031) == C(1037, 1031)
            Start!
            Initialized! Time:      121.00 ms
            CPU cost time:  4059.00 ms
            GPU cost time by CPU API:       135.00 ms
            GPU cost time by GPU API:       134.80 ms
    
    
            AccRatio =      30.1
    
    
            Calculate correctly!
  • Original article: https://www.cnblogs.com/cuancuancuanhao/p/7657668.html