  • Matrix addition

    ▶ Straightforward matrix addition, with no optimization

      #include <stdio.h>
      #include <stdlib.h>
      #include <malloc.h>
      #include <time.h>
      #include "cuda_runtime.h"
      #include "device_launch_parameters.h"

      #define WIDTH       (1024*1+19)
      #define HEIGHT      (1024*1+17)
      #define THREAD_SIZE 16
      #define BLOCK_SIZE  256
      #define DIMENSION   1   // which grid-stride kernel to launch: 1 selects add1 (1D), anything else selects add2 (2D)
      #define SEED        1   // (unsigned int)time(NULL)

      void checkNULL(void *input)
      {
          if (input == NULL)
          {
              printf("\n\tfind a NULL!");
              exit(1);
          }
          return;
      }

      void checkCudaError(cudaError input)
      {
          if (input != cudaSuccess)
          {
              printf("\n\tfind a cudaError!");
              exit(1);
          }
          return;
      }

      int checkResult(float * in1, float * in2, const int length)
      // Returns 0 if the two vectors are equal, otherwise "index of the first differing element plus one"
      // (so that a mismatch at element 0 is not reported as 0); subtract 1 before using it as an index.
      {
          int i;
          for (i = 0; i < length; i++)
          {
              if (in1[i] != in2[i])
                  return i + 1;
          }
          return 0;
      }

      // 1D grid-stride loop
      __global__ void add1(const float *a, const float *b, float *c)
      {
          int id = blockIdx.x * blockDim.x + threadIdx.x;

          while (id < WIDTH * HEIGHT)
          {
              c[id] = a[id] + b[id];
              id += gridDim.x * blockDim.x;
          }
          return;
      }

      // 2D grid-stride loop
      __global__ void add2(const float *a, const float *b, float *c)
      {
          int idx = blockIdx.x * blockDim.x + threadIdx.x;
          int idy = blockIdx.y * blockDim.y + threadIdx.y;
          int id = idx + idy * gridDim.x * blockDim.x;   // row-major flattening of the 2D thread index

          while (id < WIDTH * HEIGHT)
          {
              c[id] = a[id] + b[id];
              id += gridDim.x * blockDim.x * gridDim.y * blockDim.y;
          }
          return;
      }

      int main()
      {
          int i;
          float *h_a, *h_b, *cpu_out, *gpu_out;
          float *d_a, *d_b, *d_c;
          clock_t timeCPU;
          cudaEvent_t start, stop;
          float timeGPU;
          cudaEventCreate(&start);
          cudaEventCreate(&stop);

          printf("\n\tStart! Matrix dimension:\t%d × %d", WIDTH, HEIGHT);

          checkNULL(h_a = (float *)malloc(sizeof(float)*WIDTH*HEIGHT));
          checkNULL(h_b = (float *)malloc(sizeof(float)*WIDTH*HEIGHT));
          checkNULL(cpu_out = (float *)malloc(sizeof(float)*WIDTH*HEIGHT));
          checkNULL(gpu_out = (float *)malloc(sizeof(float)*WIDTH*HEIGHT));
          checkCudaError(cudaMalloc((float **)&d_a, sizeof(float)*WIDTH*HEIGHT));
          checkCudaError(cudaMalloc((float **)&d_b, sizeof(float)*WIDTH*HEIGHT));
          checkCudaError(cudaMalloc((float **)&d_c, sizeof(float)*WIDTH*HEIGHT));

          // initialization; clock() ticks are printed as ms, which assumes CLOCKS_PER_SEC == 1000 (true for MSVC)
          timeCPU = clock();
          srand(SEED);
          for (i = 0; i < WIDTH*HEIGHT; i++)
          {
              h_a[i] = (float)rand() / RAND_MAX;
              h_b[i] = (float)rand() / RAND_MAX;
          }
          printf("\n\tInitialized! Time:\t\t%8.3f ms\n", (float)(clock() - timeCPU));

          timeCPU = clock();
          for (i = 0; i < WIDTH*HEIGHT; i++)
              cpu_out[i] = h_a[i] + h_b[i];
          printf("\n\tCPU time:\t%8.3f ms\n", (float)(clock() - timeCPU));

          cudaMemset(d_c, 0, sizeof(float) * WIDTH * HEIGHT);
          cudaEventRecord(start, 0);
          cudaMemcpy(d_a, h_a, sizeof(float) * WIDTH * HEIGHT, cudaMemcpyHostToDevice);
          cudaMemcpy(d_b, h_b, sizeof(float) * WIDTH * HEIGHT, cudaMemcpyHostToDevice);
          if (DIMENSION == 1)
              add1 << < BLOCK_SIZE * BLOCK_SIZE, THREAD_SIZE * THREAD_SIZE >> > (d_a, d_b, d_c);
          else
              add2 << < dim3(BLOCK_SIZE, BLOCK_SIZE), dim3(THREAD_SIZE, THREAD_SIZE) >> > (d_a, d_b, d_c);
          cudaMemcpy(gpu_out, d_c, sizeof(float) * WIDTH * HEIGHT, cudaMemcpyDeviceToHost);
          cudaDeviceSynchronize();
          cudaEventRecord(stop, 0);
          cudaEventSynchronize(stop);
          cudaEventElapsedTime(&timeGPU, start, stop);
          if (i = checkResult(cpu_out, gpu_out, WIDTH * HEIGHT))
              printf("\n\tCompute error at i = %d\n\tcpu_out[i] = %10f, gpu_out[i] = %10f\n", i - 1, cpu_out[i - 1], gpu_out[i - 1]);
          else
              printf("\n\tGPU Compute correctly!\n");
          printf("\n\tGPU Time:\t%8.3f ms\n", timeGPU);

          free(h_a);
          free(h_b);
          free(cpu_out);
          free(gpu_out);
          cudaFree(d_a);
          cudaFree(d_b);
          cudaFree(d_c);
          getchar();
          return 0;
      }
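
    ▶ A quick sanity check of the launch configuration, using the constants from the listing: the 1D launch starts BLOCK_SIZE * BLOCK_SIZE = 65,536 blocks of THREAD_SIZE * THREAD_SIZE = 256 threads, about 16.8 million threads in total, while the matrix has only WIDTH * HEIGHT = 1043 * 1041 ≈ 1.09 million elements. The grid therefore already covers the whole matrix, and the grid-stride while loop runs at most one iteration per thread; the stride only matters for larger matrices or smaller launches. The listing can be built with nvcc in the usual way (the file name below is an assumption, not from the original post):

      nvcc -O2 matAdd.cu -o matAdd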

    ▶ Output. For the addition of very large matrices, the lower-left figure shows the 1D grid-stride version and the lower-right the 2D grid-stride version. Initializing the matrices takes a noticeable amount of time when they are large. On very large matrices the GPU gains a clear advantage over the CPU, and the 1D stride version is slightly faster than the 2D one because it skips the more involved index mapping.

    ▶ For smaller matrices the GPU shows no advantage, because memory copies and related overheads account for a large share of the total time.
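
    ▶ To see where the time goes for a small matrix, the host-to-device copies, the kernel, and the device-to-host copy can be timed separately with extra CUDA events. The sketch below is a minimal illustration meant to live in the same file as the listing above (the helper name profilePhases and the three-phase split are mine, not part of the original code); it reuses d_a, d_b, d_c, the add1 kernel, and the WIDTH/HEIGHT/BLOCK_SIZE/THREAD_SIZE macros.

      // Hypothetical helper: time the H2D copies, the kernel, and the D2H copy separately.
      void profilePhases(const float *h_a, const float *h_b, float *gpu_out,
                         float *d_a, float *d_b, float *d_c)
      {
          cudaEvent_t ev[4];                    // ev[0..3] bracket the three phases on the default stream
          float msH2D, msKernel, msD2H;
          for (int k = 0; k < 4; k++)
              cudaEventCreate(&ev[k]);

          cudaEventRecord(ev[0], 0);
          cudaMemcpy(d_a, h_a, sizeof(float) * WIDTH * HEIGHT, cudaMemcpyHostToDevice);
          cudaMemcpy(d_b, h_b, sizeof(float) * WIDTH * HEIGHT, cudaMemcpyHostToDevice);
          cudaEventRecord(ev[1], 0);
          add1<<<BLOCK_SIZE * BLOCK_SIZE, THREAD_SIZE * THREAD_SIZE>>>(d_a, d_b, d_c);
          cudaEventRecord(ev[2], 0);
          cudaMemcpy(gpu_out, d_c, sizeof(float) * WIDTH * HEIGHT, cudaMemcpyDeviceToHost);
          cudaEventRecord(ev[3], 0);
          cudaEventSynchronize(ev[3]);          // wait until the last event has actually been reached

          cudaEventElapsedTime(&msH2D,    ev[0], ev[1]);
          cudaEventElapsedTime(&msKernel, ev[1], ev[2]);
          cudaEventElapsedTime(&msD2H,    ev[2], ev[3]);
          printf("\n\tH2D: %8.3f ms\tkernel: %8.3f ms\tD2H: %8.3f ms\n", msH2D, msKernel, msD2H);

          for (int k = 0; k < 4; k++)
              cudaEventDestroy(ev[k]);
      }

    On small inputs the two copy figures typically dwarf the kernel time, which is why the end-to-end GPU measurement in the listing loses to the plain CPU loop there.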

  • Original article: https://www.cnblogs.com/cuancuancuanhao/p/7654756.html