  • 1D Convolution

    This post demonstrates three different GPU implementations of 1D convolution: a simple (global-memory) version, a shared-memory version with halo elements, and a shared-memory version without halo elements. It also improves the CPU 1D convolution routine so that boundary cases no longer need to be handled separately.
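
    For example, with mask_size = 7 the inner loop of convolutionCPU below runs over j from -MIN(3, i) to MIN(3, array_size - 1 - i): at i = 0 this gives j = 0..3, and at i = array_size - 1 it gives j = -3..0, so in[i + j] never indexes outside the array and no separate boundary branch is required.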

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>
    #include <math.h>
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"

    #define ARRAY_SIZE      (1024*73+27)
    #define MASK_SIZE       7   // must be odd
    #define WIDTH           64
    #define SEED            1   // or (unsigned int)clock()
    #define MIN(x,y)        ((x)<(y)?(x):(y))
    #define CEIL(x,y)       (((x) - 1) / (y) + 1)

    typedef int format;    // int or float

    __constant__ format d_mask[MASK_SIZE];

    void checkCudaError(cudaError input)
    {
        if (input != cudaSuccess)
        {
            printf("\n\tfind a cudaError!");
            exit(1);
        }
        return;
    }

    // returns the index of the first mismatch, or 0 if the two arrays agree
    int checkResult(format * in1, format * in2, const int length)
    {
        for (int i = 0; i < length; i++)
        {
            if (in1[i] != in2[i])
                return i;
        }
        return 0;
    }

    void convolutionCPU(const format *in, const format *mask, format *out, const int array_size, const int mask_size)
    {
        for (int i = 0; i < array_size; i++)// outer loop over array elements
        {
            out[i] = 0;
            for (int j = -MIN(mask_size / 2, i); j <= MIN(mask_size / 2, array_size - 1 - i); j++)// clamped loop bounds: no separate handling of boundary cases needed
                out[i] += in[i + j] * mask[mask_size / 2 + j];
        }
        return;
    }

    __global__ void convolutionGPU1(const format *in, format *out, const int array_size, const int mask_size)
    {
        int id = blockIdx.x * blockDim.x + threadIdx.x;
        if (id < array_size)
        {
            format sum = 0;
            for (int j = 0; j < mask_size; j++)// loop over mask elements
                sum += (id - mask_size / 2 + j >= 0 && id - mask_size / 2 + j < array_size) ? in[id - mask_size / 2 + j] * d_mask[j] : 0;
            out[id] = sum;
        }
        return;
    }

    __global__ void convolutionGPU2(const format *in, format *out, const int array_size, const int mask_size)
    {
        extern __shared__ format share_in[];// shared tile that includes the halo elements on both ends
        int id = blockIdx.x * blockDim.x + threadIdx.x;

        // fill the leading mask_size / 2 halo elements with the first mask_size / 2 threads,
        // keeping thread order equal to memory order for efficient (coalesced) global loads
        if (threadIdx.x < mask_size / 2)
            share_in[threadIdx.x] = (id - mask_size / 2 >= 0) ? in[id - mask_size / 2] : 0;

        share_in[mask_size / 2 + threadIdx.x] = (id < array_size) ? in[id] : 0;// main part, shifted up by mask_size / 2

        // fill the trailing mask_size / 2 halo elements with the last mask_size / 2 threads;
        // the target index mask_size / 2 + blockDim.x + (threadIdx.x - (blockDim.x - mask_size / 2)) simplifies to threadIdx.x + mask_size - 1
        if (threadIdx.x >= blockDim.x - mask_size / 2)
            share_in[threadIdx.x + mask_size - 1] = (id + mask_size / 2 < array_size) ? in[id + mask_size / 2] : 0;

        __syncthreads();// every thread must reach the barrier, so it stays outside the id guard

        if (id < array_size)
        {
            format sum = 0;
            for (int j = 0; j < mask_size; j++)// convolution: loop over mask elements
                sum += share_in[threadIdx.x + j] * d_mask[j];
            out[id] = sum;
        }
        return;
    }

    __global__ void convolutionGPU3(const format *in, format *out, const int array_size, const int mask_size)
    {
        extern __shared__ format share_in[];// shared tile without the halo elements at either end
        int id = blockIdx.x * blockDim.x + threadIdx.x;

        if (id < array_size)
            share_in[threadIdx.x] = in[id];
        __syncthreads();// every thread must reach the barrier, hence outside the id guard

        if (id < array_size)
        {
            int in_j, j;// in_j is the global index of the j-th input element used in this thread's convolution
            format sum = 0;

            for (j = 0; j < mask_size; j++)
            {
                in_j = id - mask_size / 2 + j;
                if (in_j >= 0 && in_j < array_size)// read from shared memory when the element belongs to this block, otherwise fall back to global memory
                    sum += (in_j >= blockIdx.x*blockDim.x && in_j < (blockIdx.x + 1)*blockDim.x) ?
                        share_in[threadIdx.x + j - mask_size / 2] * d_mask[j] : in[in_j] * d_mask[j];
            }
            out[id] = sum;
        }
        return;
    }

    int main()
    {
        int i;
        static format h_in[ARRAY_SIZE], h_mask[MASK_SIZE], cpu_out[ARRAY_SIZE], gpu_out[ARRAY_SIZE];// static: these arrays are too large for the default stack
        format *d_in, *d_out;
        clock_t time;
        cudaEvent_t start, stop;
        float elapsedTime1, elapsedTime2, elapsedTime3;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);

        checkCudaError(cudaMalloc((void **)&d_in, sizeof(format) * ARRAY_SIZE));
        checkCudaError(cudaMalloc((void **)&d_out, sizeof(format) * ARRAY_SIZE));
        // d_mask lives in constant memory: it needs no cudaMalloc/cudaFree and is filled below with cudaMemcpyToSymbol

        srand(SEED);
        for (i = 0; i < ARRAY_SIZE; i++)
            h_in[i] = (rand() - RAND_MAX / 2);
        for (i = 0; i < MASK_SIZE; i++)
            h_mask[i] = 1;

        time = clock();
        convolutionCPU(h_in, h_mask, cpu_out, ARRAY_SIZE, MASK_SIZE);
        time = clock() - time;

        cudaMemcpy(d_in, h_in, sizeof(format) * ARRAY_SIZE, cudaMemcpyHostToDevice);
        cudaMemcpyToSymbol(d_mask, h_mask, sizeof(format) * MASK_SIZE);

        cudaMemset(d_out, 0, sizeof(format) * ARRAY_SIZE);
        cudaEventRecord(start, 0);
        convolutionGPU1 <<< CEIL(ARRAY_SIZE, WIDTH), WIDTH >>> (d_in, d_out, ARRAY_SIZE, MASK_SIZE);
        cudaMemcpy(gpu_out, d_out, sizeof(format) * ARRAY_SIZE, cudaMemcpyDeviceToHost);
        cudaDeviceSynchronize();
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&elapsedTime1, start, stop);
        if ((i = checkResult(cpu_out, gpu_out, ARRAY_SIZE)) != 0)
            printf("\n\tCompute error at i = %d\n\tcpu_out[i] = %10d, gpu_out[i] = %10d\n", i, cpu_out[i], gpu_out[i]);
        else
            printf("\n\tGPU1 Compute correctly!\n");

        cudaMemset(d_out, 0, sizeof(format) * ARRAY_SIZE);
        cudaEventRecord(start, 0);
        convolutionGPU2 <<< CEIL(ARRAY_SIZE, WIDTH), WIDTH, sizeof(format) * (WIDTH + MASK_SIZE - 1) >>> (d_in, d_out, ARRAY_SIZE, MASK_SIZE);
        cudaMemcpy(gpu_out, d_out, sizeof(format) * ARRAY_SIZE, cudaMemcpyDeviceToHost);
        cudaDeviceSynchronize();
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&elapsedTime2, start, stop);
        if ((i = checkResult(cpu_out, gpu_out, ARRAY_SIZE)) != 0)
            printf("\n\tCompute error at i = %d\n\tcpu_out[i] = %10d, gpu_out[i] = %10d\n", i, cpu_out[i], gpu_out[i]);
        else
            printf("\n\tGPU2 Compute correctly!\n");

        cudaMemset(d_out, 0, sizeof(format) * ARRAY_SIZE);
        cudaEventRecord(start, 0);
        convolutionGPU3 <<< CEIL(ARRAY_SIZE, WIDTH), WIDTH, sizeof(format) * WIDTH >>> (d_in, d_out, ARRAY_SIZE, MASK_SIZE);
        cudaMemcpy(gpu_out, d_out, sizeof(format) * ARRAY_SIZE, cudaMemcpyDeviceToHost);
        cudaDeviceSynchronize();
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&elapsedTime3, start, stop);
        if ((i = checkResult(cpu_out, gpu_out, ARRAY_SIZE)) != 0)
            printf("\n\tCompute error at i = %d\n\tcpu_out[i] = %10d, gpu_out[i] = %10d\n", i, cpu_out[i], gpu_out[i]);
        else
            printf("\n\tGPU3 Compute correctly!\n");

        printf("\n\tElapsed time:\n\tCPU:\t%10ld ms\n\tGPU1:\t%10.2f ms\n\tGPU2:\t%10.2f ms\n\tGPU3:\t%10.2f ms\n",
            (long)(time * 1000 / CLOCKS_PER_SEC), elapsedTime1, elapsedTime2, elapsedTime3);

        cudaFree(d_in);
        cudaFree(d_out);
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
        getchar();
        return 0;
    }
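
    As a concrete picture of the halo layout used by convolutionGPU2 above: with WIDTH = 64 and MASK_SIZE = 7, the dynamic shared array of each block holds WIDTH + MASK_SIZE - 1 = 70 elements, arranged for block b as follows (elements outside the input array are stored as 0):

        share_in[ 0 ..  2]  <-  in[b*64 -  3 .. b*64 -  1]   // leading halo
        share_in[ 3 .. 66]  <-  in[b*64      .. b*64 + 63]   // the block's own 64 elements
        share_in[67 .. 69]  <-  in[b*64 + 64 .. b*64 + 66]   // trailing halo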

    ▶ The output is shown in the figure below. The timing part is still somewhat off (how should a single pair of cudaEvent_t start, stop be used to time several events?). All three methods produce correct results; a proper performance comparison will follow once the timing issue is resolved.
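
    On the timing question: a single cudaEvent_t pair can be reused as often as needed, because each cudaEventRecord simply overwrites the event's previous timestamp. A minimal sketch of how the measurement in main could be restructured, recording stop immediately after the launch so the device-to-host copy is not counted (ideally after one untimed warm-up launch, since the first kernel also pays one-time initialization costs):

        cudaEventRecord(start, 0);
        convolutionGPU1 <<< CEIL(ARRAY_SIZE, WIDTH), WIDTH >>> (d_in, d_out, ARRAY_SIZE, MASK_SIZE);
        cudaEventRecord(stop, 0);                          // record right after the launch, before the memcpy
        cudaEventSynchronize(stop);                        // block until the kernel and the stop event have completed
        cudaEventElapsedTime(&elapsedTime1, start, stop);
        cudaMemcpy(gpu_out, d_out, sizeof(format) * ARRAY_SIZE, cudaMemcpyDeviceToHost);

        // the very same start/stop pair is then reused for the next kernel
        cudaEventRecord(start, 0);
        convolutionGPU2 <<< CEIL(ARRAY_SIZE, WIDTH), WIDTH, sizeof(format) * (WIDTH + MASK_SIZE - 1) >>> (d_in, d_out, ARRAY_SIZE, MASK_SIZE);
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&elapsedTime2, start, stop);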
