  • Prefix sum

    Three different methods are used to compute the prefix sum, and the results are checked against a CPU reference.

        #include <stdio.h>
        #include <stdlib.h>
        #include <windows.h>
        #include <time.h>
        #include <math.h>
        #include "cuda_runtime.h"
        #include "device_launch_parameters.h"

        #define ARRAY_SIZE      (1024*7+419)
        #define SIMPLE_TASK     1024
        #define WIDTH           256
        #define SEED            1   // (unsigned int)clock()
        #define RAND_RANGE      10  // random values are drawn from [-RAND_RANGE, +RAND_RANGE]
        #define MIN(x,y)        ((x)<(y)?(x):(y))
        #define CEIL(x,y)       (int)(((x) - 1) / (y) + 1)

        typedef int format;    // int or float

        void checkCudaError(cudaError input)
        {
            if (input != cudaSuccess)
            {
                printf("\n\tfind a cudaError!");
                exit(1);
            }
            return;
        }

        int ceil_base2(int input)  // round input up to the nearest power of two
        {
            int i;
            for (i = 1; input > 1; input = (input + 1) / 2, i *= 2);
            return i;
        }

        int checkResult(format * in1, format * in2, const int length)
        // Returns 0 if the two vectors are equal, otherwise "index of the first mismatch plus one"
        // (the +1 guards against element 0 being the mismatch); subtract 1 before using the index
        {
            int i;
            for (i = 0; i < length; i++)
            {
                if (in1[i] != in2[i])
                    return i + 1;
            }
            return 0;
        }

        void prefixSumCPU(const format *in, format *out, const int array_size)
        {
            out[0] = in[0];
            for (int i = 1; i < array_size; i++)
                out[i] = out[i - 1] + in[i];
            return;
        }

        __global__ void select(const format *in, format *out, const int space)
        // Picks equally spaced elements from in and writes them into out
        {
            out[threadIdx.x] = (threadIdx.x == 0) ? 0 : in[(threadIdx.x)*space - 1];
            return;
        }

        __global__ void add(format *in, const format * addend, const int array_size)
        // Every element in a thread block is increased by the block's own entry from addend
        {
            __shared__ format share_addend;
            share_addend = addend[blockIdx.x];  // every thread writes the same value, so no barrier is needed here

            int id = blockIdx.x * blockDim.x + threadIdx.x;
            if (id < array_size)
                in[id] += share_addend;
            return;
        }

        __global__ void prefixSumGPU1(const format *in, format *out, const int array_size)
        // Low-efficiency segmented inclusive-scan prefix sum; can handle arbitrary lengths
        {
            extern __shared__ format share_in[];
            int id = blockIdx.x * blockDim.x + threadIdx.x;
            int jump;
            share_in[threadIdx.x] = 0;
            if (id < array_size)
                share_in[threadIdx.x] = in[id];
            __syncthreads();

            for (jump = 1; jump < blockDim.x; jump *= 2)
            {
                share_in[threadIdx.x] += (threadIdx.x >= jump) ? share_in[threadIdx.x - jump] : 0;
                __syncthreads();
            }
            if (id < array_size)
                out[id] = share_in[threadIdx.x];
            return;
        }

        __global__ void prefixSumGPU2(const format *in, format *out, const int array_size)
        // Gather-distribute tree (up-sweep / down-sweep); handles a power-of-two number of elements
        {
            extern __shared__ format share_in[];
            int id = blockIdx.x * blockDim.x + threadIdx.x;
            int target;
            int jump;
            share_in[threadIdx.x] = 0;
            if (id < array_size)
                share_in[threadIdx.x] = in[id];
            __syncthreads();

            /* // Gather tree equivalent to the one below, but it uses scattered threads,
               // so its read/write efficiency is lower
            for (jump = 1; jump < blockDim.x; jump *= 2)
            {
                share_in[threadIdx.x] += ((threadIdx.x + 1) % (2 * jump) == 0) ? share_in[threadIdx.x - jump] : 0;
                __syncthreads();
            }
            */
            for (jump = 1; jump < blockDim.x; jump *= 2)  // gather tree: consecutive threads, fewer divergent branches
            {
                target = (threadIdx.x + 1) * 2 * jump - 1;
                if (target < blockDim.x)
                    share_in[target] += share_in[target - jump];
                __syncthreads();
            }
            for (jump = blockDim.x / 4; jump > 0; jump /= 2)  // distribute tree
            {
                target = (threadIdx.x + 1) * 2 * jump - 1;
                if (target + jump < blockDim.x)
                    share_in[target + jump] += share_in[target];
                __syncthreads();
            }
            if (id < array_size)
                out[id] = share_in[threadIdx.x];
            return;
        }

        __global__ void prefixSumGPU3(const format *in, format *out, const int array_size)
        // Improved gather-distribute tree; each block handles twice as many elements as it has threads
        {
            extern __shared__ format share_in[];
            int id = blockIdx.x * blockDim.x + threadIdx.x;
            int target;
            int jump;
            share_in[2 * threadIdx.x] = 0;
            share_in[2 * threadIdx.x + 1] = 0;
            if (2 * id < array_size)  // changed vs. GPU2: each thread loads two global-memory elements,
                share_in[2 * threadIdx.x] = in[2 * id];  // with per-element guards so an odd array_size stays in bounds
            if (2 * id + 1 < array_size)
                share_in[2 * threadIdx.x + 1] = in[2 * id + 1];
            __syncthreads();

            for (jump = 1; jump < 2 * blockDim.x; jump *= 2)  // changed: relaxed the upper bound on jump
            {
                target = (threadIdx.x + 1) * 2 * jump - 1;
                if (target < 2 * blockDim.x)  // changed: relaxed the range of receiving indices
                    share_in[target] += share_in[target - jump];
                __syncthreads();
            }
            for (jump = blockDim.x / 2; jump > 0; jump /= 2)  // changed: relaxed the initial value of jump
            {
                target = (threadIdx.x + 1) * 2 * jump - 1;
                if (target + jump < 2 * blockDim.x)  // changed: relaxed the range of receiving indices
                    share_in[target + jump] += share_in[target];
                __syncthreads();
            }
            if (2 * id < array_size)  // changed: each thread stores two global-memory elements, same per-element guards
                out[2 * id] = share_in[2 * threadIdx.x];
            if (2 * id + 1 < array_size)
                out[2 * id + 1] = share_in[2 * threadIdx.x + 1];
            return;
        }

        int main()
        {
            int i;
            int simple_task = MIN(ceil_base2(ARRAY_SIZE), SIMPLE_TASK);  // round the array size up to the next power of two, capped at 1024 elements
            format h_in[ARRAY_SIZE], cpu_out[ARRAY_SIZE], gpu_out[ARRAY_SIZE];
            format *d_in, *d_out, *d_temp;

            cudaStream_t stream;
            cudaStreamCreate(&stream);
            clock_t time;
            cudaEvent_t start, stop;
            float elapsedTime1, elapsedTime2, elapsedTime3, elapsedTime4;
            cudaEventCreate(&start);
            cudaEventCreate(&stop);

            checkCudaError(cudaMalloc((void **)&d_in, sizeof(format) * ARRAY_SIZE));
            checkCudaError(cudaMalloc((void **)&d_out, sizeof(format) * ARRAY_SIZE));
            int d_temp_length = CEIL(ARRAY_SIZE, WIDTH);  // number of thread blocks the array is split into on the first pass
            checkCudaError(cudaMalloc((void **)&d_temp, sizeof(format) * d_temp_length));

            srand(SEED);
            for (i = 0; i < ARRAY_SIZE; i++)
                h_in[i] = rand() % (2 * RAND_RANGE + 1) - RAND_RANGE;  // uniform value in [-RAND_RANGE, +RAND_RANGE]

            time = clock();
            prefixSumCPU(h_in, cpu_out, ARRAY_SIZE);
            time = clock() - time;

            cudaMemcpy(d_in, h_in, sizeof(format) * ARRAY_SIZE, cudaMemcpyHostToDevice);

            cudaMemset(d_out, 0, sizeof(format) * ARRAY_SIZE);
            cudaEventRecord(start, 0);
            prefixSumGPU1 <<< 1, simple_task, sizeof(format) * simple_task >>> (d_in, d_out, MIN(ARRAY_SIZE, SIMPLE_TASK));
            cudaMemcpy(gpu_out, d_out, sizeof(format) * MIN(ARRAY_SIZE, SIMPLE_TASK), cudaMemcpyDeviceToHost);
            cudaDeviceSynchronize();
            cudaEventRecord(stop, 0);
            cudaEventSynchronize(stop);
            cudaEventElapsedTime(&elapsedTime1, start, stop);
            if (i = checkResult(cpu_out, gpu_out, MIN(ARRAY_SIZE, SIMPLE_TASK)))
                printf("\n\tCompute error at i = %d\n\tcpu_out[i] = %10d, gpu_out[i] = %10d\n", i - 1, cpu_out[i - 1], gpu_out[i - 1]);
            else
                printf("\n\tGPU1 Compute correctly!\n");

            cudaMemset(d_out, 0, sizeof(format) * ARRAY_SIZE);
            cudaEventRecord(start, 0);
            prefixSumGPU2 <<< 1, simple_task, sizeof(format) * simple_task >>> (d_in, d_out, MIN(ARRAY_SIZE, SIMPLE_TASK));
            cudaMemcpy(gpu_out, d_out, sizeof(format) * MIN(ARRAY_SIZE, SIMPLE_TASK), cudaMemcpyDeviceToHost);
            cudaDeviceSynchronize();
            cudaEventRecord(stop, 0);
            cudaEventSynchronize(stop);
            cudaEventElapsedTime(&elapsedTime2, start, stop);
            if (i = checkResult(cpu_out, gpu_out, MIN(ARRAY_SIZE, SIMPLE_TASK)))
                printf("\n\tCompute error at i = %d\n\tcpu_out[i] = %10d, gpu_out[i] = %10d\n", i - 1, cpu_out[i - 1], gpu_out[i - 1]);
            else
                printf("\n\tGPU2 Compute correctly!\n");

            cudaMemset(d_out, 0, sizeof(format) * ARRAY_SIZE);
            cudaEventRecord(start, 0);
            prefixSumGPU3 <<< 1, simple_task / 2, sizeof(format) * simple_task >>> (d_in, d_out, MIN(ARRAY_SIZE, SIMPLE_TASK));
            cudaMemcpy(gpu_out, d_out, sizeof(format) * MIN(ARRAY_SIZE, SIMPLE_TASK), cudaMemcpyDeviceToHost);
            cudaDeviceSynchronize();
            cudaEventRecord(stop, 0);
            cudaEventSynchronize(stop);
            cudaEventElapsedTime(&elapsedTime3, start, stop);
            if (i = checkResult(cpu_out, gpu_out, MIN(ARRAY_SIZE, SIMPLE_TASK)))
                printf("\n\tCompute error at i = %d\n\tcpu_out[i] = %10d, gpu_out[i] = %10d\n", i - 1, cpu_out[i - 1], gpu_out[i - 1]);
            else
                printf("\n\tGPU3 Compute correctly!\n");

            // Prefix sum of a long vector, assuming its length is at most WIDTH^2 (two passes suffice)
            cudaMemset(d_out, 0, sizeof(format) * ARRAY_SIZE);
            cudaEventRecord(start, 0);
            prefixSumGPU3 <<< d_temp_length, WIDTH / 2, sizeof(format) * WIDTH, stream >>> (d_in, d_out, ARRAY_SIZE);
            select <<< 1, d_temp_length, 0, stream >>> (d_out, d_temp, WIDTH);
            // prefixSumGPU3 covers 2*blockDim elements per block, so launch ceil_base2(...)/2 threads with ceil_base2(...) shared elements
            prefixSumGPU3 <<< 1, ceil_base2(d_temp_length) / 2, sizeof(format) * ceil_base2(d_temp_length), stream >>> (d_temp, d_temp, d_temp_length);
            add <<< d_temp_length, WIDTH, 0, stream >>> (d_out, d_temp, ARRAY_SIZE);
            cudaMemcpy(gpu_out, d_out, sizeof(format) * ARRAY_SIZE, cudaMemcpyDeviceToHost);
            cudaDeviceSynchronize();
            cudaEventRecord(stop, 0);
            cudaEventSynchronize(stop);
            cudaEventElapsedTime(&elapsedTime4, start, stop);
            if (i = checkResult(cpu_out, gpu_out, ARRAY_SIZE))
                printf("\n\tCompute error at i = %d\n\tcpu_out[i] = %10d, gpu_out[i] = %10d\n", i - 1, cpu_out[i - 1], gpu_out[i - 1]);
            else
                printf("\n\tGPU4 Compute correctly!\n");

            // Prefix sum of a long vector that may need multiple passes
            // TODO (one possible approach is sketched after this listing)

            printf("\n\tSpending time:\n\tCPU:\t%10ld ms\n\tGPU1:\t%f ms\n\tGPU2:\t%f ms\n\tGPU3:\t%f ms\n\tGPU4:\t%f ms\n",
                time, elapsedTime1, elapsedTime2, elapsedTime3, elapsedTime4);

            cudaFree(d_in);
            cudaFree(d_out);
            cudaFree(d_temp);
            cudaStreamDestroy(stream);
            cudaEventDestroy(start);
            cudaEventDestroy(stop);
            getchar();
            return 0;
        }
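
    The two-pass path above, and the TODO for vectors that need more than two passes, both follow the same pattern: scan each WIDTH-element segment, scan the segment totals, then add the totals back. A host-side sketch of the multi-pass version is below; it is not part of the original program, the name scanRecursive is made up here, it reuses the kernels above unchanged, and it inherits select's single-block launch, so the number of segments per level must stay at or below 1024.

        // Hypothetical sketch of the multi-pass scan left as TODO above: scans d_data (length n) in place.
        void scanRecursive(format *d_data, const int n, cudaStream_t stream)
        {
            int blocks = CEIL(n, WIDTH);                  // number of WIDTH-element segments
            prefixSumGPU3 <<< blocks, WIDTH / 2, sizeof(format) * WIDTH, stream >>> (d_data, d_data, n);
            if (blocks == 1)
                return;                                   // a single segment is already a complete scan

            format *d_sums;
            checkCudaError(cudaMalloc((void **)&d_sums, sizeof(format) * blocks));
            // After select, d_sums[j] holds the total of segment j-1 (and d_sums[0] = 0); requires blocks <= 1024
            select <<< 1, blocks, 0, stream >>> (d_data, d_sums, WIDTH);
            // After the recursive call, d_sums[j] is the sum of every segment before segment j
            scanRecursive(d_sums, blocks, stream);
            add <<< blocks, WIDTH, 0, stream >>> (d_data, d_sums, n);   // add each segment's offset back
            cudaFree(d_sums);
            return;
        }

    Used in main, this would amount to copying d_in into d_out and then calling scanRecursive(d_out, ARRAY_SIZE, stream) in place of the four hand-written launches.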

    The results are shown in the figure below. The first method has a non-reproducible bug that only starts to appear once the input array is larger than 512 elements; the cause is unknown. The other methods all produce correct results. For short vectors the third method (the improved gather-distribute tree) is the most efficient; when the vector is much longer than the self-imposed threshold of 1024 (the maximum size handled by a single thread block), the fourth scheme is very efficient. Because of warm-up effects, the fourth scheme may even scan the entire vector faster than the third scheme scans the first 1024 elements.
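
    A plausible cause for the GPU1 failures (the post itself leaves the reason open): in the update share_in[threadIdx.x] += share_in[threadIdx.x - jump], the read of the neighbour's value and the neighbour's own write in the same iteration are not separated by a barrier, so the outcome depends on how the warps happen to be scheduled. A common fix is double buffering; the sketch below is not from the original post, the kernel name prefixSumGPU1_db is invented, and it needs twice the dynamic shared memory at launch.

        // Hypothetical double-buffered variant of prefixSumGPU1: reads always come from src and
        // writes always go to dst, so no step can read a value that the same step has overwritten.
        __global__ void prefixSumGPU1_db(const format *in, format *out, const int array_size)
        {
            extern __shared__ format buffer[];            // 2 * blockDim.x elements
            format *src = buffer, *dst = buffer + blockDim.x;
            int id = blockIdx.x * blockDim.x + threadIdx.x;
            src[threadIdx.x] = (id < array_size) ? in[id] : 0;
            __syncthreads();

            for (int jump = 1; jump < blockDim.x; jump *= 2)
            {
                dst[threadIdx.x] = src[threadIdx.x]
                                 + ((threadIdx.x >= jump) ? src[threadIdx.x - jump] : 0);
                __syncthreads();
                format *swap = src; src = dst; dst = swap;  // ping-pong the buffers
            }
            if (id < array_size)
                out[id] = src[threadIdx.x];                 // src points at the last buffer written
            return;
        }

    With this variant the GPU1 test would be launched as prefixSumGPU1_db <<< 1, simple_task, sizeof(format) * simple_task * 2 >>> (d_in, d_out, MIN(ARRAY_SIZE, SIMPLE_TASK)).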

  • Original post: https://www.cnblogs.com/cuancuancuanhao/p/7669637.html