  • Reduction sum of an array (arbitrary length)

    ▶ Two methods are used to compute the reduction sum of an array of arbitrary length. The first shrinks the original array into a shared-memory array of fixed length and then sums it with a binary (tree) reduction; fixed lengths of 2, 4, 8, …, 1024 are tested here. The second splits the original array into segments, reduces each segment in its own shared memory, gathers the partial results, and then splits and reduces again until the length drops to 1, at which point the result is written out. The shared-memory reduction step that both methods rely on is sketched below.
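
    The core step shared by all three kernels in the listing is a shared-memory tree reduction: each thread loads one element, and in every pass the lower half of the active threads folds the upper half onto itself until a single sum is left. Below is a minimal standalone sketch of just that step; it is not part of the original code, and the kernel name reduceBlock, the fixed element count of 256, and the all-ones test data are illustrative choices only.

      #include <stdio.h>
      #include "cuda_runtime.h"
      #include "device_launch_parameters.h"

      // Single-block tree reduction; the element count equals blockDim.x and must be a power of two (at most 1024).
      __global__ void reduceBlock(const int *in, int *out)
      {
          extern __shared__ int sdata[];
          sdata[threadIdx.x] = in[threadIdx.x];
          __syncthreads();

          for (int jump = blockDim.x / 2; jump > 0; jump /= 2)// halve the active range each pass: 256 -> 128 -> ... -> 1
          {
              if (threadIdx.x < jump)
                  sdata[threadIdx.x] += sdata[threadIdx.x + jump];
              __syncthreads();
          }
          if (threadIdx.x == 0)
              *out = sdata[0];
      }

      int main()
      {
          const int n = 256;
          int h_in[n], h_out, *d_in, *d_out;
          for (int i = 0; i < n; i++)
              h_in[i] = 1;                               // expected sum: 256
          cudaMalloc((void **)&d_in, sizeof(int) * n);
          cudaMalloc((void **)&d_out, sizeof(int));
          cudaMemcpy(d_in, h_in, sizeof(int) * n, cudaMemcpyHostToDevice);
          reduceBlock <<<1, n, sizeof(int) * n>>> (d_in, d_out);
          cudaMemcpy(&h_out, d_out, sizeof(int), cudaMemcpyDeviceToHost);
          printf("sum = %d\n", h_out);
          cudaFree(d_in);
          cudaFree(d_out);
          return 0;
      }

    For the sizes used in the full listing (ARRAY_SIZE = 1024 * 7 + 119 = 7287 and WIDTH = 512), the second method needs CEIL(log(7287), log(512)) = 2 such rounds: the first reduces 7287 elements to CEIL(7287, 512) = 15 partial sums, and the second reduces those 15 to the final value.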

    ● Code

      #include <stdio.h>
      #include <stdlib.h>
      #include <windows.h>
      #include <time.h>
      #include <math.h>
      #include "cuda_runtime.h"
      #include "device_launch_parameters.h"

      #define ARRAY_SIZE      (1024 * 7 + 119)
      #define WIDTH           512
      #define SEED            1   //(unsigned int)clock()
      #define CEIL(x,y)       (int)(( x - 1 ) /  y + 1)
      #define MIN(x,y)        ((x)<(y)?(x):(y))
      //#define RESIZE(x)       MIN(WIDTH, 1 << (int)ceil(log(x) / log(2)))

      typedef int format;    // int or float

      void checkCudaError(cudaError input)
      {
          if (input != cudaSuccess)
          {
              printf("\n\tfind a cudaError!");
              exit(1);
          }
          return;
      }

      // Reduction sum for any number of elements not exceeding 1024. Launch with a shared-memory array the same size as the input array.
      __global__ void add_1(const format *in, format *out)
      {
          extern __shared__ format sdata[];
          sdata[threadIdx.x] = in[threadIdx.x];
          __syncthreads();

          for (int s = blockDim.x; s > 1; s = (s - 1) / 2 + 1)// stop once a single element remains
          {
              sdata[threadIdx.x] += (threadIdx.x >= s / 2) ? 0 : sdata[threadIdx.x + (s - 1) / 2 + 1];// handles an odd number of addends: the second half of the threads simply add 0
              __syncthreads();
          }

          if (threadIdx.x == 0)
              *out = sdata[0];
          return;
      }

      // Shrinks the original array down to blockDim.x entries, then performs a binary reduction. Launch with a shared-memory array the same size as the thread count.
      __global__ void add_2(const format *in, format *out, int size)// size is the length of the original array and may be far larger than blockDim.x
      {
          extern __shared__ format sdata[];
          sdata[threadIdx.x] = (threadIdx.x < size) ? in[threadIdx.x] : 0;

          // fold in the tail: accumulate the elements beyond blockDim.x (the other two reduction kernels do not need this step)
          for (int id = threadIdx.x + blockDim.x; id < size; id += blockDim.x)
              sdata[threadIdx.x] += in[id];
          __syncthreads();

          for (int jump = blockDim.x / 2; jump > 0; jump /= 2)
          {
              sdata[threadIdx.x] += (threadIdx.x >= jump) ? 0 : sdata[threadIdx.x + jump];
              __syncthreads();
          }

          if (threadIdx.x == 0)
              *out = sdata[0];
          return;
      }

      // Reduction that requires a power-of-two number of threads per block. Launch with a shared-memory array of blockDim.x elements.
      __global__ void add_simple(const format *in, format *out, int size)
      {
          extern __shared__ format sdata[];
          sdata[threadIdx.x] = (blockIdx.x * blockDim.x + threadIdx.x < size) ? in[blockIdx.x * blockDim.x + threadIdx.x] : 0;
          __syncthreads();

          for (int jump = blockDim.x / 2; jump > 0; jump /= 2)
          {
              sdata[threadIdx.x] += (threadIdx.x < jump) ? sdata[threadIdx.x + jump] : 0;
              __syncthreads();
          }

          if (threadIdx.x == 0)
              out[blockIdx.x] = sdata[0];
          return;
      }

      int main()
      {
          int i, j, leftLength;
          format h_in[ARRAY_SIZE], h_out[10 + 1], cpu_sum;
          format *d_in, *d_out, *d_temp1, *d_temp2;
          cudaEvent_t start, stop;
          float elapsedTime[10 + 1];
          cudaEventCreate(&start);
          cudaEventCreate(&stop);

          checkCudaError(cudaMalloc((void **)&d_in, sizeof(format) * ARRAY_SIZE));
          checkCudaError(cudaMalloc((void **)&d_out, sizeof(format)));
          checkCudaError(cudaMalloc((void **)&d_temp1, sizeof(format) * CEIL(ARRAY_SIZE, WIDTH)));
          checkCudaError(cudaMalloc((void **)&d_temp2, sizeof(format) * CEIL(CEIL(ARRAY_SIZE, WIDTH), WIDTH)));

          srand(SEED);
          for (i = 0, cpu_sum = 0; i < ARRAY_SIZE; i++)
          {
              h_in[i] = rand() - RAND_MAX / 2;
              cpu_sum += h_in[i];
          }
          cudaMemcpy(d_in, h_in, sizeof(format) * ARRAY_SIZE, cudaMemcpyHostToDevice);

          // Method 1: a single block of 2, 4, ..., 1024 threads shrinks and reduces the whole array
          for (i = 0; i < 10; i++)
          {
              cudaEventRecord(start, 0);
              add_2 <<<1, 2 << i, sizeof(format) * (2 << i)>>> (d_in, d_out, ARRAY_SIZE);
              cudaMemcpy(&h_out[i], d_out, sizeof(format), cudaMemcpyDeviceToHost);
              cudaDeviceSynchronize();
              cudaEventRecord(stop, 0);
              cudaEventSynchronize(stop);
              cudaEventElapsedTime(&elapsedTime[i], start, stop);
          }
          // Method 2: block-wise reduction, repeated until a single element remains
          cudaEventRecord(start, 0);
          for (i = 0; i < CEIL(log(ARRAY_SIZE), log(WIDTH)); i++)// each pass of the loop is one reduction round; CEIL(log(ARRAY_SIZE), log(WIDTH)) rounds are needed in total
          {
              if (i == 0)// first round: reduce d_in into d_temp1
              {
                  add_simple <<<CEIL(ARRAY_SIZE, WIDTH), WIDTH, sizeof(format) * WIDTH>>> (d_in, d_temp1, ARRAY_SIZE);
                  cudaDeviceSynchronize();
                  leftLength = ARRAY_SIZE / WIDTH + (bool)(ARRAY_SIZE % WIDTH);
              }
              else if (i % 2)// even-numbered rounds after the first: reduce d_temp1 into d_temp2
              {
                  add_simple <<<CEIL(leftLength, WIDTH), WIDTH, sizeof(format) * WIDTH>>> (d_temp1, d_temp2, leftLength);// pass the current element count so out-of-range threads load 0
                  cudaDeviceSynchronize();
                  leftLength = leftLength / WIDTH + (bool)(leftLength % WIDTH);
              }
              else// odd-numbered rounds after the first: reduce d_temp2 into d_temp1
              {
                  add_simple <<<CEIL(leftLength, WIDTH), WIDTH, sizeof(format) * WIDTH>>> (d_temp2, d_temp1, leftLength);// pass the current element count so out-of-range threads load 0
                  cudaDeviceSynchronize();
                  leftLength = leftLength / WIDTH + (bool)(leftLength % WIDTH);
              }
          }
          if (i % 2)// after i rounds the result sits in d_temp1 when i is odd and in d_temp2 when i is even
              cudaMemcpy(&h_out[10], d_temp1, sizeof(format), cudaMemcpyDeviceToHost);
          else
              cudaMemcpy(&h_out[10], d_temp2, sizeof(format), cudaMemcpyDeviceToHost);
          cudaDeviceSynchronize();
          cudaEventRecord(stop, 0);
          cudaEventSynchronize(stop);
          cudaEventElapsedTime(&elapsedTime[10], start, stop);

          printf("\n\tCPU:\t\t\t%f", (float)cpu_sum);
          for (i = 0; i < 10; i++)
              printf("\n\tGPU, %4d threads:\t%d\tTime:\t%6.2f ms", 2 << i, (int)h_out[i], elapsedTime[i]);
          printf("\n\tGPU,   new method:\t%d\tTime:\t%6.2f ms", (int)h_out[10], elapsedTime[10]);

          cudaFree(d_in);
          cudaFree(d_out);
          cudaFree(d_temp1);
          cudaFree(d_temp2);
          cudaEventDestroy(start);
          cudaEventDestroy(stop);

          getchar();
          return 0;
      }
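
    The listing can be compiled with nvcc, for example (the source file name reduction.cu is an assumption):

      nvcc reduction.cu -o reduction

    Since the code includes windows.h and waits on getchar() to keep the console open, it is meant to be built on Windows, where nvcc drives the MSVC host compiler.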

    ● Output: as the number of threads in a block increases, each pass reads global memory with greater effective bandwidth and fewer reduction passes are needed, but combining the partial results in shared memory takes longer, so the total time first falls and then rises; blockDim.x around 256 to 512 is close to optimal. The second method involves more complicated host-side logic and did not achieve a clear improvement.

  • Original post: https://www.cnblogs.com/cuancuancuanhao/p/7660621.html