▶ 照着书上的代码,写了几个一步归约的计算,只计算一步,将原数组归约到不超过 1024 个工作项
● 代码
1 // kernel.cl 2 __kernel void reduce01(__global uint* input, __global uint* output, __local uint* sdata) 3 { 4 const unsigned int tid = get_local_id(0), blockSize = get_local_size(0); 5 unsigned int s; 6 7 sdata[tid] = input[get_global_id(0)]; 8 barrier(CLK_LOCAL_MEM_FENCE); 9 10 // 三种写法,用一种就够 11 // 1、模法,问题: % 运算很慢 12 for (s = 1; s < blockSize; s <<= 1) 13 { 14 if (tid % (2 * s) == 0) 15 sdata[tid] += sdata[tid + s]; 16 barrier(CLK_LOCAL_MEM_FENCE); 17 } 18 // 2、间隔缩短法,问题:首次迭代只用一半的工作项,之后每次迭代活跃的工作项持续减少 19 for (s = blockSize / 2; s > 0; s >>= 1) 20 { 21 if (tid < s) 22 sdata[tid] += sdata[tid + s]; 23 barrier(CLK_LOCAL_MEM_FENCE); 24 } 25 // 3、间隔增长法,问题:当间隔等于某几个数的时候会产生 26 unsigned int index; 27 for (s = 1; s < blockSize; s <<= 1) 28 { 29 if ((index = 2 * s * tid) < blockSize) 30 sdata[index] += sdata[index + s]; 31 barrier(CLK_LOCAL_MEM_FENCE); 32 } 33 34 if (tid == 0) 35 output[get_group_id(0)] = sdata[0]; 36 } 37 38 __kernel void reduce02(__global uint* input, __global uint* output, __local uint* sdata) 39 { 40 const unsigned int tid = get_local_id(0), bid = get_group_id(0), blockSize = get_local_size(0); 41 const unsigned int index = bid * (blockSize * 2) + tid; 42 unsigned int s; 43 44 sdata[tid] = input[index] + input[index + blockSize];// 读入局部内存时就进行一次归约 45 barrier(CLK_LOCAL_MEM_FENCE); 46 47 // 两种写法,用一种就够 48 // 1、不手动展开循环,仍然有工作项浪费的问题 49 for (s = blockSize / 2; s > 0; s >>= 1) 50 { 51 if (tid < s) 52 sdata[tid] += sdata[tid + s]; 53 barrier(CLK_LOCAL_MEM_FENCE); 54 } 55 // 2、手动展开最后的循环 56 for (s = blockSize / 2; s > 32; s >>= 1)// BUG:如果从 64 开始手工归约,在这一行有且仅有一个工作项会算出 640 = 512 + 128 来,其他行却没问题 57 { 58 if (tid < s) 59 sdata[tid] += sdata[tid + s]; 60 barrier(CLK_LOCAL_MEM_FENCE); 61 } 62 if (tid < 32) // 手动展开最后的归约,注意同步,书中源代码中没有同步,计算结果是错的 63 { 64 if (blockSize >= 64) 65 sdata[tid] += sdata[tid + 32]; 66 barrier(CLK_LOCAL_MEM_FENCE); 67 if (blockSize >= 32) 68 sdata[tid] += sdata[tid + 16]; 69 barrier(CLK_LOCAL_MEM_FENCE); 70 if (blockSize >= 16) 71 sdata[tid] += sdata[tid + 8]; 72 barrier(CLK_LOCAL_MEM_FENCE); 73 if (blockSize >= 8) 74 sdata[tid] += sdata[tid + 4]; 75 barrier(CLK_LOCAL_MEM_FENCE); 76 if (blockSize >= 4) 77 sdata[tid] += sdata[tid + 2]; 78 barrier(CLK_LOCAL_MEM_FENCE); 79 if (blockSize >= 2) 80 sdata[tid] += sdata[tid + 1]; 81 barrier(CLK_LOCAL_MEM_FENCE); 82 } 83 84 if (tid == 0) 85 output[bid] = sdata[0]; 86 }
1 // main.c 2 #include <stdio.h> 3 #include <stdlib.h> 4 #include <cl.h> 5 6 #define BLOCK_SIZE 256 // 工作组内最大工作项数为 1024 7 #define DATA_SIZE (BLOCK_SIZE * 1024) // 一维最大工作组数为1024 8 9 const char *sourceText = "D:/Code/OpenCL/OpenCLProjectTemp/OpenCLProjectTemp/kernel.cl"; 10 11 int readText(const char* kernelPath, char **pcode)// 读取文本文件放入 pcode,返回字符串长度 12 { 13 FILE *fp; 14 int size; 15 //printf("<readText> File: %s ", kernelPath); 16 fopen_s(&fp, kernelPath, "rb"); 17 if (!fp) 18 { 19 printf("Open kernel file failed "); 20 getchar(); 21 exit(-1); 22 } 23 if (fseek(fp, 0, SEEK_END) != 0) 24 { 25 printf("Seek end of file failed "); 26 getchar(); 27 exit(-1); 28 } 29 if ((size = ftell(fp)) < 0) 30 { 31 printf("Get file position failed "); 32 getchar(); 33 exit(-1); 34 } 35 rewind(fp); 36 if ((*pcode = (char *)malloc(size + 1)) == NULL) 37 { 38 printf("Allocate space failed "); 39 getchar(); 40 exit(-1); 41 } 42 fread(*pcode, 1, size, fp); 43 (*pcode)[size] = '