▶ 参考书中的代码,写了
● 代码,核函数文件包含三种算法
// kernel.cl
// Three variants of a single bitonic-sort compare-exchange step. In every
// variant one work-item handles exactly one pair of elements of `data`:
//   stage     - (1 << stage) consecutive work-items form a block that shares one sort direction
//   subStage  - the two elements of a pair are (1 << (stage - subStage)) apart
//   direction - direction used by even-numbered blocks (non-zero = ascending);
//               odd-numbered blocks use 1 - direction

// Variant 1: compare and swap the pair directly in global memory.
__kernel void bitonicSort01(__global uint *data, const uint stage, const uint subStage, const uint direction)
{
    const uint item = get_global_id(0);

    // Decide whether this work-item's pair must end up ascending or descending.
    const uint blockWidth = 1 << stage;
    uint ascending = direction;
    if ((item / blockWidth) % 2)
        ascending = 1 - direction;

    // Locate the left and right element of the pair; they are `gap` apart.
    const uint gap = 1 << (stage - subStage);
    const uint left = item % gap + (item / gap) * gap * 2;
    const uint right = left + gap;

    const uint leftValue = data[left];
    const uint rightValue = data[right];

    // Swap only when the pair violates the required order; equal values are left alone.
    const int misordered = ascending ? (leftValue > rightValue) : (leftValue < rightValue);
    if (misordered)
    {
        data[left] = rightValue;
        data[right] = leftValue;
    }
}

// Variant 2: stage the pair in local memory before comparing.
// localMem is indexed with the local id, so it needs two uints per work-item.
__kernel void bitonicSort02(__global uint *data, const uint stage, const uint subStage, const uint direction, __local uint *localMem)
{
    // Direction and pair lookup are the same as in bitonicSort01.
    const uint item = get_global_id(0);
    const uint localItem = get_local_id(0);

    const uint blockWidth = 1 << stage;
    uint ascending = direction;
    if ((item / blockWidth) % 2)
        ascending = 1 - direction;

    const uint gap = 1 << (stage - subStage);
    const uint left = item % gap + (item / gap) * gap * 2;
    const uint right = left + gap;

    // Copy the pair from global into local memory. Everything that touches
    // local memory is indexed by the local id, never by the global id.
    __local uint *pair = localMem + localItem * 2;
    pair[0] = data[left];
    pair[1] = data[right];
    barrier(CLK_LOCAL_MEM_FENCE);

    const int misordered = ascending ? (pair[0] > pair[1]) : (pair[0] < pair[1]);
    if (misordered)
    {
        data[left] = pair[1];
        data[right] = pair[0];
    }
}

// Each work-item owns STRIDE consecutive entries of aux: the indices and the
// values of its element pair, laid out as
//   [0] left index, [1] value for left, [2] right index, [3] value for right.
// Original author's note: adjust this together with the size of argument 5
// given in main.c.
#define STRIDE 4

// Variant 3: uses two local buffers (the original author wonders whether the
// second one is superfluous). The ordered pair is recorded in aux and then
// written back to global memory unconditionally.
__kernel void bitonicSort03(__global uint *data, const uint stage, const uint subStage, const uint direction, __local uint *localMem, __local uint *aux)
{
    // Direction and pair lookup are the same as in bitonicSort02.
    const uint item = get_global_id(0);
    const uint localItem = get_local_id(0);

    const uint blockWidth = 1 << stage;
    uint ascending = direction;
    if ((item / blockWidth) % 2)
        ascending = 1 - direction;

    const uint gap = 1 << (stage - subStage);
    const uint left = item % gap + (item / gap) * gap * 2;
    const uint right = left + gap;

    __local uint *pair = localMem + localItem * 2;
    pair[0] = data[left];
    pair[1] = data[right];
    barrier(CLK_LOCAL_MEM_FENCE);

    // Fill this work-item's record in aux: destination indices first, then
    // the two values in the order they must appear after this step.
    __local uint *record = aux + localItem * STRIDE;
    record[0] = left;
    record[2] = right;
    const int misordered = ascending ? (pair[0] > pair[1]) : (pair[0] < pair[1]);
    if (misordered)
    {
        record[1] = pair[1];
        record[3] = pair[0];
    }
    else
    {
        record[1] = pair[0];
        record[3] = pair[1];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Every work-item writes its own record back to the global array.
    data[record[0]] = record[1];
    data[record[2]] = record[3];

    // The book instead lets a single work-item per work-group write all
    // records back serially, billed as "conflict-free"; the original author
    // measured that approach to take about 5x as long:
    //
    //   if (mid == 0)
    //   {
    //       for (int i = 0; i < get_local_size(0); i++)
    //           data[aux[i * STRIDE + 0]] = aux[i * STRIDE + 1], data[aux[i * STRIDE + 2]] = aux[i * STRIDE + 3];
    //   }
}
1 // main.c 2 #include <stdio.h> 3 #include <stdlib.h> 4 #include <cl.h> 5 6 //#define PRINT_RESULT // 输出排序前后的数组元素(数据量较大时不用) 7 #define ASCENDING 1 // 升序 8 #define DESCENDING 0 // 降序 9 #define DATA_SIZE (1<<20) // 数据规模 10 #define GROUP_SIZE 128 // 工作组大小 11 12 const char *sourceText = "D:/Code/OpenCL/OpenCLProjectTemp/OpenCLProjectTemp/kernel.cl"; 13 const unsigned int sortOrder = ASCENDING; // ASCENDING / DESCENDING 14 15 int readText(const char* kernelPath, char **pcode)// 读取文本文件放入 pcode,返回字符串长度 16 { 17 FILE *fp; 18 int size; 19 //printf("<readText> File: %s ", kernelPath); 20 fopen_s(&fp, kernelPath, "rb"); 21 if (!fp) 22 { 23 printf("Open kernel file failed "); 24 getchar(); 25 exit(-1); 26 } 27 if (fseek(fp, 0, SEEK_END) != 0) 28 { 29 printf("Seek end of file failed "); 30 getchar(); 31 exit(-1); 32 } 33 if ((size = ftell(fp)) < 0) 34 { 35 printf("Get file position failed "); 36 getchar(); 37 exit(-1); 38 } 39 rewind(fp); 40 if ((*pcode = (char *)malloc(size + 1)) == NULL) 41 { 42 printf("Allocate space failed "); 43 getchar(); 44 exit(-1); 45 } 46 fread(*pcode, 1, size, fp); 47 (*pcode)[size] = '