zoukankan      html  css  js  c++  java
  • OpenCL 双调排序 GPU 版

    ▶ 参考书中的代码,写了

    ● 代码,核函数文件包含三中算法

     1 // kernel.cl
     2 __kernel void bitonicSort01(__global uint *data, const uint stage, const uint subStage, const uint direction)// 基本的元素对调整
     3 {
     4     const uint gid = get_global_id(0);
     5     const uint isAscend = ((gid / (1 << stage)) % 2) ? 1 - direction : direction;   // 判断本工作项的元素对应该排成升序还是降序
     6     const uint distance = 1 << (stage - subStage);                                  // 元素对下标差    
     7     const uint lid = (gid / distance) * distance * 2 + gid % distance;              // 寻找元素对的左右元素
     8     const uint rid = lid + distance;
     9     const uint lElement = data[lid], rElement = data[rid];
    10     if (lElement > rElement && isAscend || lElement < rElement && !isAscend)        // 不符合排序要求,交换两元素           
    11         data[lid] = rElement, data[rid] = lElement;
    12 }
    13 
    14 __kernel void bitonicSort02(__global uint *data, const uint stage, const uint subStage, const uint direction, __local uint *localMem)// 使用局部内存调整
    15 {
    16     const uint gid = get_global_id(0), mid = get_local_id(0);                       // 同 bitonicSort01
    17     const uint isAscend = ((gid / (1 << stage)) % 2) ? 1 - direction : direction;
    18     const uint distance = 1 << (stage - subStage);                               
    19     const uint lid = (gid / distance) * distance * 2 + gid % distance;           
    20     const uint rid = lid + distance;
    21 
    22     localMem[mid * 2 + 0] = data[lid];  // 读取 data 的时候读进局部内存,与局部内存相关的下标用的都是 mid 而不是 gid
    23     localMem[mid * 2 + 1] = data[rid];
    24     barrier(CLK_LOCAL_MEM_FENCE);
    25 
    26     if (localMem[mid * 2 + 0] > localMem[mid * 2 + 1] && isAscend || localMem[mid * 2 + 0] < localMem[mid * 2 + 1] && !isAscend)
    27         data[lid] = localMem[mid * 2 + 1], data[rid] = localMem[mid * 2 + 0];
    28 }
    29 
    30 #define STRIDE 4 // aux 中四个元素一组,表示一个工作项的元素对索引和值,依照 main.c 中给定的第 5 参数的大小进行相等的调整
    31                                                                                                                                  
    32 __kernel void bitonicSort03(__global uint *data, const uint stage, const uint subStage, const uint direction, __local uint *localMem, __local uint *aux)// 使用两个局部内存,感觉多此一举?
    33 {
    34     const uint gid = get_global_id(0), mid = get_local_id(0);                       // 同 bitonicSort02
    35     const uint isAscend = ((gid / (1 << stage)) % 2) ? 1 - direction : direction;
    36     const uint distance = 1 << (stage - subStage);
    37     const uint lid = (gid / distance) * distance * 2 + gid % distance;
    38     const uint rid = lid + distance;
    39 
    40     localMem[mid * 2 + 0] = data[lid];
    41     localMem[mid * 2 + 1] = data[rid];
    42     barrier(CLK_LOCAL_MEM_FENCE);
    43 
    44     aux[mid * STRIDE + 0] = lid;                                                    // 开始向aux 中填充    
    45     aux[mid * STRIDE + 2] = rid;
    46     if (localMem[mid * 2 + 0] > localMem[mid * 2 + 1] && isAscend || localMem[mid * 2 + 0] < localMem[mid * 2 + 1] && !isAscend)
    47         aux[mid * STRIDE + 1] = localMem[mid * 2 + 1], aux[mid * STRIDE + 3] = localMem[mid * 2 + 0];
    48     else
    49         aux[mid * STRIDE + 1] = localMem[mid * 2 + 0], aux[mid * STRIDE + 3] = localMem[mid * 2 + 1];
    50     barrier(CLK_LOCAL_MEM_FENCE);
    51     
    52     data[aux[mid * STRIDE + 0]] = aux[mid * STRIDE + 1], data[aux[mid * STRIDE + 2]] = aux[mid * STRIDE + 3];   // 向原数组中填充数据        
    53     /*// 书中的填充方法,一个工作组仅使用一个工作项串行地向原数组中填充,美名曰“无冲突”,实际上花费了 5 倍的时间
    54     if (mid == 0)
    55     {
    56         for (int i = 0; i < get_local_size(0); i++)
    57             data[aux[i * STRIDE + 0]] = aux[i * STRIDE + 1], data[aux[i * STRIDE + 2]] = aux[i * STRIDE + 3];
    58     }
    59     */
    60 }
      1 // main.c
      2 #include <stdio.h>
      3 #include <stdlib.h>
      4 #include <cl.h>
      5 
      6 //#define PRINT_RESULT      // 输出排序前后的数组元素(数据量较大时不用)
      7 #define ASCENDING   1       // 升序
      8 #define DESCENDING  0       // 降序
      9 #define DATA_SIZE   (1<<20) // 数据规模
     10 #define GROUP_SIZE  128     // 工作组大小
     11 
     12 const char *sourceText = "D:/Code/OpenCL/OpenCLProjectTemp/OpenCLProjectTemp/kernel.cl";
     13 const unsigned int sortOrder = ASCENDING; // ASCENDING / DESCENDING
     14 
     15 int readText(const char* kernelPath, char **pcode)// 读取文本文件放入 pcode,返回字符串长度
     16 {
     17     FILE *fp;
     18     int size;
     19     //printf("<readText> File: %s
    ", kernelPath);
     20     fopen_s(&fp, kernelPath, "rb");
     21     if (!fp)
     22     {
     23         printf("Open kernel file failed
    ");
     24         getchar();
     25         exit(-1);
     26     }
     27     if (fseek(fp, 0, SEEK_END) != 0)
     28     {
     29         printf("Seek end of file failed
    ");
     30         getchar();
     31         exit(-1);
     32     }
     33     if ((size = ftell(fp)) < 0)
     34     {
     35         printf("Get file position failed
    ");
     36         getchar();
     37         exit(-1);
     38     }
     39     rewind(fp);
     40     if ((*pcode = (char *)malloc(size + 1)) == NULL)
     41     {
     42         printf("Allocate space failed
    ");
     43         getchar();
     44         exit(-1);
     45     }
     46     fread(*pcode, 1, size, fp);
     47     (*pcode)[size] = '';
     48     fclose(fp);
     49     return size + 1;
     50 }
     51 
     52 int main()
     53 {
     54     int i;
     55     srand(97);
     56     
     57     cl_int status;
     58     cl_uint nPlatform;
     59     clGetPlatformIDs(0, NULL, &nPlatform);
     60     cl_platform_id *listPlatform = (cl_platform_id*)malloc(nPlatform * sizeof(cl_platform_id));
     61     clGetPlatformIDs(nPlatform, listPlatform, NULL);
     62     cl_uint nDevice;
     63     clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_ALL, 0, NULL, &nDevice);
     64     cl_device_id *listDevice = (cl_device_id*)malloc(nDevice * sizeof(cl_device_id));
     65     clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_ALL, nDevice, listDevice, NULL);
     66     cl_context context = clCreateContext(NULL, nDevice, listDevice, NULL, NULL, &status);
     67     cl_command_queue queue = clCreateCommandQueue(context, listDevice[0], CL_QUEUE_PROFILING_ENABLE, &status);
     68     
     69     int *hostA = (cl_int*)malloc(DATA_SIZE * sizeof(cl_int));
     70     for (i = 0; i < DATA_SIZE; hostA[i++] = rand());
     71 #ifdef PRINT_RESULT // 输出排序前的数组
     72     printf("
    ");
     73     for (i = 0; i < DATA_SIZE; i++)
     74     {
     75         printf("%5d,", hostA[i]);
     76         if ((i + 1) % 16 == 0)
     77             printf("
    ");
     78     }
     79     printf("
    ");
     80 #endif
     81     cl_mem deviceA = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, DATA_SIZE * sizeof(cl_int), hostA, &status);
     82 
     83     char *code;
     84     size_t codeLength = readText(sourceText, &code);
     85     cl_program program = clCreateProgramWithSource(context, 1, (const char**)&code, &codeLength, &status);
     86     status = clBuildProgram(program, nDevice, listDevice, NULL, NULL, NULL);
     87     if (status)
     88     {
     89         char info[10000];
     90         clGetProgramBuildInfo(program, listDevice[0], CL_PROGRAM_BUILD_LOG, 10000, info, NULL);
     91         printf("
    %s
    ", info);
     92     }
     93     //cl_kernel kernel = clCreateKernel(program, "bitonicSort01", &status); // 选择使用的核函数
     94     //cl_kernel kernel = clCreateKernel(program, "bitonicSort02", &status);
     95     cl_kernel kernel = clCreateKernel(program, "bitonicSort03", &status);
     96 
     97     size_t globalSize = DATA_SIZE / 2, groupSize = GROUP_SIZE;  
     98     cl_uint stageCount, stage, subStage;
     99     for (i = DATA_SIZE, stageCount = 0; i > 1; stageCount++, i >>= 1);  // 需要的总轮数
    100     clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&deviceA);
    101     clSetKernelArg(kernel, 3, sizeof(cl_uint), (void*)&sortOrder);
    102     clSetKernelArg(kernel, 4, GROUP_SIZE * 2 * sizeof(cl_uint), NULL);  // bitonicSort02 和 bitonicSort03 需要的局部内存参数
    103     clSetKernelArg(kernel, 5, GROUP_SIZE * 4 * sizeof(cl_uint), NULL);  // bitonicSort03 需要的局部内存参数
    104     
    105     cl_event exeEvent;                                                  // 计时用的事件
    106     cl_ulong executionStart, executionEnd, timeCount1, timeCount2;      // 计时器, 精确到 ns
    107     for (stage = timeCount1 = timeCount2 = 0; stage < stageCount; stage++)  // 当前轮数
    108     {
    109         clSetKernelArg(kernel, 1, sizeof(cl_uint), (void*)&stage);
    110         for (subStage = 0; subStage < stage + 1; subStage++)                // 当前轮中归并的步骤数
    111         {
    112             clSetKernelArg(kernel, 2, sizeof(cl_uint), (void*)&subStage);
    113             //clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &groupSize, 0, NULL, NULL);   // 不计时的核函数调用
    114             clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &groupSize, 0, NULL, &exeEvent); 
    115             clWaitForEvents(1, &exeEvent);            
    116             clGetEventProfilingInfo(exeEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &executionStart, NULL);
    117             clGetEventProfilingInfo(exeEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &executionEnd, NULL);            
    118             timeCount1 += (executionEnd - executionStart) / 1000000, timeCount2 += (executionEnd - executionStart) % 1000000;// ms 的整数和小数部分分开记录
    119         }
    120     }
    121     printf("Kernel time: %lu.%lu ms
    ", timeCount1 + timeCount2 / 1000000, timeCount2 % 1000000);       // 内核运行总时间
    122     
    123     clEnqueueReadBuffer(queue, deviceA, CL_TRUE, 0, DATA_SIZE * sizeof(cl_int), hostA, 0, NULL, NULL);
    124 #ifdef PRINT_RESULT                     // 输出排序后的结果
    125     printf("
    ");
    126     for (i = 0; i < DATA_SIZE; i++)
    127     {
    128         printf("%5d,", hostA[i]);
    129         if ((i + 1) % 16 == 0)
    130             printf("
    ");
    131     }
    132     printf("
    ");
    133 #else
    134     for (i = 0; i < DATA_SIZE - 1; i++) // 不输出结果,只做检查,有错误的时候输出第一个错误的位置
    135     {
    136         if (sortOrder != (hostA[i + 1] >= hostA[i]))
    137         {
    138             printf("Error at i == %d, hostA[i] == %d, hostA[i+1] == %d", i, hostA[i], hostA[i + 1]);
    139             break;
    140         }
    141     }
    142 #endif
    143 
    144     free(listPlatform);
    145     free(listDevice);
    146     clReleaseContext(context);
    147     clReleaseCommandQueue(queue);
    148     clReleaseProgram(program);
    149     clReleaseKernel(kernel);
    150     clReleaseEvent(exeEvent);
    151     free(hostA);
    152     clReleaseMemObject(deviceA);    
    153     getchar();
    154     return 0;
    155 }

    ● 输出结果,统一采用 (1<<20) 的数据规模,尝试不同的工作组大小。使用局部内存并没有明显提升,尤其是使用两个局部内存的方法,严重拖后腿。

    bitonicSort01
    // groupSize = 64
    kernels time: 7.941984 ms
    // groupSize = 128
    kernels time: 7.947360 ms
    // groupSize = 256
    kernels time: 7.935616 ms
    // groupSize = 512
    kernels time: 7.919776 ms
    // groupSize = 1024
    kernels time: 7.970080 ms
    
    bitonicSort02
    // groupSize = 64
    kernels time: 7.980160 ms
    // groupSize = 128
    kernels time: 7.957984 ms
    // groupSize = 256
    kernels time: 7.964032 ms
    // groupSize = 512
    kernels time: 7.959072 ms
    // groupSize = 1024
    kernels time: 8.517120 ms
    
    bitonicSort03
    // groupSize = 64
    kernels time: 9.901120 ms
    // groupSize = 128
    kernels time: 9.936384 ms
    // groupSize = 256
    kernels time: 9.950976 ms
    // groupSize = 512
    kernels time: 10.134176 ms
    // groupSize = 1024
    kernels time: 10.533516 ms
    
    bitonicSort03(书中的串行写入)
    // groupSize = 64
    kernels time: 51.590080 ms
    // groupSize = 128
    kernels time: 31.607904 ms
    // groupSize = 256
    kernels time: 35.344244 ms
    // groupSize = 512
    kernels time: 50.841184 ms
    // groupSize = 1024
    kernels time: 93.284544 ms

    ● 总结

    ■ CPU 版双调排序使用递归,代码比较简洁,也可以使用本篇中的方法家拿其转化为循环迭代。GPU暂时不使用递归,主要精力在于不停地向命令队列中入队不同层次的任务,同一层次内的任务可以并行执行

    ■ 这本书就是垃圾,就 T M D 是 垃 圾!①书里代码引用不全,比如核函数里上来就使用变量 threadId,声明定义都没有;代码混乱,比如在 bitonicSort01 和 bitonicSort02 里 threadId 的含义是不同的,尤其书中 bitonicSort02 里同时用该变量表示 get_global_id(0) 和 get_local_id(0),一想就知道代码执行结果肯定是错的;③ 机翻,好不容易写到倒数第二章了,翻译都懒得弄了,这里抄一段原文看谁看得懂:“我们基本上执行的操作是引入名为 sharedMem 的变量,并且使用加载这些值的简单策略:每个线程将在共享内存数据存储器中存储两个值(邻近值),在随后的代码部分中读出此线程,并且用于引用全局内存的所有读取操作目前在本地 / 共享内存中执行”。④ 残缺不堪,感觉 bitonicSort03 的优化应该是有一定道理的,但是在这里完全没有看出该优化的目的和优势所在,书中也没有说清楚。

  • 相关阅读:
    MoveWindow() SetWindowPos()的区别与联系
    SEO搜索引擎优化基础
    Windows核心编程小结1
    STL学习笔记8 -- 函数对象
    Java关于反射
    多线程处理慢sql查询小笔记~
    前端小菜鸡使用Vue+Element笔记(二)
    前端小菜鸡使用Vue+Element笔记(一)
    Hive/hbase/sqoop的基本使用教程~
    Hive/Hbase/Sqoop的安装教程
  • 原文地址:https://www.cnblogs.com/cuancuancuanhao/p/9128697.html
Copyright © 2011-2022 走看看