zoukankan      html  css  js  c++  java
  • OpenCL 归约 1

    ▶ 照着书上的代码,写了几个一步归约的计算,只计算一步,将原数组归约到不超过 1024 个工作项

     ● 代码

     1 // kernel.cl
     2 __kernel void reduce01(__global uint* input, __global uint* output, __local uint* sdata)
     3 {
     4     const unsigned int tid = get_local_id(0), blockSize = get_local_size(0);
     5     unsigned int s;
     6 
     7     sdata[tid] = input[get_global_id(0)];
     8     barrier(CLK_LOCAL_MEM_FENCE);
     9 
    10     // 三种写法,用一种就够
    11     // 1、模法,问题: % 运算很慢
    12     for (s = 1; s < blockSize; s <<= 1)
    13     {
    14         if (tid % (2 * s) == 0)                 
    15             sdata[tid] += sdata[tid + s];
    16         barrier(CLK_LOCAL_MEM_FENCE);
    17     }
    18     // 2、间隔缩短法,问题:首次迭代只用一半的工作项,之后每次迭代活跃的工作项持续减少
    19     for (s = blockSize / 2; s > 0; s >>= 1)
    20     {
    21         if (tid < s) 
    22             sdata[tid] += sdata[tid + s];
    23         barrier(CLK_LOCAL_MEM_FENCE);
    24     }
    25     // 3、间隔增长法,问题:当间隔等于某几个数的时候会产生
    26     unsigned int index;
    27     for (s = 1; s < blockSize; s <<= 1)
    28     {
    29         if ((index = 2 * s * tid) < blockSize)
    30             sdata[index] += sdata[index + s];
    31         barrier(CLK_LOCAL_MEM_FENCE);
    32     }
    33 
    34     if (tid == 0)
    35         output[get_group_id(0)] = sdata[0];
    36 }
    37 
    38 __kernel void reduce02(__global uint* input, __global uint* output, __local uint* sdata)
    39 {
    40     const unsigned int tid = get_local_id(0), bid = get_group_id(0), blockSize = get_local_size(0);
    41     const unsigned int index = bid * (blockSize * 2) + tid;
    42     unsigned int s;
    43     
    44     sdata[tid] = input[index] + input[index + blockSize];// 读入局部内存时就进行一次归约
    45     barrier(CLK_LOCAL_MEM_FENCE);
    46 
    47     // 两种写法,用一种就够
    48     // 1、不手动展开循环,仍然有工作项浪费的问题
    49     for (s = blockSize / 2; s > 0; s >>= 1)
    50     {
    51         if (tid < s)
    52             sdata[tid] += sdata[tid + s];
    53         barrier(CLK_LOCAL_MEM_FENCE);
    54     }
    55     // 2、手动展开最后的循环
    56     for (s = blockSize / 2; s > 32; s >>= 1)// BUG:如果从 64 开始手工归约,在这一行有且仅有一个工作项会算出 640 = 512 + 128 来,其他行却没问题
    57     {
    58         if (tid < s)
    59             sdata[tid] += sdata[tid + s];
    60         barrier(CLK_LOCAL_MEM_FENCE);
    61     }
    62     if (tid < 32)                           // 手动展开最后的归约,注意同步,书中源代码中没有同步,计算结果是错的
    63     {
    64         if (blockSize >= 64)
    65             sdata[tid] += sdata[tid + 32];
    66         barrier(CLK_LOCAL_MEM_FENCE);
    67         if (blockSize >= 32)
    68             sdata[tid] += sdata[tid + 16];
    69         barrier(CLK_LOCAL_MEM_FENCE);
    70         if (blockSize >= 16)
    71             sdata[tid] += sdata[tid + 8];
    72         barrier(CLK_LOCAL_MEM_FENCE);
    73         if (blockSize >= 8)
    74             sdata[tid] += sdata[tid + 4];
    75         barrier(CLK_LOCAL_MEM_FENCE);
    76         if (blockSize >= 4)
    77             sdata[tid] += sdata[tid + 2];
    78         barrier(CLK_LOCAL_MEM_FENCE);
    79         if (blockSize >= 2)
    80             sdata[tid] += sdata[tid + 1];
    81         barrier(CLK_LOCAL_MEM_FENCE);
    82     }
    83 
    84     if (tid == 0)
    85         output[bid] = sdata[0];
    86 }
      1 // main.c
      2 #include <stdio.h>
      3 #include <stdlib.h>
      4 #include <cl.h>
      5 
      6 #define BLOCK_SIZE  256                 // 工作组内最大工作项数为 1024
      7 #define DATA_SIZE   (BLOCK_SIZE * 1024) // 一维最大工作组数为1024
      8 
      9 const char *sourceText = "D:/Code/OpenCL/OpenCLProjectTemp/OpenCLProjectTemp/kernel.cl";
     10 
     11 int readText(const char* kernelPath, char **pcode)// 读取文本文件放入 pcode,返回字符串长度
     12 {
     13     FILE *fp;
     14     int size;
     15     //printf("<readText> File: %s
    ", kernelPath);
     16     fopen_s(&fp, kernelPath, "rb");
     17     if (!fp)
     18     {
     19         printf("Open kernel file failed
    ");
     20         getchar();
     21         exit(-1);
     22     }
     23     if (fseek(fp, 0, SEEK_END) != 0)
     24     {
     25         printf("Seek end of file failed
    ");
     26         getchar();
     27         exit(-1);
     28     }
     29     if ((size = ftell(fp)) < 0)
     30     {
     31         printf("Get file position failed
    ");
     32         getchar();
     33         exit(-1);
     34     }
     35     rewind(fp);
     36     if ((*pcode = (char *)malloc(size + 1)) == NULL)
     37     {
     38         printf("Allocate space failed
    ");
     39         getchar();
     40         exit(-1);
     41     }
     42     fread(*pcode, 1, size, fp);
     43     (*pcode)[size] = '';
     44     fclose(fp);
     45     return size + 1;
     46 }
     47 
     48 int main()
     49 {    
     50     cl_int status;
     51     cl_uint nPlatform;
     52     clGetPlatformIDs(0, NULL, &nPlatform);
     53     cl_platform_id *listPlatform = (cl_platform_id*)malloc(nPlatform * sizeof(cl_platform_id));
     54     clGetPlatformIDs(nPlatform, listPlatform, NULL);
     55     cl_uint nDevice;
     56     clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_ALL, 0, NULL, &nDevice);
     57     cl_device_id *listDevice = (cl_device_id*)malloc(nDevice * sizeof(cl_device_id));
     58     clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_ALL, nDevice, listDevice, NULL);
     59     cl_context context = clCreateContext(NULL, nDevice, listDevice, NULL, NULL, &status);
     60     cl_command_queue queue = clCreateCommandQueue(context, listDevice[0], CL_QUEUE_PROFILING_ENABLE, &status);
     61 
     62     //const unsigned int nGroup = DATA_SIZE / BLOCK_SIZE;     // reduce01 使用
     63     const unsigned int nGroup = DATA_SIZE / BLOCK_SIZE / 2; // reduce02 使用
     64     int *hostA = (cl_int*)malloc(sizeof(cl_int) * DATA_SIZE);
     65     int *hostB = (cl_int*)malloc(sizeof(cl_int) * nGroup);
     66     int i;
     67     unsigned long refSum;
     68     srand(97);
     69     for (i = 0, refSum = 0L; i < DATA_SIZE; refSum += (hostA[i++] = 1));// rand()));
     70     memset(hostB, 0, sizeof(int) * nGroup);
     71     cl_mem deviceA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_int) * DATA_SIZE, hostA, &status);
     72     cl_mem deviceB = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_int) * nGroup, NULL, &status);
     73        
     74     char *code;
     75     size_t codeLength = readText(sourceText, &code);
     76     cl_program program = clCreateProgramWithSource(context, 1, (const char**)&code, &codeLength, &status);
     77     status = clBuildProgram(program, nDevice, listDevice, NULL, NULL, NULL);
     78     if (status)
     79     {
     80         char info[10000];
     81         clGetProgramBuildInfo(program, listDevice[0], CL_PROGRAM_BUILD_LOG, 10000, info, NULL);
     82         printf("
    %s
    ", info);
     83     }
     84     //cl_kernel kernel = clCreateKernel(program, "reduce01", &status);
     85     cl_kernel kernel = clCreateKernel(program, "reduce02", &status);   
     86     
     87     clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&deviceA);
     88     clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&deviceB);
     89     clSetKernelArg(kernel, 2, BLOCK_SIZE * sizeof(cl_int), NULL);
     90 
     91     size_t globalSize = DATA_SIZE, localSize = BLOCK_SIZE;
     92     cl_event ev;
     93     //cl_ulong startTime, endTime;
     94     status = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, &ev);
     95     clFinish(queue);
     96     //clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, NULL);  // 不启用计时,因为一趟归约时间太短
     97     //clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &endTime, NULL);
     98     //printf("Time:%lu.%lu
    ", (endTime - startTime) / 1000000000, (endTime - startTime) % 1000000000);
     99     
    100     clEnqueueReadBuffer(queue, deviceB, CL_TRUE, 0, sizeof(cl_int) * nGroup, hostB, 0, NULL, NULL);
    101     for (i = 0; i < nGroup; refSum -= hostB[i++]);
    102     printf("Result %s.
    ", (refSum == 0) ? "correct" : "error");
    103 
    104     free(hostA);
    105     free(hostB);
    106     free(code);
    107     free(listPlatform);
    108     free(listDevice);
    109     clReleaseContext(context);
    110     clReleaseCommandQueue(queue);
    111     clReleaseProgram(program);
    112     clReleaseKernel(kernel);
    113     clReleaseEvent(ev);
    114     clReleaseMemObject(deviceA);
    115     clReleaseMemObject(deviceB);
    116     getchar();
    117     return 0;
    118 }

    ● 输出结果

    Result correct.
  • 相关阅读:
    OpenStack IceHouse版cinder模块新添加功能
    最小代价生成树
    HDU 3065 病毒侵袭持续中(AC自己主动机)
    POJ--2284--That Nice Euler Circuit【平面图欧拉公式】
    java工具类(四)之实现日期随意跳转
    Metasploit学习笔记之——情报搜集
    POJ 2378 Tree Cutting 子树统计
    cocos2d-x 3.0 touch事件官方解释
    html_entity_decode() 将 HTML 实体转成字符原型
    微信公众平台开发(81) 2014新年微信贺卡
  • 原文地址:https://www.cnblogs.com/cuancuancuanhao/p/9148224.html
Copyright © 2011-2022 走看看