zoukankan      html  css  js  c++  java
  • OpenCL 设备队列

    ▶ 按书上写的设备队列的代码,需要 OpenCL2.0 的平台和设备,先把代码堆上来

    ● 程序主要功能:用主机上的数组 Ahost 和 Bhost 创建设备缓冲区 Adevice 和 Bdevice,调用核函数 foo 及其子核函数 fooChild 计算 factor * Adevice .* Bdevice,结果写入 Cdevice,最后拷贝回主机数组 Chost 检查结果。

    ● 代码

     1 //deviceQueue.cl
     /*
      * Child kernel: element-wise scaled product over one slice.
      *
      * nElement  number of elements in the slice handed to this launch
      * factor    scalar multiplier
      * A, B      input slices (read-only)
      * C         output slice; C[i] = factor * A[i] * B[i]
      *
      * Work-items whose global id falls outside the slice do nothing.
      */
     __kernel void fooChild(const int nElement, const float factor,
         __global const float *A, __global const float *B, __global float *C)
     {
         const uint idx = get_global_id(0);

         if (idx >= nElement)
             return;

         const float scaledA = factor * A[idx];
         C[idx] = scaledA * B[idx];
     }
     9 
    10 __kernel void foo(const int nElement, const float factor,
    11     __global const float *A, __global const float *B, __global float *C)
    12 {
    13     uint gid = get_global_id(0), gsize = get_global_size(0);
    14     uint childGsize = nElement / gsize, childOffset = gid * childGsize;
    15 
    16     __global const float *Achild = &A[childOffset];
    17     __global const float *Bchild = &B[childOffset];
    18     __global const float *Cchild = &C[childOffset];
    19 
    20     queue_t defQ = get_default_queue();
    21     ndrange_t ndrange = ndrange_1D(childGsize);
    22     void(^fooChildWrapper)(void) = ^{ fooChild(childGsize, factor, Achild, Bchild, Cchild); };
    23     enqueue_kernel(defQ, CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange, saxpyDpChildWrapper);
    24 }
      1 //main.c
      2 #include <stdio.h>  
      3 #include <stdlib.h>  
      4 #include <cl.h>
      5 
      6 const char *sourceCode = "D:/Code/deviceQueue.cl";
      7 
      /*
       * Read a whole file into a newly allocated, NUL-terminated buffer.
       *
       * kernelPath  path of the file to read (opened in binary mode)
       *
       * Returns a malloc'ed string that the caller must free(). On any failure
       * (open, seek, tell, allocation, short read) a message is printed and the
       * process exits with status -1, so the function never returns NULL.
       */
      char* readSource(const char* kernelPath)
      {
          FILE *fp = NULL;
          char *source;
          long int size;
          size_t nRead;

      #if defined(_MSC_VER)
          if (fopen_s(&fp, kernelPath, "rb") != 0)
              fp = NULL;
      #else
          fp = fopen(kernelPath, "rb");
      #endif
          if (!fp)
          {
              printf("Open kernel file failed\n");
              exit(-1);
          }
          if (fseek(fp, 0, SEEK_END) != 0)
          {
              printf("Seek end of file failed\n");
              exit(-1);
          }
          if ((size = ftell(fp)) < 0)
          {
              printf("Get file position failed\n");
              exit(-1);
          }
          rewind(fp);
          if ((source = (char *)malloc((size_t)size + 1)) == NULL)
          {
              printf("Allocate space failed\n");
              exit(-1);
          }
          nRead = fread(source, 1, (size_t)size, fp);
          if (nRead != (size_t)size)
          {
              printf("Read kernel file failed\n");
              exit(-1);
          }
          fclose(fp);
          source[nRead] = '\0';   /* terminate so the buffer can be used as a C string */
          return source;
      }
     41 
     42 int main()
     43 {
     44     const int nElement = 8196, nChildElement = 128, dataSize = nElement * sizeof(float);
     45     float factor = 2.3f;    
     46     char info[1024] = { 0 };
     47     int i;
     48 
     49     // 初始化平台
     50     cl_int status;
     51     cl_platform_id platform;
     52     status = clGetPlatformIDs(1, &platform, NULL);
     53     cl_device_id device;
     54     status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
     55     cl_context_properties contextProp[] = { CL_CONTEXT_PLATFORM,(cl_context_properties)(platform), 0 };
     56     cl_context context = clCreateContext(contextProp, 1, &device, NULL, contextProp, &status);
     57     cl_queue_properties queueProp[3] = { CL_QUEUE_PROPERTIES,CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | CL_QUEUE_ON_DEVICE_DEFAULT, 0 };
     58     cl_command_queue queue = clCreateCommandQueueWithProperties(context, device, queueProp, &status);
     59     cl_event eventProducer, eventConsumer;
     60 
     61     const char* source = readSource(sourceCode);
     62     cl_program program = clCreateProgramWithSource(context, 1, &source, NULL, &status);
     63     status = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
     64     if (status)
     65     {
     66         clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 1024, info, NULL);
     67         printf("Build log:
    %s
    ", info);
     68     }
     69 
     70     cl_kernel foo = clCreateKernel(program, "foo", &status);    
     71     size_t globalSize = nElement / nChildElement, localSize = 1;// 每个父工作项调度 nChildElement 个子工作项
     72 
     73     float *Ahost = (float *)malloc(dataSize);
     74     float *Bhost = (float *)malloc(dataSize);
     75     float *Chost = (float *)malloc(dataSize);
     76     for (i = 0; i < nElement; Ahost[i] = i, Bhost[i] = i + 1, Chost[i] = 0.f, i++);
     77 
     78     cl_mem Adevice, Bdevice, Cdevice;
     79     Adevice = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, dataSize, Ahost, &status);
     80     Bdevice = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, dataSize, Bhost, &status);
     81     Cdevice = clCreateBuffer(context, CL_MEM_WRITE_ONLY, dataSize, NULL, &status);
     82     
     83     clSetKernelArg(foo, 0, sizeof(int), (void*)&nElement);
     84     clSetKernelArg(foo, 1, sizeof(float), (void*)&factor);
     85     clSetKernelArg(foo, 2, sizeof(cl_mem), Adevice);
     86     clSetKernelArg(foo, 3, sizeof(cl_mem), Bdevice);
     87     clSetKernelArg(foo, 4, sizeof(cl_mem), Cdevice);
     88     
     89     clEnqueueNDRangeKernel(queue, foo, 1, NULL, &globalSize, &localSize, 0, NULL, &eventProducer);
     90     clFinish(queue);
     91 
     92     clEnqueueReadBuffer(queue, Cdevice, CL_TRUE, dataSize, dataSize, Chost, 1, &eventConsumer, NULL);
     93     clFinish(queue);
     94 
     95     for (i = 0; i < nElement; i++)
     96     {
     97         if (Chost[i] != factor*i*(i + 1))
     98             break;
     99     }
    100     printf("Output is %s.
    ", (i == nElement) ? "correct" : "incorrect");
    101 
    102     free(Ahost);
    103     free(Bhost);
    104     free(Chost);
    105     clReleaseContext(context);
    106     clReleaseCommandQueue(queue);
    107     clReleaseProgram(program);
    108     clReleaseKernel(foo);
    109     clReleaseMemObject(Adevice);
    110     clReleaseMemObject(Bdevice);
    111     clReleaseMemObject(Cdevice);
    112     getchar();
    113     return 0;
    114 }

    ● 输出结果

    ■ 一直卡在函数 clCreateCommandQueueWithProperties 的调用上,返回值 -6(CL_OUT_OF_HOST_MEMORY),原因不明,Stack Overflow 上有人说换了显卡驱动就好了(https://stackoverflow.com/questions/39864947/opencl-cl-out-of-host-memory-on-clcreatecommandqueuewithproperties-with-minima),还有人说是设备位数的问题(https://stackoverflow.com/questions/45231329/opencl-clcreatecommandqueue-cl-out-of-host-memory-error),但是我更新了显卡驱动,工程改成 32 位(才发现显卡是 32 位的)还是不行。

    ■ 强行忽略上面的问题(clCreateCommandQueueWithProperties 第四参数用 NULL),程序仍然编译失败,返回 -11(CL_BUILD_PROGRAM_FAILURE),原因是不能支持核函数中的 queue_t 和 ndrange_t 数据类型,后面的块语法就更别想了,应该是平台和设备不能完全支持 OpenCL2.0 所致。

    ■ 更新显卡驱动和 AMD OpenCL2.0 驱动,编译失败,报错 -11,报错信息变成了【Error: Inserting openCl Source to binary】和【Error: Compiling CL to IR】

  • 相关阅读:
    线段树(单点更新) HDOJ 2795 Billboard
    线段树(单点更新) HDU 1754 I Hate It
    线段树(单点更新)/树状数组 HDOJ 1166 敌兵布阵
    递推DP URAL 1031 Railway Tickets
    记忆化搜索(DFS+DP) URAL 1223 Chernobyl’ Eagle on a Roof
    递推DP URAL 1244 Gentlemen
    DFS水题 URAL 1152 False Mirrors
    记忆化搜索(DFS+DP) URAL 1501 Sense of Beauty
    DP+高精度 URAL 1036 Lucky Tickets
    DP/最短路 URAL 1741 Communication Fiend
  • 原文地址:https://www.cnblogs.com/cuancuancuanhao/p/9038952.html
Copyright © 2011-2022 走看看