zoukankan      html  css  js  c++  java
  • OpenCL入门

    1,OpenCL程序包含两部分:

    设备上执行:GPU 主机上运行:CPU

    需要使用OpenCL语言编写Kernel函数。

    2,

    (1)Kernel:设备程序执行的入口点,是唯一可以从主机上调用执行的函数。

    eg:

    Function:

    /* Host-side reference implementation: writes src_a[i] + src_b[i] into res[i]
       for every index i in [0, num). Nothing is written when num <= 0. */
    void vector_add_cpu (const float* src_a,
                   const float* src_b,
                   float*  res,
                   const int num)
    {
       int idx = 0;

       while (idx < num) {
          const float lhs = src_a[idx];
          const float rhs = src_b[idx];
          res[idx] = lhs + rhs;
          ++idx;
       }
    }
    

    OpenCL Kernel:

    /* Device-side vector addition: each work-item writes at most one element of res. */
    __kernel void vector_add_gpu (__global const float* src_a,
                         __global const float* src_b,
                         __global float* res,
               const int num)
    {
       /* get_global_id(0) returns the ID of the thread that is executing.
       Many threads start running the same kernel at the same time and each
       one receives a different ID, so each performs a different computation. */
       const int gid = get_global_id(0);

       /* Every work-item checks whether its ID lies inside the vector range;
       those outside the range have nothing to compute. */
       if (gid >= num)
          return;

       res[gid] = src_a[gid] + src_b[gid];
    }

    如何编写Kernel,如何表达并行性,执行模型是什么样的?

    SIMT: Single Instruction, Multiple Threads(单指令多线程)

    work-item(工作项): 最小的执行单元。当一个Kernel开始执行,可定义数量的work-item开始运行,每个都执行相同的代码。其中,每个work-item有一个id,可以在kernel中被访问,运行在work-item上的kernel通过id找到work-item需要处理的数据。

    work-group(工作组):允许work-item之间的通信和协作。也有唯一的可被kernel读取的id。

    ND-range:下一个组织级别,定义work-group的组织形式。

    Attention:

    a. 关键字__kernel定义函数为kernel, 必须返回void。

    b. 关键字__global位于参数前面,定义了参数内存的存放位置。

    c. 所有Kernel必须写在".cl"文件中,".cl"文件必须只包含OpenCL代码。

    (2)Host(主机)

    首先建立基本的OpenCL运行环境:

    Platform: Host加OpenCL框架管理下的若干设备构成Platform。通过Platform,应用程序可以与设备共享资源并在设备上执行Kernel。Platform通过cl_platform_id表现。

    Platform初始化:

    // return error code
    cl_int oclGetPlatformID(cl_platform_id *platforms) //Pointer to the platform object
    

    Device: 通过cl_device_id表现。

    // Return the error code
    cl_int clGetDeviceIDs(cl_platform_id platform,
    cl_device_type device_type, //Bitfield identifying the type. For GPU we use CL_DEVICE_TYPE_GPU
    cl_uint num_entries, //Number of Devices, typically 1
    cl_device_id *devices, //Pointer to the device object
    cl_uint *num_devices) //Puts here the number of devices matching the device_type
    

    Context: 定义整个OpenCL环境,包括OpenCL kernel、设备、内存管理、命令队列等。Context使用cl_context表现。

    // Returns the contxt
    cl_context cl_CreateContext(onst cl_context_properties *properties, //Bitwise with the properties(ee specification)
    cl_unit num_devices, //Number of devices
    const cl_device_id *devices, //Pointer to devices object
    void (*pfn_notify)(const char *errinfo, const void *private_info, size_t cb, void *user_data),
    void *user_data, 
    cl_int *errcode_ret) //error code result
    

    Command_Queue: 存储需要在设备上执行的OpenCL指令的队列。Command_Queue建立在一个Context指定的设备上,多个Command_Queue允许应用程序在不需要同步的情况下执行多条无关联的指令。

    cl_command_queue clCreateCommandQueue(cl_context context,
    cl_device_id device,
    cl_command_queue_properties properties, //Bitwise with the properties
    cl_int *errcode_ret) //Error code result

    环境初始化方法:

    // Builds the basic OpenCL environment in order: platform, device, context, command queue.
    // Every step prints a message and exits with the OpenCL error code if it fails.
    cl_int error = 0;   // Used to handle error codes
    cl_platform_id platform;
    cl_context context;
    cl_command_queue queue;
    cl_device_id device;
    
    // Platform
    error = oclGetPlatformID(&platform);
    if (error != CL_SUCCESS) {
       cout << "Error getting platform id: " << errorMessage(error) << endl;
       exit(error);
    }
    // Device: ask for one GPU device; the count of matching devices is not needed, hence NULL
    error = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (error != CL_SUCCESS) {
       cout << "Error getting device ids: " << errorMessage(error) << endl;
       exit(error);
    }
    // Context
    context = clCreateContext(0, 1, &device, NULL, NULL, &error);
    if (error != CL_SUCCESS) {
       cout << "Error creating context: " << errorMessage(error) << endl;
       exit(error);
    }
    // Command-queue
    queue = clCreateCommandQueue(context, device, 0, &error);
    if (error != CL_SUCCESS) {
       cout << "Error creating command queue: " << errorMessage(error) << endl;
       exit(error);
    }
    

    接着,分配内存。

    针对介绍Kernel时举的例子,需要分配三个向量的内存空间,且至少将其中的两个初始化。

    // Host-side storage: two input vectors and one result vector of `size` floats each
    const int size = 1234567;
    float* src_a_h = new float[size];
    float* src_b_h = new float[size];
    float* res_h = new float[size];
    // Initialize both vectors: element i of each input holds the value i
    for (int i = 0; i < size; i++) {
       src_a_h[i] = src_b_h[i] = (float) i;
    }
    

    而在Devices上分配内存,我们需要使用cl_mem类型:

    //Returns the cl_mem object referencing the memory allocated on the device
    cl_mem clCreateBuffer(cl_context context, //the context where the memory will be allocated
    cl_mem_flags flags,
    size_t size, // Size in bytes
    void *host_ptr,
    cl_int *errcode_ret)
    

    其中,flags是逐位的,选项包括:

    CL_MEM_READ_WRITE

    CL_MEM_WRITE_ONLY

    CL_MEM_READ_ONLY

    CL_MEM_USE_HOST_PTR

    CL_MEM_ALLOC_HOST_PTR

    CL_MEM_COPY_HOST_PTR – 从 host_ptr处拷贝数据

    而clCreateBuffer的使用方法为:

    const int mem_size = sizeof(float)*size;
    
    // Allocates a buffer of size mem_size and copies mem_size bytes from src_a_h
    
    cl_mem src_a_d = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size, src_a_h, &error);
    
    cl_mem src_b_d = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size, src_b_h, &error);
    
    cl_mem res_d = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size, NULL, &error);
    

    (3)程序与Kernel

    怎么调用Kernel?编译器怎么将代码放在Device上?怎么编译Kernel?

    对比OpenCL Program与OpenCL Kernel:

    Kernel:本质是一个可以在Host上调用,在Device上运行的函数,是在运行时编译的。所有运行在Device上的代码,包括Kernel以及Kernel调用的其他函数都是在运行时编译的。

    Program:OpenCL Program是由Kernel函数/其他函数/声明组成。通过cl_program表示。当创建一个Program时,必须指定其是由哪些文件组成的,然后编译它。

    Create a Program:

    // Returns the OpenCL program
    cl_program clCreateProgramWithSource(cl_context context,
    cl_uint count, //number of files
    const char **strings, //array of strings, each one is a file
    const size_t *lengths, //array specifying the file lengths
    cl_int *errcode_ret) //error code to be returned
    

    在创建了Program后,执行编译操作:

    cl_int clBuildProgram(cl_program program,
    cl_uint num_devices,
    const cl_device_id *device_list,
    const char *options, //Compiler options, see the specifications for more details
    void (*pfn_notify)(cl_program, void *user_data),
    void *user_data)
    

    编译后,我们使用如下函数查看编译log:

    cl_int clGetProgramBuildInfo(cl_program program,
    cl_device_id device,
    cl_program_build_info param_name, //The parameter we want to know
    size_t param_value_size,
    void *param_value, // The answer
    size_t *param_value_size_ret)
    

    最后提取Program的入口,使用cl_kernel:

    cl_kernel cl_CreateKernel(cl_pogram program,
    const char *kernel_name, //The name of the kernel, i.e.the name of the kernel function as it's declared in the code
    cl_int *errcode_ret)
    

    我们可以创建多个Program,而每个Program中可以包含多个Kernel。创建Program:

    // Create a program
    size_t src_size = 0;
    const char* path = shrFindFilePath("vector_add_gpu.cl", NULL);
    const char* source = oclLoadProgSource(path, "", &src_size);
    cl_program program = clCreateProgramWithSource(context, 1 &source, &src_size, &error);
    assert(error == CL_SUCCESS)
    
    //Build the program
    error = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
    assert(error == CL_SUCCESS)
    
    //Show the log
    char* build_log;
    size_t log_size;
    //First call to know the proper size
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
    build_log = new char[log_size + 1];
    //Second call to get the log
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);
    build_log[log_size] = ''";
    cout << build_log << endl;
    delete[] build_log;
    
    //Extracting the kernel
    cl_kernel vector_add_kernel = clCreateKernel(program, "vector_add_gpu", &error);
    assert(error == CL_SUCCESS);
    

    我们将Kernel建立好后,在运行Kernel前,需要先设置Kernel的参数。

    cl_int clSetKernelArg(cl_kernel kernel, //Which Kernel
    cl_uint arg_index, //Which argument
    size_t arg_size, //Size of the next argument(not the value pointed by it)
    const void *arg_value) //Value
    

    对于Kernel中的每个参数,都需要调用上述的函数来为参数设置Value。所有参数设置完毕后,我们可以调用Kernel:

    cl_int clEnqueueNDRangeKernel(cl_command_queue command_queue,
    cl_kernel kernel,
    cl_uint work_dim, //Choose if using 1D, 2D or 3D work-items and work-groups
    const size_t *global_work_offset,
    const size_t *global_work_size, //The total number of work-items(must have work-dim dimensions)
    const size_t *local_work_size, //The number of work-items per work-group(must have work-dim dimensions)
    cl_uint num_events_in_wait_list,
    const cl_event *event_wait_list,
    cl_event *event)
    

    最终,设置参数及调用Kernel:

    //Enqueuing parameters
    //Note that we inform the size of the cl_mem object, not the size of the memory pointed by it
    error = clSetKernelArg(vector_add_k, 0, sizeof(cl_mem), &src_a_d);
    error |= clSetKernelArg(vector_add_k, 1, sizeof(cl_mem), &src_b_d);
    error |= clSetKernelArg(vector_add_k, 2, sizeof(cl_mem), &res_d);
    error |= clSetKernelArg(vector_add_k, 3, sizeof(size_t), &size);
    assert(error == CL_SUCCESS)
    
    //Luauching Kernel
    const size_t local_ws = 512; //Number of work-items per work-group
    //shrRoundUp returns the smallest multiple of local_ws bigger than size
    const size_t global_ws = shrRoundUp(local_ws, size); //Total number of work-items
    error = clEnqueueNDRangeKernel(queue, vector_add_k, 1, NULL, &global_ws, &local_ws, 0, NULL, NULL);
    assert(error == CL_SUCCESS);
    

    (4)读取结果

    cl_int clEnqueueReadBuffer(cl_command_queue command_queue,
    cl_mem buffer, //From which buffer
    cl_bool offset //Offset from the beginning
    size_t cb, //Size to be read(in bytes)
    void *ptr, //Pointer to the host memory
    cl_uint num_events_in_wait_list,
    const cl_event *event_wait_list,
    cl_event *event)
    

    使用方法如下:

    //Reading back
    float* check = net float[size];
    clEnQueueReadBuffer(queue, res_d, CL_TRUE, 0, mem_size, check, 0, NULL, NULL);
    

    (5)清除内存

    使用clCreate*函数创建的对象(缓冲区/Kernel/队列等)必须使用对应的clRelease*函数释放。

    // Cleaning memory
    // Host arrays allocated with new[]
    delete[] src_a_h;
    delete[] src_b_h;
    delete[] res_h;
    delete[] check;
    // OpenCL objects: each object obtained from a clCreate* call gets its matching clRelease* call
    clReleaseMemObject(src_a_d);
    clReleaseMemObject(src_b_d);
    clReleaseMemObject(res_d);
    clReleaseKernel(vector_add_k);
    clReleaseProgram(program);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    
    未经允许,请勿转载
  • 相关阅读:
    Git Bash 常用指令
    C/C++连接MySQL数据库执行查询
    question from asktom
    ORACLE AWR报告
    查看oracle表索引
    ORACLE数据库关闭与启动
    SYS vs SYSTEM and SYSDBA vs SYSOPER
    【面试】二叉树遍历的非递归实现
    快速排序的非递归实现
    MySQL数据库基础
  • 原文地址:https://www.cnblogs.com/zhuzhudong/p/13361155.html
Copyright © 2011-2022 走看看