最近有不少朋友在多次循环执行OpenCL内核程序的时候碰到一些问题。由于这些问题对OpenCL初学者而言可能比较普遍,因此我这里给出一个清晰简单的demo来演示如何简单又高效地循环执行OpenCL内核。
以下程序的大概意思与流程是:
内核程序含有两个参数,第一个参数既是输入又是输出,第二个参数仅仅用于输入。不过第一个参数只对其初始化一次,而第二个参数在每次循环执行新一次的内核程序前会再传递一次数据。这么做有助于同学更好地去理解、把握存储器对象的基本使用方法。
存储器对象在通过cl_context上下文创建完之后,其所在的GPU端的位置就不变了。因此,我们在每次循环执行内核程序之前,不需要先把存储器对象释放掉再重新分配——那样做比较低效。我们完全可以重用同一个存储器对象。
以下代码在我的MacBook Air上能完全通过编译执行。没有任何warning。
执行环境:基于Haswell微架构的Intel Core i7 4650U,Intel HD Graphics 5000,8GB DDR3L,128GB SSD。
OS X 10.9.2 Mavericks,Xcode 5.1,Apple LLVM 5.1,支持GNU11标准的C编译器。
#include <stdio.h> #include <string.h> #include <stdlib.h> #ifdef __APPLE__ #include <OpenCL/opencl.h> #else #include <CL/cl.h> #endif int main(void) { cl_int ret; cl_platform_id platform_id = NULL; cl_device_id device_id = NULL; cl_context context = NULL; cl_command_queue command_queue = NULL; cl_mem memObj1 = NULL; cl_mem memObj2 = NULL; char *kernelSource = NULL; cl_program program = NULL; cl_kernel kernel = NULL; int *pInputBuffer1 = NULL; int *pInputBuffer2 = NULL; int *pOutputBuffer = NULL; clGetPlatformIDs(1, &platform_id, NULL); if(platform_id == NULL) { puts("Get OpenCL platform failed!"); goto FINISH; } clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); if(device_id == NULL) { puts("No GPU available as a compute device!"); goto FINISH; } context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); if(context == NULL) { puts("Context not established!"); goto FINISH; } command_queue = clCreateCommandQueue(context, device_id, 0, &ret); if(command_queue == NULL) { puts("Command queue cannot be created!"); goto FINISH; } // Specify the path of the kernel source const char *pFileName = "/Users/zennychen/Downloads/test.cl"; FILE *fp = fopen(pFileName, "r"); if (fp == NULL) { puts("The specified kernel source file cannot be opened!"); goto FINISH; } fseek(fp, 0, SEEK_END); const long kernelLength = ftell(fp); fseek(fp, 0, SEEK_SET); kernelSource = malloc(kernelLength); fread(kernelSource, 1, kernelLength, fp); fclose(fp); program = clCreateProgramWithSource(context, 1, (const char**)&kernelSource, (const size_t*)&kernelLength, &ret); ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); if (ret != CL_SUCCESS) { size_t len; char buffer[8 * 1024]; printf("Error: Failed to build program executable! 
"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s ", buffer); goto FINISH; } kernel = clCreateKernel(program, "test", &ret); if(kernel == NULL) { puts("Kernel failed to create!"); goto FINISH; } const size_t contentLength = sizeof(*pInputBuffer1) * 1024 * 1024; // 这里预分配的缓存大小为4MB,第一个参数是读写的 memObj1 = clCreateBuffer(context, CL_MEM_READ_WRITE, contentLength, NULL, &ret); if(memObj1 == NULL) { puts("Memory object1 failed to create!"); goto FINISH; } // 这里预分配的缓存大小为4MB,第一个参数是只读的 memObj2 = clCreateBuffer(context, CL_MEM_READ_ONLY, contentLength, NULL, &ret); if(memObj1 == NULL) { puts("Memory object2 failed to create!"); goto FINISH; } ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memObj1); ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&memObj2); if(ret != CL_SUCCESS) { puts("Set arguments error!"); goto FINISH; } // 以下为在主机端分配输入缓存 pInputBuffer1 = malloc(contentLength); pInputBuffer2 = malloc(contentLength); // 然后对此工作缓存进行初始化 for(int i = 0; i < 1024 * 1024; i++) pInputBuffer1[i] = i + 1; memset(pInputBuffer2, 0, contentLength); // 然后分配输出缓存 pOutputBuffer = malloc(contentLength); // 先将第一个参数的数据传入GPU端,以后就不去改动了 ret = clEnqueueWriteBuffer(command_queue, memObj1, CL_TRUE, 0, contentLength, pInputBuffer1, 0, NULL, NULL); if(ret != CL_SUCCESS) { puts("Data transfer failed"); goto FINISH; } int count = 5; // 执行5次循环 do { // 先将第二个参数传给GPU ret = clEnqueueWriteBuffer(command_queue, memObj2, CL_TRUE, 0, contentLength, pInputBuffer2, 0, NULL, NULL); if(ret != CL_SUCCESS) { puts("Data transfer failed"); goto FINISH; } // 这里指定将总共有1024 * 1024个work-item ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, (const size_t[]){1024 * 1024}, NULL, 0, NULL, NULL); // 将结果拷贝给主机端 ret = clEnqueueReadBuffer(command_queue, memObj1, CL_TRUE, 0, contentLength, pOutputBuffer, 0, NULL, NULL); // 做次同步,这里偷懒,不用wait event机制了~ clFinish(command_queue); // 做校验 const int newValue = 5 - count + 1; const int addition = (5 - count) * 
newValue / 2; for(int i = 0; i < 1024 * 1024; i++) { if(pOutputBuffer[i] != i + 1 + addition) { puts("Result error!"); break; } } // 最后,给第二个缓存初始化新数据 for(int i = 0; i < 1024 * 1024; i++) pInputBuffer2[i] = newValue; } while(--count > 0); FINISH: /* Finalization */ if(pInputBuffer1 != NULL) free(pInputBuffer1); if(pInputBuffer2 != NULL) free(pInputBuffer2); if(pOutputBuffer != NULL) free(pOutputBuffer); if(kernelSource != NULL) free(kernelSource); if(memObj1 != NULL) clReleaseMemObject(memObj1); if(memObj2 != NULL) clReleaseMemObject(memObj2); if(kernel != NULL) clReleaseKernel(kernel); if(program != NULL) clReleaseProgram(program); if(command_queue != NULL) clReleaseCommandQueue(command_queue); if(context != NULL) clReleaseContext(context); return 0; }
上面OpenCL内核源文件的路径被写死了——“/Users/zennychen/Downloads/test.cl”。各位可以根据自己环境重新指定。
另外,上面用了一些C99语法特性。如果是用Win7的小伙伴们,请使用Visual Studio 2013(Express/Professional)的C编译器。
下面是OpenCL内核源文件:
/*
 * Element-wise accumulate kernel.
 *
 * Each work-item takes its global id along dimension 0 and adds the
 * matching element of pIn into the matching element of pInOut.
 *
 * pInOut  buffer that is both read and written.
 * pIn     buffer that is only read.
 */
__kernel void test(__global int *pInOut, __global int *pIn)
{
    int gid = get_global_id(0);

    pInOut[gid] = pInOut[gid] + pIn[gid];
}