原文地址:CoderDock.com
前四章已经讲了那么多的概念,下面该真刀真枪地试一试了。
下面是宿主机端程序:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 | int main(void) { //创建2个输入向量 int i; const int LIST_SIZE=1024; int *A=(int*)malloc(sizeof(int)*LIST_SIZE); int *B=(int*)malloc(sizeof(int)*LIST_SIZE); for (i=0; i<LIST_SIZE; i++) { A[i]=i; B[i]=LIST_SIZE-i; } //载入内核源码到source_str FILE *fp; char *source_str; size_t source_size; fp=fopen("vector_add_kernel.cl","r"); if (!fp) { fprintf(stderr, "Failed to load kernel
"); exit(1); } source_str=(char*)malloc(MAX_SOURCE_SIZE); source_size=fread(source_str,1,MAX_SOURCE_SIZE,fp); fclose(fp); //获得平台和设备信息 cl_platform_id platform_id=NULL; cl_device_id device_id=NULL; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret=clGetPlatformIDs(1, &platform_id, &ret_num_platforms); ret=clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices); //创建OpenCL上下文 cl_context context=clCreateContext(NULL,1,&device_id,NULL,NULL,&ret); //创建命令队列 cl_command_queue command_queue=clCreateCommandQueue(context,device_id,0,&ret); //创建内存缓冲对象,在设备上为每个向量 cl_mem a_mem_obj=clCreateBuffer(context,CL_MEM_READ_ONLY, LIST_SIZE*sizeof(int),NULL,&ret); cl_mem b_mem_obj=clCreateBuffer(context,CL_MEM_READ_ONLY, LIST_SIZE*sizeof(int),NULL,&ret); cl_mem c_mem_obj=clCreateBuffer(context,CL_MEM_WRITE_ONLY, LIST_SIZE*sizeof(int),NULL,&ret); //拷贝数据A和B到对应的内存缓冲 ret=clEnqueueWriteBuffer(command_queue,a_mem_obj,CL_TRUE,0, LIST_SIZE*sizeof(int),A,0,NULL,NULL); ret=clEnqueueWriteBuffer(command_queue,a_mem_obj,CL_TRUE,0, LIST_SIZE*sizeof(int),B,0,NULL,NULL); //创建程序 cl_program program=clCreateProgramWithSource(context,1, (const char**)&source_str,(const size_t*)&source_size, &ret); //构建程序 ret=clBuildProgram(program,1,&device_id,NULL,NULL,NULL); //创建OpenCL内核 cl_kernel kernel=clCreateKernel(program,"vector_add",&ret); //设置内核参数 ret=clSetKernelArg(kernel,0,sizeof(cl_mem),(void*)&a_mem_obj); ret=clSetKernelArg(kernel,1,sizeof(cl_mem),(void*)&b_mem_obj); ret=clSetKernelArg(kernel,2,sizeof(cl_mem),(void*)&c_mem_obj); //执行内核 size_t global_item_size=LIST_SIZE; //处理整个列表 size_t local_item_size=64; //分割为64个组 ret=clEnqueueNDRangeKernel(command_queue,kernel,1,NULL, &global_item_size,&local_item_size,0,NULL,NULL); //读取内存缓冲C到本地变量C int *C=(int*)malloc(sizeof(int)*LIST_SIZE); ret=clEnqueueReadBuffer(command_queue,c_mem_obj,CL_TRUE,0, LIST_SIZE*sizeof(int),C,0,NULL,NULL); //显示结果 for (i=0; i<LIST_SIZE; i++) { printf("%d + %d = %d
", A[i], B[i], C[i]); } //清理资源 ret=clFlush(command_queue); ret=clFinish(command_queue); ret=clReleaseKernel(kernel); ret=clReleaseProgram(program); ret=clReleaseMemObject(a_mem_obj); ret=clReleaseMemObject(b_mem_obj); ret=clReleaseMemObject(c_mem_obj); ret=clReleaseCommandQueue(command_queue); ret=clReleaseContext(context); free(A); free(B); free(C); return 0; } |
下面是内核程序(vector_add_kernel.cl):
/*
 * Element-wise vector addition: each work-item writes the sum of one
 * element of A and the matching element of B into C.
 */
__kernel void vector_add(__global const int *A,
                         __global const int *B,
                         __global int *C)
{
    // Index of the element handled by this work-item.
    int idx = get_global_id(0);

    // Perform the addition.
    C[idx] = A[idx] + B[idx];
}
之后使用命令:
# Compile the host program and link it against the OpenCL library;
# without -lOpenCL the link step fails with undefined references to cl* functions.
gcc -o Test Test.c -I<Header_to_OpenCL_Path> -L<Lib_to_OpenCL_Path> -lOpenCL
进行编译。