// Adds two numbers on an OpenCL device.
#include <CL/CL.h>
#include <iostream>
#pragma comment( lib, "opencl.lib" )
int main(int argc, char** argv)
{
cl_context hContext;
cl_context_properties prop = CL_CONTEXT_PLATFORM;
cl_platform_id platform;
clGetPlatformIDs( 1, &platform, NULL );
cl_device_id device;
clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL );
cl_int err;
hContext = clCreateContext( NULL, 1, &device, NULL, NULL , &err );
size_t nCtxDescSize;
clGetContextInfo( hContext, CL_CONTEXT_DEVICES, 0, 0, &nCtxDescSize );
cl_device_id* pDevices = (cl_device_id*)new char[nCtxDescSize];
clGetContextInfo( hContext, CL_CONTEXT_DEVICES, nCtxDescSize, pDevices, NULL );
const char* sources =
{
"__kernel void VectorAdd(__global const float* a, __global const float* b, __global float* c)\n"
"{\n"
" // get index into global data array\n"
" int iGID = get_global_id(0);\n"
"\n"
" // bound check (equivalent to the limit on a 'for' loop for standard/serial C code\n"
" if (iGID >= 1)\n"
" { \n"
" return; \n"
" }\n"
" c[iGID] = a[iGID] + b[iGID];\n"
"}\n"
};
const char** sSources = (const char**)&sources;
cl_command_queue hCmdQueue;
hCmdQueue = clCreateCommandQueue( hContext, pDevices[0], 0, 0 );
cl_program hProgram;
hProgram = clCreateProgramWithSource( hContext, 1, sSources, 0, 0 );
clBuildProgram( hProgram, NULL, NULL, NULL, NULL, NULL );
cl_kernel hKernel;
hKernel = clCreateKernel( hProgram, "VectorAdd", 0 );
int cnBlockSize = 512;
int cnBlocks = 3;
size_t cnDimension = cnBlocks * cnBlockSize;
float* pA = new float[cnDimension];
float* pB = new float[cnDimension];
float* pC = new float[cnDimension];
memset( pA, 0, 4 * cnDimension );
memset( pB, 0, 4 * cnDimension );
*pA = 1.0f;
*pB = 2.0f;
cl_mem MemA, MemB, MemC;
MemA = clCreateBuffer( hContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
cnDimension * sizeof(float), pA, 0 );
MemB = clCreateBuffer( hContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
cnDimension * sizeof(float), pB, 0 );
MemC = clCreateBuffer( hContext, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
cnDimension * sizeof(float), pC, 0 );
clSetKernelArg( hKernel, 0, sizeof(cl_mem), (void*)&MemA );
clSetKernelArg( hKernel, 1, sizeof(cl_mem), (void*)&MemB );
clSetKernelArg( hKernel, 2, sizeof(cl_mem), (void*)&MemC );
clEnqueueNDRangeKernel( hCmdQueue, hKernel, 1, 0, &cnDimension, 0, 0, 0, 0 );
clEnqueueReadBuffer( hCmdQueue, MemC, CL_TRUE, 0, cnBlockSize * sizeof(float),
pC, NULL, NULL, NULL );
delete[] pA;
delete[] pB;
delete[] pC;
delete[] pDevices;
clReleaseMemObject( MemA );
clReleaseMemObject( MemB );
clReleaseMemObject( MemC );
return 0;
}