zoukankan      html  css  js  c++  java
  • 【转载】OpenCL实现矩阵相乘

    矩阵相乘其实就是用前一个矩阵的每一行与后一个矩阵的每一列对应元素相乘,然后将这些乘积相加,得到结果矩阵中对应位置(该行、该列)的数值。具体算法回顾一下线性代数即可。这些行列相乘彼此之间是相互独立的:在CPU上按普通的串行算法计算时,只能一行一列地依次相乘;而放到GPU里面则可以并行计算,如果矩阵维数很大,就能大大节约时间。

    具体代码如下(先是内核代码,需保存为 HelloWorld_Kernel.cl;随后是主机端程序 FirstOpenCL.cpp,它在运行时读取并编译该内核文件):

    // Matrix product C = A * B on row-major buffers; each work-item computes
    // one element of C.
    //
    //   outPutC  result buffer, indexed as [row * widthB + col]
    //            (heightA rows by widthB columns)
    //   widthA   number of columns of A (also the length of each dot product)
    //   heightA  number of rows of A
    //   widthB   number of columns of B
    //   heightB  number of rows of B; not read by the kernel -- the caller is
    //            expected to pass heightB == widthA
    //   inputA   left operand, indexed as [row * widthA + k]
    //   inputB   right operand, indexed as [k * widthB + col]
    //
    // Launch as a 2-D NDRange: dimension 0 selects the column of C and
    // dimension 1 selects the row.
    __kernel
    void simpleMultiply(__global float* outPutC,
                        int widthA,
                        int heightA,
                        int widthB,
                        int heightB,
                        __global float* inputA,
                        __global float* inputB)
    {
        const int row = (int)get_global_id(1);
        const int col = (int)get_global_id(0);

        // The global size may be padded up to a multiple of the work-group
        // size; work-items that fall outside C must not touch memory.
        if (row >= heightA || col >= widthB)
        {
            return;
        }

        // Dot product of row `row` of A with column `col` of B.
        float sum = 0.0f;
        for (int k = 0; k < widthA; k++)
        {
            sum += inputA[row * widthA + k] * inputB[k * widthB + col];
        }

        outPutC[row * widthB + col] = sum;
    }
    // FirstOpenCL.cpp : 定义控制台应用程序的入口点。
    //
    #include "stdafx.h"

    #include <math.h>
    #include <time.h>

    #include <cstdio>
    #include <cstdlib>
    #include <fstream>
    #include <iostream>
    #include <string>
    #include <vector>

    #include <CL/cl.h>
    
    using namespace std; 
    
    #pragma comment (lib,"OpenCL.lib")
    // Reads the entire file named `filename` (opened in binary mode) and
    // returns its bytes as a std::string.
    //
    // An existing but empty file yields an empty string. If the file cannot
    // be opened (or `filename` is null), a diagnostic is written to stdout and
    // the process terminates via exit(1) -- this function does not return in
    // that case.
    std::string  convertToString(const char *filename)
    {
        std::fstream f;
        if (filename != NULL)
        {
            f.open(filename, (std::fstream::in | std::fstream::binary));
        }
        if (!f.is_open())
        {
            std::cout << "\nFile containg the kernel code(\".cl\") not found. Please copy the required file in the folder containg the executable.\n";
            exit(1);
        }

        // Measure the file by seeking to its end; tellg() reports -1 on failure.
        f.seekg(0, std::fstream::end);
        const std::streamoff fileSize = f.tellg();
        f.seekg(0, std::fstream::beg);
        if (fileSize <= 0)
        {
            return std::string();
        }

        // Read straight into the string's own storage: no manual new/delete
        // and no separate terminator to maintain.
        std::string s(static_cast<size_t>(fileSize), '\0');
        f.read(&s[0], static_cast<std::streamsize>(fileSize));

        // Keep only what was actually read in case the stream came up short.
        s.resize(static_cast<size_t>(f.gcount()));
        return s;
    }
      
    int main()  
    {  
        //查询平台  
        cl_int ciErrNum;  
        cl_platform_id platform;  
        ciErrNum = clGetPlatformIDs(1, &platform, NULL);  
    
        //获取设备信息  
        cl_device_id device;  
        cl_int   status;  
        cl_uint maxDims;  
        cl_event events[2];  
        size_t globalThreads[1];  
        size_t localThreads[1];  
        size_t maxWorkGroupSize;  
        size_t maxWorkItemSizes[3];  
     
        ciErrNum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);  
        status = clGetDeviceInfo( device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t),(void*)&maxWorkGroupSize,NULL);   
        status = clGetDeviceInfo( device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint),(void*)&maxDims, NULL);  
        status = clGetDeviceInfo( device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*maxDims,(void*)maxWorkItemSizes, NULL);  
          
        //创建上下文   
        cl_context ctx = clCreateContext(NULL, 1, &device, NULL, NULL, &ciErrNum);   
        cl_command_queue myqueue = clCreateCommandQueue(ctx,device,0,&ciErrNum);  
    
        int wA=6,hA=6;  
        int wB=6,hB=6;  
        int wC=6,hC=6;  
      
        // 数组的大小  
        const int  elementsA = wA*hA;  
        const int  elementsB = wB*hB;  
        const int  elementsC = hA*wB;  
          
        // 计算内存大小  
        size_t datasizeA = sizeof(float)*elementsA;  
        size_t datasizeB = sizeof(float)*elementsB;  
        size_t datasizeC = sizeof(float)*elementsC;  
        // 分配内存空间  
        float *A = (float*)malloc(datasizeA);  
        float *B = (float*)malloc(datasizeB);  
        float *C = (float*)malloc(datasizeC);  
      
        // 初始化输入数组  
        for(int i = 0;i < elementsA;i++)  
        {  
            A[i] = (float)((float)i + 1.0); 
        }  
        for(int i = 0;i < elementsB;i++)  
        {  
            B[i] = (float)((float)i + 1.0);
        }  
        
        cl_mem bufferA = clCreateBuffer(ctx,CL_MEM_READ_ONLY,wA*hA*sizeof(float),NULL,&ciErrNum);  
        ciErrNum = clEnqueueWriteBuffer(myqueue,bufferA,CL_TRUE,0,wA*hA*sizeof(float),(void*)A,0,NULL,NULL);  
      
        cl_mem bufferB = clCreateBuffer(ctx,CL_MEM_READ_ONLY,wB*hB*sizeof(float),NULL,&ciErrNum);  
        ciErrNum = clEnqueueWriteBuffer(myqueue,bufferB,CL_TRUE,0,wB*hB*sizeof(float),(void*)B,0,NULL,NULL);  
      
        cl_mem bufferC = clCreateBuffer(ctx,CL_MEM_WRITE_ONLY,hA*wB*sizeof(float),NULL,&ciErrNum);  
      
        //运行时kernel编译  
        const char * filename  = "HelloWorld_Kernel.cl";  
        std::string  sourceStr = convertToString(filename);  
        const char * source    = sourceStr.c_str();  
        size_t sourceSize[]    = { strlen(source) };  
        //直接将CL文件读到记忆体  
        cl_program myprog = clCreateProgramWithSource( ctx, 1,&source, sourceSize, &ciErrNum);  
        //cl_program myprog = clCreateProgramWithSource(ctx,1,(const char**)&programSource,NULL,&ciErrNum);  
        ciErrNum = clBuildProgram(myprog,0,NULL,NULL,NULL,NULL);  
      
        cl_kernel mykernel = clCreateKernel(myprog,"simpleMultiply",&ciErrNum);  
        //运行程序  
        clSetKernelArg(mykernel,0,sizeof(cl_mem),(void*)&bufferC);  
        clSetKernelArg(mykernel,1,sizeof(cl_mem),(void*)&wA);  
        clSetKernelArg(mykernel,2,sizeof(cl_mem),(void*)&hA);  
        clSetKernelArg(mykernel,3,sizeof(cl_mem),(void*)&wB);  
        clSetKernelArg(mykernel,4,sizeof(cl_mem),(void*)&hB);  
        clSetKernelArg(mykernel,5,sizeof(cl_mem),(void*)&bufferA);  
        clSetKernelArg(mykernel,6,sizeof(cl_mem),(void*)&bufferB);  
      
        size_t localws[2] ={wC,wC};  
        size_t globalws[2]={wC,hC};  
     
        ciErrNum = clEnqueueNDRangeKernel(myqueue,mykernel,2,NULL,globalws,localws,0,NULL,&events[0]);  
        status = clWaitForEvents(1, &events[0]);  
        status = clReleaseEvent(events[0]);  
        //将结果拷贝到主机端  
        ciErrNum = clEnqueueReadBuffer(myqueue,bufferC,CL_TRUE,0,wC*hC*sizeof(float),(void*)C,0,NULL,&events[1]);  
          
        status = clWaitForEvents(1, &events[1]);    
        status = clReleaseEvent(events[1]);  
        
        printf("
    Array A:
    ");
        for (int i = 0; i < wA; i++) {
            for (int j = 0; j < hA; j++)
                printf("%4.3f	", A[i*hA + j]);
            printf("
    ");
        }
        printf("
    Array B:
    ");
        for (int i = 0; i < wB; i++) {
            for (int j = 0; j < hB; j++)
                printf("%4.3f	", B[i*hB + j]);
            printf("
    ");
        }
        printf("
    Array C:
    ");
        for (int i = 0; i < wC; i++) {
            for (int j = 0; j < hC; j++)
                printf("%4.3f	", C[i*hC + j]);
            printf("
    ");
        }
    
        getchar();
        return 0;  
    }  
  • 相关阅读:
    Git面试题
    JVM的垃圾回收算法
    如何有效避免文件名相同的问题
    java.lang.IllegalArgumentException: An invalid character [44] was present in the Cookie value 异常的原因及处理方法
    eclipse新建maven项目出错 pom.xml报错
    maven配置本地仓库、maven配置阿里中央仓库、eclipse配置maven
    Maven安装、配置环境变量
    XML学习——java解析xml文件
    JSP九大内置对象
    JavaWeb学习——session总结
  • 原文地址:https://www.cnblogs.com/mod109/p/6282485.html
Copyright © 2011-2022 走看看