zoukankan      html  css  js  c++  java
  • cuda 编程1

    本文参考链接:
    《CUDA C Programming Guide》(《CUDA C 编程指南》)导读 https://zhuanlan.zhihu.com/p/53773183?from_voters_page=true

    //main.cu

    /* main.cu */
    #include <cstdio>
    #include <cstdlib>
    #include <iostream>
    #include <time.h>
    #include "opencv2/highgui.hpp"
    #include "opencv2/opencv.hpp"
    using namespace cv;
    using namespace std;
    
    // Kernel: converts one 3-channel pixel per thread into a single gray byte.
    //
    // The x dimension of the launch indexes image columns and the y dimension
    // indexes image rows. Both buffers are addressed as tightly packed,
    // row-major arrays of imgheight * imgwidth elements.
    //
    //   d_in      input pixels, one uchar3 per pixel
    //   d_out     output gray values, one byte per pixel
    //   imgheight number of rows
    //   imgwidth  number of columns
    //
    // The weights 0.299 / 0.587 / 0.114 are applied to .x / .y / .z, which is
    // the usual luma formula when .x, .y, .z hold R, G, B respectively.
    __global__ void rgb2grayincuda(uchar3 * const d_in, unsigned char * const d_out,
                                    uint imgheight, uint imgwidth)
    {
        const unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
        const unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;

        // Threads that fall outside the image simply do nothing.
        if (col >= imgwidth || row >= imgheight)
        {
            return;
        }

        const unsigned int pixel = row * imgwidth + col;
        const uchar3 rgb = d_in[pixel];
        // The float result is truncated when stored into the byte output.
        d_out[pixel] = 0.299f * rgb.x + 0.587f * rgb.y + 0.114f * rgb.z;
    }
    
    // Serial CPU version of the conversion, used as a baseline for comparison.
    //
    //   d_in      input buffer, 3 interleaved bytes per pixel, row-major
    //   d_out     output buffer, 1 byte per pixel, row-major
    //   imgheight number of rows
    //   imgwidth  number of columns
    //
    // Bytes 0 / 1 / 2 of each pixel are weighted by 0.299 / 0.587 / 0.114 and
    // the float sum is truncated when stored into the byte output.
    void rgb2grayincpu(unsigned char * const d_in, unsigned char * const d_out,
                                    uint imgheight, uint imgwidth)
    {
        for (int row = 0; row < imgheight; ++row)
        {
            const uint rowOffset = row * imgwidth;
            for (int col = 0; col < imgwidth; ++col)
            {
                const uint pixel = rowOffset + col;
                const uint base = pixel * 3;
                const unsigned char first = d_in[base];
                const unsigned char second = d_in[base + 1];
                const unsigned char third = d_in[base + 2];
                d_out[pixel] = 0.299f * first + 0.587f * second + 0.114f * third;
            }
        }
    }
    
    // Terminates the program with a diagnostic when a CUDA runtime call failed.
    //   status  return code of the CUDA call
    //   what    short description of the operation, used in the message
    static void checkCuda(cudaError_t status, const char *what)
    {
        if (status != cudaSuccess)
        {
            fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
            exit(EXIT_FAILURE);
        }
    }

    // Demo driver: loads an image, converts it to gray with either the CUDA
    // kernel or the serial CPU routine (selected by the #if switches below),
    // prints the elapsed time of the conversion and shows the result.
    // Returns 0 on success and 1 when the input image cannot be read.
    int main(void)
    {
        const char *imagePath = "/data_2/dog2.jpg";
        Mat srcImage = imread(imagePath);
        if (srcImage.empty())
        {
            // imshow on an empty Mat would fail, so stop here with a clear message.
            fprintf(stderr, "failed to read image %s\n", imagePath);
            return 1;
        }
        imshow("srcImage", srcImage);
        waitKey(0);

        // imread stores colour pixels in B,G,R order, while both conversion
        // routines weight bytes 0/1/2 as R/G/B. Convert once up front so the
        // luma weights are applied to the right channels.
        Mat rgbImage;
        cvtColor(srcImage, rgbImage, COLOR_BGR2RGB);
        if (!rgbImage.isContinuous())
        {
            // Both routines index the buffer as one tightly packed array.
            rgbImage = rgbImage.clone();
        }

        const uint imgheight = rgbImage.rows;
        const uint imgwidth = rgbImage.cols;
        // Computed in size_t so the byte counts below cannot wrap in 32 bits.
        const size_t pixelCount = (size_t)imgheight * imgwidth;

        Mat grayImage(imgheight, imgwidth, CV_8UC1, Scalar(0));

        uchar3 *d_in = NULL;   // vector type: 3 unsigned chars per pixel
        unsigned char *d_out = NULL;

        // Allocate the device buffers first.
        checkCuda(cudaMalloc((void**)&d_in, pixelCount * sizeof(uchar3)), "cudaMalloc d_in");
        checkCuda(cudaMalloc((void**)&d_out, pixelCount * sizeof(unsigned char)), "cudaMalloc d_out");

        // Copy the host image to the device.
        checkCuda(cudaMemcpy(d_in, rgbImage.data, pixelCount * sizeof(uchar3), cudaMemcpyHostToDevice),
                  "cudaMemcpy host to device");

        // One thread per pixel; round the grid up so the whole image is covered.
        dim3 threadsPerBlock(32, 32);
        dim3 blocksPerGrid((imgwidth + threadsPerBlock.x - 1) / threadsPerBlock.x,
            (imgheight + threadsPerBlock.y - 1) / threadsPerBlock.y);

        clock_t start, end;
        start = clock();
    #if 0 //cuda
        // Launch the kernel.
        rgb2grayincuda<<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out, imgheight, imgwidth);
        // A launch does not return a status itself; query it explicitly.
        checkCuda(cudaGetLastError(), "kernel launch");
        // Kernel execution is asynchronous, so synchronize before reading the clock.
        checkCuda(cudaDeviceSynchronize(), "kernel execution");
        end = clock();
        printf("cuda exec time is %.8f\n", (double)(end-start)/CLOCKS_PER_SEC);
        // Copy the result back to the host.
        checkCuda(cudaMemcpy(grayImage.data, d_out, pixelCount * sizeof(unsigned char), cudaMemcpyDeviceToHost),
                  "cudaMemcpy device to host");
    #endif

    #if 1 //cpu
        rgb2grayincpu(rgbImage.data, grayImage.data, imgheight, imgwidth);
        end = clock();
        printf("cpu exec time is %.8f\n", (double)(end-start)/CLOCKS_PER_SEC);
    #endif

        // Release device memory on every path, not only when the CUDA branch is enabled.
        checkCuda(cudaFree(d_in), "cudaFree d_in");
        checkCuda(cudaFree(d_out), "cudaFree d_out");

        imshow("grayImage", grayImage);
        waitKey(0);
        return 0;
    }
    

    //CMakeLists.txt

    # Builds the testcuda demo: main.cu is compiled through the CUDA toolchain
    # and linked against OpenCV.
    cmake_minimum_required(VERSION 2.8)
    project(testcuda)
    find_package(CUDA REQUIRED)
    find_package(OpenCV REQUIRED)
    # Use the include directories reported by find_package(OpenCV) instead of a
    # machine-specific absolute path. For an OpenCV installed in a non-standard
    # prefix, pass -DOpenCV_DIR=<dir containing OpenCVConfig.cmake> to cmake.
    include_directories(${OpenCV_INCLUDE_DIRS})
    cuda_add_executable(testcuda main.cu)
    target_link_libraries(testcuda ${OpenCV_LIBS})
    

    cuda 运行:cuda exec time is 0.00005800(单位:秒;只统计内核启动到同步完成的时间,不含显存分配和数据拷贝)
    cpu 运行:cpu exec time is 0.00115700(单位:秒)

    例子2:
    参考链接
    https://zhuanlan.zhihu.com/p/34587739

    #include <iostream>
    #include <time.h>
    #include "opencv2/highgui.hpp"   
    #include "opencv2/opencv.hpp"
    using namespace cv;
    using namespace std;
    
    // Queries CUDA device 0 and prints a few of its properties: name, SM count,
    // shared memory per block, and thread/warp limits.
    // Returns 0 on success and 1 when the device properties cannot be read.
    int main(void)
    {
        int dev = 0;
        cudaDeviceProp devProp;
        // Check the result: on failure devProp is not filled in and must not be printed.
        const cudaError_t status = cudaGetDeviceProperties(&devProp, dev);
        if (status != cudaSuccess)
        {
            std::cerr << "cudaGetDeviceProperties failed for device " << dev << ": "
                      << cudaGetErrorString(status) << std::endl;
            return 1;
        }
        std::cout << "使用GPU device " << dev << ": " << devProp.name << std::endl;
        std::cout << "SM的数量:" << devProp.multiProcessorCount << std::endl;
        std::cout << "每个线程块的共享内存大小:" << devProp.sharedMemPerBlock / 1024.0 << " KB" << std::endl;
        std::cout << "每个线程块的最大线程数:" << devProp.maxThreadsPerBlock << std::endl;
        std::cout << "每个EM的最大线程数:" << devProp.maxThreadsPerMultiProcessor << std::endl;
        // Derive the warp count from the warp size the device reports rather than a literal 32.
        std::cout << "每个EM的最大线程束数:" << devProp.maxThreadsPerMultiProcessor / devProp.warpSize << std::endl;
        return 0;
    }
    

    输出如下:
    使用GPU device 0: GeForce GTX 1080
    SM的数量:20
    每个线程块的共享内存大小:48 KB
    每个线程块的最大线程数:1024
    每个EM的最大线程数:2048
    每个EM的最大线程束数:64

    cuda编程,10 篇博客,深入浅出谈CUDA

    https://blog.csdn.net/sunmc1204953974/category_6156113.html

  • 相关阅读:
    [C/C++]宽字符与控制台程序
    C# 实现屏幕键盘 (SCREENKEYBOARD)
    c#模拟键盘输入
    窗口玻璃特效,半透明窗口,使用DWM实现Aero Glass效果
    DMRS、DRS、SRS、CRS各自作用区别
    LTE的9种传输模式
    在4G通讯技术中什么是ZC根序列,ZC根序列规划的目的和原则是什么?
    為何LTE要先偵測PSS然後再偵測SSS 转自C114
    PSS和SSS用户小区接入的同步过程
    LTE PCI MOD3 规划
  • 原文地址:https://www.cnblogs.com/yanghailin/p/14234808.html
Copyright © 2011-2022 走看看