本文参考链接:
《CUDA C Programming Guide》(《CUDA C 编程指南》)导读 https://zhuanlan.zhihu.com/p/53773183?from_voters_page=true
/* main.cu */
#include <iostream>
#include <time.h>
#include "opencv2/highgui.hpp"
#include "opencv2/opencv.hpp"
using namespace cv;
using namespace std;
// Grayscale conversion kernel: each thread converts at most one pixel.
//
// A thread derives its column from the x block/thread indices and its row
// from the y block/thread indices, so the launch must be 2D and cover at
// least imgwidth x imgheight threads.
//
//   d_in      - input pixels, one uchar3 per pixel, rows stored back to back
//   d_out     - output buffer, one byte per pixel, same row-major layout
//   imgheight - number of rows
//   imgwidth  - number of columns
//
// The x, y and z components of every input pixel are weighted by 0.299,
// 0.587 and 0.114 respectively; the float sum is stored into d_out through
// the implicit conversion to unsigned char.
__global__ void rgb2grayincuda(uchar3 * const d_in, unsigned char * const d_out,
                               uint imgheight, uint imgwidth)
{
    const unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;

    // The grid is rounded up to whole blocks, so some threads fall outside
    // the image; they simply leave without touching memory.
    if (col >= imgwidth || row >= imgheight)
        return;

    const unsigned int pixel = row * imgwidth + col;
    const uchar3 rgb = d_in[pixel];
    d_out[pixel] = 0.299f * rgb.x + 0.587f * rgb.y + 0.114f * rgb.z;
}
// Serial CPU implementation used as a baseline for the CUDA kernel.
//
//   d_in      - interleaved input, three bytes per pixel, rows back to back
//   d_out     - output buffer, one byte per pixel
//   imgheight - number of rows
//   imgwidth  - number of columns
//
// For every pixel the three input bytes are weighted by 0.299, 0.587 and
// 0.114 (in storage order) and the float sum is stored through the implicit
// conversion to unsigned char.
void rgb2grayincpu(unsigned char * const d_in, unsigned char * const d_out,
                   uint imgheight, uint imgwidth)
{
    for (uint row = 0; row < imgheight; ++row)
    {
        for (uint col = 0; col < imgwidth; ++col)
        {
            const uint pixel = row * imgwidth + col;
            const unsigned char *channels = d_in + pixel * 3;
            const unsigned char first = channels[0];
            const unsigned char second = channels[1];
            const unsigned char third = channels[2];
            d_out[pixel] = 0.299f * first + 0.587f * second + 0.114f * third;
        }
    }
}
// Build-time switch between the two implementations. Define USE_CUDA=1
// (for example with -DUSE_CUDA=1) to run the GPU kernel; the default of 0
// keeps the CPU reference path.
#ifndef USE_CUDA
#define USE_CUDA 0
#endif

#if USE_CUDA
// Returns true when status is cudaSuccess; otherwise reports the failed
// step and the CUDA error string on stderr and returns false.
static bool cudaSucceeded(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Runs rgb2grayincuda on a host image.
//
//   h_in      - host input, three bytes per pixel, rows back to back
//   h_out     - host output, one byte per pixel
//   imgheight - number of rows
//   imgwidth  - number of columns
//
// Uploads h_in, launches one thread per pixel, prints the kernel execution
// time and downloads the result into h_out. The device buffers are released
// on every path. Returns false if any CUDA call fails.
static bool rgb2grayOnDevice(const unsigned char *h_in, unsigned char *h_out,
                             uint imgheight, uint imgwidth)
{
    const size_t pixelCount = (size_t)imgheight * imgwidth;
    uchar3 *d_in = NULL;        // vector type: three unsigned chars per pixel
    unsigned char *d_out = NULL;

    // Allocate device memory and upload the input image.
    bool ok = cudaSucceeded(cudaMalloc((void**)&d_in, pixelCount * sizeof(uchar3)),
                            "cudaMalloc(d_in)")
           && cudaSucceeded(cudaMalloc((void**)&d_out, pixelCount * sizeof(unsigned char)),
                            "cudaMalloc(d_out)")
           && cudaSucceeded(cudaMemcpy(d_in, h_in, pixelCount * sizeof(uchar3),
                                       cudaMemcpyHostToDevice),
                            "cudaMemcpy(host to device)");

    if (ok)
    {
        // One thread per pixel; round the grid up so it covers the whole image.
        dim3 threadsPerBlock(32, 32);
        dim3 blocksPerGrid((imgwidth + threadsPerBlock.x - 1) / threadsPerBlock.x,
                           (imgheight + threadsPerBlock.y - 1) / threadsPerBlock.y);

        const clock_t start = clock();
        rgb2grayincuda<<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out, imgheight, imgwidth);
        // A launch reports configuration errors only through cudaGetLastError().
        // The launch is asynchronous, so synchronize before reading the clock.
        ok = cudaSucceeded(cudaGetLastError(), "kernel launch")
          && cudaSucceeded(cudaDeviceSynchronize(), "kernel execution");
        const clock_t end = clock();
        if (ok)
            printf("cuda exec time is %.8f\n", (double)(end - start) / CLOCKS_PER_SEC);
    }

    // Copy the result back to the host.
    if (ok)
        ok = cudaSucceeded(cudaMemcpy(h_out, d_out, pixelCount * sizeof(unsigned char),
                                      cudaMemcpyDeviceToHost),
                           "cudaMemcpy(device to host)");

    // Release device memory on success and failure alike; cudaFree on a null
    // pointer performs no operation.
    cudaFree(d_in);
    cudaFree(d_out);
    return ok;
}
#endif

// Loads an image, converts it to grayscale with either the CUDA kernel or
// the CPU reference (selected by USE_CUDA), prints the conversion time and
// displays the source and result images.
// Returns 0 on success and 1 if the image cannot be loaded or a CUDA call fails.
int main(void)
{
    Mat srcImage = imread("/data_2/dog2.jpg");
    if (srcImage.empty())
    {
        // imread signals failure with an empty Mat; imshow would fail on it.
        std::cerr << "failed to load /data_2/dog2.jpg" << std::endl;
        return 1;
    }
    imshow("srcImage", srcImage);
    waitKey(0);

    // imread delivers pixels in B,G,R order, while both converters apply the
    // red weight (0.299) to channel 0, so hand them an R,G,B copy.
    Mat rgbImage;
    cvtColor(srcImage, rgbImage, COLOR_BGR2RGB);

    const uint imgheight = rgbImage.rows;
    const uint imgwidth = rgbImage.cols;
    Mat grayImage(imgheight, imgwidth, CV_8UC1, Scalar(0));

#if USE_CUDA
    if (!rgb2grayOnDevice(rgbImage.data, grayImage.data, imgheight, imgwidth))
        return 1;
#else
    const clock_t start = clock();
    rgb2grayincpu(rgbImage.data, grayImage.data, imgheight, imgwidth);
    const clock_t end = clock();
    printf("cpu exec time is %.8f\n", (double)(end - start) / CLOCKS_PER_SEC);
#endif

    imshow("grayImage", grayImage);
    waitKey(0);
    return 0;
}
# CMakeLists.txt - builds main.cu into the testcuda executable.
cmake_minimum_required(VERSION 2.8)
project(testcuda)
find_package(CUDA REQUIRED)
find_package(OpenCV REQUIRED)
# Take the headers from the same OpenCV installation that find_package
# located instead of a machine-specific absolute path, so headers and
# libraries always match. Point OpenCV_DIR at a custom installation
# (e.g. -DOpenCV_DIR=<prefix>/share/OpenCV) if it is not found automatically.
include_directories(${OpenCV_INCLUDE_DIRS})
cuda_add_executable(testcuda main.cu)
target_link_libraries(testcuda ${OpenCV_LIBS})
CUDA 运行(仅统计内核执行时间,不含主机与设备之间的数据拷贝):cuda exec time is 0.00005800
CPU 运行:cpu exec time is 0.00115700
例子2:
参考链接
https://zhuanlan.zhihu.com/p/34587739
#include <iostream>
#include <time.h>
#include "opencv2/highgui.hpp"
#include "opencv2/opencv.hpp"
using namespace cv;
using namespace std;
// Queries the properties of CUDA device 0 and prints the device name, the
// number of streaming multiprocessors (SMs), the shared memory per block,
// and the thread and warp limits per block and per SM.
// Returns 0 on success and 1 if the properties cannot be queried.
int main(void)
{
    const int dev = 0;
    cudaDeviceProp devProp;

    // Without this check a failed query would print uninitialized fields.
    const cudaError_t status = cudaGetDeviceProperties(&devProp, dev);
    if (status != cudaSuccess)
    {
        std::cerr << "cudaGetDeviceProperties failed: "
                  << cudaGetErrorString(status) << std::endl;
        return 1;
    }

    std::cout << "使用GPU device " << dev << ": " << devProp.name << std::endl;
    std::cout << "SM的数量:" << devProp.multiProcessorCount << std::endl;
    std::cout << "每个线程块的共享内存大小:" << devProp.sharedMemPerBlock / 1024.0 << " KB" << std::endl;
    std::cout << "每个线程块的最大线程数:" << devProp.maxThreadsPerBlock << std::endl;
    // The two labels below previously said "EM"; the unit is the SM.
    std::cout << "每个SM的最大线程数:" << devProp.maxThreadsPerMultiProcessor << std::endl;
    // Divide by the warp size the device reports rather than assuming 32.
    std::cout << "每个SM的最大线程束数:" << devProp.maxThreadsPerMultiProcessor / devProp.warpSize << std::endl;
    return 0;
}
输出如下:
使用GPU device 0: GeForce GTX 1080
SM的数量:20
每个线程块的共享内存大小:48 KB
每个线程块的最大线程数:1024
每个EM的最大线程数:2048
每个EM的最大线程束数:64