使用纹理引用来旋转图片,并分别在静态编译和运行时编译两种环境下实现。
▶ 源代码:静态编译
#include <stdio.h>
#include <stdlib.h>
#include <windows.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_functions.h>
#include <helper_cuda.h>

#define MAX_EPSILON_ERROR 5e-3f

// Rotation angle handed to the kernel as `theta`; it is fed straight to
// sinf()/cosf(), i.e. it is interpreted in radians.
const float angle = 0.5f;

// 2D float texture reference; main() binds it to a cudaArray holding the input
// image and configures it for normalized coordinates, wrap addressing and
// linear filtering.
// NOTE(review): texture references are a legacy API (deprecated in later CUDA
// toolkits); kept here because the sample is specifically about them.
texture<float, 2, cudaReadModeElementType> tex;

// Rotates the image bound to `tex` by `theta` around its centre.
//
// Launch layout: 2D grid of 2D blocks, one thread per output pixel. The grid
// may overshoot the image; threads outside width x height exit immediately.
// No shared memory is used.
//
//   outputData  device buffer of at least width * height floats (row-major)
//   width       image width in pixels
//   height      image height in pixels
//   theta       rotation angle passed to sinf()/cosf()
__global__ void transformKernel(float *outputData, int width, int height, float theta)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard the grid tail: the launch rounds the grid up, so some threads may
    // fall outside the image and must not write.
    if (x >= (unsigned int)width || y >= (unsigned int)height)
        return;

    // Normalized coordinates with the origin moved to the image centre.
    float u = x / (float)width - 0.5f;
    float v = y / (float)height - 0.5f;

    // Evaluate the trigonometric terms once per thread.
    float c = cosf(theta);
    float s = sinf(theta);

    // Rotate, shift the origin back, and sample the texture.
    outputData[y * width + x] = tex2D(tex, u * c - v * s + 0.5f, v * c + u * s + 0.5f);
}

// Loads the input and reference PGM images, rotates the input on the GPU via
// the texture reference, times one kernel run, saves the result and compares
// it against the reference image.
// Returns 0 on normal completion (including a "Failed" comparison, as before)
// and EXIT_FAILURE when a file cannot be read/written or sizes do not match.
// CUDA errors are reported and abort the program through checkCudaErrors().
int main()
{
    printf(" Start. ");

    // Load image data (the sdkFindFilePath() lookup of the original sample was removed).
    // Backslashes must be escaped: "\r" in "...\ref_rotated.pgm" would otherwise
    // become a carriage return and the other "\X" sequences are invalid escapes.
    float *h_data = NULL, *h_dataRef = NULL;
    unsigned int width = 0, height = 0;
    unsigned int refWidth = 0, refHeight = 0;

    if (!sdkLoadPGM("D:\\Code\\CUDA\\cudaProjectTemp\\data\\lena_bw.pgm", &h_data, &width, &height))
    {
        fprintf(stderr, "Failed to load input image.\n");
        return EXIT_FAILURE;
    }
    if (!sdkLoadPGM("D:\\Code\\CUDA\\cudaProjectTemp\\data\\ref_rotated.pgm", &h_dataRef, &refWidth, &refHeight))
    {
        fprintf(stderr, "Failed to load reference image.\n");
        free(h_data);
        return EXIT_FAILURE;
    }
    // The comparison below walks both buffers with the same element count,
    // so the two images must have identical dimensions.
    if (refWidth != width || refHeight != height)
    {
        fprintf(stderr, "Reference image size %u x %u does not match input size %u x %u.\n",
                refWidth, refHeight, width, height);
        free(h_data);
        free(h_dataRef);
        return EXIT_FAILURE;
    }
    const size_t size = (size_t)width * height * sizeof(float);
    printf(" Load input files, %u x %u pixels ", width, height);

    // Allocate device memory: a linear output buffer and a cudaArray for the texture.
    float *d_data = NULL;
    checkCudaErrors(cudaMalloc((void **)&d_data, size));
    cudaArray *cuArray = NULL;
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height));
    // Unlike simpleSurfaceWrite, the host image is copied directly into cuArray.
    checkCudaErrors(cudaMemcpyToArray(cuArray, 0, 0, h_data, size, cudaMemcpyHostToDevice));

    // Configure and bind the texture reference.
    tex.addressMode[0] = cudaAddressModeWrap;
    tex.addressMode[1] = cudaAddressModeWrap;
    tex.filterMode = cudaFilterModeLinear;
    tex.normalized = true;
    checkCudaErrors(cudaBindTextureToArray(tex, cuArray, channelDesc));

    // Warm-up run. The grid is rounded up so that images whose size is not a
    // multiple of the block size are still fully covered.
    dim3 dimBlock(8, 8, 1);
    dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x, (height + dimBlock.y - 1) / dimBlock.y, 1);
    transformKernel<<<dimGrid, dimBlock, 0>>>(d_data, width, height, angle);
    getLastCudaError("transformKernel warm-up launch failed");
    checkCudaErrors(cudaDeviceSynchronize());

    // Timed run.
    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    transformKernel<<<dimGrid, dimBlock, 0>>>(d_data, width, height, angle);
    getLastCudaError("transformKernel launch failed");
    checkCudaErrors(cudaDeviceSynchronize());

    sdkStopTimer(&timer);
    const float elapsedMs = sdkGetTimerValue(&timer);
    printf(" Cost time: %f ms, %.2f Mpixels/sec ", elapsedMs, (width * height / (elapsedMs / 1000.0f)) / 1e6);
    sdkDeleteTimer(&timer);

    // Copy the result back, write it out and verify it against the reference.
    checkCudaErrors(cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost));
    int status = 0;
    if (sdkSavePGM("D:\\Code\\CUDA\\cudaProjectTemp\\data\\output.pgm", h_data, width, height))
    {
        printf(" Save output file. ");
    }
    else
    {
        fprintf(stderr, "Failed to save output image.\n");
        status = EXIT_FAILURE;
    }
    printf(" Finish, return %s. ", compareData(h_data, h_dataRef, width * height, MAX_EPSILON_ERROR, 0.0f) ? "Passed" : "Failed");

    // Release device and host resources.
    checkCudaErrors(cudaUnbindTexture(tex));
    checkCudaErrors(cudaFree(d_data));
    checkCudaErrors(cudaFreeArray(cuArray));
    // NOTE(review): assumes sdkLoadPGM allocates with malloc(), as the stock
    // CUDA samples helper does — confirm against the helper_image.h in use.
    free(h_data);
    free(h_dataRef);

    getchar();
    return status;
}
▶ 输出结果
Start. Load input files, 512 x 512 pixels Cost time: 0.362788 ms, 722.58 Mpixels/sec Save output file. Finish, return Passed.
▶ 源代码:运行时编译
// simpleTexture_kernel.cu
#ifndef _SIMPLETEXTURE_KERNEL_H_
#define _SIMPLETEXTURE_KERNEL_H_

// 2D float texture reference sampled by transformKernel. The host side is
// responsible for binding it to the input image before launching.
// NOTE(review): the kernel passes coordinates of the form x / width, so the
// texture is presumably configured for normalized coordinates — confirm
// against the host/driver code.
texture<float, 2, cudaReadModeElementType> tex;

// Rotates the image bound to `tex` by `theta` around its centre.
//
// Declared extern "C" so the symbol name is not mangled and can be looked up
// by name from the compiled module.
//
// Launch layout: 2D grid of 2D blocks, one thread per output pixel. Threads
// that fall outside width x height return without writing, so the grid may
// safely be rounded up past the image size. No shared memory is used.
//
//   g_odata  device buffer of at least width * height floats (row-major)
//   width    image width in pixels
//   height   image height in pixels
//   theta    rotation angle passed to sinf()/cosf()
extern "C" __global__ void transformKernel(float *g_odata, int width, int height, float theta)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard the grid tail so an oversized launch cannot write out of bounds.
    if (x >= (unsigned int)width || y >= (unsigned int)height)
        return;

    // Coordinates scaled to [0, 1) with the origin moved to the image centre.
    float u = x / (float)width - 0.5f;
    float v = y / (float)height - 0.5f;

    // Evaluate the trigonometric terms once per thread.
    float c = cosf(theta);
    float s = sinf(theta);

    // Rotate, shift the origin back, and sample the texture.
    g_odata[y * width + x] = tex2D(tex, u * c - v * s + 0.5f, v * c + u * s + 0.5f);
}

#endif
1 // simpleTextureDrv.cpp 2 #include <stdio.h> 3 #include <iostream> 4 #include <helper_functions.h> 5 #include <cuda.h> 6 7 #define MAX_EPSILON_ERROR 5e-3f 8 #define PATH "D:\Program\CUDA9.0\Samples\0_Simple\simpleTextureDrv\data\" 9 using namespace std; 10 float angle = 0.5f; 11 CUmodule cuModule; 12 CUcontext cuContext; 13 14 CUfunction initCUDA() 15 { 16 CUfunction cuFunction = 0; 17 string module_path, ptx_source; 18 cuInit(0); // 初始化设备,类似于 runtime 中的函数 cudaSetDevice() 19 cuCtxCreate(&cuContext, 0, 0); // 创建上下文,后两个参数分别是标志参数和设备号 20 21 // 读取 .ptx 文件 22 module_path = PATH"simpleTexture_kernel64.ptx"; 23 FILE *fp = fopen(module_path.c_str(), "rb"); 24 fseek(fp, 0, SEEK_END); 25 int file_size = ftell(fp); 26 char *buf = new char[file_size + 1]; 27 fseek(fp, 0, SEEK_SET); 28 fread(buf, sizeof(char), file_size, fp); 29 fclose(fp); 30 buf[file_size] = '