zoukankan      html  css  js  c++  java
  • 0_Simple__simpleTexture + 0_Simple__simpleTextureDrv

    使用纹理引用来旋转图片,并分别在静态编译和运行时编译(Driver API 加载 PTX)两种环境下实现。

    ▶ 源代码:静态编译

     1 #include <stdio.h>
     2 #include <windows.h>
     3 #include <cuda_runtime.h>
     4 #include "device_launch_parameters.h"
     5 #include <helper_functions.h>
     6 #include <helper_cuda.h>
     7 
     // Tolerance handed to compareData() when the result is checked against the reference image.
     #define MAX_EPSILON_ERROR 5e-3f

     // Rotation angle passed to the kernel as theta (consumed by sinf/cosf, i.e. radians).
     const float angle = 0.5f;

     // 2D float texture reference sampled by transformKernel; main() configures and binds it.
     texture<float, 2, cudaReadModeElementType> tex;

     /**
      * Rotate the image bound to `tex` about its centre and store the result in outputData.
      *
      * Launch layout: a 2D grid of 2D blocks, one thread per output pixel. Threads that fall
      * outside the width x height image return without touching memory, so the grid may be
      * rounded up past the image size.
      *
      * Texture coordinates are computed in the [0, 1) range, so `tex` is expected to be set up
      * with normalized coordinates (main() does this before binding).
      *
      * @param outputData  row-major output buffer of at least width * height floats
      * @param width       image width in pixels
      * @param height      image height in pixels
      * @param theta       rotation angle passed to sinf/cosf
      */
     __global__ void transformKernel(float *outputData, int width, int height, float theta)
     {
         const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
         const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

         // Guard the grid tail: without this, any launch that overshoots the image writes
         // past the end of outputData.
         if (x >= (unsigned int)width || y >= (unsigned int)height)
             return;

         // Shift to a coordinate system whose origin is the image centre.
         const float u = x / (float)width - 0.5f;
         const float v = y / (float)height - 0.5f;

         // Evaluate the trigonometric terms once instead of twice each.
         const float c = cosf(theta);
         const float s = sinf(theta);

         // Rotate, shift back to [0, 1) and sample the source image.
         outputData[y * width + x] = tex2D(tex, u * c - v * s + 0.5f, v * c + u * s + 0.5f);
     }
    21 
    /**
     * Runtime-API driver for the texture rotation demo.
     *
     * Loads an input image and a reference image, binds the input to the texture reference
     * `tex`, runs transformKernel twice (one warm-up launch, one timed launch), saves the
     * result and reports whether it matches the reference within MAX_EPSILON_ERROR.
     *
     * @return 0 when the run completes, 1 when the input files cannot be used. CUDA failures
     *         are handled by checkCudaErrors() from helper_cuda.h.
     */
    int main()
    {
        printf("\n\tStart.\n");

        // Load the input image and the reference image. The sdkFindFilePath() lookup used by
        // the original sample was removed, so the paths are hard-coded.
        float *h_data = NULL, *h_dataRef = NULL;
        unsigned int width = 0, height = 0;
        unsigned int refWidth = 0, refHeight = 0;
        if (!sdkLoadPGM("D:\\Code\\CUDA\\cudaProjectTemp\\data\\lena_bw.pgm", &h_data, &width, &height) ||
            !sdkLoadPGM("D:\\Code\\CUDA\\cudaProjectTemp\\data\\ref_rotated.pgm", &h_dataRef, &refWidth, &refHeight))
        {
            fprintf(stderr, "\n\tFailed to load input files.\n");
            free(h_data);
            free(h_dataRef);
            return 1;
        }
        // The comparison below walks both buffers with the same element count, so the two
        // images must have identical dimensions.
        if (refWidth != width || refHeight != height)
        {
            fprintf(stderr, "\n\tReference image is %u x %u but input is %u x %u.\n", refWidth, refHeight, width, height);
            free(h_data);
            free(h_dataRef);
            return 1;
        }
        const size_t rowBytes = (size_t)width * sizeof(float);
        const size_t size = rowBytes * height;
        printf("\n\tLoad input files, %u x %u pixels\n", width, height);

        // Allocate the linear output buffer and the CUDA array that backs the texture.
        float *d_data = NULL;
        checkCudaErrors(cudaMalloc((void **)&d_data, size));
        cudaArray *cuArray = NULL;
        cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
        checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height));
        // Copy the host image straight into the array (unlike simpleSurfaceWrite).
        // cudaMemcpy2DToArray is used because cudaMemcpyToArray is deprecated.
        checkCudaErrors(cudaMemcpy2DToArray(cuArray, 0, 0, h_data, rowBytes, rowBytes, height, cudaMemcpyHostToDevice));

        // Configure and bind the texture reference: wrap addressing on both axes, linear
        // filtering, normalized coordinates.
        tex.addressMode[0] = cudaAddressModeWrap;
        tex.addressMode[1] = cudaAddressModeWrap;
        tex.filterMode = cudaFilterModeLinear;
        tex.normalized = true;
        checkCudaErrors(cudaBindTextureToArray(tex, cuArray, channelDesc));

        // Launch configuration. The division truncates on purpose: it keeps every launched
        // thread inside the image. Pixels beyond the last full 8x8 tile are therefore not
        // written when a dimension is not a multiple of 8.
        const dim3 dimBlock(8, 8, 1);
        const dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);

        // Warm-up launch.
        transformKernel<<<dimGrid, dimBlock>>>(d_data, width, height, angle);
        checkCudaErrors(cudaGetLastError());      // launch-configuration errors
        checkCudaErrors(cudaDeviceSynchronize()); // errors raised while the kernel ran

        // Timed launch.
        StopWatchInterface *timer = NULL;
        sdkCreateTimer(&timer);
        sdkStartTimer(&timer);

        transformKernel<<<dimGrid, dimBlock>>>(d_data, width, height, angle);
        checkCudaErrors(cudaGetLastError());
        checkCudaErrors(cudaDeviceSynchronize());

        sdkStopTimer(&timer);
        const float elapsedMs = sdkGetTimerValue(&timer);
        printf("\n\tCost time: %f ms, %.2f Mpixels/sec\n", elapsedMs, (width * height / (elapsedMs / 1000.0f)) / 1e6);
        sdkDeleteTimer(&timer);

        // Copy the result back, save it and compare it with the reference image.
        checkCudaErrors(cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost));
        sdkSavePGM("D:\\Code\\CUDA\\cudaProjectTemp\\data\\output.pgm", h_data, width, height);
        printf("\n\tSave output file.\n");
        const bool passed = compareData(h_data, h_dataRef, width * height, MAX_EPSILON_ERROR, 0.0f);
        printf("\n\tFinish, return %s.\n", passed ? "Passed" : "Failed");

        // Release device and host resources; unbind before the backing array is freed.
        checkCudaErrors(cudaUnbindTexture(tex));
        checkCudaErrors(cudaFree(d_data));
        checkCudaErrors(cudaFreeArray(cuArray));
        free(h_data);
        free(h_dataRef);

        getchar(); // keep the console window open until a key is pressed
        return 0;
    }

    ▶ 输出结果

        Start.
    
        Load input files, 512 x 512 pixels
    
        Cost time: 0.362788 ms, 722.58 Mpixels/sec
    
        Save output file.
    
        Finish, return Passed.

    ▶ 源代码:运行时编译

     // simpleTexture_kernel.cu
     // Device code for the driver-API variant; the host program loads it from a PTX file.
     #ifndef _SIMPLETEXTURE_KERNEL_H_
     #define _SIMPLETEXTURE_KERNEL_H_

     // 2D float texture reference; the host looks it up in the loaded module by the name "tex".
     texture<float, 2, cudaReadModeElementType> tex;

     /**
      * Rotate the image bound to `tex` about its centre and store the result in g_odata.
      *
      * Declared extern "C" so the symbol keeps the unmangled name "transformKernel" that the
      * host passes to cuModuleGetFunction().
      *
      * Launch layout: a 2D grid of 2D blocks, one thread per output pixel. Threads outside
      * the width x height image return without touching memory, so the grid may be rounded
      * up past the image size.
      *
      * @param g_odata  row-major output buffer of at least width * height floats
      * @param width    image width in pixels
      * @param height   image height in pixels
      * @param theta    rotation angle passed to sinf/cosf
      */
     extern "C" __global__ void transformKernel(float *g_odata, int width, int height, float theta)
     {
         const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
         const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

         // Guard the grid tail: without this, any launch that overshoots the image writes
         // past the end of g_odata.
         if (x >= (unsigned int)width || y >= (unsigned int)height)
             return;

         // Shift to a coordinate system whose origin is the image centre.
         const float u = x / (float)width - 0.5f;
         const float v = y / (float)height - 0.5f;

         // Evaluate the trigonometric terms once instead of twice each.
         const float c = cosf(theta);
         const float s = sinf(theta);

         // Rotate, shift back to [0, 1) and sample the source image.
         g_odata[y * width + x] = tex2D(tex, u * c - v * s + 0.5f, v * c + u * s + 0.5f);
     }

     #endif
      1 // simpleTextureDrv.cpp
      2 #include <stdio.h>
      3 #include <iostream>
      4 #include <helper_functions.h>
      5 #include <cuda.h>
      6 
      7 #define MAX_EPSILON_ERROR 5e-3f
      8 #define PATH "D:\Program\CUDA9.0\Samples\0_Simple\simpleTextureDrv\data\"
      9 using namespace std;
     10 float angle = 0.5f;
     11 CUmodule cuModule;
     12 CUcontext cuContext;
     13 
     14 CUfunction initCUDA()
     15 {
     16     CUfunction cuFunction = 0;
     17     string module_path, ptx_source;
     18     cuInit(0);                      // 初始化设备,类似于 runtime 中的函数 cudaSetDevice()
     19     cuCtxCreate(&cuContext, 0, 0);  // 创建上下文,后两个参数分别是标志参数和设备号
     20 
     21     // 读取 .ptx 文件
     22     module_path = PATH"simpleTexture_kernel64.ptx";
     23     FILE *fp = fopen(module_path.c_str(), "rb");
     24     fseek(fp, 0, SEEK_END);
     25     int file_size = ftell(fp);
     26     char *buf = new char[file_size + 1];
     27     fseek(fp, 0, SEEK_SET);
     28     fread(buf, sizeof(char), file_size, fp);
     29     fclose(fp);
     30     buf[file_size] = '';
     31     ptx_source = buf;
     32     delete[] buf;
     33 
     34     if (module_path.rfind("ptx") != string::npos)// 使用的是.ptx,需要运行时编译
     35     {
     36         // 设定编译参数,CUjit_option 放置参数名,jitOptVals 放置参数值
     37         const unsigned int jitNumOptions = 3;
     38         CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
     39         void **jitOptVals = new void *[jitNumOptions];
     40 
     41         // 编译日志长度
     42         jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
     43         int jitLogBufferSize = 1024;
     44         jitOptVals[0] = (void *)(size_t)jitLogBufferSize;
     45 
     46         // 编译日志内容
     47         jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
     48         char *jitLogBuffer = new char[jitLogBufferSize];
     49         jitOptVals[1] = jitLogBuffer;
     50 
     51         // 设定一个内核使用的寄存器数量
     52         jitOptions[2] = CU_JIT_MAX_REGISTERS;
     53         int jitRegCount = 32;
     54         jitOptVals[2] = (void *)(size_t)jitRegCount;
     55 
     56         // 编译模块
     57         cuModuleLoadDataEx(&cuModule, ptx_source.c_str(), jitNumOptions, jitOptions, (void **)jitOptVals);
     58         //printf("
    	PTX JIT log:
    %s
    ", jitLogBuffer);// 输出编译日志
     59     }
     60     else// 使用的是 .cubin,不用编译(本例中不经过这个分支)
     61         cuModuleLoad(&cuModule, module_path.c_str());
     62     
     63     // 取出编译好的模块中的函数
     64     cuModuleGetFunction(&cuFunction, cuModule, "transformKernel");
     65     return cuFunction;// 删掉了错误检查,如果中间某一步出错,则应该先销毁上下文再退出
     66 }
     67 
     68 int main()
     69 {
     70     printf("
    	Start.
    "); 
     71 
     72     // 初始化设备,编译 PTX
     73     CUfunction transform = initCUDA();
     74 
     75     // 读取图片数据
     76     float *h_data = NULL, *h_dataRef = NULL;
     77     unsigned int width, height, size;
     78     sdkLoadPGM(PATH"lena_bw.pgm", &h_data, &width, &height);// 删掉了用函数 sdkFindFilePath() 查找输入文件的过程
     79     size = width * height * sizeof(float);
     80     sdkLoadPGM(PATH"ref_rotated.pgm", &h_dataRef, &width, &height);
     81     printf("
    	Load input files, %d x %d pixels
    ", width, height);
     82 
     83     // 申请设备内存
     84     CUdeviceptr d_data = (CUdeviceptr)NULL;
     85     cuMemAlloc(&d_data, size);
     86     CUarray cu_array;
     87     CUDA_ARRAY_DESCRIPTOR desc;
     88     desc.Format = CU_AD_FORMAT_FLOAT;
     89     desc.NumChannels = 1;
     90     desc.Width = width;
     91     desc.Height = height;
     92     cuArrayCreate(&cu_array, &desc);
     93     CUDA_MEMCPY2D copyParam;
     94     memset(&copyParam, 0, sizeof(copyParam));
     95     copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
     96     copyParam.dstArray = cu_array;
     97     copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
     98     copyParam.srcHost = h_data;
     99     copyParam.srcPitch = width * sizeof(float);
    100     copyParam.WidthInBytes = copyParam.srcPitch;
    101     copyParam.Height = height;
    102     cuMemcpy2D(&copyParam);
    103 
    104     // 绑定纹理引用
    105     CUtexref cu_texref;
    106     cuModuleGetTexRef(&cu_texref, cuModule, "tex");
    107     cuTexRefSetArray(cu_texref, cu_array, CU_TRSA_OVERRIDE_FORMAT);
    108     cuTexRefSetAddressMode(cu_texref, 0, CU_TR_ADDRESS_MODE_WRAP);
    109     cuTexRefSetAddressMode(cu_texref, 1, CU_TR_ADDRESS_MODE_WRAP);
    110     cuTexRefSetFilterMode(cu_texref, CU_TR_FILTER_MODE_LINEAR);
    111     cuTexRefSetFlags(cu_texref, CU_TRSF_NORMALIZED_COORDINATES);
    112     cuTexRefSetFormat(cu_texref, CU_AD_FORMAT_FLOAT, 1);
    113     cuParamSetTexRef(transform, CU_PARAM_TR_DEFAULT, cu_texref);
    114 
    115     int block_size = 8;
    116     StopWatchInterface *timer = NULL;
    117 
    118     // 两种调用 Driver API 的方式
    119     if (1)
    120     {
    121         void *args[5] = {&d_data, &width, &height, &angle};
    122         // 预跑
    123         cuLaunchKernel(transform, (width / block_size), (height / block_size), 1, block_size, block_size, 1, 0, NULL, args, NULL);
    124         cuCtxSynchronize();
    125         // 再跑一次测试性能
    126         sdkCreateTimer(&timer);
    127         sdkStartTimer(&timer);        
    128         cuLaunchKernel(transform, (width / block_size), (height / block_size), 1, block_size, block_size, 1, 0, NULL, args, NULL);
    129     }
    130     else
    131     {
    132         int offset = 0;
    133         char argBuffer[256];
    134         // 在一个 CUdeviceptr(unsigned long long)长度的空间里写入调用参数
    135         *((CUdeviceptr *)&argBuffer[offset]) = d_data;
    136         offset += sizeof(d_data);
    137         *((unsigned int *)&argBuffer[offset]) = width;
    138         offset += sizeof(width);
    139         *((unsigned int *)&argBuffer[offset]) = height;
    140         offset += sizeof(height);
    141         *((float *)&argBuffer[offset]) = angle;
    142         offset += sizeof(angle);
    143         void *kernel_launch_config[5] = {CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, CU_LAUNCH_PARAM_END };
    144         // 预跑
    145         cuLaunchKernel(transform, (width / block_size), (height / block_size), 1,block_size, block_size, 1,0,NULL, NULL, (void **)&kernel_launch_config);
    146         cuCtxSynchronize();
    147         // 再跑一次测试性能
    148         sdkCreateTimer(&timer);
    149         sdkStartTimer(&timer);        
    150         cuLaunchKernel(transform, (width / block_size), (height / block_size), 1,block_size, block_size, 1,0, 0,NULL, (void **)&kernel_launch_config);
    151     }
    152     cuCtxSynchronize();
    153     sdkStopTimer(&timer);
    154     printf("
    	Cost time: %f ms, %.2f Mpixels/sec
    ", sdkGetTimerValue(&timer), (width *height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);
    155     sdkDeleteTimer(&timer);
    156 
    157     // 结果回收、输出和检验
    158     cuMemcpyDtoH(h_data, d_data, size);
    159     sdkSavePGM("D:\Code\CUDA\cudaProjectTemp\data\output.pgm", h_data, width, height);
    160     printf("
    	Save output file.
    ");
    161     printf("
    	Finish, return %s.
    ", compareData(h_data, h_dataRef, width * height, MAX_EPSILON_ERROR, 0.15f) ? "Passed" : "Failed");
    162 
    163     cuMemFree(d_data);
    164     cuArrayDestroy(cu_array);
    165     cuCtxDestroy(cuContext);
    166     getchar();
    167     return 0;
    168 }

    ▶ 输出结果:

        Start.
    
        Load input files, 512 x 512 pixels
    
        Cost time: 0.355230 ms, 737.96 Mpixels/sec
    
        Save output file.
    
        Finish, return Passed.

    ▶ 涨姿势

    ● 一般,与 0_Simple__simpleSurfaceWrite 类似。

  • 相关阅读:
    Golang Struct 声明和使用
    docker 中ulimit设置理解
    微服务架构引入的问题及解决方案
    Jenkins 集成Sonar代码质量扫描
    Jenkins和gitlab集成自动构建
    初识微服务架构
    jenkins 集成钉钉机器人通知
    Go 新起点
    shell中的(),{}几种语法用法
    二进制日志配置和运维管理
  • 原文地址:https://www.cnblogs.com/cuancuancuanhao/p/7957569.html
Copyright © 2011-2022 走看看