zoukankan      html  css  js  c++  java
  • 0_Simple__simpleTemplates + 0_Simple__simpleTemplates_nvrtc

    使用 C++ 的模板

    ▶ 源代码:静态使用

      1 // sharedmem.cuh
      2 #ifndef _SHAREDMEM_H_
      3 #define _SHAREDMEM_H_
      4 
      5 // SharedMemory 的封装
      // Generic SharedMemory<T>: wrapper that hands out a T* to the block's
      // dynamically sized shared memory. This primary template is the fallback for
      // types without a specialization: it calls error(), which is only declared
      // here (no definition is visible in this file), so presumably using an
      // unsupported type fails at build time -- TODO confirm.
      template <typename T> struct SharedMemory
      {
          // Never meant to succeed; see the note on the struct.
          __device__ T *getPointer()
          {
              extern __device__ void error(void);
              error();
              // Typed null constant instead of NULL: this header includes nothing
              // that defines the NULL macro, so it must not depend on it.
              return static_cast<T *>(0);
          }
      };
     15 
     16 // SharedMemory 的各种数据类型的实现 
     // Per-type specializations of SharedMemory. Each getPointer() declares an
     // unsized `extern __shared__` array of its element type and returns it, so the
     // caller receives a correctly typed pointer to the dynamic shared memory whose
     // size is chosen at kernel launch.

     // ---- integer types ----
     template <> struct SharedMemory<int> {
         __device__ int *getPointer() { extern __shared__ int smem_int[]; return smem_int; }
     };

     template <> struct SharedMemory<unsigned int> {
         __device__ unsigned int *getPointer() { extern __shared__ unsigned int smem_uint[]; return smem_uint; }
     };

     template <> struct SharedMemory<char> {
         __device__ char *getPointer() { extern __shared__ char smem_char[]; return smem_char; }
     };

     template <> struct SharedMemory<unsigned char> {
         __device__ unsigned char *getPointer() { extern __shared__ unsigned char smem_uchar[]; return smem_uchar; }
     };

     template <> struct SharedMemory<short> {
         __device__ short *getPointer() { extern __shared__ short smem_short[]; return smem_short; }
     };

     template <> struct SharedMemory<unsigned short> {
         __device__ unsigned short *getPointer() { extern __shared__ unsigned short smem_ushort[]; return smem_ushort; }
     };

     template <> struct SharedMemory<long> {
         __device__ long *getPointer() { extern __shared__ long smem_long[]; return smem_long; }
     };

     template <> struct SharedMemory<unsigned long> {
         __device__ unsigned long *getPointer() { extern __shared__ unsigned long smem_ulong[]; return smem_ulong; }
     };

     // ---- boolean ----
     template <> struct SharedMemory<bool> {
         __device__ bool *getPointer() { extern __shared__ bool smem_bool[]; return smem_bool; }
     };

     // ---- floating point ----
     template <> struct SharedMemory<float> {
         __device__ float *getPointer() { extern __shared__ float smem_float[]; return smem_float; }
     };

     template <> struct SharedMemory<double> {
         __device__ double *getPointer() { extern __shared__ double smem_double[]; return smem_double; }
     };
    115 
    116 #endif
      1 // simpleTemplates.cu
      2 #include <stdio.h>
      3 #include <timer.h>
      4 #include <cuda_runtime.h>
      5 #include "device_launch_parameters.h"
      6 #include <helper_functions.h>
      7 #include <helper_cuda.h>
      8 #include "sharedmem.cuh"
      9 
     // Multiplies every input element by the block size.
     //
     // Indexing uses threadIdx.x only, so thread `tid` handles element `tid`.
     // The launch must provide at least blockDim.x * sizeof(T) bytes of dynamic
     // shared memory, because each thread stages its element in sdata[tid].
     //
     //   g_idata  input array, read at index threadIdx.x
     //   g_odata  output array, written at index threadIdx.x
     template<class T> __global__ void testKernel(T *g_idata, T *g_odata)
     {
         // Same effect as `extern __shared__ T sdata[];`, routed through the
         // SharedMemory<T> wrapper so the array gets the right element type.
         T *const sdata = SharedMemory<T>().getPointer();
         const unsigned int tid = threadIdx.x;
         const T factor = (T) blockDim.x;

         sdata[tid] = g_idata[tid];          // stage: global -> shared
         __syncthreads();
         sdata[tid] = factor * sdata[tid];   // scale in shared memory
         __syncthreads();
         g_odata[tid] = sdata[tid];          // write back: shared -> global
     }
     23 
     // Host reference ("gold") result: reference[i] = idata[i] * len for every i.
     // `reference` and `idata` may point to the same array; each element is read
     // before the same index is written.
     //
     //   reference  output array of `len` elements
     //   idata      input array of `len` elements
     //   len        element count, also the multiplier (converted to T)
     template<class T> void computeGold(T *reference, T *idata, const unsigned int len)
     {
         const T factor = static_cast<T>(len);   // unsigned int -> T, done once
         T *out = reference;
         for (const T *in = idata, *const last = idata + len; in != last; ++in, ++out)
             *out = *in * factor;
     }
     30 
     31 // ArrayComparator 的封装
     // ArrayComparator<T>: compares a device result against a host reference.
     // The primary template is the fallback for element types without a
     // specialization: it reports the problem on stderr and returns false.
     template<class T> class ArrayComparator
     {
         public:
             bool compare(const T *reference, T *data, unsigned int len)
             {
                 // The message must end in an escaped "\n"; a raw line break inside
                 // the literal does not compile.
                 fprintf(stderr, "Error: no comparison function implemented for this type\n");
                 return false;
             }
     };

     // int and float specializations. Both forward to compareData(), which the
     // original note says is defined in helper_image.h. The two trailing arguments
     // are presumably the per-element epsilon and the allowed mismatch threshold --
     // TODO confirm against helper_image.h.
     template<> class ArrayComparator<int>
     {
         public:
             bool compare(const int *reference, int *data, unsigned int len)
             {
                 return compareData(reference, data, len, 0.15f, 0.0f);
             }
     };

     template<> class ArrayComparator<float>
     {
         public:
             bool compare(const float *reference, float *data, unsigned int len)
             {
                 return compareData(reference, data, len, 0.15f, 0.15f);
             }
     };
     53 
     54 // ArrayFileWriter 的封装
     // ArrayFileWriter<T>: writes a result array to a file.
     // The primary template is the fallback for element types without a
     // specialization: it reports the problem on stderr and returns false.
     template<class T> class ArrayFileWriter
     {
         public:
             bool write(const char *filename, T *data, unsigned int len, float epsilon)
             {
                 // The message must end in an escaped "\n"; a raw line break inside
                 // the literal does not compile.
                 fprintf(stderr, "Error: no file write function implemented for this type\n");
                 return false;
             }
     };

     // int and float specializations. Both forward to sdkWriteFile(), which the
     // original note says is defined in helper_image.h; the final `false` argument
     // is passed through unchanged from the original listing.
     template<> class ArrayFileWriter<int>
     {
         public:
             bool write(const char *filename, int *data, unsigned int len, float epsilon)
             {
                 return sdkWriteFile(filename, data, len, epsilon, false);
             }
     };

     template<> class ArrayFileWriter<float>
     {
         public:
             bool write(const char *filename, float *data, unsigned int len, float epsilon)
             {
                 return sdkWriteFile(filename, data, len, epsilon, false);
             }
     };
     76 
     /**
      * Runs testKernel<T> on `len` elements with one block of `len` threads and
      * checks the device output against the host reference from computeGold().
      *
      * @param len  number of elements; it is also the block size, so it must be
      *             positive and within the device's threads-per-block limit
      * @return     the result of ArrayComparator<T>::compare(); false when `len`
      *             is not positive or a host allocation fails
      *
      * CUDA runtime calls go through checkCudaErrors()/getLastCudaError() from
      * helper_cuda.h, so a failing call or launch is reported instead of being
      * silently ignored.
      */
     template<class T> bool test(int len)
     {
         if (len <= 0)
         {
             fprintf(stderr, "Error: test() needs a positive element count, got %d\n", len);
             return false;
         }
         const unsigned int count = static_cast<unsigned int>(len);
         const unsigned int mem_size = sizeof(T) * count;
         dim3  grid(1, 1, 1);
         dim3  threads(count, 1, 1);
         ArrayComparator<T> comparator;
         ArrayFileWriter<T> writer;
         (void)writer;                       // only needed by the optional dump below
         checkCudaErrors(cudaSetDevice(0));
         StartTimer();

         // Allocate host and device buffers; the input is 0, 1, ..., len - 1.
         T *h_idata = (T *)malloc(mem_size);
         T *h_odata = (T *)malloc(mem_size);
         if (h_idata == NULL || h_odata == NULL)
         {
             fprintf(stderr, "Error: host allocation of %u bytes failed\n", mem_size);
             free(h_idata);
             free(h_odata);
             return false;
         }
         T *d_idata = NULL, *d_odata = NULL;
         checkCudaErrors(cudaMalloc((void **)&d_idata, mem_size));
         checkCudaErrors(cudaMalloc((void **)&d_odata, mem_size));
         for (unsigned int i = 0; i < count; ++i)
             h_idata[i] = (T) i;
         checkCudaErrors(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));

         // Launch with mem_size bytes of dynamic shared memory (one T per thread).
         testKernel<T><<<grid, threads, mem_size>>>(d_idata, d_odata);
         getLastCudaError("testKernel launch failed");
         // The blocking copy also waits for the kernel to finish.
         checkCudaErrors(cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost));
         printf("\n\tProcessing time: %f ms\n", GetTimer());

         // Verify: turn h_idata into the expected values in place, then compare.
         computeGold<T>(h_idata, h_idata, count);
         bool result = comparator.compare(h_idata, h_odata, count);
         //writer.write("./data/regression.dat", h_odata, count, 0.0f);// optional: dump the result to a file

         free(h_idata);
         free(h_odata);
         checkCudaErrors(cudaFree(d_idata));
         checkCudaErrors(cudaFree(d_odata));
         return result;
     }
    113 
    // Runs the float test with 32 and 64 elements, prints one Passed/Failed line
    // per run, then waits for a key press. Returns 0 only if both runs passed.
    int main()
    {
        printf("\n\tStart.\n");

        // Each test runs before its result line is printed, so the
        // "Processing time" line from test() appears first.
        const bool passed32 = test<float>(32);
        printf("\n\t> test<float, 32>, result: %s.\n", passed32 ? "Passed" : "Failed");
        const bool passed64 = test<float>(64);
        printf("\n\t> test<float, 64>, result: %s.\n", passed64 ? "Passed" : "Failed");

        getchar();  // keep the console window open until a key is pressed
        return (passed32 && passed64) ? 0 : 1;
    }

    ▶ 输出结果:

        Start.
    
        Processing time: 107.394216 ms
    
        > test<float, 32>, result: Passed.
    
        Processing time: 3.153182 ms
    
        > test<float, 64>, result: Passed.

    ▶ 源代码:使用运行时编译

    1 // sharedmem.cuh,与静态完全相同
     1 // simpleTemplates_kernel.cu
     2 #include "sharedmem.cuh"
     3 
     // Multiplies every input element by the block size.
     //
     // This is a __device__ function, not a __global__ kernel: the extern "C"
     // entry points in this file call it like an ordinary function, and a
     // __global__ function cannot be called without an execution configuration.
     //
     // Indexing uses threadIdx.x only, so thread `tid` handles element `tid`.
     // The launch of the calling kernel must provide at least
     // blockDim.x * sizeof(T) bytes of dynamic shared memory.
     //
     //   g_idata  input array, read at index threadIdx.x
     //   g_odata  output array, written at index threadIdx.x
     template<class T> __device__ void testKernel(T *g_idata, T *g_odata)
     {
         // Same effect as `extern __shared__ T sdata[];`, routed through the
         // SharedMemory<T> wrapper so the array gets the right element type.
         SharedMemory<T> smem;
         T *sdata = smem.getPointer();
         const unsigned int tid = threadIdx.x;

         sdata[tid] = g_idata[tid];               // stage: global -> shared
         __syncthreads();
         sdata[tid] = (T)blockDim.x * sdata[tid]; // scale in shared memory
         __syncthreads();
         g_odata[tid] = sdata[tid];               // write back: shared -> global
     }
    17 
    // float entry point. extern "C" keeps the symbol name unmangled so the host
    // can look it up as "testFloat"; the work is forwarded to testKernel<float>.
    extern "C" __global__ void testFloat(float *p1, float *p2)
    {
        testKernel<float>(p1, p2);
    }
    19 
    // int entry point. extern "C" keeps the symbol name unmangled so the host
    // can look it up as "testInt"; the work is forwarded to testKernel<int>.
    extern "C" __global__ void testInt(int *p1, int *p2)
    {
        testKernel<int>(p1, p2);
    }
      1 // simpleTemplates.cpp
      2 #include <stdio.h>
      3 #include <cuda_runtime.h>
      4 #include "device_launch_parameters.h"
      5 #include <helper_functions.h>
      6 #include <nvrtc_helper.h>
      7 #include <timer.h>
      8 
      // Host reference ("gold") result: reference[i] = idata[i] * len for every i.
      // `reference` and `idata` may point to the same array; each element is read
      // before the same index is written.
      //
      //   reference  output array of `len` elements
      //   idata      input array of `len` elements
      //   len        element count, also the multiplier (converted to T)
      template<class T> void computeGold(T *reference, T *idata, const unsigned int len)
      {
          const T factor = static_cast<T>(len);   // unsigned int -> T, done once
          unsigned int remaining = len;
          T *out = reference;
          const T *in = idata;
          while (remaining-- > 0)
              *out++ = *in++ * factor;
      }
     15 
     16 // ArrayComparator 的封装
     17 template<class T> class ArrayComparator
     18 {
     19 public:
     20     bool compare(const T *reference, T *data, unsigned int len)
     21     {
     22         fprintf(stderr, "Error: no comparison function implemented for this type
    ");
     23         return false;
     24     }
     25 };
     26 // int 和 flaot 的实现,其中的函数 compareData() 定义于 helper_image.h
     27 template<> class ArrayComparator<int>
     28 {
     29 public:
     30     bool compare(const int *reference, int *data, unsigned int len) { return compareData(reference, data, len, 0.15f, 0.0f); }
     31 };
     32 
     33 template<> class ArrayComparator<float>
     34 {
     35 public:
     36     bool compare(const float *reference, float *data, unsigned int len) { return compareData(reference, data, len, 0.15f, 0.15f); }
     37 };
     38 
     39 // ArrayFileWriter 的封装
     40 template<class T> class ArrayFileWriter
     41 {
     42 public:
     43     bool write(const char *filename, T *data, unsigned int len, float epsilon)
     44     {
     45         fprintf(stderr, "Error: no file write function implemented for this type
    ");
     46         return false;
     47     }
     48 };
     49 // int 和 flaot 的实现,其中的函数 sdkWriteFile() 定义于 helper_image.h
     50 template<> class ArrayFileWriter<int>
     51 {
     52 public:
     53     bool write(const char *filename, int *data, unsigned int len, float epsilon) { return sdkWriteFile(filename, data, len, epsilon, false); }
     54 };
     55 
     56 template<> class ArrayFileWriter<float>
     57 {
     58 public:
     59     bool write(const char *filename, float *data, unsigned int len, float epsilon) { return sdkWriteFile(filename, data, len, epsilon, false); }
     60 };
     61 
     62 // getKernel 的模板
     63 template <typename T> CUfunction getKernel(CUmodule in);
     64 
     65 template<> CUfunction getKernel<int>(CUmodule in)
     66 {
     67     CUfunction kernel_addr;
     68     cuModuleGetFunction(&kernel_addr, in, "testInt");
     69     return kernel_addr;
     70 }
     71 
     72 template<> CUfunction getKernel<float>(CUmodule in)
     73 {
     74     CUfunction kernel_addr;
     75     cuModuleGetFunction(&kernel_addr, in, "testFloat");
     76     return kernel_addr;
     77 }
     78                                      
     79 template<class T> bool test(int len)
     80 {
     81     // 与静态不同,编译 PTX
     82     char *kernel_file = "D:\Program\CUDA9.0\Samples\0_Simple\simpleTemplates_nvrtc\simpleTemplates_kernel.cu";
     83     char *ptx;
     84     size_t ptxSize;
     85     compileFileToPTX(kernel_file, 1, NULL, &ptx, &ptxSize, 0);  // 1, NULL 分别为 argc 和 argv
     86     CUmodule module = loadPTX(ptx, 1, NULL);                    // 1, NULL 分别为 argc 和 argv,有关于 GPU的输出
     87 
     88     unsigned int mem_size = sizeof(T) * len;
     89     dim3  grid(1, 1, 1);
     90     dim3  threads(len, 1, 1);
     91     ArrayComparator<T> comparator;
     92     ArrayFileWriter<T> writer;
     93     StartTimer();
     94 
     95     // 申请内存
     96     T *h_idata, *h_odata;
     97     CUdeviceptr d_idata, d_odata;                   // 与静态不同
     98     h_idata = (T *)malloc(mem_size);
     99     h_odata = (T *)malloc(mem_size);
    100     cuMemAlloc(&d_idata, mem_size);                 // 与静态不同
    101     cuMemAlloc(&d_odata, mem_size);
    102     for (unsigned int i = 0; i < len; ++i)
    103         h_idata[i] = (T)i;
    104     cuMemcpyHtoD(d_idata, h_idata, mem_size);       // 与静态不同
    105 
    106     // 计算和计时
    107     CUfunction kernel_addr = getKernel<T>(module);
    108 
    109     void *arr[] = { (void *)&d_idata, (void *)&d_odata };
    110     cuLaunchKernel(kernel_addr, grid.x, grid.y, grid.z, threads.x, threads.y, threads.z, mem_size, 0, &arr[0], 0);
    111     cuCtxSynchronize();                             // 上下文同步
    112     cuMemcpyDtoH(h_odata, d_odata, sizeof(T) * len);// 与静态不同
    113     printf("
    	Processing time: %f ms
    ", GetTimer());
    114 
    115     // 检查结果
    116     computeGold<T>(h_idata, h_idata, len);// 生成理论结果数据
    117     bool result = comparator.compare(h_idata, h_odata, len);
    118     //writer.write("./data/regression.dat", h_odata, len, 0.0f);// 写入文件的部分
    119 
    120     free(h_idata);
    121     free(h_odata);
    122     cuMemFree(d_idata);                             // 与静态不同
    123     cuMemFree(d_odata);
    124     return result;
    125 }
    126 
    127 int main()
    128 {
    129     printf("
    	Start.
    ");
    130     printf("
    	> test<float, 32>, result: %s.
    ", test<float>(32) ? "Passed" : "Failed");
    131     printf("
    	> test<int, 64>, result: %s.
    ", test<int>(64) ? "Passed" : "Failed");
    132 
    133     getchar();
    134     return 0;
    135 }

    ▶ 输出结果:

        Start.
    > Using CUDA Device [0]: GeForce GTX 1070
    > GPU Device has SM 6.1 compute capability
    
        Processing time: 0.699976 ms
    
        > test<float, 32>, result: Passed.
    > Using CUDA Device [0]: GeForce GTX 1070
    > GPU Device has SM 6.1 compute capability
    
        Processing time: 0.665355 ms
    
        > test<int, 64>, result: Passed.

    ▶ 涨姿势

    ● 封装了 SharedMemory,ArrayComparator,ArrayFileWriter 三个模板,并定义了其在不同的数据类型下的实现。

  • 相关阅读:
    排序总结[3]_线性排序算法
    Spring九问
    DP-最大递增子序列与最大递增子数组; 最大公共子序列与最大公共子数组。
    java 8 新特性
    数据库事务隔离等级
    算法思维方式之二——DP与DFS
    算法思维方式—— 由排列组合想到的
    java Servlet简介
    java hashCode, 引用以及equals().
    java反射简介
  • 原文地址:https://www.cnblogs.com/cuancuancuanhao/p/7955908.html
Copyright © 2011-2022 走看看