  • 0_Simple__cppOverload

    ▶ Use the CUDA built-in struct cudaFuncAttributes to inspect a kernel's shared memory and register usage

    ▶ Source code

// cppOverload_kernel.cu
__global__ void simple_kernel(const int *pIn, int *pOut, int a)
{
    __shared__ int sData[THREAD_N];
    int tid = threadIdx.x + blockDim.x * blockIdx.x;

    sData[threadIdx.x] = pIn[tid];
    __syncthreads();
    pOut[tid] = sData[threadIdx.x] * a + tid;
}

__global__ void simple_kernel(const int2 *pIn, int *pOut, int a)
{
    __shared__ int2 sData[THREAD_N];
    int tid = threadIdx.x + blockDim.x * blockIdx.x;

    sData[threadIdx.x] = pIn[tid];
    __syncthreads();
    pOut[tid] = (sData[threadIdx.x].x + sData[threadIdx.x].y) * a + tid;
}

__global__ void simple_kernel(const int *pIn1, const int *pIn2, int *pOut, int a)
{
    __shared__ int sData1[THREAD_N], sData2[THREAD_N];
    int tid = threadIdx.x + blockDim.x * blockIdx.x;

    sData1[threadIdx.x] = pIn1[tid];
    sData2[threadIdx.x] = pIn2[tid];
    __syncthreads();
    pOut[tid] = (sData1[threadIdx.x] + sData2[threadIdx.x]) * a + tid;
}
// cppOverload.cu
#include <stdio.h>
#include <helper_cuda.h>
#include <helper_math.h>
#include <helper_string.h>

#define THREAD_N            256
#include "cppOverload_kernel.cu"                                            // the kernel file uses THREAD_N, so it must be defined first

#define N                   1024
#define DIV_UP(a, b)        (((a) + (b) - 1) / (b))
#define OUTPUT_ATTR(attr)                                                   \
    printf("Shared Size:           %d\n", (int)attr.sharedSizeBytes);       \
    printf("Constant Size:         %d\n", (int)attr.constSizeBytes);        \
    printf("Local Size:            %d\n", (int)attr.localSizeBytes);        \
    printf("Max Threads Per Block: %d\n", attr.maxThreadsPerBlock);         \
    printf("Number of Registers:   %d\n", attr.numRegs);                    \
    printf("PTX Version:           %d\n", attr.ptxVersion);                 \
    printf("Binary Version:        %d\n", attr.binaryVersion);

bool check_func1(int *hInput, int *hOutput, int a)
{
    for (int i = 0; i < N; ++i)
    {
        int cpuRes = hInput[i] * a + i;
        if (hOutput[i] != cpuRes)
            return false;
    }
    return true;
}

bool check_func2(int2 *hInput, int *hOutput, int a)
{
    for (int i = 0; i < N; i++)
    {
        int cpuRes = (hInput[i].x + hInput[i].y) * a + i;
        if (hOutput[i] != cpuRes)
            return false;
    }
    return true;
}

bool check_func3(int *hInput1, int *hInput2, int *hOutput, int a)
{
    for (int i = 0; i < N; i++)
    {
        if (hOutput[i] != (hInput1[i] + hInput2[i]) * a + i)
            return false;
    }
    return true;
}

int main(int argc, const char *argv[])
{
    cudaSetDevice(0);

    int *hInput = NULL, *hOutput = NULL, *dInput = NULL, *dOutput = NULL;
    cudaMalloc(&dInput, sizeof(int) * N * 2);
    cudaMalloc(&dOutput, sizeof(int) * N);
    cudaMallocHost(&hInput, sizeof(int) * N * 2);
    cudaMallocHost(&hOutput, sizeof(int) * N);

    for (int i = 0; i < N * 2; i++)
        hInput[i] = i;
    cudaMemcpy(dInput, hInput, sizeof(int) * N * 2, cudaMemcpyHostToDevice);

    const int a = 2;
    void (*func1)(const int *, int *, int) = simple_kernel;
    void (*func2)(const int2 *, int *, int) = simple_kernel;
    void (*func3)(const int *, const int *, int *, int) = simple_kernel;
    struct cudaFuncAttributes attr;

    // function 1
    memset(&attr, 0, sizeof(attr));
    cudaFuncSetCacheConfig(*func1, cudaFuncCachePreferShared);              // inspect resource usage before launching
    cudaFuncGetAttributes(&attr, *func1);
    OUTPUT_ATTR(attr);
    (*func1)<<<DIV_UP(N, THREAD_N), THREAD_N>>>(dInput, dOutput, a);
    cudaDeviceSynchronize();
    cudaMemcpy(hOutput, dOutput, sizeof(int) * N, cudaMemcpyDeviceToHost);
    printf("simple_kernel(const int *pIn, int *pOut, int a) %s\n\n", check_func1(hInput, hOutput, a) ? "PASSED" : "FAILED");

    // function 2
    memset(&attr, 0, sizeof(attr));
    cudaFuncSetCacheConfig(*func2, cudaFuncCachePreferShared);
    cudaFuncGetAttributes(&attr, *func2);
    OUTPUT_ATTR(attr);
    (*func2)<<<DIV_UP(N, THREAD_N), THREAD_N>>>((int2 *)dInput, dOutput, a);    // force-cast to int2 *; the buffer is aligned for it anyway
    cudaMemcpy(hOutput, dOutput, sizeof(int) * N, cudaMemcpyDeviceToHost);
    printf("simple_kernel(const int2 *pIn, int *pOut, int a) %s\n\n", check_func2(reinterpret_cast<int2 *>(hInput), hOutput, a) ? "PASSED" : "FAILED");

    // function 3
    memset(&attr, 0, sizeof(attr));
    cudaFuncSetCacheConfig(*func3, cudaFuncCachePreferShared);
    cudaFuncGetAttributes(&attr, *func3);
    OUTPUT_ATTR(attr);
    (*func3)<<<DIV_UP(N, THREAD_N), THREAD_N>>>(dInput, dInput + N, dOutput, a);
    cudaMemcpy(hOutput, dOutput, sizeof(int) * N, cudaMemcpyDeviceToHost);
    printf("simple_kernel(const int *pIn1, const int *pIn2, int *pOut, int a) %s\n\n", check_func3(&hInput[0], &hInput[N], hOutput, a) ? "PASSED" : "FAILED");

    cudaFree(dInput);
    cudaFree(dOutput);
    cudaFreeHost(hOutput);
    cudaFreeHost(hInput);
    getchar();
    return 0;
}

    ● Output:

    Shared Size:           1024
    Constant Size:         0
    Local Size:            0
    Max Threads Per Block: 1024
    Number of Registers:   12
    PTX Version:           60
    Binary Version:        60
    simple_kernel(const int *pIn, int *pOut, int a) PASSED
    
    Shared Size:           2048
    Constant Size:         0
    Local Size:            0
    Max Threads Per Block: 1024
    Number of Registers:   13
    PTX Version:           60
    Binary Version:        60
    simple_kernel(const int2 *pIn, int *pOut, int a) PASSED
    
    Shared Size:           2048
    Constant Size:         0
    Local Size:            0
    Max Threads Per Block: 1024
    Number of Registers:   14
    PTX Version:           60
    Binary Version:        60
    simple_kernel(const int *pIn1, const int *pIn2, int *pOut, int a) PASSED
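
    ● The reported shared memory sizes match the kernels' declarations, assuming a 4-byte int as on the platforms these samples target; a quick compile-time check:

#include <cuda_runtime.h>   // for int2
// Why the shared sizes come out as 1024 / 2048 / 2048 bytes (THREAD_N = 256):
static_assert(256 * sizeof(int)      == 1024, "kernel 1: int  sData[THREAD_N]");
static_assert(256 * sizeof(int2)     == 2048, "kernel 2: int2 sData[THREAD_N]");
static_assert(2 * 256 * sizeof(int)  == 2048, "kernel 3: sData1 + sData2");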

    ▶ Takeaways:

    ● CUDA header files use the .cuh extension

    ● Definition of the CUDA built-in struct cudaFuncAttributes:

struct __device_builtin__ cudaFuncAttributes
{
    size_t sharedSizeBytes; // shared memory size
    size_t constSizeBytes;  // constant memory size
    size_t localSizeBytes;  // local memory size
    int maxThreadsPerBlock; // maximum number of threads per block
    int numRegs;            // number of registers
    int ptxVersion;         // PTX version
    int binaryVersion;      // binary (machine code) version
    int cacheModeCA;        // whether compiled with -Xptxas --dlcm=ca
};

    ● Use CUDA's built-in struct and functions to inspect the shared memory and register usage of a kernel (an error-checked variant follows the snippet below):

struct cudaFuncAttributes attr;
memset(&attr, 0, sizeof(attr));
cudaFuncSetCacheConfig(*function, cudaFuncCachePreferShared);
cudaFuncGetAttributes(&attr, *function);
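
    The host-side function pointer is what selects among the overloads of simple_kernel: cudaFuncSetCacheConfig and cudaFuncGetAttributes take a const void *, so assigning the kernel to a pointer with an explicit signature resolves the overload before the address is passed in. A minimal error-checked sketch (checkCudaErrors comes from helper_cuda.h, which the sample already includes):

void (*func1)(const int *, int *, int) = simple_kernel;     // picks the (const int *, int *, int) overload
struct cudaFuncAttributes attr;
memset(&attr, 0, sizeof(attr));
checkCudaErrors(cudaFuncSetCacheConfig(*func1, cudaFuncCachePreferShared));
checkCudaErrors(cudaFuncGetAttributes(&attr, *func1));
printf("Shared Size: %d, Registers: %d\n", (int)attr.sharedSizeBytes, attr.numRegs);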

    ■ Functions involved

extern __host__ cudaError_t CUDARTAPI cudaFuncSetCacheConfig(const void *func, enum cudaFuncCache cacheConfig);

__device__ __attribute__((nv_weak)) cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes *p, const void *c)
{
    return cudaErrorUnknown;
}

#define OUTPUT_ATTR(attr)                                           \
    printf("Shared Size:   %d\n", (int)attr.sharedSizeBytes);       \
    printf("Constant Size: %d\n", (int)attr.constSizeBytes);        \
    printf("Local Size:    %d\n", (int)attr.localSizeBytes);        \
    printf("Max Threads Per Block: %d\n", attr.maxThreadsPerBlock); \
    printf("Number of Registers: %d\n", attr.numRegs);              \
    printf("PTX Version: %d\n", attr.ptxVersion);                   \
    printf("Binary Version: %d\n", attr.binaryVersion);
  • Original article: https://www.cnblogs.com/cuancuancuanhao/p/7742811.html