▶ 使用 cuda 内置结构 cudaFuncAttributes 来观察核函数的共享内存、寄存器数量
▶ 源代码
// cppOverload_kernel.cu
//
// Three C++ overloads of simple_kernel. The host code picks each overload by
// assigning it to a function pointer with the matching signature.
// All three expect a 1-D launch with blockDim.x == THREAD_N, since the
// shared-memory staging buffers are statically sized to THREAD_N.

// pOut[i] = pIn[i] * a + i, staged through shared memory.
__global__ void simple_kernel(const int *pIn, int *pOut, int a)
{
    __shared__ int sData[THREAD_N];
    int gtid = blockIdx.x * blockDim.x + threadIdx.x;

    sData[threadIdx.x] = pIn[gtid];
    __syncthreads();  // every element of sData is written before any thread reads
    pOut[gtid] = sData[threadIdx.x] * a + gtid;
}

// pOut[i] = (pIn[i].x + pIn[i].y) * a + i, staged through shared memory.
__global__ void simple_kernel(const int2 *pIn, int *pOut, int a)
{
    __shared__ int2 sData[THREAD_N];
    int gtid = blockIdx.x * blockDim.x + threadIdx.x;

    sData[threadIdx.x] = pIn[gtid];
    __syncthreads();
    pOut[gtid] = (sData[threadIdx.x].x + sData[threadIdx.x].y) * a + gtid;
}

// pOut[i] = (pIn1[i] + pIn2[i]) * a + i, staged through shared memory.
__global__ void simple_kernel(const int *pIn1, const int *pIn2, int *pOut, int a)
{
    __shared__ int sData1[THREAD_N], sData2[THREAD_N];
    int gtid = blockIdx.x * blockDim.x + threadIdx.x;

    sData1[threadIdx.x] = pIn1[gtid];
    sData2[threadIdx.x] = pIn2[gtid];
    __syncthreads();
    pOut[gtid] = (sData1[threadIdx.x] + sData2[threadIdx.x]) * a + gtid;
}
// cppOverload.cu
//
// Demonstrates C++ overloading of __global__ kernels: each overload is
// selected by assigning it to a host function pointer of the matching
// signature, and its resource usage (shared memory, registers, ...) is
// queried with cudaFuncGetAttributes before launching it.
#include <stdio.h>
#include <helper_cuda.h>
#include <helper_math.h>
#include <helper_string.h>

#define THREAD_N 256
#include "cppOverload_kernel.cu"  // the kernel file uses THREAD_N, so it must be defined first

#define N 1024
#define DIV_UP(a, b) (((a) + (b) - 1) / (b))  // ceiling division for grid sizing

// Print every field of a cudaFuncAttributes struct, one per line.
// Needs '\' line continuations and a do-while so it behaves as one statement.
#define OUTPUT_ATTR(attr)                                               \
    do {                                                                \
        printf("Shared Size: %d\n", (int)attr.sharedSizeBytes);         \
        printf("Constant Size: %d\n", (int)attr.constSizeBytes);        \
        printf("Local Size: %d\n", (int)attr.localSizeBytes);           \
        printf("Max Threads Per Block: %d\n", attr.maxThreadsPerBlock); \
        printf("Number of Registers: %d\n", attr.numRegs);              \
        printf("PTX Version: %d\n", attr.ptxVersion);                   \
        printf("Binary Version: %d\n", attr.binaryVersion);             \
    } while (0)

// CPU reference for simple_kernel(const int *, int *, int):
// expects hOutput[i] == hInput[i] * a + i for all i in [0, N).
bool check_func1(int *hInput, int *hOutput, int a)
{
    for (int i = 0; i < N; ++i)
    {
        if (hOutput[i] != hInput[i] * a + i)
            return false;
    }
    return true;
}

// CPU reference for simple_kernel(const int2 *, int *, int):
// expects hOutput[i] == (hInput[i].x + hInput[i].y) * a + i.
bool check_func2(int2 *hInput, int *hOutput, int a)
{
    for (int i = 0; i < N; i++)
    {
        if (hOutput[i] != (hInput[i].x + hInput[i].y) * a + i)
            return false;
    }
    return true;
}

// CPU reference for simple_kernel(const int *, const int *, int *, int):
// expects hOutput[i] == (hInput1[i] + hInput2[i]) * a + i.
bool check_func3(int *hInput1, int *hInput2, int *hOutput, int a)
{
    for (int i = 0; i < N; i++)
    {
        if (hOutput[i] != (hInput1[i] + hInput2[i]) * a + i)
            return false;
    }
    return true;
}

int main(int argc, const char *argv[])
{
    // cudaSetDevice returns a cudaError_t, NOT a device id — the original
    // stored it in "int deviceID", which silently discarded the error status.
    checkCudaErrors(cudaSetDevice(0));

    int *hInput = NULL, *hOutput = NULL, *dInput = NULL, *dOutput = NULL;
    checkCudaErrors(cudaMalloc(&dInput, sizeof(int) * N * 2));  // 2*N ints: the int2 overload reads N int2
    checkCudaErrors(cudaMalloc(&dOutput, sizeof(int) * N));
    checkCudaErrors(cudaMallocHost(&hInput, sizeof(int) * N * 2));  // pinned host memory
    checkCudaErrors(cudaMallocHost(&hOutput, sizeof(int) * N));

    for (int i = 0; i < N * 2; i++)
        hInput[i] = i;
    checkCudaErrors(cudaMemcpy(dInput, hInput, sizeof(int) * N * 2, cudaMemcpyHostToDevice));

    const int a = 2;
    // Overload resolution happens here: each function-pointer type selects the
    // simple_kernel overload with the matching signature.
    void (*func1)(const int *, int *, int)              = simple_kernel;
    void (*func2)(const int2 *, int *, int)             = simple_kernel;
    void (*func3)(const int *, const int *, int *, int) = simple_kernel;
    struct cudaFuncAttributes attr;

    // function 1: pOut[i] = pIn[i] * a + i
    memset(&attr, 0, sizeof(attr));
    checkCudaErrors(cudaFuncSetCacheConfig(*func1, cudaFuncCachePreferShared));  // query resources before running
    checkCudaErrors(cudaFuncGetAttributes(&attr, *func1));
    OUTPUT_ATTR(attr);
    (*func1)<<<DIV_UP(N, THREAD_N), THREAD_N>>>(dInput, dOutput, a);
    checkCudaErrors(cudaGetLastError());  // catch launch-configuration errors
    checkCudaErrors(cudaDeviceSynchronize());
    checkCudaErrors(cudaMemcpy(hOutput, dOutput, sizeof(int) * N, cudaMemcpyDeviceToHost));
    printf("simple_kernel(const int *pIn, int *pOut, int a) %s\n",
           check_func1(hInput, hOutput, a) ? "PASSED" : "FAILED");

    // function 2: pOut[i] = (pIn[i].x + pIn[i].y) * a + i
    memset(&attr, 0, sizeof(attr));
    checkCudaErrors(cudaFuncSetCacheConfig(*func2, cudaFuncCachePreferShared));
    checkCudaErrors(cudaFuncGetAttributes(&attr, *func2));
    OUTPUT_ATTR(attr);
    // Reinterpret the 2*N int buffer as N int2 — same bytes, suitably aligned.
    (*func2)<<<DIV_UP(N, THREAD_N), THREAD_N>>>((int2 *)dInput, dOutput, a);
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaMemcpy(hOutput, dOutput, sizeof(int) * N, cudaMemcpyDeviceToHost));  // blocking copy also synchronizes
    printf("simple_kernel(const int2 *pIn, int *pOut, int a) %s\n",
           check_func2(reinterpret_cast<int2 *>(hInput), hOutput, a) ? "PASSED" : "FAILED");

    // function 3: pOut[i] = (pIn1[i] + pIn2[i]) * a + i
    memset(&attr, 0, sizeof(attr));
    checkCudaErrors(cudaFuncSetCacheConfig(*func3, cudaFuncCachePreferShared));
    checkCudaErrors(cudaFuncGetAttributes(&attr, *func3));
    OUTPUT_ATTR(attr);
    (*func3)<<<DIV_UP(N, THREAD_N), THREAD_N>>>(dInput, dInput + N, dOutput, a);
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaMemcpy(hOutput, dOutput, sizeof(int) * N, cudaMemcpyDeviceToHost));
    printf("simple_kernel(const int *pIn1, const int *pIn2, int *pOut, int a) %s\n",
           check_func3(&hInput[0], &hInput[N], hOutput, a) ? "PASSED" : "FAILED");

    checkCudaErrors(cudaFree(dInput));
    checkCudaErrors(cudaFree(dOutput));
    checkCudaErrors(cudaFreeHost(hOutput));
    checkCudaErrors(cudaFreeHost(hInput));
    getchar();  // keep the console window open
    return 0;
}
● 输出结果:
Shared Size: 1024 Constant Size: 0 Local Size: 0 Max Threads Per Block: 1024 Number of Registers: 12 PTX Version: 60 Binary Version: 60 simple_kernel(const int *pIn, int *pOut, int a) PASSED Shared Size: 2048 Constant Size: 0 Local Size: 0 Max Threads Per Block: 1024 Number of Registers: 13 PTX Version: 60 Binary Version: 60 simple_kernel(const int2 *pIn, int *pOut, int a) PASSED Shared Size: 2048 Constant Size: 0 Local Size: 0 Max Threads Per Block: 1024 Number of Registers: 14 PTX Version: 60 Binary Version: 60 simple_kernel(const int *pIn1, const int *pIn2, int *pOut, int a) PASSED
▶ 涨姿势:
● cuda 使用扩展名为 .cuh 的头文件
● cuda内置结构 cudaFuncAttributes 的定义:
// Definition of the CUDA built-in struct cudaFuncAttributes
// (from the CUDA runtime headers).
struct __device_builtin__ cudaFuncAttributes
{
    size_t sharedSizeBytes;  // static shared memory per block, in bytes
    size_t constSizeBytes;   // constant memory, in bytes (fixed typo: was "constSizeBytees")
    size_t localSizeBytes;   // local memory per thread, in bytes
    int maxThreadsPerBlock;  // maximum threads per block for this kernel
    int numRegs;             // registers used per thread
    int ptxVersion;          // PTX version
    int binaryVersion;       // binary (machine code) version
    int cacheModeCA;         // whether compiled with -Xptxas --dlcm=ca
};
● 通过使用cuda的内置结构和函数来查看核函数使用的共享内存与寄存器数量
// Query a kernel's resource usage through a function pointer to the
// (possibly overloaded) kernel, before launching it.
struct cudaFuncAttributes attr;
memset(&attr, 0, sizeof(attr));                                // start from a zeroed struct
cudaFuncSetCacheConfig(*function, cudaFuncCachePreferShared);  // prefer shared memory over L1 for this kernel
cudaFuncGetAttributes(&attr, *function);                       // fill attr with the kernel's attributes
■ 涉及的函数
// Declarations involved (as found in the CUDA runtime headers):
extern __host__ cudaError_t CUDARTAPI cudaFuncSetCacheConfig(const void *func, enum cudaFuncCache cacheConfig);

// Device-side weak stub: calling cudaFuncGetAttributes from device code
// simply returns cudaErrorUnknown.
__device__ __attribute__((nv_weak)) cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes *p, const void *c)
{
    return cudaErrorUnknown;
}

// Multi-line macro needs '\' continuations to form a single definition
// (they were lost in the original listing).
#define OUTPUT_ATTR(attr)                                               \
    printf("Shared Size: %d\n", (int)attr.sharedSizeBytes);             \
    printf("Constant Size: %d\n", (int)attr.constSizeBytes);            \
    printf("Local Size: %d\n", (int)attr.localSizeBytes);               \
    printf("Max Threads Per Block: %d\n", attr.maxThreadsPerBlock);     \
    printf("Number of Registers: %d\n", attr.numRegs);                  \
    printf("PTX Version: %d\n", attr.ptxVersion);                       \
    printf("Binary Version: %d\n", attr.binaryVersion);