zoukankan      html  css  js  c++  java
  • 0_Simple__simpleOccupancy

    计算核函数调用的占用率,并尝试使用 runtime 函数自动优化线程块尺寸,以便提高占用率。


    ▶ 源代码。

      1 #include <iostream>
      2 #include "cuda_runtime.h"
      3 #include "device_launch_parameters.h"
      4 #include <helper_cuda.h>         
      5 
      6 const int manualBlockSize = 32;
      7 
      8 // 核函数,输入数组的每个元素平方后放回
      9 __global__ void square(int *array, int arrayCount)
     10 {
     11     extern __shared__ int dynamicSmem[];
     12     int idx = threadIdx.x + blockIdx.x * blockDim.x;
     13     
     14     if (idx < arrayCount)
     15         array[idx] *= array[idx];
     16 }
     17 
     18 // 负责调用核函数,计时,并考虑是否使用 runtime 函数优化线程块尺寸
     19 static int launchConfig(int *data, int size, bool automatic)
     20 {
     21     int blockSize;
     22     int numBlocks;
     23     int gridSize;
     24     int minGridSize;
     25     float elapsedTime;
     26     double potentialOccupancy;
     27     size_t dynamicSMemUsage = 0;
     28     
     29     cudaDeviceProp prop;
     30     cudaGetDeviceProperties(&prop, 0);
     31     cudaEvent_t start;
     32     cudaEvent_t end;
     33     cudaEventCreate(&start);
     34     cudaEventCreate(&end);
     35 
     36     if (automatic)// true 则使用 runtime 函数自动优化线程块尺寸
     37     {
     38         cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void*)square, dynamicSMemUsage, size);
     39         printf("
    	Suggested block size: %d, minimum grid size for maximum occupancy: %d
    ", blockSize, minGridSize);
     40     }
     41     else
     42         blockSize = manualBlockSize;
     43 
     44     gridSize = (size + blockSize - 1) / blockSize;
     45 
     46     cudaEventRecord(start);
     47     square<<<gridSize, blockSize, dynamicSMemUsage>>>(data, size);
     48     cudaEventRecord(end);
     49     cudaDeviceSynchronize();
     50     cudaEventElapsedTime(&elapsedTime, start, end);
     51     printf("
    	Elapsed time: %4.2f ms
    ", elapsedTime);
     52     
     53     // 依线程数计算占用率,分子分母同除以 prop.warpSize 即按活动线程束数计算,两者等价
     54     cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, square, blockSize, dynamicSMemUsage); 
     55     potentialOccupancy = (double)(numBlocks * blockSize) / (prop.maxThreadsPerMultiProcessor); 
     56     printf("
    	Potential occupancy: %4.2f %%
    ", potentialOccupancy * 100);
     57 
     58     return 0;
     59 }
     60 
     61 // 负责核函数调用前后内存控制,以及结果检查
     62 static int test(bool automaticLaunchConfig, const int count = 1000000)
     63 {
     64     int size = count * sizeof(int);
     65     int *h_data = (int *)malloc(size);
     66     for (int i = 0; i < count; i++)
     67         h_data[i] = i;
     68     int *d_data;
     69     cudaMalloc(&d_data, size);
     70     
     71     cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice);
     72     memset(h_data,0,size);
     73     launchConfig(d_data, count, automaticLaunchConfig);
     74     cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost);
     75     
     76     for (int i = 0; i < count; i += 1)
     77     {
     78         if (h_data[i] != i * i)
     79         {
     80             printf("
    	Error at %d, d_data = %d
    ", i, h_data[i]);
     81             return 1;
     82         }
     83     }             
     84 
     85     free(h_data);
     86     cudaFree(d_data);
     87     return 0;
     88 }
     89 
     90 int main()
     91 {
     92     int status;
     93 
     94     printf("
    	Start.
    ");
     95     
     96     printf("
    	Manual configuration test, BlockSize = %d
    ", manualBlockSize);
     97     if (test(false))
     98     {
     99         printf("
    	Test failed
    ");
    100         return -1;
    101     }
    102 
    103     printf("
    	Automatic configuration
    ");
    104     if (test(true))
    105     {
    106         printf("
    	Test failed
    ");
    107         return -1;
    108     }        
    109     
    110     printf("
    	Test PASSED
    ");
    111     getchar();
    112     return 0;
    113 }

    ▶ 输出结果

        Start.
    
        Manual configuration test, BlockSize = 32
    
        Elapsed time: 0.13 ms
    
        Potential occupancy: 50.00 %
    
        Automatic configuration
    
        Suggested block size: 1024, minimum grid size for maximum occupancy: 32
    
        Elapsed time: 0.12 ms
    
        Potential occupancy: 100.00 %
    
        Test PASSED

    ▶ 涨姿势

    ● 用到的几个 runtime 函数及其相互关系。

// driver_types.h
// Flags accepted by the block-size-optimization occupancy functions.
#define cudaOccupancyDefault                0x00  // default behavior
#define cudaOccupancyDisableCachingOverride 0x01  // assume global caching is enabled and cannot be disabled

// cuda_device_runtime_api.h
// Device-side weak stub sharing the name of the cuda_runtime.h host template;
// NOTE(review): apparently not exercised by this sample.
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize)
{
    return cudaErrorUnknown;
}
     12 
// Device-side weak stub of the entry point called (in its host form) by both
// cudaOccupancyMaxActiveBlocksPerMultiprocessor and
// cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags.
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags)
{
    return cudaErrorUnknown;
}
     18 
// cuda_runtime.h
// Host wrapper: forwards to the WithFlags variant using the default flag.
template<class T>
static __inline__ __host__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, T func, int blockSize, size_t dynamicSMemSize)
{
    return ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, (const void*)func, blockSize, dynamicSMemSize, cudaOccupancyDefault);
}
     25 
// Sweeps candidate block sizes (in warp-size steps, from the largest allowed
// downward) and returns, through blockSize, the one that maximizes theoretical
// occupancy for 'func', plus the minimum grid size needed to reach it.
// blockSizeToDynamicSMemSize maps a candidate block size to the dynamic
// shared-memory bytes the kernel would need at that size.
template<typename UnaryFunction, class T>
static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
(
    int* minGridSize, int* blockSize, T func, UnaryFunction blockSizeToDynamicSMemSize, int blockSizeLimit = 0, unsigned int flags = 0
)
{    
    cudaError_t status;

    // Device and kernel-function properties
    int                       device;
    struct cudaFuncAttributes attr;
    int maxThreadsPerMultiProcessor;
    int warpSize;
    int devMaxThreadsPerBlock;
    int multiProcessorCount;
    int occupancyLimit;
    int granularity;

    // Best result found so far
    int maxBlockSize = 0;
    int numBlocks = 0;
    int maxOccupancy = 0;

    // Temporaries for the current candidate
    int blockSizeToTryAligned;
    int blockSizeToTry;
    int occupancyInBlocks;
    int occupancyInThreads;
    size_t dynamicSMemSize;

    // Validate inputs
    if (!minGridSize || !blockSize || !func)
        return cudaErrorInvalidValue;

    // Query device and kernel attributes
    status = ::cudaGetDevice(&device);
    if (status != cudaSuccess)
        return status;
    status = cudaDeviceGetAttribute(&maxThreadsPerMultiProcessor, cudaDevAttrMaxThreadsPerMultiProcessor, device);
    if (status != cudaSuccess)
        return status;
    status = cudaDeviceGetAttribute(&warpSize,cudaDevAttrWarpSize,device);
    if (status != cudaSuccess)
        return status;
    status = cudaDeviceGetAttribute(&devMaxThreadsPerBlock,cudaDevAttrMaxThreadsPerBlock,device);
    if (status != cudaSuccess)
        return status;
    status = cudaDeviceGetAttribute(&multiProcessorCount,cudaDevAttrMultiProcessorCount,device);
    if (status != cudaSuccess)
        return status;
    status = cudaFuncGetAttributes(&attr, func);
    if (status != cudaSuccess)
        return status;

    // Try block sizes: 100% occupancy means maxThreadsPerMultiProcessor
    // resident threads; candidates move in warp-size steps.
    occupancyLimit = maxThreadsPerMultiProcessor;
    granularity = warpSize;

    // Clamp the search limit to the device and kernel per-block maxima.
    if (blockSizeLimit == 0 || blockSizeLimit > devMaxThreadsPerBlock)
        blockSizeLimit = devMaxThreadsPerBlock;

    if (blockSizeLimit > attr.maxThreadsPerBlock)
        blockSizeLimit = attr.maxThreadsPerBlock;

    for (blockSizeToTryAligned = ((blockSizeLimit + (warpSize - 1)) / warpSize) * warpSize; blockSizeToTryAligned > 0; blockSizeToTryAligned -= warpSize)
        // blockSizeLimit is rounded up to a multiple of warpSize, then candidates
        // decrease one warp at a time. If blockSizeLimit itself is below the
        // aligned value, the first candidate is blockSizeLimit (only the first
        // iteration can hit this case).
    {        
        blockSizeToTry = (blockSizeLimit < blockSizeToTryAligned) ? blockSizeLimit : blockSizeToTryAligned;
        dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry);

        // The core occupancy query for this candidate.
        status = cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(&occupancyInBlocks, func, blockSizeToTry, dynamicSMemSize, flags);
        if (status != cudaSuccess)
            return status;
        
        // Keep the candidate if it improves occupancy (measured in threads).
        if ((occupancyInThreads = blockSizeToTry * occupancyInBlocks) > maxOccupancy)
        {
            maxBlockSize = blockSizeToTry;
            numBlocks = occupancyInBlocks;
            maxOccupancy = occupancyInThreads;
        }

        // 100% occupancy reached — no larger value is possible, stop early.
        if (occupancyLimit == maxOccupancy)
            break;
    }

    // Return the best configuration found.
    *minGridSize = numBlocks * multiProcessorCount;
    *blockSize = maxBlockSize;

    return status;
}
    121 
// Helper functor mapping any block size to a constant dynamic shared-memory
// size n — used when the kernel's dynamic smem does not depend on block size.
class __cudaOccupancyB2DHelper
{
    size_t n;
    public:
        inline __host__ CUDART_DEVICE __cudaOccupancyB2DHelper(size_t n_) : n(n_) {}
        inline __host__ CUDART_DEVICE size_t operator()(int)
        {
            return n; 
        }
};
    132 
// Runtime helper that suggests a launch configuration for 'func'.
// Parameters: minGridSize (out) — smallest grid size achieving maximum
// occupancy; blockSize (out) — suggested block size; func — the kernel;
// dynamicSMemSize — dynamic shared memory per block (constant here);
// blockSizeLimit — upper bound on candidate block sizes (0 = no extra limit).
template<class T>
static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSize
(
    int *minGridSize, int *blockSize, T func, size_t dynamicSMemSize = 0, int blockSizeLimit = 0
)
{
    return cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(minGridSize, blockSize, func, __cudaOccupancyB2DHelper(dynamicSMemSize), blockSizeLimit, cudaOccupancyDefault);
}
  • 相关阅读:
    结对项目之感
    调查问卷之体会
    我的软件工程课程目标
    关于软件工程的课程建议
    结对编程--fault,error,failure
    结对编程总结
    结对编码感想
    我的软件工程课目标
    《软件工程》课之-调查问卷的心得体会
    软件工程课程讨论记录
  • 原文地址:https://www.cnblogs.com/cuancuancuanhao/p/7893570.html
Copyright © 2011-2022 走看看