zoukankan      html  css  js  c++  java
  • 0_Simple__cdpSimplePrint + 0_Simple__cdpSimpleQuicksort

    ▶ CUDA 动态并行实现快排算法(单线程的递归调用)

    ▶ 源代码:动态并行递归调用线程块。要点:添加 -rdc=true 选项(生成 relocatable device code,相当于执行分离编译),以及链接库 cudadevrt.lib (用于动态并行,不同于运行时库 cudart.lib)

      1 #include <stdio.h>
      2 #include <cuda.h>
      3 #include <cuda_runtime.h>
      4 #include "device_launch_parameters.h"
      5 #include <helper_cuda.h> 
      6 #include <helper_string.h>
      7 
      8 __device__ int g_blockId = 0;                                       // 线程块的全局编号,供所有线程读写
      9 
     10 __device__ void print_info(int depth, int blockId, int parent_threadId, int parent_blockId)  // 打印当前线程块信息,包括深度,当前块号,
     11 {
     12     if (threadIdx.x == 0)
     13     {
     14         if (depth == 0)
     15             printf("BLOCK %d launched by the host
    ", blockId);
     16         else
     17         {
     18             char buffer[32];
     19             for (int i = 0; i < depth; ++i)                         // 对应更多层级,每层前面都有相应层数的 "|  "
     20             {
     21                 buffer[3 * i + 0] = '+';
     22                 buffer[3 * i + 1] = ' ';
     23                 buffer[3 * i + 2] = ' ';
     24             }
     25             buffer[3 * depth] = '';
     26             printf("%sBLOCK %d launched by thread %d of block %d
    ", buffer, blockId, parent_threadId, parent_blockId);
     27         }
     28     }
     29     __syncthreads();
     30 }
     31 
     32 __global__ void cdp_kernel(int max_depth, int depth, int parent_threadId, int parent_blockId)// 线程块递归
     33 {
     34     __shared__ int s_blockId;                                       // 当前线程块的编号
     35 
     36     if (threadIdx.x == 0)                                           // 读取当前 g_blockId 到 s_blockId 中,并将 g_blockId 加一
     37         s_blockId = atomicAdd(&g_blockId, 1);
     38     __syncthreads();
     39 
     40     print_info(depth, s_blockId, parent_threadId, parent_blockId);  // 打印当前线程块信息,
     41 
     42     if (++depth >= max_depth)                                       // 达到最大递归深度则退出,否则继续调用 cdp_kernel()
     43         return;
     44     cdp_kernel << <gridDim.x, blockDim.x >> >(max_depth, depth, threadIdx.x, s_blockId);
     45 }
     46 
     47 int main(int argc, char **argv)
     48 {
     49     printf("CUDA Dynamic Parallelism
    ");
     50     int max_depth = 3;
     51     int device_count = 0, device = -1;
     52 
     53     if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "h"))// 帮助模式
     54     {
     55         printf("Usage: %s depth=<max_depth>	(where max_depth is a value between 1 and 8).
    ", argv[0]);
     56         exit(EXIT_SUCCESS);
     57     }
     58     if (checkCmdLineFlag(argc, (const char **)argv, "depth"))       // 手动设置递归深度
     59     {
     60         max_depth = getCmdLineArgumentInt(argc, (const char **)argv, "depth");
     61         if (max_depth < 1 || max_depth > 8)
     62         {
     63             printf("depth parameter has to be between 1 and 8
    ");
     64             exit(EXIT_FAILURE);
     65         }
     66     }
     67     if (checkCmdLineFlag(argc, (const char **)argv, "device"))      // 命令行指定设备
     68     {
     69         device = getCmdLineArgumentInt(argc, (const char **)argv, "device");
     70         cudaDeviceProp properties;
     71         cudaGetDeviceProperties(&properties, device);
     72         if (properties.major > 3 || (properties.major == 3 && properties.minor >= 5))
     73             printf("Running on GPU %d (%s)
    ", device, properties.name);
     74         else
     75         {
     76             printf("ERROR: required GPU with compute SM 3.5 or higher.
    Current GPU compute SM %d.%d
    ", properties.major, properties.minor);
     77             exit(EXIT_FAILURE);
     78         }
     79     }
     80     else
     81     {
     82         cudaGetDeviceCount(&device_count);
     83         for (int i = 0; i < device_count; ++i)
     84         {
     85             cudaDeviceProp properties;
     86             cudaGetDeviceProperties(&properties, i);
     87             if (properties.major > 3 || (properties.major == 3 && properties.minor >= 5))
     88             {
     89                 device = i;
     90                 printf("Running on GPU %d (%s)", i, properties.name);
     91                 break;
     92             }
     93             printf("Running on GPU %d (%s) does not support CUDA Dynamic Parallelism", i, properties.name);
     94         }
     95     }
     96     if (device == -1)
     97     {
     98         printf("required GPU with compute SM 3.5 or higher.");
     99         exit(EXIT_WAIVED);
    100     }
    101     cudaSetDevice(device);
    102 
    103     cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, max_depth);
    104     printf("Launching cdp_kernel() with CUDA Dynamic Parallelism:
    
    ");
    105     cdp_kernel << <2, 2 >> >(max_depth, 0, 0, -1);
    106     cudaGetLastError();
    107     cudaDeviceSynchronize();
    108 
    109     getchar();
    110     exit(EXIT_SUCCESS);
    111 }

    ● 输出结果:主机调用 2 个线程块,每个线程块 2 个线程,每个线程按同样规模递归调用,共 2*4 个二级核函数,2*4*4 个三级核函数,一共 42 个线程块

    CUDA Dynamic Parallelism
    Running on GPU 0 (GeForce GTX 1070)Launching cdp_kernel() with CUDA Dynamic Parallelism:
    
    BLOCK 0 launched by the host
    BLOCK 1 launched by the host
    +  BLOCK 2 launched by thread 0 of block 1
    +  BLOCK 3 launched by thread 0 of block 1
    +  BLOCK 4 launched by thread 0 of block 0
    +  BLOCK 5 launched by thread 0 of block 0
    +  +  BLOCK 10 launched by thread 0 of block 3
    +  +  BLOCK 11 launched by thread 0 of block 3
    +  +  BLOCK 7 launched by thread 0 of block 2
    +  +  BLOCK 6 launched by thread 0 of block 2
    +  +  BLOCK 12 launched by thread 0 of block 5
    +  +  BLOCK 13 launched by thread 0 of block 5
    +  +  BLOCK 8 launched by thread 0 of block 4
    +  +  BLOCK 9 launched by thread 0 of block 4
    +  +  BLOCK 15 launched by thread 1 of block 3
    +  +  BLOCK 14 launched by thread 1 of block 3
    +  +  BLOCK 19 launched by thread 1 of block 2
    +  +  BLOCK 16 launched by thread 1 of block 5
    +  +  BLOCK 17 launched by thread 1 of block 5
    +  +  BLOCK 18 launched by thread 1 of block 2
    +  +  BLOCK 21 launched by thread 1 of block 4
    +  +  BLOCK 20 launched by thread 1 of block 4
    +  BLOCK 22 launched by thread 1 of block 1
    +  BLOCK 23 launched by thread 1 of block 1
    +  BLOCK 24 launched by thread 1 of block 0
    +  BLOCK 25 launched by thread 1 of block 0
    +  +  BLOCK 28 launched by thread 0 of block 23
    +  +  BLOCK 27 launched by thread 0 of block 22
    +  +  BLOCK 29 launched by thread 0 of block 24
    +  +  BLOCK 26 launched by thread 0 of block 23
    +  +  BLOCK 31 launched by thread 0 of block 24
    +  +  BLOCK 30 launched by thread 0 of block 22
    +  +  BLOCK 33 launched by thread 0 of block 25
    +  +  BLOCK 32 launched by thread 0 of block 25
    +  +  BLOCK 34 launched by thread 1 of block 23
    +  +  BLOCK 35 launched by thread 1 of block 23
    +  +  BLOCK 36 launched by thread 1 of block 22
    +  +  BLOCK 37 launched by thread 1 of block 22
    +  +  BLOCK 38 launched by thread 1 of block 25
    +  +  BLOCK 39 launched by thread 1 of block 25
    +  +  BLOCK 40 launched by thread 1 of block 24
    +  +  BLOCK 41 launched by thread 1 of block 24

    ▶ 涨姿势:

    ● 在核函数中递归地调用核函数,注意函数调用的格式

    ▶ 源代码:动态并行实现快排算法,输出结果只有 Finish!

      1 #include <stdio.h>
      2 #include <cuda.h>
      3 #include <cuda_runtime.h>
      4 #include "device_launch_parameters.h"
      5 #include <helper_cuda.h> 
      6 #include <helper_string.h>
      7 
      8 #define MAX_DEPTH       16
      9 #define INSERTION_SORT  32
     10 
     11 __device__ void selection_sort(unsigned int *data, int left, int right) //选择排序,单线程完成
     12 {
     13     for (int i = left; i <= right; ++i)
     14     {
     15         unsigned min_val = data[i];
     16         int min_idx = i;
     17         for (int j = i + 1; j <= right; ++j)                            // 找最小元素及其下标
     18         {
     19             unsigned val_j = data[j];
     20             if (val_j < min_val)
     21             {
     22                 min_idx = j;
     23                 min_val = val_j;
     24             }
     25         }
     26         if (i != min_idx)                                               // 交换第 i 号元素到指定的位置上
     27         {
     28             data[min_idx] = data[i];
     29             data[i] = min_val;
     30         }
     31     }
     32 }
     33 
     34 __global__ void cdp_simple_quicksort(unsigned int *data, int left, int right, int depth)    // 快排主体,内含递归调用
     35 {
     36     if (depth >= MAX_DEPTH || right - left <= INSERTION_SORT)           // 递归深度达到 MAX_DEPTH 或者 数组中元素个数不多于 INSERTION_SORT 时使用选排
     37     {
     38         selection_sort(data, left, right);
     39         return;
     40     }
     41     unsigned int *lptr = data + left, *rptr = data + right, pivot = data[(left + right) / 2];
     42     while (lptr <= rptr)
     43     {
     44         unsigned int lval = *lptr, rval = *rptr;                        // 指定左指针指向的值和右指针指向的值
     45         while (lval < pivot)                                            // 递增左指针,等价于 lptr++; lval = *lptr;        
     46             lval = *(++lptr);
     47         while (rval > pivot)                                            // 递减右指针        
     48             rval = *(--rptr);
     49         if (lptr <= rptr)                                               // 交换左右指针指向的值
     50         {
     51             *lptr++ = rval;
     52             *rptr-- = lval;
     53         }
     54     }
     55     if (left < rptr - data)                                             // 将左右分区放到两个不同的流中
     56     {
     57         cudaStream_t s0;
     58         cudaStreamCreateWithFlags(&s0, cudaStreamNonBlocking);          // 指定该流不与 0 号流进行同步
     59         cdp_simple_quicksort << < 1, 1, 0, s0 >> >(data, left, rptr - data, depth + 1);
     60         cudaStreamDestroy(s0);
     61     }
     62     if (lptr - data < right)
     63     {
     64         cudaStream_t s1;
     65         cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
     66         cdp_simple_quicksort << < 1, 1, 0, s1 >> >(data, lptr - data, right, depth + 1);
     67         cudaStreamDestroy(s1);
     68     }
     69 }
     70 
     71 void run_qsort(unsigned int *data, unsigned int n)                      // 快排入口
     72 {
     73     cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, MAX_DEPTH);        // 设置最大递归深度    
     74     cdp_simple_quicksort << < 1, 1 >> >(data, 0, n - 1, 0);
     75     cudaDeviceSynchronize();
     76 }
     77 
     78 int main(int argc, char **argv)
     79 {
     80     cudaSetDevice(0);
     81     const int n = 1024;
     82 
     83     unsigned int *h_data = (unsigned int *)malloc(sizeof(unsigned int) * n);    
     84     srand(2047);
     85     for (unsigned i = 0; i < n; i++)
     86         h_data[i] = rand() % n;
     87 
     88 
     89     unsigned int *d_data;
     90     cudaMalloc((void **)&d_data, n * sizeof(unsigned int));
     91     cudaMemcpy(d_data, h_data, n * sizeof(unsigned int), cudaMemcpyHostToDevice);
     92 
     93     run_qsort(d_data, n);
     94     
     95     cudaMemcpy(h_data, d_data, n * sizeof(unsigned), cudaMemcpyDeviceToHost);
     96 
     97     for (int i = 1; i < n; ++i)
     98     {
     99         if (h_data[i - 1] > h_data[i])
    100         {
    101             printf("Error at i == %d, h_data[i-1] == %d, h_data[i] == %d
    ", h_data[i - 1], h_data[i]);
    102             break;
    103         }
    104     }
    105     printf("Finish!
    ");
    106 
    107     free(h_data);
    108     cudaFree(d_data);
    109     getchar();
    110     exit(EXIT_SUCCESS);
    111 }

    ▶ 新姿势:

    ● checkCmdLineFlag 用于检验函数参数 argv 是否等于字符串 string_ref(定义于 helper_string.h 中)

     1 inline bool checkCmdLineFlag(const int argc, const char **argv, const char *string_ref)
     2 {
     3     bool bFound = false;
     4     if (argc >= 1)
     5     {
     6         for (int i = 1; i < argc; i++)
     7         {
     8             int string_start = stringRemoveDelimiter('-', argv[i]);
     9             const char *string_argv = &argv[i][string_start], 
    10             const char*equal_pos = strchr(string_argv, '=');
    11             int argv_length = (int)(equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv);
    12             int length = (int)strlen(string_ref);
    13             if (length == argv_length && !STRNCASECMP(string_argv, string_ref, length))
    14             {
    15                 bFound = true;
    16                 continue;
    17             }
    18         }
    19     }
    20     return bFound;
    21 }
    22 
    23 inline int stringRemoveDelimiter(char delimiter, const char *string)    // 去除特定的符号,上述函数的中用于去除参数前面的 - 或 --
    24 {
    25     int string_start = 0;
    26     while (string[string_start] == delimiter)
    27         string_start++;
    28     if (string_start >= (int)strlen(string) - 1)
    29         return 0;
    30     return string_start;
    31 }
    32 
    33 #define STRNCASECMP _strnicmp                                           // 比较字符串(定义于string.h中)
    34 
    35 _ACRTIMP int __cdecl _strnicmp(_In_reads_or_z_(_MaxCount) char const* _String1, _In_reads_or_z_(_MaxCount) char const* _String2, _In_ size_t _MaxCount);

    ● getCmdLineArgumentInt 用于提取函数参数 argv 中的整数(定义于 helper_string.h 中)

     1 inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *string_ref)
     2 {
     3     bool bFound = false;
     4     int value = -1;
     5     if (argc >= 1)
     6     {
     7         for (int i = 1; i < argc; i++)
     8         {
     9             int string_start = stringRemoveDelimiter('-', argv[i]);
    10             const char *string_argv = &argv[i][string_start];
    11             int length = (int)strlen(string_ref);
    12             if (!STRNCASECMP(string_argv, string_ref, length))
    13             {
    14                 if (length + 1 <= (int)strlen(string_argv))
    15                 {
    16                     int auto_inc = (string_argv[length] == '=') ? 1 : 0;
    17                     value = atoi(&string_argv[length + auto_inc]);
    18                 }
    19                 else
    20                     value = 0;
    21                 bFound = true;
    22                 continue;
    23             }
    24         }
    25     }
    26     if (bFound)
    27         return value;
    28     return 0;
    29 }

    ● 指定最大递归深度

    extern __host__ cudaError_t CUDARTAPI cudaDeviceSetLimit(enum cudaLimit limit, size_t value);

    ● 带有标识符的 cudaStreamCreateWithFlags ,设置流的优先级

    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags);

    ■ 对比 cudaStreamCreate

    extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream);
  • 相关阅读:
    中国历史朝代公元对照简表
    [Solved] DashBoard – Excel Service: The data sources may be unreachable, may not be responding, or may have denied you access.
    Delete/Remove Project from TFS 2010
    Sharepoint site showing system account instead of my username on the top right corner.
    你的成功在于你每天养成的习惯
    Internet Information Services is running in 32bit emulation mode. Correct the issue listed above and rerun setup.
    Prepare to back up and restore a farm (Office SharePoint Server 2007)
    Word中字号与磅值的对应关系
    How to: Change the Frequency for Refreshing the Data Warehouse for Team System
    UI Automation in WPF/Silverlight
  • 原文地址:https://www.cnblogs.com/cuancuancuanhao/p/7726441.html
Copyright © 2011-2022 走看看