zoukankan      html  css  js  c++  java
  • 0_Simple__cdpSimplePrint + 0_Simple__cdpSimpleQuicksort

    ▶ CUDA 动态并行实现快排算法(单线程的递归调用)

    ▶ 源代码:动态并行递归调用线程块。要点:添加 -rdc=true 选项(生成 relocatable device code,相当于执行分离编译),以及链接库 cudadevrt.lib (用于动态并行,不同于运行时库 cudart.lib)

      1 #include <stdio.h>
      2 #include <cuda.h>
      3 #include <cuda_runtime.h>
      4 #include "device_launch_parameters.h"
      5 #include <helper_cuda.h> 
      6 #include <helper_string.h>
      7 
      8 __device__ int g_blockId = 0;                                       // 线程块的全局编号,供所有线程读写
      9 
     10 __device__ void print_info(int depth, int blockId, int parent_threadId, int parent_blockId)  // 打印当前线程块信息,包括深度,当前块号,
     11 {
     12     if (threadIdx.x == 0)
     13     {
     14         if (depth == 0)
     15             printf("BLOCK %d launched by the host
    ", blockId);
     16         else
     17         {
     18             char buffer[32];
     19             for (int i = 0; i < depth; ++i)                         // 对应更多层级,每层前面都有相应层数的 "|  "
     20             {
     21                 buffer[3 * i + 0] = '+';
     22                 buffer[3 * i + 1] = ' ';
     23                 buffer[3 * i + 2] = ' ';
     24             }
     25             buffer[3 * depth] = '';
     26             printf("%sBLOCK %d launched by thread %d of block %d
    ", buffer, blockId, parent_threadId, parent_blockId);
     27         }
     28     }
     29     __syncthreads();
     30 }
     31 
     32 __global__ void cdp_kernel(int max_depth, int depth, int parent_threadId, int parent_blockId)// 线程块递归
     33 {
     34     __shared__ int s_blockId;                                       // 当前线程块的编号
     35 
     36     if (threadIdx.x == 0)                                           // 读取当前 g_blockId 到 s_blockId 中,并将 g_blockId 加一
     37         s_blockId = atomicAdd(&g_blockId, 1);
     38     __syncthreads();
     39 
     40     print_info(depth, s_blockId, parent_threadId, parent_blockId);  // 打印当前线程块信息,
     41 
     42     if (++depth >= max_depth)                                       // 达到最大递归深度则退出,否则继续调用 cdp_kernel()
     43         return;
     44     cdp_kernel << <gridDim.x, blockDim.x >> >(max_depth, depth, threadIdx.x, s_blockId);
     45 }
     46 
     47 int main(int argc, char **argv)
     48 {
     49     printf("CUDA Dynamic Parallelism
    ");
     50     int max_depth = 3;
     51     int device_count = 0, device = -1;
     52 
     53     if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "h"))// 帮助模式
     54     {
     55         printf("Usage: %s depth=<max_depth>	(where max_depth is a value between 1 and 8).
    ", argv[0]);
     56         exit(EXIT_SUCCESS);
     57     }
     58     if (checkCmdLineFlag(argc, (const char **)argv, "depth"))       // 手动设置递归深度
     59     {
     60         max_depth = getCmdLineArgumentInt(argc, (const char **)argv, "depth");
     61         if (max_depth < 1 || max_depth > 8)
     62         {
     63             printf("depth parameter has to be between 1 and 8
    ");
     64             exit(EXIT_FAILURE);
     65         }
     66     }
     67     if (checkCmdLineFlag(argc, (const char **)argv, "device"))      // 命令行指定设备
     68     {
     69         device = getCmdLineArgumentInt(argc, (const char **)argv, "device");
     70         cudaDeviceProp properties;
     71         cudaGetDeviceProperties(&properties, device);
     72         if (properties.major > 3 || (properties.major == 3 && properties.minor >= 5))
     73             printf("Running on GPU %d (%s)
    ", device, properties.name);
     74         else
     75         {
     76             printf("ERROR: required GPU with compute SM 3.5 or higher.
    Current GPU compute SM %d.%d
    ", properties.major, properties.minor);
     77             exit(EXIT_FAILURE);
     78         }
     79     }
     80     else
     81     {
     82         cudaGetDeviceCount(&device_count);
     83         for (int i = 0; i < device_count; ++i)
     84         {
     85             cudaDeviceProp properties;
     86             cudaGetDeviceProperties(&properties, i);
     87             if (properties.major > 3 || (properties.major == 3 && properties.minor >= 5))
     88             {
     89                 device = i;
     90                 printf("Running on GPU %d (%s)", i, properties.name);
     91                 break;
     92             }
     93             printf("Running on GPU %d (%s) does not support CUDA Dynamic Parallelism", i, properties.name);
     94         }
     95     }
     96     if (device == -1)
     97     {
     98         printf("required GPU with compute SM 3.5 or higher.");
     99         exit(EXIT_WAIVED);
    100     }
    101     cudaSetDevice(device);
    102 
    103     cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, max_depth);
    104     printf("Launching cdp_kernel() with CUDA Dynamic Parallelism:
    
    ");
    105     cdp_kernel << <2, 2 >> >(max_depth, 0, 0, -1);
    106     cudaGetLastError();
    107     cudaDeviceSynchronize();
    108 
    109     getchar();
    110     exit(EXIT_SUCCESS);
    111 }

    ● 输出结果:主机调用 2 个线程块,每个线程块 2 个线程,每个线程按同样规模递归调用,共 2*4 个二级核函数,2*4*4 个三级核函数,一共 42 个线程块

    CUDA Dynamic Parallelism
    Running on GPU 0 (GeForce GTX 1070)Launching cdp_kernel() with CUDA Dynamic Parallelism:
    
    BLOCK 0 launched by the host
    BLOCK 1 launched by the host
    +  BLOCK 2 launched by thread 0 of block 1
    +  BLOCK 3 launched by thread 0 of block 1
    +  BLOCK 4 launched by thread 0 of block 0
    +  BLOCK 5 launched by thread 0 of block 0
    +  +  BLOCK 10 launched by thread 0 of block 3
    +  +  BLOCK 11 launched by thread 0 of block 3
    +  +  BLOCK 7 launched by thread 0 of block 2
    +  +  BLOCK 6 launched by thread 0 of block 2
    +  +  BLOCK 12 launched by thread 0 of block 5
    +  +  BLOCK 13 launched by thread 0 of block 5
    +  +  BLOCK 8 launched by thread 0 of block 4
    +  +  BLOCK 9 launched by thread 0 of block 4
    +  +  BLOCK 15 launched by thread 1 of block 3
    +  +  BLOCK 14 launched by thread 1 of block 3
    +  +  BLOCK 19 launched by thread 1 of block 2
    +  +  BLOCK 16 launched by thread 1 of block 5
    +  +  BLOCK 17 launched by thread 1 of block 5
    +  +  BLOCK 18 launched by thread 1 of block 2
    +  +  BLOCK 21 launched by thread 1 of block 4
    +  +  BLOCK 20 launched by thread 1 of block 4
    +  BLOCK 22 launched by thread 1 of block 1
    +  BLOCK 23 launched by thread 1 of block 1
    +  BLOCK 24 launched by thread 1 of block 0
    +  BLOCK 25 launched by thread 1 of block 0
    +  +  BLOCK 28 launched by thread 0 of block 23
    +  +  BLOCK 27 launched by thread 0 of block 22
    +  +  BLOCK 29 launched by thread 0 of block 24
    +  +  BLOCK 26 launched by thread 0 of block 23
    +  +  BLOCK 31 launched by thread 0 of block 24
    +  +  BLOCK 30 launched by thread 0 of block 22
    +  +  BLOCK 33 launched by thread 0 of block 25
    +  +  BLOCK 32 launched by thread 0 of block 25
    +  +  BLOCK 34 launched by thread 1 of block 23
    +  +  BLOCK 35 launched by thread 1 of block 23
    +  +  BLOCK 36 launched by thread 1 of block 22
    +  +  BLOCK 37 launched by thread 1 of block 22
    +  +  BLOCK 38 launched by thread 1 of block 25
    +  +  BLOCK 39 launched by thread 1 of block 25
    +  +  BLOCK 40 launched by thread 1 of block 24
    +  +  BLOCK 41 launched by thread 1 of block 24

    ▶ 涨姿势:

    ● 在核函数中递归地调用核函数,注意函数调用的格式

    ▶ 源代码:动态并行实现快排算法,输出结果只有 Finish!

      1 #include <stdio.h>
      2 #include <cuda.h>
      3 #include <cuda_runtime.h>
      4 #include "device_launch_parameters.h"
      5 #include <helper_cuda.h> 
      6 #include <helper_string.h>
      7 
      8 #define MAX_DEPTH       16
      9 #define INSERTION_SORT  32
     10 
     11 __device__ void selection_sort(unsigned int *data, int left, int right) //选择排序,单线程完成
     12 {
     13     for (int i = left; i <= right; ++i)
     14     {
     15         unsigned min_val = data[i];
     16         int min_idx = i;
     17         for (int j = i + 1; j <= right; ++j)                            // 找最小元素及其下标
     18         {
     19             unsigned val_j = data[j];
     20             if (val_j < min_val)
     21             {
     22                 min_idx = j;
     23                 min_val = val_j;
     24             }
     25         }
     26         if (i != min_idx)                                               // 交换第 i 号元素到指定的位置上
     27         {
     28             data[min_idx] = data[i];
     29             data[i] = min_val;
     30         }
     31     }
     32 }
     33 
     34 __global__ void cdp_simple_quicksort(unsigned int *data, int left, int right, int depth)    // 快排主体,内含递归调用
     35 {
     36     if (depth >= MAX_DEPTH || right - left <= INSERTION_SORT)           // 递归深度达到 MAX_DEPTH 或者 数组中元素个数不多于 INSERTION_SORT 时使用选排
     37     {
     38         selection_sort(data, left, right);
     39         return;
     40     }
     41     unsigned int *lptr = data + left, *rptr = data + right, pivot = data[(left + right) / 2];
     42     while (lptr <= rptr)
     43     {
     44         unsigned int lval = *lptr, rval = *rptr;                        // 指定左指针指向的值和右指针指向的值
     45         while (lval < pivot)                                            // 递增左指针,等价于 lptr++; lval = *lptr;        
     46             lval = *(++lptr);
     47         while (rval > pivot)                                            // 递减右指针        
     48             rval = *(--rptr);
     49         if (lptr <= rptr)                                               // 交换左右指针指向的值
     50         {
     51             *lptr++ = rval;
     52             *rptr-- = lval;
     53         }
     54     }
     55     if (left < rptr - data)                                             // 将左右分区放到两个不同的流中
     56     {
     57         cudaStream_t s0;
     58         cudaStreamCreateWithFlags(&s0, cudaStreamNonBlocking);          // 指定该流不与 0 号流进行同步
     59         cdp_simple_quicksort << < 1, 1, 0, s0 >> >(data, left, rptr - data, depth + 1);
     60         cudaStreamDestroy(s0);
     61     }
     62     if (lptr - data < right)
     63     {
     64         cudaStream_t s1;
     65         cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
     66         cdp_simple_quicksort << < 1, 1, 0, s1 >> >(data, lptr - data, right, depth + 1);
     67         cudaStreamDestroy(s1);
     68     }
     69 }
     70 
     71 void run_qsort(unsigned int *data, unsigned int n)                      // 快排入口
     72 {
     73     cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, MAX_DEPTH);        // 设置最大递归深度    
     74     cdp_simple_quicksort << < 1, 1 >> >(data, 0, n - 1, 0);
     75     cudaDeviceSynchronize();
     76 }
     77 
     78 int main(int argc, char **argv)
     79 {
     80     cudaSetDevice(0);
     81     const int n = 1024;
     82 
     83     unsigned int *h_data = (unsigned int *)malloc(sizeof(unsigned int) * n);    
     84     srand(2047);
     85     for (unsigned i = 0; i < n; i++)
     86         h_data[i] = rand() % n;
     87 
     88 
     89     unsigned int *d_data;
     90     cudaMalloc((void **)&d_data, n * sizeof(unsigned int));
     91     cudaMemcpy(d_data, h_data, n * sizeof(unsigned int), cudaMemcpyHostToDevice);
     92 
     93     run_qsort(d_data, n);
     94     
     95     cudaMemcpy(h_data, d_data, n * sizeof(unsigned), cudaMemcpyDeviceToHost);
     96 
     97     for (int i = 1; i < n; ++i)
     98     {
     99         if (h_data[i - 1] > h_data[i])
    100         {
    101             printf("Error at i == %d, h_data[i-1] == %d, h_data[i] == %d
    ", h_data[i - 1], h_data[i]);
    102             break;
    103         }
    104     }
    105     printf("Finish!
    ");
    106 
    107     free(h_data);
    108     cudaFree(d_data);
    109     getchar();
    110     exit(EXIT_SUCCESS);
    111 }

    ▶ 新姿势:

    ● checkCmdLineFlag 用于检验函数参数 argv 是否等于字符串 string_ref(定义于 helper_string.h 中)

     1 inline bool checkCmdLineFlag(const int argc, const char **argv, const char *string_ref)
     2 {
     3     bool bFound = false;
     4     if (argc >= 1)
     5     {
     6         for (int i = 1; i < argc; i++)
     7         {
     8             int string_start = stringRemoveDelimiter('-', argv[i]);
     9             const char *string_argv = &argv[i][string_start], 
    10             const char*equal_pos = strchr(string_argv, '=');
    11             int argv_length = (int)(equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv);
    12             int length = (int)strlen(string_ref);
    13             if (length == argv_length && !STRNCASECMP(string_argv, string_ref, length))
    14             {
    15                 bFound = true;
    16                 continue;
    17             }
    18         }
    19     }
    20     return bFound;
    21 }
    22 
    23 inline int stringRemoveDelimiter(char delimiter, const char *string)    // 去除特定的符号,上述函数的中用于去除参数前面的 - 或 --
    24 {
    25     int string_start = 0;
    26     while (string[string_start] == delimiter)
    27         string_start++;
    28     if (string_start >= (int)strlen(string) - 1)
    29         return 0;
    30     return string_start;
    31 }
    32 
    33 #define STRNCASECMP _strnicmp                                           // 比较字符串(定义于string.h中)
    34 
    35 _ACRTIMP int __cdecl _strnicmp(_In_reads_or_z_(_MaxCount) char const* _String1, _In_reads_or_z_(_MaxCount) char const* _String2, _In_ size_t _MaxCount);

    ● getCmdLineArgumentInt 用于提取函数参数 argv 中的整数(定义于 helper_string.h 中)

     1 inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *string_ref)
     2 {
     3     bool bFound = false;
     4     int value = -1;
     5     if (argc >= 1)
     6     {
     7         for (int i = 1; i < argc; i++)
     8         {
     9             int string_start = stringRemoveDelimiter('-', argv[i]);
    10             const char *string_argv = &argv[i][string_start];
    11             int length = (int)strlen(string_ref);
    12             if (!STRNCASECMP(string_argv, string_ref, length))
    13             {
    14                 if (length + 1 <= (int)strlen(string_argv))
    15                 {
    16                     int auto_inc = (string_argv[length] == '=') ? 1 : 0;
    17                     value = atoi(&string_argv[length + auto_inc]);
    18                 }
    19                 else
    20                     value = 0;
    21                 bFound = true;
    22                 continue;
    23             }
    24         }
    25     }
    26     if (bFound)
    27         return value;
    28     return 0;
    29 }

    ● 指定最大递归深度

    extern __host__ cudaError_t CUDARTAPI cudaDeviceSetLimit(enum cudaLimit limit, size_t value);

    ● 带有标识符的 cudaStreamCreateWithFlags ,设置流的优先级

    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags);

    ■ 对比 cudaStreamCreate

    extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream);
  • 相关阅读:
    c#FileStream文件读写(转)
    mvc Razor 视图中找不到 ViewBag的定义
    JS正则表达式验证账号、手机号、电话和邮箱
    jquery each循环,
    $.grep(array, callback, [invert])过滤,常用
    arguments 对象
    有关Select option 元素
    MVC零基础学习整理(一)
    根据年月日算出当前日期是星期几
    C# winfrom 模拟ftp文件管理
  • 原文地址:https://www.cnblogs.com/cuancuancuanhao/p/7726441.html
Copyright © 2011-2022 走看看