  • 0_Simple__simpleCooperativeGroups

    ▶ Cooperative groups, a new feature introduced in CUDA 9.0.

    ▶ Source code: how to obtain a thread's rank within a cooperative group?

    #include <stdio.h>
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #include <cooperative_groups.h>

    #define THREAD_PER_BLOCK 64

    using namespace cooperative_groups;                             // note the namespace

    __device__ int sumReduction(thread_group g, int *x, int val)    // reduction device function; the shared-memory buffer x must hold at least g.size() elements
    {
        int lane = g.thread_rank();                                 // the thread's rank within the group; the tutorial calls this the lane ID

        for (int i = g.size() / 2; i > 0; i /= 2)
        {
            x[lane] = val;                                          // on the first iteration this is initialization; on later iterations it stores the previous partial result
            g.sync();                                               // group-wide synchronization
            if (lane < i)
                val += x[lane + i];                                 // each thread keeps its running result in the local variable val
            g.sync();
        }
        if (g.thread_rank() == 0)                                   // thread 0 returns the result
            return val;
        else
            return -1;
    }

    __global__ void cgkernel()
    {
        extern __shared__ int workspace[];

        thread_block group = this_thread_block();                   // wrap all threads of the block into one cooperative group
        int groupSize = group.size();                               // group size (number of threads)
        int input = group.thread_rank();                            // the thread's rank within the group, used as its input value
        int output = sumReduction(group, workspace, input);         // reduction, using shared memory directly as the workspace
        int expectedOutput = (groupSize - 1) * groupSize / 2;       // expected result: 0 + 1 + 2 + ... + 63 = 2016

        if (group.thread_rank() == 0)                               // thread 0 reports the result and announces the work on the 4 new groups
        {
            printf("\n\tSum of thread 0 ~ %d in group is %d (expected %d)\n", group.size() - 1, output, expectedOutput);
            printf("\n\tNow creating %d groups, each of size 16 threads:\n", group.size() / 16);
        }
        group.sync();                                               // group-wide synchronization

        thread_block_tile<16> group16 = tiled_partition<16>(group); // partition the block into 16-thread tiles (tile size must be a power of 2, at most 32)

        int offset = group.thread_rank() - group16.thread_rank();   // shared-memory offset used by each tile
        printf("%d -> thread_rank = %d, group16.thread_rank = %d, offset = %d\n", threadIdx.x, group.thread_rank(), group16.thread_rank(), offset);
        // dim3 group.group_index() prints as (0, 0, 0); dim3 group.thread_index() prints the same values as group.thread_rank()

        input = group16.thread_rank();                              // the thread's rank within the new tile, used as its input value
        output = sumReduction(group16, workspace + offset, input);  // reduction, note the offset into the workspace
        expectedOutput = 15 * 16 / 2;                               // expected result: 0 + 1 + 2 + ... + 15 = 120

        if (group16.thread_rank() == 0)                             // rank 0 of each tile reports its result
            printf("\n\tSum of all ranks 0..15 in group16 is %d (expected %d)\n", output, expectedOutput);
        return;
    }

    int main()
    {
        printf("\n\tStart with %d threads.\n", THREAD_PER_BLOCK);

        cgkernel << <1, THREAD_PER_BLOCK, THREAD_PER_BLOCK * sizeof(int) >> > ();
        cudaDeviceSynchronize();

        printf("\n\tFinish.\n");
        getchar();
        return 0;
    }
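
    The sample's main does not check whether the launch succeeded. A common pattern (my addition, not part of the sample) is to query cudaGetLastError right after the launch and to check the value returned by cudaDeviceSynchronize:

    // Optional error checking around the launch (not in the original sample)
    cgkernel << <1, THREAD_PER_BLOCK, THREAD_PER_BLOCK * sizeof(int) >> > ();
    cudaError_t err = cudaGetLastError();                  // catches launch/configuration errors
    if (err != cudaSuccess)
        printf("launch failed: %s\n", cudaGetErrorString(err));
    err = cudaDeviceSynchronize();                         // catches errors raised while the kernel runs
    if (err != cudaSuccess)
        printf("kernel failed: %s\n", cudaGetErrorString(err));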

    ● Output

            Start with 64 threads.
    
            Sum of thread 0 ~ 63 in group is 2016 (expected 2016)
    
            Now creating 4 groups, each of size 16 threads:
    0 -> thread_rank = 0, group16.thread_rank = 0, offset = 0
    1 -> thread_rank = 1, group16.thread_rank = 1, offset = 0
    2 -> thread_rank = 2, group16.thread_rank = 2, offset = 0
    3 -> thread_rank = 3, group16.thread_rank = 3, offset = 0
    4 -> thread_rank = 4, group16.thread_rank = 4, offset = 0
    5 -> thread_rank = 5, group16.thread_rank = 5, offset = 0
    6 -> thread_rank = 6, group16.thread_rank = 6, offset = 0
    7 -> thread_rank = 7, group16.thread_rank = 7, offset = 0
    8 -> thread_rank = 8, group16.thread_rank = 8, offset = 0
    9 -> thread_rank = 9, group16.thread_rank = 9, offset = 0
    10 -> thread_rank = 10, group16.thread_rank = 10, offset = 0
    11 -> thread_rank = 11, group16.thread_rank = 11, offset = 0
    12 -> thread_rank = 12, group16.thread_rank = 12, offset = 0
    13 -> thread_rank = 13, group16.thread_rank = 13, offset = 0
    14 -> thread_rank = 14, group16.thread_rank = 14, offset = 0
    15 -> thread_rank = 15, group16.thread_rank = 15, offset = 0
    16 -> thread_rank = 16, group16.thread_rank = 0, offset = 16
    17 -> thread_rank = 17, group16.thread_rank = 1, offset = 16
    18 -> thread_rank = 18, group16.thread_rank = 2, offset = 16
    19 -> thread_rank = 19, group16.thread_rank = 3, offset = 16
    20 -> thread_rank = 20, group16.thread_rank = 4, offset = 16
    21 -> thread_rank = 21, group16.thread_rank = 5, offset = 16
    22 -> thread_rank = 22, group16.thread_rank = 6, offset = 16
    23 -> thread_rank = 23, group16.thread_rank = 7, offset = 16
    24 -> thread_rank = 24, group16.thread_rank = 8, offset = 16
    25 -> thread_rank = 25, group16.thread_rank = 9, offset = 16
    26 -> thread_rank = 26, group16.thread_rank = 10, offset = 16
    27 -> thread_rank = 27, group16.thread_rank = 11, offset = 16
    28 -> thread_rank = 28, group16.thread_rank = 12, offset = 16
    29 -> thread_rank = 29, group16.thread_rank = 13, offset = 16
    30 -> thread_rank = 30, group16.thread_rank = 14, offset = 16
    31 -> thread_rank = 31, group16.thread_rank = 15, offset = 16
    32 -> thread_rank = 32, group16.thread_rank = 0, offset = 32
    33 -> thread_rank = 33, group16.thread_rank = 1, offset = 32
    34 -> thread_rank = 34, group16.thread_rank = 2, offset = 32
    35 -> thread_rank = 35, group16.thread_rank = 3, offset = 32
    36 -> thread_rank = 36, group16.thread_rank = 4, offset = 32
    37 -> thread_rank = 37, group16.thread_rank = 5, offset = 32
    38 -> thread_rank = 38, group16.thread_rank = 6, offset = 32
    39 -> thread_rank = 39, group16.thread_rank = 7, offset = 32
    40 -> thread_rank = 40, group16.thread_rank = 8, offset = 32
    41 -> thread_rank = 41, group16.thread_rank = 9, offset = 32
    42 -> thread_rank = 42, group16.thread_rank = 10, offset = 32
    43 -> thread_rank = 43, group16.thread_rank = 11, offset = 32
    44 -> thread_rank = 44, group16.thread_rank = 12, offset = 32
    45 -> thread_rank = 45, group16.thread_rank = 13, offset = 32
    46 -> thread_rank = 46, group16.thread_rank = 14, offset = 32
    47 -> thread_rank = 47, group16.thread_rank = 15, offset = 32
    48 -> thread_rank = 48, group16.thread_rank = 0, offset = 48
    49 -> thread_rank = 49, group16.thread_rank = 1, offset = 48
    50 -> thread_rank = 50, group16.thread_rank = 2, offset = 48
    51 -> thread_rank = 51, group16.thread_rank = 3, offset = 48
    52 -> thread_rank = 52, group16.thread_rank = 4, offset = 48
    53 -> thread_rank = 53, group16.thread_rank = 5, offset = 48
    54 -> thread_rank = 54, group16.thread_rank = 6, offset = 48
    55 -> thread_rank = 55, group16.thread_rank = 7, offset = 48
    56 -> thread_rank = 56, group16.thread_rank = 8, offset = 48
    57 -> thread_rank = 57, group16.thread_rank = 9, offset = 48
    58 -> thread_rank = 58, group16.thread_rank = 10, offset = 48
    59 -> thread_rank = 59, group16.thread_rank = 11, offset = 48
    60 -> thread_rank = 60, group16.thread_rank = 12, offset = 48
    61 -> thread_rank = 61, group16.thread_rank = 13, offset = 48
    62 -> thread_rank = 62, group16.thread_rank = 14, offset = 48
    63 -> thread_rank = 63, group16.thread_rank = 15, offset = 48
    
            Sum of all ranks 0..15 in group16 is 120 (expected 120)
    
            Sum of all ranks 0..15 in group16 is 120 (expected 120)
    
            Sum of all ranks 0..15 in group16 is 120 (expected 120)
    
            Sum of all ranks 0..15 in group16 is 120 (expected 120)
    
            Finish.
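
    Incidentally, the per-tile shared-memory offset printed above is computed by hand as group.thread_rank() - group16.thread_rank(). If I remember correctly, newer CUDA toolkits (11.x and later) let a tile report its own position inside the parent group via meta_group_rank(), which gives the same value. A minimal sketch under that assumption (the kernel name offsetDemoKernel is made up):

    #include <cstdio>
    #include "cuda_runtime.h"
    #include <cooperative_groups.h>

    using namespace cooperative_groups;

    __global__ void offsetDemoKernel()
    {
        thread_block block = this_thread_block();
        thread_block_tile<16> tile16 = tiled_partition<16>(block);

        int offsetA = block.thread_rank() - tile16.thread_rank();   // the offset used in the sample above
        int offsetB = tile16.meta_group_rank() * tile16.size();     // rank of this tile inside the block, times the tile size (assumed CUDA 11+)

        if (tile16.thread_rank() == 0)
            printf("tile %u: offsetA = %d, offsetB = %d\n", tile16.meta_group_rank(), offsetA, offsetB);
    }

    int main()
    {
        offsetDemoKernel<<<1, 64>>>();
        cudaDeviceSynchronize();
        return 0;
    }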

    ▶ Takeaways:

    ● Related definitions

    // cooperative_groups_helper.h
    # if !defined(_CG_QUALIFIER)
    #  define _CG_QUALIFIER __forceinline__ __device__
    # endif

    # define die() assert(0);

    // cooperative_groups.h (order rearranged)
    class thread_group                      // generic thread-group type
    {
        friend _CG_QUALIFIER thread_group this_thread();
        friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
        friend class thread_block;

    protected:
        union __align__(8)
        {
            unsigned int type : 8;
            struct
            {
                unsigned int type : 8;
                unsigned int size : 24;
                unsigned int mask;
            } coalesced;
            struct
            {
                void* ptr[2];
            } buffer;
        } _data;

        _CG_QUALIFIER thread_group operator=(const thread_group& src);

        _CG_QUALIFIER thread_group(__internal::groupType type)
        {
            _data.type = type;
        }
    #if __cplusplus >= 201103L
        static_assert(sizeof(_data) == 16, "Failed size check");
    #endif

    public:
        _CG_QUALIFIER unsigned int size() const;
        _CG_QUALIFIER unsigned int thread_rank() const;
        _CG_QUALIFIER void sync() const;
    };

    class thread_block : public thread_group
    {
        friend _CG_QUALIFIER thread_block this_thread_block();
        friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
        friend _CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz);

        _CG_QUALIFIER thread_block() : thread_group(__internal::ThreadBlock) {}

        _CG_QUALIFIER thread_group _get_tiled_threads(unsigned int tilesz) const
        {
            const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0);

            if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz)
            {
                die();
                return (thread_block());
            }

            unsigned int mask;
            unsigned int base_offset = thread_rank() & (~(tilesz - 1));
            unsigned int masklength = min(size() - base_offset, tilesz);
            mask = (unsigned int)(-1) >> (32 - masklength);
            mask <<= (__internal::laneid() & ~(tilesz - 1));
            thread_group tile = thread_group(__internal::CoalescedTile);
            tile._data.coalesced.mask = mask;
            tile._data.coalesced.size = __popc(mask);
            return (tile);
        }

    public:
        _CG_QUALIFIER void sync() const { __internal::cta::sync(); }
        _CG_QUALIFIER unsigned int size() const { return (__internal::cta::size()); }
        _CG_QUALIFIER unsigned int thread_rank() const { return (__internal::cta::thread_rank()); }
        _CG_QUALIFIER dim3 group_index() const { return (__internal::cta::group_index()); }
        _CG_QUALIFIER dim3 thread_index() const { return (__internal::cta::thread_index()); }
    };

    _CG_QUALIFIER thread_block this_thread_block()  // used in the sample code; it simply calls the thread_block constructor
    {
        return (thread_block());
    }

    template <unsigned int Size>
    class thread_block_tile;
    template <> class thread_block_tile<32> : public __thread_block_tile_base<32> { };
    template <> class thread_block_tile<16> : public __thread_block_tile_base<16> { };
    template <> class thread_block_tile<8> : public __thread_block_tile_base<8> { };
    template <> class thread_block_tile<4> : public __thread_block_tile_base<4> { };
    template <> class thread_block_tile<2> : public __thread_block_tile_base<2> { };
    template <> class thread_block_tile<1> : public __thread_block_tile_base<1> { };

    template <unsigned int Size>
    class __thread_block_tile_base : public thread_group
    {
        static const unsigned int numThreads = Size;
        _CG_QUALIFIER unsigned int build_mask() const
        {
            unsigned int mask;
            if (numThreads == 32)
                mask = 0xFFFFFFFF;
            else
            {
                mask = (unsigned int)(-1) >> (32 - numThreads);
                mask <<= (__internal::laneid() & (~(numThreads - 1)));
            }
            return (mask);
        }

    protected:
        _CG_QUALIFIER __thread_block_tile_base() : thread_group(__internal::CoalescedTile)
        {
            _data.coalesced.mask = build_mask();
            _data.coalesced.size = numThreads;
        }

    public:
        _CG_QUALIFIER void sync() const { __syncwarp(build_mask()); }
        _CG_QUALIFIER unsigned int thread_rank() const { return (threadIdx.x & (numThreads - 1)); }
        _CG_QUALIFIER unsigned int size() const { return (numThreads); }

        // PTX supported collectives
        _CG_QUALIFIER int shfl(int var, int srcRank) const { return (__shfl_sync(build_mask(), var, srcRank, numThreads)); }
        ...

    #ifdef _CG_HAS_FP16_COLLECTIVE
        _CG_QUALIFIER __half shfl(__half var, int srcRank) const { return (__shfl_sync(build_mask(), var, srcRank, numThreads)); }
        ...

    #endif

    #ifdef _CG_HAS_MATCH_COLLECTIVE
        _CG_QUALIFIER unsigned int match_any(int val) const
        {
            unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val);
            return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
        }
        ...
    #endif
    };
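
    The shfl collective shown in __thread_block_tile_base above (and its elided siblings such as shfl_down) allows a reduction that needs no shared-memory workspace at all. Below is a minimal sketch of that pattern; it is my own illustration rather than part of the SDK sample, and the names tileReduce and shflReduceKernel are made up:

    #include <cstdio>
    #include "cuda_runtime.h"
    #include <cooperative_groups.h>

    using namespace cooperative_groups;

    // Tile-level reduction built on the shfl_down collective of thread_block_tile<32>;
    // unlike the shared-memory version in the sample, no workspace array is needed.
    __device__ int tileReduce(thread_block_tile<32> tile, int val)
    {
        for (int offset = tile.size() / 2; offset > 0; offset /= 2)
            val += tile.shfl_down(val, offset);   // after the loop, only rank 0 of the tile holds the full sum
        return val;
    }

    __global__ void shflReduceKernel()
    {
        thread_block block = this_thread_block();
        thread_block_tile<32> tile32 = tiled_partition<32>(block);

        int sum = tileReduce(tile32, (int)block.thread_rank());
        if (tile32.thread_rank() == 0)
            printf("tile starting at thread %d: sum = %d\n",
                   (int)(block.thread_rank() - tile32.thread_rank()), sum);
    }

    int main()
    {
        shflReduceKernel<<<1, 64>>>();            // 64 threads -> two 32-thread tiles, sums 496 and 1520
        cudaDeviceSynchronize();
        return 0;
    }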

    ● Cooperative-group functions used in the sample

    thread_block threadBlockGroup = this_thread_block();                            // wrap the current thread block as a cooperative group

    thread_block_tile<16> tiledPartition16 = tiled_partition<16>(threadBlockGroup); // partition the group into 16-thread tiles

    int in = tiledPartition16.thread_rank();                                        // the thread's rank within the tile

    tiledPartition16.sync();                                                        // tile-wide synchronization
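
    For comparison, here is a minimal sketch (my own, not part of the sample) using the non-templated tiled_partition declared in the header excerpt above. It returns a plain thread_group, which the sample's sumReduction(thread_group, int*, int) accepts just as well, at the cost of losing the shfl/match collectives that only thread_block_tile provides. It assumes it is appended to the sample file so that sumReduction is in scope, and that the kernel is launched with THREAD_PER_BLOCK * sizeof(int) bytes of dynamic shared memory, as in the sample's main:

    // Hypothetical kernel name; reuses sumReduction and the dynamic shared-memory launch from the sample.
    __global__ void genericTileKernel()
    {
        extern __shared__ int workspace[];

        thread_block block = this_thread_block();
        thread_group  tile = tiled_partition(block, 16);           // tile size must be a power of 2 and at most 32

        int offset = block.thread_rank() - tile.thread_rank();     // same per-tile workspace offset as in the sample
        int sum    = sumReduction(tile, workspace + offset, (int)tile.thread_rank());

        if (tile.thread_rank() == 0)
            printf("generic tile starting at thread %d: sum = %d (expected 120)\n", offset, sum);
    }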
  • Original post: https://www.cnblogs.com/cuancuancuanhao/p/7881093.html