▶ 协作组,CUDA 9.0 的新特性
▶ 源代码,如何获得协作组的编号?
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cooperative_groups.h>

#define THREAD_PER_BLOCK 64

using namespace cooperative_groups;    // cooperative groups live in this namespace

// Sum reduction over one cooperative group.
// Preconditions: g.size() is a power of two, and x points to shared memory
// with room for g.size() ints that is private to group g.
// Returns the group-wide sum on the thread whose rank in g is 0; every
// other thread returns -1.
__device__ int sumReduction(thread_group g, int *x, int val)
{
    int lane = g.thread_rank();    // rank of this thread inside the group ("lane ID")

    for (int i = g.size() / 2; i > 0; i /= 2)
    {
        x[lane] = val;    // first iteration: initialise; later iterations: publish previous partial sum
        g.sync();         // group barrier: make all writes to x visible before reading
        if (lane < i)
            val += x[lane + i];    // keep the running result in the per-thread local `val`
        g.sync();         // keep faster threads from overwriting x in the next iteration
    }
    if (g.thread_rank() == 0)    // only rank 0 carries the final result
        return val;
    else
        return -1;
}

// Kernel: reduce over the whole block, then over four 16-thread tiles.
// Launch with THREAD_PER_BLOCK threads and THREAD_PER_BLOCK * sizeof(int)
// bytes of dynamic shared memory.
__global__ void cgkernel()
{
    extern __shared__ int workspace[];    // dynamic shared memory, sized by the launch configuration

    thread_block group = this_thread_block();    // the whole thread block as one cooperative group
    int groupSize = group.size();                // number of threads in the group
    int input = group.thread_rank();             // use each thread's rank as its input value
    int output = sumReduction(group, workspace, input);    // reduce directly in shared memory
    int expectedOutput = (groupSize - 1) * groupSize / 2;  // 0 + 1 + ... + 63 = 2016

    if (group.thread_rank() == 0)    // thread 0 reports, then announces the four tiled reductions
    {
        printf(" Sum of thread 0 ~ %d in group is %d (expected %d) \n", group.size() - 1, output, expectedOutput);
        printf(" Now creating %d groups, each of size 16 threads: \n", group.size() / 16);
    }
    group.sync();    // block-wide barrier before workspace is reused by the tiles

    thread_block_tile<16> group16 = tiled_partition<16>(group);    // tile size must be a power of two (<= 32)

    int offset = group.thread_rank() - group16.thread_rank();    // shared-memory offset of this thread's tile
    printf("%d -> thread_rank = %d, group16.thread_rank = %d, offset = %d \n", threadIdx.x, group.thread_rank(), group16.thread_rank(), offset);
    // NOTE(review): in this run group.group_index() printed (0, 0, 0) and
    // group.thread_index() matched group.thread_rank().

    input = group16.thread_rank();    // rank within the new tile is the new input
    output = sumReduction(group16, workspace + offset, input);    // each tile reduces in its own slice of workspace
    expectedOutput = 15 * 16 / 2;    // 0 + 1 + ... + 15 = 120 (original comment wrongly said "...+ 16")

    if (group16.thread_rank() == 0)    // rank-0 thread of each tile reports
        printf(" Sum of all ranks 0..15 in group16 is %d (expected %d) \n", output, expectedOutput);
}

int main()
{
    printf(" Start with %d threads. \n", THREAD_PER_BLOCK);

    cgkernel<<<1, THREAD_PER_BLOCK, THREAD_PER_BLOCK * sizeof(int)>>>();

    // Kernel launches do not report errors directly: check the launch itself,
    // then synchronize to surface any in-kernel fault.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }

    printf(" Finish. \n");
    getchar();
    return 0;
}
● 输出结果
Start with 64 threads. Sum of thread 0 ~ 63 in group is 2016 (expected 2016) Now creating 4 groups, each of size 16 threads: 0 -> thread_rank = 0, group16.thread_rank = 0, offset = 0 1 -> thread_rank = 1, group16.thread_rank = 1, offset = 0 2 -> thread_rank = 2, group16.thread_rank = 2, offset = 0 3 -> thread_rank = 3, group16.thread_rank = 3, offset = 0 4 -> thread_rank = 4, group16.thread_rank = 4, offset = 0 5 -> thread_rank = 5, group16.thread_rank = 5, offset = 0 6 -> thread_rank = 6, group16.thread_rank = 6, offset = 0 7 -> thread_rank = 7, group16.thread_rank = 7, offset = 0 8 -> thread_rank = 8, group16.thread_rank = 8, offset = 0 9 -> thread_rank = 9, group16.thread_rank = 9, offset = 0 10 -> thread_rank = 10, group16.thread_rank = 10, offset = 0 11 -> thread_rank = 11, group16.thread_rank = 11, offset = 0 12 -> thread_rank = 12, group16.thread_rank = 12, offset = 0 13 -> thread_rank = 13, group16.thread_rank = 13, offset = 0 14 -> thread_rank = 14, group16.thread_rank = 14, offset = 0 15 -> thread_rank = 15, group16.thread_rank = 15, offset = 0 16 -> thread_rank = 16, group16.thread_rank = 0, offset = 16 17 -> thread_rank = 17, group16.thread_rank = 1, offset = 16 18 -> thread_rank = 18, group16.thread_rank = 2, offset = 16 19 -> thread_rank = 19, group16.thread_rank = 3, offset = 16 20 -> thread_rank = 20, group16.thread_rank = 4, offset = 16 21 -> thread_rank = 21, group16.thread_rank = 5, offset = 16 22 -> thread_rank = 22, group16.thread_rank = 6, offset = 16 23 -> thread_rank = 23, group16.thread_rank = 7, offset = 16 24 -> thread_rank = 24, group16.thread_rank = 8, offset = 16 25 -> thread_rank = 25, group16.thread_rank = 9, offset = 16 26 -> thread_rank = 26, group16.thread_rank = 10, offset = 16 27 -> thread_rank = 27, group16.thread_rank = 11, offset = 16 28 -> thread_rank = 28, group16.thread_rank = 12, offset = 16 29 -> thread_rank = 29, group16.thread_rank = 13, offset = 16 30 -> thread_rank = 30, group16.thread_rank = 14, offset = 16 31 -> 
thread_rank = 31, group16.thread_rank = 15, offset = 16 32 -> thread_rank = 32, group16.thread_rank = 0, offset = 32 33 -> thread_rank = 33, group16.thread_rank = 1, offset = 32 34 -> thread_rank = 34, group16.thread_rank = 2, offset = 32 35 -> thread_rank = 35, group16.thread_rank = 3, offset = 32 36 -> thread_rank = 36, group16.thread_rank = 4, offset = 32 37 -> thread_rank = 37, group16.thread_rank = 5, offset = 32 38 -> thread_rank = 38, group16.thread_rank = 6, offset = 32 39 -> thread_rank = 39, group16.thread_rank = 7, offset = 32 40 -> thread_rank = 40, group16.thread_rank = 8, offset = 32 41 -> thread_rank = 41, group16.thread_rank = 9, offset = 32 42 -> thread_rank = 42, group16.thread_rank = 10, offset = 32 43 -> thread_rank = 43, group16.thread_rank = 11, offset = 32 44 -> thread_rank = 44, group16.thread_rank = 12, offset = 32 45 -> thread_rank = 45, group16.thread_rank = 13, offset = 32 46 -> thread_rank = 46, group16.thread_rank = 14, offset = 32 47 -> thread_rank = 47, group16.thread_rank = 15, offset = 32 48 -> thread_rank = 48, group16.thread_rank = 0, offset = 48 49 -> thread_rank = 49, group16.thread_rank = 1, offset = 48 50 -> thread_rank = 50, group16.thread_rank = 2, offset = 48 51 -> thread_rank = 51, group16.thread_rank = 3, offset = 48 52 -> thread_rank = 52, group16.thread_rank = 4, offset = 48 53 -> thread_rank = 53, group16.thread_rank = 5, offset = 48 54 -> thread_rank = 54, group16.thread_rank = 6, offset = 48 55 -> thread_rank = 55, group16.thread_rank = 7, offset = 48 56 -> thread_rank = 56, group16.thread_rank = 8, offset = 48 57 -> thread_rank = 57, group16.thread_rank = 9, offset = 48 58 -> thread_rank = 58, group16.thread_rank = 10, offset = 48 59 -> thread_rank = 59, group16.thread_rank = 11, offset = 48 60 -> thread_rank = 60, group16.thread_rank = 12, offset = 48 61 -> thread_rank = 61, group16.thread_rank = 13, offset = 48 62 -> thread_rank = 62, group16.thread_rank = 14, offset = 48 63 -> thread_rank = 63, 
group16.thread_rank = 15, offset = 48 Sum of all ranks 0..15 in group16 is 120 (expected 120) Sum of all ranks 0..15 in group16 is 120 (expected 120) Sum of all ranks 0..15 in group16 is 120 (expected 120) Sum of all ranks 0..15 in group16 is 120 (expected 120) Finish.
▶ 涨姿势:
● 相关定义
// cooperative_groups_helper.h
# if !defined(_CG_QUALIFIER)
# define _CG_QUALIFIER __forceinline__ __device__
# endif

# define die() assert(0);

// cooperative_groups.h (declarations reordered for presentation)
class thread_group    // generic thread-group type
{
    friend _CG_QUALIFIER thread_group this_thread();
    friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
    friend class thread_block;

protected:
    union __align__(8)
    {
        unsigned int type : 8;
        struct
        {
            unsigned int type : 8;
            unsigned int size : 24;
            unsigned int mask;
        } coalesced;
        struct
        {
            void* ptr[2];
        } buffer;
    } _data;

    _CG_QUALIFIER thread_group operator=(const thread_group& src);

    _CG_QUALIFIER thread_group(__internal::groupType type)
    {
        _data.type = type;
    }
#if __cplusplus >= 201103L
    static_assert(sizeof(_data) == 16, "Failed size check");
#endif

public:
    _CG_QUALIFIER unsigned int size() const;
    _CG_QUALIFIER unsigned int thread_rank() const;
    _CG_QUALIFIER void sync() const;
};

class thread_block : public thread_group
{
    friend _CG_QUALIFIER thread_block this_thread_block();
    friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
    friend _CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz);

    _CG_QUALIFIER thread_block() : thread_group(__internal::ThreadBlock) {}

    // Build a coalesced tile of tilesz threads (tilesz must be a power of two, <= 32).
    _CG_QUALIFIER thread_group _get_tiled_threads(unsigned int tilesz) const
    {
        const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0);

        if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz)
        {
            die();
            return (thread_block());
        }

        unsigned int mask;
        unsigned int base_offset = thread_rank() & (~(tilesz - 1));
        unsigned int masklength = min(size() - base_offset, tilesz);
        mask = (unsigned int)(-1) >> (32 - masklength);
        mask <<= (__internal::laneid() & ~(tilesz - 1));
        thread_group tile = thread_group(__internal::CoalescedTile);
        tile._data.coalesced.mask = mask;
        tile._data.coalesced.size = __popc(mask);
        return (tile);
    }

public:
    _CG_QUALIFIER void sync() const { __internal::cta::sync(); }
    _CG_QUALIFIER unsigned int size() const { return (__internal::cta::size()); }
    _CG_QUALIFIER unsigned int thread_rank() const { return (__internal::cta::thread_rank()); }
    _CG_QUALIFIER dim3 group_index() const { return (__internal::cta::group_index()); }
    _CG_QUALIFIER dim3 thread_index() const { return (__internal::cta::thread_index()); }
};

// Used by the sample code above; simply invokes the thread_block constructor.
_CG_QUALIFIER thread_block this_thread_block()
{
    return (thread_block());
}

template <unsigned int Size>
class thread_block_tile;
template <> class thread_block_tile<32> : public __thread_block_tile_base<32> { };
template <> class thread_block_tile<16> : public __thread_block_tile_base<16> { };
template <> class thread_block_tile<8> : public __thread_block_tile_base<8> { };
template <> class thread_block_tile<4> : public __thread_block_tile_base<4> { };
template <> class thread_block_tile<2> : public __thread_block_tile_base<2> { };
template <> class thread_block_tile<1> : public __thread_block_tile_base<1> { };

template <unsigned int Size>
class __thread_block_tile_base : public thread_group
{
    static const unsigned int numThreads = Size;

    // Lane mask of the numThreads lanes belonging to this tile within the warp.
    _CG_QUALIFIER unsigned int build_mask() const
    {
        unsigned int mask;
        if (numThreads == 32)
            mask = 0xFFFFFFFF;
        else
        {
            mask = (unsigned int)(-1) >> (32 - numThreads);
            mask <<= (__internal::laneid() & (~(numThreads - 1)));
        }
        return (mask);
    }

protected:
    _CG_QUALIFIER __thread_block_tile_base() : thread_group(__internal::CoalescedTile)
    {
        _data.coalesced.mask = build_mask();
        _data.coalesced.size = numThreads;
    }

public:
    _CG_QUALIFIER void sync() const { __syncwarp(build_mask()); }
    _CG_QUALIFIER unsigned int thread_rank() const { return (threadIdx.x & (numThreads - 1)); }
    _CG_QUALIFIER unsigned int size() const { return (numThreads); }

    // PTX supported collectives
    _CG_QUALIFIER int shfl(int var, int srcRank) const { return (__shfl_sync(build_mask(), var, srcRank, numThreads)); }
    ...

#ifdef _CG_HAS_FP16_COLLECTIVE
    _CG_QUALIFIER __half shfl(__half var, int srcRank) const { return (__shfl_sync(build_mask(), var, srcRank, numThreads)); }
    ...

#endif

#ifdef _CG_HAS_MATCH_COLLECTIVE
    _CG_QUALIFIER unsigned int match_any(int val) const
    {
        unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    ...
#endif
};
● 用到的线程协作相关函数
thread_block threadBlockGroup = this_thread_block();    // wrap the current thread block as one cooperative group

thread_block_tile<16> tiledPartition16 = tiled_partition<16>(threadBlockGroup);    // partition the group into 16-thread tiles

int in = tiledPartition16.thread_rank();    // rank of this thread within its tile

tiledPartition16.sync();    // synchronize the tile