▶ CUDA 动态并行实现快排算法(单线程的递归调用)
▶ 源代码:利用动态并行递归地启动线程块。要点:编译时需为 nvcc 添加 -rdc=true 选项(生成可重定位设备代码 relocatable device code,即启用分离编译),并在链接时加入设备运行时库 cudadevrt.lib(动态并行所必需,它不同于运行时库 cudart.lib,两者需同时链接)
1 #include <stdio.h> 2 #include <cuda.h> 3 #include <cuda_runtime.h> 4 #include "device_launch_parameters.h" 5 #include <helper_cuda.h> 6 #include <helper_string.h> 7 8 __device__ int g_blockId = 0; // 线程块的全局编号,供所有线程读写 9 10 __device__ void print_info(int depth, int blockId, int parent_threadId, int parent_blockId) // 打印当前线程块信息,包括深度,当前块号, 11 { 12 if (threadIdx.x == 0) 13 { 14 if (depth == 0) 15 printf("BLOCK %d launched by the host ", blockId); 16 else 17 { 18 char buffer[32]; 19 for (int i = 0; i < depth; ++i) // 对应更多层级,每层前面都有相应层数的 "| " 20 { 21 buffer[3 * i + 0] = '+'; 22 buffer[3 * i + 1] = ' '; 23 buffer[3 * i + 2] = ' '; 24 } 25 buffer[3 * depth] = '