题目:在长方形布局中,每个线程块在X轴方向上开启32个线程,在Y轴方向上开启4个线程;线程网格在X轴方向上有1个线程块,在Y轴方向上有4个线程块。计算每个线程在X轴方向和Y轴方向上的线程块索引、线程索引等信息。
/*
 * 2D thread-indexing demonstration.
 *
 * A single kernel records, for every launched thread, its block index,
 * its thread index, its flattened global index and the launch dimensions.
 * The host launches that kernel twice over the same 32 x 16 element buffers,
 * first as 1x4 blocks of 32x4 threads, then as 2x2 blocks of 16x8 threads,
 * and prints what each thread recorded.
 */
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>

/*
 * Evaluate a CUDA runtime call; on any result other than cudaSuccess print
 * the source location and the runtime's error string, then terminate.
 */
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        const cudaError_t cuda_check_err_ = (call);                         \
        if (cuda_check_err_ != cudaSuccess) {                               \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,          \
                    __LINE__, cudaGetErrorString(cuda_check_err_));         \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

/*
 * Record per-thread identification data for a 2D launch.
 *
 * Each thread computes its global x index (idx), global y index (idy) and a
 * flattened index thread_idx = (gridDim.x * blockDim.x) * idy + idx, then
 * writes one element at thread_idx into every output array:
 *
 *   block_x, block_y      - blockIdx.x, blockIdx.y
 *   thread                - threadIdx.x (threadIdx.y is not stored)
 *   calc_thread           - thread_idx itself
 *   x_thread, y_thread    - idx, idy
 *   grid_dimx, grid_dimy  - gridDim.x, gridDim.y
 *   block_dimx, block_dimy- blockDim.x, blockDim.y
 *
 * There is no bounds guard: every array must hold at least
 * (gridDim.x * blockDim.x) * (gridDim.y * blockDim.y) elements, and the
 * caller is responsible for choosing a launch shape that satisfies this.
 */
__global__ void what_is_my_id_2d_A(
    unsigned int * const block_x,
    unsigned int * const block_y,
    unsigned int * const thread,
    unsigned int * const calc_thread,
    unsigned int * const x_thread,
    unsigned int * const y_thread,
    unsigned int * const grid_dimx,
    unsigned int * const block_dimx,
    unsigned int * const grid_dimy,
    unsigned int * const block_dimy)
{
    const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y;
    /* Row-major flattening: one row is the full grid width in threads. */
    const unsigned int thread_idx = (gridDim.x * blockDim.x) * idy + idx;

    block_x[thread_idx] = blockIdx.x;
    block_y[thread_idx] = blockIdx.y;
    thread[thread_idx] = threadIdx.x;
    calc_thread[thread_idx] = thread_idx;
    x_thread[thread_idx] = idx;
    y_thread[thread_idx] = idy;
    grid_dimx[thread_idx] = gridDim.x;
    block_dimx[thread_idx] = blockDim.x;
    grid_dimy[thread_idx] = gridDim.y;
    block_dimy[thread_idx] = blockDim.y;
}

#define ARRAY_SIZE_X 32
#define ARRAY_SIZE_Y 16
#define ARRAY_SIZE_IN_BYTES ((ARRAY_SIZE_X) * (ARRAY_SIZE_Y) * sizeof(unsigned int))

/* Host-side copies of the kernel outputs, indexed [y][x]. */
unsigned int cpu_block_x[ARRAY_SIZE_Y][ARRAY_SIZE_X];
unsigned int cpu_block_y[ARRAY_SIZE_Y][ARRAY_SIZE_X];
unsigned int cpu_thread[ARRAY_SIZE_Y][ARRAY_SIZE_X];
/* Not read or written anywhere in this program; kept because it has
 * external linkage and may be referenced from another translation unit. */
unsigned int cpu_warp[ARRAY_SIZE_Y][ARRAY_SIZE_X];
unsigned int cpu_calc_thread[ARRAY_SIZE_Y][ARRAY_SIZE_X];
unsigned int cpu_xthread[ARRAY_SIZE_Y][ARRAY_SIZE_X];
unsigned int cpu_ythread[ARRAY_SIZE_Y][ARRAY_SIZE_X];
unsigned int cpu_grid_dimx[ARRAY_SIZE_Y][ARRAY_SIZE_X];
unsigned int cpu_block_dimx[ARRAY_SIZE_Y][ARRAY_SIZE_X];
unsigned int cpu_grid_dimy[ARRAY_SIZE_Y][ARRAY_SIZE_X];
unsigned int cpu_block_dimy[ARRAY_SIZE_Y][ARRAY_SIZE_X];

/*
 * Terminate unless the launch shape spans exactly ARRAY_SIZE_X threads in x
 * and ARRAY_SIZE_Y threads in y. The kernel has no bounds guard, so a larger
 * launch would write past the buffers, and a different width would not match
 * the [y][x] layout the host uses when printing.
 */
static void require_exact_coverage(const dim3 &grid, const dim3 &block)
{
    const unsigned int width = grid.x * block.x;
    const unsigned int height = grid.y * block.y;
    if (width != ARRAY_SIZE_X || height != ARRAY_SIZE_Y) {
        fprintf(stderr,
                "Launch covers %u x %u threads but the buffers are %d x %d\n",
                width, height, ARRAY_SIZE_X, ARRAY_SIZE_Y);
        exit(EXIT_FAILURE);
    }
}

/*
 * Allocate the device buffers, run the kernel once per launch shape, copy
 * the results back and print one record per element, then release the
 * device buffers. Returns 0 on success; any CUDA failure terminates the
 * process via CUDA_CHECK.
 */
int main(void)
{
    /* Launch 0: rectangular layout; launch 1: square layout. */
    const dim3 grid_shapes[] = { dim3(1, 4), dim3(2, 2) };
    const dim3 block_shapes[] = { dim3(32, 4), dim3(16, 8) };
    const int num_launches = (int)(sizeof(grid_shapes) / sizeof(grid_shapes[0]));

    unsigned int *gpu_block_x = NULL;
    unsigned int *gpu_block_y = NULL;
    unsigned int *gpu_thread = NULL;
    unsigned int *gpu_calc_thread = NULL;
    unsigned int *gpu_xthread = NULL;
    unsigned int *gpu_ythread = NULL;
    unsigned int *gpu_grid_dimx = NULL;
    unsigned int *gpu_block_dimx = NULL;
    unsigned int *gpu_grid_dimy = NULL;
    unsigned int *gpu_block_dimy = NULL;

    /* Parallel tables pairing each device buffer with its host array, so
     * allocation, copy-back and release can each be a single loop. */
    unsigned int **const gpu_slots[] = {
        &gpu_block_x,    &gpu_block_y,   &gpu_thread,    &gpu_calc_thread,
        &gpu_xthread,    &gpu_ythread,   &gpu_grid_dimx, &gpu_block_dimx,
        &gpu_grid_dimy,  &gpu_block_dimy
    };
    void *const cpu_arrays[] = {
        cpu_block_x,     cpu_block_y,    cpu_thread,     cpu_calc_thread,
        cpu_xthread,     cpu_ythread,    cpu_grid_dimx,  cpu_block_dimx,
        cpu_grid_dimy,   cpu_block_dimy
    };
    const int num_buffers = (int)(sizeof(gpu_slots) / sizeof(gpu_slots[0]));

    for (int i = 0; i < num_buffers; i++) {
        CUDA_CHECK(cudaMalloc((void **)gpu_slots[i], ARRAY_SIZE_IN_BYTES));
    }

    for (int kernel = 0; kernel < num_launches; kernel++) {
        require_exact_coverage(grid_shapes[kernel], block_shapes[kernel]);

        what_is_my_id_2d_A<<<grid_shapes[kernel], block_shapes[kernel]>>>(
            gpu_block_x, gpu_block_y, gpu_thread, gpu_calc_thread,
            gpu_xthread, gpu_ythread, gpu_grid_dimx, gpu_block_dimx,
            gpu_grid_dimy, gpu_block_dimy);
        /* A launch does not return a status: query it explicitly, then wait
         * so an execution fault is reported here rather than by a later copy. */
        CUDA_CHECK(cudaGetLastError());
        CUDA_CHECK(cudaDeviceSynchronize());

        for (int i = 0; i < num_buffers; i++) {
            CUDA_CHECK(cudaMemcpy(cpu_arrays[i], *gpu_slots[i],
                                  ARRAY_SIZE_IN_BYTES, cudaMemcpyDeviceToHost));
        }

        /* NOTE(review): none of the format strings below ends in a newline,
         * so records are separated only by a space (plus whatever the
         * terminal echoes for the key press) - confirm this is intended. */
        printf(" Kernel %d ", kernel);
        for (int y = 0; y < ARRAY_SIZE_Y; y++) {
            for (int x = 0; x < ARRAY_SIZE_X; x++) {
                printf("CT:%2u BKX:%1u BKY:%1u TID:%2u YTID:%2u XTID:%2u GDX:%1u BDX:%1u GDY %1u BDY %1u ",
                       cpu_calc_thread[y][x], cpu_block_x[y][x], cpu_block_y[y][x],
                       cpu_thread[y][x], cpu_ythread[y][x], cpu_xthread[y][x],
                       cpu_grid_dimx[y][x], cpu_block_dimx[y][x],
                       cpu_grid_dimy[y][x], cpu_block_dimy[y][x]);
                /* Pause after every record; the character read is discarded. */
                (void)getchar();
            }
        }
        printf("Press any key to continue ");
        (void)getchar();
    }

    for (int i = 0; i < num_buffers; i++) {
        CUDA_CHECK(cudaFree(*gpu_slots[i]));
    }

    return 0;
}