#include <stdio.h>
#include "gputimer.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdlib.h>

const int N = 1024;  // the matrix is N x N, stored row-major in a flat array
const int K = 32;    // tile edge: the per-element kernel uses K x K thread blocks

/*
 * Abort with a readable message if a CUDA runtime call fails.
 * Without this, an early failure (e.g. a failed cudaMalloc) would go
 * unnoticed and the timings printed below would be meaningless.
 */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t cuda_check_err_ = (call);                              \
        if (cuda_check_err_ != cudaSuccess) {                              \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(cuda_check_err_));                  \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/* Fill an N x N host matrix with 0, 1, 2, ... in storage order. */
void fill_matrix(float * mat){
    for (int i = 0; i < N*N; i++)
        mat[i] = (float)i;
}

/* Print an N x N host matrix, one matrix row per output line. */
void print_matrix(float *mat)
{
    for (int j = 0; j < N; j++)
    {
        for (int i = 0; i < N; i++) { printf("%4.4g ", mat[i + j*N]); }
        printf("\n");  // end of row
    }
}

/*
 * Transpose `in` into `out` with a single thread walking the whole matrix.
 * Launch layout: <<<1, 1>>>. Every thread that is launched performs the
 * complete transpose, so extra threads only repeat the same writes.
 * `in` and `out` must be distinct N*N device buffers.
 */
__global__ void transpose_serial(const float in[], float out[]){
    for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++)
            out[i + j*N] = in[j + i*N];
}

/*
 * Transpose `in` into `out` with one thread per output row.
 * Launch layout: a single 1-D block of N threads (<<<1, N>>>); thread i
 * reads in[i + j*N] for every j and writes row i of `out`.
 * `in` and `out` must be distinct N*N device buffers.
 */
__global__ void transpose_parallel_per_row(const float in[], float out[]){
    int i = threadIdx.x;
    if (i >= N) return;  // guard against a block wider than the matrix

    for (int j = 0; j < N; j++)
        out[j + i*N] = in[i + j*N];
}

/*
 * Transpose `in` into `out` with one thread per element.
 * Launch layout: 2-D grid of K x K thread blocks covering the N x N matrix.
 * The index math uses the constant K, so blockDim must be (K, K).
 * `in` and `out` must be distinct N*N device buffers.
 */
__global__ void transpose_parallel_per_element(const float in[], float out[]){
    int i = blockIdx.x * K + threadIdx.x;
    int j = blockIdx.y * K + threadIdx.y;

    // Threads in a partial tile at the grid edge (N not a multiple of K)
    // must not touch memory outside the matrix.
    if (i < N && j < N)
        out[j + i*N] = in[i + j*N];
}

/*
 * Run the three transpose kernels on the same N x N input and print the
 * time reported by GpuTimer (declared in gputimer.h) for each one.
 */
int main(void){
    const size_t numbytes = (size_t)N * N * sizeof(float);

    float *in = (float *)malloc(numbytes);
    float *out = (float *)malloc(numbytes);
    if (in == NULL || out == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n",
                (unsigned long)numbytes);
        free(in);
        free(out);
        return EXIT_FAILURE;
    }
    fill_matrix(in);

    float *d_in, *d_out;

    CUDA_CHECK(cudaMalloc((void **)&d_in, numbytes));
    CUDA_CHECK(cudaMalloc((void **)&d_out, numbytes));
    CUDA_CHECK(cudaMemcpy(d_in, in, numbytes, cudaMemcpyHostToDevice));

    GpuTimer timer;

    // In each section the launch check sits after timer.Stop() so the extra
    // host call does not fall inside the timed region. The cudaMemcpy that
    // follows is checked too, which reports any failure the runtime surfaces
    // from the kernel that just ran.
    timer.Start();
    transpose_serial<<<1, 1>>>(d_in, d_out);
    timer.Stop();
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost));
    printf("transpose_serial:%g ms.\n", timer.Elapsed());

    // One block of N threads: requires N to be within the device's
    // threads-per-block limit, otherwise the launch check below fails.
    timer.Start();
    transpose_parallel_per_row<<<1, N>>>(d_in, d_out);
    timer.Stop();
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost));
    printf("transpose_parallel_per_row:%g ms.\n", timer.Elapsed());

    // Ceil-divide so the grid still covers the matrix if N % K != 0;
    // the kernel's bounds check discards the overhanging threads.
    dim3 blocks((N + K - 1) / K, (N + K - 1) / K);
    dim3 threads(K, K);
    timer.Start();
    transpose_parallel_per_element<<<blocks, threads>>>(d_in, d_out);
    timer.Stop();
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost));
    printf("transpose_parallel_per_element: %g ms.\n", timer.Elapsed());

    CUDA_CHECK(cudaFree(d_in));
    CUDA_CHECK(cudaFree(d_out));

    // Host buffers were previously leaked.
    free(in);
    free(out);

    return 0;
}