// Launching a CUDA kernel is non-blocking: the statements after the launch do not wait for the kernel to finish, they execute immediately. So executing m5op.dump inside call_kernel (see kernel.cu) is wrong!!!
// REF: https://www.cs.virginia.edu/~csadmin/wiki/index.php/CUDA_Support/Measuring_kernel_runtime
// cudaThreadSynchronize() blocks the calling host thread until all preceding stream operations have completed.
// REF: https://stackoverflow.com/questions/13485018/cudastreamsynchronize-vs-cudadevicesynchronize-vs-cudathreadsynchronize
// C++ thread join question: kernel.cpp also joins its threads, so should the dump go in kernel.cpp, or in main.cpp after the join?
// REF: http://en.cppreference.com/w/cpp/algorithm/for_each
// If the GPU finishes first, dumping after the join in main.cpp seems reasonable; but if the CPU finishes first, wouldn't the main thread just sit blocked at cudaThreadSynchronize?
// For now, dump in kernel.cpp!
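//
// To make the ordering concrete, a minimal sketch of the wrong vs. right dump placement
// (sketch.cu is hypothetical: dummy_kernel is a placeholder, and m5_dump_stats is assumed
// to follow gem5's m5ops.h interface):

// sketch.cu
#include "m5ops.h"

__global__ void dummy_kernel(int *out) { out[threadIdx.x] = threadIdx.x; }

int main() {
    int *d_out;
    cudaMalloc(&d_out, 32 * sizeof(int));
    dummy_kernel<<<1, 32>>>(d_out);
    // WRONG: the launch is non-blocking, so the kernel may still be running here;
    // stats dumped at this point would not cover the kernel's execution.
    //m5_dump_stats(0, 0);
    cudaThreadSynchronize(); // wait for all preceding stream operations to finish
    m5_dump_stats(0, 0);     // RIGHT: the kernel is guaranteed to have completed
    cudaFree(d_out);
    return 0;
}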
kernel.cpp
// CPU threads --------------------------------------------------------------------------------------
void run_cpu_threads(T *matrix_out, T *matrix, std::atomic_int *flags, int n, int m, int pad,
    int n_threads, int ldim, int n_tasks, float alpha
#ifdef CUDA_8_0
    , std::atomic_int *worklist
#endif
    ) {
    std::cout << "run_cpu_threads start." << std::endl;
    const int REGS_CPU = REGS * ldim;
    std::vector<std::thread> cpu_threads;
    for(int i = 0; i < n_threads; i++) {
        cpu_threads.push_back(std::thread([=]() {
#ifdef CUDA_8_0
            Partitioner p = partitioner_create(n_tasks, alpha, i, n_threads, worklist);
#else
            Partitioner p = partitioner_create(n_tasks, alpha, i, n_threads);
#endif
            const int matrix_size       = m * (n + pad);
            const int matrix_size_align = (matrix_size + ldim * REGS - 1) / (ldim * REGS) * (ldim * REGS);
            for(int my_s = cpu_first(&p); cpu_more(&p); my_s = cpu_next(&p)) {
                // Declare on-chip memory
                T   reg[REGS_CPU];
                int pos      = matrix_size_align - 1 - (my_s * REGS_CPU);
                int my_s_row = pos / (n + pad);
                int my_x     = pos % (n + pad);
                int pos2     = my_s_row * n + my_x;
                // Load in on-chip memory
                #pragma unroll
                for(int j = 0; j < REGS_CPU; j++) {
                    if(pos2 >= 0 && my_x < n && pos2 < matrix_size)
                        reg[j] = matrix[pos2];
                    else
                        reg[j] = 0;
                    pos--;
                    my_s_row = pos / (n + pad);
                    my_x     = pos % (n + pad);
                    pos2     = my_s_row * n + my_x;
                }
                // Set global synch
                while((&flags[my_s])->load() == 0) {
                }
                (&flags[my_s + 1])->fetch_add(1);
                // Store to global memory
                pos = matrix_size_align - 1 - (my_s * REGS_CPU);
                #pragma unroll
                for(int j = 0; j < REGS_CPU; j++) {
                    if(pos >= 0 && pos < matrix_size)
                        matrix_out[pos] = reg[j];
                    pos--;
                }
            }
        }));
    }
    std::for_each(cpu_threads.begin(), cpu_threads.end(), [](std::thread &t) { t.join(); });
    std::cout << "dump.. after run_cpu_threads end." << std::endl;
    m5_dump_stats(0, 0);
}
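// Side note on the flags handshake above: each task spins until its own flag becomes
// nonzero, then bumps the next task's flag once its load phase is done, forming a
// chained release across tasks (CPU threads and GPU blocks share the same array).
// A minimal sketch of that pattern with hypothetical names (flags[0] is assumed to be
// pre-set elsewhere so the chain can start):

// flag_chain_sketch.cpp (excerpt)
#include <atomic>

void wait_and_release(std::atomic_int *flags, int my_s) {
    while (flags[my_s].load() == 0) {
        // spin until a predecessor task releases this one
    }
    flags[my_s + 1].fetch_add(1); // release the next task in the chain
}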
kernel.cu
cudaError_t call_Padding_kernel(int blocks, int threads, int n, int m, int pad, int n_tasks,
    float alpha, T *matrix_out, T *matrix, int *flags
#ifdef CUDA_8_0
    , int l_mem_size, int *worklist
#endif
    ) {
    std::cout << "call_pad start." << std::endl;
    dim3 dimGrid(blocks);
    dim3 dimBlock(threads);
    Padding_kernel<<<dimGrid, dimBlock
#ifdef CUDA_8_0
        , l_mem_size
#endif
        >>>(n, m, pad, n_tasks, alpha, matrix_out, matrix, flags
#ifdef CUDA_8_0
        , worklist
#endif
        );
    cudaError_t err = cudaGetLastError();
    // NOTE: per the note at the top, the kernel launch above is asynchronous,
    // so this dump fires before the kernel has necessarily completed.
    std::cout << "dump.. after call_pad end." << std::endl;
    m5_dump_stats(0, 0);
    return err;
}
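// Note on the error check above: because the launch is asynchronous, cudaGetLastError()
// only reports launch errors at that point (e.g. a bad launch configuration); execution
// errors become visible only after a synchronization. A minimal sketch of the stricter
// check-then-dump pattern (kernel arguments elided):

// kernel.cu (stricter variant, sketch)
Padding_kernel<<<dimGrid, dimBlock>>>(/* ... */);
cudaError_t err = cudaGetLastError(); // catches launch errors only
if (err == cudaSuccess)
    err = cudaThreadSynchronize();    // blocks, then surfaces execution errors
m5_dump_stats(0, 0);                  // the kernel is now guaranteed complete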
main.cpp
for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
    // Reset
#ifdef CUDA_8_0
    for(int i = 0; i < p.n_bins; i++) {
        h_histo[i].store(0);
    }
#else
    memset(h_histo, 0, p.n_bins * sizeof(unsigned int));
    cudaStatus = cudaMemcpy(d_histo, h_histo, p.n_bins * sizeof(unsigned int), cudaMemcpyHostToDevice);
    cudaThreadSynchronize();
    CUDA_ERR();
#endif
    std::cout << "m5 work begin." << std::endl;
    // Launch GPU threads
    // Kernel launch
    if(p.n_gpu_blocks > 0) {
        std::cout << "launch gpu." << std::endl;
        cudaStatus = call_Histogram_kernel(p.n_gpu_blocks, p.n_gpu_threads, p.in_size, p.n_bins,
            n_cpu_bins, d_in, (unsigned int *)d_histo, p.n_bins * sizeof(unsigned int));
        CUDA_ERR();
    }
    // Launch CPU threads
    std::cout << "launch cpu." << std::endl;
    std::thread main_thread(run_cpu_threads, (unsigned int *)h_histo, h_in, p.in_size, p.n_bins,
        p.n_threads, p.n_gpu_threads, n_cpu_bins);
    std::cout << "cuda sync." << std::endl;
    cudaThreadSynchronize();
    std::cout << "cpu join after cuda sync." << std::endl;
    main_thread.join();
    //m5_work_end(0, 0);
    std::cout << "m5 work end." << std::endl;
}
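// Following the question in the notes at the top: if the dump were moved into main.cpp
// instead of kernel.cpp, it would have to come after BOTH cudaThreadSynchronize() and
// main_thread.join(), so that it covers whichever side finishes last. A minimal sketch
// of that alternative placement:

// main.cpp (alternative dump placement, sketch)
cudaThreadSynchronize();  // wait for the GPU kernel to finish
main_thread.join();       // wait for the CPU worker thread to finish
m5_dump_stats(0, 0);      // both sides are done; the dump covers the full run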