  • cudaThreadSynchronize()

    // Launching a CUDA kernel is non-blocking: the statements after the launch do not wait for the kernel to finish but execute immediately. So calling m5op.dump inside call_kernel (see kernel.cu) is wrong!

    // REF: https://www.cs.virginia.edu/~csadmin/wiki/index.php/CUDA_Support/Measuring_kernel_runtime  
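
    // A minimal sketch of the problem (hypothetical kernel `my_kernel`, not part of the benchmark):
    // the launch returns immediately, so a dump placed right after it runs while the kernel may
    // still be executing; a device-wide synchronization is needed before the dump.

    __global__ void my_kernel(float *data, int n) { /* ... */ }

    void launch_and_dump(float *d_data, int n) {
        my_kernel<<<64, 256>>>(d_data, n); // asynchronous: returns before the kernel finishes
        // m5_dump_stats(0, 0);            // WRONG here: the kernel is likely still running
        cudaDeviceSynchronize();           // block the host until the kernel has completed
        m5_dump_stats(0, 0);               // now the dumped stats cover the whole kernel
    }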

    // cudaThreadSynchronize() blocks the caller until all previously issued stream operations have completed.

    // REF: https://stackoverflow.com/questions/13485018/cudastreamsynchronize-vs-cudadevicesynchronize-vs-cudathreadsynchronize
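
    // A small sketch contrasting the three calls compared in the answer above. The kernel `noop`
    // and the stream `s` are only for illustration; cudaThreadSynchronize() is deprecated in
    // favor of cudaDeviceSynchronize().

    __global__ void noop() {}

    void sync_examples() {
        cudaStream_t s;
        cudaStreamCreate(&s);

        noop<<<1, 1, 0, s>>>();
        cudaStreamSynchronize(s);   // waits only for work queued in stream s

        noop<<<1, 1>>>();
        cudaDeviceSynchronize();    // waits for all work on the device, in every stream

        cudaThreadSynchronize();    // deprecated alias; behaves like cudaDeviceSynchronize()

        cudaStreamDestroy(s);
    }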

    // The C++ thread-join question: kernel.cpp also joins its threads, so should the dump go in kernel.cpp, or in main.cpp after the join?

    // REF: http://en.cppreference.com/w/cpp/algorithm/for_each

    // If the GPU finishes first, dumping after the join in main.cpp seems reasonable; but if the CPU finishes first, won't the main thread just sit blocked at cudaThreadSynchronize until the GPU is done?

    // For now, dump in kernel.cpp!

    kernel.cpp

    // CPU threads--------------------------------------------------------------------------------------
    void run_cpu_threads(T *matrix_out, T *matrix, std::atomic_int *flags, int n, int m, int pad, int n_threads, int ldim, int n_tasks, float alpha
    #ifdef CUDA_8_0
        , std::atomic_int *worklist
    #endif
        ) {
            std::cout<<"run_cpu_threads start."<<std::endl;
    
        const int                REGS_CPU = REGS * ldim;
        std::vector<std::thread> cpu_threads;
        for(int i = 0; i < n_threads; i++) {
        
            cpu_threads.push_back(std::thread([=]() {
    
    #ifdef CUDA_8_0
                Partitioner p = partitioner_create(n_tasks, alpha, i, n_threads, worklist);
    #else
                Partitioner p = partitioner_create(n_tasks, alpha, i, n_threads);
    #endif
    
                const int matrix_size       = m * (n + pad);
                const int matrix_size_align = (matrix_size + ldim * REGS - 1) / (ldim * REGS) * (ldim * REGS);
    
                for(int my_s = cpu_first(&p); cpu_more(&p); my_s = cpu_next(&p)) {
    
                    // Declare on-chip memory
                    T   reg[REGS_CPU];
                    int pos      = matrix_size_align - 1 - (my_s * REGS_CPU);
                    int my_s_row = pos / (n + pad);
                    int my_x     = pos % (n + pad);
                    int pos2     = my_s_row * n + my_x;
    // Load in on-chip memory
    #pragma unroll
                    for(int j = 0; j < REGS_CPU; j++) {
                        if(pos2 >= 0 && my_x < n && pos2 < matrix_size)
                            reg[j] = matrix[pos2];
                        else
                            reg[j] = 0;
                        pos--;
                        my_s_row = pos / (n + pad);
                        my_x     = pos % (n + pad);
                        pos2     = my_s_row * n + my_x;
                    }
    
                    // Set global synch
                    while((&flags[my_s])->load() == 0) {
                    }
                    (&flags[my_s + 1])->fetch_add(1);
    
                    // Store to global memory
                    pos = matrix_size_align - 1 - (my_s * REGS_CPU);
    #pragma unroll
                    for(int j = 0; j < REGS_CPU; j++) {
                        if(pos >= 0 && pos < matrix_size)
                            matrix_out[pos] = reg[j];
                        pos--;
                    }
                }
            }));
        }
        std::for_each(cpu_threads.begin(), cpu_threads.end(), [](std::thread &t) { t.join(); });
        std::cout<<"dump.. after run_cpu_threads end."<<std::endl;
        m5_dump_stats(0,0);
    }

    kernel.cu

    cudaError_t call_Padding_kernel(int blocks, int threads, int n, int m, int pad, int n_tasks, float alpha, 
        T *matrix_out, T *matrix, int *flags
    #ifdef CUDA_8_0
        , int l_mem_size, int *worklist
    #endif
        ){
            std::cout<<"call_pad start."<<std::endl;
        dim3 dimGrid(blocks);
        dim3 dimBlock(threads);
        Padding_kernel<<<dimGrid, dimBlock
    #ifdef CUDA_8_0
            , l_mem_size
    #endif
            >>>(n, m, pad, n_tasks, alpha, 
            matrix_out, matrix, flags
    #ifdef CUDA_8_0
            , worklist
    #endif
            );
        cudaError_t err = cudaGetLastError();
        std::cout<<"dump.. after call_pad end."<<std::endl;
        m5_dump_stats(0,0);
        return err;
    }
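
    // Per the note at the top, the m5_dump_stats above runs before Padding_kernel has finished,
    // because the launch is asynchronous. A sketch of how the tail of call_Padding_kernel could
    // look if the dump had to stay in kernel.cu (the post instead moves it to kernel.cpp):

        cudaError_t err = cudaGetLastError(); // only catches launch/configuration errors
        cudaDeviceSynchronize();              // wait for Padding_kernel to actually complete
        m5_dump_stats(0, 0);                  // stats now include the kernel's execution
        return err;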

    main.cpp

    for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
    
            // Reset
    #ifdef CUDA_8_0
            for(int i = 0; i < p.n_bins; i++) {
                h_histo[i].store(0);
            }
    #else
            memset(h_histo, 0, p.n_bins * sizeof(unsigned int));
            cudaStatus = cudaMemcpy(d_histo, h_histo, p.n_bins * sizeof(unsigned int), cudaMemcpyHostToDevice);
            cudaThreadSynchronize();
            CUDA_ERR();
    #endif
    
            std::cout<<"m5 work begin."<<std::endl;
    
            // Launch GPU threads
            // Kernel launch
            if(p.n_gpu_blocks > 0) {
                std::cout<<"launch gpu."<<std::endl;
                cudaStatus = call_Histogram_kernel(p.n_gpu_blocks, p.n_gpu_threads, p.in_size, p.n_bins, n_cpu_bins, 
                    d_in, (unsigned int*)d_histo, p.n_bins * sizeof(unsigned int));
                CUDA_ERR();
            }
    
            // Launch CPU threads
            std::cout<<"launch cpu."<<std::endl;
        std::thread main_thread(run_cpu_threads, (unsigned int *)h_histo, h_in, p.in_size, p.n_bins, p.n_threads,
            p.n_gpu_threads, n_cpu_bins);
        std::cout<<"cuda sync."<<std::endl;
    
            cudaThreadSynchronize();
            std::cout<<"cpu join after cuda sync."<<std::endl;
            main_thread.join();
    
            //m5_work_end(0, 0);
            std::cout<<"m5 work end."<<std::endl;
        }
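
    // The alternative placement debated in the notes above: dump in main.cpp only after both
    // cudaThreadSynchronize() and main_thread.join() have returned, so the dump covers the GPU
    // kernel and the CPU threads no matter which side finishes first (a sketch, not what the
    // post finally uses):

        cudaThreadSynchronize();   // main thread blocks until the GPU kernel has completed
        main_thread.join();        // then waits for the CPU worker thread to finish
        m5_dump_stats(0, 0);       // stats now cover both the GPU and the CPU work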