zoukankan      html  css  js  c++  java
  • 4.3 Reduction代码(Heterogeneous Parallel Programming class lab)

    首先添加上Heterogeneous Parallel Programming class 中 lab: Reduction的代码:

    myReduction.c

    // MP Reduction
    // Given a list (lst) of length n
    // Output its sum = lst[0] + lst[1] + ... + lst[n-1];
    
    #include    <wb.h>
    
    // Threads per block used for the kernel launch in main(). It also sizes the
    // kernels' shared-memory arrays: BLOCK_SIZE floats in reduction() and
    // 2 * BLOCK_SIZE floats in total().
    // Both kernels halve their stride on every step, so keep this a power of
    // two; with other values some elements would be left out of the block sum.
    #define BLOCK_SIZE 512 //@@ You can change this
    
    // Evaluates a CUDA runtime call once and, if it did not return cudaSuccess,
    // logs the failing statement text and cudaGetErrorString(err), then makes
    // the *enclosing function* return -1 (so only use it inside functions that
    // return int). Every line of the body must end in a backslash so that the
    // preprocessor treats the whole do { ... } while(0) as one macro definition.
    #define wbCheck(stmt) do {                                                    \
            cudaError_t err = (stmt);                                             \
            if (err != cudaSuccess) {                                             \
                wbLog(ERROR, "Failed to run stmt ", #stmt);                       \
                wbLog(ERROR, "Got CUDA error ...  ", cudaGetErrorString(err));    \
                return -1;                                                        \
            }                                                                     \
        } while(0)
    
    /*
     * Per-block sum: every block adds up the blockDim.x consecutive elements of
     * g_idata it owns (one element per thread) and stores that partial sum in
     * g_odata[blockIdx.x].
     *
     *   g_idata  input values, n of them
     *   g_odata  one float per block
     *   n        number of valid input elements; threads whose global index is
     *            >= n contribute 0
     *
     * Uses a static shared array of BLOCK_SIZE floats indexed by threadIdx.x, so
     * the launch must use blockDim.x <= BLOCK_SIZE. Because the stride is halved
     * on each step, blockDim.x has to be a power of two for every element to be
     * included.
     */
    __global__ void reduction(float *g_idata, float *g_odata, unsigned int n){

        __shared__ float scratch[BLOCK_SIZE];

        const unsigned int local  = threadIdx.x;
        const unsigned int global = blockIdx.x*blockDim.x + local;

        // Stage this thread's element in shared memory (0 when past the end).
        if (global < n)
        {
            scratch[local] = g_idata[global];
        }
        else
        {
            scratch[local] = 0.0f;
        }

        // All loads must be visible before any thread starts adding.
        __syncthreads();

        // Fold the upper half of the active range onto the lower half until only
        // scratch[0] is left.
        unsigned int half = blockDim.x >> 1;
        while (half != 0)
        {
            if (local < half)
            {
                scratch[local] += scratch[local + half];
            }

            // Barrier sits outside the branch so every thread reaches it.
            __syncthreads();
            half >>= 1;
        }

        // Only thread 0 publishes the block's result.
        if (local != 0) return;
        g_odata[blockIdx.x] = scratch[0];

    }
      
    
    /*
     * Per-block sum where each thread loads two elements: a block covers
     * 2 * blockDim.x consecutive inputs starting at 2 * blockIdx.x * blockDim.x
     * and writes their sum to output[blockIdx.x].
     *
     *   input   source values, len of them
     *   output  one float per block
     *   len     number of valid input elements; slots past the end are filled
     *           with 0
     *
     * The static shared array holds 2 * BLOCK_SIZE floats, so the launch must use
     * blockDim.x <= BLOCK_SIZE (a runtime blockDim.x cannot size a static array).
     * The stride starts at blockDim.x and is halved each step, which requires
     * blockDim.x to be a power of two for every element to be included.
     */
    __global__ void total(float * input, float * output, int len) {
        __shared__ float partial[2 * BLOCK_SIZE];

        const unsigned int tid   = threadIdx.x;
        const unsigned int width = blockDim.x;
        const unsigned int lower = 2 * blockIdx.x * width + tid;  // first-half slot
        const unsigned int upper = lower + width;                 // second-half slot

        //@@ Load a segment of the input vector into shared memory
        partial[tid]         = (lower < len) ? input[lower] : 0.0f;
        partial[width + tid] = (upper < len) ? input[upper] : 0.0f;

        //@@ Traverse the reduction tree
        unsigned int stride = width;
        while (stride >= 1) {
            // Barrier first: makes the loads (first pass) or the previous pass's
            // additions visible before they are read.
            __syncthreads();
            if (tid < stride) {
                partial[tid] += partial[tid + stride];
            }
            stride >>= 1;
        }

        //@@ Write the computed sum of the block to the output vector at the
        //@@ correct index
        // Thread 0 performed the final addition itself, so no extra barrier is
        // needed before it reads partial[0].
        if (tid == 0) {
            output[blockIdx.x] = partial[0];
        }
    }
    
    /*
     * Host driver: imports the input list, runs the reduction kernel to get one
     * partial sum per block, adds the partial sums on the host and hands the
     * total to wbSolution.
     *
     * Returns 0 on success and -1 if host allocation or any CUDA call fails
     * (wbCheck returns from main directly; buffers are then left for process
     * teardown to reclaim).
     */
    int main(int argc, char ** argv) {
        int ii;
        wbArg_t args;
        float * hostInput; // The input 1D list
        float * hostOutput; // The output list
        float * deviceInput = NULL;
        float * deviceOutput = NULL;
        int numInputElements; // number of elements in the input list
        int numOutputElements; // number of elements in the output list

        args = wbArg_read(argc, argv);

        wbTime_start(Generic, "Importing data and creating memory on host");
        hostInput = (float *) wbImport(wbArg_getInputFile(args, 0), &numInputElements);

        // One output element per block of BLOCK_SIZE inputs (ceiling division).
        numOutputElements = numInputElements / (BLOCK_SIZE);
        if (numInputElements % (BLOCK_SIZE)) {
            numOutputElements++;
        }

        //This for kernel total (each block consumes 2 * BLOCK_SIZE inputs).
        //Note the parentheses: "%" binds tighter than "<<".
        /*numOutputElements = numInputElements / (BLOCK_SIZE << 1);
        if (numInputElements % (BLOCK_SIZE << 1)) {
            numOutputElements++;
        } */

        // Always allocate at least one zeroed slot: hostOutput[0] carries the
        // final answer even when there is nothing to reduce.
        hostOutput = (float*) calloc(numOutputElements > 0 ? numOutputElements : 1,
                                     sizeof(float));
        if (hostOutput == NULL) {
            wbLog(ERROR, "Failed to allocate host output buffer");
            free(hostInput);
            return -1;
        }

        wbTime_stop(Generic, "Importing data and creating memory on host");

        wbLog(TRACE, "The number of input elements in the input is ", numInputElements);
        wbLog(TRACE, "The number of output elements in the input is ", numOutputElements);

        // Empty input: a zero-sized grid is an invalid launch configuration, and
        // the sum of no elements is 0, which hostOutput[0] already holds.
        if (numInputElements <= 0) {
            wbSolution(args, hostOutput, 1);
            free(hostInput);
            free(hostOutput);
            return 0;
        }

        wbTime_start(GPU, "Allocating GPU memory.");
        //@@ Allocate GPU memory here
        wbCheck(cudaMalloc((void **) &deviceInput, numInputElements * sizeof(float)));
        wbCheck(cudaMalloc((void **) &deviceOutput, numOutputElements * sizeof(float)));

        wbTime_stop(GPU, "Allocating GPU memory.");

        wbTime_start(GPU, "Copying input memory to the GPU.");
        //@@ Copy memory to the GPU here
        wbCheck(cudaMemcpy(deviceInput,
                           hostInput,
                           numInputElements * sizeof(float),
                           cudaMemcpyHostToDevice));


        wbTime_stop(GPU, "Copying input memory to the GPU.");
        //@@ Initialize the grid and block dimensions here
        dim3 dimGrid(numOutputElements, 1, 1);
        dim3 dimBlock(BLOCK_SIZE, 1, 1);

        wbTime_start(Compute, "Performing CUDA computation");
        //@@ Launch the GPU Kernel here
        reduction<<<dimGrid,dimBlock>>>(deviceInput, deviceOutput, numInputElements);
        //total<<<dimGrid, dimBlock>>>(deviceInput, deviceOutput, numInputElements);
        // A launch returns nothing: configuration errors are reported through
        // cudaGetLastError(), execution errors at the next synchronizing call.
        wbCheck(cudaGetLastError());
        wbCheck(cudaDeviceSynchronize());
        wbTime_stop(Compute, "Performing CUDA computation");

        wbTime_start(Copy, "Copying output memory to the CPU");
        //@@ Copy the GPU memory back to the CPU here
        wbCheck(cudaMemcpy(hostOutput, deviceOutput, sizeof(float) * numOutputElements, cudaMemcpyDeviceToHost));
        wbTime_stop(Copy, "Copying output memory to the CPU");

        /********************************************************************
         * Reduce output vector on the host
         * NOTE: One could also perform the reduction of the output vector
         * recursively and support any size input. For simplicity, we do not
         * require that for this lab.
         ********************************************************************/
        for (ii = 1; ii < numOutputElements; ii++) {
            hostOutput[0] += hostOutput[ii];
        }

        wbTime_start(GPU, "Freeing GPU Memory");
        //@@ Free the GPU memory here
        wbCheck(cudaFree(deviceInput));
        wbCheck(cudaFree(deviceOutput));

        wbTime_stop(GPU, "Freeing GPU Memory");

        wbSolution(args, hostOutput, 1);

        free(hostInput);
        free(hostOutput);

        return 0;
    }
    View Code
  • 相关阅读:
    C#博客随笔之四:使用C#模拟办公网登录HttpClient的使用
    C#博客随笔之三:Linq in C#
    C#博客随笔之二:wp开发之弹出对话框
    C#博客随笔之一:使用C#的第一个WP程序
    Fedora15命令速查手册
    乐观是一种智慧
    完全教程 Aircrackng破解WEP、WPAPSK加密利器
    FreeBSD常用命令大全
    Linux 网络管理员指南——前言
    API
  • 原文地址:https://www.cnblogs.com/biglucky/p/4271857.html
Copyright © 2011-2022 走看看