zoukankan      html  css  js  c++  java
  • CUDA学习4 线程协作

    CUDA学习3 Max pooling (python c++ cuda)中有一个2D grid的CUDA实现,用时141ms。

    以下为2D grid 2D blocks实现,耗时进一步降低到16ms。

        // Block coordinates inside the 2D grid.
        int x = blockIdx.x;
        int y = blockIdx.y;
        // Thread coordinates inside the 2D block.
        int tx = threadIdx.x;
        int ty = threadIdx.y;
        // Flat index of this thread over the whole launch: blocks are numbered
        // row-major across the grid (y, then x), each block contributes
        // blockDim.y*blockDim.x consecutive slots, and threads are numbered
        // row-major inside their block (ty, then tx).
        int index2 = y*gridDim.x*blockDim.y*blockDim.x + x* blockDim.y*blockDim.x + ty*blockDim.x + tx;
     
    线程索引计算方式如上,此处需要的循环为(N,M,PH,PH),因此配置如下。(PH*PH=144未超出本机显卡max threads per block=1024的限制)
     
    // gridDim.x = M, gridDim.y = N: one block per (N, M) iteration of the loop nest.
    dim3    grid(M, N);
    // blockDim.x = blockDim.y = PH: one thread per (PH, PH) iteration inside each block.
    dim3    threads(PH, PH);

    下面是完整代码。

    #include <windows.h>
    #include <iostream>
    
    
    // Max pooling over square height x height planes, one thread per output element.
    //
    // Layout expected from the launch:
    //   grid  (2D): one block per input plane; planes are stored back to back in
    //               bottom_data in the order blockIdx.y * gridDim.x + blockIdx.x.
    //   block (2D): threadIdx.x selects the output column, threadIdx.y the output
    //               row; blockDim.x * blockDim.y outputs are written per plane.
    // No shared memory is used and no synchronisation is required.
    //
    // Parameters:
    //   bottom_data   - input planes (device memory), only read.
    //   height        - side length of each (square) input plane.
    //   pooled_height - side length of the pooling window, also used as its stride.
    //   top_data      - output (device memory), one float per launched thread.
    //
    // The running maximum is seeded with the first element of the window rather
    // than a fixed sentinel, so inputs of any magnitude (including values below
    // -10000) produce the true maximum. Windows are clipped at the plane border.
    __global__ void MaxPool2d(float* bottom_data, const int height, const int pooled_height, float* top_data)
    {
        const int tx = threadIdx.x;
        const int ty = threadIdx.y;
        const int block_w = blockDim.x;
        const int block_h = blockDim.y;

        // Index of the plane handled by this block and of this thread's output slot.
        const int plane = blockIdx.y * gridDim.x + blockIdx.x;
        const int out_index = plane * block_w * block_h + ty * block_w + tx;

        // Top-left corner of this thread's pooling window inside the plane.
        const int row0 = ty * pooled_height;
        const int col0 = tx * pooled_height;

        // Placeholder written only when the window is empty (it starts outside the
        // plane or the window size is not positive); kept from the original code so
        // that every launched thread still writes its output slot.
        float s = -10000.0f;

        if (pooled_height > 0 && row0 < height && col0 < height)
        {
            const float* window = bottom_data + plane * height * height + row0 * height + col0;
            s = window[0];
            for (int u = 0; u < pooled_height && row0 + u < height; ++u)
                for (int v = 0; v < pooled_height && col0 + v < height; ++v)
                    if (window[u * height + v] > s)
                        s = window[u * height + v];
        }

        top_data[out_index] = s;
    }
    
    // Reports a failed CUDA runtime call on stderr; returns true when err is cudaSuccess.
    static bool checkCuda(cudaError_t err, const char* what)
    {
        if (err == cudaSuccess)
            return true;
        std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
        return false;
    }

    // Fills an N x M stack of H x W planes with i * 0.01, max-pools every plane on
    // the GPU with a D x D window, and prints the elapsed time plus the first ten
    // outputs. Returns 0 on success and 1 if any CUDA call failed.
    int main()
    {
        const int N = 500, M =100, H = 24, W = 24, D = 2;
        // Number of pooling windows per side: ceil(H / D).
        const int PH = (H + D - 1) / D;

        // Element counts and byte counts are kept apart: host arrays are sized in
        // elements, device allocations and copies in bytes.
        const int image_count = N*M*H*W;
        const int out_count = N*M*PH*PH;
        const size_t image_size = image_count*sizeof(float);
        const size_t out_size = out_count*sizeof(float);

        const float mul_by = 0.01f;
        float *input = new float[image_count];
        float *output = new float[out_count];
        float *dev_output = 0, *dev_input = 0;
        for (int i = 0; i<image_count; i++)
            input[i] = i*mul_by;

        bool ok = checkCuda(cudaMalloc((void**)&dev_output, out_size), "cudaMalloc(dev_output)")
               && checkCuda(cudaMalloc((void**)&dev_input, image_size), "cudaMalloc(dev_input)")
               && checkCuda(cudaMemcpy(dev_input, input, image_size, cudaMemcpyHostToDevice), "cudaMemcpy(host to device)");

        if (ok)
        {
            dim3    grid(M, N);          // one block per plane
            dim3    threads(PH, PH);     // one thread per output element of a plane

            // The timed interval covers the kernel and the blocking device-to-host copy.
            DWORD start_time = GetTickCount();
            MaxPool2d<<<grid, threads>>>(dev_input, H, D, dev_output);
            // A kernel launch returns no status itself; configuration errors are
            // picked up here, execution errors by the copy that follows.
            ok = checkCuda(cudaGetLastError(), "MaxPool2d launch")
              && checkCuda(cudaMemcpy(output, dev_output, out_size, cudaMemcpyDeviceToHost), "cudaMemcpy(device to host)");
            DWORD end_time = GetTickCount();

            if (ok)
            {
                std::cout << "Cost: " << end_time - start_time << "ms." << std::endl;
                for (int i = 0; i<10; i++)
                    std::cout << output[i] << std::endl;
            }
        }

        // cudaFree accepts a null pointer, so this is safe after a failed allocation.
        cudaFree(dev_input);
        cudaFree(dev_output);
        delete[] output;
        delete[] input;
        system("pause");
        return ok ? 0 : 1;
    }
    
    /*
    Cost: 16ms.
    0.25
    0.27
    0.29
    0.31
    0.33
    0.35
    0.37
    0.39
    0.41
    0.43
    */

    以下是采用3D grid 3D blocks的错误实现:同一个池化窗口内的多个线程在没有同步、也没有原子操作的情况下同时读写同一个输出元素(数据竞争),因此每个线程比较大小时读到的基本都是初始值-1000.0,最终结果取决于最后写入的线程,而不是窗口内的最大值。

    #include <windows.h>
    #include <iostream>
    
    // Atomically raises *address to value when value is larger, using a
    // compare-and-swap loop on the float's bit pattern.
    static __device__ void atomicMaxFloat(float* address, float value)
    {
        int* address_as_int = (int*)address;
        int observed = *address_as_int;
        while (__int_as_float(observed) < value)
        {
            const int assumed = observed;
            observed = atomicCAS(address_as_int, assumed, __float_as_int(value));
            if (observed == assumed)
                break;  // our value was stored
            // Another thread changed the slot first; re-test against what it wrote.
        }
    }

    // Max pooling over square height x height planes, one thread per INPUT element
    // of a pooling window; the threads of a window cooperate on one output element.
    //
    // Layout expected from the launch:
    //   grid  (3D): blockIdx.x = output row, (blockIdx.z, blockIdx.y) = plane,
    //               planes stored back to back in the order z * gridDim.y + y.
    //   block (3D): threadIdx.z = output column, threadIdx.y / threadIdx.x = row /
    //               column offset inside the window (blockDim.x and blockDim.y are
    //               the window size, blockDim.z the number of output columns).
    // No shared memory is used. Every thread of the block reaches the single
    // __syncthreads() barrier.
    //
    // Parameters:
    //   bottom_data   - input planes (device memory), only read.
    //   height        - side length of each (square) input plane.
    //   pooled_height - stride between neighbouring pooling windows.
    //   top_data      - output (device memory), one float per window.
    //
    // Thread (0, 0) of each window seeds the output with the window's first
    // element, a block barrier makes that store visible to the other threads of
    // the window, and those threads then merge their own element with an atomic
    // max. An unsynchronised read-compare-write of the shared output element would
    // be a data race in which the last writer wins.
    __global__ void MaxPool2d(float* bottom_data, const int height, const int pooled_height, float* top_data)
    {
        const int x = blockIdx.x;
        const int y = blockIdx.y;
        const int z = blockIdx.z;
        const int dx = gridDim.x;
        const int dy = gridDim.y;
        const int tx = threadIdx.x;
        const int ty = threadIdx.y;
        const int tz = threadIdx.z;
        const int dtz = blockDim.z;

        const int plane = z*dy + y;
        // Input element handled by this thread.
        const int row = x*pooled_height + ty;
        const int col = tz*pooled_height + tx;
        const bool in_image = row < height && col < height;
        const int in_index = plane*height*height + row*height + col;

        // Output element shared by all threads that have the same (block, tz).
        float* out = top_data + (plane*dx + x)*dtz + tz;

        // For thread (0, 0) the input element is the window origin. The -1000
        // placeholder is written only when the whole window lies outside the plane.
        if (tx == 0 && ty == 0)
            *out = in_image ? bottom_data[in_index] : -1000.0f;

        // The seed must be in place before any other thread compares against it.
        __syncthreads();

        if (in_image && (tx != 0 || ty != 0))
            atomicMaxFloat(out, bottom_data[in_index]);
    }
    
    // Reports a failed CUDA runtime call on stderr; returns true when err is cudaSuccess.
    static bool checkCuda(cudaError_t err, const char* what)
    {
        if (err == cudaSuccess)
            return true;
        std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
        return false;
    }

    // Fills an N x M stack of H x W planes with i * -0.01, max-pools every plane on
    // the GPU with a D x D window using the 3D grid / 3D block kernel, and prints
    // the elapsed time plus the first ten outputs. Returns 0 on success and 1 if
    // any CUDA call failed.
    int main()
    {
        const int N = 500, M =100, H = 24, W = 24, D = 2;
        // Number of pooling windows per side: ceil(H / D).
        const int PH = (H + D - 1) / D;

        // Element counts and byte counts are kept apart: host arrays are sized in
        // elements, device allocations and copies in bytes.
        const int image_count = N*M*H*W;
        const int out_count = N*M*PH*PH;
        const size_t image_size = image_count*sizeof(float);
        const size_t out_size = out_count*sizeof(float);

        const float mul_by = -0.01f;
        float *input = new float[image_count];
        float *output = new float[out_count];
        float *dev_output = 0, *dev_input = 0;
        for (int i = 0; i<image_count; i++)
            input[i] = i*mul_by;

        bool ok = checkCuda(cudaMalloc((void**)&dev_output, out_size), "cudaMalloc(dev_output)")
               && checkCuda(cudaMalloc((void**)&dev_input, image_size), "cudaMalloc(dev_input)")
               && checkCuda(cudaMemcpy(dev_input, input, image_size, cudaMemcpyHostToDevice), "cudaMemcpy(host to device)");

        if (ok)
        {
            dim3    grid(PH, M, N);      // x: output row, (z, y): plane
            dim3    threads(D, D, PH);   // (x, y): offset inside the window, z: output column

            // The timed interval covers the kernel and the blocking device-to-host copy.
            DWORD start_time = GetTickCount();
            MaxPool2d<<<grid, threads>>>(dev_input, H, D, dev_output);
            // A kernel launch returns no status itself; configuration errors are
            // picked up here, execution errors by the copy that follows.
            ok = checkCuda(cudaGetLastError(), "MaxPool2d launch")
              && checkCuda(cudaMemcpy(output, dev_output, out_size, cudaMemcpyDeviceToHost), "cudaMemcpy(device to host)");
            DWORD end_time = GetTickCount();

            if (ok)
            {
                std::cout << "Cost: " << end_time - start_time << "ms." << std::endl;
                for (int i = 0; i<10; i++)
                    std::cout << output[i] << std::endl;
            }
        }

        // cudaFree accepts a null pointer, so this is safe after a failed allocation.
        cudaFree(dev_input);
        cudaFree(dev_output);
        delete[] output;
        delete[] input;
        system("pause");
        return ok ? 0 : 1;
    }
    
    /*
    Cost: 47ms.
    -0.25
    -0.27
    -0.29
    -0.31
    -0.33
    -0.35
    -0.37
    -0.39
    -0.41
    -0.43
    */
  • 相关阅读:
    Java实现 LeetCode 30 串联所有单词的子串
    Java实现 LeetCode 29 两数相除
    Java实现 LeetCode 29 两数相除
    Java实现 LeetCode 29 两数相除
    Java实现 LeetCode 28 实现strStr()
    Java实现 LeetCode 28 实现strStr()
    Java实现 LeetCode 28 实现strStr()
    Java实现 LeetCode 27 移除元素
    Java实现 LeetCode 27 移除元素
    字符编码终极笔记:ASCII、Unicode、UTF-8、UTF-16、UCS、BOM、Endian
  • 原文地址:https://www.cnblogs.com/qw12/p/6399421.html
Copyright © 2011-2022 走看看