zoukankan      html  css  js  c++  java
  • CUDA学习3 Max pooling (python c++ cuda)

    1.Python

    CNN4 参数优化中有一个CNN模型,其中的限速步是max pooling。

    如下所示,Python中运行一个50*100*24*24的max pooling需要3秒。

    import numpy as np
    import time
    
    def simple_pool(input, ds=(2, 2)):
        """Max-pool the last two axes of a 4-D array with non-overlapping windows.

        Parameters:
            input: array indexed as input[k, o, row, col]; its shape is
                unpacked into (n, m, h, w).
            ds: (d, s) pair giving the window/stride along the row axis (d)
                and along the column axis (s).

        Returns:
            A float array of shape (n, m, ceil(h / d), ceil(w / s)) holding the
            maximum of every window. Windows that hang over the bottom or
            right edge are truncated to the part inside the input.
        """
        n, m, h, w = input.shape
        d, s = ds
        # Ceiling division written with // so it yields an int on both
        # Python 2 and Python 3. The former `h / d + h % d` only matched
        # this for a window of 2.
        zh = -(-h // d)
        zw = -(-w // s)
        z = np.zeros((n, m, zh, zw))
        for k in range(n):
            for o in range(m):
                for i in range(zh):
                    for j in range(zw):
                        # Slicing clips at the array edge, and the ceiling
                        # division above guarantees the window is never
                        # empty, so no sentinel start value is needed.
                        window = input[k, o, d*i:d*i+d, s*j:s*j+s]
                        z[k, o, i, j] = window.max()

        return z
    
    # Benchmark driver: pool an N x M stack of H x W planes and report the time.
    N, M, H, W = 50, 100, 24, 24
    # Ramp input 0.00, 0.01, 0.02, ... laid out as (N, M, H, W).
    a = np.arange(N*M*H*W).reshape((N, M, H, W))*0.01
    start_time = time.time()
    out_data = simple_pool(a)
    # Single-argument print(...) calls produce the same text under Python 2
    # (where print is a statement) and Python 3 (where it is a function).
    print("Cost: %s s" % (time.time()-start_time))
    print(out_data[0,0,0,:10])
    
    """
    Cost: 3.08899998665 s
    [ 0.25  0.27  0.29  0.31  0.33  0.35  0.37  0.39  0.41  0.43]
    """

    2.C++

    采用c++,仅需16~30ms。

    #include<iostream>
    #include<windows.h>
    
    // Max-pools every height x width plane of a contiguous
    // [num][channels][height][width] float array using non-overlapping square
    // windows of side pooled_height (the stride equals the window size).
    //
    //   bottom_data   input, num*channels*height*width floats
    //   pooled_height window side and stride; nothing is written when <= 0
    //   top_data      output, num*channels*ceil(height/pooled_height)
    //                 *ceil(width/pooled_height) floats, written plane by plane
    //                 in row-major order
    //
    // Windows that hang over the bottom/right edge are truncated to the part
    // inside the plane. Each maximum is seeded with the window's first element
    // rather than a fixed sentinel, so arbitrarily small inputs are handled; as
    // a consequence a NaN in that first position propagates to the output.
    void MaxPool2d(const float* const bottom_data, const int num, const int channels,
        const int height, const int width, const int pooled_height,float* top_data)
    {
        if (pooled_height <= 0)
            return;

        const float* plane = bottom_data;
        float* out = top_data;
        for (int n = 0; n < num; ++n)
            for (int c = 0; c < channels; ++c, plane += height * width)
                // Stepping the window origin by pooled_height visits exactly
                // ceil(height/pooled_height) x ceil(width/pooled_height) windows.
                for (int top = 0; top < height; top += pooled_height)
                {
                    const int bottom = (top + pooled_height < height) ? top + pooled_height : height;
                    for (int left = 0; left < width; left += pooled_height)
                    {
                        const int right = (left + pooled_height < width) ? left + pooled_height : width;
                        float best = plane[top * width + left];
                        for (int r = top; r < bottom; ++r)
                            for (int col = left; col < right; ++col)
                                if (plane[r * width + col] > best)
                                    best = plane[r * width + col];
                        *out++ = best;
                    }
                }
    }
    
    int main()
    {
      const int N=50,M=100,H=24,W=24,P=(H+1)/2;
      float mul_min=0.01;
      float *input,*output;
      input=new float [N*M*H*W*sizeof(float)];
      output=new float [N*M*P*P*sizeof(float)];
      for(int i=0;i<N*M*H*W;i++)
        *(input+i)=i*mul_min;
    
      DWORD start_time=GetTickCount();
      MaxPool2d(input,N,M,H,W,2,output);
      DWORD end_time=GetTickCount();
      std::cout<<"Cost: "<<end_time-start_time<<"ms."<<std::endl;
      for(int i=0;i<10;i++)
        std::cout<<*(output+i)<<std::endl;
    
      delete []input;
      delete []output;
    }
    
    /*
    Cost: 16ms.
    0.25
    0.27
    0.29
    0.31
    0.33
    0.35
    0.37
    0.39
    0.41
    0.43
    */

    3.CUDA

    在N=50时耗时16ms,N=500时耗时141ms(C++版本为218ms),只是略有提升:计算部分应该快了一些,但主机与设备之间的数据传输带来了额外开销。

    #include <windows.h>
    #include <iostream>
    
    // Max-pools one image plane per thread block with non-overlapping square
    // windows of side pooled_height (the stride equals the window size).
    //
    // Launch layout: a 2-D grid; block (blockIdx.x, blockIdx.y) handles plane
    // number blockIdx.x * gridDim.y + blockIdx.y. Any 1-D block size works: the
    // threads of a block share the plane's output cells in a block-stride loop,
    // so a single-thread launch (<<<grid,1>>>) processes every cell serially.
    // No shared memory is used.
    //
    //   bottom_data   device input, gridDim.x*gridDim.y planes of height*width floats
    //   out_height    output cells per row AND per column of a plane; the output
    //                 plane is treated as out_height x out_height
    //   top_data      device output, gridDim.x*gridDim.y*out_height*out_height floats
    //
    // Windows that hang over the bottom/right edge are truncated to the part
    // inside the plane. Each maximum is seeded with the window's first element,
    // so inputs below any fixed sentinel are handled. A cell whose window lies
    // entirely outside the plane (out_height larger than the plane needs, or
    // pooled_height <= 0) receives -10000.0f, the value earlier versions wrote
    // for such cells.
    __global__ void MaxPool2d(float* bottom_data, const int height, const int width, 
        const int pooled_height,const int out_height,float* top_data)
    {
        // size_t arithmetic keeps the plane offsets from wrapping on large batches.
        const size_t plane = (size_t)blockIdx.x * gridDim.y + blockIdx.y;
        const float* image = bottom_data + plane * (size_t)height * (size_t)width;
        float* pooled = top_data + plane * (size_t)out_height * (size_t)out_height;

        const int cells = out_height * out_height;
        for (int cell = threadIdx.x; cell < cells; cell += blockDim.x)
        {
            const int top = (cell / out_height) * pooled_height;
            const int left = (cell % out_height) * pooled_height;
            if (pooled_height <= 0 || top >= height || left >= width)
            {
                pooled[cell] = -10000.0f;
                continue;
            }
            const int bottom = (top + pooled_height < height) ? top + pooled_height : height;
            const int right = (left + pooled_height < width) ? left + pooled_height : width;

            float best = image[top * width + left];
            for (int r = top; r < bottom; ++r)
                for (int c = left; c < right; ++c)
                {
                    const float v = image[r * width + c];
                    if (v > best)
                        best = v;
                }
            pooled[cell] = best;
        }
    }
    
    // Prints a diagnostic and returns false when a CUDA runtime call failed.
    static bool cudaOk(cudaError_t status, const char* what)
    {
        if (status == cudaSuccess)
            return true;
        std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
        return false;
    }

    // Benchmark driver: uploads an N x M x H x W ramp, max-pools it on the GPU
    // with a D x D window (one block per plane) and prints the elapsed time of
    // the kernel plus the device-to-host copy, followed by the first ten outputs.
    // Returns 0 on success and 1 if any CUDA call fails.
    int main()
    {
      const int N=500,M=100,H=24,W=24,D=2;
      // Ceiling division so an odd H keeps its last row. The kernel treats each
      // output plane as PH x PH, which matches because H == W here.
      const int PH=(H + D - 1) / D;
      // Element counts are kept apart from byte counts: new[] wants elements,
      // cudaMalloc/cudaMemcpy want bytes.
      const int image_count = N*M*H*W;
      const int out_count = N*M*PH*PH;
      const size_t image_size = image_count*sizeof(float);
      const size_t out_size = out_count*sizeof(float);
      const float mul_by=0.01f;
      float *input = new float[image_count];
      float *output = new float[out_count];
      float *dev_output = 0, *dev_input = 0;
      for (int i = 0; i<image_count; i++)
          input[i] = i*mul_by;

      bool ok = cudaOk(cudaMalloc((void**)&dev_output, out_size), "cudaMalloc(dev_output)")
             && cudaOk(cudaMalloc((void**)&dev_input, image_size), "cudaMalloc(dev_input)")
             && cudaOk(cudaMemcpy(dev_input, input, image_size, cudaMemcpyHostToDevice),
                       "cudaMemcpy(host to device)");
      if (ok)
      {
          dim3    grid(N, M);
          DWORD start_time=GetTickCount();
          MaxPool2d<<<grid,1>>>(dev_input,H,W,D,PH,dev_output);
          // A launch reports configuration errors only through cudaGetLastError();
          // the blocking copy below then surfaces any fault raised while running.
          ok = cudaOk(cudaGetLastError(), "MaxPool2d launch")
            && cudaOk(cudaMemcpy(output, dev_output, out_size, cudaMemcpyDeviceToHost),
                      "cudaMemcpy(device to host)");
          DWORD end_time=GetTickCount();
          if (ok)
          {
              std::cout<<"Cost: "<<end_time-start_time<<"ms."<<std::endl;
              for (int i = 0; i<10; i++)
                  std::cout << output[i] << std::endl;
          }
      }

      // cudaFree accepts a null pointer, so cleanup is safe after a failed malloc.
      cudaFree(dev_input);
      cudaFree(dev_output);
      delete[] output;
      delete[] input;
      system("pause");
      return ok ? 0 : 1;
    }
    
    /*
    Cost: 141ms.
    0.25
    0.27
    0.29
    0.31
    0.33
    0.35
    0.37
    0.39
    0.41
    0.43
    */
  • 相关阅读:
    Java文档注释
    Java程序基本框架
    Java文件手动编译执行步骤
    JDK安装中配置Path无效解决办法
    JDK安装配置
    Java简单介绍运行机制
    python代码注释
    python从hello world开始
    python,pycharm,anaconda之间的区别与联系
    python环境配置
  • 原文地址:https://www.cnblogs.com/qw12/p/6391402.html
Copyright © 2011-2022 走看看