zoukankan      html  css  js  c++  java
  • cudnn 卷积例子

    运行环境:linux cuda cudnn
    代码: 执行卷积操作 代码参考:https://gist.github.com/odashi/1c20ba90388cf02330e1b95963d78039 
    #include <iomanip>
    #include <iostream>
    #include <cstdlib>
    #include <vector>
    #include <stdio.h>
    #include <cuda.h>
    #include <cudnn_v7.h>
    
    #define CUDA_CALL(f) { 
      cudaError_t err = (f); 
      if (err != cudaSuccess) { 
        std::cout 
            << "    Error occurred: " << err << std::endl; 
        std::exit(1); 
      } 
    }
    
    #define CUDNN_CALL(f) { 
      cudnnStatus_t err = (f); 
      if (err != CUDNN_STATUS_SUCCESS) { 
        std::cout 
            << "    Error occurred: " << err << std::endl; 
        std::exit(1); 
      } 
    }
    
    __global__ void dev_const(float *px, float k) {
      int tid = threadIdx.x + blockIdx.x * blockDim.x;
      px[tid] = k;
    }
    
    __global__ void dev_iota(float *px) {
      int tid = threadIdx.x + blockIdx.x * blockDim.x;
      px[tid] = tid;
    }
    
    void print(const float *data, int n, int c, int h, int w) {
      std::vector<float> buffer(1 << 20);
      CUDA_CALL(cudaMemcpy(
            buffer.data(), data,
            n * c * h * w * sizeof(float),
            cudaMemcpyDeviceToHost));
      int a = 0;
      for (int i = 0; i < n; ++i) {
        for (int j = 0; j < c; ++j) {
          std::cout << "n=" << i << ", c=" << j << ":" << std::endl;
          for (int k = 0; k < h; ++k) {
            for (int l = 0; l < w; ++l) {
              std::cout << std::setw(4) << std::right << buffer[a];
              ++a;
            }
            std::cout << std::endl;
          }
        }
      }
      std::cout << std::endl;
    }
    
    int main() {
      cudnnHandle_t cudnn;
      CUDNN_CALL(cudnnCreate(&cudnn));
    
      // input
      const int in_n = 1;
      const int in_c = 1;
      const int in_h = 5;
      const int in_w = 5;
      std::cout << "in_n: " << in_n << std::endl;
      std::cout << "in_c: " << in_c << std::endl;
      std::cout << "in_h: " << in_h << std::endl;
      std::cout << "in_w: " << in_w << std::endl;
      std::cout << std::endl;
    
      cudnnTensorDescriptor_t in_desc;
      CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc));
      CUDNN_CALL(cudnnSetTensor4dDescriptor(
            in_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
            in_n, in_c, in_h, in_w));
    
      float *in_data;
      CUDA_CALL(cudaMalloc(
            &in_data, in_n * in_c * in_h * in_w * sizeof(float)));
    
      // filter
      const int filt_k = 1;
      const int filt_c = 1;
      const int filt_h = 2;
      const int filt_w = 2;
      std::cout << "filt_k: " << filt_k << std::endl;
      std::cout << "filt_c: " << filt_c << std::endl;
      std::cout << "filt_h: " << filt_h << std::endl;
      std::cout << "filt_w: " << filt_w << std::endl;
      std::cout << std::endl;
    
      cudnnFilterDescriptor_t filt_desc;
      CUDNN_CALL(cudnnCreateFilterDescriptor(&filt_desc));
      CUDNN_CALL(cudnnSetFilter4dDescriptor(
            filt_desc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW,
            filt_k, filt_c, filt_h, filt_w));
    
      float *filt_data;
      CUDA_CALL(cudaMalloc(
          &filt_data, filt_k * filt_c * filt_h * filt_w * sizeof(float)));
    
      // convolution
      const int pad_h = 1;
      const int pad_w = 1;
      const int str_h = 1;
      const int str_w = 1;
      const int dil_h = 1;
      const int dil_w = 1;
      std::cout << "pad_h: " << pad_h << std::endl;
      std::cout << "pad_w: " << pad_w << std::endl;
      std::cout << "str_h: " << str_h << std::endl;
      std::cout << "str_w: " << str_w << std::endl;
      std::cout << "dil_h: " << dil_h << std::endl;
      std::cout << "dil_w: " << dil_w << std::endl;
      std::cout << std::endl;
    
      cudnnConvolutionDescriptor_t conv_desc;
      CUDNN_CALL(cudnnCreateConvolutionDescriptor(&conv_desc));
      CUDNN_CALL(cudnnSetConvolution2dDescriptor(
            conv_desc,
            pad_h, pad_w, str_h, str_w, dil_h, dil_w,
            CUDNN_CONVOLUTION, CUDNN_DATA_FLOAT));
    
      // output
      int out_n;
      int out_c;
      int out_h;
      int out_w;
      
      CUDNN_CALL(cudnnGetConvolution2dForwardOutputDim(
            conv_desc, in_desc, filt_desc,
            &out_n, &out_c, &out_h, &out_w));
    
      std::cout << "out_n: " << out_n << std::endl;
      std::cout << "out_c: " << out_c << std::endl;
      std::cout << "out_h: " << out_h << std::endl;
      std::cout << "out_w: " << out_w << std::endl;
      std::cout << std::endl;
    
      cudnnTensorDescriptor_t out_desc;
      CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc));
      CUDNN_CALL(cudnnSetTensor4dDescriptor(
            out_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
            out_n, out_c, out_h, out_w));
    
      float *out_data;
      CUDA_CALL(cudaMalloc(
            &out_data, out_n * out_c * out_h * out_w * sizeof(float)));
    
      // algorithm
      cudnnConvolutionFwdAlgo_t algo;
      CUDNN_CALL(cudnnGetConvolutionForwardAlgorithm(
            cudnn,
            in_desc, filt_desc, conv_desc, out_desc,
            CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &algo));
    
      std::cout << "Convolution algorithm: " << algo << std::endl;
      std::cout << std::endl;
    
      // workspace
      size_t ws_size;
      CUDNN_CALL(cudnnGetConvolutionForwardWorkspaceSize(
            cudnn, in_desc, filt_desc, conv_desc, out_desc, algo, &ws_size));
    
      float *ws_data;
      CUDA_CALL(cudaMalloc(&ws_data, ws_size));
    
      std::cout << "Workspace size: " << ws_size << std::endl;
      std::cout << std::endl;
    
      // perform
      float alpha = 1.f;
      float beta = 0.f;
      dev_iota<<<in_w * in_h, in_n * in_c>>>(in_data);
      dev_const<<<filt_w * filt_h, filt_k * filt_c>>>(filt_data, 1.f);
      CUDNN_CALL(cudnnConvolutionForward(
          cudnn,
          &alpha, in_desc, in_data, filt_desc, filt_data,
          conv_desc, algo, ws_data, ws_size,
          &beta, out_desc, out_data));
    
      // results
      std::cout << "in_data:" << std::endl;
      print(in_data, in_n, in_c, in_h, in_w);
      
      std::cout << "filt_data:" << std::endl;
      print(filt_data, filt_k, filt_c, filt_h, filt_w);
      
      std::cout << "out_data:" << std::endl;
      print(out_data, out_n, out_c, out_h, out_w);
    
      // finalizing
      CUDA_CALL(cudaFree(ws_data));
      CUDA_CALL(cudaFree(out_data));
      CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc));
      CUDNN_CALL(cudnnDestroyConvolutionDescriptor(conv_desc));
      CUDA_CALL(cudaFree(filt_data));
      CUDNN_CALL(cudnnDestroyFilterDescriptor(filt_desc));
      CUDA_CALL(cudaFree(in_data));
      CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc));
      CUDNN_CALL(cudnnDestroy(cudnn));
      return 0;
    }
    

      



    运行:

    nvcc conv_cudnn.cu -lcudnn
    ./a.out
    

      

     
     

    <wiz_tmp_tag id="wiz-table-range-border" contenteditable="false" style="display: none;">

  • 相关阅读:
    SSWA jsp 函数 的页面
    fun_function.execute
    pa_transaction_interface_all
    EBS mo_glob_org_access_tmp表的分析
    R12将银行和分行都使用TCA管理
    ap_invoice_distributions_all到xla_ae_lines
    python使用uuid和guid
    js http请求
    DES加密
    wxPython操作图形用户界面
  • 原文地址:https://www.cnblogs.com/combfish/p/9259362.html
Copyright © 2011-2022 走看看