Caffe common operators: GPU vs. CPU comparison

    The operators below are the ones that appear repeatedly across LeNet, AlexNet, VGG16, GoogLeNet, ResNet and MLP (ReLU excluded); each section compares the CPU and GPU implementations in Caffe.

    PReLU

    CPU implementation:

      for (int i = 0; i < count; ++i) {

        int c = (i / dim) % channels / div_factor;

        top_data[i] = std::max(bottom_data[i], Dtype(0))

            + slope_data[c] * std::min(bottom_data[i], Dtype(0));

      }

    GPU kernel:

      for (int index = blockIdx.x * blockDim.x + threadIdx.x;

           index < (n); index += blockDim.x * gridDim.x) {

        int c = (index / dim) % channels / div_factor;

        out[index] = in[index] > 0 ? in[index] : in[index] * slope_data[c];

      }
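    Both versions implement the same per-channel rule, which is worth writing out once:

        PReLU(x) = max(x, 0) + a_c * min(x, 0)
                 = x         if x >  0
                 = a_c * x   if x <= 0

    where a_c is the learned slope for channel c. In Caffe's PReLU layer, div_factor is 1 when each channel has its own slope and equals channels when a single slope is shared, so in the shared case c collapses to 0 and the same slope is used everywhere.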

    Im2col

    CPU implementation:

    const int channel_size = height * width;

    for (int channel = channels; channel--; data_im += channel_size) {

      for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) {

        for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) {

          int input_row = -pad_h + kernel_row * dilation_h;

          for (int output_rows = output_h; output_rows; output_rows--) {

            if (!is_a_ge_zero_and_a_lt_b(input_row, height)) {

              for (int output_cols = output_w; output_cols; output_cols--) {

                *(data_col++) = 0;

              }

            } else {

              int input_col = -pad_w + kernel_col * dilation_w;

              for (int output_col = output_w; output_col; output_col--) {

                if (is_a_ge_zero_and_a_lt_b(input_col, width)) {

                  *(data_col++) = data_im[input_row * width + input_col];

                } else {

                  *(data_col++) = 0;

                }

                input_col += stride_w;

              }

            }

            input_row += stride_h;

          }

        }

      }

    }

    GPU kernel:

    for (int index = blockIdx.x * blockDim.x + threadIdx.x;

         index < (n); index += blockDim.x * gridDim.x) {

        const int h_index = index / width_col;

        const int h_col = h_index % height_col;

        const int w_col = index % width_col;

        const int c_im = h_index / height_col;

        const int c_col = c_im * kernel_h * kernel_w;

        const int h_offset = h_col * stride_h - pad_h;

        const int w_offset = w_col * stride_w - pad_w;

        Dtype* data_col_ptr = data_col;

        data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;

        const Dtype* data_im_ptr = data_im;

        data_im_ptr += (c_im * height + h_offset) * width + w_offset;

        for (int i = 0; i < kernel_h; ++i) {

          for (int j = 0; j < kernel_w; ++j) {

            int h_im = h_offset + i * dilation_h;

            int w_im = w_offset + j * dilation_w;

            *data_col_ptr =

                (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?

                data_im_ptr[i * dilation_h * width + j * dilation_w] : 0;

            data_col_ptr += height_col * width_col;

          }

        }

      }

    Host-side launch:

      int num_kernels = channels * height_col * width_col;

      im2col_gpu_kernel<Dtype><<<(num_kernels+511)/512, 512>>>( … …)
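    The launch configuration is a ceiling division: num_kernels work items (one thread per input channel per output pixel) are covered by 512-thread blocks. A minimal sketch of the same computation (the helper name is illustrative; Caffe itself wraps this in its CAFFE_GET_BLOCKS helper with its configured threads-per-block count):

        // Number of 512-thread blocks needed to cover num_kernels work items.
        inline int blocks_for(int num_kernels, int threads_per_block = 512) {
          return (num_kernels + threads_per_block - 1) / threads_per_block;
        }
        // Example: channels = 3, height_col = width_col = 224
        //   num_kernels = 3 * 224 * 224 = 150528  ->  150528 / 512 = 294 blocks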

    Pooling operator

    CPU implementation (the AVE branch is shown; MAX is similar, and the Stochastic mode is omitted):

    for (int n = 0; n < bottom[0]->num(); ++n) {

      for (int c = 0; c < channels_; ++c) {

        for (int ph = 0; ph < pooled_height_; ++ph) {

          for (int pw = 0; pw < pooled_width_; ++pw) {

            int hstart = ph * stride_h_ - pad_h_;

            int wstart = pw * stride_w_ - pad_w_;

            int hend = min(hstart + kernel_h_, height_);

            int wend = min(wstart + kernel_w_, width_);

            hstart = max(hstart, 0);

            wstart = max(wstart, 0);

            const int pool_index = ph * pooled_width_ + pw;

            // top_data is assumed to have been zero-filled before this loop.

            const int pool_size = (hend - hstart) * (wend - wstart);

            for (int h = hstart; h < hend; ++h) {

              for (int w = wstart; w < wend; ++w) {   // AVE: accumulate over the window

                top_data[pool_index] += bottom_data[h * width_ + w];

              }

            }

            top_data[pool_index] /= pool_size;   // AVE: divide by the (clamped) window size

          }

        }

        // compute offset

        bottom_data += bottom[0]->offset(0, 1);

        top_data += top[0]->offset(0, 1);

      }

    }

    GPU kernel (the excerpt below is from the max-pooling backward pass; it gathers each input pixel's gradient from the pooled outputs whose stored argmax points at it):

    const int w = index % width;

    const int h = (index / width) % height;

    const int c = (index / width / height) % channels;

    const int n = index / width / height / channels;

    const int phstart =

             (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;

    const int phend = min((h + pad_h) / stride_h + 1, pooled_height);

    const int pwstart =

             (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;

    const int pwend = min((w + pad_w) / stride_w + 1, pooled_width);

    Dtype gradient = 0;

    const int offset = (n * channels + c) * pooled_height * pooled_width;

    const Dtype* const top_diff_slice = top_diff + offset;

    if (mask) {

    const int* const mask_slice = mask + offset;

      for (int ph = phstart; ph < phend; ++ph) {

       for (int pw = pwstart; pw < pwend; ++pw) {

          if (mask_slice[ph * pooled_width + pw] == h * width + w) {

             gradient += top_diff_slice[ph * pooled_width + pw];

          }

        }

    }

    }

    bottom_diff[index] = gradient;
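    The phstart/phend (and pwstart/pwend) bounds invert the pooling window: output row ph reads input rows [ph * stride_h - pad_h, ph * stride_h - pad_h + kernel_h), so the output rows whose windows can contain input row h satisfy

        ph * stride_h - pad_h <= h  <  ph * stride_h - pad_h + kernel_h
        =>  (h + pad_h - kernel_h) / stride_h  <  ph  <=  (h + pad_h) / stride_h

    which, after integer rounding and clamping to [0, pooled_height), is exactly the phstart/phend computed above; the gradient of each bottom pixel is then accumulated from every pooled output whose mask entry points at it.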

    FC operator (InnerProduct)

    CPU implementation (the GPU version has the same structure, with caffe_gpu_gemm in place of caffe_cpu_gemm):

    caffe_cpu_gemm<Dtype>(CblasNoTrans, transpose_ ? CblasNoTrans : CblasTrans,
        M_, N_, K_, (Dtype)1., bottom_data, weight, (Dtype)0., top_data);

    if (bias_term_) {

      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype)1.,
          bias_multiplier_.cpu_data(), this->blobs_[1]->cpu_data(), (Dtype)1.,
          top_data);

    }
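    In GEMM terms, with M_ = batch size, K_ = input dimension and N_ = output dimension, the two calls compute

        top (M_ x N_)  =  bottom (M_ x K_) * weight^T (K_ x N_)
        top           +=  bias_multiplier (M_ x 1) * bias (1 x N_)

    i.e. a plain fully-connected layer top = bottom * W^T + b; the weight blob is stored as N_ x K_ (hence the CblasTrans) unless transpose_ is set, and bias_multiplier_ is a vector of ones that broadcasts the bias to every row of the batch.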

    Dropout operator

    CPU implementation (the GPU kernel is not excerpted here):

    // Create random numbers

    caffe_rng_bernoulli(count, 1. - threshold_, mask);

    for (int i = 0; i < count; ++i) {

      top_data[i] = bottom_data[i] * mask[i] * scale_;

    }
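    The snippet does not show how scale_ is set; assuming the usual inverted-dropout factor scale_ = 1 / (1 - threshold_), each mask entry is 1 with probability 1 - threshold_ (which is what the Bernoulli draw above produces) and the scaling keeps the expected activation unchanged at training time:

        E[top[i]] = bottom[i] * P(mask[i] = 1) * scale_
                  = bottom[i] * (1 - threshold_) * 1 / (1 - threshold_)
                  = bottom[i]

    so no rescaling is needed at test time, where the layer simply copies its input through.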

      

    Softmax operator

    CPU implementation:

    for (int i = 0; i < outer_num_; ++i) {

      // initialize scale_data to the first plane

      caffe_copy(inner_num_, bottom_data + i * dim, scale_data);

      for (int j = 0; j < channels; j++) {

        for (int k = 0; k < inner_num_; k++) {

          scale_data[k] = std::max(scale_data[k],

            bottom_data[i * dim + j * inner_num_ + k]);

        }

      }

      // subtraction

     caffe_cpu_gemm<Dtype>(CblasNoTrans,CblasNoTrans,channels, inner_num_,

            1, -1., sum_multiplier_.cpu_data(), scale_data, 1., top_data);

      // exponentiation

      caffe_exp<Dtype>(dim, top_data, top_data);

      // sum after exp

      caffe_cpu_gemv<Dtype>(CblasTrans, channels, inner_num_, 1.,

          top_data, sum_multiplier_.cpu_data(), 0., scale_data);

      // division

      for (int j = 0; j < channels; j++) {

        caffe_div(inner_num_, top_data, scale_data, top_data);

        top_data += inner_num_;

      }

    }

    GPU kernel (kernel_channel_sum shown as an example):

    int n = index / spatial_dim;

    int s = index % spatial_dim;

    Dtype sum = 0;

    for (int c = 0; c < channels; ++c) {

      sum += data[(n * channels + c) * spatial_dim + s];

    }

    channel_sum[index] = sum;

    Host-side launch:

    kernel_channel_max<Dtype><<<CAFFE_GET_BLOCKS(outer_num_ * inner_num_),
        CAFFE_CUDA_NUM_THREADS>>>(outer_num_, channels, inner_num_, top_data, scale_data);

    kernel_channel_subtract<Dtype><<<CAFFE_GET_BLOCKS(count),
        CAFFE_CUDA_NUM_THREADS>>>(count, outer_num_, channels, inner_num_, scale_data, top_data);

    kernel_exp<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
        count, top_data, top_data);

    kernel_channel_sum<Dtype><<<CAFFE_GET_BLOCKS(outer_num_ * inner_num_),
        CAFFE_CUDA_NUM_THREADS>>>(outer_num_, channels, inner_num_, top_data, scale_data);

    kernel_channel_div<Dtype><<<CAFFE_GET_BLOCKS(count),
        CAFFE_CUDA_NUM_THREADS>>>(count, outer_num_, channels, inner_num_, scale_data, top_data);
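    The five launches compute the numerically stable softmax per (n, s) position across the channel axis:

        m[n, s]    = max_c x[n, c, s]
        y[n, c, s] = exp(x[n, c, s] - m[n, s]) / sum_c' exp(x[n, c', s] - m[n, s])

    kernel_channel_max computes m, kernel_channel_subtract and kernel_exp form the shifted exponentials, kernel_channel_sum reduces them over channels, and kernel_channel_div performs the final division; subtracting the per-position maximum does not change the result but keeps exp() from overflowing.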

    Sigmoid operator

    CPU implementation:

    for (int i = 0; i < count; ++i) {

        top_data[i] = sigmoid(bottom_data[i]);

    }

    The sigmoid() helper:

    inline Dtype sigmoid(Dtype x) {

      return 0.5 * tanh(0.5 * x) + 0.5;

    }

    GPU kernel (one line per element):

    out[index] = 0.5 * tanh(0.5 * in[index]) + 0.5;
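    The tanh form is the same function as the familiar logistic expression, just written around a single well-behaved primitive:

        0.5 * tanh(0.5 * x) + 0.5
          = 0.5 * (e^{x/2} - e^{-x/2}) / (e^{x/2} + e^{-x/2}) + 0.5
          = e^{x/2} / (e^{x/2} + e^{-x/2})
          = 1 / (1 + e^{-x})

    so both the CPU helper and the GPU kernel compute sigmoid(x) = 1 / (1 + e^{-x}).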

    LRN operator

    CPU implementation (the GPU version is the same); the layer chains the Forward calls of several sub-layers:

    split_layer_->Forward(bottom, split_top_vec_);

    square_layer_->Forward(square_bottom_vec_, square_top_vec_);//PowerLayer

    pool_layer_->Forward(square_top_vec_, pool_top_vec_);

    power_layer_->Forward(pool_top_vec_, power_top_vec_);

    product_layer_->Forward(product_bottom_vec_, top);//EltwiseLayer
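    Chained together, the sub-layer calls compute (roughly, up to Caffe's exact parameterization of k, alpha and beta in this composite path)

        b_i = a_i * (k + alpha * avg_window(a^2)_i)^(-beta)

    split duplicates the input, the square PowerLayer forms a^2, the (average) pooling step forms the windowed mean of a^2, the second PowerLayer raises (k + alpha * mean) to the power -beta, and the EltwiseLayer product multiplies the result back onto the original activations.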

    Split operator

    CPU implementation (the GPU version is the same):

    for (int i = 0; i < top.size(); ++i) {

        top[i]->ShareData(*bottom[0]);

    }

    ShareData

    void Blob<Dtype>::ShareData(const Blob& other) {

      CHECK_EQ(count_, other.count());

      data_ = other.data();

    }

    Eltwise operator

    CPU implementation (the MAX branch is shown):

    // Initialize

        mask = max_idx_.mutable_cpu_data();

        caffe_set(count, -1, mask);

        caffe_set(count, Dtype(-FLT_MAX), top_data);

        // bottom 0 & 1

        bottom_data_a = bottom[0]->cpu_data();

        bottom_data_b = bottom[1]->cpu_data();

        for (int idx = 0; idx < count; ++idx) {

          if (bottom_data_a[idx] > bottom_data_b[idx]) {

            top_data[idx] = bottom_data_a[idx];  // maxval

            mask[idx] = 0;  // maxid

          } else {

            top_data[idx] = bottom_data_b[idx];  // maxval

            mask[idx] = 1;  // maxid

          }

        }

        // bottom 2++

        for (int blob_idx = 2; blob_idx < bottom.size(); ++blob_idx) {

          bottom_data_b = bottom[blob_idx]->cpu_data();

          for (int idx = 0; idx < count; ++idx) {

            if (bottom_data_b[idx] > top_data[idx]) {

              top_data[idx] = bottom_data_b[idx];  // maxval

              mask[idx] = blob_idx;  // maxid

            }

          }

        }

    GPU kernel:

    Dtype maxval = -FLT_MAX;

    int maxidx = -1;

    if (bottom_data_a[index] > bottom_data_b[index]) {

      // only update for very first bottom_data blob (blob_idx == 0)

      if (blob_idx == 0) {

        maxval = bottom_data_a[index];

        top_data[index] = maxval;

        maxidx = blob_idx;

        mask[index] = maxidx;

      }

    } else {

      maxval = bottom_data_b[index];

      top_data[index] = maxval;

      maxidx = blob_idx + 1;

      mask[index] = maxidx;

    }

    Host-side launch:

    mask = max_idx_.mutable_gpu_data();

    MaxForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
        count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), 0, top_data, mask);

    for (int i = 2; i < bottom.size(); ++i) {

      MaxForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
          count, top_data, bottom[i]->gpu_data(), i - 1, top_data, mask);

    }
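    The blob_idx argument is what keeps the mask meaningful across the repeated launches; a hand trace (illustrative, based only on the kernel above):

        // First launch: MaxForward(count, bottom[0], bottom[1], /*blob_idx=*/0, top, mask)
        //   bottom[0][j] >  bottom[1][j]  ->  top[j] = bottom[0][j], mask[j] = 0
        //   otherwise                     ->  top[j] = bottom[1][j], mask[j] = 0 + 1 = 1
        // Later launches (i = 2, 3, ...): MaxForward(count, top, bottom[i], i - 1, top, mask)
        //   the running max still wins    ->  nothing is written (blob_idx != 0 branch)
        //   bottom[i][j] wins             ->  top[j] = bottom[i][j], mask[j] = (i - 1) + 1 = i

    so after the loop mask[j] holds the index of the bottom blob that produced the maximum at position j, matching the CPU version.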

    SigmoidCrossEntropyLoss operator

    CPU implementation:

    for (int i = 0; i < bottom[0]->count(); ++i) {

      const int target_value = static_cast<int>(target[i]);

      if (has_ignore_label_ && target_value == ignore_label_) {

        continue;

      }

      loss -= input_data[i] * (target[i] - (input_data[i] >= 0)) -

          log(1 + exp(input_data[i] - 2 * input_data[i] * (input_data[i] >= 0)));

      ++valid_count;

    }

    // get_normalizer picks the divisor for the chosen normalization mode and clamps it: max(Dtype(1.0), normalizer)

    normalizer_ = get_normalizer(normalization_, valid_count);

    top[0]->mutable_cpu_data()[0] = loss / normalizer_;
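    The expression inside the loop is the usual binary cross-entropy written in an overflow-safe form. With x = input_data[i], t = target[i], s = (x >= 0) and sigmoid(x) = 1 / (1 + e^{-x}):

        -[t * log(sigmoid(x)) + (1 - t) * log(1 - sigmoid(x))]
          = x * (s - t) + log(1 + e^{x - 2*x*s})
          =  x - t*x + log(1 + e^{-x})   when x >= 0 (s = 1)
          = -t*x     + log(1 + e^{x})    when x <  0 (s = 0)

    which is exactly the term subtracted from loss above (and the term the GPU kernel stores per element); the exponent x - 2*x*s is always non-positive, so exp() never overflows.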

    GPU kernel:

    const int target_value = static_cast<int>(target[i]);

    if (has_ignore_label_ && target_value == ignore_label_) {

      loss[i] = 0;

      counts[i] = 0;

    } else {

      loss[i] = input_data[i] * (target[i] - (input_data[i] >= 0)) -

          log(1 + exp(input_data[i] - 2 * input_data[i] *

          (input_data[i] >= 0)));

      counts[i] = 1;

    }

    Host-side launch:

    SigmoidCrossEntropyLossForwardGPU<Dtype><<<CAFFE_GET_BLOCKS(count),
        CAFFE_CUDA_NUM_THREADS>>>(count, input_data, target, loss_data,
        has_ignore_label_, ignore_label_, count_data);

    // Only launch another CUDA kernel if we actually need the valid count.

    if (normalization_ == LossParameter_NormalizationMode_VALID &&

        has_ignore_label_) {

      caffe_gpu_asum(count, count_data, &valid_count);

    } else {

      valid_count = count;

    }

    Scale operator

    CPU implementation:

    for (int n = 0; n < outer_dim_; ++n) {

        for (int d = 0; d < scale_dim_; ++d) {

          const Dtype factor = scale_data[d];

          caffe_cpu_scale(inner_dim_, factor, bottom_data, top_data);

          bottom_data += inner_dim_;

          top_data += inner_dim_;

        }

    }

    caffe_cpu_scale (computes Y = alpha * X):

    for (int i = 0; i < N; ++i) {

      Y[i] = alpha * X[i];

    }

    GPU kernel:

    const int scale_index = (index / inner_dim) % scale_dim;

    out[index] = in[index] * scale[scale_index];
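    Both versions apply the same broadcast. For a blob laid out as (outer_dim_, scale_dim_, inner_dim_) the flat index decomposes as

        index = (n * scale_dim + d) * inner_dim + i
        =>  d = (index / inner_dim) % scale_dim

    so the GPU kernel recovers the scale coefficient d per element, while the CPU version walks the same (n, d) slices in order and multiplies each inner_dim_-long stretch by one factor with a single caffe_cpu_scale call.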

    BatchNorm operator

    CPU implementation: the code is long; a detailed walkthrough is deferred.

    GPU version: the logic mirrors the CPU version, with every math routine swapped for its caffe_gpu_* counterpart (caffe_gpu_gemm and the like).
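    Since the full source is deferred, here is a minimal sketch of what the forward pass computes in inference mode, written with plain loops instead of the caffe_cpu_gemm/caffe_gpu_gemm broadcasting the real layer uses (variable names are illustrative):

        // Per-channel normalization over an N x C x H x W blob (Caffe layout):
        //   top = (bottom - mean_c) / sqrt(var_c + eps)
        // mean[] and var[] stand for the running statistics the layer stores.
        for (int n = 0; n < num; ++n) {
          for (int c = 0; c < channels; ++c) {
            const float inv_std = 1.0f / std::sqrt(var[c] + eps);  // needs <cmath>
            for (int i = 0; i < spatial_dim; ++i) {
              const int idx = (n * channels + c) * spatial_dim + i;
              top[idx] = (bottom[idx] - mean[c]) * inv_std;
            }
          }
        }

    Note that Caffe's BatchNormLayer only normalizes; the learned scale and shift are applied by a following ScaleLayer, the operator covered just above.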

Original article: https://www.cnblogs.com/jourluohua/p/9673572.html