1.BaseConvolutionLayer & ConvolutionLayer
/// @brief The spatial dimensions of a filter kernel. // kernel的形状 = [kernel_h, kernel_w] Blob<int> kernel_shape_; /// @brief The spatial dimensions of the stride. // 步长形状 = [stride_h, stride_w] Blob<int> stride_; /// @brief The spatial dimensions of the padding. // pad的形状 = [pad_h, pad_w] Blob<int> pad_; /// @brief The spatial dimensions of the convolution input. // 卷积的输入形状 = [输入图像通道数, 输入图像h, 输入图像w] Blob<int> conv_input_shape_; /// @brief The spatial dimensions of the col_buffer. // col_buffer的形状 = [kernel_dim_, conv_out_spatial_dim_ ] vector<int> col_buffer_shape_; /// @brief The spatial dimensions of the output. // 输出的形状 vector<int> output_shape_; // 输入的形状 const vector<int>* bottom_shape_; // 空间轴个数 int num_spatial_axes_; // 输入度维度 = 输入图像通道数*输入图像的h*输入图像w int bottom_dim_; // 输出维度 = 输出通道数*输出h*输出w int top_dim_; // 输入图像的第几个轴是通道 int channel_axis_; // batchsize int num_; // 输入图像的通道数 int channels_; // 卷积组的大小 int group_; // 输出空间维度 = 卷积之后的图像长*卷积之后图像的宽 int out_spatial_dim_; // 使用卷积组用到的 int weight_offset_; // 卷积后的图像的通道数 int num_output_; // 是否启用偏置 bool bias_term_; // 是不是1x1卷积 bool is_1x1_; // 强制使用n维通用卷积 bool force_nd_im2col_; // conv_in_channels_ * conv_out_spatial_dim_ int num_kernels_im2col_; // num_kernels_col2im_ = reverse_dimensions() ? top_dim_ : bottom_dim_ int num_kernels_col2im_; // 卷积的输出通道数 ,在参数配置文件中设置 int conv_out_channels_; // 卷积的输入通道数 (即输入图像的通道数) int conv_in_channels_; // 卷积的输出的空间维度 = 卷积后图像h*卷积后图像w int conv_out_spatial_dim_; // 卷积核的维度 = 输入图像的维度*卷积核的h*卷积核的w int kernel_dim_; // 在使用gropu参数的时候使用的offset int col_offset_; int output_offset_; // im2col的时候使用的存储空间 Blob<Dtype> col_buffer_; // 将偏置扩展成矩阵的东东 Blob<Dtype> bias_multiplier_;
public:
  // Constructor: forwards the layer parameter to the Layer<Dtype> base class
  // and does nothing else.
  explicit BaseConvolutionLayer(const LayerParameter& param)
      : Layer<Dtype>(param) {}
  // Initialization hook (declaration only; the definition is not part of this
  // excerpt).
  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  // Re-shaping hook (declaration only; the definition is not part of this
  // excerpt).
  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
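GEMM 的全称是 General Matrix-Matrix Multiply(通用矩阵乘法)。其基本形式如下:$C \leftarrow \alpha \cdot \mathrm{op}(A)\,\mathrm{op}(B) + \beta \cdot C$,其中 $\mathrm{op}(X)$ 表示 $X$ 本身或其转置 $X^{T}$,$\alpha$、$\beta$ 为标量。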

/**
 * @brief Forward convolution of one input item, expressed as one GEMM per
 *        group.
 *
 * Unless the layer is a 1x1 convolution, the input is first unrolled into
 * col_buffer_ with conv_im2col_cpu(); for 1x1 convolutions the input is used
 * directly as the column matrix.
 *
 * @param input       Input data of the item to convolve.
 * @param weights     Filter weights; group g starts at weight_offset_ * g.
 * @param output      Destination; group g is written at output_offset_ * g.
 *                    The GEMM is issued with beta = 0, so the previous
 *                    content is not accumulated into the result.
 * @param skip_im2col When true, col_buffer_ is assumed to already hold the
 *                    unrolled input and im2col is not repeated. Callers also
 *                    invoke this function with three arguments, so the
 *                    default value is presumably supplied by the in-class
 *                    declaration (not visible here) -- it must not be
 *                    repeated on this definition.
 */
template <typename Dtype>
void BaseConvolutionLayer<Dtype>::forward_cpu_gemm(const Dtype* input,
    const Dtype* weights, Dtype* output, bool skip_im2col) {
  const Dtype* col_buff = input;
  if (!is_1x1_) {
    if (!skip_im2col) {
      conv_im2col_cpu(input, col_buffer_.mutable_cpu_data());
    }
    col_buff = col_buffer_.cpu_data();
  }
  // Rows of the per-group weight matrix; loop-invariant.
  const int out_channels_per_group = conv_out_channels_ / group_;
  for (int g = 0; g < group_; ++g) {
    // output_g = weights_g * col_buff_g
    caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans,
        out_channels_per_group, conv_out_spatial_dim_, kernel_dim_,
        (Dtype)1., weights + weight_offset_ * g, col_buff + col_offset_ * g,
        (Dtype)0., output + output_offset_ * g);
  }
}

/**
 * @brief Adds the bias to an already computed output item.
 *
 * Issues a GEMM with inner dimension 1 between `bias` and
 * bias_multiplier_.cpu_data(), with beta = 1 so the product is accumulated
 * onto the existing content of `output`.
 *
 * @param output Output of one item; updated in place.
 * @param bias   Bias values (num_output_ entries are read).
 */
template <typename Dtype>
void BaseConvolutionLayer<Dtype>::forward_cpu_bias(Dtype* output,
    const Dtype* bias) {
  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num_output_,
      out_spatial_dim_, 1, (Dtype)1., bias, bias_multiplier_.cpu_data(),
      (Dtype)1., output);
}
inline void conv_im2col_cpu(const Dtype* data, Dtype* col_buff) { if (!force_nd_im2col_ && num_spatial_axes_ == 2) { im2col_cpu(data, conv_in_channels_, conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2], kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1], pad_.cpu_data()[0], pad_.cpu_data()[1], stride_.cpu_data()[0], stride_.cpu_data()[1], dilation_.cpu_data()[0], dilation_.cpu_data()[1], col_buff); } else { im2col_nd_cpu(data, num_spatial_axes_, conv_input_shape_.cpu_data(), col_buffer_shape_.data(), kernel_shape_.cpu_data(), pad_.cpu_data(), stride_.cpu_data(), dilation_.cpu_data(), col_buff); } } inline void conv_col2im_cpu(const Dtype* col_buff, Dtype* data) { if (!force_nd_im2col_ && num_spatial_axes_ == 2) { col2im_cpu(col_buff, conv_in_channels_, conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2], kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1], pad_.cpu_data()[0], pad_.cpu_data()[1], stride_.cpu_data()[0], stride_.cpu_data()[1], dilation_.cpu_data()[0], dilation_.cpu_data()[1], data); } else { col2im_nd_cpu(col_buff, num_spatial_axes_, conv_input_shape_.cpu_data(), col_buffer_shape_.data(), kernel_shape_.cpu_data(), pad_.cpu_data(), stride_.cpu_data(), dilation_.cpu_data(), data); } }
// CPU forward pass: for every bottom/top pair, each of the num_ items is
// convolved with the layer weights (blobs_[0]) and, when the bias term is
// enabled, the bias (blobs_[1]) is added to that item's output.
template <typename Dtype>
void ConvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const Dtype* const filters = this->blobs_[0]->cpu_data();
  for (int blob_idx = 0; blob_idx < bottom.size(); ++blob_idx) {
    const Dtype* const in_base = bottom[blob_idx]->cpu_data();
    Dtype* const out_base = top[blob_idx]->mutable_cpu_data();
    for (int item = 0; item < this->num_; ++item) {
      // Items are laid out back to back: bottom_dim_ / top_dim_ elements each.
      const Dtype* const in_item = in_base + item * this->bottom_dim_;
      Dtype* const out_item = out_base + item * this->top_dim_;
      this->forward_cpu_gemm(in_item, filters, out_item);
      if (!this->bias_term_) {
        continue;
      }
      const Dtype* const bias_values = this->blobs_[1]->cpu_data();
      this->forward_cpu_bias(out_item, bias_values);
    }
  }
}
// CPU backward pass. For every top/bottom pair this computes, as requested:
//   - the bias gradient   (bias_term_ && param_propagate_down_[1]),
//   - the weight gradient (param_propagate_down_[0]),
//   - the bottom gradient (propagate_down[i]).
// All three are evaluated item by item over the num_ items of the blob.
template <typename Dtype>
void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  const Dtype* const filters = this->blobs_[0]->cpu_data();
  Dtype* const filters_grad = this->blobs_[0]->mutable_cpu_diff();
  for (int blob_idx = 0; blob_idx < top.size(); ++blob_idx) {
    const Dtype* const out_grad = top[blob_idx]->cpu_diff();
    const Dtype* const in_data = bottom[blob_idx]->cpu_data();
    Dtype* const in_grad = bottom[blob_idx]->mutable_cpu_diff();

    // Bias gradient, only when the layer has a bias that needs it.
    const bool want_bias_grad =
        this->bias_term_ && this->param_propagate_down_[1];
    if (want_bias_grad) {
      Dtype* const bias_grad = this->blobs_[1]->mutable_cpu_diff();
      for (int item = 0; item < this->num_; ++item) {
        this->backward_cpu_bias(bias_grad, out_grad + item * this->top_dim_);
      }
    }

    const bool want_weight_grad = this->param_propagate_down_[0];
    const bool want_input_grad = propagate_down[blob_idx];
    if (!want_weight_grad && !want_input_grad) {
      continue;
    }
    for (int item = 0; item < this->num_; ++item) {
      const Dtype* const out_grad_item = out_grad + item * this->top_dim_;
      // Weight gradient; contributions of all items go to the same buffer.
      if (want_weight_grad) {
        this->weight_cpu_gemm(in_data + item * this->bottom_dim_,
            out_grad_item, filters_grad);
      }
      // Gradient with respect to the bottom data.
      if (want_input_grad) {
        this->backward_cpu_gemm(out_grad_item, filters,
            in_grad + item * this->bottom_dim_);
      }
    }
  }
}
反向传播时,依次计算损失对 bias、weight 和 bottom 的导数(梯度),并把对 bottom 的梯度继续向后(前一层)传递。
// Gradient with respect to the layer input for one item. Per group, a GEMM
// with the weight matrix transposed is applied to `output` and written into
// the column buffer (or straight into `input` for 1x1 convolutions); for
// non-1x1 convolutions the column buffer is then folded back into `input`
// with conv_col2im_cpu().
template <typename Dtype>
void BaseConvolutionLayer<Dtype>::backward_cpu_gemm(const Dtype* output,
    const Dtype* weights, Dtype* input) {
  // The scratch buffer is requested unconditionally, as the layer always did.
  Dtype* const scratch = col_buffer_.mutable_cpu_data();
  Dtype* const col_target = is_1x1_ ? input : scratch;
  const int out_channels_per_group = conv_out_channels_ / group_;
  for (int grp = 0; grp < group_; ++grp) {
    const Dtype* const group_weights = weights + weight_offset_ * grp;
    const Dtype* const group_output = output + output_offset_ * grp;
    Dtype* const group_cols = col_target + col_offset_ * grp;
    caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, kernel_dim_,
        conv_out_spatial_dim_, out_channels_per_group,
        (Dtype)1., group_weights, group_output,
        (Dtype)0., group_cols);
  }
  if (is_1x1_) {
    return;
  }
  conv_col2im_cpu(col_target, input);
}

// Gradient with respect to the weights for one item. The input is unrolled
// with im2col (skipped for 1x1 convolutions); per group, a GEMM with the
// column matrix transposed is then added onto `weights` (beta = 1), so
// repeated calls accumulate.
template <typename Dtype>
void BaseConvolutionLayer<Dtype>::weight_cpu_gemm(const Dtype* input,
    const Dtype* output, Dtype* weights) {
  const Dtype* columns = input;
  if (!is_1x1_) {
    conv_im2col_cpu(input, col_buffer_.mutable_cpu_data());
    columns = col_buffer_.cpu_data();
  }
  const int out_channels_per_group = conv_out_channels_ / group_;
  for (int grp = 0; grp < group_; ++grp) {
    const Dtype* const group_output = output + output_offset_ * grp;
    const Dtype* const group_cols = columns + col_offset_ * grp;
    Dtype* const group_weights = weights + weight_offset_ * grp;
    caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasTrans, out_channels_per_group,
        kernel_dim_, conv_out_spatial_dim_,
        (Dtype)1., group_output, group_cols,
        (Dtype)1., group_weights);
  }
}

// Gradient with respect to the bias for one item: a matrix-vector product of
// `input` (num_output_ x out_spatial_dim_) with bias_multiplier_, added onto
// `bias` (beta = 1).
template <typename Dtype>
void BaseConvolutionLayer<Dtype>::backward_cpu_bias(Dtype* bias,
    const Dtype* input) {
  const Dtype* const multiplier = bias_multiplier_.cpu_data();
  caffe_cpu_gemv<Dtype>(CblasNoTrans, num_output_, out_spatial_dim_, 1.,
      input, multiplier, 1., bias);
}
在 Caffe 中,卷积运算先对输入数据做 im2col 展开,再与权重做矩阵乘法(即内积运算,inner product)。这样做比直接按定义逐窗口计算卷积更快。
下图来自论文《High Performance Convolutional Neural Networks for Document Processing》,图示了 Caffe 中卷积计算的原理。
def conv(a, v, full=0):
    """Slide kernel ``v`` over the 2-D array ``a`` and sum the products.

    The kernel is applied as-is (it is not flipped), with a stride of 1.

    Args:
        a: 2-D array-like input image.
        v: 2-D array-like kernel.
        full: 0 (default) for "valid" mode -- the kernel stays inside ``a``;
            any truthy value for "full" mode -- ``a`` is first zero-padded by
            ``kernel_height - 1`` rows and ``kernel_width - 1`` columns on
            every side.

    Returns:
        A nested list (rows of NumPy scalars) holding one windowed sum per
        kernel position.
    """
    # Accept plain nested lists as well as ndarrays; the 2-D slicing below
    # needs ndarray semantics.
    a = np.asarray(a)
    v = np.asarray(v)
    ah, aw = np.shape(a)
    vh, vw = np.shape(v)
    # Shape trace kept from the original, written so it also runs on Python 3.
    print(ah, aw, vh, vw)
    if full:
        padded = np.zeros((ah + 2 * vh - 2, aw + 2 * vw - 2))
        padded[vh - 1:vh - 1 + ah, vw - 1:vw - 1 + aw] = a
        a = padded
        ah, aw = np.shape(a)
    return [[np.sum(np.multiply(a[i:i + vh, j:j + vw], v))
             for j in range(aw - vw + 1)]
            for i in range(ah - vh + 1)]
/**
 * @brief Unrolls an image into column form (dilation-aware).
 *
 * The image is cut into kernel-sized windows and each window is laid out as
 * one column. Row c_col of data_col corresponds to one (channel, kernel row,
 * kernel column) triple and holds that kernel tap for every output position:
 * data_col[(c_col * height_col + h_col) * width_col + w_col]. Taps that fall
 * into the padding are written as 0.
 *
 * This is the signature used by conv_im2col_cpu(); explicit instantiations
 * kept elsewhere may need a matching entry for it.
 *
 * @param data_im    Input image, channels x height x width, row-major.
 * @param channels   Number of image channels.
 * @param height     Image height.
 * @param width      Image width.
 * @param kernel_h   Kernel height.
 * @param kernel_w   Kernel width.
 * @param pad_h      Zero padding above and below.
 * @param pad_w      Zero padding left and right.
 * @param stride_h   Vertical stride.
 * @param stride_w   Horizontal stride.
 * @param dilation_h Vertical spacing between kernel taps (1 = dense kernel).
 * @param dilation_w Horizontal spacing between kernel taps (1 = dense kernel).
 * @param data_col   Output buffer of
 *                   (channels * kernel_h * kernel_w) * height_col * width_col
 *                   elements.
 */
template <typename Dtype>
void im2col_cpu(const Dtype* data_im, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w, Dtype* data_col) {
  // A dilated kernel covers dilation * (kernel - 1) + 1 pixels per axis.
  const int height_col =
      (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
  const int width_col =
      (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
  const int channels_col = channels * kernel_h * kernel_w;
  for (int c_col = 0; c_col < channels_col; ++c_col) {
    // Decompose the column-matrix row into (channel, kernel row, kernel col).
    const int w_offset = c_col % kernel_w;
    const int h_offset = (c_col / kernel_w) % kernel_h;
    const int c_im = c_col / kernel_h / kernel_w;
    for (int h_col = 0; h_col < height_col; ++h_col) {
      const int h_im = h_col * stride_h - pad_h + h_offset * dilation_h;
      for (int w_col = 0; w_col < width_col; ++w_col) {
        const int w_im = w_col * stride_w - pad_w + w_offset * dilation_w;
        data_col[(c_col * height_col + h_col) * width_col + w_col] =
            (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
            data_im[(c_im * height + h_im) * width + w_im] : 0;
      }
    }
  }
}

/**
 * @brief Unrolls an image into column form with a dense (undilated) kernel.
 *
 * Original 11-argument entry point, kept for existing callers; equivalent to
 * the dilation-aware overload with dilation_h = dilation_w = 1.
 */
template <typename Dtype>
void im2col_cpu(const Dtype* data_im, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    Dtype* data_col) {
  im2col_cpu(data_im, channels, height, width, kernel_h, kernel_w,
      pad_h, pad_w, stride_h, stride_w, 1, 1, data_col);
}

/*
 * Generic core for inputs whose spatial dimension is not two. Only the
 * prototype appears in this excerpt; the body is not shown here, so it is
 * kept as a forward declaration.
 */
template <typename Dtype>
inline void im2col_nd_core_cpu(const Dtype* data_input, const bool im2col,
    const int num_spatial_axes, const int* im_shape, const int* col_shape,
    const int* kernel_shape, const int* pad, const int* stride,
    Dtype* data_output);

/**
 * @brief Folds a column matrix back into image layout (dilation-aware).
 *
 * Reverse process of im2col_cpu(): data_im is first zeroed with caffe_set(),
 * then every column entry that maps to a real pixel (not padding) is added to
 * that pixel. Pixels covered by several kernel windows therefore receive the
 * sum of all their column entries.
 *
 * Parameters have the same meaning as for im2col_cpu(), with data_col as the
 * input and data_im as the output.
 */
template <typename Dtype>
void col2im_cpu(const Dtype* data_col, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w, Dtype* data_im) {
  caffe_set(height * width * channels, Dtype(0), data_im);
  const int height_col =
      (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
  const int width_col =
      (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
  const int channels_col = channels * kernel_h * kernel_w;
  for (int c_col = 0; c_col < channels_col; ++c_col) {
    const int w_offset = c_col % kernel_w;
    const int h_offset = (c_col / kernel_w) % kernel_h;
    const int c_im = c_col / kernel_h / kernel_w;
    for (int h_col = 0; h_col < height_col; ++h_col) {
      const int h_im = h_col * stride_h - pad_h + h_offset * dilation_h;
      for (int w_col = 0; w_col < width_col; ++w_col) {
        const int w_im = w_col * stride_w - pad_w + w_offset * dilation_w;
        if (h_im >= 0 && h_im < height && w_im >= 0 && w_im < width)
          data_im[(c_im * height + h_im) * width + w_im] +=
              data_col[(c_col * height_col + h_col) * width_col + w_col];
      }
    }
  }
}

/**
 * @brief Folds a column matrix back into image layout with a dense kernel.
 *
 * Original 11-argument entry point, kept for existing callers; equivalent to
 * the dilation-aware overload with dilation_h = dilation_w = 1.
 */
template <typename Dtype>
void col2im_cpu(const Dtype* data_col, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    Dtype* data_im) {
  col2im_cpu(data_col, channels, height, width, kernel_h, kernel_w,
      pad_h, pad_w, stride_h, stride_w, 1, 1, data_im);
}