最近,ncnn 发布了新版本,其中一个亮点是增加了图优化,目的是让前向图结构更加简洁,从而加快运行速度。下面来逐一分析:
连续的两个算子要合并成一个算子,必须满足特定的条件。
(1)XXX-batchnorm
// Group 1: "XXX - BatchNorm" fusion passes, one per layer type that can
// precede the BatchNorm. Only fuse_convolution_batchnorm() is shown in full
// in this note; the others presumably apply the same folding to their own
// layer type (bodies not shown here - confirm against the ncnn sources).
int fuse_convolution_batchnorm(); // Convolution followed by BatchNorm
int fuse_convolutiondepthwise_batchnorm();
int fuse_deconvolution_batchnorm();
int fuse_deconvolutiondepthwise_batchnorm();
int fuse_innerproduct_batchnorm();
(2)XXX-activation
// Group 2: "XXX - activation" fusion passes, one per layer type that can
// precede the activation. Their bodies are not shown in this note; by name
// they presumably merge the activation into the preceding layer (confirm
// against the ncnn sources).
int fuse_convolution_activation(); // Convolution followed by an activation
int fuse_convolutiondepthwise_activation();
int fuse_deconvolution_activation();
int fuse_deconvolutiondepthwise_activation();
int fuse_innerproduct_activation();
(3)batchnorm-scale
(4)innerproduct-dropout
下面以 conv+batchnorm 为例:推理阶段的 BatchNorm 对每个输出通道只做线性变换 y = b * x + a,因此可以直接折叠进卷积的权重和偏置中。
int NetOptimize::fuse_convolution_batchnorm(){
const int layer_count = layers.size();
// 遍历所有层
for(int i=0; i<layer_count; i++){
// 找Convolution层
if(layers[i]->type != "Convolution")
continue;
// Convolution - BatchNorm
int top_blob_index = layers[i]->tops[0];
int j = i + 1;
for(;j<layer_count;j++){
// 在确定conv情况下, 寻找bn
if(layers[j]->type != "BatchNorm")
continue;
// bn的blob非唯一即不符合要求
if(layers[j]->bottoms.size() != 1)
continue;
// 寻找conv_bn可以连接成功的pair
if(layers[j]->bottoms[0] == top_blob_index)
break; // 寻找成功
}
// 边界条件, 越界则继续下一层迭代
if(j == layer_count)
continue;
// fuse "Convolution - BatchNorm" to "Convolution"
// 经过上述筛选, <i, j>表示一个<con_id, bn_id>对, 可以进行合并
ncnn::Convolution* convolution = (ncnn::Convolution*)layers[i];
ncnn::BatchNorm* batchnorm = (ncnn::BatchNorm*)layers[j];
fprintf(stderr, "fuse_convolution_batchnorm %s %s ", convolution->name.c_str(), batchnorm->name.c_str());
// =======> code segment begin
{
int channels = batchnorm->channels;
float eps = batchnorm->eps;
// a = bias - slope * mean / sqrt(var + eps)
// b = slope / sqrt(var + eps)
// value = value * b + a
std:: vector<float> a(channels);
std:: vector<float> b(channels);
// 这里吐槽一下ncnn,都什么鬼命名?!!! a,b完全没有任何可读性.....
for(int i=0; i< channels; i++){
float sqrt_var = sqrt(batchnorm->var_data[i] + eps);
a[i] = batchnorm->bias_data[i] - batchnorm->slope_data[i] * batchnorm->mean_data[i] / sqrt_var;
b[i] = batchnorm->slope_data[i] / sqrt_var;
}
if(convolution->bias_term ==0){
// init bias as zero
convolution->bias_term = 1;
convolution->bias_term = ncnn::Mat(channels);
convolution->bias_data.fill(0.f);
}
// 跨度
const int weight_per_outch = convolution->weight_data_size / channels;
float* weight = convolution->weight_data;
float* bias = convolution->bias_data;
for(int i=0; i<channels; i++){
float* conv_weight_outch = weight + weight_per_outch * i;
for(int j=0; j<weight_per_outch; j++){
conv_weight_outch[j] *= b[i]; // 二维展开逐一相乘
}
bias[i] += a[i];
}
}
// =======> code segment end
// 修改相关的layer 关系
int top_blob_index_final = batchnorm->tops[0]; // 记录batchnorm的输出blob
convolution->tops[0] = top_blob_index_final; // 将convolution的输出blob设置为原来batchnorm的输出blob
blobs[top_blob_index_final].product = i; // 将blob的生产者layer改变为conv而不再是原来的bn
batchnorm->type = "ncnnfused"; // 修改原始layer的层属性
}
}