zoukankan      html  css  js  c++  java
  • Caffe源码-LossLayer类(下)

    InfogainLossLayer类简介

    InfogainLossLayer与SoftmaxWithLossLayer类似,只不过增加了一个信息增益矩阵(H),用于指定某真实类别的数据被预测为某一类别时的权重,常用于类间样本数不均衡的情况。当矩阵(H)为单位矩阵时,等同于SoftmaxWithLossLayer。

    1. 第一个输入blob为网络的预测值,大小\(\tilde{N} \times C \times \tilde{H} \times \tilde{W}\),范围\(x_{n,k} \in [-\infty, +\infty]\)。计算loss时使用softmax函数值作为其概率,\(\hat{p}_{n,k} = \frac{e^{x_{n,k}}}{\sum\limits_{k'=1}^{K} e^{x_{n,k'}}}\)
    • 后续假设计算softmax时是沿着第1维(维度\(C\))进行的,则维度\(C\)的大小即为类别总数\(K\),数据的总个数为外部个数(对应代码中的outer_num_)乘上内部个数(对应代码中的inner_num_),即\(N = \tilde{N} \times \tilde{H} \times \tilde{W}\)
    2. 第二个输入blob为标签值,大小\(\tilde{N} \times 1 \times \tilde{H} \times \tilde{W}\),也即\(N \times 1 \times 1 \times 1\),取值为\(l_n \in \{0, 1, 2, ..., K - 1\}\)中的整数,第\(n\)个数据的真实类别为\(l_n\)
    • 与SoftmaxWithLossLayer类似,caffe代码中并没有严格限制标签blob的形状,只要求预测blob与标签blob的第0维相等(LossLayer中要求),并且标签blob的总个数等于\(N\)
    3. 前向计算时,loss的计算公式为: \(E = -\frac{1}{N} \sum\limits_{n=1}^{N} \sum\limits_{k=1}^{K} H_{l_n,k} \log(\hat{p}_{n,k})\)
    • \(H_{l_n,k}\)为信息增益矩阵\(H\)中的元素,表示真实类别为\(l_n\)、预测类别为\(k\)时的权重。矩阵\(H\)的大小为\(K \times K\)
    4. 反向计算时,预测blob的梯度的计算过程如下:
    • \(\frac{\partial \hat{p}_{n,k'}}{\partial x_{n,k}} = \left( \frac{e^{x_{n,k'}}}{e^{x_{n,1}} + e^{x_{n,2}} + ... + e^{x_{n,K}}} \right)'_{x_{n,k}} = \begin{cases} \hat{p}_{n,k'} - \hat{p}_{n,k'} * \hat{p}_{n,k}, & k = k' \\ -\hat{p}_{n,k'} * \hat{p}_{n,k}, & k \ne k' \end{cases}\)
    • \(E = -\frac{1}{N} \sum\limits_{n=1}^{N} \sum\limits_{k=1}^{K} H_{l_n,k} \log\left(\hat{p}_{n,k}\right) = -\frac{1}{N} \sum\limits_{n=1}^{N} \left( H_{l_n,1} \log \hat{p}_{n,1} + H_{l_n,2} \log \hat{p}_{n,2} + ... + H_{l_n,K} \log \hat{p}_{n,K} \right)\)
    • \(\frac{\partial E}{\partial \hat{p}_{n,k'}} = -\frac{1}{N} H_{l_n,k'} \frac{1}{\hat{p}_{n,k'}}\)
    • \(\frac{\partial E}{\partial x_{n,k}} = \sum\limits_{k'=1}^{K} \frac{\partial E}{\partial \hat{p}_{n,k'}} \frac{\partial \hat{p}_{n,k'}}{\partial x_{n,k}} = \frac{\partial E}{\partial \hat{p}_{n,1}} \frac{\partial \hat{p}_{n,1}}{\partial x_{n,k}} + \frac{\partial E}{\partial \hat{p}_{n,2}} \frac{\partial \hat{p}_{n,2}}{\partial x_{n,k}} + ... + \frac{\partial E}{\partial \hat{p}_{n,k}} \frac{\partial \hat{p}_{n,k}}{\partial x_{n,k}} + ... + \frac{\partial E}{\partial \hat{p}_{n,K}} \frac{\partial \hat{p}_{n,K}}{\partial x_{n,k}}\)
    • \(= -\frac{1}{N} \left( H_{l_n,1} \left( -\hat{p}_{n,k} \right) + H_{l_n,2} \left( -\hat{p}_{n,k} \right) + ... + H_{l_n,k} \left( 1 - \hat{p}_{n,k} \right) + ... + H_{l_n,K} \left( -\hat{p}_{n,k} \right) \right)\)
    • \(= \frac{1}{N} \left( \hat{p}_{n,k} \sum\limits_{k'=1}^{K} H_{l_n,k'} - H_{l_n,k} \right)\)
    • 最后可计算:\(\frac{\partial J}{\partial x_{n,k}} = \frac{\partial J}{\partial E} * \frac{\partial E}{\partial x_{n,k}}\)

    infogain_loss_layer.cpp源码

    /**
     * @brief One-time setup of the infogain loss layer.
     *
     * Creates and sets up the internal Softmax layer (bottom[0] -> prob_), reads
     * the ignore-label and normalization settings from loss_param, and, when
     * fewer than three bottom blobs are given, loads the infogain matrix H from
     * the binary proto file named by infogain_loss_param().source().
     *
     * @param bottom input blobs; bottom[0] is wired into the internal softmax.
     *               With fewer than 3 bottoms, H is read from file into infogain_.
     * @param top    output blobs, forwarded to LossLayer::LayerSetUp.
     */
    template <typename Dtype>
    void InfogainLossLayer<Dtype>::LayerSetUp(
        const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
      LossLayer<Dtype>::LayerSetUp(bottom, top);  // base-class setup
      // Internal softmax layer: start from a copy of this layer's parameters,
      // then override the type, the softmax axis and the loss weight.
      LayerParameter softmax_layer_param(this->layer_param_);
      SoftmaxParameter* softmax_param = softmax_layer_param.mutable_softmax_param();
      softmax_param->set_axis(this->layer_param_.infogain_loss_param().axis());
      softmax_layer_param.set_type("Softmax");
      // Drop any loss weights copied from this layer and set a single weight of 1.
      softmax_layer_param.clear_loss_weight();
      softmax_layer_param.add_loss_weight(1);
      softmax_layer_ = LayerRegistry<Dtype>::CreateLayer(softmax_layer_param);
      // The softmax reads bottom[0] and writes its probabilities into prob_.
      softmax_bottom_vec_.clear();
      softmax_bottom_vec_.push_back(bottom[0]);
      softmax_top_vec_.clear();
      softmax_top_vec_.push_back(&prob_);
      softmax_layer_->SetUp(softmax_bottom_vec_, softmax_top_vec_);

      // Ignore label: remember whether one was configured and, if so, its value.
      has_ignore_label_ = this->layer_param_.loss_param().has_ignore_label();
      if (has_ignore_label_) {
        ignore_label_ = this->layer_param_.loss_param().ignore_label();
      }
      // Normalization: the old "normalize" field is rejected outright.
      // The inner quotes must be escaped; unescaped they terminate the string
      // literal and the statement does not compile.
      CHECK(!this->layer_param_.loss_param().has_normalize())
        << "normalize is deprecated. use \"normalization\"";
      normalization_ = this->layer_param_.loss_param().normalization();
      // Matrix H: with fewer than 3 bottoms it is not an input blob, so a source
      // file must be given in the layer parameters.
      if (bottom.size() < 3) {
        CHECK(this->layer_param_.infogain_loss_param().has_source())
            << "Infogain matrix source must be specified.";
        BlobProto blob_proto;
        // Read the serialized blob from the binary proto file...
        ReadProtoFromBinaryFile(this->layer_param_.infogain_loss_param().source(), &blob_proto);
        // ...and convert it into the infogain_ blob.
        infogain_.FromProto(blob_proto);
      }
    }
    
    /**
     * @brief Recomputes all shape-dependent state of the infogain loss layer.
     *
     * Derives outer_num_, inner_num_ and num_labels_ from bottom[0] and the
     * configured infogain axis, checks that the label count and the size of H
     * agree with them, sizes sum_rows_H_, and shapes the optional second top
     * like bottom[0].
     */
    template <typename Dtype>
    void InfogainLossLayer<Dtype>::Reshape(
        const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
      LossLayer<Dtype>::Reshape(bottom, top);
      softmax_layer_->Reshape(softmax_bottom_vec_, softmax_top_vec_);

      // Resolve the configured axis to a canonical index of bottom[0].
      infogain_axis_ = bottom[0]->CanonicalAxisIndex(
          this->layer_param_.infogain_loss_param().axis());
      // outer_num_: product of the dims in [0, infogain_axis_).
      outer_num_ = bottom[0]->count(0, infogain_axis_);
      // inner_num_: product of the dims in [infogain_axis_ + 1, end).
      inner_num_ = bottom[0]->count(infogain_axis_ + 1);
      // One label is required per (outer, inner) position.
      CHECK_EQ(outer_num_ * inner_num_, bottom[1]->count())
          << "Number of labels must match number of predictions; "
          << "e.g., if infogain axis == 1 and prediction shape is (N, C, H, W), "
          << "label count (number of labels) must be N*H*W, "
          << "with integer values in {0, 1, ..., C-1}.";
      // Number of classes K = extent of bottom[0] along the infogain axis.
      num_labels_ = bottom[0]->shape(infogain_axis_);

      // H is the member blob when there are fewer than 3 bottoms, else bottom[2].
      Blob<Dtype>* infogain = (bottom.size() < 3) ? &infogain_ : bottom[2];
      // H must hold exactly K*K entries.
      CHECK_EQ(infogain->count(), num_labels_*num_labels_);

      // One row-sum slot per class.
      const vector<int> row_sum_shape(1, num_labels_);
      sum_rows_H_.Reshape(row_sum_shape);
      if (bottom.size() == 2) {
        // H is provided as a parameter and will not change: sum its rows once.
        sum_rows_of_H(infogain);
      }

      if (top.size() >= 2) {
        // Optional second top is shaped like the predictions (softmax output).
        top[1]->ReshapeLike(*bottom[0]);
      }
    }
    
    /**
     * @brief Returns the divisor used to normalize the loss and its gradient.
     *
     * @param normalization_mode which normalization scheme to apply.
     * @param valid_count        number of non-ignored positions, or -1 to make
     *                           VALID fall back to outer_num_ * inner_num_.
     * @return the selected normalizer, clamped from below to 1.
     */
    template <typename Dtype>
    Dtype InfogainLossLayer<Dtype>::get_normalizer(
        LossParameter_NormalizationMode normalization_mode, int valid_count) {
      Dtype denom;
      if (normalization_mode == LossParameter_NormalizationMode_FULL) {
        // FULL: divide by the total number of positions.
        denom = Dtype(outer_num_ * inner_num_);
      } else if (normalization_mode == LossParameter_NormalizationMode_VALID) {
        // VALID: divide by the valid count; -1 means "same as FULL".
        denom = (valid_count == -1) ? Dtype(outer_num_ * inner_num_)
                                    : Dtype(valid_count);
      } else if (normalization_mode == LossParameter_NormalizationMode_BATCH_SIZE) {
        // BATCH_SIZE: divide by the outer count only.
        denom = Dtype(outer_num_);
      } else if (normalization_mode == LossParameter_NormalizationMode_NONE) {
        // NONE: no normalization.
        denom = Dtype(1);
      } else {
        LOG(FATAL) << "Unknown normalization mode: "
            << LossParameter_NormalizationMode_Name(normalization_mode);
      }
      // Some users will have no labels for some examples in order to 'turn off' a
      // particular loss in a multi-task setup. The max prevents NaNs in that case.
      return std::max(Dtype(1.0), denom);
    }
    
    /**
     * @brief Stores the sum of each row of H into sum_rows_H_.
     *
     * @param H blob read as a row-major num_labels_ x num_labels_ matrix; its
     *          element count is checked against num_labels_ squared.
     */
    template <typename Dtype>
    void InfogainLossLayer<Dtype>::sum_rows_of_H(const Blob<Dtype>* H) {
      CHECK_EQ(H->count(), num_labels_*num_labels_)
        << "H must be " << num_labels_ << "x" << num_labels_;
      const Dtype* matrix = H->cpu_data();
      Dtype* row_totals = sum_rows_H_.mutable_cpu_data();
      for (int r = 0; r < num_labels_; ++r) {
        const Dtype* row_begin = matrix + r * num_labels_;
        // Accumulate left to right, starting from zero.
        Dtype acc = 0;
        for (int c = 0; c < num_labels_; ++c) {
          acc += row_begin[c];
        }
        row_totals[r] = acc;
      }
    }
    
    /**
     * @brief CPU forward pass: infogain-weighted negative log-likelihood.
     *
     * Runs the internal softmax, then for every non-ignored position adds
     * -H[label][l] * log(max(prob[l], kLOG_THRESHOLD)) over all classes l. The
     * total is divided by get_normalizer() and written to top[0]. When exactly
     * two tops exist, top[1] shares the softmax probabilities.
     */
    template <typename Dtype>
    void InfogainLossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
        const vector<Blob<Dtype>*>& top) {
      // The forward pass computes the softmax prob values.
      softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_);
      const Dtype* probs = prob_.cpu_data();
      const Dtype* labels = bottom[1]->cpu_data();
      // H comes from the member blob with fewer than 3 bottoms, else bottom[2].
      const Dtype* infogain_mat =
          (bottom.size() < 3) ? infogain_.cpu_data() : bottom[2]->cpu_data();

      Dtype loss = 0;
      int count = 0;  // number of positions that were not ignored
      for (int i = 0; i < outer_num_; ++i) {
        for (int j = 0; j < inner_num_; j++) {
          // Labels are indexed by (outer, inner) position.
          const int label_value = static_cast<int>(labels[i * inner_num_ + j]);
          const bool ignored = has_ignore_label_ && label_value == ignore_label_;
          if (!ignored) {
            // Debug-only check: the label must lie in [0, num_labels_).
            DCHECK_GE(label_value, 0);
            DCHECK_LT(label_value, num_labels_);
            // Row of H selected by the true label.
            const Dtype* gain_row = infogain_mat + label_value * num_labels_;
            for (int l = 0; l < num_labels_; l++) {
              // Probability of class l at position (i, j).
              const Dtype p = probs[i * inner_num_*num_labels_ + l * inner_num_ + j];
              // Clamp before the log so a zero probability cannot produce -inf.
              loss -= gain_row[l] * log(std::max(p, Dtype(kLOG_THRESHOLD)));
            }
            ++count;
          }
        }
      }
      top[0]->mutable_cpu_data()[0] = loss / get_normalizer(normalization_, count);
      if (top.size() == 2) {
        // Two output blobs: expose the softmax probabilities as the second top.
        top[1]->ShareData(prob_);
      }
    }
    
    /**
     * @brief CPU backward pass: gradient of the infogain loss w.r.t. bottom[0].
     *
     * For every non-ignored position with true label y, the unscaled gradient for
     * class l is prob[l] * sum_rows_H[y] - H[y][l]; ignored positions get a zero
     * gradient. The whole diff is then scaled by top[0]'s diff divided by the
     * normalizer.
     *
     * The label range DCHECKs apply only to non-ignored labels, as in
     * Forward_cpu. An ignore label outside [0, num_labels_) must not abort a
     * debug build.
     *
     * @param top            top[0]->cpu_diff()[0] supplies the loss weight.
     * @param propagate_down index 1 (labels) and index 2 (infogain matrix, if
     *                       present) must be false; otherwise LOG(FATAL).
     * @param bottom         bottom[0] predictions (diff written here), bottom[1]
     *                       labels, optional bottom[2] infogain matrix H.
     */
    template <typename Dtype>
    void InfogainLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
        const vector<bool>& propagate_down,
        const vector<Blob<Dtype>*>& bottom) {
      if (propagate_down[1]) {    // no gradient is defined for the labels
        LOG(FATAL) << this->type() << " Layer cannot backpropagate to label inputs.";
      }
      if (propagate_down.size() > 2 && propagate_down[2]) { // nor for the matrix H
        LOG(FATAL) << this->type() << " Layer cannot backpropagate to infogain inputs.";
      }
      if (propagate_down[0]) {
        const Dtype* prob_data = prob_.cpu_data();          // softmax output
        const Dtype* bottom_label = bottom[1]->cpu_data();  // labels
        const Dtype* infogain_mat = NULL;
        if (bottom.size() < 3) {
          // H comes from the member blob; Reshape() summed its rows already.
          infogain_mat = infogain_.cpu_data();
        } else {
          infogain_mat = bottom[2]->cpu_data();
          // H is provided as a "bottom" and might change. sum rows every time.
          sum_rows_of_H(bottom[2]);
        }
        const Dtype* sum_rows_H = sum_rows_H_.cpu_data();   // per-row sums of H
        Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); // gradient to fill
        const int dim = bottom[0]->count() / outer_num_;    // elements per outer index
        int count = 0;                                      // non-ignored positions
        for (int i = 0; i < outer_num_; ++i) {
          for (int j = 0; j < inner_num_; ++j) {
            const int label_value = static_cast<int>(bottom_label[i * inner_num_ + j]);
            if (has_ignore_label_ && label_value == ignore_label_) {
              // Ignored position: zero gradient for every class.
              for (int l = 0; l < num_labels_; ++l) {
                bottom_diff[i * dim + l * inner_num_ + j] = 0;
              }
            } else {
              // Only labels that are actually used must lie in [0, num_labels_).
              DCHECK_GE(label_value, 0);
              DCHECK_LT(label_value, num_labels_);
              for (int l = 0; l < num_labels_; ++l) {
                // prob(l) * (sum of H's row for the true label) - H[true label][l]
                bottom_diff[i * dim + l * inner_num_ + j] =
                   prob_data[i*dim + l*inner_num_ + j]*sum_rows_H[label_value]
                   - infogain_mat[label_value * num_labels_ + l];
              }
              ++count;
            }
          }
        }
        // Scale gradient
        Dtype loss_weight = top[0]->cpu_diff()[0] / get_normalizer(normalization_, count);
        caffe_scal(bottom[0]->count(), loss_weight, bottom_diff);  // bottom_diff *= loss_weight
      }
    }
    

    EuclideanLossLayer类简介

    EuclideanLossLayer类用于计算预测值与真实值的欧式距离损失,用于回归任务中。

    1. 第一个输入blob为网络的预测值,大小\(N \times C \times H \times W\),范围\(\hat{y}_n \in [-\infty, +\infty]\)
    2. 第二个输入blob为标签值,大小\(N \times C \times H \times W\),范围\(y_n \in [-\infty, +\infty]\)
    • 注意实际预测值与标签值的位置可互换,并且反向传播时允许计算两个输入blob的梯度。
    3. 前向计算时,loss的计算公式为:\(E = \frac{1}{2N} \sum\limits_{n=1}^{N} \left\| \hat{y}_n - y_n \right\|_2^2\)
    4. 反向计算时,第一个输入blob的梯度为:\(\frac{\partial J}{\partial \hat{y}_n} = \frac{\partial J}{\partial E} * \frac{\partial E}{\partial \hat{y}_n} = \frac{\partial J}{\partial E} * \frac{1}{2N} * 2(\hat{y}_n - y_n) = \frac{\partial J}{\partial E} * \frac{\hat{y}_n - y_n}{N}\)
    5. 反向计算时,第二个输入blob的梯度为:\(\frac{\partial J}{\partial y_n} = \frac{\partial J}{\partial E} * \frac{\partial E}{\partial y_n} = -\frac{\partial J}{\partial E} * \frac{\hat{y}_n - y_n}{N}\)

    euclidean_loss_layer.cpp源码

    /**
     * @brief Checks that both inputs match in size and sizes the diff_ buffer.
     *
     * The two bottoms must have the same element count from axis 1 onward;
     * diff_ takes the shape of bottom[0].
     */
    template <typename Dtype>
    void EuclideanLossLayer<Dtype>::Reshape(
      const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
      // Base-class reshape first.
      LossLayer<Dtype>::Reshape(bottom, top);
      // Compare the product of all dims after axis 0.
      CHECK_EQ(bottom[0]->count(1), bottom[1]->count(1))
          << "Inputs must have the same dimension.";
      // diff_ later holds bottom[0] - bottom[1], so it mirrors bottom[0]'s shape.
      const Blob<Dtype>& first_input = *bottom[0];
      diff_.ReshapeLike(first_input);
    }
    
    /**
     * @brief CPU forward pass: loss = sum((a - b)^2) / num / 2.
     *
     * Stores a - b (bottom[0] minus bottom[1]) in diff_ for the backward pass
     * and writes the scalar loss to top[0].
     */
    template <typename Dtype>
    void EuclideanLossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
        const vector<Blob<Dtype>*>& top) {
      const int n_elems = bottom[0]->count();  // total number of elements
      // diff_ = bottom[0] - bottom[1], element-wise.
      caffe_sub(n_elems, bottom[0]->cpu_data(), bottom[1]->cpu_data(),
                diff_.mutable_cpu_data());
      // Squared L2 norm of the difference, as a dot product with itself.
      const Dtype sq_norm = caffe_cpu_dot(n_elems, diff_.cpu_data(), diff_.cpu_data());
      // Divide by num() of bottom[0], then by 2.
      top[0]->mutable_cpu_data()[0] = sq_norm / bottom[0]->num() / Dtype(2);
    }
    
    /**
     * @brief CPU backward pass; both inputs may receive a gradient.
     *
     * The layer treats its two bottoms symmetrically. For each bottom i with
     * propagate_down[i] set, the diff becomes
     *   sign * top_diff / num * diff_,
     * where sign is +1 for bottom[0] and -1 for bottom[1], and diff_ holds
     * bottom[0] - bottom[1] from the forward pass.
     */
    template <typename Dtype>
    void EuclideanLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
      for (int i = 0; i < 2; ++i) {
        if (!propagate_down[i]) {
          continue;  // this input does not need a gradient
        }
        // d(a-b)/da = +1, d(a-b)/db = -1.
        const Dtype sign = (i == 0) ? 1 : -1;
        const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num();
        // bottom[i].diff = alpha * diff_ + 0 * bottom[i].diff
        caffe_cpu_axpby(bottom[i]->count(), alpha, diff_.cpu_data(), Dtype(0),
                        bottom[i]->mutable_cpu_diff());
      }
    }
    

    HingeLossLayer类简介

    HingeLossLayer类用于计算合页损失,用于一对多的分类任务中。hinge loss用于SVM中,也正是hinge loss的特性使得SVM中的超平面仅依赖少数样本。

    1. 第一个输入blob为网络的预测值,大小\(N \times C \times H \times W\),范围\(t_{n,k} \in [-\infty, +\infty]\)。其中数据总个数为\(N\),数据的类别总数为\(K = C \times H \times W\)
    2. 第二个输入blob为标签值,大小\(N \times 1 \times 1 \times 1\),取值为\(l_n \in \{0, 1, 2, ..., K - 1\}\)中的整数,第\(n\)个数据的真实类别为\(l_n\)
    3. 前向计算时,loss的计算公式为:\(E = \frac{1}{N} \sum\limits_{n=1}^{N} \sum\limits_{k=1}^{K} [\max(0, 1 - \delta * t_{n,k})]^p\)
    • 其中,\(\delta = \begin{cases} 1, & k = l_n \\ -1, & k \ne l_n \end{cases}\),\(p\)为范数的阶数,\(p=1\)表示使用L1范数,\(p=2\)表示使用L2范数(对应代码中的hinge_loss_param().norm())
    • 从loss中可以看出,对于真实类别(\(\delta=1\),\(k=l_n\)),当\(t_{n,k} \geqslant 1\)(样本与超平面较远)时,\(1 - \delta * t_{n,k} \leqslant 0\),该项对loss无贡献;对于非真实类别(\(\delta=-1\),\(k \ne l_n\)),当\(t_{n,k} \leqslant -1\)时该项同样为0。只有那些超平面附近或者预测错误的数据(即\(1 - \delta * t_{n,k} > 0\)),才会计入loss中。
    4. 反向计算时,第一个输入blob的梯度计算公式如下。
    • \(\frac{\partial J}{\partial t_{n,k}} = \frac{\partial J}{\partial E} * \frac{\partial E}{\partial t_{n,k}}\)
    • \(p=1\)时,\(E = \frac{1}{N} \sum\limits_{n=1}^{N} \sum\limits_{k=1}^{K} |\max(0, 1 - \delta * t_{n,k})|\)
    • \(\frac{\partial E}{\partial t_{n,k}} = \begin{cases} \frac{1}{N} * (-\delta), & 1 - \delta * t_{n,k} > 0 \\ 0, & 1 - \delta * t_{n,k} \leqslant 0 \end{cases} = \begin{cases} -\frac{1}{N}, & 1 - \delta * t_{n,k} > 0,\ k = l_n \\ \frac{1}{N}, & 1 - \delta * t_{n,k} > 0,\ k \ne l_n \\ 0, & 1 - \delta * t_{n,k} \leqslant 0 \end{cases}\)
    • \(p=2\)时,\(E = \frac{1}{N} \sum\limits_{n=1}^{N} \sum\limits_{k=1}^{K} |\max(0, 1 - \delta * t_{n,k})|^2\)
    • \(\frac{\partial E}{\partial t_{n,k}} = \frac{2}{N} * \max(0, 1 - \delta * t_{n,k}) * (-\delta) = \begin{cases} -\frac{2}{N} * \max(0, 1 - \delta * t_{n,k}), & k = l_n \\ \frac{2}{N} * \max(0, 1 - \delta * t_{n,k}), & k \ne l_n \end{cases}\)

    hinge_loss_layer.cpp源码

    /**
     * @brief CPU forward pass of the hinge loss.
     *
     * Uses bottom[0]'s diff buffer as scratch space. It is filled with
     * max(0, 1 - delta * t), where delta is +1 at the true-label entry of each
     * row and -1 elsewhere. The loss is the L1 sum or the squared L2 sum of that
     * buffer divided by num. Backward_cpu reuses the buffer contents.
     */
    template <typename Dtype>
    void HingeLossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
        const vector<Blob<Dtype>*>& top) {
      const Dtype* scores = bottom[0]->cpu_data();        // predictions t
      Dtype* margins = bottom[0]->mutable_cpu_diff();     // scratch / result buffer
      const Dtype* label = bottom[1]->cpu_data();         // one label per row
      const int num = bottom[0]->num();                   // number of rows
      const int count = bottom[0]->count();               // total element count
      const int dim = count / num;                        // entries per row

      caffe_copy(count, scores, margins);                 // margins = t
      for (int n = 0; n < num; ++n) {
        Dtype* row = margins + n * dim;
        // Negate the true-label entry so that every entry now holds -delta * t.
        row[static_cast<int>(label[n])] *= -1;
        for (int k = 0; k < dim; ++k) {
          // max(0, 1 - delta * t)
          row[k] = std::max(Dtype(0), 1 + row[k]);
        }
      }

      Dtype* loss = top[0]->mutable_cpu_data();
      switch (this->layer_param_.hinge_loss_param().norm()) {
      case HingeLossParameter_Norm_L1:
        // L1: sum of absolute values, divided by num.
        loss[0] = caffe_cpu_asum(count, margins) / num;
        break;
      case HingeLossParameter_Norm_L2:
        // L2: sum of squares, divided by num.
        loss[0] = caffe_cpu_dot(count, margins, margins) / num;
        break;
      default:
        LOG(FATAL) << "Unknown Norm";
      }
    }
    
    /**
     * @brief CPU backward pass of the hinge loss.
     *
     * Works in place on bottom[0]'s diff buffer, which Forward_cpu left holding
     * max(0, 1 - delta * t). The true-label entry of each row is negated, then:
     *   - L1: the buffer is replaced by its sign and scaled by loss_weight / num;
     *   - L2: the buffer is scaled by loss_weight * 2 / num.
     * Requesting a gradient for the labels is a fatal error.
     */
    template <typename Dtype>
    void HingeLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
      if (propagate_down[1]) {
        LOG(FATAL) << this->type() << " Layer cannot backpropagate to label inputs.";
      }
      if (!propagate_down[0]) {
        return;  // nothing to compute for the predictions
      }

      Dtype* grad = bottom[0]->mutable_cpu_diff();
      const Dtype* label = bottom[1]->cpu_data();
      const int num = bottom[0]->num();       // number of rows
      const int count = bottom[0]->count();   // total element count
      const int dim = count / num;            // entries per row

      // Flip the sign at each row's true label:
      //   grad =  max(0, 1 - delta * t)  where k != label
      //   grad = -max(0, 1 - delta * t)  where k == label
      for (int n = 0; n < num; ++n) {
        grad[n * dim + static_cast<int>(label[n])] *= -1;
      }

      const Dtype loss_weight = top[0]->cpu_diff()[0];
      switch (this->layer_param_.hinge_loss_param().norm()) {
      case HingeLossParameter_Norm_L1:
        // The sign is +1 / -1 where the hinge is active (non-label / label entry)
        // and 0 where it is inactive; then scale by loss_weight / num.
        caffe_cpu_sign(count, grad, grad);
        caffe_scal(count, loss_weight / num, grad);
        break;
      case HingeLossParameter_Norm_L2:
        // Derivative of the square: scale by loss_weight * 2 / num.
        caffe_scal(count, loss_weight * 2 / num, grad);
        break;
      default:
        LOG(FATAL) << "Unknown Norm";
      }
    }
    

    ContrastiveLossLayer类简介

    ContrastiveLossLayer类用于计算对比损失,该损失函数的思路是同类样本的欧氏距离应尽可能小,非同类样本之间的欧氏距离应该不小于指定阈值,常用于孪生神经网络(siamese network)的训练。

    1. 第一个输入blob为特征向量\(a\),大小\(N \times C \times 1 \times 1\),范围\(a_{n,k} \in [-\infty, +\infty]\)。其中数据总个数为\(N\),特征向量的长度为\(C\)
    2. 第二个输入blob为特征向量\(b\),大小\(N \times C \times 1 \times 1\),形状与第一个输入blob完全相同。数据范围\(b_{n,k} \in [-\infty, +\infty]\)
    3. 第三个输入blob为二元相似度\(y\),大小\(N \times 1 \times 1 \times 1\),取值为\(y_n=1\)(\(a_n\)与\(b_n\)为同类样本)或\(y_n=0\)(\(a_n\)与\(b_n\)为非同类样本)
    4. 前向计算时,loss的计算公式为:\(E = \frac{1}{2N} \sum\limits_{n=1}^{N} [y_n * d_n^2 + (1-y_n) * \max(margin - d_n, 0)^2]\)(代码中legacy_version=false)或\(E = \frac{1}{2N} \sum\limits_{n=1}^{N} [y_n * d_n^2 + (1-y_n) * \max(margin - d_n^2, 0)]\)(代码中legacy_version=true)
    • 其中,\(margin\)为一个常数,表示非同类样本的最小欧式距离阈值,小于该值则会计入loss中
    • \(d_n\)为两个特征向量的欧氏距离,\(d_n^2 = \left\| a_n - b_n \right\|_2^2 = \sum\limits_{k=1}^{C} (a_{n,k} - b_{n,k})^2\)
    5. 反向计算时,输入blob的梯度计算公式如下。
    • \(\frac{\partial J}{\partial a_{n,k}} = \frac{\partial J}{\partial E} * \frac{\partial E}{\partial a_{n,k}}\),\(\frac{\partial J}{\partial b_{n,k}} = \frac{\partial J}{\partial E} * \frac{\partial E}{\partial b_{n,k}}\)
    • \(a_n\)与\(b_n\)为同类样本时,则\(y_n=1\),此时
      \(\frac{\partial E}{\partial a_{n,k}} = \frac{\partial E}{\partial d_n^2} * \frac{\partial d_n^2}{\partial a_{n,k}} = \frac{1}{2N} * 2(a_{n,k} - b_{n,k}) = \frac{a_{n,k} - b_{n,k}}{N}\)
      \(\frac{\partial E}{\partial b_{n,k}} = \frac{\partial E}{\partial d_n^2} * \frac{\partial d_n^2}{\partial b_{n,k}} = -\frac{a_{n,k} - b_{n,k}}{N}\)
    • \(a_n\)与\(b_n\)为非同类样本时,则\(y_n=0\),此时若legacy_version=false,则\(E = \frac{1}{2N} \sum\limits_{n=1}^{N} [y_n * d_n^2 + (1-y_n) * \max(margin - d_n, 0)^2]\)
      \(\frac{\partial E}{\partial a_{n,k}} = \frac{\partial E}{\partial d_n} \frac{\partial d_n}{\partial a_{n,k}} = \begin{cases} \frac{1}{2N} * 2(margin - d_n) * (-1) * \frac{\partial d_n}{\partial a_{n,k}}, & margin - d_n > 0 \\ 0, & margin - d_n \leqslant 0 \end{cases}\)
      \(= \begin{cases} -\frac{margin - d_n}{N} * \frac{a_{n,k} - b_{n,k}}{d_n}, & margin - d_n > 0 \\ 0, & margin - d_n \leqslant 0 \end{cases}\)
      同理有\(\frac{\partial E}{\partial b_{n,k}} = \begin{cases} \frac{margin - d_n}{N} * \frac{a_{n,k} - b_{n,k}}{d_n}, & margin - d_n > 0 \\ 0, & margin - d_n \leqslant 0 \end{cases}\)
    • \(a_n\)与\(b_n\)为非同类样本时,则\(y_n=0\),此时若legacy_version=true,则\(E = \frac{1}{2N} \sum\limits_{n=1}^{N} [y_n * d_n^2 + (1-y_n) * \max(margin - d_n^2, 0)]\)
      \(\frac{\partial E}{\partial a_{n,k}} = \frac{\partial E}{\partial d_n^2} * \frac{\partial d_n^2}{\partial a_{n,k}} = \begin{cases} \frac{1}{2N} * (-1) * \frac{\partial d_n^2}{\partial a_{n,k}}, & margin - d_n^2 > 0 \\ 0, & margin - d_n^2 \leqslant 0 \end{cases}\)
      \(= \begin{cases} -\frac{a_{n,k} - b_{n,k}}{N}, & margin - d_n^2 > 0 \\ 0, & margin - d_n^2 \leqslant 0 \end{cases}\)
      同理有\(\frac{\partial E}{\partial b_{n,k}} = \begin{cases} \frac{a_{n,k} - b_{n,k}}{N}, & margin - d_n^2 > 0 \\ 0, & margin - d_n^2 \leqslant 0 \end{cases}\)

    contrastive_loss_layer.cpp源码

    /**
     * @brief One-time setup of the contrastive loss layer.
     *
     * Validates the input shapes: bottom[0] and bottom[1] must have equal
     * channel counts and height == width == 1, and bottom[2] must have
     * channels == height == width == 1. Then sizes the work buffers and fills
     * summer_vec_ with ones.
     */
    template <typename Dtype>
    void ContrastiveLossLayer<Dtype>::LayerSetUp(
      const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
      LossLayer<Dtype>::LayerSetUp(bottom, top);

      // The two feature blobs must agree in channels and be N x C x 1 x 1.
      CHECK_EQ(bottom[0]->channels(), bottom[1]->channels());
      CHECK_EQ(bottom[0]->height(), 1);
      CHECK_EQ(bottom[0]->width(), 1);
      CHECK_EQ(bottom[1]->height(), 1);
      CHECK_EQ(bottom[1]->width(), 1);
      // The similarity blob must be N x 1 x 1 x 1.
      CHECK_EQ(bottom[2]->channels(), 1);
      CHECK_EQ(bottom[2]->height(), 1);
      CHECK_EQ(bottom[2]->width(), 1);

      const int num = bottom[0]->num();
      const int channels = bottom[0]->channels();
      diff_.Reshape(num, channels, 1, 1);      // N x C: element-wise a - b
      diff_sq_.Reshape(num, channels, 1, 1);   // N x C: not used by the CPU code in this file
      dist_sq_.Reshape(num, 1, 1, 1);          // N: squared distance per pair
      // vector of ones used to sum along channels
      summer_vec_.Reshape(channels, 1, 1, 1);
      for (int c = 0; c < channels; ++c) {
        summer_vec_.mutable_cpu_data()[c] = Dtype(1);
      }
    }
    
    /**
     * @brief CPU forward pass of the contrastive loss.
     *
     * For each pair n, d^2 = ||a_n - b_n||^2 is stored in dist_sq_. A similar
     * pair (non-zero bottom[2] entry after truncation to int) contributes d^2.
     * A dissimilar pair contributes max(margin - d^2, 0) in legacy mode, or
     * max(margin - d, 0)^2 otherwise. The sum is divided by num and by 2.
     */
    template <typename Dtype>
    void ContrastiveLossLayer<Dtype>::Forward_cpu(
        const vector<Blob<Dtype>*>& bottom,
        const vector<Blob<Dtype>*>& top) {
      const int count = bottom[0]->count();
      // diff_ = bottom[0] - bottom[1], element-wise.
      caffe_sub(count, bottom[0]->cpu_data(), bottom[1]->cpu_data(),
                diff_.mutable_cpu_data());
      const int channels = bottom[0]->channels();  // feature length per sample
      // Distance threshold applied to dissimilar pairs.
      Dtype margin = this->layer_param_.contrastive_loss_param().margin();
      // false: (margin - d)^2 form; true: (margin - d^2) form.
      bool legacy_version = this->layer_param_.contrastive_loss_param().legacy_version();

      Dtype loss(0.0);
      for (int n = 0; n < bottom[0]->num(); ++n) {
        // Squared distance of pair n: dot product of its diff row with itself.
        const Dtype* diff_row = diff_.cpu_data() + (n*channels);
        const Dtype sq_dist = caffe_cpu_dot(channels, diff_row, diff_row);
        dist_sq_.mutable_cpu_data()[n] = sq_dist;  // kept for the backward pass

        const bool similar = static_cast<int>(bottom[2]->cpu_data()[n]);
        if (similar) {
          loss += sq_dist;                                   // d^2
        } else if (legacy_version) {
          loss += std::max(margin - sq_dist, Dtype(0.0));    // max(0, margin - d^2)
        } else {
          Dtype gap = std::max<Dtype>(margin - sqrt(sq_dist), Dtype(0.0));
          loss += gap*gap;                                   // max(0, margin - d)^2
        }
      }
      // Divide by the number of pairs and by 2.
      loss = loss / static_cast<Dtype>(bottom[0]->num()) / Dtype(2);
      top[0]->mutable_cpu_data()[0] = loss;
    }
    
    /**
     * @brief CPU backward pass of the contrastive loss for bottom[0]/bottom[1].
     *
     * With alpha = sign * top_diff / num (sign +1 for bottom[0], -1 for
     * bottom[1]) and diff = a - b from the forward pass, row j of the gradient is:
     *   - similar pair:                alpha * diff
     *   - dissimilar, legacy:          -alpha * diff if margin - d^2 > 0, else 0
     *   - dissimilar, non-legacy:      -alpha * (margin - d) / (d + 1e-4) * diff
     *                                  if margin - d > 0, else 0
     */
    template <typename Dtype>
    void ContrastiveLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
      Dtype margin = this->layer_param_.contrastive_loss_param().margin();
      bool legacy_version = this->layer_param_.contrastive_loss_param().legacy_version();
      for (int i = 0; i < 2; ++i) {
        if (!propagate_down[i]) {
          continue;  // no gradient requested for this input
        }
        const Dtype sign = (i == 0) ? 1 : -1;
        const Dtype alpha = sign * top[0]->cpu_diff()[0] /
            static_cast<Dtype>(bottom[i]->num());
        int num = bottom[i]->num();            // number of pairs
        int channels = bottom[i]->channels();  // feature length
        for (int j = 0; j < num; ++j) {
          Dtype* bout = bottom[i]->mutable_cpu_diff();
          const bool similar = static_cast<int>(bottom[2]->cpu_data()[j]);

          // Pick the scale for this row and whether the row is active at all.
          Dtype coeff = alpha;  // similar pairs: alpha * diff, always active
          bool active = true;
          if (!similar) {
            Dtype mdist(0.0);
            Dtype beta(0.0);
            if (legacy_version) {
              mdist = margin - dist_sq_.cpu_data()[j];      // margin - d^2
              beta = -alpha;
            } else {
              Dtype dist = sqrt(dist_sq_.cpu_data()[j]);    // d
              mdist = margin - dist;                        // margin - d
              // The 1e-4 keeps the division finite when d is zero.
              beta = -alpha * mdist / (dist + Dtype(1e-4));
            }
            coeff = beta;
            active = mdist > Dtype(0.0);  // the hinge passes gradient only when positive
          }

          if (active) {
            // bout[row j] = coeff * diff[row j]
            caffe_cpu_axpby(channels, coeff, diff_.cpu_data() + (j*channels),
                Dtype(0.0), bout + (j*channels));
          } else {
            // Hinge inactive: the gradient of this row is zero.
            caffe_set(channels, Dtype(0), bout + (j*channels));
          }
        }
      }
    }
    

    Caffe的源码笔者是第一次阅读,一边阅读一边记录,对代码的理解和分析可能会存在错误或遗漏,希望各位读者批评指正,谢谢支持!

  • 相关阅读:
    招聘ASP.NET(C#)开发人员(已经截止,谢谢大家支持)
    VisualStudioCode开发Vue
    全局异常处理机制(Filter拦截器对比)
    工程师
    kubernetes(k8s)里面部署服务器集群并访问项目
    webpack 就是前端模块化打包工具
    Visual Studio Code配置C/C++开发环境
    docker和k8s(Kubernetes)是配套使用
    kettle 多表全删全插同步数据
    wireshark 抓HTTPS 的包 HTTPS = TLS + HTTP TLSv1.2 协议
  • 原文地址:https://www.cnblogs.com/Relu110/p/12154009.html
Copyright © 2011-2022 走看看