  • 源码分析 Large-Margin Softmax Loss for Convolutional Neural Networks


    • caffe.proto


    • largemargin_inner_product_layer.hpp
    #include <vector>
    #include "caffe/blob.hpp"
    #include "caffe/layer.hpp"
    #include "caffe/proto/caffe.pb.h"
    namespace caffe {
    /**
     * @brief Also known as a "LargeMargin fully-connected" layer, computes a LargeMargin inner product
     *        with a set of learned weights, and (optionally) adds biases.
     * TODO(dox): thorough documentation for Forward, Backward, and proto params.
     */
    // Declaration of the large-margin inner-product layer. The implementation
    // reads two bottoms (bottom[0] data, bottom[1] labels) and writes the
    // current lambda_ value into top[1].
    // NOTE(review): no access specifiers (public:/protected:) and no closing
    // "};" are visible in this excerpt -- presumably lost in extraction;
    // confirm against the full header.
    template <typename Dtype>
    class LargeMarginInnerProductLayer : public Layer<Dtype> {
      // Forwards the layer configuration to the Layer<Dtype> base class.
      explicit LargeMarginInnerProductLayer(const LayerParameter& param)
          : Layer<Dtype>(param) {}
      // One-time setup: reads the layer parameters and creates the weight blob.
      virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
          const vector<Blob<Dtype>*>& top);
      // Recomputes M_ and the output shape from the current bottom shape.
      virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
          const vector<Blob<Dtype>*>& top);
      // Type string reported for this layer.
      virtual inline const char* type() const { return "LargeMarginInnerProduct"; }
      // Exactly two bottoms are required.
      virtual inline int ExactNumBottomBlobs() const { return 2; }
      // At most two tops are allowed.
      virtual inline int MaxTopBlobs() const { return 2; }
      virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
          const vector<Blob<Dtype>*>& top);
      virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
          const vector<Blob<Dtype>*>& top);
      virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
          const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
      virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
          const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
      // Number of samples: bottom[0]->count(0, axis), set in Reshape.
      int M_;
      // Input dimension per sample: bottom[0]->count(axis), set in LayerSetUp.
      int K_;
      // Output dimension per sample: the num_output parameter.
      int N_;
      // Margin variant read from the layer parameter; selects the switch
      // branches in Reshape/Forward/Backward.
      LargeMarginInnerProductParameter_LargeMarginType type_;
      // common variables
      Blob<Dtype> x_norm_;     // L2 norm of each input row (M_ values)
      Blob<Dtype> w_norm_;     // L2 norm of each weight row (N_ values)
      Blob<Dtype> cos_theta_;  // x.w / (|x||w| + epsilon), M_ * N_ values
      Blob<Dtype> sign_0_; // sign_0 = sign(cos_theta)
      // for DOUBLE type
      Blob<Dtype> cos_theta_quadratic_;  // cos_theta ^ 2
      // for TRIPLE type
      Blob<Dtype> sign_1_; // sign_1 = sign(abs(cos_theta) - 0.5)
      Blob<Dtype> sign_2_; // sign_2 = sign_0 * (1 + sign_1) - 2
      Blob<Dtype> cos_theta_cubic_;  // cos_theta ^ 3
      // for QUADRA type
      Blob<Dtype> sign_3_; // sign_3 = sign_0 * sign(2 * cos_theta_quadratic_ - 1)
      Blob<Dtype> sign_4_; // sign_4 = 2 * sign_0 + sign_3 - 3
      Blob<Dtype> cos_theta_quartic_;  // cos_theta ^ 4
      // Iteration counter: initialised from the `iteration` parameter and
      // incremented once per Forward_cpu call.
      int iter_;
      // Blending weight recomputed in every Forward_cpu call and reused in
      // Backward_cpu.
      Dtype lambda_;
    }  // namespace caffe


    • largemargin_inner_product_layer.cpp


    #include <vector>
    #include "caffe/blob.hpp"
    #include "caffe/common.hpp"
    #include "caffe/filler.hpp"
    #include "caffe/layer.hpp"
    #include "caffe/util/math_functions.hpp"
    #include "caffe/layers/largemargin_inner_product_layer.hpp"
    namespace caffe {
    // This function mainly assigns the layer parameters and initialises the
    // weight Blob.
    //
    // bottom[0] is the input data and bottom[1] the labels; their num() values
    // must match. `top` is not used in the lines visible here.
    // NOTE(review): several statements in this excerpt are cut off (the
    // CanonicalAxisIndex(...) call, the GetFiller(...) call, the closing brace
    // of the function) -- presumably lost in extraction; confirm against the
    // full source before relying on this listing.
    template <typename Dtype>
    void LargeMarginInnerProductLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
          const vector<Blob<Dtype>*>& top) {
      // One label per sample is required.
      CHECK_EQ(bottom[0]->num(), bottom[1]->num())
          << "Number of labels must match number of output; "
          << "DO NOT support multi-label this version."
          << "e.g., if prediction shape is (M X N), "
          << "label count (number of labels) must be M, "
          << "with integer values in {0, 1, ..., N-1}.";
      // Margin variant and starting iteration come from the layer parameter;
      // lambda_ starts at zero and is recomputed in Forward_cpu.
      type_ = this->layer_param_.largemargin_inner_product_param().type();
      iter_ = this->layer_param_.largemargin_inner_product_param().iteration();
      lambda_ = (Dtype)0.;
      const int num_output = this->layer_param_.largemargin_inner_product_param().num_output();
      N_ = num_output;
      const int axis = bottom[0]->CanonicalAxisIndex(
      // K_ is the number of input values per sample, counted from `axis` on.
      K_ = bottom[0]->count(axis);
      // Check if we need to set up the weights
      if (this->blobs_.size() > 0) {
        LOG(INFO) << "Skipping parameter initialization";
      } else {
        // Intialize the weight
        // NOTE(review): no blobs_.resize(...) is visible before blobs_[0] is
        // indexed below -- presumably dropped from this excerpt; verify.
        vector<int> weight_shape(2);
        weight_shape[0] = N_;
        weight_shape[1] = K_;
        this->blobs_[0].reset(new Blob<Dtype>(weight_shape));//weight has shape (N_, K_)
        // fill the weights
        shared_ptr<Filler<Dtype> > weight_filler(GetFiller<Dtype>(
      }  // parameter initialization
      this->param_propagate_down_.resize(this->blobs_.size(), true);//one flag per parameter blob, new entries true; weight sizing and initialization end here
    // Checks that the input size still matches K_, recomputes M_, and prepares
    // the shape vectors used for the top blobs and the temporary blobs.
    // NOTE(review): the Reshape(...) calls that would consume top_shape,
    // lambda_shape, shape_1_X_M and shape_1_X_N, the bodies of the switch
    // cases, and the `default:` label before LOG(FATAL) are not visible in
    // this excerpt -- presumably lost in extraction; confirm against the full
    // source.
    template <typename Dtype>
    void LargeMarginInnerProductLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
          const vector<Blob<Dtype>*>& top) {
      // Figure out the dimensions
      const int axis = bottom[0]->CanonicalAxisIndex(
      const int new_K = bottom[0]->count(axis);
      CHECK_EQ(K_, new_K)
          << "Input size incompatible with inner product parameters.";
      M_ = bottom[0]->count(0, axis);//N_: output dimension per sample, M_: number of samples, K_: input dimension per sample -- keep these three meanings in mind
      // Output shape: the leading axes of bottom[0], with N_ at position `axis`.
      vector<int> top_shape = bottom[0]->shape();
      top_shape.resize(axis + 1);
      top_shape[axis] = N_;
      // if needed, reshape top[1] to output lambda
      vector<int> lambda_shape(1, 1);
      // common variables
      vector<int> shape_1_X_M(1, M_);
      vector<int> shape_1_X_N(1, N_);
      // optional temp variables
      switch (type_) {
      case LargeMarginInnerProductParameter_LargeMarginType_SINGLE:
      case LargeMarginInnerProductParameter_LargeMarginType_DOUBLE:
      case LargeMarginInnerProductParameter_LargeMarginType_TRIPLE:
      case LargeMarginInnerProductParameter_LargeMarginType_QUADRUPLE:
        LOG(FATAL) << "Unknown L-Softmax type.";
    // CPU forward pass.
    //
    // Steps visible in this listing:
    //   1. advance iter_ and recompute lambda_ (written to top[1]);
    //   2. compute the per-row norms of the input and the weights, their
    //      outer product, cos(theta), and the sign/power helper blobs needed
    //      by the selected margin type;
    //   3. compute top[0] = X * W^T, overwrite the entry of each sample's
    //      label class with the margin formula, then blend with the plain
    //      inner product using lambda_.
    // bottom[0] is the input data, bottom[1] holds one label per sample.
    // NOTE(review): closing braces and `break;` statements are not visible
    // anywhere in this excerpt, and the `default:` label of the first switch
    // is missing -- presumably lost in extraction. As printed, the cases
    // would fall through; confirm against the full source.
    template <typename Dtype>
    void LargeMarginInnerProductLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
        const vector<Blob<Dtype>*>& top) {
      // iter_ is declared as int in the header; the Dtype addend is converted
      // back to int on assignment.
      iter_ += (Dtype)1.;
      Dtype base_ = this->layer_param_.largemargin_inner_product_param().base();
      Dtype gamma_ = this->layer_param_.largemargin_inner_product_param().gamma();
      Dtype power_ = this->layer_param_.largemargin_inner_product_param().power();
      Dtype lambda_min_ = this->layer_param_.largemargin_inner_product_param().lambda_min();
      // lambda_ = base * (1 + gamma * iter)^(-power), clamped below by lambda_min.
      lambda_ = base_ * pow(((Dtype)1. + gamma_ * iter_), -power_);
      lambda_ = std::max(lambda_, lambda_min_);
      top[1]->mutable_cpu_data()[0] = lambda_;//decay term (original note: lambda_ tends to 0 when iter_ is very large); the max() above keeps it >= lambda_min_
      /************************* common variables *************************/
      const Dtype* bottom_data = bottom[0]->cpu_data();
      Dtype* mutable_x_norm_data = x_norm_.mutable_cpu_data();
      for (int i = 0; i < M_; i++) {
        mutable_x_norm_data[i] = sqrt(caffe_cpu_dot(K_, bottom_data + i * K_, bottom_data + i * K_));//compute norm{x_i}, i in [0, M_)
      const Dtype* weight = this->blobs_[0]->cpu_data();
      Dtype* mutable_w_norm_data = w_norm_.mutable_cpu_data();
      for (int i = 0; i < N_; i++) {
        mutable_w_norm_data[i] = sqrt(caffe_cpu_dot(K_, weight + i * K_, weight + i * K_));//compute norm{w_i}, i in [0, N_)
      // NOTE(review): no Reshape of this local blob is visible before its data
      // is written -- presumably dropped from this excerpt; verify.
      Blob<Dtype> xw_norm_product_;
      caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, M_, N_, 1, (Dtype)1.,
          x_norm_.cpu_data(), w_norm_.cpu_data(), (Dtype)0., xw_norm_product_.mutable_cpu_data());//norm{w_i} times norm{x_j}; output shape is (M_, N_)
      // cos_theta_ first receives the raw inner products X * W^T ...
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype)1.,
          bottom_data, weight, (Dtype)0., cos_theta_.mutable_cpu_data());
      caffe_add_scalar(M_ * N_, (Dtype)0.000000001, xw_norm_product_.mutable_cpu_data());//avoid a zero denominator
      // ... and is then divided element-wise by the norm product.
      caffe_div(M_ * N_, cos_theta_.cpu_data(), xw_norm_product_.cpu_data(), cos_theta_.mutable_cpu_data());//cos(theta); output shape is (M_, N_)
      caffe_cpu_sign(M_ * N_, cos_theta_.cpu_data(), sign_0_.mutable_cpu_data());
      // Helper blobs that depend on the margin type.
      switch (type_) {
      case LargeMarginInnerProductParameter_LargeMarginType_SINGLE:
      case LargeMarginInnerProductParameter_LargeMarginType_DOUBLE:
        caffe_powx(M_ * N_, cos_theta_.cpu_data(), (Dtype)2., cos_theta_quadratic_.mutable_cpu_data());//cos(theta)^2; output shape is (M_, N_)
      case LargeMarginInnerProductParameter_LargeMarginType_TRIPLE:
        caffe_powx(M_ * N_, cos_theta_.cpu_data(), (Dtype)2., cos_theta_quadratic_.mutable_cpu_data());
        caffe_powx(M_ * N_, cos_theta_.cpu_data(), (Dtype)3., cos_theta_cubic_.mutable_cpu_data());
        // sign_1 = sign(abs(cos_theta) - 0.5)
        caffe_abs(M_ * N_, cos_theta_.cpu_data(), sign_1_.mutable_cpu_data());
        caffe_add_scalar(M_ * N_, -(Dtype)0.5, sign_1_.mutable_cpu_data());
        caffe_cpu_sign(M_ * N_, sign_1_.cpu_data(), sign_1_.mutable_cpu_data());
        // sign_2 = sign_0 * (1 + sign_1) - 2
        caffe_copy(M_ * N_, sign_1_.cpu_data(), sign_2_.mutable_cpu_data());
        caffe_add_scalar(M_ * N_, (Dtype)1., sign_2_.mutable_cpu_data());
        caffe_mul(M_ * N_, sign_0_.cpu_data(), sign_2_.cpu_data(), sign_2_.mutable_cpu_data());
        caffe_add_scalar(M_ * N_, - (Dtype)2., sign_2_.mutable_cpu_data());
      case LargeMarginInnerProductParameter_LargeMarginType_QUADRUPLE:
        caffe_powx(M_ * N_, cos_theta_.cpu_data(), (Dtype)2., cos_theta_quadratic_.mutable_cpu_data());
        caffe_powx(M_ * N_, cos_theta_.cpu_data(), (Dtype)3., cos_theta_cubic_.mutable_cpu_data());
        caffe_powx(M_ * N_, cos_theta_.cpu_data(), (Dtype)4., cos_theta_quartic_.mutable_cpu_data());
        // sign_3 = sign_0 * sign(2 * cos_theta^2 - 1)
        caffe_copy(M_ * N_, cos_theta_quadratic_.cpu_data(), sign_3_.mutable_cpu_data());
        caffe_scal(M_ * N_, (Dtype)2., sign_3_.mutable_cpu_data());
        caffe_add_scalar(M_ * N_, (Dtype)-1., sign_3_.mutable_cpu_data());
        caffe_cpu_sign(M_ * N_, sign_3_.cpu_data(), sign_3_.mutable_cpu_data());
        caffe_mul(M_ * N_, sign_0_.cpu_data(), sign_3_.cpu_data(), sign_3_.mutable_cpu_data());
        // sign_4 = 2 * sign_0 + sign_3 - 3
        caffe_copy(M_ * N_, sign_0_.cpu_data(), sign_4_.mutable_cpu_data());
        caffe_scal(M_ * N_, (Dtype)2., sign_4_.mutable_cpu_data());
        caffe_add(M_ * N_, sign_4_.cpu_data(), sign_3_.cpu_data(), sign_4_.mutable_cpu_data());
        caffe_add_scalar(M_ * N_, - (Dtype)3., sign_4_.mutable_cpu_data());
        LOG(FATAL) << "Unknown L-Softmax type.";
      /************************* Forward *************************/
      Dtype* top_data = top[0]->mutable_cpu_data();
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype)1.,
          bottom_data, weight, (Dtype)0., top_data);//top = X W', where X is the bottom data with shape (M_, K_) and W' is the transposed weight matrix with shape (K_, N_)
      const Dtype* label = bottom[1]->cpu_data();
      const Dtype* xw_norm_product_data = xw_norm_product_.cpu_data();
        // Replace the label-class entry of every sample with the margin value.
        switch (type_) {
      case LargeMarginInnerProductParameter_LargeMarginType_SINGLE: {
      case LargeMarginInnerProductParameter_LargeMarginType_DOUBLE: {
          const Dtype* sign_0_data = sign_0_.cpu_data();
          const Dtype* cos_theta_quadratic_data = cos_theta_quadratic_.cpu_data();
        for (int i = 0; i < M_; i++) {
          const int label_value = static_cast<int>(label[i]);
          // |x||w| * (2 * sign_0 * cos^2(theta) - 1)
          top_data[i * N_ + label_value] = xw_norm_product_data[i * N_ + label_value] * 
                                           ((Dtype)2. * sign_0_data[i * N_ + label_value] * 
                                           cos_theta_quadratic_data[i * N_ + label_value] - (Dtype)1.);
        caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, N_, K_, lambda_,
          bottom_data, weight, (Dtype)1., top_data);//add lambda_ * X W' to top (original note: lambda_ is introduced to speed up convergence)
        // Normalise the blend: top = (margin + lambda_ * X W') / (1 + lambda_).
        caffe_scal(M_ * N_, (Dtype)1./((Dtype)1. + lambda_), top_data);
      case LargeMarginInnerProductParameter_LargeMarginType_TRIPLE: {
          const Dtype* sign_1_data = sign_1_.cpu_data();
        const Dtype* sign_2_data = sign_2_.cpu_data();
        const Dtype* cos_theta_data = cos_theta_.cpu_data();
        const Dtype* cos_theta_cubic_data = cos_theta_cubic_.cpu_data();
        for (int i = 0; i < M_; i++) {
          const int label_value = static_cast<int>(label[i]);
          // |x||w| * (sign_1 * (4 * cos^3(theta) - 3 * cos(theta)) + sign_2)
          top_data[i * N_ + label_value] = xw_norm_product_data[i * N_ + label_value] * 
                                          (sign_1_data[i * N_ + label_value] * ((Dtype)4. * 
                                              cos_theta_cubic_data[i * N_ + label_value] - 
                                           (Dtype)3. * cos_theta_data[i * N_ + label_value]) + 
                                           sign_2_data[i * N_ + label_value]);
        caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, N_, K_, lambda_,
          bottom_data, weight, (Dtype)1., top_data);
        caffe_scal(M_ * N_, (Dtype)1./((Dtype)1. + lambda_), top_data);
      case LargeMarginInnerProductParameter_LargeMarginType_QUADRUPLE: {
          const Dtype* sign_3_data = sign_3_.cpu_data();
        const Dtype* sign_4_data = sign_4_.cpu_data();
        const Dtype* cos_theta_quadratic_data = cos_theta_quadratic_.cpu_data();
        const Dtype* cos_theta_quartic_data = cos_theta_quartic_.cpu_data();
        for (int i = 0; i < M_; i++) {
          const int label_value = static_cast<int>(label[i]);
          // |x||w| * (sign_3 * (8 * cos^4(theta) - 8 * cos^2(theta) + 1) + sign_4)
          top_data[i * N_ + label_value] = xw_norm_product_data[i * N_ + label_value] * 
                                           (sign_3_data[i * N_ + label_value] * ((Dtype)8. * 
                                               cos_theta_quartic_data[i * N_ + label_value] - 
                                            (Dtype)8. * cos_theta_quadratic_data[i * N_ + label_value] + 
                                            (Dtype)1.) + sign_4_data[i * N_ + label_value]);
        caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, N_, K_, lambda_,
          bottom_data, weight, (Dtype)1., top_data);
        caffe_scal(M_ * N_, (Dtype)1./((Dtype)1. + lambda_), top_data);
      default: {
        LOG(FATAL) << "Unknown L-Softmax type.";
    // CPU backward pass.
    //
    // Accumulates the gradient with respect to the weights (blobs_[0] diff)
    // when param_propagate_down_[0] is set, and writes the gradient with
    // respect to the input (bottom[0] diff) when propagate_down[0] is set.
    // For every (sample, class) pair the non-label classes use the plain
    // inner-product gradient scaled by 1 / (1 + lambda_); the label class uses
    // the margin-specific terms. A final gemm adds the plain gradient scaled
    // by lambda_ / (1 + lambda_).
    // Relies on x_norm_, w_norm_, cos_theta_, the sign_* blobs and lambda_ as
    // left behind by Forward_cpu.
    // NOTE(review): closing braces and `break;` statements are not visible in
    // this excerpt, and a few statements are cut mid-expression (flagged
    // below) -- presumably lost in extraction; confirm against the full
    // source.
    template <typename Dtype>
    void LargeMarginInnerProductLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
        const vector<bool>& propagate_down,
        const vector<Blob<Dtype>*>& bottom) {
      // NOTE(review): no Reshape of these two local blobs is visible before
      // their data is written -- presumably dropped from this excerpt; verify.
      Blob<Dtype> inv_w_norm_;
      Blob<Dtype> xw_norm_ratio_;
      // inv_w_norm = 1 / (w_norm + epsilon); note that w_norm_ itself is
      // modified in place by the add_scalar call.
      caffe_add_scalar(N_, (Dtype)0.000000001, w_norm_.mutable_cpu_data());
      caffe_set(N_, (Dtype)1., inv_w_norm_.mutable_cpu_data());
      caffe_div(N_, inv_w_norm_.cpu_data(), w_norm_.cpu_data(), inv_w_norm_.mutable_cpu_data());
      // xw_norm_ratio[j * N_ + i] = x_norm[j] * inv_w_norm[i], i.e. |x_j| / |w_i|.
      caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, M_, N_, 1, (Dtype)1.,
          x_norm_.cpu_data(), inv_w_norm_.cpu_data(), (Dtype)0., xw_norm_ratio_.mutable_cpu_data());
      const Dtype* top_diff = top[0]->cpu_diff();
      const Dtype* bottom_data = bottom[0]->cpu_data();
      const Dtype* label = bottom[1]->cpu_data();
      const Dtype* weight = this->blobs_[0]->cpu_data();
      // Gradient with respect to the weights.
      if (this->param_propagate_down_[0]) {
        Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff();//note: weight and weight_diff are different things (data vs. diff of blobs_[0])
        const Dtype* xw_norm_ratio_data = xw_norm_ratio_.cpu_data();
        switch (type_) {
        case LargeMarginInnerProductParameter_LargeMarginType_SINGLE: {
          // Plain inner product: weight_diff += top_diff^T * X.
          caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1.,
            top_diff, bottom_data, (Dtype)1., this->blobs_[0]->mutable_cpu_diff());
        case LargeMarginInnerProductParameter_LargeMarginType_DOUBLE: {
          const Dtype* sign_0_data = sign_0_.cpu_data();
          const Dtype* cos_theta_data = cos_theta_.cpu_data();
          const Dtype* cos_theta_quadratic_data = cos_theta_quadratic_.cpu_data();
          for (int i = 0; i < N_; i++) {
            for (int j = 0; j < M_; j++) {// dL/dw_ij = sum_n{dL/df_ni * df_ni/dw_ij}, with n ranging over [0, M_)
              const int label_value = static_cast<int>(label[j]);
              if (label_value != i) {
                caffe_cpu_axpby(K_, (Dtype)1. / ((Dtype)1. + lambda_) * top_diff[j * N_ + i], 
                                bottom_data + j * K_, (Dtype)1., weight_diff + i * K_);
              } else {
                caffe_cpu_axpby(K_, (Dtype)1. / ((Dtype)1. + lambda_) * top_diff[j * N_ + i] * 
                                    (Dtype)4. * sign_0_data[j * N_ + i] * cos_theta_data[j * N_ + i],
                                bottom_data + j * K_, (Dtype)1., weight_diff + i * K_);
                caffe_cpu_axpby(K_, (Dtype)1. / ((Dtype)1. + lambda_) * top_diff[j * N_ + i] * 
                                    (-xw_norm_ratio_data[j * N_ + i]) * ((Dtype)2. * sign_0_data[j * N_ + i] * 
                                    cos_theta_quadratic_data[j * N_ + i] + (Dtype)1.), 
                                weight + i * K_, (Dtype)1., weight_diff + i * K_);
          // Add the plain inner-product gradient weighted by lambda_ / (1 + lambda_).
          caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, N_, K_, M_, lambda_/((Dtype)1. + lambda_),
            top_diff, bottom_data, (Dtype)1., this->blobs_[0]->mutable_cpu_diff());
        case LargeMarginInnerProductParameter_LargeMarginType_TRIPLE: {
          const Dtype* sign_1_data = sign_1_.cpu_data();
          const Dtype* sign_2_data = sign_2_.cpu_data();
          const Dtype* cos_theta_quadratic_data = cos_theta_quadratic_.cpu_data();
          const Dtype* cos_theta_cubic_data = cos_theta_cubic_.cpu_data();
          for (int i = 0; i < N_; i++) {
            for (int j = 0; j < M_; j++) {
              const int label_value = static_cast<int>(label[j]);
              if (label_value != i) {
                caffe_cpu_axpby(K_, (Dtype)1. / ((Dtype)1. + lambda_) * top_diff[j * N_ + i], 
                                bottom_data + j * K_, (Dtype)1., weight_diff + i * K_);
              } else {
                // NOTE(review): the scale expression of the next call ends
                // with a dangling "-" -- its last term appears to be missing
                // from this excerpt; verify.
                caffe_cpu_axpby(K_, (Dtype)1. / ((Dtype)1. + lambda_) * top_diff[j * N_ + i] * 
                                    sign_1_data[j * N_ + i] * ((Dtype)12. * cos_theta_quadratic_data[j * N_ + i] - 
                                bottom_data + j * K_, (Dtype)1., weight_diff + i * K_);
                caffe_cpu_axpby(K_, (Dtype)1. / ((Dtype)1. + lambda_) * top_diff[j * N_ + i] * 
                                    (-xw_norm_ratio_data[j * N_ + i]) * ((Dtype)8. * sign_1_data[j * N_ + i] * 
                                    cos_theta_cubic_data[j * N_ + i] - sign_2_data[j * N_ + i]), 
                                weight + i * K_, (Dtype)1., weight_diff + i * K_);
          caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, N_, K_, M_, lambda_/((Dtype)1. + lambda_),
            top_diff, bottom_data, (Dtype)1., this->blobs_[0]->mutable_cpu_diff());
        case LargeMarginInnerProductParameter_LargeMarginType_QUADRUPLE: {
          const Dtype* sign_3_data = sign_3_.cpu_data();
          const Dtype* sign_4_data = sign_4_.cpu_data();
          const Dtype* cos_theta_data = cos_theta_.cpu_data();
          const Dtype* cos_theta_quadratic_data = cos_theta_quadratic_.cpu_data();
          const Dtype* cos_theta_cubic_data = cos_theta_cubic_.cpu_data();
          const Dtype* cos_theta_quartic_data = cos_theta_quartic_.cpu_data();
          for (int i = 0; i < N_; i++) {
            for (int j = 0; j < M_; j++) {
              const int label_value = static_cast<int>(label[j]);
              if (label_value != i) {
                caffe_cpu_axpby(K_, (Dtype)1. / ((Dtype)1. + lambda_) * top_diff[j * N_ + i], 
                                bottom_data + j * K_, (Dtype)1., weight_diff + i * K_);
              } else {
                caffe_cpu_axpby(K_, (Dtype)1. / ((Dtype)1. + lambda_) * top_diff[j * N_ + i] * 
                                    sign_3_data[j * N_ + i] * ((Dtype)32. * cos_theta_cubic_data[j * N_ + i] - 
                                    (Dtype)16. * cos_theta_data[j * N_ + i]),
                                bottom_data + j * K_, (Dtype)1., weight_diff + i * K_);
                caffe_cpu_axpby(K_, (Dtype)1. / ((Dtype)1. + lambda_) * top_diff[j * N_ + i] *
                                    (-xw_norm_ratio_data[j * N_ + i]) * (sign_3_data[j * N_ + i] * 
                                    ((Dtype)24. * cos_theta_quartic_data[j * N_ + i] - 
                                     (Dtype)8. * cos_theta_quadratic_data[j * N_ + i] - (Dtype)1.) - 
                                    sign_4_data[j * N_ + i]), 
                                weight + i * K_, (Dtype)1., weight_diff + i * K_);
          caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, N_, K_, M_, lambda_/((Dtype)1. + lambda_),
            top_diff, bottom_data, (Dtype)1., this->blobs_[0]->mutable_cpu_diff());
        default: {
          LOG(FATAL) << "Unknown L-Softmax type.";
      // Gradient with respect to bottom data
      if (propagate_down[0]) {
        Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
        const Dtype* xw_norm_ratio_data = xw_norm_ratio_.cpu_data();
        // bottom_diff is accumulated into below, so clear it first.
        caffe_set(M_ * K_, (Dtype)0., bottom_diff);
        switch (type_) {
        case LargeMarginInnerProductParameter_LargeMarginType_SINGLE: {
          // NOTE(review): the gemm calls in this half are cut before their
          // output argument in this excerpt; verify against the full source.
          caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1.,
            top_diff, this->blobs_[0]->cpu_data(), (Dtype)0.,
        case LargeMarginInnerProductParameter_LargeMarginType_DOUBLE: {
          const Dtype* sign_0_data = sign_0_.cpu_data();
          const Dtype* cos_theta_data = cos_theta_.cpu_data();
          const Dtype* cos_theta_quadratic_data = cos_theta_quadratic_.cpu_data();
          for (int i = 0; i < M_; i++) {
            const int label_value = static_cast<int>(label[i]);
            for (int j = 0; j < N_; j++) {// dL/dx_ij = sum_n{dL/df_in * df_in/dx_ij}, with n ranging over [0, N_)
              if (label_value != j) {
                caffe_cpu_axpby(K_, (Dtype)1. / ((Dtype)1. + lambda_) * top_diff[i * N_ + j], 
                                weight + j * K_, (Dtype)1., bottom_diff + i * K_);
              } else {
                caffe_cpu_axpby(K_, (Dtype)1. / ((Dtype)1. + lambda_) * top_diff[i * N_ + j] * 
                                    (Dtype)4. * sign_0_data[i * N_ + j] * cos_theta_data[i * N_ + j], 
                                weight + j * K_, (Dtype)1., bottom_diff + i * K_);
                // Here the ratio is divided by rather than multiplied with:
                // the factor is |w_j| / |x_i| instead of |x_i| / |w_j|.
                caffe_cpu_axpby(K_, (Dtype)1. / ((Dtype)1. + lambda_) * top_diff[i * N_ + j] / 
                                    (-xw_norm_ratio_data[i * N_ + j]) * ((Dtype)2. * sign_0_data[i * N_ + j] * 
                                    cos_theta_quadratic_data[i * N_ + j] + (Dtype)1.), 
                                bottom_data + i * K_, (Dtype)1., bottom_diff + i * K_);
          caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, K_, N_, lambda_/((Dtype)1. + lambda_),
            top_diff, this->blobs_[0]->cpu_data(), (Dtype)1.,
        case LargeMarginInnerProductParameter_LargeMarginType_TRIPLE: {
          const Dtype* sign_1_data = sign_1_.cpu_data();
          const Dtype* sign_2_data = sign_2_.cpu_data();
          const Dtype* cos_theta_quadratic_data = cos_theta_quadratic_.cpu_data();
          const Dtype* cos_theta_cubic_data = cos_theta_cubic_.cpu_data();
          for (int i = 0; i < M_; i++) {
            const int label_value = static_cast<int>(label[i]);
            for (int j = 0; j < N_; j++) {
              if (label_value != j) {
                caffe_cpu_axpby(K_, (Dtype)1. / ((Dtype)1. + lambda_) * top_diff[i * N_ + j], 
                                weight + j * K_, (Dtype)1., bottom_diff + i * K_);
              } else {
                // NOTE(review): the scale expression of the next call ends
                // with a dangling "-" -- its last term appears to be missing
                // from this excerpt; verify.
                caffe_cpu_axpby(K_, (Dtype)1. / ((Dtype)1. + lambda_) * top_diff[i * N_ + j] * 
                                    sign_1_data[i * N_ + j] * ((Dtype)12. * cos_theta_quadratic_data[i * N_ + j] - 
                                weight + j * K_, (Dtype)1., bottom_diff + i * K_);
                caffe_cpu_axpby(K_, (Dtype)1. / ((Dtype)1. + lambda_) * top_diff[i * N_ + j] / 
                                    (-xw_norm_ratio_data[i * N_ + j]) * ((Dtype)8. * sign_1_data[i * N_ + j] * 
                                    cos_theta_cubic_data[i * N_ + j] - sign_2_data[i * N_ +j]), 
                                bottom_data + i * K_, (Dtype)1., bottom_diff + i * K_);
          caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, K_, N_, lambda_/((Dtype)1. + lambda_),
            top_diff, this->blobs_[0]->cpu_data(), (Dtype)1.,
        case LargeMarginInnerProductParameter_LargeMarginType_QUADRUPLE: {
          const Dtype* sign_3_data = sign_3_.cpu_data();
          const Dtype* sign_4_data = sign_4_.cpu_data();
          const Dtype* cos_theta_data = cos_theta_.cpu_data();
          const Dtype* cos_theta_quadratic_data = cos_theta_quadratic_.cpu_data();
          const Dtype* cos_theta_cubic_data = cos_theta_cubic_.cpu_data();
          const Dtype* cos_theta_quartic_data = cos_theta_quartic_.cpu_data();
          for (int i = 0; i < M_; i++) {
            const int label_value = static_cast<int>(label[i]);
            for (int j = 0; j < N_; j++) {
              if (label_value != j) {
                caffe_cpu_axpby(K_, (Dtype)1. / ((Dtype)1. + lambda_) * top_diff[i * N_ + j], 
                                weight + j * K_, (Dtype)1., bottom_diff + i * K_);
              } else {
                caffe_cpu_axpby(K_, (Dtype)1. / ((Dtype)1. + lambda_) * top_diff[i * N_ + j] * 
                                    sign_3_data[i * N_ + j] * ((Dtype)32. * cos_theta_cubic_data[i * N_ + j] -
                                    (Dtype)16. * cos_theta_data[i * N_ + j]), 
                                weight + j * K_, (Dtype)1., bottom_diff + i * K_);
                caffe_cpu_axpby(K_, (Dtype)1. / ((Dtype)1. + lambda_) * top_diff[i * N_ + j] /
                                    (-xw_norm_ratio_data[i * N_ + j]) * (sign_3_data[i * N_ + j] * 
                                    ((Dtype)24. * cos_theta_quartic_data[i * N_ + j] - 
                                     (Dtype)8. * cos_theta_quadratic_data[i * N_ + j] - (Dtype)1.) - 
                                    sign_4_data[i * N_ + j]), 
                                bottom_data + i * K_, (Dtype)1., bottom_diff + i * K_);
          caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, K_, N_, lambda_/((Dtype)1. + lambda_),
            top_diff, this->blobs_[0]->cpu_data(), (Dtype)1.,
        default: {
          LOG(FATAL) << "Unknown L-Softmax type.";
    #ifdef CPU_ONLY
    }  // namespace caffe




