zoukankan      html  css  js  c++  java
  • cascade DecodeBBox层

    https://zhuanlan.zhihu.com/p/36095768

    我的推断:第二、第三阶段应该不是先对所有anchor做bounding box regression、再选取当前条件下的所有roi,而是第一阶段先选取512个roi,然后把在第一阶段匹配好的roi送到第二、三阶段。

    # DecodeBBox layer producing the proposals consumed by the second cascade stage.
    layer {
      name: "proposals_2nd"
      type: "DecodeBBox"
      # bottom[0]: box regression predictions to be decoded.
      bottom: "bbox_pred"
      # bottom[1]: prior boxes the predictions are applied to; each row is
      # [img_id x1 y1 x2 y2].
      bottom: "rois"
      # bottom[2]: optional; the layer reads the last channel of each row as the
      # overlap with the matched ground-truth box.
      bottom: "match_gt_boxes"
      top: "proposals_2nd"
      bbox_reg_param {
        # The layer requires exactly four means and four stds; predictions are
        # de-normalized as value * std + mean before decoding.
        bbox_mean: 0 bbox_mean: 0 bbox_mean: 0 bbox_mean: 0
        bbox_std: 0.1 bbox_std: 0.1 bbox_std: 0.2 bbox_std: 0.2
      }
      # One entry per bottom, in order; 0 disables back-propagation to that bottom.
      propagate_down: 0
      propagate_down: 0
      propagate_down: 0
    }

    这段代码就证明了这个想法:rois来自于第一阶段proposal_info,这些rois也是在第一阶段做roi-pooling用来训练的。

    个人感觉cascade的模型就是4张图提取512个roi进行训练,然后经过第一阶段训练后,把这512个roi经过回归精修,再去除回归后x1大于x2或y1大于y2的框,以及和gt的iou不低于阈值gt_iou_thr(0.95)的框,这样roi可能就没有512个了。把这些roi输入给第二阶段的proposal_info_2nd,让这个层再去决定训练样本,这样很可能训练的数据不足512个,并且3个阶段其实训练的都是同一批roi,也就是说第一阶段进去的那些roi,后面几个阶段实际上也在训练它们,而不是新出来的框。DecodeBBox层的输入是bbox_pred、rois和match_gt_boxes。首先明确一点,RPN网络会输出很多proposals,ProposalTarget层将这些proposals和gt算iou,确定正负样本并按1:3的比例选取,然后输出rois,rois就是从RPN中获得的、拿来具体训练的那部分预提取框。DecodeBBox层就是将这些原本的rois回归成更精准的框,也就是在原始rois的坐标上加上经过训练得到的回归值,这是通过DecodeBBoxesWithPrior函数实现的。DecodeBBox层大致分为3个步骤:1.回归得到更精准的rois  2.去掉回归后x1大于x2或y1大于y2的框  3.去掉和gt的iou不低于阈值gt_iou_thr(0.95)的框
    这部分的结果

    #include <cfloat>
    #include <vector>
    
    #include "caffe/util/bbox_util.hpp"
    #include "caffe/layers/decode_bbox_layer.hpp"
    
    namespace caffe {
        
    // One-time setup: loads the bounding-box regression normalization statistics
    // (per-coordinate mean and std) from the layer's bbox_reg_param.
    //
    // bbox_mean_ and bbox_std_ are always shaped as 4 x 1 x 1 x 1. When the
    // parameter supplies both means and stds, there must be exactly four of each
    // and every std must be strictly positive. When either list is empty, the
    // statistics fall back to mean = 0 and std = 1.
    //
    // bottom / top are not used here.
    template <typename Dtype>
    void DecodeBBoxLayer<Dtype>::LayerSetUp(
        const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
      const BBoxRegParameter reg_cfg = this->layer_param_.bbox_reg_param();
      bbox_mean_.Reshape(4, 1, 1, 1);
      bbox_std_.Reshape(4, 1, 1, 1);

      const bool has_stats =
          reg_cfg.bbox_mean_size() > 0 && reg_cfg.bbox_std_size() > 0;
      if (!has_stats) {
        // No (complete) statistics configured: use the identity normalization.
        caffe_set(bbox_mean_.count(), Dtype(0), bbox_mean_.mutable_cpu_data());
        caffe_set(bbox_std_.count(), Dtype(1), bbox_std_.mutable_cpu_data());
        return;
      }

      // Exactly one mean/std per box coordinate is required.
      int num_means = this->layer_param_.bbox_reg_param().bbox_mean_size();
      int num_stds = this->layer_param_.bbox_reg_param().bbox_std_size();
      CHECK_EQ(num_means,4); CHECK_EQ(num_stds,4);

      Dtype* mean_out = bbox_mean_.mutable_cpu_data();
      Dtype* std_out = bbox_std_.mutable_cpu_data();
      for (int i = 0; i < 4; ++i) {
        mean_out[i] = reg_cfg.bbox_mean(i);
        std_out[i] = reg_cfg.bbox_std(i);
        // A non-positive std is rejected as soon as it has been stored.
        CHECK_GT(bbox_std_.mutable_cpu_data()[i],0);
      }
    }
    
    // Validates the bottom blob shapes and sizes the internal/output blobs.
    //
    // Expected bottoms:
    //   bottom[0]  box regression predictions, 8 channels per row
    //   bottom[1]  prior boxes, 5 channels per row
    //   bottom[2]  (optional) matched ground-truth info; only allowed in TRAIN
    // All bottoms must have the same num().
    //
    // top[0] is shaped like bottom[1] here; Forward_cpu reshapes it again to the
    // number of boxes that survive screening.
    template <typename Dtype>
    void DecodeBBoxLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
          const vector<Blob<Dtype>*>& top) {  
      // bottom: bbox_blob, prior_blob, (match_gt_boxes)
      // One prediction row per prior box.
      CHECK_EQ(bottom[0]->num(),bottom[1]->num());
      if (bottom.size()>=3) {
        // The optional third bottom must also be row-aligned, and is a
        // training-only input.
        CHECK_EQ(bottom[0]->num(),bottom[2]->num());
        CHECK(this->phase_ == TRAIN);
      }
      // Fixed layouts: 8 regression values per row, 5 values per prior row.
      CHECK_EQ(bottom[0]->channels(),8); 
      CHECK_EQ(bottom[1]->channels(),5); 
      // Scratch blob that receives the decoded boxes, same shape as the predictions.
      bbox_pred_.ReshapeLike(*bottom[0]);
      top[0]->ReshapeLike(*bottom[1]);
    }
    
    // Decodes the box regression predictions against their prior boxes and emits
    // the surviving decoded boxes as [img_id x1 y1 x2 y2] rows in top[0].
    //
    // Steps:
    //   1. Unpack the prior boxes from bottom[1] and decode bottom[0] against them
    //      into bbox_pred_ via DecodeBBoxesWithPrior.
    //   2. (TRAIN only) Drop rows whose decoded box has x1 > x2 or y1 > y2.
    //   3. (TRAIN only, exactly three bottoms) Drop rows whose overlap value (last
    //      channel of bottom[2]) is >= decode_bbox_param.gt_iou_thr.
    //   4. Reshape top[0] to the number of kept rows and copy them out.
    //
    // Only the second group of four decoded values (offset 4 in each prediction
    // row) is inspected and written out. At least one row must survive screening.
    template <typename Dtype>
    void DecodeBBoxLayer<Dtype>::Forward_cpu(
        const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
      const int num = bottom[0]->num();
      const int bbox_dim = bottom[0]->channels();
      const int prior_dim = bottom[1]->channels();

      // Unpack prior boxes; each row is [img_id x1 y1 x2 y2].
      const Dtype* prior_data = bottom[1]->cpu_data();
      vector<BBox> prior_bboxes;
      prior_bboxes.reserve(num);
      for (int n = 0; n < num; ++n) {
        const Dtype* roi = prior_data + n*prior_dim;
        BBox prior;
        prior.xmin = roi[1];
        prior.ymin = roi[2];
        prior.xmax = roi[3];
        prior.ymax = roi[4];
        prior_bboxes.push_back(prior);
      }

      // Decode every prediction row into the scratch blob.
      const Dtype* bbox_data = bottom[0]->cpu_data();
      Dtype* bbox_pred_data = bbox_pred_.mutable_cpu_data();
      DecodeBBoxesWithPrior(bbox_data, prior_bboxes, bbox_dim,
                            bbox_mean_.cpu_data(), bbox_std_.cpu_data(),
                            bbox_pred_data);

      const bool is_training = (this->phase_ == TRAIN);
      vector<bool> valid_bbox_flags(num,true);

      // Screen out malformed boxes (corners flipped on either axis).
      if (is_training) {
        for (int n = 0; n < num; ++n) {
          const Dtype* box = bbox_pred_data + n*bbox_dim + 4;
          const bool x_flipped = box[0] > box[2];
          const bool y_flipped = box[1] > box[3];
          if (x_flipped || y_flipped) {
            valid_bbox_flags[n] = false;
          }
        }
      }

      // Screen out boxes whose overlap already reaches the threshold, to remove
      // redundant gt boxes.
      if (is_training && bottom.size() == 3) {
        const Dtype* match_gt_boxes = bottom[2]->cpu_data();
        const int gt_dim = bottom[2]->channels();
        const float gt_iou_thr = this->layer_param_.decode_bbox_param().gt_iou_thr();
        for (int n = 0; n < num; ++n) {
          // The overlap is stored in the last channel of each row.
          const float overlap = match_gt_boxes[n*gt_dim + (gt_dim-1)];
          if (overlap >= gt_iou_thr) {
            valid_bbox_flags[n] = false;
          }
        }
      }

      // Collect the indices of the rows that passed every screen.
      vector<int> valid_bbox_ids;
      for (int n = 0; n < num; ++n) {
        if (!valid_bbox_flags[n]) {
          continue;
        }
        valid_bbox_ids.push_back(n);
      }
      const int keep_num = valid_bbox_ids.size();
      CHECK_GT(keep_num,0);

      // Emit [img_id x1 y1 x2 y2] for each kept row; the image id comes from the
      // prior, the coordinates from the decoded prediction.
      top[0]->Reshape(keep_num, prior_dim, 1, 1);
      Dtype* decoded_bbox_data = top[0]->mutable_cpu_data();
      for (int k = 0; k < keep_num; ++k) {
        const int src_row = valid_bbox_ids[k];
        const Dtype* decoded = bbox_pred_data + src_row*bbox_dim + 4;
        Dtype* out = decoded_bbox_data + k*prior_dim;
        out[0] = prior_data[src_row*prior_dim];
        for (int j = 0; j < 4; ++j) {
          out[j+1] = decoded[j];
        }
      }
    }
    
    INSTANTIATE_CLASS(DecodeBBoxLayer);
    REGISTER_LAYER_CLASS(DecodeBBox);
    
    } // namespace caffe

    DecodeBBoxesWithPrior函数在bbox_util.cpp里实现,其功能是把bounding box regression的结果作用到输入的prior_bbox(其实就是Faster R-CNN中输入的region proposal)上,得到更精确的框坐标,并存储在pred_data中。

    template <typename Dtype>
    void DecodeBBoxesWithPrior(const Dtype* bbox_data, const vector<BBox> prior_bboxes,  
            const int bbox_dim, const Dtype* means, const Dtype* stds, 
            Dtype* pred_data) {
      const int num = prior_bboxes.size();
      const int cls_num = bbox_dim/4;
      for (int i = 0; i < num; i++) {
        Dtype pw, ph, cx, cy;
        pw = prior_bboxes[i].xmax-prior_bboxes[i].xmin+1; 
        ph = prior_bboxes[i].ymax-prior_bboxes[i].ymin+1;
        cx = 0.5*(prior_bboxes[i].xmax+prior_bboxes[i].xmin); 
        cy = 0.5*(prior_bboxes[i].ymax+prior_bboxes[i].ymin);
        for (int c = 0; c < cls_num; c++) {
          Dtype bx, by, bw, bh;
          // bbox de-normalization
          bx = bbox_data[i*bbox_dim+4*c]*stds[0]+means[0];
          by = bbox_data[i*bbox_dim+4*c+1]*stds[1]+means[1];
          bw = bbox_data[i*bbox_dim+4*c+2]*stds[2]+means[2];
          bh = bbox_data[i*bbox_dim+4*c+3]*stds[3]+means[3];
    
          Dtype tx, ty, tw, th;
          tx = bx*pw+cx; ty = by*ph+cy;
          tw = pw*exp(bw); th = ph*exp(bh);
          tx -= (tw-1)/2; ty -= (th-1)/2;
          pred_data[i*bbox_dim+4*c] = tx; 
          pred_data[i*bbox_dim+4*c+1] = ty;
          pred_data[i*bbox_dim+4*c+2] = tx+tw-1; 
          pred_data[i*bbox_dim+4*c+3] = ty+th-1;
        }
      }
    }
  • 相关阅读:
    【51NOD 1478】括号序列的最长合法子段
    【BZOJ 3527】【ZJOI 2014】力
    【BZOJ 2194】快速傅立叶之二
    【CodeVS 3123】高精度练习之超大整数乘法 &【BZOJ 2197】FFT快速傅立叶
    【BZOJ 2693】jzptab
    【BZOJ 2154】Crash的数字表格
    【BZOJ 3529】【SDOI 2014】数表
    【BZOJ 2820】YY的GCD
    【BZOJ 2301】【HAOI 2011】Problem b
    【POJ 3294】Life Forms 不小于k个字符串中的最长子串
  • 原文地址:https://www.cnblogs.com/ymjyqsx/p/9253228.html
Copyright © 2011-2022 走看看