  • Faster R-CNN notes

    2019-02-18, 15:00
    '''
    Below are notes originally written by someone else; I have added some comments of my own on top.

    '''
    #https://www.cnblogs.com/the-home-of-123/p/9747963.html
    
    
    
    
     #  Take the VOC dataset as an example: following the imdb naming convention, pascal_voc() is used to build the different imdbs
    
    '''
    
    
    for year in ['2007', '2012']:
      for split in ['train', 'val', 'trainval', 'test']:
        name = 'voc_{}_{}'.format(year, split)  #year='2007', split='trainval'
        __sets[name] = (lambda split=split, year=year: pascal_voc(split, year))
    
    
    def get_imdb(name):
      """Get an imdb (image database) by name."""
      if name not in __sets:
        raise KeyError('Unknown dataset: {}'.format(name))
      return __sets[name]()
    
    '''
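    '''
    A minimal usage sketch (my addition, not from the original notes; it just exercises the factory registered above):
    '''
    # look up the imdb registered by the loop above; this calls pascal_voc('trainval', '2007')
    imdb = get_imdb('voc_2007_trainval')
    print(imdb.name)   # -> 'voc_2007_trainval'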
    
    
    # self._data_path = os.path.join(self._devkit_path, 'VOC' + self._year)  # dataset path
    # self._classes = ('__background__',  # always index 0; training class labels, including the background class
    #                   'person')
    #  # Default to roidb handler
    # self._roidb_handler = self.gt_roidb  # region-of-interest (ROI) database handler
    # self._salt = str(uuid.uuid4())  # random per-run suffix; see the hedged sketch below
    # self._comp_id = 'comp4'  # PASCAL VOC competition/track id used when naming result files
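    '''
    A hedged sketch of why the salt exists (my addition; the names below are illustrative, not the repo's code):
    appending a random salt to the comp id gives each evaluation run unique result-file names, so runs do not
    overwrite each other's detection files.
    '''
    import uuid

    comp_id = 'comp4'                               # PASCAL VOC detection track id
    salt = str(uuid.uuid4())                        # random per-run suffix
    unique_comp_id = '{}_{}'.format(comp_id, salt)
    # result files can then be named e.g. '<unique_comp_id>_det_test_person.txt'
    print(unique_comp_id)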
    
    def _build_network(self, is_training=True):
        # select weight initializers
        if cfg.TRAIN.TRUNCATED:
          initializer = tf.truncated_normal_initializer(mean=0.0, stddev=0.01)
          initializer_bbox = tf.truncated_normal_initializer(mean=0.0, stddev=0.001)
        else:
          initializer = tf.random_normal_initializer(mean=0.0, stddev=0.01)
          initializer_bbox = tf.random_normal_initializer(mean=0.0, stddev=0.001)
    
        net_conv = self._image_to_head(is_training)  ## run the backbone feature extractor to get the base feature map
        with tf.variable_scope(self._scope, self._scope):
          # build the anchors for the image
          self._anchor_component()  ### generate the anchors for this image
          # region proposal network ### produces the proposal coordinates
          rois = self._region_proposal(net_conv, is_training, initializer)
          # rois is an array of boxes (in input-image coordinates) for the regions predicted as non-background.
          '''
          The line above is the core of the whole network!
          '''
          # region of interest pooling
          if cfg.POOLING_MODE == 'crop':
            pool5 = self._crop_pool_layer(net_conv, rois, "pool5")  ### ROI pooling: crop each proposal and resize it to a fixed size
          else:
            raise NotImplementedError
          '''
          pool5 is the result of cropping each roi out of the feature map and resizing it to a fixed spatial size.
          '''
    
    
    
    
        fc7 = self._head_to_tail(pool5, is_training)
        with tf.variable_scope(self._scope, self._scope):
          # region classification: feed fc7 into the Fast R-CNN head for classification and bounding-box regression
          cls_prob, bbox_pred = self._region_classification(fc7, is_training,
                                                            initializer, initializer_bbox)
    
        '''
        The fc and softmax layers inside self._region_classification output cls_prob and bbox_pred, i.e. the final predictions.
        '''
        self._score_summaries.update(self._predictions)
    
        return rois, cls_prob, bbox_pred
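    '''
    To make the data flow above concrete, here is a hedged shape walkthrough (my addition, assuming a VGG16
    backbone, a 600x800 input and feat_stride = 16; the exact numbers depend on the config):
    '''
    # image:      (1, 600, 800, 3)
    # net_conv:   (1, 38, 50, 512)       # 600/16 ~= 38, 800/16 = 50
    # anchors:    (38 * 50 * 9, 4)       # = 17100 anchors, 9 per feature-map position
    # rois:       (R, 5)                 # [batch_idx, x1, y1, x2, y2], R = proposals kept after NMS / sampling
    # pool5:      (R, 7, 7, 512)         # after _crop_pool_layer (14x14 crop, then 2x2 max pool)
    # fc7:        (R, 4096)              # after the fully connected head (_head_to_tail)
    # cls_prob:   (R, num_classes)
    # bbox_pred:  (R, num_classes * 4)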
    
    
    
    '''
    Below is an analysis of the core code mentioned above.
    '''
    
    def _region_proposal(self, net_conv, is_training, initializer):
        rpn = slim.conv2d(net_conv, cfg.RPN_CHANNELS, [3, 3], trainable=is_training, weights_initializer=initializer,
                            scope="rpn_conv/3x3")  ## a 3x3 conv, after which the network splits into two branches
        self._act_summaries.append(rpn)
    
        '''
        The next line is what the lecturer (Teacher Tang) called the most essential part.
        Concepts like "anchors" are virtual; they do not exist explicitly anywhere in the network. Everything is learned.

        The output has self._num_anchors * 2 channels; every pair of numbers is the (background, foreground) score
        for one anchor. Which pair corresponds to which anchor is never specified by hand; it is learned entirely
        from data, which avoids manual intervention, works better, and is more end-to-end.

        The [1, 1] kernel here means every position of the feature map gets its own scores for 9 anchors.
        '''
    
    
        rpn_cls_score = slim.conv2d(rpn, self._num_anchors * 2, [1, 1], trainable=is_training,
                                    weights_initializer=initializer,
                                    padding='VALID', activation_fn=None, scope='rpn_cls_score')  ### branch 1: scores that say whether each anchor is background or foreground
    
        '''
        What puzzled me at first: rpn_cls_score has shape (1, height, width, 18), i.e. only objectness scores for
        the 9 anchors per position, not per-class scores, so how can NMS be done later?

        In fact, at this stage NMS only operates on the background / non-background (objectness) score;
        see the small numpy sketch after this function.
        '''
    
    
    
        # change it so that the score has 2 as its channel size
        rpn_cls_score_reshape = self._reshape_layer(rpn_cls_score, 2, 'rpn_cls_score_reshape')
        rpn_cls_prob_reshape = self._softmax_layer(rpn_cls_score_reshape, "rpn_cls_prob_reshape")
        rpn_cls_pred = tf.argmax(tf.reshape(rpn_cls_score_reshape, [-1, 2]), axis=1, name="rpn_cls_pred")
    
    
    
        rpn_cls_prob = self._reshape_layer(rpn_cls_prob_reshape, self._num_anchors * 2, "rpn_cls_prob")
    
    
    
    
        rpn_bbox_pred = slim.conv2d(rpn, self._num_anchors * 4, [1, 1], trainable=is_training,  ### branch 2: regression deltas for the predicted box coordinates
                                    weights_initializer=initializer,
                                    padding='VALID', activation_fn=None, scope='rpn_bbox_pred')
        if is_training:
          rois, roi_scores = self._proposal_layer(rpn_cls_prob, rpn_bbox_pred, "rois")  ### filter the proposals using the predicted objectness scores and box coordinates: take the top N and run NMS. NMS here only keeps boxes with high non-background scores and drops boxes that are most likely background.
          rpn_labels = self._anchor_target_layer(rpn_cls_score, "anchor")
          # Try to have a deterministic order for the computing graph, for reproducibility
          with tf.control_dependencies([rpn_labels]):
            rois, _ = self._proposal_target_layer(rois, roi_scores, "rpn_rois")
        else:
          if cfg.TEST.MODE == 'nms':
            rois, _ = self._proposal_layer(rpn_cls_prob, rpn_bbox_pred, "rois")
          elif cfg.TEST.MODE == 'top':
            rois, _ = self._proposal_top_layer(rpn_cls_prob, rpn_bbox_pred, "rois")
          else:
            raise NotImplementedError
    
        self._predictions["rpn_cls_score"] = rpn_cls_score
        self._predictions["rpn_cls_score_reshape"] = rpn_cls_score_reshape
        self._predictions["rpn_cls_prob"] = rpn_cls_prob
        self._predictions["rpn_cls_pred"] = rpn_cls_pred
        self._predictions["rpn_bbox_pred"] = rpn_bbox_pred
        self._predictions["rois"] = rois
    
        return rois
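    '''
    A minimal numpy sketch (my addition, not the repo's code) of what the reshape + softmax above achieves:
    the (1, H, W, 2A) score map is regrouped so that each anchor's two scores (background, foreground) sit on a
    length-2 axis, softmax runs over that axis, and the result is put back into the original layout. For the
    sketch I assume each anchor's score pair is adjacent in the channel axis; the real _reshape_layer does a
    transpose-and-reshape dance to get the equivalent grouping for its own channel layout.
    '''
    import numpy as np

    H, W, A = 38, 50, 9                                # feature-map size and anchors per position
    rpn_cls_score = np.random.randn(1, H, W, 2 * A)    # raw scores, 2 per anchor

    scores = rpn_cls_score.reshape(-1, 2)              # (H * W * A, 2): one (bg, fg) pair per anchor
    exp = np.exp(scores - scores.max(axis=1, keepdims=True))
    probs = exp / exp.sum(axis=1, keepdims=True)       # 2-way softmax = objectness probability

    rpn_cls_prob = probs.reshape(1, H, W, 2 * A)       # back to the original layout
    rpn_cls_pred = probs.argmax(axis=1)                # 0 = background, 1 = foreground, per anchor
    print(rpn_cls_prob.shape, rpn_cls_pred.shape)      # (1, 38, 50, 18) (17100,)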
    
    def _crop_pool_layer(self, bottom, rois, name):  #### bottom is the backbone's conv output, i.e. the feature map; feat_stride is the stride factor used to recover the original image width/height; rois holds the coordinates of the selected proposals (256 during training), given in input-image coordinates.
        '''
        Net effect: each roi, given in original input_image coordinates, is mapped onto the feature map by the
        stride ratio, and the corresponding sub-region of the feature map is cropped out.
        Similar in spirit to the receptive field.
        '''
        with tf.variable_scope(name) as scope:
          batch_ids = tf.squeeze(tf.slice(rois, [0, 0], [-1, 1], name="batch_id"), [1])
          # Get the normalized coordinates of bounding boxes
          bottom_shape = tf.shape(bottom)
          height = (tf.to_float(bottom_shape[1]) - 1.) * np.float32(self._feat_stride[0])
          width = (tf.to_float(bottom_shape[2]) - 1.) * np.float32(self._feat_stride[0])
    
          '''
          Because rois are coordinates on the original image, the normalized (relative) coordinates are obtained
          by dividing by the image height or width.
          '''
          x1 = tf.slice(rois, [0, 1], [-1, 1], name="x1") / width
          y1 = tf.slice(rois, [0, 2], [-1, 1], name="y1") / height
          x2 = tf.slice(rois, [0, 3], [-1, 1], name="x2") / width
          y2 = tf.slice(rois, [0, 4], [-1, 1], name="y2") / height  ### normalized (relative) positions
          # Won't be back-propagated to rois anyway, but to save time
    
          '''
          crop_and_resize needs the relative position of each sub-region within the image, i.e. the 4 coordinates
          as fractions of the image size.
          '''
          bboxes = tf.stop_gradient(tf.concat([y1, x1, y2, x2], axis=1))
          '''
          stop_gradient means no gradient is propagated back through bboxes.
          '''
    
    
          pre_pool_size = cfg.POOLING_SIZE * 2
          crops = tf.image.crop_and_resize(bottom, bboxes, tf.to_int32(batch_ids), [pre_pool_size, pre_pool_size], name="crops")  ## TensorFlow's built-in crop_and_resize acts like ROI pooling
          '''
          Because pre_pool_size = cfg.POOLING_SIZE * 2, the max pool below shrinks the crops back down to POOLING_SIZE.
          '''
        return slim.max_pool2d(crops, [2, 2], padding='SAME')
    
    import tensorflow as tf
    # help(tf.image.crop_and_resize)
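    '''
    A self-contained sketch (my addition) of how tf.image.crop_and_resize is used above: boxes are given as
    [y1, x1, y2, x2], normalized to [0, 1] relative to the input tensor, and each box also names which image in
    the batch it belongs to. Written against the TF1-style API used in these notes.
    '''
    import numpy as np

    feature_map = tf.constant(np.random.rand(1, 38, 50, 512), dtype=tf.float32)   # (N, H, W, C)
    boxes = tf.constant([[0.1, 0.2, 0.5, 0.6]], dtype=tf.float32)                 # [y1, x1, y2, x2] in [0, 1]
    box_ind = tf.constant([0], dtype=tf.int32)                                    # batch index of each box
    crops = tf.image.crop_and_resize(feature_map, boxes, box_ind, crop_size=[14, 14])
    pool5_demo = tf.nn.max_pool(crops, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
    print(crops.shape, pool5_demo.shape)   # (1, 14, 14, 512) (1, 7, 7, 512)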
    
    
    
    
    def _region_classification(self, fc7, is_training, initializer, initializer_bbox):
        cls_score = slim.fully_connected(fc7, self._num_classes,
                                           weights_initializer=initializer,
                                           trainable=is_training,
                                           activation_fn=None, scope='cls_score')
        cls_prob = self._softmax_layer(cls_score, "cls_prob")
        cls_pred = tf.argmax(cls_score, axis=1, name="cls_pred")
        bbox_pred = slim.fully_connected(fc7, self._num_classes * 4,
                                         weights_initializer=initializer_bbox,
                                         trainable=is_training,
                                         activation_fn=None, scope='bbox_pred')
    
        self._predictions["cls_score"] = cls_score
        self._predictions["cls_pred"] = cls_pred
        self._predictions["cls_prob"] = cls_prob
        self._predictions["bbox_pred"] = bbox_pred
    
        return cls_prob, bbox_pred
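    '''
    Note that bbox_pred has 4 regression values per class for every roi. A hedged numpy sketch (my addition, not
    the repo's post-processing code) of how the 4 deltas of the highest-scoring class could be picked out at
    test time:
    '''
    import numpy as np

    num_classes = 21                                    # VOC: 20 classes + background
    R = 5                                               # number of rois
    cls_prob = np.random.rand(R, num_classes)           # stand-in for the softmax output above
    bbox_pred = np.random.randn(R, num_classes * 4)     # 4 deltas (dx, dy, dw, dh) per class

    cls_pred = cls_prob.argmax(axis=1)                  # predicted class per roi
    deltas = bbox_pred.reshape(R, num_classes, 4)[np.arange(R), cls_pred]   # (R, 4) deltas of the chosen class
    print(deltas.shape)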
  • Original post: https://www.cnblogs.com/zhangbo2008/p/10395680.html