  • CTPN项目部分代码学习

        上次拜读了 CTPN 论文,趁热打铁,今天从网上找来 CTPN 的 TensorFlow 实现代码学习一下。这里放出大佬的 GitHub 项目地址:https://github.com/eragonruan/text-detection-ctpn

    博客里的代码都经过实际运行验证,这里只是梳理一下代码的实现过程,以提高自己的代码能力,争取早日能自己写出代码 !!!》o《!!!


    import pprint
    import sys
    import os.path
    sys.path.append(os.getcwd())#os.getcwd 用于返回当前工作目录 sys.path.append()用于将前面得到的工作目录添加到搜索路径中
    this_dir = os.path.dirname(__file__)#os.path.dirname 获取当前运行脚本的绝对路径。
    from lib.fast_rcnn.train import get_training_roidb, train_net
    from lib.fast_rcnn.config import cfg_from_file, get_output_dir, get_log_dir
    from lib.datasets.factory import get_imdb
    from lib.networks.factory import get_network
    from lib.fast_rcnn.config import cfg
    if __name__ == '__main__':
        # Read the settings file (per the original note it holds the training
        # parameters) and show the resulting configuration.
        cfg_from_file('ctpn/text.yml')
        print('Using config:')
        pprint.pprint(cfg)

        # Dataset and the roidb derived from it.
        imdb = get_imdb('voc_2007_trainval')
        print('Loaded dataset `{:s}` for training'.format(imdb.name))
        roidb = get_training_roidb(imdb)

        # Output and log locations, reported for the user's benefit.
        output_dir = get_output_dir(imdb, None)
        log_dir = get_log_dir(imdb)
        print('Output will be saved to `{:s}`'.format(output_dir))
        print('Logs will be saved to `{:s}`'.format(log_dir))

        # Build the training network, then convert the config values that
        # control the run before handing everything to the trainer.
        network = get_network('VGGnet_train')
        step_limit = int(cfg.TRAIN.max_steps)
        resume = bool(int(cfg.TRAIN.restore))
        train_net(network, imdb, roidb, max_iters=step_limit, restore=resume)


    # Excerpt repeated from the training script above: build the network by
    # name, then start training with the iteration limit and restore flag
    # converted from cfg.TRAIN.
    network = get_network('VGGnet_train')  # look up the network by name
    train_net(network, imdb, roidb,
                  max_iters=int(cfg.TRAIN.max_steps),restore=bool(int(cfg.TRAIN.restore)))  # train with the image dataset and its roidb


    def get_network(name):
        """Get a network by name.

        Args:
            name: Identifier of the form ``'VGGnet_<mode>'``, where ``<mode>``
                is ``'train'`` or ``'test'``. Only the first two
                underscore-separated fields are inspected.

        Returns:
            A new ``VGGnet_train`` or ``VGGnet_test`` instance.

        Raises:
            KeyError: If ``name`` does not identify a known network, including
                names that contain no underscore.
        """
        fields = name.split('_')
        if len(fields) >= 2 and fields[0] == 'VGGnet':
            if fields[1] == 'test':
                return VGGnet_test()
            if fields[1] == 'train':
                return VGGnet_train()
        # Every unrecognised name ends up here, so callers never receive None.
        raise KeyError('Unknown network: {}'.format(name))


    class VGGnet_train(Network):  # training-time VGGnet network definition
        def __init__(self, trainable=True):
            # Creates the input placeholders and registers them in self.layers
            # under the names that feed() resolves.
            self.inputs = []
            self.data = tf.placeholder(tf.float32, shape=[None, None, None, 3], name='data')  # input image placeholder: 3 channels, other dimensions left unspecified
            self.im_info = tf.placeholder(tf.float32, shape=[None, 3], name='im_info')  # three values per row
            self.gt_boxes = tf.placeholder(tf.float32, shape=[None, 5], name='gt_boxes')  # ground-truth boxes; per the original note the 5 columns include one label
            self.gt_ishard = tf.placeholder(tf.int32, shape=[None], name='gt_ishard')
            self.dontcare_areas = tf.placeholder(tf.float32, shape=[None, 4], name='dontcare_areas')  # placeholder for don't-care areas
            self.keep_prob = tf.placeholder(tf.float32)
            self.layers = dict({'data':self.data, 'im_info':self.im_info, 'gt_boxes':self.gt_boxes,
                                'gt_ishard': self.gt_ishard, 'dontcare_areas': self.dontcare_areas})
            self.trainable = trainable
        def setup(self):
            # Declares the layer graph by chaining layer operations.
            # n_classes = 21
            n_classes = cfg.NCLASSES  # number of classes, read from the config
            # anchor_scales = [8, 16, 32]
            anchor_scales = cfg.ANCHOR_SCALES  # anchor scales, read from the config
            _feat_stride = [16, ]  # stride of 16
            # NOTE(review): the line that opens the chain below (presumably
            # `(self.feed('data')`) is missing from this copy — confirm.
                 .conv(3, 3, 64, 1, 1, name='conv1_1')
                 .conv(3, 3, 64, 1, 1, name='conv1_2')
                 .max_pool(2, 2, 2, 2, padding='VALID', name='pool1')
                 .conv(3, 3, 128, 1, 1, name='conv2_1')
                 .conv(3, 3, 128, 1, 1, name='conv2_2')
                 .max_pool(2, 2, 2, 2, padding='VALID', name='pool2')
                 .conv(3, 3, 256, 1, 1, name='conv3_1')
                 .conv(3, 3, 256, 1, 1, name='conv3_2')
                 .conv(3, 3, 256, 1, 1, name='conv3_3')
                 .max_pool(2, 2, 2, 2, padding='VALID', name='pool3')
                 .conv(3, 3, 512, 1, 1, name='conv4_1')
                 .conv(3, 3, 512, 1, 1, name='conv4_2')
                 .conv(3, 3, 512, 1, 1, name='conv4_3')
                 .max_pool(2, 2, 2, 2, padding='VALID', name='pool4')
                 .conv(3, 3, 512, 1, 1, name='conv5_1')
                 .conv(3, 3, 512, 1, 1, name='conv5_2')
                 .conv(3, 3, 512, 1, 1, name='conv5_3'))
            #========= RPN ============
            # NOTE(review): 'lstm_o' is fed below but no visible line creates a
            # layer with that name; the lines producing it appear to be missing.
            (self.feed('lstm_o').lstm_fc(512,len(anchor_scales) * 10 * 4, name='rpn_bbox_pred'))
            (self.feed('lstm_o').lstm_fc(512,len(anchor_scales) * 10 * 2,name='rpn_cls_score'))
            # generating training labels on the fly
            # output: rpn_labels(HxWxA, 2) rpn_bbox_targets(HxWxA, 4) rpn_bbox_inside_weights rpn_bbox_outside_weights
            # Label every anchor and compute the regression targets (also in delta form), plus the inside and outside weights.
            (self.feed('rpn_cls_score', 'gt_boxes', 'gt_ishard', 'dontcare_areas', 'im_info')
                 .anchor_target_layer(_feat_stride, anchor_scales, name = 'rpn-data' ))
            # shape is (1, H, W, Ax2) -> (1, H, WxA, 2)
            # Original note: apply softmax to the scores obtained earlier to get values between 0 and 1.
            # NOTE(review): the `self.feed(...)` opener for the call below is missing,
            # and the excerpt ends here — the rest of setup() is not shown.
                 .spatial_reshape_layer(2, name = 'rpn_cls_score_reshape')


    # -*- coding:utf-8 -*-
    import numpy as np
    import tensorflow as tf
    from ..fast_rcnn.config import cfg
    from ..rpn_msr.proposal_layer_tf import proposal_layer as proposal_layer_py
    from ..rpn_msr.anchor_target_layer_tf import anchor_target_layer as anchor_target_layer_py
    DEFAULT_PADDING = 'SAME'#定义padding 为"SAME"
    def layer(op):
        """Decorator that turns a network method into a chainable layer op.

        The wrapped method receives the network's current inputs as its first
        argument after ``self``: the single input itself when exactly one is
        pending, otherwise a list of all pending inputs. Its result is stored in
        ``self.layers`` under the layer name and becomes the input of the next
        layer.

        Args:
            op: Method with signature ``op(self, layer_input, *args, **kwargs)``.

        Returns:
            A wrapper that returns ``self`` so calls can be chained.

        Raises:
            RuntimeError: From the wrapper, when no inputs are pending.
        """
        def layer_decorated(self, *args, **kwargs):
            # Automatically set a name if not provided.
            name = kwargs.setdefault('name', self.get_unique_name(op.__name__))
            # Figure out the layer inputs.
            if len(self.inputs)==0:
                raise RuntimeError('No input variables found for layer %s.'%name)
            elif len(self.inputs)==1:
                layer_input = self.inputs[0]
            else:
                layer_input = list(self.inputs)
            # Perform the operation and get the output.
            layer_output = op(self, layer_input, *args, **kwargs)
            # Add to layer LUT.
            self.layers[name] = layer_output
            # This output is now the input for the next layer.
            self.feed(layer_output)
            # Return self for chained calls.
            return self
        return layer_decorated
    class Network(object):#这里定义了一个网络的类,内部含有所有搭建网络所需操作函数的定义
        def __init__(self, inputs, trainable=True):
            self.inputs = []
            self.layers = dict(inputs)#网络层为一个字典类型
            self.trainable = trainable#是否可以训练
        def setup(self):
            raise NotImplementedError('Must be subclassed.')#预留一个方法不实现,在其子类中进行实现。
        def load(self, data_path, session, ignore_missing=False):
            """Assign pre-trained values from a ``.npy`` file to existing variables.

            The file must contain a pickled mapping of the form
            ``{scope_name: {variable_name: value}}``. Each value is assigned to
            the variable of that name inside the variable scope of that name.

            Args:
                data_path: Path of the ``.npy`` file.
                session: Session used to run the assign ops.
                ignore_missing: When True, a ``ValueError`` raised while looking
                    up or assigning a variable is reported and skipped instead
                    of re-raised.

            Raises:
                ValueError: Re-raised from the lookup or assignment when
                    ``ignore_missing`` is False.
            """
            # SECURITY: the mapping is stored as a pickled object array, so
            # loading it unpickles data. Only load files from a trusted source.
            data_dict = np.load(data_path, encoding='latin1', allow_pickle=True).item()
            for key in data_dict:
                with tf.variable_scope(key, reuse=True):
                    for subkey in data_dict[key]:
                        try:
                            var = tf.get_variable(subkey)
                            session.run(var.assign(data_dict[key][subkey]))
                            print("assign pretrain model "+subkey+ " to "+key)
                        except ValueError:
                            print("ignore "+key)
                            if not ignore_missing:
                                raise
        def feed(self, *args):#添加网络层,搭建网络
            assert len(args)!=0
            self.inputs = []
            for layer in args:
                if isinstance(layer, str):
                        layer = self.layers[layer]#输入网络层
                    except KeyError:
                        raise KeyError('Unknown layer name fed: %s'%layer)
            return self
        def get_output(self, layer):
                layer = self.layers[layer]
            except KeyError:
                raise KeyError('Unknown layer name fed: %s'%layer)
            return layer
        def get_unique_name(self, prefix):
            id = sum(t.startswith(prefix) for t,_ in list(self.layers.items()))+1
            return '%s_%d'%(prefix, id)
        def make_var(self, name, shape, initializer=None, trainable=True, regularizer=None):
            """Fetch or create a variable through ``tf.get_variable``.

            All arguments are forwarded unchanged. Per the original note,
            ``tf.get_variable`` returns the variable if it already exists and
            creates it otherwise.
            """
            options = dict(initializer=initializer,
                           trainable=trainable,
                           regularizer=regularizer)
            return tf.get_variable(name, shape, **options)
        def validate_padding(self, padding):
            assert padding in ('SAME', 'VALID')
        #也就是说 `@A def f():` 写在同一行是非法的,装饰器必须单独占一行,放在被修饰的函数定义之前(Python 2.6 及以后的版本也允许用装饰器修饰类)。
        def Bilstm(self, input, d_i, d_h, d_o, name, trainable=True):
            """Run a bidirectional LSTM along the width axis, then project to d_o channels.

            Args:
                input: 4-D tensor; its dynamic shape is unpacked as (N, H, W, C).
                d_i: Static size assigned to the last dimension before the RNN.
                d_h: Number of units in each LSTM cell.
                d_o: Number of output channels after the linear projection.
                name: Variable scope name.
                trainable: Forwarded to make_var for the projection variables.

            Returns:
                Tensor reshaped to (N, H, W, d_o).
            """
            img = input
            with tf.variable_scope(name) as scope:
                shape = tf.shape(img)
                N, H, W, C = shape[0], shape[1], shape[2], shape[3]  # batch, height, width, channels
                img = tf.reshape(img, [N * H, W, C])
                img.set_shape([None, None, d_i])  # set the static shape: last dimension is d_i
                lstm_fw_cell = tf.contrib.rnn.LSTMCell(d_h, state_is_tuple=True)  # d_h is the number of units
                lstm_bw_cell = tf.contrib.rnn.LSTMCell(d_h, state_is_tuple=True)  # original note: with state_is_tuple=True the state is a (c_state, m_state) tuple
                lstm_out, last_state = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell,lstm_bw_cell, img, dtype=tf.float32)  # the bidirectional RNN mentioned in the paper
                lstm_out = tf.concat(lstm_out, axis=-1)  # join forward and backward outputs along the last axis
                lstm_out = tf.reshape(lstm_out, [N * H * W, 2*d_h])  # flatten to one row per position
                init_weights = tf.truncated_normal_initializer(stddev=0.1)
                init_biases = tf.constant_initializer(0.0)
                # NOTE(review): the call below is cut off after `trainable,`; the
                # remaining argument(s) and closing parenthesis are missing from this copy.
                weights = self.make_var('weights', [2*d_h, d_o], init_weights, trainable, 
                biases = self.make_var('biases', [d_o], init_biases, trainable)
                outputs = tf.matmul(lstm_out, weights) + biases
                outputs = tf.reshape(outputs, [N, H, W, d_o])
                return outputs
        def lstm(self, input, d_i,d_h,d_o, name, trainable=True):
            """Run a single-direction LSTM along the width axis, then project to d_o channels.

            Mirrors Bilstm but with one LSTM cell, so the RNN output has d_h
            features per position instead of 2*d_h. ``d_i`` is not used in the
            visible lines.
            """
            img = input
            with tf.variable_scope(name) as scope:
                shape = tf.shape(img)
                N,H,W,C = shape[0], shape[1],shape[2], shape[3]
                img = tf.reshape(img,[N*H,W,C])  # each image row becomes one sequence of length W
                lstm_cell = tf.contrib.rnn.LSTMCell(d_h, state_is_tuple=True)
                initial_state = lstm_cell.zero_state(N*H, dtype=tf.float32)
                # NOTE(review): the call below is cut off after `img,`; its remaining
                # arguments and closing parenthesis are missing from this copy.
                lstm_out, last_state = tf.nn.dynamic_rnn(lstm_cell, img,
                lstm_out = tf.reshape(lstm_out,[N*H*W,d_h])
                init_weights = tf.truncated_normal_initializer(stddev=0.1)
                init_biases = tf.constant_initializer(0.0)
                # NOTE(review): truncated call, same as above.
                weights = self.make_var('weights', [d_h, d_o], init_weights, trainable, 
                biases = self.make_var('biases', [d_o], init_biases, trainable)
                outputs = tf.matmul(lstm_out, weights) + biases
                outputs = tf.reshape(outputs, [N,H,W,d_o])
                return outputs
        def lstm_fc(self, input, d_i, d_o, name, trainable=True):  # fully connected layer applied after the LSTM
            """Apply a per-position linear layer mapping d_i input channels to d_o.

            NOTE(review): VGGnet_train.setup calls this as
            ``self.feed(...).lstm_fc(512, ..., name=...)`` without passing
            ``input``, which suggests a ``@layer`` decorator line is missing
            above this def — confirm.

            Returns:
                Tensor reshaped to (N, H, W, d_o).
            """
            with tf.variable_scope(name) as scope:
                shape = tf.shape(input)
                N, H, W, C = shape[0], shape[1], shape[2], shape[3]
                input = tf.reshape(input, [N*H*W,C])  # one row per spatial position
                init_weights = tf.truncated_normal_initializer(0.0, stddev=0.01)
                init_biases = tf.constant_initializer(0.0)
                # NOTE(review): the call below is cut off after `trainable,`.
                kernel = self.make_var('weights', [d_i, d_o], init_weights, trainable,
                biases = self.make_var('biases', [d_o], init_biases, trainable)
                _O = tf.matmul(input, kernel) + biases
                return tf.reshape(_O, [N, H, W, int(d_o)])
        def conv(self, input, k_h, k_w, c_o, s_h, s_w, name, biased=True,relu=True, padding=DEFAULT_PADDING, trainable=True):
        # Arguments: self, input, kernel height, kernel width, number of outputs, stride height, stride width, name, ...
            """ contribution by miraclebiu, and biased option"""
            # 2-D convolution with optional bias and ReLU.
            # NOTE(review): VGGnet_train.setup calls this as `.conv(3, 3, 64, 1, 1, name=...)`
            # without `input`, which suggests a `@layer` decorator line is missing above — confirm.
            c_i = input.get_shape()[-1]  # size of the input's last dimension
            convolve = lambda i, k: tf.nn.conv2d(i, k, [1, s_h, s_w, 1], padding=padding)  # the convolution itself
            with tf.variable_scope(name) as scope:
                init_weights = tf.truncated_normal_initializer(0.0, stddev=0.01)  # weight initializer
                init_biases = tf.constant_initializer(0.0)  # biases start at 0
                # NOTE(review): the call below is cut off after `trainable,`.
                kernel = self.make_var('weights', [k_h, k_w, c_i, c_o], init_weights, trainable, 
                if biased:
                    biases = self.make_var('biases', [c_o], init_biases, trainable)
                    conv = convolve(input, kernel)
                    if relu:  # bias followed by ReLU
                        bias = tf.nn.bias_add(conv, biases)
                        return tf.nn.relu(bias, name=scope.name)
                    return tf.nn.bias_add(conv, biases, name=scope.name)
                    # NOTE(review): as written, the lines below are unreachable and
                    # biased=False returns None; an `else:` at the `if biased:` level
                    # appears to be missing here — confirm.
                    conv = convolve(input, kernel)
                    if relu:
                        return tf.nn.relu(conv, name=scope.name)
                    return conv
        def relu(self, input, name):
            """Apply ``tf.nn.relu`` to ``input``, naming the op ``name``."""
            activated = tf.nn.relu(input, name=name)
            return activated
        def max_pool(self, input, k_h, k_w, s_h, s_w, name, padding=DEFAULT_PADDING):  # max-pooling layer
            # Pools with a k_h x k_w window and strides s_h x s_w.
            # NOTE(review): VGGnet_train.setup calls this without `input`, which
            # suggests a missing `@layer` decorator line above — confirm.
            # NOTE(review): the call below is cut off after `strides=...,`; the
            # remaining arguments and closing parenthesis are missing from this copy.
            return tf.nn.max_pool(input,
                                  ksize=[1, k_h, k_w, 1],
                                  strides=[1, s_h, s_w, 1],
        def avg_pool(self, input, k_h, k_w, s_h, s_w, name, padding=DEFAULT_PADDING):  # average-pooling layer
            # Pools with a k_h x k_w window and strides s_h x s_w.
            # NOTE(review): the call below is cut off after `strides=...,`; the
            # remaining arguments and closing parenthesis are missing from this copy.
            return tf.nn.avg_pool(input,
                                  ksize=[1, k_h, k_w, 1],
                                  strides=[1, s_h, s_w, 1],
        def proposal_layer(self, input, _feat_stride, anchor_scales, cfg_key, name):
            # Wraps the Python proposal_layer_py in tf.py_func and registers its two
            # outputs in self.layers as 'rpn_rois' and 'rpn_targets'.
            # `input` is a list; only input[0], input[1] and input[2] are used.
            if isinstance(input[0], tuple):
                input[0] = input[0][0]  # keep only the first element of a tuple input
                # input[0] shape is (1, H, W, Ax2)
                # rpn_rois <- (1 x H x W x A, 5) [0, x1, y1, x2, y2]
            with tf.variable_scope(name) as scope:
                # NOTE(review): the call below is cut off; the output-type argument
                # and closing parenthesis of tf.py_func are missing from this copy.
                blob,bbox_delta = tf.py_func(proposal_layer_py,[input[0],input[1],input[2], cfg_key, _feat_stride, anchor_scales],
                # NOTE(review): the trailing shape comment says 2 columns but the reshape uses 5 — confirm.
                rpn_rois = tf.convert_to_tensor(tf.reshape(blob,[-1, 5]), name = 'rpn_rois') # shape is (1 x H x W x A, 2)
                rpn_targets = tf.convert_to_tensor(bbox_delta, name = 'rpn_targets') # shape is (1 x H x W x A, 4)
                self.layers['rpn_rois'] = rpn_rois
                self.layers['rpn_targets'] = rpn_targets
                return rpn_rois, rpn_targets
        def anchor_target_layer(self, input, _feat_stride, anchor_scales, name):  # label every anchor and compute its ground-truth targets
            # Returns four tensors: rpn_labels (cast to int32), rpn_bbox_targets,
            # rpn_bbox_inside_weights and rpn_bbox_outside_weights.
            # NOTE(review): VGGnet_train.setup calls this without `input`, which
            # suggests a missing `@layer` decorator line above — confirm.
            if isinstance(input[0], tuple):
                input[0] = input[0][0]  # keep only the first element of a tuple input
            with tf.variable_scope(name) as scope:
                # 'rpn_cls_score', 'gt_boxes', 'gt_ishard', 'dontcare_areas', 'im_info' arrive as input[0]..input[4]
                # NOTE(review): the statement below is broken in this copy: the call that
                # should follow `=` (presumably tf.py_func on anchor_target_layer_py) and
                # its closing arguments are missing — confirm against the project source.
                rpn_labels,rpn_bbox_targets,rpn_bbox_inside_weights,rpn_bbox_outside_weights = 
                               [input[0],input[1],input[2],input[3],input[4], _feat_stride, anchor_scales],
                rpn_labels = tf.convert_to_tensor(tf.cast(rpn_labels,tf.int32), name = 'rpn_labels') # shape is (1 x H x W x A, 2)
                rpn_bbox_targets = tf.convert_to_tensor(rpn_bbox_targets, name = 'rpn_bbox_targets') # shape is (1 x H x W x A, 4)
                rpn_bbox_inside_weights = tf.convert_to_tensor(rpn_bbox_inside_weights , name = 'rpn_bbox_inside_weights') # shape is (1 x H x W x A, 4)
                rpn_bbox_outside_weights = tf.convert_to_tensor(rpn_bbox_outside_weights , name = 'rpn_bbox_outside_weights') # shape is (1 x H x W x A, 4)
                return rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights
        def reshape_layer(self, input, d, name):
            # Rearranges `input` with a transpose / reshape / transpose sequence; the
            # layer named 'rpn_cls_prob_reshape' gets its own target shape.
            # NOTE(review): both return statements below are cut off after
            # `input_shape[0],`, and the `else:` separating them is missing from
            # this copy, so the target shapes cannot be read from here.
            input_shape = tf.shape(input)
            if name == 'rpn_cls_prob_reshape':
                # transpose: (1, AxH, W, 2) -> (1, 2, AxH, W)
                # reshape: (1, 2xA, H, W)
                # transpose: -> (1, H, W, 2xA)
                 return tf.transpose(tf.reshape(tf.transpose(input,[0,3,1,2]),
                                                [   input_shape[0],
                 return tf.transpose(tf.reshape(tf.transpose(input,[0,3,1,2]),
                                            [   input_shape[0],
        def spatial_reshape_layer(self, input, d, name):
            # Reshapes `input` so that its last dimension becomes `d` (see the shape
            # comment below).
            # NOTE(review): VGGnet_train.setup calls this without `input`, which
            # suggests a missing `@layer` decorator line above — confirm.
            input_shape = tf.shape(input)
            # transpose: (1, H, W, A x d) -> (1, H, WxA, d)
            # NOTE(review): the call below is cut off after `input,`; the target
            # shape and closing parenthesis are missing from this copy.
            return tf.reshape(input,
        def lrn(self, input, radius, alpha, beta, name, bias=1.0):
            # Local response normalization via tf.nn.local_response_normalization.
            # NOTE(review): the call below is cut off after `input,`; how radius,
            # alpha, beta, bias and name are passed is not visible in this copy.
            return tf.nn.local_response_normalization(input,
        def concat(self, inputs, axis, name):
            """Concatenate a list of tensors along one axis.

            Args:
                inputs: Sequence of tensors to join.
                axis: Dimension along which to concatenate.
                name: Name for the resulting op.

            Returns:
                The concatenated tensor.
            """
            # tf.concat takes `axis` (as already used in Bilstm); the old
            # `concat_dim` keyword is rejected by TensorFlow 1.x.
            return tf.concat(values=inputs, axis=axis, name=name)
        def fc(self, input, num_out, name, relu=True, trainable=True):
            """Fully connected layer with optional ReLU.

            A 4-D input is transposed to channels-first and flattened to
            (-1, dim) before the matrix multiply.
            """
            with tf.variable_scope(name) as scope:
                # only use the first input
                if isinstance(input, tuple):
                    input = input[0]
                input_shape = input.get_shape()
                if input_shape.ndims == 4:
                    dim = 1
                    for d in input_shape[1:].as_list():
                        dim *= d  # product of all non-batch dimensions
                    feed_in = tf.reshape(tf.transpose(input,[0,3,1,2]), [-1, dim])
                    # NOTE(review): as written the line below overrides the 4-D branch;
                    # an `else:` before it appears to be missing from this copy — confirm.
                    feed_in, dim = (input, int(input_shape[-1]))
                if name == 'bbox_pred':
                    init_weights = tf.truncated_normal_initializer(0.0, stddev=0.001)
                    init_biases = tf.constant_initializer(0.0)
                    # NOTE(review): as written the two lines below override the
                    # 'bbox_pred' initializers; an `else:` appears to be missing — confirm.
                    init_weights = tf.truncated_normal_initializer(0.0, stddev=0.01)
                    init_biases = tf.constant_initializer(0.0)
                # NOTE(review): the call below is cut off after `trainable,`.
                weights = self.make_var('weights', [dim, num_out], init_weights, trainable, 
                biases = self.make_var('biases', [num_out], init_biases, trainable)
                op = tf.nn.relu_layer if relu else tf.nn.xw_plus_b  # with or without ReLU
                fc = op(feed_in, weights, biases, name=scope.name)
                return fc
        def softmax(self, input, name):
            """Apply softmax over the last axis of ``input``.

            For the layer named ``'rpn_cls_prob'`` the tensor is flattened to
            rows of ``input_shape[3]`` entries, normalised, and reshaped back to
            ``[-1, input_shape[1], input_shape[2], input_shape[3]]``. Any other
            layer name applies ``tf.nn.softmax`` directly.

            Args:
                input: Tensor to normalise.
                name: Layer name; also selects the branch described above.

            Returns:
                The softmax output tensor.
            """
            input_shape = tf.shape(input)
            if name == 'rpn_cls_prob':
                return tf.reshape(tf.nn.softmax(tf.reshape(input,[-1,input_shape[3]])),[-1,input_shape[1],input_shape[2],input_shape[3]],name=name)
            else:
                return tf.nn.softmax(input,name=name)
        def spatial_softmax(self, input, name):
            """Softmax over the last axis, preserving dimensions 1 to 3 of ``input``.

            The tensor is flattened to rows of ``dims[3]`` entries, normalised,
            and reshaped back to ``[-1, dims[1], dims[2], dims[3]]``.
            """
            dims = tf.shape(input)
            # d = input.get_shape()[-1]
            flat = tf.reshape(input, [-1, dims[3]])
            probs = tf.nn.softmax(flat)
            restored_shape = [-1, dims[1], dims[2], dims[3]]
            return tf.reshape(probs, restored_shape, name=name)
        def add(self,input,name):
            """contribution by miraclebiu

            Element-wise sum of the first two entries of ``input``; ``name`` is
            accepted but not used.
            """
            first, second = input[0], input[1]
            return tf.add(first, second)
        def batch_normalization(self,input,name,relu=True,is_training=False):
            """contribution by miraclebiu

            Apply ``tf.contrib.layers.batch_norm`` (with scale and center
            enabled) under the scope ``name``, optionally followed by ReLU.

            Args:
                input: Tensor to normalise.
                name: Scope passed to ``batch_norm``.
                relu: When True, apply ``tf.nn.relu`` to the normalised tensor.
                is_training: Forwarded to ``batch_norm``.

            Returns:
                The normalised tensor, ReLU-activated if requested.
            """
            normalized = tf.contrib.layers.batch_norm(input,scale=True,center=True,is_training=is_training,scope=name)
            if relu:
                return tf.nn.relu(normalized)
            return normalized
        def dropout(self, input, keep_prob, name):
            """Apply ``tf.nn.dropout`` to ``input`` with the given ``keep_prob``."""
            dropped = tf.nn.dropout(input, keep_prob, name=name)
            return dropped
        def l2_regularizer(self, weight_decay=0.0005, scope=None):
            # Returns a function mapping a tensor to weight_decay * tf.nn.l2_loss(tensor).
            def regularizer(tensor):
                with tf.name_scope(scope, default_name='l2_regularizer', values=[tensor]):
                    # NOTE(review): the call below is cut off after `weight_decay,`; its
                    # remaining arguments and closing parenthesis are missing from this copy.
                    l2_weight = tf.convert_to_tensor(weight_decay,
                    #return tf.mul(l2_weight, tf.nn.l2_loss(tensor), name='value')
                    return tf.multiply(l2_weight, tf.nn.l2_loss(tensor), name='value')
            return regularizer
        def smooth_l1_dist(self, deltas, sigma2=9.0, name='smooth_l1_dist'):
            """Element-wise smooth-L1 distance of ``deltas``.

            For ``|d| < 1 / sigma2`` the value is ``0.5 * sigma2 * d**2``;
            otherwise it is ``|d| - 0.5 / sigma2``.

            Args:
                deltas: Tensor of differences.
                sigma2: Controls where the quadratic region ends (at 1 / sigma2).
                name: Name scope for the ops.

            Returns:
                Tensor with the same shape as ``deltas``.
            """
            with tf.name_scope(name=name) as scope:
                deltas_abs = tf.abs(deltas)
                # 1.0 inside the quadratic region, 0.0 outside it.
                smoothL1_sign = tf.cast(tf.less(deltas_abs, 1.0/sigma2), tf.float32)
                # Parenthesised so the two-line sum is a single expression.
                return (tf.square(deltas) * 0.5 * sigma2 * smoothL1_sign +
                        (deltas_abs - 0.5 / sigma2) * tf.abs(smoothL1_sign - 1))
        def build_loss(self, ohem=False):
            """Build the RPN classification and box-regression losses.

            Anchors labelled -1 are dropped from both terms. ``ohem`` is
            accepted but not used.

            Returns:
                Tuple ``(total_loss, model_loss, rpn_cross_entropy, rpn_loss_box)``
                where ``model_loss`` is the sum of the two RPN terms and
                ``total_loss`` adds the collected regularization losses.
            """
            # ---- classification term ----
            scores = tf.reshape(self.get_output('rpn_cls_score_reshape'), [-1, 2])  # shape (HxWxA, 2)
            rpn_data = self.get_output('rpn-data')
            labels = tf.reshape(rpn_data[0], [-1])  # shape (HxWxA)
            # Label 1 marks foreground; label -1 marks anchors to ignore.
            fg_keep = tf.equal(labels, 1)
            rpn_keep = tf.where(tf.not_equal(labels, -1))
            scores = tf.gather(scores, rpn_keep)
            labels = tf.gather(labels, rpn_keep)
            rpn_cross_entropy_n = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=scores)

            # ---- box regression term ----
            def kept_rows(tensor):
                # View as rows of 4 values and keep only the non-ignored anchors.
                return tf.gather(tf.reshape(tensor, [-1, 4]), rpn_keep)

            raw_pred = self.get_output('rpn_bbox_pred')  # shape (1, H, W, Ax4)
            bbox_pred = kept_rows(raw_pred)
            bbox_targets = kept_rows(rpn_data[1])
            inside_weights = kept_rows(rpn_data[2])
            outside_weights = kept_rows(rpn_data[3])

            weighted_delta = inside_weights * (bbox_pred - bbox_targets)
            distance = self.smooth_l1_dist(weighted_delta)
            rpn_loss_box_n = tf.reduce_sum(outside_weights * distance, reduction_indices=[1])

            # Normalise the box term by (number of foreground anchors + 1).
            box_sum = tf.reduce_sum(rpn_loss_box_n)
            fg_count = tf.reduce_sum(tf.cast(fg_keep, tf.float32)) + 1
            rpn_loss_box = box_sum / fg_count
            rpn_cross_entropy = tf.reduce_mean(rpn_cross_entropy_n)

            model_loss = rpn_cross_entropy + rpn_loss_box
            # tf.get_collection returns the list stored under the given collection name.
            reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            total_loss = tf.add_n(reg_losses) + model_loss
            return total_loss, model_loss, rpn_cross_entropy, rpn_loss_box


    # -*- coding:utf-8 -*-
    import numpy as np
    import numpy.random as npr
    from .generate_anchors import generate_anchors
    from ..utils.bbox import bbox_overlaps, bbox_intersections
    from ..fast_rcnn.config import cfg
    from ..fast_rcnn.bbox_transform import bbox_transform
    DEBUG = False
    def anchor_target_layer(rpn_cls_score, gt_boxes, gt_ishard, dontcare_areas, im_info, _feat_stride = [16,], anchor_scales = [16,]):
        # Labels every anchor against the ground-truth boxes and builds the RPN
        # regression targets and weights.
        # NOTE(review): the description lines below are not enclosed in docstring
        # quotes in this copy; the opening/closing `"""` lines appear to be lost.
        # NOTE(review): several other lines in this function are also missing
        # (see the notes further down) — compare with the project source.
        Assign anchors to ground-truth targets. Produces anchor classification
        labels and bounding-box regression targets.
        rpn_cls_score: (1, H, W, Ax2) bg/fg scores of previous conv layer
        gt_boxes: (G, 5) vstack of [x1, y1, x2, y2, class]
        gt_ishard: (G, 1), 1 or 0 indicates difficult or not
        dontcare_areas: (D, 4), some areas may contains small objs but no labelling. D may be 0
        im_info: a list of [image_height, image_width, scale_ratios]
        _feat_stride: the downsampling ratio of feature map to the original input image
        anchor_scales: the scales to the basic_anchor (basic anchor is [16, 16])
        rpn_labels : (HxWxA, 1), for each anchor, 0 denotes bg, 1 fg, -1 dontcare
        rpn_bbox_targets: (HxWxA, 4), distances of the anchors to the gt_boxes(may contains some transform)
                                that are the regression objectives
        rpn_bbox_inside_weights: (HxWxA, 4) weights of each boxes, mainly accepts hyper param in cfg
        rpn_bbox_outside_weights: (HxWxA, 4) used to balance the fg/bg,
                                beacuse the numbers of bgs and fgs mays significiantly different
        _anchors = generate_anchors(scales=np.array(anchor_scales))  # base anchors (the original note says 9 — TODO confirm for the scales used)
        _num_anchors = _anchors.shape[0]  # number of base anchors
        if DEBUG:
            print('anchor shapes:')
            # NOTE(review): the two lines below are arguments of a call whose opening
            # and closing lines are missing from this copy.
                _anchors[:, 2::4] - _anchors[:, 0::4],
                _anchors[:, 3::4] - _anchors[:, 1::4],
            _counts = cfg.EPS
            _sums = np.zeros((1, 4))
            _squared_sums = np.zeros((1, 4))
            _fg_sum = 0
            _bg_sum = 0
            _count = 0
        # allow boxes to sit over the edge by a small amount
        _allowed_border =  0
        # map of shape (..., H, W)
        #height, width = rpn_cls_score.shape[1:3]
        im_info = im_info[0]  # first row; used below as [image height, image width, scale]
        # Algorithm:
        # for each (H, W) location i
        #   generate 9 anchor boxes centered on cell i
        #   apply predicted bbox deltas at cell i to each of the 9 anchors
        # filter out-of-image anchors
        # measure GT overlap
        # NOTE(review): the assert below spans two lines without a continuation
        # character in this copy.
        assert rpn_cls_score.shape[0] == 1, 
            'Only single item batches are supported'
        # map of shape (..., H, W)
        height, width = rpn_cls_score.shape[1:3]  # feature-map height and width
        if DEBUG:
            print('AnchorTargetLayer: height', height, 'width', width)
            print('im_size: ({}, {})'.format(im_info[0], im_info[1]))
            print('scale: {}'.format(im_info[2]))
            print('height,  ({}, {})'.format(height, width))
            print('rpn: gt_boxes.shape', gt_boxes.shape)
            print('rpn: gt_boxes', gt_boxes)
        # 1. Generate proposals from bbox deltas and shifted anchors
        shift_x = np.arange(0, width) * _feat_stride
        shift_y = np.arange(0, height) * _feat_stride
        shift_x, shift_y = np.meshgrid(shift_x, shift_y) # in W H order
        # K is H x W
        shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),  # ravel flattens each grid to 1-D
                            shift_x.ravel(), shift_y.ravel())).transpose()  # one (x, y, x, y) offset row per feature-map cell
        # add A anchors (1, A, 4) to
        # cell K shifts (K, 1, 4) to get
        # shift anchors (K, A, 4)
        # reshape to (K*A, 4) shifted anchors
        A = _num_anchors  # anchors per cell
        K = shifts.shape[0]  # number of feature-map cells (H * W)
        all_anchors = (_anchors.reshape((1, A, 4)) +
                       shifts.reshape((1, K, 4)).transpose((1, 0, 2)))  # broadcast add: every base anchor at every cell offset
        all_anchors = all_anchors.reshape((K * A, 4))
        total_anchors = int(K * A)
        # only keep anchors inside the image
        # NOTE(review): the np.where call below is not closed in this copy; the
        # closing line (presumably selecting the index array) is missing.
        inds_inside = np.where(
            (all_anchors[:, 0] >= -_allowed_border) &
            (all_anchors[:, 1] >= -_allowed_border) &
            (all_anchors[:, 2] < im_info[1] + _allowed_border) &  # width
            (all_anchors[:, 3] < im_info[0] + _allowed_border)    # height
        if DEBUG:
            print('total_anchors', total_anchors)
            print('inds_inside', len(inds_inside))
        # keep only inside anchors
        anchors = all_anchors[inds_inside, :]  # anchors that lie inside the image
        if DEBUG:
            print('anchors.shape', anchors.shape)
        # label: 1 is positive, 0 is negative, -1 is dont care
        # (A)
        # NOTE(review): np.empty leaves `labels` uninitialised and no fill is visible
        # before the assignments below; an initialising line may be missing — confirm.
        labels = np.empty((len(inds_inside), ), dtype=np.float32)
        # overlaps between the anchors and the gt boxes
        # overlaps (ex, gt), shape is A x G
        # NOTE(review): the `np.float` alias used in this function has been removed
        # from recent NumPy releases — confirm the NumPy version in use.
        overlaps = bbox_overlaps(
            np.ascontiguousarray(anchors, dtype=np.float),  # np.ascontiguousarray returns an array that is contiguous in memory
            np.ascontiguousarray(gt_boxes, dtype=np.float))  # original note: with x anchors and y gt boxes the result has shape (x, y)
        # overlaps holds the overlap between every anchor and every gt box
        argmax_overlaps = overlaps.argmax(axis=1) # (A): for each anchor, index of the gt box with the highest overlap
        max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps]
        gt_argmax_overlaps = overlaps.argmax(axis=0) # G: for each gt box, index of the anchor with the highest overlap
        # NOTE(review): the indexing expression below is cut off after the comma.
        gt_max_overlaps = overlaps[gt_argmax_overlaps,
        gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]
        # NOTE(review): the indented lines below belong to a conditional whose `if`
        # line is missing from this copy.
            # assign bg labels first so that positive labels can clobber them
            labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0  # label background first: overlap below the negative threshold
        # fg label: for each gt, anchor with highest overlap
        labels[gt_argmax_overlaps] = 1  # anchors with the highest overlap for some gt box become foreground
        # fg label: above threshold IOU
        labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1  # overlap at or above the positive threshold is foreground
        # NOTE(review): as above, the `if` line guarding the indented lines below is missing.
            # assign bg labels last so that negative labels can clobber positives
            labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0
        # preclude dontcare areas
        if dontcare_areas is not None and dontcare_areas.shape[0] > 0:  # author's note: dontcare areas are not considered for now
            # intersec shape is D x A
            # NOTE(review): the call below is not closed in this copy.
            intersecs = bbox_intersections(
                np.ascontiguousarray(dontcare_areas, dtype=np.float), # D x 4
                np.ascontiguousarray(anchors, dtype=np.float) # A x 4
            intersecs_ = intersecs.sum(axis=0) # A x 1
            labels[intersecs_ > cfg.TRAIN.DONTCARE_AREA_INTERSECTION_HI] = -1
        # preclude hard samples that are highly occlusioned, truncated or difficult to see
        if cfg.TRAIN.PRECLUDE_HARD_SAMPLES and gt_ishard is not None and gt_ishard.shape[0] > 0:
            assert gt_ishard.shape[0] == gt_boxes.shape[0]
            gt_ishard = gt_ishard.astype(int)
            gt_hardboxes = gt_boxes[gt_ishard == 1, :]
            if gt_hardboxes.shape[0] > 0:
                # H x A
                hard_overlaps = bbox_overlaps(
                    np.ascontiguousarray(gt_hardboxes, dtype=np.float), # H x 4
                    np.ascontiguousarray(anchors, dtype=np.float)) # A x 4
                hard_max_overlaps = hard_overlaps.max(axis=0)  # (A)
                labels[hard_max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = -1
                max_intersec_label_inds = hard_overlaps.argmax(axis=1) # H x 1
                labels[max_intersec_label_inds] = -1 #
        # subsample positive labels if we have too many
        # Cap the number of positives at num_fg (the original note says 128).
        # TODO: may need revisiting later; with character fragments the number of positives is large.
        num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE)
        fg_inds = np.where(labels == 1)[0]
        if len(fg_inds) > num_fg:
            disable_inds = npr.choice(  # random selection without replacement
                fg_inds, size=(len(fg_inds) - num_fg), replace=False)  # randomly pick the surplus positives
            labels[disable_inds] = -1  # and mark them as dontcare
        # subsample negative labels if we have too many
        # Original note: the batch holds 256 samples with at most 128 positives;
        # when there are fewer positives, negatives make up the difference.
        num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1)
        bg_inds = np.where(labels == 0)[0]
        if len(bg_inds) > num_bg:
            disable_inds = npr.choice(
                bg_inds, size=(len(bg_inds) - num_bg), replace=False)
            labels[disable_inds] = -1
            #print "was %s inds, disabling %s, now %s inds" % (
                #len(bg_inds), len(disable_inds), np.sum(labels == 0))
        # Labels are done; now compute the RPN box targets.
        bbox_targets = np.zeros((len(inds_inside), 4), dtype=np.float32)
        bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :])  # targets computed from each anchor and its best-matching gt box
        bbox_inside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
        bbox_inside_weights[labels == 1, :] = np.array(cfg.TRAIN.RPN_BBOX_INSIDE_WEIGHTS)  # inside weights: config value for foreground, 0 elsewhere
        bbox_outside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
        if cfg.TRAIN.RPN_POSITIVE_WEIGHT < 0:  # uniform weights: positives 1, negatives 0
            # uniform weighting of examples (given non-uniform sampling)
            num_examples = np.sum(labels >= 0) + 1  # not used by the active lines below
            # positive_weights = np.ones((1, 4)) * 1.0 / num_examples
            # negative_weights = np.ones((1, 4)) * 1.0 / num_examples
            positive_weights = np.ones((1, 4))
            negative_weights = np.zeros((1, 4))
            # NOTE(review): the deeper-indented lines below belong to a branch whose
            # header (presumably `else:`) is missing from this copy.
            assert ((cfg.TRAIN.RPN_POSITIVE_WEIGHT > 0) &
                    (cfg.TRAIN.RPN_POSITIVE_WEIGHT < 1))
            # NOTE(review): as written, `+ 1` is added after the division rather than
            # to the denominator in both expressions below — confirm this is intended.
            positive_weights = (cfg.TRAIN.RPN_POSITIVE_WEIGHT /
                                (np.sum(labels == 1)) + 1)
            negative_weights = ((1.0 - cfg.TRAIN.RPN_POSITIVE_WEIGHT) /
                                (np.sum(labels == 0)) + 1)
        bbox_outside_weights[labels == 1, :] = positive_weights  # outside weights for foreground anchors
        bbox_outside_weights[labels == 0, :] = negative_weights
        if DEBUG:
            _sums += bbox_targets[labels == 1, :].sum(axis=0)
            _squared_sums += (bbox_targets[labels == 1, :] ** 2).sum(axis=0)
            _counts += np.sum(labels == 1)
            means = _sums / _counts
            stds = np.sqrt(_squared_sums / _counts - means ** 2)
        # map up to original set of anchors
        # Anchors outside the image were dropped earlier; add them back with fill values.
        labels = _unmap(labels, total_anchors, inds_inside, fill=-1)  # restored anchors get label -1 (dontcare)
        bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0)  # restored anchors get zero targets
        bbox_inside_weights = _unmap(bbox_inside_weights, total_anchors, inds_inside, fill=0)  # inside weights filled with 0
        bbox_outside_weights = _unmap(bbox_outside_weights, total_anchors, inds_inside, fill=0)  # outside weights filled with 0
        if DEBUG:
            print('rpn: max max_overlap', np.max(max_overlaps))
            print('rpn: num_positive', np.sum(labels == 1))
            print('rpn: num_negative', np.sum(labels == 0))
            _fg_sum += np.sum(labels == 1)
            _bg_sum += np.sum(labels == 0)
            _count += 1
            print('rpn: num_positive avg', _fg_sum / _count)
            print('rpn: num_negative avg', _bg_sum / _count)
        # labels
        labels = labels.reshape((1, height, width, A))  # reshape the labels to the feature-map layout
        rpn_labels = labels
        # bbox_targets
        # NOTE(review): the three two-line reshapes below have lost their line
        # continuation characters in this copy.
        bbox_targets = bbox_targets 
            .reshape((1, height, width, A * 4))  # reshape to the feature-map layout
        rpn_bbox_targets = bbox_targets
        # bbox_inside_weights
        bbox_inside_weights = bbox_inside_weights 
            .reshape((1, height, width, A * 4))
        rpn_bbox_inside_weights = bbox_inside_weights
        # bbox_outside_weights
        bbox_outside_weights = bbox_outside_weights 
            .reshape((1, height, width, A * 4))
        rpn_bbox_outside_weights = bbox_outside_weights
        return rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights
    def _unmap(data, count, inds, fill=0):
        """Scatter a subset of items back into an array covering the full set.

        Args:
            data: array holding one entry (or one row) per index in ``inds``.
            count: number of entries in the full, un-filtered set.
            inds: positions in the full set that the entries of ``data`` belong to.
            fill: value given to every position that is not listed in ``inds``.

        Returns:
            A float32 array whose first dimension is ``count`` and whose
            remaining dimensions equal ``data.shape[1:]``.
        """
        # np.full initialises every slot, so positions outside ``inds`` hold
        # ``fill`` rather than uninitialised memory. The same expression covers
        # both 1-D data (shape[1:] is empty) and row-wise data.
        ret = np.full((count,) + data.shape[1:], fill, dtype=np.float32)
        ret[inds] = data
        return ret
    def _compute_targets(ex_rois, gt_rois):
        """Return float32 box-regression targets for each (anchor, ground-truth) pair.

        ``ex_rois`` must have 4 columns and ``gt_rois`` 5 columns, with one
        ground-truth row per anchor row; only the first four ground-truth
        columns are handed to ``bbox_transform``.
        """
        # One ground-truth row is expected for every example box.
        assert gt_rois.shape[0] == ex_rois.shape[0]
        assert ex_rois.shape[1] == 4
        assert gt_rois.shape[1] == 5
        gt_coords = gt_rois[:, :4]
        deltas = bbox_transform(ex_rois, gt_coords)
        # copy=False avoids a copy when the result is already float32.
        return deltas.astype(np.float32, copy=False)


    import numpy as np
    def generate_basic_anchors(sizes, base_size=16):
        """Build one int32 anchor row per ``(height, width)`` pair in ``sizes``.

        Every anchor is obtained by rescaling the square reference box
        ``[0, 0, base_size - 1, base_size - 1]`` with ``scale_anchor``.
        The result has shape ``(len(sizes), 4)``.
        """
        reference_box = np.array([0, 0, base_size - 1, base_size - 1], np.int32)
        result = np.zeros((len(sizes), 4), np.int32)
        for row, (height, width) in enumerate(sizes):
            result[row] = scale_anchor(reference_box, height, width)
        return result
    def scale_anchor(anchor, h, w):
        """Return a copy of ``anchor`` resized to height ``h`` and width ``w``.

        The box keeps the centre of the input ``[xmin, ymin, xmax, ymax]``;
        the input itself is left untouched. When ``anchor`` has an integer
        dtype the fractional coordinates are truncated on assignment.
        """
        center_x = 0.5 * (anchor[0] + anchor[2])
        center_y = 0.5 * (anchor[1] + anchor[3])
        half_width = w / 2
        half_height = h / 2
        resized = anchor.copy()
        resized[0] = center_x - half_width   # xmin
        resized[1] = center_y - half_height  # ymin
        resized[2] = center_x + half_width   # xmax
        resized[3] = center_y + half_height  # ymax
        return resized
    def generate_anchors(base_size=16, ratios=[0.5, 1, 2],
                         scales=2**np.arange(3, 6)):
        """Return the anchors for ten fixed heights combined with a fixed width of 16.

        ``base_size``, ``ratios`` and ``scales`` are accepted but not used by
        this implementation: the anchor sizes are hard-coded below and
        ``generate_basic_anchors`` is called with its default base size.
        """
        anchor_heights = (11, 16, 23, 33, 48, 68, 97, 139, 198, 283)
        anchor_widths = (16,)
        # One (height, width) pair per anchor, heights varying slowest.
        sizes = [(height, width)
                 for height in anchor_heights
                 for width in anchor_widths]
        return generate_basic_anchors(sizes)
    if __name__ == '__main__':
        # Manual check: time a single generate_anchors() call, print the
        # elapsed seconds, then drop into an IPython shell to inspect `a`.
        import time
        started = time.time()
        a = generate_anchors()
        elapsed = time.time() - started
        print(elapsed)
        from IPython import embed
        embed()


    # coding: utf-8 
    from __future__ import print_function
    import numpy as np
    import os
    import tensorflow as tf
    from ..roi_data_layer.layer import RoIDataLayer
    from ..utils.timer import Timer
    from ..roi_data_layer import roidb as rdl_roidb
    from ..fast_rcnn.config import cfg
    _DEBUG = False
    class SolverWrapper(object):
        """Builds the optimizer for a network, runs the training loop and writes snapshots."""

        def __init__(self, sess, network, imdb, roidb, output_dir, logdir, pretrained_model=None):
            """Initialize the SolverWrapper.

            Args:
                sess: TensorFlow session; accepted for interface compatibility,
                    not used by the constructor.
                network: network object. ``layers``, ``build_loss``, ``load`` and
                    the input placeholders are used by the other methods.
                imdb: image database; ``num_classes`` is read in ``train_model``.
                roidb: region-of-interest database passed to the data layer.
                output_dir: directory checkpoints are written to and restored from.
                logdir: directory for the summary event files.
                pretrained_model: optional path handed to ``network.load``.
            """
            self.net = network
            self.imdb = imdb
            self.roidb = roidb
            self.output_dir = output_dir
            self.pretrained_model = pretrained_model
            print('Computing bounding-box regression targets...')
            if cfg.TRAIN.BBOX_REG:
                self.bbox_means, self.bbox_stds = rdl_roidb.add_bbox_regression_targets(roidb)
            # For checkpoint
            self.saver = tf.train.Saver(max_to_keep=100, write_version=tf.train.SaverDef.V2)
            self.writer = tf.summary.FileWriter(logdir=logdir,
                                                graph=tf.get_default_graph(),
                                                flush_secs=5)

        def snapshot(self, sess, iter):
            """Save a checkpoint named after iteration ``iter + 1`` in ``self.output_dir``.

            When bbox-regression targets are normalized, the ``bbox_pred``
            weights and biases are scaled and shifted back before saving and
            put back to their training values afterwards.
            """
            net = self.net
            # A single flag guards both the rescale and the later restore, so
            # ``weights``/``orig_0`` are only touched when they were defined.
            unnormalize = (cfg.TRAIN.BBOX_REG and 'bbox_pred' in net.layers
                           and cfg.TRAIN.BBOX_NORMALIZE_TARGETS)
            if unnormalize:
                # save original values
                with tf.variable_scope('bbox_pred', reuse=True):
                    weights = tf.get_variable("weights")
                    biases = tf.get_variable("biases")
                orig_0, orig_1 = sess.run([weights, biases])
                # scale and shift with bbox reg unnormalization; then save snapshot
                weights_shape = weights.get_shape().as_list()
                sess.run(weights.assign(orig_0 * np.tile(self.bbox_stds, (weights_shape[0], 1))))
                sess.run(biases.assign(orig_1 * self.bbox_stds + self.bbox_means))
            if not os.path.exists(self.output_dir):
                os.makedirs(self.output_dir)
            infix = ('_' + cfg.TRAIN.SNAPSHOT_INFIX
                     if cfg.TRAIN.SNAPSHOT_INFIX != '' else '')
            filename = (cfg.TRAIN.SNAPSHOT_PREFIX + infix +
                        '_iter_{:d}'.format(iter + 1) + '.ckpt')
            filename = os.path.join(self.output_dir, filename)
            self.saver.save(sess, filename)
            print('Wrote snapshot to: {:s}'.format(filename))
            if unnormalize:
                # restore net to original state
                sess.run(weights.assign(orig_0))
                sess.run(biases.assign(orig_1))

        def build_image_summary(self):
            """Create a small sub-graph for writing a single image summary.

            Returns:
                ``(log_image, log_image_data, log_image_name)``: the summary op,
                the uint8 image placeholder and the string name placeholder.
            """
            # A simple graph for write image summary
            log_image_data = tf.placeholder(tf.uint8, [None, None, 3])
            log_image_name = tf.placeholder(tf.string)
            # import tensorflow.python.ops.gen_logging_ops as logging_ops
            from tensorflow.python.ops import gen_logging_ops
            from tensorflow.python.framework import ops as _ops
            log_image = gen_logging_ops._image_summary(log_image_name, tf.expand_dims(log_image_data, 0), max_images=1)
            _ops.add_to_collection(_ops.GraphKeys.SUMMARIES, log_image)
            # log_image = tf.summary.image(log_image_name, tf.expand_dims(log_image_data, 0), max_outputs=1)
            return log_image, log_image_data, log_image_name

        def train_model(self, sess, max_iters, restore=False):
            """Network training loop.

            Args:
                sess: TensorFlow session used for every graph execution.
                max_iters: iteration index at which training stops (exclusive).
                restore: when true, resume from the latest checkpoint found in
                    ``self.output_dir`` instead of loading the pretrained model.

            Raises:
                Exception: if the pretrained model or the checkpoint cannot be loaded.
            """
            data_layer = get_data_layer(self.roidb, self.imdb.num_classes)
            total_loss, model_loss, rpn_cross_entropy, rpn_loss_box = \
                self.net.build_loss(ohem=cfg.TRAIN.OHEM)
            # scalar summary
            tf.summary.scalar('rpn_reg_loss', rpn_loss_box)
            tf.summary.scalar('rpn_cls_loss', rpn_cross_entropy)
            tf.summary.scalar('model_loss', model_loss)
            summary_op = tf.summary.merge_all()
            # Built after merge_all(), so the image summary is not part of
            # summary_op; the returned handles are not used in this method.
            log_image, log_image_data, log_image_name = self.build_image_summary()
            # optimizer
            # NOTE: Adam and RMS receive the constant cfg.TRAIN.LEARNING_RATE,
            # so the step decay applied to ``lr`` in the loop only affects the
            # momentum solver.
            lr = tf.Variable(cfg.TRAIN.LEARNING_RATE, trainable=False)
            if cfg.TRAIN.SOLVER == 'Adam':
                opt = tf.train.AdamOptimizer(cfg.TRAIN.LEARNING_RATE)
            elif cfg.TRAIN.SOLVER == 'RMS':
                opt = tf.train.RMSPropOptimizer(cfg.TRAIN.LEARNING_RATE)
            else:
                # lr = tf.Variable(0.0, trainable=False)
                momentum = cfg.TRAIN.MOMENTUM
                opt = tf.train.MomentumOptimizer(lr, momentum)
            global_step = tf.Variable(0, trainable=False)
            with_clip = True
            if with_clip:
                # tf.trainable_variables() returns the list of variables to train.
                tvars = tf.trainable_variables()
                # Gradient clipping limits the size of a single update, which
                # guards against exploding gradients and a diverging loss.
                grads, norm = tf.clip_by_global_norm(tf.gradients(total_loss, tvars), 10.0)
                train_op = opt.apply_gradients(list(zip(grads, tvars)), global_step=global_step)
            else:
                train_op = opt.minimize(total_loss, global_step=global_step)
            # initialize variables
            sess.run(tf.global_variables_initializer())
            restore_iter = 0
            # load vgg16
            if self.pretrained_model is not None and not restore:
                try:
                    print(('Loading pretrained model '
                           'weights from {:s}').format(self.pretrained_model))
                    self.net.load(self.pretrained_model, sess, True)
                except Exception:
                    raise Exception('Check your pretrained model {:s}'.format(self.pretrained_model))
            # resuming a trainer
            if restore:
                ckpt = tf.train.get_checkpoint_state(self.output_dir)
                if ckpt is None or not ckpt.model_checkpoint_path:
                    # Report the missing checkpoint directly instead of failing
                    # on an attribute access while formatting the error below.
                    raise Exception('Check your pretrained {:s}'.format(self.output_dir))
                try:
                    print('Restoring from {}...'.format(ckpt.model_checkpoint_path), end=' ')
                    self.saver.restore(sess, ckpt.model_checkpoint_path)
                    stem = os.path.splitext(os.path.basename(ckpt.model_checkpoint_path))[0]
                    # Snapshots are named with ``iter + 1``, so the parsed number
                    # is the next iteration to run.
                    restore_iter = int(stem.split('_')[-1])
                    # global_step is created after the saver, so it is set by
                    # hand to keep summary steps continuous across restarts.
                    sess.run(global_step.assign(restore_iter))
                    print('done')
                except Exception:
                    raise Exception('Check your pretrained {:s}'.format(ckpt.model_checkpoint_path))
            last_snapshot_iter = -1
            last_iter = None  # stays None when the loop body never runs
            timer = Timer()
            for step in range(restore_iter, max_iters):
                timer.tic()
                # learning rate
                if step != 0 and step % cfg.TRAIN.STEPSIZE == 0:
                    sess.run(tf.assign(lr, sess.run(lr) * cfg.TRAIN.GAMMA))
                # get one batch
                blobs = data_layer.forward()
                feed_dict = {
                    self.net.data: blobs['data'],
                    self.net.im_info: blobs['im_info'],
                    self.net.keep_prob: 0.5,
                    self.net.gt_boxes: blobs['gt_boxes'],
                    self.net.gt_ishard: blobs['gt_ishard'],
                    self.net.dontcare_areas: blobs['dontcare_areas'],
                }
                # Order must match the unpacking below.
                fetch_list = [total_loss, model_loss, rpn_cross_entropy, rpn_loss_box,
                              summary_op, train_op]
                (total_loss_val, model_loss_val, rpn_loss_cls_val, rpn_loss_box_val,
                 summary_str, _) = sess.run(fetches=fetch_list, feed_dict=feed_dict)
                self.writer.add_summary(summary=summary_str, global_step=sess.run(global_step))
                _diff_time = timer.toc(average=False)
                if step % cfg.TRAIN.DISPLAY == 0:
                    print('iter: %d / %d, total loss: %.4f, model loss: %.4f, rpn_loss_cls: %.4f, rpn_loss_box: %.4f, lr: %f' %
                          (step, max_iters, total_loss_val, model_loss_val, rpn_loss_cls_val, rpn_loss_box_val, sess.run(lr)))
                    print('speed: {:.3f}s / iter'.format(_diff_time))
                if (step + 1) % cfg.TRAIN.SNAPSHOT_ITERS == 0:
                    last_snapshot_iter = step
                    self.snapshot(sess, step)
                last_iter = step
            # Write a final snapshot unless the last iteration already did, and
            # skip it entirely when no iteration ran.
            if last_iter is not None and last_snapshot_iter != last_iter:
                self.snapshot(sess, last_iter)
    def get_training_roidb(imdb):
        """Returns a roidb (Region of Interest database) for use in training.

        Args:
            imdb: image database; its ``roidb`` attribute is returned after
                optional flipping and preparation.
        """
        if cfg.TRAIN.USE_FLIPPED:  # data augmentation
            print('Appending horizontally-flipped training examples...')
            # NOTE(review): the call that performs the flip is missing from
            # this listing; reconstructed as the imdb's own method - confirm.
            imdb.append_flipped_images()
        print('Preparing training data...')
        # NOTE(review): the body of the ``if cfg.TRAIN.HAS_RPN:`` branch is
        # missing from this listing, leaving a syntax error. The roidb helper
        # this module already imports is used for every configuration - confirm.
        rdl_roidb.prepare_roidb(imdb)
        return imdb.roidb
    def get_data_layer(roidb, num_classes):
        """Return the data layer that serves training batches.

        Args:
            roidb: region-of-interest database to draw samples from.
            num_classes: number of classes, forwarded to the layer.

        Raises:
            Exception: when both ``cfg.TRAIN.HAS_RPN`` and ``cfg.IS_MULTISCALE``
                are set; that combination is not supported here.
        """
        if cfg.TRAIN.HAS_RPN and cfg.IS_MULTISCALE:
            # obsolete
            # layer = GtDataLayer(roidb)
            raise Exception("Calling caffe modules...")
        # Every supported configuration uses the same layer, so ``layer`` can
        # no longer be left unbound when HAS_RPN is off.
        layer = RoIDataLayer(roidb, num_classes)
        return layer
    def train_net(network, imdb, roidb, output_dir, log_dir, pretrained_model=None, max_iters=40000, restore=False):
        """Train ``network`` inside a fresh TensorFlow session.

        A ``SolverWrapper`` is created with the given databases and
        directories, and its training loop is run for ``max_iters`` iterations
        (resuming from a checkpoint when ``restore`` is true).
        """
        # A custom tf.ConfigProto (soft placement, BFC allocator, 75% GPU
        # memory fraction) used to be passed here; the session now runs with
        # the default configuration.
        with tf.Session() as session:
            solver = SolverWrapper(
                session, network, imdb, roidb, output_dir, log_dir,
                pretrained_model=pretrained_model,
            )
            solver.train_model(session, max_iters, restore)
            print('done solving')



