  • Mask R-CNN Code Walkthrough (8): Complete Data Preparation Code

    Most readers will find the structure of the Mask R-CNN backbone (ResNet-101 or ResNet-50) easy to understand, and the corresponding source code fairly approachable. What is harder is the preparation of the raw training data and the processing that follows the RPN; this post tackles the former. The main questions it answers are:

      1. Given a raw image of arbitrary size, how is it resized to the model's training size, and how is the corresponding mask image processed?

      2. How are the true gt_boxes and their class labels gt_class_ids derived from the masks?

      3. How are rpn_match and rpn_bbox obtained?

      4. To make the training data more general, how is it processed to obtain rpn_rois and the related targets?

    The data pipeline relies on several standard techniques, such as NMS, IoU, anchors, and data augmentation. Rather than explaining each of them separately, I have annotated the code heavily; please read the comments below. The goal is to help you understand how Mask R-CNN prepares its data and to give you a technical starting point for rebuilding your own Mask R-CNN.
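
    As a warm-up, here is a minimal, self-contained sketch of the IoU computation that the pipeline relies on throughout. This helper is illustrative only and is not part of the class below; boxes use the same [y1, x1, y2, x2] convention as the rest of the code:

    import numpy as np

    def iou(box_a, box_b):
        # intersection rectangle corners
        y1 = max(box_a[0], box_b[0])
        x1 = max(box_a[1], box_b[1])
        y2 = min(box_a[2], box_b[2])
        x2 = min(box_a[3], box_b[3])
        inter = max(y2 - y1, 0) * max(x2 - x1, 0)  # clamp at 0 for disjoint boxes
        area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
        area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
        return inter / (area_a + area_b - inter)

    print(iou([0, 0, 10, 10], [5, 5, 15, 15]))  # 25 / 175 ≈ 0.143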



    # The full code follows (it runs as-is):
    """
    Mask R-CNN
    Base Configurations class and data preparation class.
    Written by tang jun
    """


    import math
    import os
    import random

    import cv2 as cv
    import numpy as np
    import scipy.ndimage
    import skimage.transform
    import yaml
    from imgaug import augmenters as iaa
    from PIL import Image

    ############################################################
    #  Configuration
    ############################################################

    class Config(object):
        # List of foreground class names to register
        classes_name_list = ["line_bulge", "dot_concave", "dot_bulge", "Irregular_concave"]
        backbone = "resnet101"
        batch_size = 1
        image_channel = 3
        num_classes = 1 + 4  # background class BG + 4 foreground classes
        back_strides = [4, 8, 16, 32, 64]  # feature strides of the FPN levels
        anchor_rations = [0.5, 1, 2]  # three anchor aspect ratios per location
        anchor_scales = (32, 64, 128, 256, 512)  # anchor height/width scale for each pyramid level
        fc_layers = 1024
        rpn_pyramid = 256
        proposal_count = 6000
        # Image preprocessing parameters
        rpn_anchor_stride = 1
        # Stride used when sliding over [p2, p3, p4, p5, p6] to extract the shared
        # RPN features; it is the stride of the KL.Conv2D. A value of 1 keeps the
        # feature-map size unchanged.
        rpn_data_train_anchor_per_img = 256  # used when building data; sets the number of rpn_bbox rows
        target_data_per_img = 100  # number of detection-target ROIs produced per image
        target_data_fg_rate = 0.33  # desired foreground fraction among those ROIs

        step_per_epoch = 1000
        validation_steps = 50
        # COMPUTE_BACKBONE_SHAPE = None
        proposal_nms_threshold = 0.7
        # ROIs kept after non-maximum suppression (training and inference)
        proposal_count_train = 2000
        proposal_count_inference = 1000
        # Input image resizing
        # Generally, use the "square" resizing mode for training and predicting
        # and it should work well in most cases. In this mode, images are scaled
        # up such that the small side is = IMAGE_MIN_DIM, but ensuring that the
        # scaling doesn't make the long side > IMAGE_MAX_DIM. Then the image is
        # padded with zeros to make it a square so multiple images can be put
        # in one batch.
        # Available resizing modes:
        # none:   No resizing or padding. Return the image unchanged.
        # square: Resize and pad with zeros to get a square image
        #         of size [max_dim, max_dim].
        # pad64:  Pads width and height with zeros to make them multiples of 64.
        #         If IMAGE_MIN_DIM or IMAGE_MIN_SCALE are not None, then it scales
        #         up before padding. IMAGE_MAX_DIM is ignored in this mode.
        #         The multiple of 64 is needed to ensure smooth scaling of feature
        #         maps up and down the 6 levels of the FPN pyramid (2**6=64).
        # crop:   Picks random crops from the image. First, scales the image based
        #         on IMAGE_MIN_DIM and IMAGE_MIN_SCALE, then picks a random crop of
        #         size IMAGE_MIN_DIM x IMAGE_MIN_DIM. Can be used in training only.
        #         IMAGE_MAX_DIM is not used in this mode.
        resize_mode = "square"  # resizing mode for the raw image: "square", "pad64" or "none"
        image_min_dim = 128
        image_max_dim = 128
        resize_min_scale = 0  # minimum resize scale to enforce; 0 disables this check
        # Image mean (RGB)
        mean_pixel = np.array([123.7, 116.8, 103.9])
        # Number of ROIs per image to feed to classifier/mask heads
        # The Mask RCNN paper uses 512 but often the RPN doesn't generate
        # enough positive proposals to fill this and keep a positive:negative
        # ratio of 1:3. You can increase the number of proposals by adjusting
        # the RPN NMS threshold.
        # TRAIN_ROIS_PER_IMAGE = 200
        # Percent of positive ROIs used to train classifier/mask heads
        # ROI_POSITIVE_RATIO = 0.33
        # Pooled ROIs
        fpn_pool_size = 7
        fpn_mask_pool_size = 14

        # Shape of output mask
        # To change this you also need to change the neural network mask branch
        mask_shape = [28, 28]
        # Maximum number of ground truth instances to use in one image
        max_gt_data_instance = 40
        # Bounding box refinement standard deviation for RPN and final detections.
        std_dev = np.array([0.1, 0.1, 0.2, 0.2])
        # Max number of final detections
        detection_max_instances = 100
        # Minimum probability value to accept a detected instance
        # ROIs below this threshold are skipped
        detection_min_threshold = 0.7
        # Non-maximum suppression threshold for detection
        detection_nms_threshold = 0.3
        # Learning rate and momentum
        # The Mask RCNN paper uses lr=0.02, but on TensorFlow it causes
        # weights to explode. Likely due to differences in optimizer
        # implementation.
        learn_rate = 0.001
        learn_momentum = 0.9
        # Weight decay regularization
        weight_decay = 0.0001
        # Loss weights for more precise optimization.
        # Can be used for R-CNN training setup.
        loss_weights = {
            "rpn_class_loss": 1.,
            "rpn_bbox_loss": 1.,
            "mrcnn_class_loss": 1.,
            "mrcnn_bbox_loss": 1.,
            "mrcnn_mask_loss": 1.
        }
        # Use RPN ROIs or externally generated ROIs for training
        # Keep this True for most situations. Set to False if you want to train
        # the head branches on ROIs generated by code rather than the ROIs from
        # the RPN. For example, to debug the classifier head without having to
        # train the RPN.
        data_proposal_rpn_rois = True
        # Train or freeze batch normalization layers
        # None: Train BN layers. This is the normal mode
        # False: Freeze BN layers. Good when using a small batch size
        # True: (don't use). Set layer in training mode even when predicting
        train_BN = False  # defaulting to False since the batch size is often small
        # Gradient norm clipping
        clip_gradient = 5.0  # clipping factor that guards against exploding gradients
        image_shape_crop = np.array([image_min_dim, image_max_dim, image_channel])  # [128, 128, 3] with the settings above
        image_shape = np.array([image_max_dim, image_max_dim, image_channel])  # [128, 128, 3] with the settings above
        image_meta_size = 1 + 3 + 3 + 4 + 1 + num_classes  # = 17 for 5 classes

        def display(self):
            """Display configuration values."""
            print(" Configurations:")
            # dir() without arguments lists the names in the current scope; with an
            # argument it lists the attributes and methods of that object.
            for a in dir(self):
                if not a.startswith("__") and not callable(getattr(self, a)):  # getattr() returns an attribute value
                    print("{:30} {}".format(a, getattr(self, a)))
            print(" ")

    ############################################################
    #  Dataset
    ############################################################

    class Dataset(Config):
        """The base class for dataset classes.
        To use it, create a new class that adds functions specific to the dataset
        you want to use. For example:

        class CatsAndDogsDataset(Dataset):
            def load_cats_and_dogs(self):
                ...
            def load_mask(self, image_id):
                ...
            def image_reference(self, image_id):
                ...

        See COCODataset and ShapesDataset as examples.
        """

        def __init__(self, class_map=None):
            self.image_ids = []    # image id sequence
            self.image_info = []   # per-image information such as file paths
            self.class_info = [{"id": 0, "name": "BG"}]  # background is always the first class
            self.class_names = []  # list of class names (BG is the first entry)

        def add_class(self, class_name_list):
            '''
            :param class_name_list: list of class names; entries must not repeat
            '''
            self.class_names.append('BG')  # background class
            for i in range(len(class_name_list)):
                class_id = i + 1
                class_name = class_name_list[i]
                self.class_info.append({"id": class_id, "name": class_name})
                self.class_names.append(class_name)

        def add_image(self, img_floder):
            '''
            :param img_floder: output folder of labelme; it holds one sub-folder per
                sample, each containing three images (the original image, the mask
                image and an unused label-visualisation image) and one yaml file.
            :return: fills self.image_info
            '''
            image_id = 0
            for sorce_path in os.listdir(img_floder):  # walk all sub-folders
                yaml_path = os.path.join(img_floder, sorce_path, 'info.yaml')  # label_names: - _background_ - NG
                mask_path = os.path.join(img_floder, sorce_path, 'label.png')
                img_path = os.path.join(img_floder, sorce_path, 'img.png')
                # np.fromfile reads the raw bytes as np.uint8 and cv.imdecode decodes
                # them into an image; this also works for non-ASCII paths on Windows.
                cv_img = cv.imdecode(np.fromfile(mask_path, dtype=np.uint8), cv.IMREAD_UNCHANGED)
                image_info = {"id": image_id, "path": img_path, "width": cv_img.shape[1],
                              "height": cv_img.shape[0], "mask_path": mask_path, "yaml_path": yaml_path}
                self.image_info.append(image_info)
                image_id += 1
            self.num_images = len(self.image_info)
            self.image_ids = np.arange(self.num_images)  # save the image ids
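
        # The folder layout assumed by add_image (sub-folder names below are
        # illustrative; only the three file names inside each sub-folder matter):
        #
        #   train_json/1021/
        #       sample_0/
        #           img.png      original image
        #           label.png    labelme mask (instance k is painted with value k)
        #           info.yaml    label_names: [_background_, <defect name>, ...]
        #       sample_1/
        #           ...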

        def data_generator(self, shuffle=True, augmentation=False, rois_count=0,
                           batch_size=Config.batch_size, detection_targets=False):
            """A generator that returns images and corresponding target class ids,
            bounding box deltas, and masks.

            shuffle: If True, shuffles the samples before every epoch.
            augmentation: If True, applies random flips plus an imgaug
                (https://github.com/aleju/imgaug) pipeline.
            rois_count: If > 0, generate that many proposals per image to train the
                network classifier and mask heads. Useful when training the Mask
                R-CNN heads without the RPN.
            batch_size: How many images to return in each call.
            detection_targets: If True, generate detection targets (class IDs, bbox
                deltas, and masks). Typically for debugging or visualizations because
                in training detection targets are generated by DetectionTargetLayer.

            Returns a Python generator. Upon calling next() on it, the generator
            returns two lists, inputs and outputs. The contents of the lists differ
            depending on the received arguments:
            inputs list:
            - images: [batch, H, W, C]
            - image_meta: [batch, (meta data)] Image details. See load_image_gt().
            - rpn_match: [batch, N] Integer (1=positive anchor, -1=negative, 0=neutral)
            - rpn_bbox: [batch, N, (dy, dx, log(dh), log(dw))] Anchor bbox deltas.
            - gt_class_ids: [batch, MAX_GT_INSTANCES] Integer class IDs
            - gt_boxes: [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)]
            - gt_masks: [batch, height, width, MAX_GT_INSTANCES]

            outputs list: Usually empty in regular training. But if detection_targets
            is True then the outputs list contains target class_ids, bbox deltas,
            and masks.
            """
            b = 0  # batch item index
            image_index = -1
            image_ids = np.copy(self.image_ids)
            # Feature-map shapes of the FPN levels for the configured image size
            backbone_shapes = np.array([[int(math.ceil(Config.image_shape[0] / stride)),
                                         int(math.ceil(Config.image_shape[1] / stride))]
                                        for stride in Config.back_strides])
            # back_strides = [4, 8, 16, 32, 64], anchor_scales = (32, 64, 128, 256, 512)
            anchors = []
            for i in range(len(Config.anchor_scales)):
                anchor = self.generate_anchors(Config.anchor_scales[i], Config.anchor_rations, backbone_shapes[i],
                                               Config.back_strides[i], Config.rpn_anchor_stride)
                anchors.append(anchor)
            anchors = np.concatenate(anchors, axis=0)

            # Keras requires a generator to run indefinitely.
            while True:
                # Every pass through this loop processes one image.
                try:
                    # Increment index to pick next image. Shuffle at the start of an epoch.
                    image_index = (image_index + 1) % len(image_ids)  # % is the modulo operator
                    if shuffle and image_index == 0:
                        np.random.shuffle(image_ids)
                    # Get GT bounding boxes and masks for the image.
                    image_id = image_ids[image_index]
                    image, image_meta, gt_class_ids, gt_boxes, gt_masks = \
                        self.load_image_gt(image_id, augmentation=augmentation)
                    # gt_boxes are derived from gt_masks; gt_class_ids holds the class
                    # index of every instance in this image, e.g. [2, 3, 2, ...]

                    # Skip images that have no instances. This can happen in cases
                    # where we train on a subset of classes and the image doesn't
                    # have any of the classes we care about.
                    if not np.any(gt_class_ids > 0):
                        print('warning: picture id={} has no real class, please check!'.format(image_id))
                        continue

                    # RPN targets
                    rpn_match, rpn_bbox = self.build_rpn_targets(anchors, gt_class_ids, gt_boxes)
                    # rpn_bbox is determined jointly by the anchors and the gt_boxes:
                    # positive samples get a refinement relative to their anchor;
                    # surplus positives are set to rpn_match=0, negatives to rpn_match=-1.

                    # Mask R-CNN targets. The number of rpn_rois is set by rois_count.
                    if rois_count:
                        rpn_rois = self.generate_random_rois(image.shape, rois_count, gt_boxes)
                        # 90% of rois_count is split evenly across the gt_boxes: each
                        # instance box is enlarged to twice its size and random boxes
                        # are drawn inside it (clipped to the image). The remaining 10%
                        # are random boxes drawn anywhere in the image.
                        if detection_targets:
                            rois, mrcnn_class_ids, mrcnn_bbox, mrcnn_mask = \
                                self.build_detection_targets(rpn_rois, gt_class_ids, gt_boxes, gt_masks)
                            # Outline of build_detection_targets:
                            # 1. keep only instances with gt_class_ids > 0;
                            # 2. build the [rpn_rois, gt_boxes] IoU matrix and take the
                            #    row-wise maximum IoU;
                            # 3. pick the required number of positives (0.33 * N),
                            #    randomly dropping any surplus; negatives are handled the
                            #    same way, and any shortfall is filled by re-sampling
                            #    negatives with replacement;
                            # 4. zero out the gt_boxes and class_ids assigned to
                            #    negative samples;
                            # 5. rois = positive rpn boxes + negative rpn boxes;
                            # 6. compute deltas between positive rois and their gt_boxes,
                            #    and crop/resize the gt_mask of the assigned instance.
                            # Note: mrcnn_class_ids aligns with rois one-to-one
                            # (negatives become class 0, positives keep their class);
                            # mrcnn_bbox comes from the positive roi/gt_box pairs;
                            # mrcnn_mask from the gt_masks of the assigned instance ids.

                    # Init batch arrays
                    if b == 0:  # b accumulates; allocate the batch arrays once per batch
                        batch_image_meta = np.zeros(
                            (batch_size,) + image_meta.shape, dtype=image_meta.dtype)
                        batch_rpn_match = np.zeros(
                            [batch_size, anchors.shape[0], 1], dtype=rpn_match.dtype)
                        batch_rpn_bbox = np.zeros(
                            [batch_size, Config.rpn_data_train_anchor_per_img, 4], dtype=rpn_bbox.dtype)
                        batch_images = np.zeros(
                            (batch_size,) + image.shape, dtype=np.float32)
                        batch_gt_class_ids = np.zeros(
                            (batch_size, Config.max_gt_data_instance), dtype=np.int32)
                        batch_gt_boxes = np.zeros(
                            (batch_size, Config.max_gt_data_instance, 4), dtype=np.int32)
                        batch_gt_masks = np.zeros(
                            (batch_size, gt_masks.shape[0], gt_masks.shape[1],
                             Config.max_gt_data_instance), dtype=gt_masks.dtype)
                        if rois_count:  # rpn_rois exist only when rois_count is set
                            batch_rpn_rois = np.zeros(
                                (batch_size, rpn_rois.shape[0], 4), dtype=rpn_rois.dtype)
                            if detection_targets:  # rois and mrcnn_* exist only when detection_targets is set
                                batch_rois = np.zeros(
                                    (batch_size,) + rois.shape, dtype=rois.dtype)
                                batch_mrcnn_class_ids = np.zeros(
                                    (batch_size,) + mrcnn_class_ids.shape, dtype=mrcnn_class_ids.dtype)
                                batch_mrcnn_bbox = np.zeros(
                                    (batch_size,) + mrcnn_bbox.shape, dtype=mrcnn_bbox.dtype)
                                batch_mrcnn_mask = np.zeros(
                                    (batch_size,) + mrcnn_mask.shape, dtype=mrcnn_mask.dtype)

                    # If more instances than fit in the arrays, sub-sample them.
                    if gt_boxes.shape[0] > Config.max_gt_data_instance:
                        print('warning: real instances exceed Config.max_gt_data_instance; '
                              'the surplus is sub-sampled here, but consider enlarging '
                              'Config.max_gt_data_instance')
                        ids = np.random.choice(np.arange(gt_boxes.shape[0]), Config.max_gt_data_instance, replace=False)
                        gt_class_ids = gt_class_ids[ids]
                        gt_boxes = gt_boxes[ids]
                        gt_masks = gt_masks[:, :, ids]

                    # Add the data computed above to the batch arrays.
                    batch_image_meta[b] = image_meta
                    batch_rpn_match[b] = rpn_match[:, np.newaxis]
                    batch_rpn_bbox[b] = rpn_bbox
                    image_mean = image.astype(np.float32) - Config.mean_pixel
                    batch_images[b] = image_mean
                    batch_gt_class_ids[b, :gt_class_ids.shape[0]] = gt_class_ids
                    batch_gt_boxes[b, :gt_boxes.shape[0]] = gt_boxes
                    batch_gt_masks[b, :, :, :gt_masks.shape[-1]] = gt_masks
                    if rois_count:
                        batch_rpn_rois[b] = rpn_rois
                        if detection_targets:
                            batch_rois[b] = rois
                            batch_mrcnn_class_ids[b] = mrcnn_class_ids
                            batch_mrcnn_bbox[b] = mrcnn_bbox
                            batch_mrcnn_mask[b] = mrcnn_mask
                    b += 1

                    # Batch full?
                    if b >= batch_size:
                        inputs = [batch_images, batch_image_meta, batch_rpn_match, batch_rpn_bbox,
                                  batch_gt_class_ids, batch_gt_boxes, batch_gt_masks]
                        outputs = []
                        if rois_count:
                            inputs.extend([batch_rpn_rois])
                            if detection_targets:
                                inputs.extend([batch_rois])
                                # Keras requires that outputs and targets have the same number of dimensions
                                batch_mrcnn_class_ids = np.expand_dims(batch_mrcnn_class_ids, -1)
                                outputs.extend([batch_mrcnn_class_ids, batch_mrcnn_bbox, batch_mrcnn_mask])
                        yield inputs, outputs
                        b = 0  # start a new batch
                except Exception as err:
                    raise Exception("data_generator function happened error, please check!") from err

        def generate_anchors(self, scales, ratios, shape, feature_stride, anchor_stride):
            """
            scales: 1D array of anchor sizes in pixels. Example: [32, 64, 128]
            ratios: 1D array of anchor ratios of width/height. Example: [0.5, 1, 2]
            shape: [height, width] spatial shape of the feature map over which
                to generate anchors.
            feature_stride: Stride of the feature map relative to the image in pixels.
            anchor_stride: Stride of anchors on the feature map. For example, if the
                value is 2 then generate anchors for every other feature map pixel.
            """
            # Get all combinations of scales and ratios
            scales, ratios = np.meshgrid(np.array(scales), np.array(ratios))
            scales = scales.flatten()  # three values here (the single scale repeated per ratio)
            ratios = ratios.flatten()  # three values here

            # Enumerate heights and widths from scales and ratios
            heights = scales / np.sqrt(ratios)  # three values here
            widths = scales * np.sqrt(ratios)   # three values here

            # Enumerate shifts in feature space
            shifts_y = np.arange(0, shape[0], anchor_stride) * feature_stride
            shifts_x = np.arange(0, shape[1], anchor_stride) * feature_stride
            shifts_x, shifts_y = np.meshgrid(shifts_x, shifts_y)

            # Enumerate combinations of shifts, widths, and heights
            box_widths, box_centers_x = np.meshgrid(widths, shifts_x)
            box_heights, box_centers_y = np.meshgrid(heights, shifts_y)

            # Reshape to get a list of (y, x) and a list of (h, w)
            box_centers = np.stack([box_centers_y, box_centers_x], axis=2).reshape([-1, 2])
            box_sizes = np.stack([box_heights, box_widths], axis=2).reshape([-1, 2])
            # Convert to corner coordinates (y1, x1, y2, x2)
            boxes = np.concatenate([box_centers - 0.5 * box_sizes, box_centers + 0.5 * box_sizes], axis=1)
            return boxes
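
        # Worked example with the settings above: for a 128x128 input the FPN
        # feature maps are 32, 16, 8, 4 and 2 pixels on a side, and each location
        # carries 3 anchors (one per ratio), so the total anchor count is
        #   3 * (32*32 + 16*16 + 8*8 + 4*4 + 2*2) = 3 * 1364 = 4092.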

        def load_image(self, image_id):
            """Load the image whose index is image_id and return an [H, W, 3] numpy array."""
            image = cv.imread(self.image_info[image_id]['path'])
            if image.ndim == 3:  # .ndim gives the number of dimensions; a colour image must have 3 channels
                _, _, chanel = image.shape
                if chanel != 3:
                    raise Exception('channel count of image is wrong, please check!')
            else:
                raise Exception('input image is invalid, please check!')
            return image

        def load_mask(self, image_id):
            """
            Process only the image whose index is image_id.
            The mask source is a single label image. Returns mask and class_ids,
            where class_ids holds, for each instance, the index of its defect name
            in self.class_names. mask has shape [h, w, instance_count] and
            class_ids has shape [instance_count], e.g. [h, w, 4] and [1, 3, 1, 2].
            """
            info = self.image_info[image_id]  # pick the image_info entry by the integer image_id
            img = Image.open(info['mask_path'])  # the label image written by labelme
            num_obj = np.max(img)  # the maximum pixel value equals the number of instances, e.g. 3 masks -> 3
            mask = np.zeros([info['height'], info['width'], num_obj], dtype=np.uint8)
            img_info = self.image_info[image_id]
            # In the labelme output the mask is a single [h, w] image where instance
            # k is painted with the value k: the first instance is filled with 1,
            # the second with 2, the third with 3, and so on. The loop below turns
            # each instance into its own binary channel: pixels belonging to the
            # instance become 1, everything else stays 0.
            for index in range(num_obj):
                for i in range(img_info['width']):
                    for j in range(img_info['height']):
                        at_pixel = img.getpixel((i, j))
                        if at_pixel == index + 1:
                            mask[j, i, index] = 1
            # The code below resolves the defect names of this image's masks.
            yaml_info = self.image_info[image_id]
            with open(yaml_info['yaml_path']) as f:
                temp = yaml.load(f.read(), Loader=yaml.FullLoader)
            labels = temp['label_names']
            del labels[0]  # drop '_background_'
            class_ids = []
            for label in labels:
                if label in self.class_names:
                    class_ids.append(self.class_names.index(label))
                else:
                    raise Exception('labels contain a value that is not in class_names, please check')
            class_ids = np.array(class_ids)
            # Map each label from the yaml file to its integer class index.
            return mask, class_ids.astype(np.int32)
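
        # The pixel-by-pixel loop above is easy to follow but slow for large
        # images. A vectorised equivalent (a sketch, assuming the same labelme
        # encoding of one integer value per instance) would be:
        #
        #   arr = np.array(img)                     # [h, w] integer labels
        #   ids = np.arange(1, num_obj + 1)         # instance values 1..num_obj
        #   mask = (arr[:, :, None] == ids[None, None, :]).astype(np.uint8)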

        def load_image_gt(self, image_id, augmentation=True):
            """Load and return ground truth data for an image (image, mask, bounding boxes).

            augmentation: if True, apply random flips plus an imgaug
            (https://github.com/aleju/imgaug) pipeline of blur, contrast and
            brightness jitter.

            Returns:
            image: [height, width, 3]
            image_meta: image id, original shape, resized shape, window, scale and
                active class ids packed into one vector
            class_ids: [instance_count] Integer class IDs
            bbox: [instance_count, (y1, x1, y2, x2)]
            mask: [height, width, instance_count]
            """
            # Load image and mask
            image = self.load_image(image_id)
            mask, class_ids = self.load_mask(image_id)
            original_shape = image.shape
            image, window, scale, padding = self.resize_image(
                image,
                min_dim=Config.image_min_dim,
                min_scale=Config.resize_min_scale,
                max_dim=Config.image_max_dim,
                mode=Config.resize_mode)

            mask = self.resize_mask(mask, scale, padding)

            # Random flips and photometric augmentation.
            if augmentation:
                if random.randint(0, 1):
                    image = np.fliplr(image)  # flip left/right
                    mask = np.fliplr(mask)
                if random.randint(0, 1):
                    image = np.flipud(image)  # flip up/down
                    mask = np.flipud(mask)
                if random.randint(0, 1):  # with 50% probability apply blur/contrast/brightness jitter
                    image = np.expand_dims(image, axis=0)
                    seq = iaa.Sequential([
                        iaa.Sometimes(0.5, iaa.GaussianBlur(sigma=(0, 0.5))),
                        # a random alpha in 0.75-1.5, applied per channel, adjusts contrast
                        # (renamed iaa.LinearContrast in newer imgaug releases)
                        iaa.ContrastNormalization((0.75, 1.5), per_channel=True),
                        # multiply pixel values by 0.8-1.2, per channel for 20% of images,
                        # to vary brightness and colour
                        iaa.Multiply((0.8, 1.2), per_channel=0.2),
                    ], random_order=True)  # apply the augmenters in random order
                    image = seq.augment_images(image)
                    image = np.squeeze(image, axis=0)
            mask = mask.astype(bool)

            image_shape = image.shape
            # Note that some boxes might be all zeros if the corresponding mask got
            # cropped out, so filter them here.
            _idx = np.sum(mask, axis=(0, 1)) > 0  # True where a mask still has pixels
            mask = mask[:, :, _idx]         # keep only non-empty masks
            class_ids = class_ids[_idx]     # and their class ids
            # Bounding boxes: [num_instances, (y1, x1, y2, x2)], derived from the masks
            bbox = self.extract_bboxes(mask)

            # Active classes
            # Different datasets have different classes, so track the
            # classes supported in the dataset of this image.
            active_class_ids = np.ones([Config.num_classes], dtype=np.int32)

            image_meta = np.array(
                [image_id] +              # size=1
                list(original_shape) +    # size=3
                list(image_shape) +       # size=3
                list(window) +            # size=4 (y1, x1, y2, x2) in image coordinates
                [scale] +                 # size=1
                list(active_class_ids)    # size=num_classes
            )
            return image, image_meta, class_ids, bbox, mask
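
        # Layout of image_meta for this configuration (num_classes = 5):
        #   [image_id,                      1 value
        #    original H, W, C,              3 values
        #    resized H, W, C,               3 values
        #    window y1, x1, y2, x2,         4 values
        #    scale,                         1 value
        #    active_class_ids]              5 values
        # which totals 17 values, matching Config.image_meta_size.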

        def resize_image(self, image, min_dim=Config.image_min_dim, max_dim=Config.image_max_dim, min_scale=0, mode="square"):
            """
            This function does two things:
            1. It derives a scale from image_min_dim and image_max_dim: the image is
               first scaled so that its short side reaches min_dim; if that scale is
               below min_scale, min_scale is used instead; then, in "square" mode, if
               the scaled long side would exceed max_dim, the scale is reduced so the
               image fits inside the max_dim canvas.
            2. It resizes the image by that scale and places it on the canvas
               according to mode, zero-filling the unused area.

            skimage.transform.resize returns float64; preserve_range=True keeps the
            original value range, and the result is cast back to the input dtype at
            the end.

            Resizes an image keeping the aspect ratio unchanged.
            min_dim: if provided, resizes the image such that its smaller
                dimension == min_dim
            max_dim: if provided, ensures that the image longest side doesn't
                exceed this value.
            min_scale: if provided, ensure that the image is scaled up by at least
                this percent even if min_dim doesn't require it.
            mode: Resizing mode.
                none:   No resizing. Return the image unchanged.
                square: Resize and pad with zeros to get a square image
                        of size [max_dim, max_dim].
                pad64:  Pads width and height with zeros to make them multiples of 64.
                        If min_dim or min_scale are provided, it scales the image up
                        before padding. max_dim is ignored in this mode.
                        The multiple of 64 is needed to ensure smooth scaling of feature
                        maps up and down the 6 levels of the FPN pyramid (2**6=64).

            Returns:
            image: the resized image
            window: (y1, x1, y2, x2). If max_dim is provided, padding might
                be inserted in the returned image. If so, this window is the
                coordinates of the image part of the full image (excluding
                the padding). The x2, y2 pixels are not included.
            scale: The scale factor used to resize the image
            padding: Padding added to the image [(top, bottom), (left, right), (0, 0)]
            """
            # Keep track of image dtype and return results in the same dtype
            image_dtype = image.dtype
            # Default window (y1, x1, y2, x2) and default scale == 1.
            h, w = image.shape[:2]
            window = (0, 0, h, w)
            scale = 1
            padding = [(0, 0), (0, 0), (0, 0)]
            if mode == "none":
                return image, window, scale, padding

            # Scale?
            if min_dim:
                # Scale up but not down
                scale = max(1, min_dim / min(h, w))
            if min_scale and scale < min_scale:
                scale = min_scale

            # Does it exceed max dim?
            if max_dim and mode == "square":
                image_max = max(h, w)
                if round(image_max * scale) > max_dim:  # guard against the scale above pushing the long side past max_dim
                    scale = max_dim / image_max
            # Resize image using bilinear interpolation (order=1)
            if scale != 1:
                image = skimage.transform.resize(image, (round(h * scale), round(w * scale)),
                                                 order=1, mode="constant", preserve_range=True)
                # the result is float64; preserve_range=True keeps the original value range

            # Need padding or cropping?
            if mode == "square":
                # Get new height and width
                h, w = image.shape[:2]
                top_pad = (max_dim - h) // 2  # // is floor division
                bottom_pad = max_dim - h - top_pad
                left_pad = (max_dim - w) // 2
                right_pad = max_dim - w - left_pad
                padding = [(top_pad, bottom_pad), (left_pad, right_pad), (0, 0)]
                image = np.pad(image, padding, mode='constant', constant_values=0)  # zero-fill the border
                window = (top_pad, left_pad, h + top_pad, w + left_pad)  # corners of the resized image inside the canvas
            elif mode == "pad64":
                h, w = image.shape[:2]
                # Both sides must be divisible by 64
                assert min_dim % 64 == 0, "Minimum dimension must be a multiple of 64"
                # Height
                if h % 64 > 0:
                    max_h = h - (h % 64) + 64
                    top_pad = (max_h - h) // 2
                    bottom_pad = max_h - h - top_pad
                else:
                    top_pad = bottom_pad = 0
                # Width
                if w % 64 > 0:
                    max_w = w - (w % 64) + 64
                    left_pad = (max_w - w) // 2
                    right_pad = max_w - w - left_pad
                else:
                    left_pad = right_pad = 0
                padding = [(top_pad, bottom_pad), (left_pad, right_pad), (0, 0)]
                image = np.pad(image, padding, mode='constant', constant_values=0)
                window = (top_pad, left_pad, h + top_pad, w + left_pad)
            else:
                raise Exception("resizing the image failed, please check the "
                                "resize_image function in class Dataset")
            return image.astype(image_dtype), window, scale, padding
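
        # Worked example ("square" mode, min_dim = max_dim = 128), assuming a
        # 375x500 (h x w) input:
        #   scale = max(1, 128 / 375) = 1, but round(500 * 1) > 128,
        #   so scale = 128 / 500 = 0.256  ->  resized to 96x128;
        #   top/bottom padding = 16 rows each, left/right = 0,
        #   window = (16, 0, 112, 128), returned scale = 0.256.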
        def resize_mask(self, mask, scale, padding):
            """Resizes a mask using the given scale and padding.
            Typically, you get the scale and padding from resize_image() to
            ensure both the image and the mask are resized consistently.
            scale: mask scaling factor
            padding: Padding to add to the mask in the form
                [(top, bottom), (left, right), (0, 0)]
            """
            # the output shape of zoom() is calculated with round() instead of int()
            mask = scipy.ndimage.zoom(mask, zoom=[scale, scale, 1], order=0)  # order=0 is nearest-neighbour interpolation
            mask = np.pad(mask, padding, mode='constant', constant_values=0)
            return mask

        def extract_bboxes(self, mask):
            """Compute bounding boxes from masks.
            mask: [height, width, num_instances]. Mask pixels are either 1 or 0.

            Returns: bbox array [num_instances, (y1, x1, y2, x2)].
            """
            boxes = np.zeros([mask.shape[-1], 4], dtype=np.int32)
            for i in range(mask.shape[-1]):
                m = mask[:, :, i]
                # Bounding box.
                horizontal_indicies = np.where(np.any(m, axis=0))[0]
                vertical_indicies = np.where(np.any(m, axis=1))[0]
                if horizontal_indicies.shape[0]:
                    x1, x2 = horizontal_indicies[[0, -1]]
                    y1, y2 = vertical_indicies[[0, -1]]
                    # x2 and y2 should not be part of the box. Increment by 1.
                    x2 += 1
                    y2 += 1
                else:
                    # No mask for this instance. Might happen due to
                    # resizing or cropping. Set bbox to zeros
                    x1, x2, y1, y2 = 0, 0, 0, 0
                boxes[i] = np.array([y1, x1, y2, x2])
            return boxes.astype(np.int32)
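
        # Example: a mask whose pixels occupy rows 3-4 and columns 5-6 yields the
        # box [3, 5, 5, 7]; y2 and x2 are one past the last mask pixel.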

        def build_rpn_targets(self, anchors, gt_class_ids, gt_boxes):
            """Given the anchors and GT boxes, compute overlaps and identify positive
            anchors and deltas to refine them to match their corresponding GT boxes.

            anchors: [num_anchors, (y1, x1, y2, x2)]
            gt_class_ids: [num_gt_boxes] Integer class IDs, the instance labels of one image.
            gt_boxes: [num_gt_boxes, (y1, x1, y2, x2)]

            Returns:
            rpn_match: [N] (int32) matches between anchors and GT boxes.
                1 = positive anchor, -1 = negative anchor, 0 = neutral
            rpn_bbox: [N, (dy, dx, log(dh), log(dw))] Anchor bbox deltas, where N is
                Config.rpn_data_train_anchor_per_img and at most N // 2 rows belong
                to positive samples.
            """
            # RPN Match: 1 = positive anchor, -1 = negative anchor, 0 = neutral
            rpn_match = np.zeros([anchors.shape[0]], dtype=np.int32)
            # RPN bounding boxes: [max anchors per image, (dy, dx, log(dh), log(dw))]
            rpn_bbox = np.zeros((Config.rpn_data_train_anchor_per_img, 4))  # [256, 4]

            # Handle COCO crowds
            # A crowd box in COCO is a bounding box around several instances. Exclude
            # them from training. A crowd box is given a negative class ID.
            crowd_ix = np.where(gt_class_ids < 0)[0]  # indices of crowd instances
            if crowd_ix.shape[0] > 0:
                # Filter out crowds from ground truth class IDs and boxes
                non_crowd_ix = np.where(gt_class_ids > 0)[0]  # indices of non-crowd instances
                crowd_boxes = gt_boxes[crowd_ix]
                gt_boxes = gt_boxes[non_crowd_ix]
                # Compute overlaps with crowd boxes [anchors, crowd_boxes]
                crowd_overlaps = self.compute_overlaps(anchors, crowd_boxes)
                crowd_iou_max = np.amax(crowd_overlaps, axis=1)  # per anchor, the best IoU against any crowd box
                no_crowd_bool = (crowd_iou_max < 0.001)  # True where the anchor barely touches a crowd box
            else:
                # All anchors don't intersect a crowd
                no_crowd_bool = np.ones([anchors.shape[0]], dtype=bool)

            # Compute overlaps [num_anchors, num_gt_boxes]
            overlaps = self.compute_overlaps(anchors, gt_boxes)

            # Match anchors to GT Boxes
            # If an anchor overlaps a GT box with IoU >= 0.7 then it's positive.
            # If an anchor overlaps a GT box with IoU < 0.3 then it's negative.
            # Neutral anchors are those that don't match the conditions above,
            # and they don't influence the loss function.
            # However, don't keep any GT box unmatched (rare, but happens). Instead,
            # match it to the closest anchor (even if its max IoU is < 0.3).
            #
            # 1. Set negative anchors first. They get overwritten below if a GT box is
            #    matched to them. Skip boxes in crowd areas.
            anchor_iou_argmax = np.argmax(overlaps, axis=1)  # per anchor, the index of the best gt box
            anchor_iou_max = overlaps[np.arange(overlaps.shape[0]), anchor_iou_argmax]  # the matching IoU values, [anchors]
            rpn_match[(anchor_iou_max < 0.3) & (no_crowd_bool)] = -1
            # 2. Set an anchor for each GT box (regardless of IoU value).
            # TODO: If multiple anchors have the same IoU match all of them
            gt_iou_argmax = np.argmax(overlaps, axis=0)  # for every gt box, the anchor with the highest IoU
            rpn_match[gt_iou_argmax] = 1  # mark those anchors positive
            # 3. Set anchors with high overlap as positive.
            rpn_match[anchor_iou_max >= 0.7] = 1

            # Subsample to balance positive and negative anchors
            # Don't let positives be more than half the anchors
            ids = np.where(rpn_match == 1)[0]
            extra = len(ids) - (Config.rpn_data_train_anchor_per_img // 2)
            if extra > 0:  # too many positives
                # Reset the extra ones to neutral
                ids = np.random.choice(ids, extra, replace=False)  # pick surplus positives at random
                rpn_match[ids] = 0
            # Same for negative proposals
            ids = np.where(rpn_match == -1)[0]
            extra = len(ids) - (Config.rpn_data_train_anchor_per_img - np.sum(rpn_match == 1))
            if extra > 0:
                # Reset the extra ones to neutral
                ids = np.random.choice(ids, extra, replace=False)  # pick surplus negatives at random
                rpn_match[ids] = 0

            # For positive anchors, compute shift and scale needed to transform them
            # to match the corresponding GT boxes.
            ids = np.where(rpn_match == 1)[0]  # indices of the positive anchors
            ix = 0  # index into rpn_bbox
            # TODO: use box_refinement() rather than duplicating the code here
            for i, a in zip(ids, anchors[ids]):
                # Closest gt box (it might have IoU < 0.7)
                gt = gt_boxes[anchor_iou_argmax[i]]
                # Convert coordinates to center plus width/height.
                # GT box
                gt_h = gt[2] - gt[0]
                gt_w = gt[3] - gt[1]
                gt_center_y = gt[0] + 0.5 * gt_h
                gt_center_x = gt[1] + 0.5 * gt_w
                # Anchor
                a_h = a[2] - a[0]
                a_w = a[3] - a[1]
                a_center_y = a[0] + 0.5 * a_h
                a_center_x = a[1] + 0.5 * a_w
                # Compute the bbox refinement that the RPN should predict.
                rpn_bbox[ix] = [
                    (gt_center_y - a_center_y) / a_h,
                    (gt_center_x - a_center_x) / a_w,
                    np.log(gt_h / a_h),
                    np.log(gt_w / a_w)]
                # Normalize
                rpn_bbox[ix] /= Config.std_dev
                ix += 1  # rows beyond the number of positives stay all-zero
            return rpn_match, rpn_bbox
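
        # Worked example of one rpn_bbox row: anchor (0, 0, 10, 10) matched to
        # gt box (1, 1, 11, 11) gives
        #   dy = (6 - 5) / 10 = 0.1, dx = 0.1, dh = log(10/10) = 0, dw = 0,
        # and after dividing by std_dev = [0.1, 0.1, 0.2, 0.2] the stored target
        # is [1.0, 1.0, 0.0, 0.0].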

        def compute_overlaps(self, boxes1, boxes2):
            '''
            Here boxes1 are the anchors and boxes2 the gt_boxes.
            :param boxes1: [n1, 4]
            :param boxes2: [n2, 4]
            :return: IoU matrix [n1, n2]
            '''
            # Areas of anchors and GT boxes
            area1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
            area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])
            # Compute overlaps to generate matrix [boxes1 count, boxes2 count]
            # Each cell contains the IoU value.
            overlaps = np.zeros((boxes1.shape[0], boxes2.shape[0]))
            for i in range(overlaps.shape[1]):  # iterate over the boxes in boxes2
                box2 = boxes2[i]
                y1 = np.maximum(box2[0], boxes1[:, 0])  # intersection corners of box2 against every box in boxes1
                y2 = np.minimum(box2[2], boxes1[:, 2])
                x1 = np.maximum(box2[1], boxes1[:, 1])
                x2 = np.minimum(box2[3], boxes1[:, 3])
                intersection = np.maximum(x2 - x1, 0) * np.maximum(y2 - y1, 0)  # clamp at 0 so disjoint boxes contribute nothing
                union = area2[i] + area1[:] - intersection[:]  # area2[i] is a scalar; area1 and intersection are vectors
                iou = intersection / union  # a vector of IoU values
                overlaps[:, i] = iou
            return overlaps

        def generate_random_rois(self, image_shape, count, gt_boxes):
            """Generates ROI proposals similar to what a region proposal network
            would generate.

            image_shape: [Height, Width, Depth]
            count: Number of ROIs to generate
            gt_boxes: [N, (y1, x1, y2, x2)] Ground truth boxes in pixels.

            Returns: [count, (y1, x1, y2, x2)] ROI boxes in pixels.
            """
            # placeholder
            rois = np.zeros((count, 4), dtype=np.int32)
            # Generate random ROIs around GT boxes (90% of count)
            rois_per_box = int(0.9 * count / gt_boxes.shape[0])  # split 90% of count evenly over the gt boxes
            # count should be well above the number of gt boxes per image
            for i in range(gt_boxes.shape[0]):
                gt_y1, gt_x1, gt_y2, gt_x2 = gt_boxes[i]
                h = gt_y2 - gt_y1
                w = gt_x2 - gt_x1
                # random boundaries
                r_y1 = max(gt_y1 - h, 0)
                r_y2 = min(gt_y2 + h, image_shape[0])
                r_x1 = max(gt_x1 - w, 0)
                r_x2 = min(gt_x2 + w, image_shape[1])

                # To avoid generating boxes with zero area, we generate double what
                # we need and filter out the extra. If we get fewer valid boxes
                # than we need, we loop and try again.
                while True:
                    y1y2 = np.random.randint(r_y1, r_y2, (rois_per_box * 2, 2))
                    x1x2 = np.random.randint(r_x1, r_x2, (rois_per_box * 2, 2))
                    # Filter out zero area boxes
                    threshold = 1  # one pixel
                    y1y2 = y1y2[np.abs(y1y2[:, 0] - y1y2[:, 1]) >= threshold][:rois_per_box]
                    x1x2 = x1x2[np.abs(x1x2[:, 0] - x1x2[:, 1]) >= threshold][:rois_per_box]
                    if y1y2.shape[0] == rois_per_box and x1x2.shape[0] == rois_per_box:
                        # exactly rois_per_box coordinate pairs are required; otherwise retry
                        break
                # Sort on axis 1 to ensure x1 <= x2 and y1 <= y2 and then reshape
                # into x1, y1, x2, y2 order
                x1, x2 = np.split(np.sort(x1x2, axis=1), 2, axis=1)  # np.sort sorts each row ascending
                y1, y2 = np.split(np.sort(y1y2, axis=1), 2, axis=1)
                box_rois = np.hstack([y1, x1, y2, x2])
                rois[rois_per_box * i:rois_per_box * (i + 1)] = box_rois

            # Generate random ROIs anywhere in the image (10% of count)
            remaining_count = count - (rois_per_box * gt_boxes.shape[0])
            # To avoid generating boxes with zero area, we generate double what
            # we need and filter out the extra. If we get fewer valid boxes
            # than we need, we loop and try again.
            while True:
                y1y2 = np.random.randint(0, image_shape[0], (remaining_count * 2, 2))
                x1x2 = np.random.randint(0, image_shape[1], (remaining_count * 2, 2))
                # Filter out zero area boxes
                threshold = 1
                y1y2 = y1y2[np.abs(y1y2[:, 0] - y1y2[:, 1]) >= threshold][:remaining_count]
                x1x2 = x1x2[np.abs(x1x2[:, 0] - x1x2[:, 1]) >= threshold][:remaining_count]
                if y1y2.shape[0] == remaining_count and x1x2.shape[0] == remaining_count:
                    break

            # Sort on axis 1 to ensure x1 <= x2 and y1 <= y2 and then reshape
            # into x1, y1, x2, y2 order
            x1, x2 = np.split(np.sort(x1x2, axis=1), 2, axis=1)
            y1, y2 = np.split(np.sort(y1y2, axis=1), 2, axis=1)
            global_rois = np.hstack([y1, x1, y2, x2])
            rois[-remaining_count:] = global_rois
            return rois
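
        # Allocation example: with count = 300 and 2 gt boxes,
        # rois_per_box = int(0.9 * 300 / 2) = 135, so 270 ROIs are drawn around
        # the two instances and the remaining 30 anywhere in the image.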

        def build_detection_targets(self, rpn_rois, gt_class_ids, gt_boxes, gt_masks):
            """Generate targets for training Stage 2 classifier and mask heads.
            This is not used in normal training. It's useful for debugging or to train
            the Mask R-CNN heads without using the RPN head.

            Inputs:
            rpn_rois: [N, (y1, x1, y2, x2)] proposal boxes.
            gt_class_ids: [instance count] Integer class IDs
            gt_boxes: [instance count, (y1, x1, y2, x2)]
            gt_masks: [height, width, instance count] Ground truth masks. Can be full
                size or mini-masks.

            Returns:
            rois: [TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)]
            class_ids: [TRAIN_ROIS_PER_IMAGE]. Integer class IDs.
            bboxes: [TRAIN_ROIS_PER_IMAGE, NUM_CLASSES, (y, x, log(h), log(w))].
                Class-specific bbox refinements.
            masks: [TRAIN_ROIS_PER_IMAGE, height, width, NUM_CLASSES). Class specific
                masks cropped to bbox boundaries and resized to neural network output size.
            """
            assert rpn_rois.shape[0] > 0
            assert gt_class_ids.dtype == np.int32, \
                "Expected int but got {} in build_detection_targets function".format(gt_class_ids.dtype)
            assert gt_boxes.dtype == np.int32, \
                "Expected int but got {} in build_detection_targets function".format(gt_boxes.dtype)
            assert gt_masks.dtype == np.bool_, \
                "Expected bool but got {} in build_detection_targets function".format(gt_masks.dtype)

            # It's common to add GT Boxes to ROIs but we don't do that here because
            # according to XinLei Chen's paper, it doesn't help.

            # Trim empty padding in gt_boxes and gt_masks parts
            instance_ids = np.where(gt_class_ids > 0)[0]  # drops crowd boxes (ids <= 0); our dataset has none, but keep the check
            assert instance_ids.shape[0] > 0, "Image must contain instances."
            gt_class_ids = gt_class_ids[instance_ids]
            gt_boxes = gt_boxes[instance_ids]
            gt_masks = gt_masks[:, :, instance_ids]

            # Compute areas of ROIs and ground truth boxes.
            rpn_roi_area = (rpn_rois[:, 2] - rpn_rois[:, 0]) * (rpn_rois[:, 3] - rpn_rois[:, 1])
            gt_box_area = (gt_boxes[:, 2] - gt_boxes[:, 0]) * (gt_boxes[:, 3] - gt_boxes[:, 1])

            # Compute overlaps [rpn_rois, gt_boxes]
            overlaps = np.zeros((rpn_rois.shape[0], gt_boxes.shape[0]))
            for i in range(overlaps.shape[1]):
                gt = gt_boxes[i]
                overlaps[:, i] = self.compute_iou(gt, rpn_rois, gt_box_area[i], rpn_roi_area)

            # Assign ROIs to GT boxes
            rpn_roi_iou_argmax = np.argmax(overlaps, axis=1)  # per roi, the index of the gt box with the highest IoU
            rpn_roi_iou_max = overlaps[np.arange(overlaps.shape[0]), rpn_roi_iou_argmax]  # that maximum IoU
            # GT box assigned to each ROI
            rpn_roi_gt_boxes = gt_boxes[rpn_roi_iou_argmax]           # the matching real gt boxes
            rpn_roi_gt_class_ids = gt_class_ids[rpn_roi_iou_argmax]   # and their class ids
            # Positive ROIs are those with >= 0.5 IoU with a GT box.
            fg_ids = np.where(rpn_roi_iou_max > 0.5)[0]  # roi indices with IoU above 0.5: foreground

            # Negative ROIs are those with max IoU 0.1-0.5 (hard example mining)
            # TODO: To hard example mine or not to hard example mine, that's the question
            # bg_ids = np.where((rpn_roi_iou_max >= 0.1) & (rpn_roi_iou_max < 0.5))[0]
            bg_ids = np.where(rpn_roi_iou_max < 0.5)[0]  # everything else: background

            # Subsample ROIs. Aim for 33% foreground.
            # FG
            fg_roi_count = int(Config.target_data_per_img * Config.target_data_fg_rate)
            if fg_ids.shape[0] > fg_roi_count:  # too many foreground rois: sample randomly
                keep_fg_ids = np.random.choice(fg_ids, fg_roi_count, replace=False)
            else:
                keep_fg_ids = fg_ids
            # BG
            remaining = Config.target_data_per_img - keep_fg_ids.shape[0]
            if bg_ids.shape[0] > remaining:  # too many background rois: sample randomly
                keep_bg_ids = np.random.choice(bg_ids, remaining, replace=False)
            else:
                keep_bg_ids = bg_ids
            # Combine indices of ROIs to keep
            keep = np.concatenate([keep_fg_ids, keep_bg_ids])
            # Need more?
            remaining = Config.target_data_per_img - keep.shape[0]
            if remaining > 0:
                # Looks like we don't have enough samples to maintain the desired
                # balance. Reduce requirements and fill in the rest. This is
                # likely different from the Mask RCNN paper.
                # There is a small chance we have neither fg nor bg samples.
                if keep.shape[0] == 0:
                    # Pick bg regions with easier IoU threshold
                    bg_ids = np.where(rpn_roi_iou_max < 0.5)[0]
                    assert bg_ids.shape[0] >= remaining
                    keep_bg_ids = np.random.choice(bg_ids, remaining, replace=False)
                    assert keep_bg_ids.shape[0] == remaining
                    keep = np.concatenate([keep, keep_bg_ids])
                else:
                    # Fill the rest with repeated bg rois. In practice this branch,
                    # not the one above, is the one that runs.
                    keep_extra_ids = np.random.choice(keep_bg_ids, remaining, replace=True)
                    keep = np.concatenate([keep, keep_extra_ids])

            # Reset the gt boxes assigned to BG ROIs.
            rpn_roi_gt_boxes[keep_bg_ids, :] = 0  # zero the gt boxes of the negatives
            rpn_roi_gt_class_ids[keep_bg_ids] = 0  # and their class ids

            # For each kept ROI, assign a class_id, and for FG ROIs also add bbox refinement.
            rois = rpn_rois[keep]  # positives followed by negatives
            roi_gt_boxes = rpn_roi_gt_boxes[keep]  # matching gt boxes (zeroed for bg)
            roi_gt_class_ids = rpn_roi_gt_class_ids[keep]  # matching class ids (zeroed for bg)
            roi_gt_assignment = rpn_roi_iou_argmax[keep]  # index of the gt instance assigned to each kept roi

            # Class-aware bbox deltas. [y, x, log(h), log(w)]
            bboxes = np.zeros((Config.target_data_per_img, Config.num_classes, 4), dtype=np.float32)
            pos_ids = np.where(roi_gt_class_ids > 0)[0]  # positives only
            bboxes[pos_ids, roi_gt_class_ids[pos_ids]] = \
                self.box_refinement(rois[pos_ids], roi_gt_boxes[pos_ids, :4])  # the two box arrays correspond row by row
            # Normalize bbox refinements
            bboxes /= Config.std_dev
            # Generate class-specific target masks
            masks = np.zeros((Config.target_data_per_img, Config.mask_shape[0],
                              Config.mask_shape[1], Config.num_classes), dtype=np.float32)
            for i in pos_ids:  # build the mask targets of the positives
                class_id = roi_gt_class_ids[i]
                assert class_id > 0, "class id must be greater than 0"
                gt_id = roi_gt_assignment[i]
                class_mask = gt_masks[:, :, gt_id]
                # Pick the part of the mask inside the roi and resize it
                y1, x1, y2, x2 = rois[i].astype(np.int32)
                m = class_mask[y1:y2, x1:x2]
                mask = skimage.transform.resize(m, Config.mask_shape, order=1, mode="constant")
                masks[i, :, :, class_id] = mask
            return rois, roi_gt_class_ids, bboxes, masks
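
        # Sampling budget example: with target_data_per_img = 100 and
        # target_data_fg_rate = 0.33, at most int(100 * 0.33) = 33 ROIs are
        # foreground; the remaining slots are filled with background ROIs,
        # repeating background samples if too few are available.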

        def box_refinement(self, box, gt_box):
            """Compute refinement needed to transform box to gt_box.
            box and gt_box are [N, (y1, x1, y2, x2)]. (y2, x2) is
            assumed to be outside the box.
            """
            box = box.astype(np.float32)
            gt_box = gt_box.astype(np.float32)
            height = box[:, 2] - box[:, 0]
            width = box[:, 3] - box[:, 1]
            center_y = box[:, 0] + 0.5 * height
            center_x = box[:, 1] + 0.5 * width
            gt_height = gt_box[:, 2] - gt_box[:, 0]
            gt_width = gt_box[:, 3] - gt_box[:, 1]
            gt_center_y = gt_box[:, 0] + 0.5 * gt_height
            gt_center_x = gt_box[:, 1] + 0.5 * gt_width
            dy = (gt_center_y - center_y) / height
            dx = (gt_center_x - center_x) / width
            dh = np.log(gt_height / height)
            dw = np.log(gt_width / width)
            return np.stack([dy, dx, dh, dw], axis=1)

        def compute_iou(self, box, boxes, box_area, boxes_area):
            """Calculates IoU of the given box with the array of the given boxes.
            box: 1D vector [y1, x1, y2, x2], a single box
            boxes: [boxes_count, (y1, x1, y2, x2)], an array of boxes
            box_area: float, the area of 'box'
            boxes_area: array of length boxes_count, the area of each box in boxes

            Note: the areas are passed in rather than calculated here for
            efficiency. Calculate once in the caller to avoid duplicate work.
            """
            # Calculate intersection areas
            y1 = np.maximum(box[0], boxes[:, 0])  # intersection corners of box against every entry of boxes
            y2 = np.minimum(box[2], boxes[:, 2])
            x1 = np.maximum(box[1], boxes[:, 1])
            x2 = np.minimum(box[3], boxes[:, 3])
            intersection = np.maximum(x2 - x1, 0) * np.maximum(y2 - y1, 0)  # clamp at 0 so disjoint boxes contribute nothing
            union = box_area + boxes_area[:] - intersection[:]  # box_area is a scalar; boxes_area and intersection are vectors
            iou = intersection / union  # a vector of IoU values
            return iou




    if __name__ == '__main__':
        D = Dataset()
        # D.display()
        name = ['line_bulge', 'dot_concave', 'dot_bulge', 'Irregular_concave']
        D.add_class(name)
        D.add_image(r'D:\MASKRCNN\mask-rcnn-me\MASKRCNN_myself\train_json\1021')
        c = D.data_generator(shuffle=True, augmentation=True, rois_count=300,
                             batch_size=D.batch_size, detection_targets=True)
        # print(c)
        # D.display()
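
    To sanity-check the pipeline end to end, you can pull one batch from the generator and print the shapes it yields. This is a minimal sketch under the settings above (rois_count=300, detection_targets=True); the dataset path in __main__ is specific to the author's machine, so point add_image at your own labelme output folder first:

        inputs, outputs = next(c)
        names = ['images', 'image_meta', 'rpn_match', 'rpn_bbox',
                 'gt_class_ids', 'gt_boxes', 'gt_masks', 'rpn_rois', 'rois']
        for n, arr in zip(names, inputs):
            print(n, arr.shape)
        for n, arr in zip(['mrcnn_class_ids', 'mrcnn_bbox', 'mrcnn_mask'], outputs):
            print(n, arr.shape)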













