『TensorFlow』SSD Source Code Study, Part 5: TFR Data Reading & Data Preprocessing

    Forked project address: SSD

    I. TFR Data Reading

    Creating a slim.dataset.Dataset object

    In train_ssd_network.py the data is fetched as follows; the first thing needed is a slim.dataset.Dataset object:

    # Select the dataset.
    # FLAGS defaults: 'imagenet', 'train', and the directory holding the TFR files
    # (overridden on the command line for SSD, e.g. --dataset_name=pascalvoc_2012)
    # TFR file naming pattern: 'voc_2012_%s_*.tfrecord', where %s is 'train' or 'test'
    dataset = dataset_factory.get_dataset(
        FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)
    

    The fetch goes through a rather bloated chain of calls; I've written out the intermediate functions below, invoked from top to bottom:

    def get_dataset(name, split_name, dataset_dir, file_pattern=None, reader=None):
        """
        Returns:
            A `Dataset` class.
        Raises:
            ValueError: If the dataset `name` is unknown.
        """
        if name not in datasets_map:
            raise ValueError('Name of dataset unknown %s' % name)
        # pascalvoc_2012.get_split
        return datasets_map[name].get_split(split_name,
                                            dataset_dir,
                                            file_pattern,
                                            reader)
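
    For reference, here is a sketch of the datasets_map registry in datasets/dataset_factory.py (the exact contents are an assumption inferred from the fork's module layout, not quoted source):

    # Assumed registry in datasets/dataset_factory.py; module names follow the fork's layout.
    from datasets import cifar10, imagenet, pascalvoc_2007, pascalvoc_2012

    datasets_map = {
        'cifar10': cifar10,
        'imagenet': imagenet,
        'pascalvoc_2007': pascalvoc_2007,
        'pascalvoc_2012': pascalvoc_2012,
    }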
    
    
    def get_split(split_name, dataset_dir, file_pattern=None, reader=None):
        """
        Returns:
          A `Dataset` namedtuple.
        Raises:
            ValueError: if `split_name` is not a valid train/test split.
        """
        if not file_pattern:
            file_pattern = FILE_PATTERN  # file names must match the pattern 'voc_2012_%s_*.tfrecord'
        return pascalvoc_common.get_split(split_name, dataset_dir,
                                          file_pattern, reader,
                                          SPLITS_TO_SIZES,  # {'train': 17125,}
                                          ITEMS_TO_DESCRIPTIONS,
                                          NUM_CLASSES  # 20
                                          )
        """
        ITEMS_TO_DESCRIPTIONS = {
        'image': 'A color image of varying height and width.',
        'shape': 'Shape of the image',
        'object/bbox': 'A list of bounding boxes, one per each object.',
        'object/label': 'A list of labels, one per each object.',
        }
        """
    

    The final call builds the slim.dataset.Dataset (see the post 『TensorFlow』Reading Data from Disk for an analysis); in practice, any arguments satisfying slim.dataset.Dataset will do:

    def get_split(split_name, dataset_dir, file_pattern, reader,
                  split_to_sizes, items_to_descriptions, num_classes):
        """Gets a dataset tuple with instructions for reading Pascal VOC dataset.
    
        Args:
          split_name: A train/test split name.
          dataset_dir: The base directory of the dataset sources.
          file_pattern: The file pattern to use when matching the dataset sources.
            It is assumed that the pattern contains a '%s' string so that the split
            name can be inserted.
          reader: The TensorFlow reader type.
    
        Returns:
          A `Dataset` namedtuple.
    
        Raises:
            ValueError: if `split_name` is not a valid train/test split.
        """
        # 'train'
        if split_name not in split_to_sizes:
            raise ValueError('split name %s was not recognized.' % split_name)
        file_pattern = os.path.join(dataset_dir, file_pattern % split_name)
    
        # Allowing None in the signature so that dataset_factory can use the default.
        if reader is None:
            reader = tf.TFRecordReader
        # Features in Pascal VOC TFRecords.
        keys_to_features = {  # how the TFR records are decoded
            'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
            'image/format': tf.FixedLenFeature((), tf.string, default_value='jpeg'),
            'image/height': tf.FixedLenFeature([1], tf.int64),
            'image/width': tf.FixedLenFeature([1], tf.int64),
            'image/channels': tf.FixedLenFeature([1], tf.int64),
            'image/shape': tf.FixedLenFeature([3], tf.int64),
            'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32),
            'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32),
            'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32),
            'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32),
            'image/object/bbox/label': tf.VarLenFeature(dtype=tf.int64),
            'image/object/bbox/difficult': tf.VarLenFeature(dtype=tf.int64),
            'image/object/bbox/truncated': tf.VarLenFeature(dtype=tf.int64),
        }
        items_to_handlers = {  # items assembled from the decoded data
            'image': slim.tfexample_decoder.Image('image/encoded', 'image/format'),
            'shape': slim.tfexample_decoder.Tensor('image/shape'),
            'object/bbox': slim.tfexample_decoder.BoundingBox(
                    ['ymin', 'xmin', 'ymax', 'xmax'], 'image/object/bbox/'),
            'object/label': slim.tfexample_decoder.Tensor('image/object/bbox/label'),
            'object/difficult': slim.tfexample_decoder.Tensor('image/object/bbox/difficult'),
            'object/truncated': slim.tfexample_decoder.Tensor('image/object/bbox/truncated'),
        }
        # carry out the decoding
        decoder = slim.tfexample_decoder.TFExampleDecoder(
            keys_to_features, items_to_handlers)
    
        labels_to_names = None
        # tf.gfile.Exists(os.path.join(dataset_dir, 'labels.txt'))
        if dataset_utils.has_labels(dataset_dir):
            labels_to_names = dataset_utils.read_label_file(dataset_dir)
        # else:
        #     labels_to_names = create_readable_names_for_imagenet_labels()
        #     dataset_utils.write_label_file(labels_to_names, dataset_dir)
    
        return slim.dataset.Dataset(
                data_sources=file_pattern,                    # TFR file name pattern
                reader=reader,                                # reader class
                decoder=decoder,                              # tensor decoder
                num_samples=split_to_sizes[split_name],       # number of samples
                items_to_descriptions=items_to_descriptions,  # descriptions of the decoder items
                num_classes=num_classes,                      # number of classes
                labels_to_names=labels_to_names               # dict {label id: class name, ...}
        )
    
    ''' items_to_descriptions:
        {'image': 'A color image of varying height and width.',
         'shape': 'Shape of the image',
         'object/bbox': 'A list of bounding boxes, one per each object.',
         'object/label': 'A list of labels, one per each object.',}
    '''
    

    One extra note: in the stored data, ymin, xmin, ymax and xmax are each kept with shape (n,) (n being the number of objects in the image), but after items_to_handlers runs, the new 'object/bbox' item has shape (n, 4). This matters for the whole chain of multi-object-detection processing that follows, so it is worth keeping in mind; a sketch of the shape change follows.
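
    As a minimal sketch of that shape change (an illustration of what the slim.tfexample_decoder.BoundingBox handler effectively produces, not the actual slim source):

    import tensorflow as tf

    # Four (n,) coordinate tensors, n = 2 objects in this toy example.
    ymin = tf.constant([0.1, 0.4])
    xmin = tf.constant([0.2, 0.5])
    ymax = tf.constant([0.3, 0.9])
    xmax = tf.constant([0.6, 0.8])

    # The handler effectively stacks them along a new last axis, yielding an
    # (n, 4) tensor in ['ymin', 'xmin', 'ymax', 'xmax'] order.
    bboxes = tf.stack([ymin, xmin, ymax, xmax], axis=1)  # shape (2, 4)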

    Fetching data from the TFR files

                with tf.name_scope(FLAGS.dataset_name + '_data_provider'):
                    provider = slim.dataset_data_provider.DatasetDataProvider(
                        dataset,  # DatasetDataProvider takes a slim.dataset.Dataset as its argument
                        num_readers=FLAGS.num_readers,
                        common_queue_capacity=20 * FLAGS.batch_size,
                        common_queue_min=10 * FLAGS.batch_size,
                        shuffle=True)
                # Get for SSD network: image, labels, bboxes.
                # provider.get pulls one decoded sample per listed item from the
                # common queue; assembling batch_size samples happens later
                [image, shape, glabels, gbboxes] = provider.get(['image', 'shape',
                                                                 'object/label',
                                                                 'object/bbox'])
    

    At this point the data has been fetched; after preprocessing it can enter the computation.

    Note that so far we have only decoded the image data and have not expanded its dimensions; that is, it is still a 3-D tensor (the batch dimension is added later, see the sketch below).
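
    A minimal sketch of where that 4th dimension appears, assuming standard TF1 queue batching (in the actual train_ssd_network.py the labels/bboxes are first encoded into fixed-shape per-anchor targets before batching; batch_size=32 here is an assumed value standing in for FLAGS.batch_size):

    import tensorflow as tf

    # Hypothetical: after preprocessing, `image` has a static (3, 300, 300)
    # shape (NCHW). tf.train.batch stacks batch_size such samples into a
    # (batch_size, 3, 300, 300) tensor.
    image_batch = tf.train.batch(
        [image],
        batch_size=32,
        num_threads=4,
        capacity=5 * 32)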

    II. Data Preprocessing

    Fetch the preprocessing function for the corresponding dataset and use it to process the data obtained in the previous subsection:

    image_preprocessing_fn = preprocessing_factory.get_preprocessing(
                preprocessing_name, is_training=True)
    
    # Pre-processing image, labels and bboxes.
    image, glabels, gbboxes = \
        image_preprocessing_fn(image, glabels, gbboxes,
                               out_shape=ssd_shape,  # (300,300)
                               data_format=DATA_FORMAT)  # 'NCHW'
    

    At times this layer-upon-layer calling feels pretty dumb... the next two steps are, once again, a call chain:

    def get_preprocessing(name, is_training=False):
        preprocessing_fn_map = {
            'ssd_300_vgg': ssd_vgg_preprocessing,
            'ssd_512_vgg': ssd_vgg_preprocessing,
        }
    
        if name not in preprocessing_fn_map:
            raise ValueError('Preprocessing name [%s] was not recognized' % name)
    
        def preprocessing_fn(image, labels, bboxes,
                             out_shape, data_format='NHWC', **kwargs):
            return preprocessing_fn_map[name].preprocess_image(
                image, labels, bboxes, out_shape, data_format=data_format,
                is_training=is_training, **kwargs)
        return preprocessing_fn
    
    
    def preprocess_image(image,
                         labels,
                         bboxes,
                         out_shape,
                         data_format,
                         is_training=False,
                         **kwargs):
        if is_training:
            return preprocess_for_train(image, labels, bboxes,
                                        out_shape=out_shape,
                                        data_format=data_format)
        else:
            return preprocess_for_eval(image, labels, bboxes,
                                       out_shape=out_shape,
                                       data_format=data_format,
                                       **kwargs)
    

    After that come the concrete preprocessing functions; in this post we only look at the training preprocessing.

    Overview of training-data preprocessing

    The rough flow is:

    conditionally crop a region out of the original image

    compute the overlap between the cropped region and each annotated box, keeping bboxes and labels according to a threshold

    scale the cropped image up to the network input size (the bboxes are normalized, so they need no rescaling)

    randomly flip (the bboxes must be flipped in sync)

    other preprocessing (does not involve the bboxes)

    return image, labels, bboxes

    def preprocess_for_train(image, labels, bboxes,
                             out_shape, data_format='NHWC',
                             scope='ssd_preprocessing_train'):
        """Preprocesses the given image for training.
        """
        fast_mode = False
        with tf.name_scope(scope, 'ssd_preprocessing_train', [image, labels, bboxes]):
            if image.get_shape().ndims != 3:
                raise ValueError('Input must be of size [height, width, C>0]')
            # Convert to float scaled [0, 1].
            if image.dtype != tf.float32:
                image = tf.image.convert_image_dtype(image, dtype=tf.float32)
            tf_summary_image(image, bboxes, 'image_with_bboxes')
            # the above guarantees the image is a 3-D tf.float32 tensor

            # (conditional) random crop; returns the filtered/adjusted labels (n,) and
            # bboxes (n, 4), plus the crop region's coords in the original image (4,)
            dst_image, labels, bboxes, distort_bbox = \
                distorted_bounding_box_crop(image, labels, bboxes,
                                            min_object_covered=MIN_OBJECT_COVERED,  # 0.25
                                            aspect_ratio_range=CROP_RATIO_RANGE)  # (0.6, 1.67)
    
            # Resize image to output size.
            dst_image = tf_image.resize_image(dst_image, out_shape,
                                              method=tf.image.ResizeMethod.BILINEAR,
                                              align_corners=False)
            tf_summary_image(dst_image, bboxes, 'image_shape_distorted')
    
            # Randomly flip the image horizontally.
            dst_image, bboxes = tf_image.random_flip_left_right(dst_image, bboxes)
    
            # Randomly distort the colors. There are 4 ways to do it.
            dst_image = apply_with_random_selector(
                    dst_image,
                    lambda x, ordering: distort_color(x, ordering, fast_mode),
                    num_cases=4)
            tf_summary_image(dst_image, bboxes, 'image_color_distorted')
    
            # Rescale to VGG input scale.
            image = dst_image * 255.
            image = tf_image_whitened(image, [_R_MEAN, _G_MEAN, _B_MEAN])
            # mean = tf.constant(means, dtype=image.dtype)
            # image = image - mean
    
            # Image data format.
            if data_format == 'NCHW':
                image = tf.transpose(image, perm=(2, 0, 1))
            # image ('CHW' here, 'HWC' otherwise), labels (n,), bboxes (n, 4)
            return image, labels, bboxes
    
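    The color distortion above goes through apply_with_random_selector, which this post does not list. As a sketch, the helper (borrowed from the inception preprocessing these files are based on) routes the image through one of num_cases distortion orderings chosen at random; treat the body below as a paraphrase rather than quoted source:

    import tensorflow as tf
    from tensorflow.python.ops import control_flow_ops

    def apply_with_random_selector(x, func, num_cases):
        """Apply func(x, case) for a case drawn uniformly from [0, num_cases)."""
        sel = tf.random_uniform([], maxval=num_cases, dtype=tf.int32)
        # switch() forwards x only down the branch whose predicate is True;
        # merge() returns the one branch that actually produced a value.
        return control_flow_ops.merge([
            func(control_flow_ops.switch(x, tf.equal(sel, case))[1], case)
            for case in range(num_cases)])[0]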

    Cropping the image and adjusting labels and bboxes

    The overall flow:

    call the built-in function, which guarantees the crop's size range and that some of the target objects are included, and returns the crop parameters

    crop the image (keeping the crop-position parameters)

    compute the overlap between the crop box and each ground-truth box, discard by threshold, and adjust the coordinates of the kept boxes

    def distorted_bounding_box_crop(image,
                                    labels,
                                    bboxes,
                                    min_object_covered=0.3,
                                    aspect_ratio_range=(0.9, 1.1),
                                    area_range=(0.1, 1.0),
                                    max_attempts=200,
                                    clip_bboxes=True,
                                    scope=None):
        """Generates cropped_image using a one of the bboxes randomly distorted.
    
        See `tf.image.sample_distorted_bounding_box` for more documentation.
    
        Args:
            image: 3-D Tensor of image (it will be converted to floats in [0, 1]).
            bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
                where each coordinate is [0, 1) and the coordinates are arranged
                as [ymin, xmin, ymax, xmax]. If num_boxes is 0 then it would use the whole
                image.
            min_object_covered: An optional `float`. Defaults to `0.1`. The cropped
                area of the image must contain at least this fraction of any bounding box
                supplied.
            aspect_ratio_range: An optional list of `floats`. The cropped area of the
                image must have an aspect ratio = width / height within this range.
            area_range: An optional list of `floats`. The cropped area of the image
            must contain a fraction of the supplied image within this range.
            max_attempts: An optional `int`. Number of attempts at generating a cropped
                region of the image of the specified constraints. After `max_attempts`
                failures, return the entire image.
            scope: Optional scope for name_scope.
        Returns:
            A tuple, a 3-D Tensor cropped_image and the distorted bbox
        """
        with tf.name_scope(scope, 'distorted_bounding_box_crop', [image, bboxes]):
            # fancy random crop
            # The bounding box coordinates are floats in `[0.0, 1.0]` relative to the width
            # and height of the underlying image.
            # 1-D, 1-D, [1, 1, 4]
            bbox_begin, bbox_size, distort_bbox = tf.image.sample_distorted_bounding_box(
                    tf.shape(image),
                    bounding_boxes=tf.expand_dims(bboxes, 0),  # [1, n, 4]
                    min_object_covered=min_object_covered,
                    aspect_ratio_range=aspect_ratio_range,
                    area_range=area_range,
                    max_attempts=max_attempts,
                    use_image_if_no_bounding_boxes=True)
            '''
            Returns:
                A tuple of `Tensor` objects (begin, size, bboxes).
    
            begin: A `Tensor`. Has the same type as `image_size`. 1-D, containing `[offset_height, offset_width, 0]`. 
                Provide as input to `tf.slice`.
            size: A `Tensor`. Has the same type as `image_size`. 1-D, containing `[target_height, target_width, -1]`. 
                Provide as input to `tf.slice`.
            bboxes: A `Tensor` of type `float32`. 3-D with shape `[1, 1, 4]` containing the distorted bounding box.
                Provide as input to `tf.image.draw_bounding_boxes`.
            '''
            # [4]
            distort_bbox = distort_bbox[0, 0]
    
            # Crop the image to the specified bounding box.
            cropped_image = tf.slice(image, bbox_begin, bbox_size)
            # Restore the shape since the dynamic slice loses 3rd dimension.
            cropped_image.set_shape([None, None, 3])  # <----- note: the static shape is set here
    
            # Update bounding boxes: resize and filter out.
            bboxes = tfe.bboxes_resize(distort_bbox, bboxes)  # [4], [n, 4]
            labels, bboxes = tfe.bboxes_filter_overlap(labels, bboxes,
                                                       threshold=BBOX_CROP_OVERLAP,  # 0.5
                                                       assign_negative=False)
            # return the randomly cropped image, the filtered/adjusted labels (n,) and bboxes (n, 4), and the crop's coords in the original image (4,)
            return cropped_image, labels, bboxes, distort_bbox
    

    Three key functions:

    tf.image.sample_distorted_bounding_box does the cropping (see the docs for usage): it crops out a sub-image, and the last return value gives that sub-image's coordinates

    bboxes_resize moves the box-coordinate origin to the crop box's top-left corner and makes the crop box's w/h the unit lengths of x/y (i.e. renormalizes)

    bboxes_filter_overlap computes the ratio overlap-area / original-box-area and discards the labels and bboxes that fall below a threshold

    We did not stress the second function earlier, but since every computation involving box coordinates assumes they are normalized to the image (all of TF's built-ins work this way), this step is necessary: it converts the coordinate system from the original image (note: the image, which is why the unit lengths differ so much) to the crop box and sets the unit lengths accordingly. A small worked example follows the three listings below.

    def bboxes_resize(bbox_ref, bboxes, name=None):
        # Tensors inputs.
        with tf.name_scope(name, 'bboxes_resize'):
            # Translate.
            # bbox_ref:['ymin', 'xmin', 'ymax', 'xmax']
            v = tf.stack([bbox_ref[0], bbox_ref[1], bbox_ref[0], bbox_ref[1]])
            bboxes = bboxes - v
            # Scale.
            s = tf.stack([bbox_ref[2] - bbox_ref[0],  # h
                          bbox_ref[3] - bbox_ref[1],  # w
                          bbox_ref[2] - bbox_ref[0],
                          bbox_ref[3] - bbox_ref[1]])
            bboxes = bboxes / s
            return bboxes
    
    def bboxes_filter_overlap(labels, bboxes,
                              threshold=0.5, assign_negative=False,
                              scope=None):
        """Filter out bounding boxes based on (relative )overlap with reference
        box [0, 0, 1, 1].  Remove completely bounding boxes, or assign negative
        labels to the one outside (useful for latter processing...).
    
        Return:
          labels, bboxes: Filtered (or newly assigned) elements.
        """
        with tf.name_scope(scope, 'bboxes_filter', [labels, bboxes]):
            # (N,) Tensor: positions whose box area is > 0 get the area ratio (intersection / original); non-positive areas get 0
            scores = bboxes_intersection(tf.constant([0, 0, 1, 1], bboxes.dtype),
                                         bboxes)
            mask = scores > threshold
            if assign_negative:  # keep every label and box; negate the labels whose overlap is insufficient
                labels = tf.where(mask, labels, -labels)  # labels passing the overlap test stay positive, the rest turn negative
            else:  # drop the labels and boxes whose overlap is insufficient
                labels = tf.boolean_mask(labels, mask)  # boolean mask, like numpy boolean array slicing
                bboxes = tf.boolean_mask(bboxes, mask)
            return labels, bboxes
    
    
    # Called by the function above: computes the ratio of each box's intersection (with the crop box) to its own area.
    def bboxes_intersection(bbox_ref, bboxes, name=None):
        """Compute relative intersection between a reference box and a
        collection of bounding boxes. Namely, compute the quotient between
        intersection area and box area.
    
        Args:
          bbox_ref: (N, 4) or (4,) Tensor with reference bounding box(es).
          bboxes: (N, 4) Tensor, collection of bounding boxes.
        Return:
          (N,) Tensor with relative intersection.
        """
        with tf.name_scope(name, 'bboxes_intersection'):
            # Should be more efficient to first transpose.
            bboxes = tf.transpose(bboxes)
            bbox_ref = tf.transpose(bbox_ref)
            # Intersection bbox and volume.
            int_ymin = tf.maximum(bboxes[0], bbox_ref[0])
            int_xmin = tf.maximum(bboxes[1], bbox_ref[1])
            int_ymax = tf.minimum(bboxes[2], bbox_ref[2])
            int_xmax = tf.minimum(bboxes[3], bbox_ref[3])
            h = tf.maximum(int_ymax - int_ymin, 0.)
            w = tf.maximum(int_xmax - int_xmin, 0.)
            # Volumes.
            inter_vol = h * w  # each box's area inside [0,0,1,1]
            bboxes_vol = (bboxes[2] - bboxes[0]) * (bboxes[3] - bboxes[1])  # each box's own area
            scores = tfe_math.safe_divide(inter_vol, bboxes_vol, 'intersection')
            # from tensorflow.python.ops import math_ops
            # safe_divide: where the area is > 0 return the ratio, otherwise return 0
            # tf.where(math_ops.greater(bboxes_vol, 0),  # bool tensor: is the area > 0
            #          math_ops.divide(inter_vol, bboxes_vol),
            #          tf.zeros_like(inter_vol), name=name)
            return scores
    
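    A small worked example of what the two box utilities compute, with made-up numbers (a plain NumPy re-derivation, not the repo's code):

    import numpy as np

    # Crop box and one ground-truth box, both normalized to the original image,
    # in [ymin, xmin, ymax, xmax] order.
    crop = np.array([0.2, 0.2, 0.8, 0.8])
    box = np.array([0.5, 0.5, 0.9, 0.9])

    # bboxes_resize: move the origin to the crop's top-left, rescale by crop h/w.
    h, w = crop[2] - crop[0], crop[3] - crop[1]  # 0.6, 0.6
    box_c = (box - np.array([crop[0], crop[1], crop[0], crop[1]])) / np.array([h, w, h, w])
    print(box_c)  # [0.5, 0.5, 1.1667, 1.1667]: the box now overflows the crop

    # bboxes_filter_overlap: intersection with [0, 0, 1, 1] over the box's own area.
    inter_h = max(min(box_c[2], 1.0) - max(box_c[0], 0.0), 0.0)  # 0.5
    inter_w = max(min(box_c[3], 1.0) - max(box_c[1], 0.0), 0.0)  # 0.5
    score = (inter_h * inter_w) / ((box_c[2] - box_c[0]) * (box_c[3] - box_c[1]))
    print(score)  # ~0.5625 > BBOX_CROP_OVERLAP (0.5), so this box and its label are kept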

    The other preprocessing functions have nothing that needs special attention, so I won't go into them; check the source yourself.

    With that, data preprocessing is complete. Below is the local code covering everything from fetching data out of the TFR to the end of preprocessing:

            with tf.device(deploy_config.inputs_device()):
                with tf.name_scope(FLAGS.dataset_name + '_data_provider'):
                    provider = slim.dataset_data_provider.DatasetDataProvider(
                    dataset,  # DatasetDataProvider takes a slim.dataset.Dataset as its argument
                        num_readers=FLAGS.num_readers,
                        common_queue_capacity=20 * FLAGS.batch_size,
                        common_queue_min=10 * FLAGS.batch_size,
                        shuffle=True)
                # Get for SSD network: image, labels, bboxes.
                # provider.get pulls one decoded sample per listed item from the
                # common queue; assembling batch_size samples happens later
                [image, shape, glabels, gbboxes] = provider.get(['image', 'shape',
                                                                 'object/label',
                                                                 'object/bbox'])
                # Pre-processing image, labels and bboxes.
                # image 'CHW', labels (n,), bboxes (n, 4)
                image, glabels, gbboxes = \
                    image_preprocessing_fn(image, glabels, gbboxes,
                                           out_shape=ssd_shape,  # (300,300)
                                           data_format=DATA_FORMAT)  # 'NCHW'
    
    Original post: https://www.cnblogs.com/hellcat/p/9341921.html