  • 『TensorFlow』SSD Source Code Study, Part 5: TFR Data Reading & Preprocessing

    Forked project repository: SSD

    I. Reading TFR Data

    Creating the slim.dataset.Dataset object

    In train_ssd_network.py the data is obtained as follows; the first thing needed is a slim.dataset.Dataset object:

    # Select the dataset.
    # the three flags: dataset name (e.g. 'imagenet'), split name ('train'), and the TFR file directory
    # TFR file naming scheme: 'voc_2012_%s_*.tfrecord', with %s being train or test
    dataset = dataset_factory.get_dataset(
        FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)
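
    For instance, with hypothetical flag values (the directory name here is made up), the call resolves to:

    # assumed example values, for illustration only
    dataset = dataset_factory.get_dataset('pascalvoc_2012', 'train', './tfrecords/')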
    

    The retrieval goes through a chain of rather bloated calls. The intermediate functions are listed below, invoked from top to bottom:

    def get_dataset(name, split_name, dataset_dir, file_pattern=None, reader=None):
        """
        Returns:
            A `Dataset` class.
        Raises:
            ValueError: If the dataset `name` is unknown.
        """
        if name not in datasets_map:
            raise ValueError('Name of dataset unknown %s' % name)
        # pascalvoc_2012.get_split
        return datasets_map[name].get_split(split_name,
                                            dataset_dir,
                                            file_pattern,
                                            reader)
    
    
    def get_split(split_name, dataset_dir, file_pattern=None, reader=None):
        """
        Returns:
          A `Dataset` namedtuple.
        Raises:
            ValueError: if `split_name` is not a valid train/test split.
        """
        if not file_pattern:
        file_pattern = FILE_PATTERN  # file names must match 'voc_2012_%s_*.tfrecord'
        return pascalvoc_common.get_split(split_name, dataset_dir,
                                          file_pattern, reader,
                                          SPLITS_TO_SIZES,  # {'train': 17125,}
                                          ITEMS_TO_DESCRIPTIONS,
                                          NUM_CLASSES  # 20
                                          )
        """
        ITEMS_TO_DESCRIPTIONS = {
        'image': 'A color image of varying height and width.',
        'shape': 'Shape of the image',
        'object/bbox': 'A list of bounding boxes, one per each object.',
        'object/label': 'A list of labels, one per each object.',
        }
        """
    

    The final call builds the slim.dataset.Dataset (analyzed in 『TensorFlow』Reading Data from Disk); in fact, any arguments satisfying slim.dataset.Dataset's signature would do:

    def get_split(split_name, dataset_dir, file_pattern, reader,
                  split_to_sizes, items_to_descriptions, num_classes):
        """Gets a dataset tuple with instructions for reading Pascal VOC dataset.
    
        Args:
          split_name: A train/test split name.
          dataset_dir: The base directory of the dataset sources.
          file_pattern: The file pattern to use when matching the dataset sources.
            It is assumed that the pattern contains a '%s' string so that the split
            name can be inserted.
          reader: The TensorFlow reader type.
    
        Returns:
          A `Dataset` namedtuple.
    
        Raises:
            ValueError: if `split_name` is not a valid train/test split.
        """
        # 'train'
        if split_name not in split_to_sizes:
            raise ValueError('split name %s was not recognized.' % split_name)
        file_pattern = os.path.join(dataset_dir, file_pattern % split_name)
    
        # Allowing None in the signature so that dataset_factory can use the default.
        if reader is None:
            reader = tf.TFRecordReader
        # Features in Pascal VOC TFRecords.
        keys_to_features = {  # how the TFR file's features are parsed
            'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
            'image/format': tf.FixedLenFeature((), tf.string, default_value='jpeg'),
            'image/height': tf.FixedLenFeature([1], tf.int64),
            'image/width': tf.FixedLenFeature([1], tf.int64),
            'image/channels': tf.FixedLenFeature([1], tf.int64),
            'image/shape': tf.FixedLenFeature([3], tf.int64),
            'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32),
            'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32),
            'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32),
            'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32),
            'image/object/bbox/label': tf.VarLenFeature(dtype=tf.int64),
            'image/object/bbox/difficult': tf.VarLenFeature(dtype=tf.int64),
            'image/object/bbox/truncated': tf.VarLenFeature(dtype=tf.int64),
        }
        items_to_handlers = {  # items decoded from the raw features
            'image': slim.tfexample_decoder.Image('image/encoded', 'image/format'),
            'shape': slim.tfexample_decoder.Tensor('image/shape'),
            'object/bbox': slim.tfexample_decoder.BoundingBox(
                    ['ymin', 'xmin', 'ymax', 'xmax'], 'image/object/bbox/'),
            'object/label': slim.tfexample_decoder.Tensor('image/object/bbox/label'),
            'object/difficult': slim.tfexample_decoder.Tensor('image/object/bbox/difficult'),
            'object/truncated': slim.tfexample_decoder.Tensor('image/object/bbox/truncated'),
        }
        # the decoder that carries out the decoding
        decoder = slim.tfexample_decoder.TFExampleDecoder(
            keys_to_features, items_to_handlers)
    
        labels_to_names = None
        # tf.gfile.Exists(os.path.join(dataset_dir, 'labels.txt'))
        if dataset_utils.has_labels(dataset_dir):
            labels_to_names = dataset_utils.read_label_file(dataset_dir)
        # else:
        #     labels_to_names = create_readable_names_for_imagenet_labels()
        #     dataset_utils.write_label_file(labels_to_names, dataset_dir)
    
        return slim.dataset.Dataset(
                data_sources=file_pattern,                    # TFR file name pattern
                reader=reader,                                # reader class
                decoder=decoder,                              # tensor decoder
                num_samples=split_to_sizes[split_name],       # number of samples
                items_to_descriptions=items_to_descriptions,  # descriptions of the decoder items
                num_classes=num_classes,                      # number of classes
                labels_to_names=labels_to_names               # dict {label id: class name, ...}
        )
    
    ''' items_to_descriptions:
        {'image': 'A color image of varying height and width.',
         'shape': 'Shape of the image',
         'object/bbox': 'A list of bounding boxes, one per each object.',
         'object/label': 'A list of labels, one per each object.',}
    '''
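
    To see the decoder in action, here is a minimal sketch (assuming the decoder and file_pattern built above; DatasetDataProvider does roughly this internally, using a parallel reader):

    # a sketch of the decoding path, not the provider's actual implementation
    filename_queue = tf.train.string_input_producer(tf.gfile.Glob(file_pattern))
    _, serialized = tf.TFRecordReader().read(filename_queue)
    # map the serialized tf.Example onto the requested items
    image, bboxes = decoder.decode(serialized, items=['image', 'object/bbox'])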
    

    One extra note: in the stored data, ymin, xmin, ymax and xmax are each stored with shape (n,) (n being the number of objects in the image), but after items_to_handlers the new handler item object/bbox has shape (n, 4). Since this feeds a whole chain of later multi-object detection steps, it is worth keeping in mind; a sketch of the stacking follows.
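
    A minimal sketch of what the BoundingBox handler effectively computes (an illustration only; the slim source additionally handles sparse tensors):

    def stack_bbox_sides(ymin, xmin, ymax, xmax):
        # each input: an (n,) float32 tensor decoded from the VarLenFeatures above
        sides = tf.stack([ymin, xmin, ymax, xmax], axis=0)  # (4, n)
        return tf.transpose(sides)                          # (n, 4)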

    Fetching data from the TFR files

                with tf.name_scope(FLAGS.dataset_name + '_data_provider'):
                    provider = slim.dataset_data_provider.DatasetDataProvider(
                        dataset,  # DatasetDataProvider takes a slim.dataset.Dataset as argument
                        num_readers=FLAGS.num_readers,
                        common_queue_capacity=20 * FLAGS.batch_size,
                        common_queue_min=10 * FLAGS.batch_size,
                        shuffle=True)
                # Get for SSD network: image, labels, bboxes.
                # DatasetDataProvider fetches tensors by TFR item name (one example at a time; batching comes later)
                [image, shape, glabels, gbboxes] = provider.get(['image', 'shape',
                                                                 'object/label',
                                                                 'object/bbox'])
    

    At this point the data has been obtained; after preprocessing it can enter the computation.

    Note that up to this point we have only decoded the image data and have not expanded its dimensions: the image tensor is still 3-D.
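
    The batch dimension appears only later, when the per-example tensors are handed to a batching queue. A hedged sketch (the batch_size and queue parameters are placeholders; batching also requires the fixed shape that preprocessing establishes, and train_ssd_network.py actually batches the encoded targets rather than these raw tensors):

    # image: a 3-D per-example tensor -> image_batch: (32, H, W, C)
    image_batch = tf.train.batch([image], batch_size=32,
                                 num_threads=4, capacity=5 * 32)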

    II. Data Preprocessing

    Obtain the preprocessing function for the corresponding dataset and apply it to the data fetched in the previous subsection:

    image_preprocessing_fn = preprocessing_factory.get_preprocessing(
                preprocessing_name, is_training=True)
    
    # Pre-processing image, labels and bboxes.
    image, glabels, gbboxes = \
        image_preprocessing_fn(image, glabels, gbboxes,
                               out_shape=ssd_shape,  # (300,300)
                               data_format=DATA_FORMAT)  # 'NCHW'
    

    Sometimes this layer-upon-layer of calls feels really dumb... the next two steps are, once again, a call chain:

    def get_preprocessing(name, is_training=False):
        preprocessing_fn_map = {
            'ssd_300_vgg': ssd_vgg_preprocessing,
            'ssd_512_vgg': ssd_vgg_preprocessing,
        }
    
        if name not in preprocessing_fn_map:
            raise ValueError('Preprocessing name [%s] was not recognized' % name)
    
        def preprocessing_fn(image, labels, bboxes,
                             out_shape, data_format='NHWC', **kwargs):
            return preprocessing_fn_map[name].preprocess_image(
                image, labels, bboxes, out_shape, data_format=data_format,
                is_training=is_training, **kwargs)
        return preprocessing_fn
    
    
    def preprocess_image(image,
                         labels,
                         bboxes,
                         out_shape,
                         data_format,
                         is_training=False,
                         **kwargs):
        if is_training:
            return preprocess_for_train(image, labels, bboxes,
                                        out_shape=out_shape,
                                        data_format=data_format)
        else:
            return preprocess_for_eval(image, labels, bboxes,
                                       out_shape=out_shape,
                                       data_format=data_format,
                                       **kwargs)
    

    Next come the concrete preprocessing functions; this post only looks at training preprocessing.

    Training-data preprocessing overview

    The rough flow:

    conditionally crop a region of the original image

    compute the overlap between the cropped region and each annotated box, keeping bboxes and labels according to a threshold

    scale the cropped patch to the network input size (the bboxes are normalized, so no rescaling is needed)

    random flip (the bboxes must flip in sync)

    other preprocessing (nothing touching the bboxes)

    return image, labels, bboxes

    def preprocess_for_train(image, labels, bboxes,
                             out_shape, data_format='NHWC',
                             scope='ssd_preprocessing_train'):
        """Preprocesses the given image for training.
        """
        fast_mode = False
        with tf.name_scope(scope, 'ssd_preprocessing_train', [image, labels, bboxes]):
            if image.get_shape().ndims != 3:
                raise ValueError('Input must be of size [height, width, C>0]')
            # Convert to float scaled [0, 1].
            if image.dtype != tf.float32:
                image = tf.image.convert_image_dtype(image, dtype=tf.float32)
            tf_summary_image(image, bboxes, 'image_with_bboxes')
            # the code above guarantees a 3-D image in tf.float32 format
    
            # (conditionally) random-crop; filter and adjust labels (n,) and bboxes (n, 4);
            # distort_bbox (4,) holds the crop's coordinates in the original image
            dst_image, labels, bboxes, distort_bbox = \
                distorted_bounding_box_crop(image, labels, bboxes,
                                            min_object_covered=MIN_OBJECT_COVERED,  # 0.25
                                            aspect_ratio_range=CROP_RATIO_RANGE)  # (0.6, 1.67)
    
            # Resize image to output size.
            dst_image = tf_image.resize_image(dst_image, out_shape,
                                              method=tf.image.ResizeMethod.BILINEAR,
                                              align_corners=False)
            tf_summary_image(dst_image, bboxes, 'image_shape_distorted')
    
            # Randomly flip the image horizontally.
            dst_image, bboxes = tf_image.random_flip_left_right(dst_image, bboxes)
    
            # Randomly distort the colors. There are 4 ways to do it.
            dst_image = apply_with_random_selector(
                    dst_image,
                    lambda x, ordering: distort_color(x, ordering, fast_mode),
                    num_cases=4)
            tf_summary_image(dst_image, bboxes, 'image_color_distorted')
    
            # Rescale to VGG input scale.
            image = dst_image * 255.
            image = tf_image_whitened(image, [_R_MEAN, _G_MEAN, _B_MEAN])
            # mean = tf.constant(means, dtype=image.dtype)
            # image = image - mean
    
            # Image data format.
            if data_format == 'NCHW':
                image = tf.transpose(image, perm=(2, 0, 1))
            # image HWC (or CHW when data_format == 'NCHW'), labels (n,), bboxes (n, 4)
            return image, labels, bboxes
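
    apply_with_random_selector is not shown in the excerpt above; for reference, a sketch of the helper as it appears in the Inception-style preprocessing this file borrows from (assuming the TF1 control-flow ops):

    from tensorflow.python.ops import control_flow_ops

    def apply_with_random_selector(x, func, num_cases):
        """Apply func(x, case) for a case sampled uniformly from [0, num_cases)."""
        sel = tf.random_uniform([], maxval=num_cases, dtype=tf.int32)
        # route the real x through only the selected branch, then merge the branches
        return control_flow_ops.merge(
                [func(control_flow_ops.switch(x, tf.equal(sel, case))[1], case)
                 for case in range(num_cases)])[0]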
    

    Cropping the image and adjusting labels and bboxes

    The overall flow:

    call the built-in op, which constrains the crop's size range, guarantees it covers some of the target objects, and returns the crop parameters

    crop the image (keeping the crop-position parameters)

    compute the overlap between the crop box and each ground-truth box, discard those below a threshold, and adjust the coordinates of the kept boxes

    def distorted_bounding_box_crop(image,
                                    labels,
                                    bboxes,
                                    min_object_covered=0.3,
                                    aspect_ratio_range=(0.9, 1.1),
                                    area_range=(0.1, 1.0),
                                    max_attempts=200,
                                    clip_bboxes=True,
                                    scope=None):
        """Generates cropped_image using a one of the bboxes randomly distorted.
    
        See `tf.image.sample_distorted_bounding_box` for more documentation.
    
        Args:
            image: 3-D Tensor of image (it will be converted to floats in [0, 1]).
        bboxes: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
                where each coordinate is [0, 1) and the coordinates are arranged
                as [ymin, xmin, ymax, xmax]. If num_boxes is 0 then it would use the whole
                image.
            min_object_covered: An optional `float`. Defaults to `0.1`. The cropped
                area of the image must contain at least this fraction of any bounding box
                supplied.
            aspect_ratio_range: An optional list of `floats`. The cropped area of the
                image must have an aspect ratio = width / height within this range.
        area_range: An optional list of `floats`. The cropped area of the image
            must contain a fraction of the supplied image within this range.
            max_attempts: An optional `int`. Number of attempts at generating a cropped
                region of the image of the specified constraints. After `max_attempts`
                failures, return the entire image.
            scope: Optional scope for name_scope.
        Returns:
            A tuple, a 3-D Tensor cropped_image and the distorted bbox
        """
        with tf.name_scope(scope, 'distorted_bounding_box_crop', [image, bboxes]):
            # the fancy random crop
            # The bounding box coordinates are floats in `[0.0, 1.0]` relative to the width
            # and height of the underlying image.
            # 1-D, 1-D, [1, 1, 4]
            bbox_begin, bbox_size, distort_bbox = tf.image.sample_distorted_bounding_box(
                    tf.shape(image),
                    bounding_boxes=tf.expand_dims(bboxes, 0),  # [1, n, 4]
                    min_object_covered=min_object_covered,
                    aspect_ratio_range=aspect_ratio_range,
                    area_range=area_range,
                    max_attempts=max_attempts,
                    use_image_if_no_bounding_boxes=True)
            '''
            Returns:
                A tuple of `Tensor` objects (begin, size, bboxes).
    
            begin: A `Tensor`. Has the same type as `image_size`. 1-D, containing `[offset_height, offset_width, 0]`. 
                Provide as input to `tf.slice`.
            size: A `Tensor`. Has the same type as `image_size`. 1-D, containing `[target_height, target_width, -1]`. 
                Provide as input to `tf.slice`.
            bboxes: A `Tensor` of type `float32`. 3-D with shape `[1, 1, 4]` containing the distorted bounding box.
                Provide as input to `tf.image.draw_bounding_boxes`.
            '''
            # [4]
            distort_bbox = distort_bbox[0, 0]
    
            # Crop the image to the specified bounding box.
            cropped_image = tf.slice(image, bbox_begin, bbox_size)
            # Restore the shape since the dynamic slice loses 3rd dimension.
            cropped_image.set_shape([None, None, 3])  # <----- the static shape is set here
    
            # Update bounding boxes: resize and filter out.
            bboxes = tfe.bboxes_resize(distort_bbox, bboxes)  # [4], [n, 4]
            labels, bboxes = tfe.bboxes_filter_overlap(labels, bboxes,
                                                       threshold=BBOX_CROP_OVERLAP,  # 0.5
                                                       assign_negative=False)
            # return: the randomly cropped image, the filtered/adjusted labels (n,) and bboxes (n, 4), and the crop's coordinates in the original image (4,)
            return cropped_image, labels, bboxes, distort_bbox
    

    Three key functions:

    tf.image.sample_distorted_bounding_box: the crop itself; see the docs for usage. It crops a sub-image, and the last return value holds the sub-image's coordinates.

    bboxes_resize: moves the box-coordinate origin to the crop box's top-left corner and sets the x/y unit lengths to the crop box's w/h (renormalization).

    bboxes_filter_overlap: computes the ratio of the overlap area to the original box area, discarding labels and bboxes that fall below the threshold.

    The second function got no emphasis earlier, but since all box-coordinate computations run on image-normalized coordinates (as all the TF built-ins do), this step is necessary: it converts the coordinate system from the original image (note: the image, which is why the two unit lengths differ so much) to the crop box and resets the unit lengths. A hand-checked example follows the function below.

    def bboxes_resize(bbox_ref, bboxes, name=None):
        # Tensors inputs.
        with tf.name_scope(name, 'bboxes_resize'):
            # Translate.
            # bbox_ref:['ymin', 'xmin', 'ymax', 'xmax']
            v = tf.stack([bbox_ref[0], bbox_ref[1], bbox_ref[0], bbox_ref[1]])
            bboxes = bboxes - v
            # Scale.
            s = tf.stack([bbox_ref[2] - bbox_ref[0],  # h
                          bbox_ref[3] - bbox_ref[1],  # w
                          bbox_ref[2] - bbox_ref[0],
                          bbox_ref[3] - bbox_ref[1]])
            bboxes = bboxes / s
            return bboxes
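
    To make the renormalization concrete, a tiny hand-checked example (the numbers are made up): a crop box covering [0.2, 0.2, 0.8, 0.8] of the original image, and a single object box at [0.3, 0.3, 0.5, 0.5]:

    crop = tf.constant([0.2, 0.2, 0.8, 0.8])    # hypothetical distort_bbox
    boxes = tf.constant([[0.3, 0.3, 0.5, 0.5]])
    with tf.Session() as sess:
        print(sess.run(bboxes_resize(crop, boxes)))
        # -> [[0.1667 0.1667 0.5 0.5]], i.e. (0.3 - 0.2) / 0.6 and (0.5 - 0.2) / 0.6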
    
    def bboxes_filter_overlap(labels, bboxes,
                              threshold=0.5, assign_negative=False,
                              scope=None):
        """Filter out bounding boxes based on (relative )overlap with reference
        box [0, 0, 1, 1].  Remove completely bounding boxes, or assign negative
        labels to the one outside (useful for latter processing...).
    
        Return:
          labels, bboxes: Filtered (or newly assigned) elements.
        """
        with tf.name_scope(scope, 'bboxes_filter', [labels, bboxes]):
            # (N,) Tensor: where a box's area is positive, the ratio (intersection with [0,0,1,1]) / (box area); 0 elsewhere
            scores = bboxes_intersection(tf.constant([0, 0, 1, 1], bboxes.dtype),
                                         bboxes)
            mask = scores > threshold
            if assign_negative:  # keep every label and box; negate labels with insufficient overlap
                labels = tf.where(mask, labels, -labels)  # enough overlap -> positive label, else negative
            else:  # drop the labels and boxes with insufficient overlap
                labels = tf.boolean_mask(labels, mask)  # boolean mask, like NumPy boolean array indexing
                bboxes = tf.boolean_mask(bboxes, mask)
            return labels, bboxes
    
    
    # Called by the function above: computes the ratio of the intersection area (with the crop frame) to the original box area.
    def bboxes_intersection(bbox_ref, bboxes, name=None):
        """Compute relative intersection between a reference box and a
        collection of bounding boxes. Namely, compute the quotient between
        intersection area and box area.
    
        Args:
          bbox_ref: (N, 4) or (4,) Tensor with reference bounding box(es).
          bboxes: (N, 4) Tensor, collection of bounding boxes.
        Return:
          (N,) Tensor with relative intersection.
        """
        with tf.name_scope(name, 'bboxes_intersection'):
            # Should be more efficient to first transpose.
            bboxes = tf.transpose(bboxes)
            bbox_ref = tf.transpose(bbox_ref)
            # Intersection bbox and volume.
            int_ymin = tf.maximum(bboxes[0], bbox_ref[0])
            int_xmin = tf.maximum(bboxes[1], bbox_ref[1])
            int_ymax = tf.minimum(bboxes[2], bbox_ref[2])
            int_xmax = tf.minimum(bboxes[3], bbox_ref[3])
            h = tf.maximum(int_ymax - int_ymin, 0.)
            w = tf.maximum(int_xmax - int_xmin, 0.)
            # Volumes.
            inter_vol = h * w  # each box's area inside [0, 0, 1, 1]
            bboxes_vol = (bboxes[2] - bboxes[0]) * (bboxes[3] - bboxes[1])  # each box's own area
            scores = tfe_math.safe_divide(inter_vol, bboxes_vol, 'intersection')
            # from tensorflow.python.ops import math_ops
            # where the box area is > 0 return the area ratio, else return 0:
            # tf.where(math_ops.greater(bboxes_vol, 0),  # bool: is the area > 0?
            #          math_ops.divide(inter_vol, bboxes_vol),
            #          tf.zeros_like(inter_vol), name=name)
            return scores
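
    And a hand-checked example for the filter (made-up boxes, already in crop coordinates):

    boxes = tf.constant([[0.0, -0.2, 1.0, 0.8],   # 80% inside [0,0,1,1] -> kept
                         [0.2,  1.1, 0.6, 1.5]])  # entirely outside     -> dropped
    labels = tf.constant([3, 5], dtype=tf.int64)
    kept_labels, kept_boxes = bboxes_filter_overlap(labels, boxes, threshold=0.5)
    # kept_labels evaluates to [3], kept_boxes to [[0.0, -0.2, 1.0, 0.8]]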
    

    The remaining preprocessing functions need no special attention; check the source yourself.

    With that, data preprocessing is complete. Here is the full local snippet, from fetching the data out of the TFR through the finished preprocessing:

            with tf.device(deploy_config.inputs_device()):
                with tf.name_scope(FLAGS.dataset_name + '_data_provider'):
                    provider = slim.dataset_data_provider.DatasetDataProvider(
                        dataset,  # DatasetDataProvider takes a slim.dataset.Dataset as argument
                        num_readers=FLAGS.num_readers,
                        common_queue_capacity=20 * FLAGS.batch_size,
                        common_queue_min=10 * FLAGS.batch_size,
                        shuffle=True)
                # Get for SSD network: image, labels, bboxes.
                # DatasetDataProvider fetches tensors by TFR item name (one example at a time; batching comes later)
                [image, shape, glabels, gbboxes] = provider.get(['image', 'shape',
                                                                 'object/label',
                                                                 'object/bbox'])
                # Pre-processing image, labels and bboxes.
                # image 'CHW', labels (n,), bboxes (n, 4)
                image, glabels, gbboxes = \
                    image_preprocessing_fn(image, glabels, gbboxes,
                                           out_shape=ssd_shape,  # (300,300)
                                           data_format=DATA_FORMAT)  # 'NCHW'
    
  • Original post: https://www.cnblogs.com/hellcat/p/9341921.html