  • Dataset splitting, format conversion, visualization, and error checking

    Here are several commonly used dataset-processing snippets. Remember to change the paths to your own: ann_dir is the path to the COCO json annotations and img_dir is the image folder. If Chinese printed in the PyCharm console comes out garbled, switch every encoding option in PyCharm to UTF-8 (Settings -> Editor -> File Encodings); change everything that can be set to UTF-8.

    1️⃣ Some datasets store their Chinese class names as Unicode escape sequences. We record an id for each name, then convert the names in the annotation file to ids.

    # -*- coding: utf-8 -*-
    import json
    import os
    import random
    import time
    import shutil
    import glob
    
    # Defect class names (in Chinese); a defect's id is its index into this list.
    category=['无瑕疵','花板跳', '水渍', '星跳', '浆斑', '油渍', '烧毛痕', '死皱', '筘路', '浪纹档', '三丝', '跳纱', '双经', '修痕',
              '污渍', '百脚', '松经', '跳花', '吊经', '纬纱不良', '断氨纶', '双纬', '粗维', '磨痕', '云织', '整经结', '稀密档', '断经',
              '粗经', '纬缩', '色差档', '毛粒', '破洞', '结头', '轧痕']
    
    root_path=os.getcwd()
    ann_dir=os.path.join(root_path,"smartdiagnosisofclothflaw_round1train1_datasets",
                                 "guangdong1_round1_train1_20190818","Annotations")
    img_dir=os.path.join(root_path,"smartdiagnosisofclothflaw_round1train1_datasets",
                                 "guangdong1_round1_train1_20190818","defect_Images")
    
    # Fraction of images used for training
    train_percent = 0.8
    
    #####################################################################################
    #####                 Convert the dataset's Chinese names to numeric ids
    #####################################################################################
    def unicode2id():
        ann_file=os.path.join(ann_dir,"anno_train.json")
        print(ann_file)
    
        # # Print every category that appears in the training annotations.
        # category_temp=set()
        # with open(ann_file, 'r', encoding='unicode_escape') as f:
        #     json_data = json.load(f)
        #     for i in json_data:
        #         category_temp.add(i['defect_name'])
        # print(category_temp)
        # print(len(category_temp))
    
        data1=[]
        # Replace each Chinese defect name with its numeric id (its index in the category list)
        with open(ann_file, 'r', encoding='unicode_escape') as f:
            json_data = json.load(f)
            for i in json_data:
                data1.append({'name':i['name'],'defect_name':category.index(i['defect_name']),'bbox':i['bbox']})
    
        with open(os.path.join(ann_dir,'data.json'), 'w') as f:
             json.dump(data1, f)
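
    Because an id is just the index into the category list, the reverse lookup (e.g. for later visualization) is a one-liner. A minimal sketch, assuming the category list defined at the top of the file:

    # An id is simply the position of the name in the category list.
    id2name = {i: name for i, name in enumerate(category)}
    print(category.index('水渍'))    # -> 2
    print(id2name[2])                # -> '水渍'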
    

    2️⃣ Split the COCO dataset into train and val

    #####################################################################################
    #####                    Split the COCO dataset into train and val
    #####################################################################################
    def coco_dataset_split():
        time_start = time.time()
    
        # Create the output folders
        if not os.path.exists(os.path.join(root_path, "COCO2017")):
            os.makedirs(os.path.join(root_path, "COCO2017"))
        if not os.path.exists(os.path.join(root_path, "COCO2017","annotations")):
            os.makedirs(os.path.join(root_path,"COCO2017", "annotations"))
        if not os.path.exists(os.path.join(root_path, "COCO2017","train2017")):
            os.makedirs(os.path.join(root_path,"COCO2017", "train2017"))
        if not os.path.exists(os.path.join(root_path,"COCO2017", "val2017")):
            os.makedirs(os.path.join(root_path,"COCO2017", "val2017"))
    
        # Output paths
        save_img_train_dir = os.path.join(root_path, "COCO2017", "train2017")
        save_img_val_dir = os.path.join(root_path, "COCO2017", "val2017")
        save_ann_train_file = os.path.join(root_path, "COCO2017", "annotations", "instances_train2017.json")
        save_ann_val_file = os.path.join(root_path, "COCO2017", "annotations", "instances_val2017.json")
    
        # List the images and count them
        images_list = os.listdir(img_dir)
        images_num = len(images_list)
    
        train_num = int(images_num * train_percent)
        val_num=images_num-train_num
        train_list = random.sample(images_list, train_num)
        val_list = list(set(images_list) - set(train_list))
        print("| Images num: ",images_num)
        print("| Train num: ",train_num)
        print("| Val num: ",val_num)
    
        # Copy the images.
        for image_name in train_list:
            shutil.copy(os.path.join(img_dir, image_name), os.path.join(save_img_train_dir, image_name))
        for image_name in val_list:
            shutil.copy(os.path.join(img_dir, image_name), os.path.join(save_img_val_dir, image_name))
    
        ann_path=os.path.join(ann_dir,"anno_train.json")
    
        # Extract and split the annotations
        train2017=[]
        val2017=[]
        with open(ann_path, 'r', encoding='unicode_escape') as fp:
            json_data = json.load(fp)
            for i in json_data:
                if i['name'] not in val_list:
                    train2017.append({'name':i['name'],'defect_name':category.index(i['defect_name']),'bbox':i['bbox']})
                else:
                    val2017.append({'name':i['name'],'defect_name':category.index(i['defect_name']),'bbox':i['bbox']})
        # Write the annotations
        with open(save_ann_train_file, 'w') as fp:
            json.dump(train2017, fp)
        with open(save_ann_val_file, 'w') as fp:
            json.dump(val2017, fp)
    
        time_end = time.time()
        cost_time=time_end-time_start
        print("| Cost time: ",cost_time//60//60,"hour",cost_time//60%60,"min",cost_time%60,"s")
    

    3️⃣ Convert the COCO dataset to a VOC dataset. Copying the images takes a while, so be patient. To keep things simple, the copy loop has no progress display; if you want one, wrap the loop with tqdm, as in the sketch after the function below.

    #####################################################################################
    #####                    Convert the COCO dataset to a VOC dataset
    #####################################################################################
    def coco2voc():
        from lxml.etree import Element, SubElement, tostring
        from xml.dom.minidom import parseString
        from PIL import Image
    
        # Create the output folders
        if not os.path.exists(os.path.join(root_path, "VOCdevkit", "VOC2012")):
            os.makedirs(os.path.join(root_path, "VOCdevkit", "VOC2012"))
            os.makedirs(os.path.join(root_path, "VOCdevkit", "VOC2012", "Annotations"))
            os.makedirs(os.path.join(root_path, "VOCdevkit", "VOC2012", "ImageSets"))
            os.makedirs(os.path.join(root_path, "VOCdevkit", "VOC2012", "ImageSets", "Main"))
            os.makedirs(os.path.join(root_path, "VOCdevkit", "VOC2012", "JPEGImages"))
    
        # Path to the json file produced by unicode2id()
        ann_path = os.path.join(ann_dir,"data.json")
        with open(ann_path, "r", encoding='utf-8') as ann_file:
            ann_json_list = json.load(ann_file)
    
        save_xml_path = os.path.join(root_path, "VOCdevkit", 'VOC2012', 'Annotations')
    
        # Collect each image's categories and bboxes.
        img_names = []
        img_bbox_category = {}
        for ann in ann_json_list:
            # Read the fields of the COCO-style json
            img_name = ann['name']
            category_id = ann['defect_name']   # renamed so it does not shadow the global category list
            bbox = ann['bbox']
            if img_name not in img_names:
                img_names.append(img_name)
                img_bbox_category[img_name] = [{"category":category_id,"bbox":bbox}]
            else:
                img_bbox_category[img_name].append({"category":category_id,"bbox":bbox})
    
        print('| Images start copy.')
        # Copy all the images into the VOC dataset.
        for img_name in img_names:
            shutil.copy(os.path.join(img_dir, img_name), os.path.join(root_path, "VOCdevkit", 'VOC2012', 'JPEGImages', img_name))
        print('| Images copy finish.')
    
        print('| Jsons start transform')
        # Outer loop over all images: pull each image's info from the json and write it to its own xml file.
        for img_name in img_bbox_category.keys():
            # Image file name
            img_name_temp = img_name
            root_node = Element('annotation')
            node_filename = SubElement(root_node, 'filename')
            node_filename.text = img_name_temp
    
            node_size = SubElement(root_node, 'size')
            node_width = SubElement(node_size, 'width')
            node_height = SubElement(node_size, 'height')
            img_m = Image.open(os.path.join(img_dir,img_name))
            node_width.text = str(img_m.width)       # image width
            node_height.text = str(img_m.height)     # image height
    
            # Inner loop over this image's boxes
            for bbox_and_category in img_bbox_category[img_name_temp]:
                category_temp = bbox_and_category["category"]
                bbox_temp = bbox_and_category["bbox"]
    
                # Class name
                node_object = SubElement(root_node, 'object')
                node_name = SubElement(node_object, 'name')
                node_name.text = str(category_temp)
    
                node_bndbox = SubElement(node_object, 'bndbox')
                node_xmin = SubElement(node_bndbox, 'xmin')
                node_xmin.text = str(bbox_temp[0])
                node_ymin = SubElement(node_bndbox, 'ymin')
                node_ymin.text = str(bbox_temp[1])
                node_xmax = SubElement(node_bndbox, 'xmax')
                node_xmax.text = str(bbox_temp[2])
                node_ymax = SubElement(node_bndbox, 'ymax')
                node_ymax.text = str(bbox_temp[3])
    
            xml = tostring(root_node)
            dom = parseString(xml)
            # print(xml) to inspect the result
            img_name_temp = img_name_temp.replace(".jpg", "")
            xml_name = os.path.join(save_xml_path, img_name_temp+'.xml')
            with open(xml_name, 'wb') as f:
                f.write(dom.toprettyxml(indent='\t', encoding='utf-8'))
                # f.write(dom.toprettyxml(indent='\t',))
        print('| Jsons transform finish.')
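
    As mentioned above, a progress bar is one tqdm wrapper away. A minimal sketch of the copy loop from coco2voc(), assuming tqdm is installed (pip install tqdm):

    from tqdm import tqdm

    # Same copy loop as in coco2voc(), but tqdm prints a live progress bar.
    for img_name in tqdm(img_names, desc='Copying images'):
        shutil.copy(os.path.join(img_dir, img_name),
                    os.path.join(root_path, "VOCdevkit", 'VOC2012', 'JPEGImages', img_name))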
    

    4️⃣ Convert a VOC dataset to a COCO dataset

    #####################################################################################
    #####                    Convert a VOC dataset to a COCO dataset
    #####################################################################################
    def voc2coco():
        import datetime
        import xml.etree.ElementTree as ET
        from PIL import Image
    
        # Build the COCO categories field:
        # a {class name: id} dict that goes into the top-level data dict. Replace these placeholder names with your own classes.
        class_name_to_id = {'class1':1, 'class2':2, 'class3':3, 'class4':4, 'class5':5, 'class6':6, 'class7':7, 'class8':8}
        
        # Create the COCO folders
        if not os.path.exists(os.path.join(root_path, "coco2017")):
            os.makedirs(os.path.join(root_path, "coco2017"))
            os.makedirs(os.path.join(root_path, "coco2017", "annotations"))
            os.makedirs(os.path.join(root_path, "coco2017", "train2017"))
            os.makedirs(os.path.join(root_path, "coco2017", "val2017"))
    
        # Create the top-level annotation dict data
        now = datetime.datetime.now()
        data = dict(
            info=dict(
                description=None,
                url=None,
                version=None,
                year=now.year,
                contributor=None,
                date_created=now.strftime("%Y-%m-%d %H:%M:%S.%f"),
            ),
            licenses=[dict(url=None, id=0, name=None, )],
            images=[
                # license, file_name,url, height, width, date_captured, id
            ],
            type="instances",
            annotations=[
                # segmentation, area, iscrowd, image_id, bbox, category_id, id
            ],
            categories=[
                # supercategory, id, name
            ],
        )
    
        for name,id in class_name_to_id.items():
            data["categories"].append(
                dict(supercategory=None, id=id, name=name, )
            )
    
        # Build the images field for the train split.
        images_dir=os.path.join(root_path,'VOCdevkit','VOC2012','JPEGImages')
        images=os.listdir(images_dir)
    
        # Assign an image_id to every image
        images_id={}
        for idx,image_name in enumerate(images):
            images_id.update({image_name[:-4]:idx})
    
        # Read the list of training images
        train_img=[]
        with open(os.path.join(root_path,'VOCdevkit','VOC2012','ImageSets','Main','train.txt')) as fp:
            for i in fp.readlines():
                train_img.append(i.strip()+".jpg")
    
        # Fill in the images entries for the train split
        for image in train_img:
            img = Image.open(os.path.join(images_dir,image))
            data["images"].append(
                dict(
                    license=0,
                    url=None,
                    file_name=image,              # file name with extension
                    height=img.height,
                    width=img.width,
                    date_captured=None,
                    # id=image[:-4],
                    id=images_id[image[:-4]],
                )
            )
    
        # Build the annotations field for the train split.
        train_xml=[i[:-4]+'.xml' for i in train_img]
    
        bbox_id=0
        for xml in train_xml:
            category = []
            xmin = []
            ymin = []
            xmax = []
            ymax = []
            tree = ET.parse(os.path.join(root_path,'VOCdevkit','VOC2012','Annotations',xml))
            root = tree.getroot()
            objects = root.findall('object')   # renamed so it does not shadow the builtin object
            for i in objects:
                category.append(class_name_to_id[i.findall('name')[0].text])
                bndbox = i.findall('bndbox')
                for j in bndbox:
                    xmin.append(float(j.findall('xmin')[0].text))
                    ymin.append(float(j.findall('ymin')[0].text))
                    xmax.append(float(j.findall('xmax')[0].text))
                    ymax.append(float(j.findall('ymax')[0].text))
            for i in range(len(category)):
                data["annotations"].append(
                    dict(
                        id=bbox_id,
                        image_id=images_id[xml[:-4]],
                        category_id=category[i],
                        area=(xmax[i]-xmin[i])*(ymax[i]-ymin[i]),
                        bbox=[xmin[i],ymin[i],xmax[i]-xmin[i],ymax[i]-ymin[i]],
                        iscrowd=0,
                    )
                )
                bbox_id+=1
        # Write the train split json
        with open(os.path.join(root_path,'coco2017','annotations','instances_train2017.json'), 'w') as f:
            json.dump(data, f)
    
        # Read the list of validation images
        val_img = []
        with open(os.path.join(root_path, 'VOCdevkit', 'VOC2012', 'ImageSets', 'Main', 'val.txt')) as fp:
            for i in fp.readlines():
                val_img.append(i.strip() + ".jpg")
    
        # Clear the train images and annotations before filling in the val split
        data['images'] = []
        data['annotations'] = []
    
        # Fill in the images entries for the val split
        for image in val_img:
            img = Image.open(os.path.join(images_dir, image))
            data["images"].append(
                dict(
                    license=0,
                    url=None,
                    file_name=image,  # file name with extension
                    height=img.height,
                    width=img.width,
                    date_captured=None,
                    id=images_id[image[:-4]],
                )
            )
    
        # Build the annotations field for the val split.
        val_xml=[i[:-4]+'.xml' for i in val_img]
    
        for xml in val_xml:
            category = []
            xmin = []
            ymin = []
            xmax = []
            ymax = []
            tree = ET.parse(os.path.join(root_path,'VOCdevkit','VOC2012','Annotations',xml))
            root = tree.getroot()
            objects = root.findall('object')
            for i in objects:
                category.append(class_name_to_id[i.findall('name')[0].text])
                bndbox = i.findall('bndbox')
                for j in bndbox:
                    xmin.append(float(j.findall('xmin')[0].text))
                    ymin.append(float(j.findall('ymin')[0].text))
                    xmax.append(float(j.findall('xmax')[0].text))
                    ymax.append(float(j.findall('ymax')[0].text))
            for i in range(len(category)):
                data["annotations"].append(
                    dict(
                        id=bbox_id,
                        image_id=images_id[xml[:-4]],
                        category_id=category[i],
                        area=(xmax[i]-xmin[i])*(ymax[i]-ymin[i]),
                        bbox=[xmin[i],ymin[i],xmax[i]-xmin[i],ymax[i]-ymin[i]],
                        iscrowd=0,
                    )
                )
                bbox_id+=1
        # Write the val split json
        with open(os.path.join(root_path,'coco2017','annotations','instances_val2017.json'), 'w') as f:
            json.dump(data, f)
        print('| VOC -> COCO annotations transform finish.')
        print('| Start copying images...')
    
        for img_name in train_img:
            shutil.copy(os.path.join(root_path,"VOCdevkit", "VOC2012", "JPEGImages", img_name), os.path.join(root_path, "coco2017", 'train2017', img_name))
        print('| Train images copy finish.')

        for img_name in val_img:
            shutil.copy(os.path.join(root_path,"VOCdevkit", "VOC2012", "JPEGImages", img_name), os.path.join(root_path, "coco2017", 'val2017', img_name))
        print('| Val images copy finish.')
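
    To double-check the generated annotation files, one option (not part of the original script; assumes pycocotools is installed via pip install pycocotools) is to load them with the COCO API, which builds an index and fails loudly on malformed json:

    from pycocotools.coco import COCO

    # Building the index validates the json structure.
    coco = COCO(os.path.join(root_path, 'coco2017', 'annotations', 'instances_train2017.json'))
    print(len(coco.getImgIds()), 'images,', len(coco.getAnnIds()), 'annotations')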
    

    5️⃣ Split the VOC dataset into train and val

    #####################################################################################
    #####                    Split the VOC dataset into train and val
    #####################################################################################
    def voc_dataset_split():
        file_train = open(
            os.path.join(root_path, "VOCdevkit", "VOC2012", "ImageSets", "Main", "train.txt"), 'w')
        file_val = open(
            os.path.join(root_path, "VOCdevkit", "VOC2012", "ImageSets", "Main", "val.txt"), 'w')
    
        xml_total_filename = glob.glob(os.path.join(root_path, "VOCdevkit", 'VOC2012', 'Annotations', "*.xml"))
        for idx,xml in enumerate(xml_total_filename):
            xml_total_filename[idx]=os.path.basename(xml)   # basename works with both Windows and Linux path separators
        num_total = len(xml_total_filename)
        num_train = int(num_total*train_percent)
        train_sample = random.sample(xml_total_filename, num_train)
    
        for name in xml_total_filename:
            if name in train_sample:
                file_train.write(name[:-4]+'\n')
            else:
                file_val.write(name[:-4]+'\n')
    
        file_train.close()
        file_val.close()
    

    6️⃣ Check the dataset for corrupted images

    #####################################################################################
    #####        OSError: image file is truncated (9 bytes not processed)
    #####        Check whether any image in the dataset is corrupted; find the bad
    #####        images, delete them, and fix the annotations accordingly.
    #####################################################################################
    def check_images():
        from PIL import Image
        images_dir=os.path.join(root_path,'VOCdevkit','VOC2012','JPEGImages')
        images=os.listdir(images_dir)
        for i in images:
            try:
                img = Image.open(os.path.join(root_path,'VOCdevkit','VOC2012','JPEGImages',i))  # raises FileNotFoundError if the image does not exist
                img.load()  # raises OSError: image file is truncated, if the image is incomplete
            except (FileNotFoundError, OSError):
                print(i)
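
    If deleting the damaged files is not an option, Pillow can instead be told to tolerate them; a minimal sketch (a global Pillow switch, set once before loading images):

    from PIL import ImageFile

    # Pillow then loads truncated images instead of raising OSError;
    # the missing region typically decodes as a flat grey area, so only
    # use this if slightly damaged images are acceptable for training.
    ImageFile.LOAD_TRUNCATED_IMAGES = True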
    

    7️⃣ Visualize the COCO ground truth for inspection

    #####################################################################################
    #####                  Visualize the COCO ground truth for inspection
    #####################################################################################
    def visiual_gt():
        import cv2
    
        # Load the annotations
        json_file = os.path.join(root_path,'COCO2017','annotations','instances_train2017.json')
        data = json.load(open(json_file, 'r'))
        # annotations = data['annotations']
        # Collect the image names (deduplicated; this json holds one entry per annotation)
        images=[]
        for d in data:
            if d['name'] not in images:
                images.append(d['name'])
    
        # Pick 5 random images
        for i in random.sample(range(len(images)),5):
            img = cv2.imread(os.path.join(root_path,'COCO2017','train2017',images[i]))
    
            bboxes = []                                                    # gather this image's bboxes
            for d in data:
                if d['name']==images[i]:
                    bboxes.append(d["bbox"])
    
            # Draw the boxes
            for bbox in bboxes:
                left_top = (int(bbox[0]), int(bbox[1]))                     # in this dataset a bbox is (top-left x, y, bottom-right x, y);
                right_bottom = (int(bbox[2]), int(bbox[3]))                 # adjust to your own dataset's bbox convention
                cv2.rectangle(img, left_top, right_bottom, (0, 255, 0), 2)  # image, top-left, bottom-right, color, thickness
    
            cv2.imshow('image', img)
            cv2.waitKey(0)
        cv2.destroyAllWindows()
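
    cv2.imshow needs a display, so visiual_gt() will fail on a headless server. One workaround (a sketch; the vis_gt output folder is a made-up name) is to save the drawn images instead of showing them, replacing the cv2.imshow / cv2.waitKey calls inside the loop:

    # Save the visualization to disk instead of opening a window.
    os.makedirs(os.path.join(root_path, 'vis_gt'), exist_ok=True)
    cv2.imwrite(os.path.join(root_path, 'vis_gt', images[i]), img)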
    

    All the code from this post goes into a single python file; in the main block below, just call whichever function you need.

    if __name__ == '__main__':
        random.seed(777)
        print("—" * 50)
        # unicode2id()                      # convert the dataset's unicode names to ids
        # coco_dataset_split()              # split the COCO dataset
        # coco2voc()                        # convert COCO to VOC
        # voc_dataset_split()               # split the VOC dataset
        # check_images()                    # check for corrupted images
        # visiual_gt()                      # visualize the COCO ground truth
        voc2coco()                          # convert VOC to COCO
        print("—" * 50)
    

    ⭐ That's a wrap! If you need help, comments and private messages both work; I'll answer when I see them.

  • Original post: https://www.cnblogs.com/gy77/p/15648515.html