zoukankan      html  css  js  c++  java
  • 数据集拆分,互转,可视化,查错

    分享数据集的几种常用处理代码,使用的时候记得改成自己的路径:ann_dir是标注json文件所在的目录,img_dir是图片所在的目录。如果pycharm控制台输出的中文为乱码,将pycharm中的编码全改成utf-8(设置->编辑器->文件编码),把能改成utf-8的选项都改了。

    1️⃣ 有些数据集中含有unicode编码,也就是对应的中文,我们记录好每个unicode编码对应的id。然后将文件中的unicode编码转成id。

    # -*- coding: utf-8 -*-
    import json
    import os
    import random
    import time
    import shutil
    import glob
    
    # Defect class names. The numeric id of a class is its position in this list.
    category = [
        '无瑕疵', '花板跳', '水渍', '星跳', '浆斑', '油渍', '烧毛痕',
        '死皱', '筘路', '浪纹档', '三丝', '跳纱', '双经', '修痕',
        '污渍', '百脚', '松经', '跳花', '吊经', '纬纱不良', '断氨纶',
        '双纬', '粗维', '磨痕', '云织', '整经结', '稀密档', '断经',
        '粗经', '纬缩', '色差档', '毛粒', '破洞', '结头', '轧痕',
    ]

    # Every path used by the script is resolved against the working directory.
    root_path = os.getcwd()

    # Annotation and image folders sit side by side inside the dataset folder.
    ann_dir, img_dir = (
        os.path.join(
            root_path,
            "smartdiagnosisofclothflaw_round1train1_datasets",
            "guangdong1_round1_train1_20190818",
            leaf,
        )
        for leaf in ("Annotations", "defect_Images")
    )

    # Fraction of the data that goes into the training split.
    train_percent = 0.8
    
    #####################################################################################
    #####                              数据集中文改英文
    #####################################################################################
    def unicode2id():
        """Replace the defect names in ``anno_train.json`` with numeric ids.

        Reads ``anno_train.json`` from ``ann_dir`` (a JSON list of records with
        the keys ``name``, ``defect_name`` and ``bbox``), replaces each
        ``defect_name`` with its index in the module-level ``category`` list and
        writes the resulting list to ``data.json`` in the same directory.

        Raises:
            ValueError: if a record's ``defect_name`` is not in ``category``.
        """
        ann_file = os.path.join(ann_dir, "anno_train.json")
        print(ann_file)

        # Read the file as UTF-8 and let the JSON parser decode the \uXXXX
        # escapes itself. Decoding the whole file with 'unicode_escape' first
        # would also rewrite escapes such as \" and \\ (corrupting the JSON) and
        # would turn raw non-ASCII text into Latin-1 mojibake.
        with open(ann_file, 'r', encoding='utf-8') as f:
            json_data = json.load(f)

        # Tip: {rec['defect_name'] for rec in json_data} lists the distinct
        # class names that actually occur in the file.
        data1 = [
            {
                'name': rec['name'],
                'defect_name': category.index(rec['defect_name']),
                'bbox': rec['bbox'],
            }
            for rec in json_data
        ]

        with open(os.path.join(ann_dir, 'data.json'), 'w', encoding='utf-8') as f:
            json.dump(data1, f)
    

    2️⃣ COCO数据集划分为train和val

    #####################################################################################
    #####                              COCO数据集划分为train,val
    #####################################################################################
    def coco_dataset_split():
        """Split the images and their annotations into train and val sets.

        Every file in ``img_dir`` is assigned at random to the training set
        (``train_percent`` of the files, rounded down) or the validation set
        and copied to ``COCO2017/train2017`` or ``COCO2017/val2017`` under
        ``root_path``. The records of ``anno_train.json`` in ``ann_dir`` are
        split the same way, their ``defect_name`` is replaced by its index in
        ``category``, and the two lists are written to
        ``COCO2017/annotations/instances_{train,val}2017.json``.

        Raises:
            ValueError: if a record's ``defect_name`` is not in ``category``.
        """
        time_start = time.time()

        # Output locations.
        coco_root = os.path.join(root_path, "COCO2017")
        save_img_train_dir = os.path.join(coco_root, "train2017")
        save_img_val_dir = os.path.join(coco_root, "val2017")
        save_ann_dir = os.path.join(coco_root, "annotations")
        save_ann_train_file = os.path.join(save_ann_dir, "instances_train2017.json")
        save_ann_val_file = os.path.join(save_ann_dir, "instances_val2017.json")
        for folder in (save_ann_dir, save_img_train_dir, save_img_val_dir):
            os.makedirs(folder, exist_ok=True)

        # os.listdir() returns names in arbitrary order; sorting makes the
        # sample below repeatable when the caller seeds the random module.
        images_list = sorted(os.listdir(img_dir))
        images_num = len(images_list)

        train_num = int(images_num * train_percent)
        val_num = images_num - train_num
        train_list = random.sample(images_list, train_num)
        train_names = set(train_list)
        val_list = [name for name in images_list if name not in train_names]
        # Set membership keeps the per-annotation lookup below O(1).
        val_names = set(val_list)
        print("| Images num: ",images_num)
        print("| Train num: ",train_num)
        print("| Val num: ",val_num)

        # Copy the images into their split folder.
        for image_name in train_list:
            shutil.copy(os.path.join(img_dir, image_name), os.path.join(save_img_train_dir, image_name))
        for image_name in val_list:
            shutil.copy(os.path.join(img_dir, image_name), os.path.join(save_img_val_dir, image_name))

        ann_path = os.path.join(ann_dir, "anno_train.json")

        # Read as UTF-8 and let the JSON parser decode the \uXXXX escapes;
        # pre-decoding the file with 'unicode_escape' corrupts other escapes.
        with open(ann_path, 'r', encoding='utf-8') as fp:
            json_data = json.load(fp)

        # Split the annotations. A record whose image is not in the validation
        # set goes to the training set.
        train2017 = []
        val2017 = []
        for rec in json_data:
            entry = {
                'name': rec['name'],
                'defect_name': category.index(rec['defect_name']),
                'bbox': rec['bbox'],
            }
            if rec['name'] in val_names:
                val2017.append(entry)
            else:
                train2017.append(entry)

        # Write the annotations.
        with open(save_ann_train_file, 'w', encoding='utf-8') as fp:
            json.dump(train2017, fp)
        with open(save_ann_val_file, 'w', encoding='utf-8') as fp:
            json.dump(val2017, fp)

        time_end = time.time()
        cost_time = time_end - time_start
        print("| Cost time: ",cost_time//60//60,"hour",cost_time//60%60,"min",cost_time%60,"s")
    

    3️⃣ COCO数据集转换成VOC数据集,复制图片比较耗时,耐心等待就行了。为了节省时间,没有可视化复制图片的进度。如果想加,可以百度一下tqdm,加到复制图片的for循环中就可以了。

    #####################################################################################
    #####                              coco数据集转换成voc数据集
    #####################################################################################
    def coco2voc():
        """Convert ``data.json`` into one VOC-style XML file per image.

        Reads ``data.json`` from ``ann_dir`` (a JSON list of records with the
        keys ``name``, ``defect_name`` and ``bbox``), copies every referenced
        image from ``img_dir`` to ``VOCdevkit/VOC2012/JPEGImages`` under
        ``root_path`` and writes ``VOCdevkit/VOC2012/Annotations/<stem>.xml``
        containing the file name, the image size and one ``<object>`` per
        bounding box. The ``<name>`` of each object is ``str(defect_name)``.

        Requires the third-party packages ``lxml`` and ``Pillow``.
        """
        from lxml.etree import Element, SubElement, tostring
        from xml.dom.minidom import parseString
        from PIL import Image

        # Create the output folders. exist_ok also repairs a partially created
        # tree, which a single "does VOC2012 exist" check would skip.
        voc_root = os.path.join(root_path, "VOCdevkit", "VOC2012")
        save_xml_path = os.path.join(voc_root, "Annotations")
        save_img_path = os.path.join(voc_root, "JPEGImages")
        for folder in (save_xml_path, os.path.join(voc_root, "ImageSets", "Main"), save_img_path):
            os.makedirs(folder, exist_ok=True)

        # Load the annotation list.
        ann_path = os.path.join(ann_dir, "data.json")
        with open(ann_path, "r", encoding='utf-8') as ann_file:
            ann_json_list = json.load(ann_file)

        # Group the category and bbox of every annotation by image name.
        # The dict keeps the images in order of first appearance.
        img_bbox_category = {}
        for ann in ann_json_list:
            img_bbox_category.setdefault(ann['name'], []).append(
                {"category": ann['defect_name'], "bbox": ann['bbox']}
            )

        print('| Images start copy.')
        # Copy every annotated image into the VOC folder.
        for img_name in img_bbox_category:
            shutil.copy(os.path.join(img_dir, img_name), os.path.join(save_img_path, img_name))
        print('| Images copy finish.')

        print('| Jsons start transform')
        # Outer loop: one XML document per image.
        for img_name, boxes in img_bbox_category.items():
            root_node = Element('annotation')
            node_filename = SubElement(root_node, 'filename')
            node_filename.text = img_name

            node_size = SubElement(root_node, 'size')
            node_width = SubElement(node_size, 'width')
            node_height = SubElement(node_size, 'height')
            # The context manager closes the file as soon as the size is read.
            with Image.open(os.path.join(img_dir, img_name)) as img_m:
                node_width.text = str(img_m.width)
                node_height.text = str(img_m.height)

            # Inner loop: one <object> per bounding box.
            for bbox_and_category in boxes:
                bbox_temp = bbox_and_category["bbox"]

                node_object = SubElement(root_node, 'object')
                node_name = SubElement(node_object, 'name')
                node_name.text = str(bbox_and_category["category"])

                # The four bbox values are written, in order, as
                # xmin, ymin, xmax, ymax.
                node_bndbox = SubElement(node_object, 'bndbox')
                for tag, value in zip(('xmin', 'ymin', 'xmax', 'ymax'), bbox_temp):
                    SubElement(node_bndbox, tag).text = str(value)

            dom = parseString(tostring(root_node))
            # splitext removes only the extension; str.replace(".jpg", "")
            # would also delete ".jpg" from the middle of a file name.
            xml_name = os.path.join(save_xml_path, os.path.splitext(img_name)[0] + '.xml')
            with open(xml_name, 'wb') as f:
                f.write(dom.toprettyxml(indent='\t', encoding='utf-8'))
        print('| Jsons transform finish.')
    

    4️⃣ voc数据集转换成coco数据集

    #####################################################################################
    #####                           voc数据集转换成coco数据集
    #####################################################################################
    def _read_image_set(list_path):
        """Return the image file names for the stems listed, one per line, in *list_path*."""
        with open(list_path) as fp:
            # strip() copes with a missing final newline and with '\r\n' line
            # endings; blank lines are skipped.
            return [line.strip() + ".jpg" for line in fp if line.strip()]


    def _build_coco_split(image_files, images_dir, xml_dir, images_id, class_name_to_id, bbox_id):
        """Build the COCO ``images`` and ``annotations`` lists for one split.

        Args:
            image_files: image file names belonging to the split.
            images_dir: folder containing the image files.
            xml_dir: folder containing one ``<stem>.xml`` per image.
            images_id: mapping from file stem to numeric image id.
            class_name_to_id: mapping from the XML ``<name>`` text to category id.
            bbox_id: id given to the first annotation produced.

        Returns:
            ``(images, annotations, next_bbox_id)``.

        Raises:
            KeyError: if an object's name is not in ``class_name_to_id`` or an
                image stem is not in ``images_id``.
            ValueError: if an ``<object>`` has no ``<bndbox>``.
        """
        import xml.etree.ElementTree as ET
        from PIL import Image

        images = []
        annotations = []
        for image_file in image_files:
            stem = os.path.splitext(image_file)[0]
            image_id = images_id[stem]

            # Only the size is needed; close the file straight away.
            with Image.open(os.path.join(images_dir, image_file)) as img:
                width, height = img.width, img.height
            images.append(
                dict(
                    license=0,
                    url=None,
                    file_name=image_file,  # file name including extension
                    height=height,
                    width=width,
                    date_captured=None,
                    id=image_id,
                )
            )

            xml_file = stem + '.xml'
            root = ET.parse(os.path.join(xml_dir, xml_file)).getroot()
            for obj in root.findall('object'):
                category_id = class_name_to_id[obj.findtext('name')]
                bndbox = obj.find('bndbox')
                if bndbox is None:
                    raise ValueError("object without bndbox in " + xml_file)
                xmin = float(bndbox.findtext('xmin'))
                ymin = float(bndbox.findtext('ymin'))
                xmax = float(bndbox.findtext('xmax'))
                ymax = float(bndbox.findtext('ymax'))
                box_w = xmax - xmin
                box_h = ymax - ymin
                annotations.append(
                    dict(
                        id=bbox_id,
                        image_id=image_id,
                        category_id=category_id,
                        area=box_w * box_h,
                        # VOC stores two corners; the bbox written here is
                        # [x, y, width, height].
                        bbox=[xmin, ymin, box_w, box_h],
                        iscrowd=0,
                    )
                )
                bbox_id += 1
        return images, annotations, bbox_id


    def voc2coco():
        """Convert the VOC2012 folder under ``root_path`` into COCO json files.

        For each of the splits listed in ``ImageSets/Main/train.txt`` and
        ``val.txt`` this reads the image sizes from ``JPEGImages`` and the
        boxes from ``Annotations``, writes
        ``coco2017/annotations/instances_{train,val}2017.json`` and finally
        copies the images to ``coco2017/train2017`` and ``coco2017/val2017``.
        Annotation ids are numbered consecutively across both splits.

        Requires the third-party package ``Pillow``.
        """
        import datetime

        # Maps the <name> text of an XML object to its COCO category id.
        # NOTE(review): coco2voc() in this file writes the numeric class id
        # ("0", "1", ...) as the <name>; adjust this mapping to the names that
        # are really present in your XML files.
        class_name_to_id = {'class1':1, 'class2':2, 'class3':3, 'class4':4, 'class5':5, 'class6':6, 'class7':7, 'class8':8}

        voc_root = os.path.join(root_path, 'VOCdevkit', 'VOC2012')
        images_dir = os.path.join(voc_root, 'JPEGImages')
        xml_dir = os.path.join(voc_root, 'Annotations')
        coco_root = os.path.join(root_path, "coco2017")

        # Create the output folders (also when coco2017 exists only partially).
        for sub in ("annotations", "train2017", "val2017"):
            os.makedirs(os.path.join(coco_root, sub), exist_ok=True)

        # Skeleton of the COCO document shared by both splits.
        now = datetime.datetime.now()
        data = dict(
            info=dict(
                description=None,
                url=None,
                version=None,
                year=now.year,
                contributor=None,
                date_created=now.strftime("%Y-%m-%d %H:%M:%S.%f"),
            ),
            licenses=[dict(url=None, id=0, name=None, )],
            images=[
                # license, file_name, url, height, width, date_captured, id
            ],
            type="instances",
            annotations=[
                # segmentation, area, iscrowd, image_id, bbox, category_id, id
            ],
            categories=[
                # supercategory, id, name
            ],
        )

        for class_name, class_id in class_name_to_id.items():
            data["categories"].append(
                dict(supercategory=None, id=class_id, name=class_name, )
            )

        # One numeric id per image, keyed by file stem. Sorting makes the ids
        # independent of the order in which the OS lists the folder.
        images_id = {
            os.path.splitext(image_name)[0]: idx
            for idx, image_name in enumerate(sorted(os.listdir(images_dir)))
        }

        bbox_id = 0
        split_images = {}
        for split in ("train", "val"):
            image_files = _read_image_set(
                os.path.join(voc_root, 'ImageSets', 'Main', split + '.txt'))
            split_images[split] = image_files

            # Replace the images/annotations of the previous split.
            data['images'], data['annotations'], bbox_id = _build_coco_split(
                image_files, images_dir, xml_dir, images_id, class_name_to_id, bbox_id)

            json_path = os.path.join(coco_root, 'annotations', 'instances_' + split + '2017.json')
            with open(json_path, 'w') as fp:
                json.dump(data, fp)
        print('| VOC -> COCO annotations transform finish.')
        print('Start copy images...')

        for img_name in split_images["train"]:
            shutil.copy(os.path.join(images_dir, img_name), os.path.join(coco_root, 'train2017', img_name))
        print('| Train images copy finish.')

        for img_name in split_images["val"]:
            shutil.copy(os.path.join(images_dir, img_name), os.path.join(coco_root, 'val2017', img_name))
        print('| Val images copy finish.')
    

    5️⃣ VOC数据集划分为train和val

    #####################################################################################
    #####                              voc数据集划分为train,val
    #####################################################################################
    def voc_dataset_split():
        """Write the train/val image lists for the VOC2012 folder.

        Collects the stem (file name without directory and extension) of every
        ``*.xml`` file in ``VOCdevkit/VOC2012/Annotations`` under ``root_path``,
        draws ``train_percent`` of them at random (rounded down) for the
        training set and writes the stems, one per line, to ``train.txt`` and
        ``val.txt`` in ``VOCdevkit/VOC2012/ImageSets/Main``.
        """
        voc_root = os.path.join(root_path, "VOCdevkit", "VOC2012")
        main_dir = os.path.join(voc_root, "ImageSets", "Main")

        # os.path.basename works with both '/' and '\\' separators, whereas
        # splitting on '\\' alone leaves the full path in place on POSIX.
        # Sorting makes the sample repeatable when the caller seeds random.
        xml_stems = sorted(
            os.path.splitext(os.path.basename(xml_path))[0]
            for xml_path in glob.glob(os.path.join(voc_root, 'Annotations', "*.xml"))
        )
        num_train = int(len(xml_stems) * train_percent)
        train_sample = set(random.sample(xml_stems, num_train))

        with open(os.path.join(main_dir, "train.txt"), 'w') as file_train, \
                open(os.path.join(main_dir, "val.txt"), 'w') as file_val:
            for stem in xml_stems:
                target = file_train if stem in train_sample else file_val
                target.write(stem + '\n')
    

    6️⃣ 检查数据集中图片是否有损坏

    #####################################################################################
    #####        OSError: image file is truncated (9 bytes not processed)
    #####        检查数据集中图片是否有损坏。找到有问题图片,删掉它,并修改数据集。
    #####################################################################################
    def check_images():
        """Print the name of every file in VOC2012/JPEGImages that cannot be loaded.

        Each entry of ``VOCdevkit/VOC2012/JPEGImages`` under ``root_path`` is
        opened and fully loaded with Pillow; the names of the entries for
        which this raises ``FileNotFoundError`` or ``OSError`` are printed.
        """
        from PIL import Image

        jpeg_dir = os.path.join(root_path, 'VOCdevkit', 'VOC2012', 'JPEGImages')
        for file_name in os.listdir(jpeg_dir):
            try:
                # open() raises FileNotFoundError for a missing file; load()
                # raises "OSError: image file is truncated" for an incomplete one.
                Image.open(os.path.join(jpeg_dir, file_name)).load()
            except (FileNotFoundError, OSError):
                print(file_name)
    

    7️⃣ coco数据集将gt可视化,查看

    #####################################################################################
    # #####                         coco数据集将gt可视化,查看
    # #####################################################################################
    def visiual_gt():
        """Show up to five random training images with their ground-truth boxes.

        Reads ``COCO2017/annotations/instances_train2017.json`` under
        ``root_path`` (a JSON list of records with ``name`` and ``bbox`` keys),
        picks at most five distinct images at random, draws every box of the
        image in green and displays it with OpenCV. Press any key to advance.
        Images that cannot be read are reported and skipped.

        Requires the third-party package ``opencv-python``.
        """
        import cv2

        json_file = os.path.join(root_path, 'COCO2017', 'annotations', 'instances_train2017.json')
        with open(json_file, 'r') as fp:
            data = json.load(fp)

        # Group the boxes by image once, instead of rescanning the annotation
        # list for every displayed image. Keys stay in order of first appearance.
        bboxes_by_image = {}
        for record in data:
            bboxes_by_image.setdefault(record['name'], []).append(record['bbox'])

        image_names = list(bboxes_by_image)
        # random.sample raises ValueError when asked for more items than exist.
        for image_name in random.sample(image_names, min(5, len(image_names))):
            img = cv2.imread(os.path.join(root_path, 'COCO2017', 'train2017', image_name))
            if img is None:
                # cv2.imread returns None instead of raising when it cannot read a file.
                print('| Cannot read image: ', image_name)
                continue

            for bbox in bboxes_by_image[image_name]:
                # The bbox is used here as top-left and bottom-right corner;
                # adapt these two lines if your dataset stores it differently.
                left_top = (int(bbox[0]), int(bbox[1]))
                right_bottom = (int(bbox[2]), int(bbox[3]))
                cv2.rectangle(img, left_top, right_bottom, (0, 255, 0), 2)  # image, corners, colour, thickness

            cv2.imshow('image', img)
            cv2.waitKey(0)
        cv2.destroyAllWindows()
    

    博客所有的代码,都统一放到一个python文件中,用哪个就调用哪个函数。

    if __name__ == '__main__':
        # Seed the random module before any step draws a sample.
        random.seed(777)
        divider = "—" * 50
        print(divider)
        # Uncomment the step(s) you want to run.
        # unicode2id()                      # replace the unicode defect names with ids
        # coco_dataset_split()              # split the coco dataset into train/val
        # coco2voc()                        # convert the coco dataset to voc
        # voc_dataset_split()               # split the voc dataset into train/val
        # check_images()                    # look for damaged images
        # visiual_gt()                      # display ground-truth boxes of the coco dataset
        voc2coco()                          # convert the voc dataset to coco
        print(divider)
    

    ⭐ 完结撒花,如果有需要帮助的评论或者私聊都可以,看到就回答了。

  • 相关阅读:
    Chrome禁用缓存
    国内阿里Maven仓库镜像Maven配置文件Maven仓库速度快
    spring boot 之热部署
    Spring Boot的Maven插件Spring Boot Maven plugin详解
    在Extjs 的 TabPanel在 title标题栏上加按扭button
    Entity Framework 基于Oracle的code first 问题汇总
    面向对象设计原则
    Asp.Net MVC 缓存设计
    Asp.Net MVC 身份认证
    Asp.Net MVC 请求原理分析
  • 原文地址:https://www.cnblogs.com/gy77/p/15648515.html
Copyright © 2011-2022 走看看