zoukankan      html  css  js  c++  java
  • Python 利用已有 NER 模型进行数据清洗合并

    # -*- coding: utf-8 -*-
    from kashgari.corpus import DataReader
    import re
    from tqdm import tqdm
    
    
    def cut_text(text, lenth):
        """Split ``text`` into consecutive pieces of at most ``lenth`` items.

        Slicing is used instead of a ``.{n}`` regular expression so that
        newline characters are treated like any other character (``.`` does
        not match them, which misaligned the remainder), and so that no empty
        trailing piece is produced when ``len(text)`` is an exact multiple of
        ``lenth``.

        Args:
            text: The string (or any sliceable sequence) to split.
            lenth: Maximum size of each piece; must be a positive integer.

        Returns:
            A list with the pieces in their original order. Concatenating the
            pieces reproduces ``text``; the list is empty when ``text`` is
            empty.

        Raises:
            ValueError: If ``lenth`` is not positive.
        """
        if lenth <= 0:
            raise ValueError('lenth must be a positive integer')
        return [text[start:start + lenth] for start in range(0, len(text), lenth)]
    
    
    def _predict_labels(tokens, ner_model, max_len):
        """Return the labels ``ner_model`` predicts for ``tokens``, window by window."""
        labels = []
        # Windows are cut on the token list itself (not on a joined string) so
        # that every predicted label stays aligned with its token, even when a
        # token is longer than one character.
        for start in range(0, len(tokens), max_len):
            window = tokens[start:start + max_len]
            labels.extend(ner_model.predict([window])[0])
        return labels


    def _merge_labels(labels, predictions):
        """Fill ``'O'`` entries of ``labels`` in place with predicted B-/I- tags."""
        # zip() stops at the shorter sequence, so a model that returns fewer
        # labels than tokens leaves the remaining labels untouched instead of
        # raising IndexError.
        for jdx, (current, predicted) in enumerate(zip(labels, predictions)):
            if current == 'O' and predicted.startswith(('B', 'I')):
                labels[jdx] = predicted


    def _write_conll(target_file, data_x, data_y):
        """Append the sentences to ``target_file`` as "token label" lines."""
        with open(target_file, 'a', encoding="utf-8") as f:
            for idx, text_array in enumerate(data_x):
                if idx != 0:
                    # Extra line break that separates consecutive sentences.
                    f.write('\n')
                for jdx, t in enumerate(text_array):
                    text = t + ' ' + data_y[idx][jdx]
                    # The line break is written before each line rather than
                    # after it; only the very first line has none.
                    if idx != 0 or jdx != 0:
                        text = '\n' + text
                    f.write(text)


    def clean_data(source_file, target_file, ner_model, max_len=100):
        """Enrich a CoNLL-format file with entities found by an existing NER model.

        The file is read with ``DataReader().read_conll_format_file``. For every
        sentence the model's prediction is computed, and each token whose
        current label is ``'O'`` takes the predicted label when that label
        starts with ``'B'`` or ``'I'``. Labels that are not ``'O'`` are never
        overwritten. The merged data is then written to ``target_file``.

        NOTE(review): ``target_file`` is opened in append mode, so running this
        twice against the same target appends the data a second time — confirm
        that this is intended.

        Args:
            source_file: Path of the CoNLL-format file to read.
            target_file: Path of the file the merged data is appended to.
            ner_model: Object whose ``predict`` method takes a list of token
                lists and returns one label sequence per token list.
            max_len: Maximum number of tokens passed to ``ner_model.predict``
                at once; longer sentences are predicted in consecutive windows.

        Raises:
            ValueError: If ``max_len`` is not positive.
        """
        if max_len <= 0:
            raise ValueError('max_len must be a positive integer')

        data_x, data_y = DataReader().read_conll_format_file(source_file)

        with tqdm(total=len(data_x)) as pbar:
            for idx, text_array in enumerate(data_x):
                ner = _predict_labels(text_array, ner_model, max_len)
                _merge_labels(data_y[idx], ner)
                pbar.update(1)

        _write_conll(target_file, data_x, data_y)

        # Sanity check: re-read the source and compare it with what was processed.
        data_x2, data_y2 = DataReader().read_conll_format_file(source_file)
        print(data_x == data_x2, len(data_y) == len(data_y2), '数据清洗完成')
    
    # -*- coding: utf-8 -*-
    import kashgari
    from data_tools import clean_data
    # Load the saved NER model and use its predictions to fill in the
    # 'O' labels of the dev split; the result is written to the working directory.
    model_path = 'time_ner.h5'
    source_path = './data/example.dev'
    target_path = 'example.dev'

    time_ner = kashgari.utils.load_model(model_path)
    clean_data(source_path, target_path, time_ner)
    
  • 相关阅读:
    Ubuntu14.04下Sublime Text 3解决无法输入中文
    100% 解决wine中文乱码问题
    历史朝代顺序表
    HTTP状态码分类
    Kali Linux 2019
    多线程下载器(针对于百度云下载)
    百度云破解版&&第三方下载工具&&分享链接无需提取码直接下载
    MongoDB 4.0.10 GridFS操作
    MongoDB 4.0.10 游标
    MongoDB 4.0.10 用户管理
  • 原文地址:https://www.cnblogs.com/gmhappy/p/11863935.html
Copyright © 2011-2022 走看看