1.中文语料常常遇到编码问题,将任意字符集文件转为utf-8编码
import codecs
import os

import chardet

try:
    from django.utils.encoding import smart_text
except ImportError:
    # Django 4.0 removed smart_text; smart_str is its documented replacement.
    from django.utils.encoding import smart_str as smart_text


def check_file_charset(file):
    """Detect the character encoding of a file.

    Args:
        file: Path of the file to inspect.

    Returns:
        The dict produced by ``chardet.detect`` for the file's raw bytes;
        callers read its ``'encoding'`` entry.
    """
    with open(file, 'rb') as f:
        return chardet.detect(f.read())


def Convert_file_character(File_path):
    """Rewrite a file in place as UTF-8 when it is detected as another encoding.

    Bytes that cannot be decoded with the detected encoding are dropped
    (``errors='ignore'``), so a wrong detection can silently lose text.
    Conversion failures are printed instead of raised so that a batch run
    keeps going.

    Args:
        File_path: Path of the file to convert; it is overwritten on success.
    """
    f_type = check_file_charset(File_path)
    # The detector may report no encoding at all (e.g. for an empty file).
    encoding = f_type.get('encoding') if f_type else None
    print(File_path, "字符集为:", encoding)

    if not encoding:
        # Without a source encoding there is nothing safe to decode with.
        print("无法识别字符集,跳过转换")
        return
    # Compare case-insensitively so a differently-cased "UTF-8" is not
    # needlessly rewritten.
    if encoding.lower() == 'utf-8':
        print("字符集为 utf-8,不需要进行转换")
        return

    try:
        # Read everything before reopening for writing: the same path is
        # truncated by the second open.
        with codecs.open(File_path, 'rb', encoding, errors='ignore') as f:
            content = smart_text(f.read())
        with codecs.open(File_path, 'wb', 'utf-8') as f:
            f.write(content)
    except Exception as ERR:
        print("字符集转换失败")
        print(ERR)
    else:
        print("字符集转换成功")


corpus_path = './unlabel'
# Only regular files can be opened and converted; skip sub-directories.
raw_train_files = [
    corpus_path + os.sep + file_name
    for file_name in os.listdir(corpus_path)
    if os.path.isfile(corpus_path + os.sep + file_name)
]
for raw_train_file in raw_train_files:
    Convert_file_character(raw_train_file)
参考:https://blog.csdn.net/qq_35751770/article/details/103664496
2.将unlabel文件夹中的所有.txt文件合并,每个文件之间空一行
合并之前先运行第1步的代码,把unlabel文件夹中的文件统一转换为utf-8编码(下面的合并代码按utf-8读取每个文件)
import os


def combine(corpus_path, outpath):
    """Append every file in a directory to one output file, blank-line separated.

    Files are read as UTF-8 and processed in sorted name order so repeated
    runs produce the same layout. The output is opened in append mode, so an
    existing ``outpath`` is extended rather than replaced.

    Args:
        corpus_path: Directory whose files are merged.
        outpath: Path of the merged output file.
    """
    with open(outpath, 'a', encoding='utf-8') as output:
        for file_name in sorted(os.listdir(corpus_path)):
            raw_train_file = corpus_path + os.sep + file_name
            if not os.path.isfile(raw_train_file):
                # Sub-directories cannot be opened as text files.
                continue

            # Diagnostic only: report the detected encoding of each input.
            # check_file_charset comes from the step-1 snippet.
            f_type = check_file_charset(raw_train_file)
            print(raw_train_file, "字符集为:", f_type['encoding'])

            with open(raw_train_file, 'r', encoding='utf-8') as f:
                context = f.readlines()

            output.writelines(context)
            # Terminate an unfinished last line first, so the separator below
            # really is an empty line and not just the missing line ending.
            if context and not context[-1].endswith('\n'):
                output.write('\n')
            output.write('\n')


combine('./unlabel', 'all_unlabel.txt')
3.随机抽取.txt文件中的60%,20%,5%
import random


def part(filename, outpath, ratio, seed=None):
    """Write a random sample of the lines of a text file to a new file.

    Args:
        filename: Path of the UTF-8 input file.
        outpath: Path of the output file; it is overwritten.
        ratio: Fraction of the input lines to keep, e.g. ``0.6`` for 60%.
            The resulting line count is clamped to ``[0, number of lines]``.
        seed: Optional seed for a reproducible sample; ``None`` draws a
            different sample on each call.
    """
    # Read the input before touching the output, so a missing input file
    # does not leave behind an empty or truncated output file.
    with open(filename, 'r', encoding='utf-8') as f:
        context = f.readlines()

    length = len(context)
    batch_size = min(length, max(0, int(length * ratio)))
    print(batch_size)

    # A private generator keeps seeding from disturbing the global random state.
    sampled = random.Random(seed).sample(context, batch_size)

    with open(outpath, 'w', encoding='utf-8') as output:
        for x in sampled:
            # The input's final line may lack a newline; once it is moved to
            # another position it would otherwise fuse with the next line.
            output.write(x if x.endswith('\n') else x + '\n')


ratio1, ratio2, ratio3 = 0.6, 0.2, 0.05
part('training/law_train.txt', 'training/law_train1.txt', ratio1)
part('training/law_train.txt', 'training/law_train2.txt', ratio2)
part('training/law_train.txt', 'training/law_train3.txt', ratio3)
4.将已经分好词的文件去掉空格(正则),恢复成文件原来的样子
import re


def deal_data(filename, outpath):
    """Remove the whitespace from every line of a word-segmented file.

    Each input line yields exactly one output line, so the line structure of
    the text is kept while the separators between words are removed.

    Args:
        filename: Path of the UTF-8 segmented input file.
        outpath: Path of the output file; it is overwritten.
    """
    with open(filename, 'r', encoding='utf-8') as f, \
            open(outpath, 'w', encoding='utf-8') as output:
        for data in f:  # data is one line of the input
            # r'\s+' matches runs of whitespace, including the trailing
            # newline, which is why it is written back explicitly below.
            x = re.sub(r'\s+', '', data)
            output.write(x + '\n')


deal_data('evaluate/law/Law_contract_test.txt', 'evaluate/gold/Law_contract_test.txt')
deal_data('evaluate/law/Law_marriage_test.txt', 'evaluate/gold/Law_marriage_test.txt')
deal_data('evaluate/law/Law_mixed_test.txt', 'evaluate/gold/Law_mixed_test.txt')