一、代码
# -*- coding: UTF-8 -*-
import json
import pandas as pd
"""获得所有的文本"""
def get_all_text(file_path="../datas/format/primary.json",
                 file_out="../datas/format/all_text.csv"):
    """Flatten the primary JSON-lines file into a three-column CSV.

    Every non-blank line of ``file_path`` is parsed as a JSON object that
    has a ``"file_name"`` entry and a ``"datas"`` mapping. One CSV row is
    produced per ``"datas"`` item, with the columns ``names`` (the record's
    file name), ``roles`` (the item key) and ``texts`` (the item value).

    Args:
        file_path: JSON-lines input file, read as UTF-8.
        file_out: Destination CSV, written space-separated and without the
            DataFrame index.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``"file_name"`` or ``"datas"``.
    """
    names = []
    roles = []
    texts = []
    with open(file_path, "r", encoding="utf8") as f:
        # Iterate the file lazily rather than materialising it via readlines().
        for data_line in f:
            # A blank line (for example a trailing one) is not a record and
            # would otherwise make json.loads raise.
            if not data_line.strip():
                continue
            json_data = json.loads(data_line)
            file_name = json_data["file_name"]
            for role, text in json_data["datas"].items():
                names.append(file_name)
                roles.append(role)
                texts.append(text)
    dataframe = pd.DataFrame({'names': names, 'roles': roles, "texts": texts})
    dataframe.to_csv(file_out, index=False, sep=' ')
"""从csv搜索数据"""
def search_text(key, file_path="../datas/format/all_text.csv", file_out=None):
    """Write the rows of the all-text CSV whose ``texts`` column matches ``key``.

    Args:
        key: Pattern looked for in the ``texts`` column. It is passed to
            ``Series.str.contains`` unchanged, so pandas interprets it as a
            regular expression by default.
        file_path: Space-separated CSV to search; it must have a ``texts``
            column.
        file_out: Destination CSV (space-separated, no index). When omitted
            it defaults to ``../datas/classes/<key>.csv``.
    """
    if file_out is None:
        file_out = "../datas/classes/" + key + ".csv"
    data = pd.read_csv(file_path, sep=" ")
    # read_csv loads empty cells as NaN. na=False makes those rows count as
    # non-matching; without it the mask contains NaN and boolean indexing
    # raises ValueError.
    matches = data[data["texts"].str.contains(key, na=False)]
    matches.to_csv(file_out, index=False, sep=' ')
"""提取带有婚字的数据"""
def data_annotate(file_in="../datas/format/primary.json",
                  file_out="../datas/annotate/label.json"):
    """Copy the records that mention "婚" into a file awaiting annotation.

    Every non-blank line of ``file_in`` is parsed as a JSON object with a
    ``"file_name"`` entry and a ``"datas"`` mapping. A record is kept when at
    least one value of ``"datas"`` contains the character "婚". Each kept
    record is written to ``file_out`` as one JSON line with the keys
    ``"name"`` (the file name), ``"label"`` (an empty string) and ``"datas"``
    (the original mapping).

    Args:
        file_in: JSON-lines input file, read as UTF-8.
        file_out: JSON-lines output file, overwritten on every call.

    Returns:
        The string ``"success"``.
    """
    # The input is opened first so that a missing input file raises before
    # the output file is truncated.
    with open(file_in, "r", encoding="utf8") as f, \
            open(file_out, "w", encoding="utf8") as fo:
        for line in f:
            # Skip blank lines; json.loads would raise on them.
            if not line.strip():
                continue
            json_data = json.loads(line)
            datas = json_data["datas"]
            if not any("婚" in text for text in datas.values()):
                continue
            item = {
                "name": json_data["file_name"],
                "label": "",
                "datas": datas,
            }
            # ensure_ascii=False keeps the Chinese text readable in the file.
            fo.write(json.dumps(item, ensure_ascii=False) + "\n")
    return "success"
"""提取标注过的数据"""
def annotate(file_in="../datas/annotate/label.json",
             file_labeled="../datas/annotate/labeled.json",
             file_unlabeled="../datas/annotate/unlabel.json"):
    """Split the annotation file into labeled and unlabeled records.

    Every non-blank line of ``file_in`` is parsed as a JSON object. Records
    whose ``"label"`` value is truthy are re-serialised into
    ``file_labeled``; all others go to ``file_unlabeled``. Both outputs are
    JSON-lines files and are overwritten on every call.

    Args:
        file_in: JSON-lines input file, read as UTF-8.
        file_labeled: Output for records with a truthy ``"label"``.
        file_unlabeled: Output for records with a falsy ``"label"``.

    Returns:
        The string ``"success"``.
    """
    # The input is opened first so that a missing input file raises before
    # either output file is truncated.
    with open(file_in, "r", encoding="utf8") as f_in, \
            open(file_labeled, "w", encoding="utf8") as f_labeled, \
            open(file_unlabeled, "w", encoding="utf8") as f_unlabeled:
        for line in f_in:
            # Skip blank lines; json.loads would raise on them.
            if not line.strip():
                continue
            json_data = json.loads(line)
            target = f_labeled if json_data["label"] else f_unlabeled
            target.write(json.dumps(json_data, ensure_ascii=False) + "\n")
    return "success"
def label_to_csv(file_path="../datas/annotate/labeled.json",
                 file_out="../datas/annotate/labeled.csv"):
    """Export the labeled JSON-lines records to a three-column CSV.

    Every non-blank line of ``file_path`` is parsed as a JSON object with a
    ``"label"`` entry and a ``"datas"`` mapping. One CSV row is produced per
    record, with the columns ``labels`` (the label), ``datas`` (the values of
    ``"datas"`` joined with ``"|"``) and ``data_dict`` (the raw JSON text of
    the line).

    Args:
        file_path: JSON-lines input file, read as UTF-8.
        file_out: Destination CSV, written space-separated and without the
            DataFrame index.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``"label"`` or ``"datas"``.
        TypeError: If a ``"datas"`` value is not a string (``str.join``).
    """
    labels = []
    datas = []
    data_dict = []
    with open(file_path, "r", encoding="utf8") as f:
        # Iterate the file lazily rather than materialising it via readlines().
        for data_line in f:
            # Skip blank lines; json.loads would raise on them.
            if not data_line.strip():
                continue
            json_data = json.loads(data_line)
            labels.append(json_data["label"])
            datas.append("|".join(json_data["datas"].values()))
            # Keep the record's raw JSON text, minus its line break.
            data_dict.append(data_line.replace("\n", ""))
    dataframe = pd.DataFrame({'labels': labels, 'datas': datas, "data_dict": data_dict})
    dataframe.to_csv(file_out, index=False, sep=' ')
"""提取带工作的数据"""
def get_work():
    """Run ``search_text`` with the keyword "工作"."""
    keyword = "工作"
    search_text(keyword)
# Script entry point: when executed directly, export the labeled records to CSV.
if __name__ == "__main__":
    label_to_csv()