import pandas as pd
import numpy as np
class Process:
def __init__(self):
self.path_1 = "3000条无意义.xlsx"
self.path_2 = "录音跟听0526.xlsx"
self.s = []
self.l = []
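    @staticmethod
    def _clean(text):
        # Helper for the repeated whitespace stripping below: removes spaces, tabs,
        # newlines, and full-width spaces from a cell value.
        return "".join(str(text).split())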
def read_path_1(self):
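        # Read 语句 (sentence) / 语义 (label) pairs from the 无意义 workbook,
        # skip rows with an empty label, and collect them into self.s / self.l.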
readbook = pd.read_excel(self.path_1, sheet_name="Sheet1")
s1 = readbook["语句"]
y = readbook["语义"]
for s,l in zip(s1, y):
if str(l) == "nan":
continue
else:
s = s.replace(" ", "").replace("
", "").replace("
", "").replace(" ", "")
l = l.replace(" ", "").replace("
", "").replace("
", "").replace(" ", "")
self.s.append(s)
self.l.append(l)
def read_path_2(self):
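        # Read 内容 / 正确标签 pairs from the three daily sheets (0526, 0525, 0524)
        # of the 录音跟听 workbook, skip rows with an empty label, and collect them.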
t1 = pd.read_excel(self.path_2, sheet_name="0526")
t2 = pd.read_excel(self.path_2, sheet_name="0525")
t3 = pd.read_excel(self.path_2, sheet_name="0524")
s1 = t1["内容"]
y1 = t1["正确标签"]
for s,l in zip(s1, y1):
if str(l) == "nan":
continue
else:
s = s.replace(" ", "").replace("
", "").replace("
", "").replace(" ", "")
l = l.replace(" ", "").replace("
", "").replace("
", "").replace(" ", "")
self.s.append(s)
self.l.append(l)
s2 = t2["内容"]
y2 = t2["正确标签"]
for s,l in zip(s2, y2):
if str(l) == "nan":
continue
else:
s = s.replace(" ", "").replace("
", "").replace("
", "").replace(" ", "")
l = l.replace(" ", "").replace("
", "").replace("
", "").replace(" ", "")
self.s.append(s)
self.l.append(l)
s3 = t3["内容"]
y3 = t3["正确标签"]
for s,l in zip(s3, y3):
if str(l) == "nan":
continue
else:
s = s.replace(" ", "").replace("
", "").replace("
", "").replace(" ", "")
l = l.replace(" ", "").replace("
", "").replace("
", "").replace(" ", "")
self.s.append(s)
self.l.append(l)
def noSemantic_1(self):
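        # Export 话术/语义 pairs from 3000_1.xlsx to a space-separated CSV,
        # with the prediction column fixed to "无意义".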
path_in = "3000_1.xlsx"
path_out = "new_data/3000_1.csv"
t1 = pd.read_excel(path_in, sheet_name="Sheet1")
s1 = t1["话术"]
y1 = t1["语义"]
sentence = []
label = []
predict = []
for s, l in zip(s1, y1):
if str(l) == "nan":
continue
else:
s = s.replace(" ", "").replace("
", "").replace("
", "").replace(" ", "")
l = l.replace(" ", "").replace("
", "").replace("
", "").replace(" ", "")
sentence.append(s)
label.append(l)
predict.append("无意义")
        dataframe = pd.DataFrame({'sentence': sentence, "predict": predict, 'label': label})
dataframe.to_csv(path_out, index=False, sep=' ', encoding="utf8")
def noSemantic_2(self):
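        # Export 客户语句/语义小类 pairs from the 对话文本 sheet of 3000_2.xlsx,
        # with the prediction column fixed to "无意义".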
path_in = "3000_2.xlsx"
path_out = "new_data/3000_2.csv"
t1 = pd.read_excel(path_in, sheet_name="对话文本")
s1 = t1["客户语句"]
y1 = t1["语义小类"]
sentence = []
label = []
predict = []
for s, l in zip(s1, y1):
if str(l) == "nan":
continue
else:
s = s.replace(" ", "").replace("
", "").replace("
", "").replace(" ", "")
l = l.replace(" ", "").replace("
", "").replace("
", "").replace(" ", "")
sentence.append(s)
label.append(l)
predict.append("无意义")
dataframe = pd.DataFrame({'sentence': sentence, "predict": predict, 'label': label})
dataframe.to_csv(path_out, index=False, sep=' ', encoding="utf8")
def noSemantic_3(self):
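        # Export 语句/语义 pairs from 3000_3.xlsx, with the prediction column fixed to "无意义".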
path_in = "3000_3.xlsx"
path_out = "new_data/3000_3.csv"
t1 = pd.read_excel(path_in, sheet_name="Sheet1")
s1 = t1["语句"]
y1 = t1["语义"]
sentence = []
label = []
predict = []
for s, l in zip(s1, y1):
if str(l) == "nan":
continue
else:
s = s.replace(" ", "").replace("
", "").replace("
", "").replace(" ", "")
l = l.replace(" ", "").replace("
", "").replace("
", "").replace(" ", "")
sentence.append(s)
label.append(l)
predict.append("无意义")
dataframe = pd.DataFrame({'sentence': sentence, "predict": predict, 'label': label})
dataframe.to_csv(path_out, index=False, sep=' ', encoding="utf8")
def luyin_1(self):
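        # Export 内容/预测标签/正确标签 triples from 录音跟听_1.xlsx to both CSV and Excel,
        # skipping rows without a correct label.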
path_in = "录音跟听_1.xlsx"
path_out = "new_data/record_1.csv"
t1 = pd.read_excel(path_in, sheet_name="Sheet1")
s1 = t1["内容"]
p1 = t1["预测标签"]
y1 = t1["正确标签"]
sentence = []
label = []
predict = []
for s, l, pre in zip(s1, y1, p1):
if str(l) == "nan":
continue
else:
s = s.replace(" ", "").replace("
", "").replace("
", "").replace(" ", "")
l = l.replace(" ", "").replace("
", "").replace("
", "").replace(" ", "")
p = pre.replace(" ", "").replace("
", "").replace("
", "").replace(" ", "")
sentence.append(s)
label.append(l)
predict.append(p)
dataframe = pd.DataFrame({'sentence': sentence, "predict": predict, 'label': label})
dataframe.to_csv(path_out, index=False, sep=' ', encoding="utf8")
        dataframe.to_excel('new_data/record_1.xlsx', sheet_name='data', index=False)
def luyin_2(self):
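        # Same export as luyin_1, for 录音跟听_2.xlsx.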
path_in = "录音跟听_2.xlsx"
path_out = "new_data/record_2.csv"
t1 = pd.read_excel(path_in, sheet_name="Sheet1")
s1 = t1["内容"]
p1 = t1["预测标签"]
y1 = t1["正确标签"]
sentence = []
label = []
predict = []
for s, l, pre in zip(s1, y1, p1):
if str(l) == "nan":
continue
else:
s = s.replace(" ", "").replace("
", "").replace("
", "").replace(" ", "")
l = l.replace(" ", "").replace("
", "").replace("
", "").replace(" ", "")
p = pre.replace(" ", "").replace("
", "").replace("
", "").replace(" ", "")
sentence.append(s)
label.append(l)
predict.append(p)
dataframe = pd.DataFrame({'sentence': sentence, "predict": predict, 'label': label})
dataframe.to_csv(path_out, index=False, sep=' ', encoding="utf8")
        dataframe.to_excel('new_data/record_2.xlsx', sheet_name='data', index=False)
def luyin_3(self):
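        # Same export as luyin_1, for 录音跟听_3.xlsx; rows missing either the correct
        # or the predicted label are skipped.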
path_in = "录音跟听_3.xlsx"
path_out = "new_data/record_3.csv"
t1 = pd.read_excel(path_in, sheet_name="Sheet1")
s1 = t1["内容"]
p1 = t1["预测标签"]
y1 = t1["正确标签"]
sentence = []
label = []
predict = []
for s, l, pre in zip(s1, y1, p1):
if str(l) == "nan":
continue
if str(pre) == "nan":
continue
else:
s = s.replace(" ", "").replace("
", "").replace("
", "").replace(" ", "")
l = l.replace(" ", "").replace("
", "").replace("
", "").replace(" ", "")
p = pre.replace(" ", "").replace("
", "").replace("
", "").replace(" ", "")
sentence.append(s)
label.append(l)
predict.append(p)
dataframe = pd.DataFrame({'sentence': sentence, "predict": predict, 'label': label})
dataframe.to_csv(path_out, index=False, sep=' ', encoding="utf8")
        dataframe.to_excel('new_data/record_3.xlsx', sheet_name='data', index=False)
def liucheng(self):
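        # Export 语句/预测标签/语义 triples from 流程跟听.xlsx to procedure.csv/.xlsx,
        # skipping rows missing either label.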
path_in = "流程跟听.xlsx"
path_out = "new_data/procedure.csv"
t1 = pd.read_excel(path_in, sheet_name="Sheet1")
s1 = t1["语句"]
p1 = t1["预测标签"]
y1 = t1["语义"]
sentence = []
label = []
predict = []
for s, l, pre in zip(s1, y1, p1):
if str(l) == "nan":
continue
if str(pre) == "nan":
continue
else:
s = s.replace(" ", "").replace("
", "").replace("
", "").replace(" ", "")
l = l.replace(" ", "").replace("
", "").replace("
", "").replace(" ", "")
p = pre.replace(" ", "").replace("
", "").replace("
", "").replace(" ", "")
sentence.append(s)
label.append(l)
predict.append(p)
dataframe = pd.DataFrame({'sentence': sentence, "predict": predict, 'label': label})
dataframe.to_csv(path_out, index=False, sep=' ', encoding="utf8")
        dataframe.to_excel('new_data/procedure.xlsx', sheet_name='data', index=False)
def main(self):
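        # Merge both source workbooks and write the combined label/sentence pairs to all_0607.csv.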
self.read_path_1()
self.read_path_2()
dataframe = pd.DataFrame({'l': self.l, 's': self.s})
dataframe.to_csv("all_0607.csv", index=False, sep=' ', encoding="utf8")
# def update_1(self):
# path = "excel/3000_拒识语料.csv"
# data = pd.read_csv(path, sep=" ")
# sentence1 = data["sentence"].tolist()
# label1 = data["label"].tolist()
#
# t2 = pd.read_excel("excel/语义优化_0608_1.xlsx", sheet_name="data")
# sentence2 = t2["sentence"].tolist()
# label2 = t2["label"].tolist()
#
# t3 = pd.read_excel("excel/语义优化_0608_2.xlsx", sheet_name="data")
# sentence3 = t3["sentence"].tolist()
# label3 = t3["label"].tolist()
#
# s = sentence1 + sentence2 + sentence3
# l = label1 + label2 + label3
# with open("all.txt", "a+", encoding="utf8") as f:
# for _l,_s in zip(l, s):
    #             _l = _l.replace(" ", "").replace("\n", "").replace("\r", "")
    #             _s = _s.replace(" ", "").replace("\n", "").replace("\r", "")
# line = _l + " " + _s
    #             f.write(line + "\n")
def no_semantic(self):
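        # Concatenate the three 3000_*.csv exports into a single excel/nosemantic.xlsx.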
t1 = pd.read_csv("new_data/3000_1.csv", sep=" ")
t2 = pd.read_csv("new_data/3000_2.csv", sep=" ")
t3 = pd.read_csv("new_data/3000_3.csv", sep=" ")
df = pd.concat([t1, t2, t3], ignore_index=True)
        df.to_excel('excel/nosemantic.xlsx', sheet_name='data', index=False)
if __name__ == '__main__':
Process().no_semantic()