# Read the OntoNotes 4 training split and collect [character, label] pairs.
#
# Names left at module level for the later steps of this script:
#   sentences - list of sentences; each sentence is a list of [char, label]
#   label_set - every distinct label string seen in the file
#   cnt_line  - number of lines read
#
# NOTE(review): each line is first cut with split(' ') and the first two
# pieces are then split on ' ' again.  As written, those pieces can never
# contain a space, so every line contributes exactly one [char, label] pair.
# Presumably the two fields were meant to be separated by a different
# delimiter (a tab?) - confirm against the real data file.
#
# Lines are not stripped, so when the label field is the last piece of a
# line it keeps that line's trailing newline.
sentences = []
label_set = set()
cnt_line = 0
# Read-only access is all that is needed, and the context manager closes the
# handle even when a malformed line raises.
with open("./data/ontonote4/train.char.bmes", "r", encoding="utf-8") as f:
    for cnt_line, line in enumerate(f, start=1):
        splits = line.split(' ')
        chars = splits[0].split(' ')
        labels = splits[1].split(' ')
        sentence = []
        # Labels are looked up by position so that a line with fewer labels
        # than characters fails loudly instead of being silently truncated.
        for i, char in enumerate(chars):
            sentence.append([char, labels[i]])
            label_set.add(labels[i])
        sentences.append(sentence)
# Write the corpus back out as one "char label" pair per line, folding the
# BMES prefixes into BIO: S -> B, M and E -> I.  Every other prefix
# (B, O, ...) is written unchanged.
#
# Only the text written to disk is converted; the label strings stored in
# `sentences` are left as they were loaded.
bmes_to_bio = {'S': 'B', 'M': 'I', 'E': 'I'}
with open("./data/ontonote4/train-trans.char.bmes", "w", encoding="utf-8") as f:
    for sen in sentences:
        for char, label in sen:
            prefix = label[0]
            label = bmes_to_bio.get(prefix, prefix) + label[1:]
            f.write(f'{char} {label}\n')
        # No extra blank line is written after a sentence: per the original
        # author's note, the data already ends with one.
# Corpus statistics.  Input: `sentences` / `label_set` from the loading step.
# Intended output per entity class (per the original author's note): label
# name, occurrence count, number of B tags, number of I tags, number of
# distinct entities.
#
# class_set: entity class names, i.e. the part of a label after its
#            two-character "X-" prefix.  The original author recorded
#            {'GPE', 'LOC', 'ORG', 'PER'} for this corpus.
# class_map: class name -> [tagged characters, B count, I count].
#
# S- tags are accepted next to B- tags because both open an entity (the
# conversion step rewrites S to B).  Labels are stripped first because lines
# are not stripped on load, so a label may carry a trailing newline.
class_set = {
    label.strip()[2:]
    for label in label_set
    if label.strip().startswith(('B', 'S'))
}
class_map = {name: [0, 0, 0] for name in class_set}
# Occurrence counts per entity class:
#   class_map[name] = [tagged characters, entity-start tags, entity-inside tags]
#
# Both tagging schemes are understood, so the numbers are the same whether
# `sentences` holds the original BMES labels or converted BIO labels:
#   B / S    -> entity start  (counted in slot 1)
#   I / M / E -> entity inside (counted in slot 2)
for sen in sentences:
    for word in sen:
        # Labels may still carry the newline of the line they were read from.
        label = word[1].strip()
        if not label or label == 'O':
            continue
        # setdefault keeps a class that is missing from class_map from
        # aborting the whole pass with a KeyError.
        counts = class_map.setdefault(label[2:], [0, 0, 0])
        counts[0] += 1
        if label[0] in ('B', 'S'):
            counts[1] += 1
        elif label[0] in ('I', 'M', 'E'):
            counts[2] += 1
# Distinct entity strings per class:
#   class_entity[name] = set of entity surface strings tagged with that class
#
# An entity is a run of characters opened by a B/S tag (or by a stray
# inside tag) and extended by I/M/E tags.  It is closed by the next B/S tag,
# by an O tag, or by the end of the sentence.  The class recorded for an
# entity is the one carried by its last character.
class_entity = {name: set() for name in class_set}
for sen in sentences:
    entity = ''
    entity_class = ''
    for word in sen:
        char = word[0]
        # Labels may still carry the newline of the line they were read from.
        label = word[1].strip()
        prefix = label[:1]
        if prefix in ('B', 'S', 'O'):
            # Any of these tags ends the entity collected so far.
            if entity:
                class_entity.setdefault(entity_class, set()).add(entity)
                entity = ''
            if prefix != 'O':
                entity = char
                entity_class = label[2:]
        elif prefix in ('I', 'M', 'E'):
            entity += char
            entity_class = label[2:]
    # Flush an entity that runs up to the end of the sentence.
    if entity:
        class_entity.setdefault(entity_class, set()).add(entity)