GENIA命名实体数据集解析
欢迎联系2448267954@qq.com指正交流。
代码
import xml.sax
# Path of the GENIA term-annotation XML file handed to the SAX parser.
xml_ds="GENIA-term.xml"
# Intended output path.
# NOTE(review): the visible code opens the literal "result.txt" and never
# reads this name - confirm whether it should be wired in.
out_file="result.txt"
class SentenceHandler(xml.sax.ContentHandler):
    """SAX handler that flattens term-annotated XML into one token per line.

    Every whitespace-separated word is written as ``index word tag1|tag2``,
    where the tags are the ``sem`` attributes of all ``<cons>`` elements
    enclosing the word (outermost first). The tag field is empty when the
    word is outside any labelled ``<cons>``. A blank line is written at the
    start of every ``<sentence>`` element. ``index`` counts words across the
    whole document and is never reset.

    Args:
        out_path: File the token lines are written to (UTF-8). Defaults to
            ``"result.txt"``.
    """

    def __init__(self, out_path="result.txt"):
        super().__init__()
        # Not used by this handler; retained so existing attribute readers
        # keep working.
        self.text = ""
        self.labels = ""
        # Labels of the currently open <cons> elements, outermost first.
        self.totag = []
        # Running word counter used as the first output column.
        self.indexer = 0
        # Name of the most recently opened element.
        self.CurrentData = ""
        # One flag per open <cons>: True if that element pushed a label onto
        # ``totag``. Keeps pops balanced when a <cons> has no ``sem``.
        self._pushed = []
        # Character chunks received since the last element boundary.
        self._pending = []
        self.writer = open(out_path, 'w', encoding='utf8')

    # Element start handling
    def startElement(self, tag, attributes):
        """Emit buffered text, then update the label stack for ``tag``."""
        # Text seen so far belongs to the labels active *before* this tag.
        self._flush()
        self.CurrentData = tag
        if tag == "cons":
            sem = attributes.get("sem")
            self._pushed.append(sem is not None)
            if sem is not None:
                self.totag.append(sem)
        elif tag == "sentence":
            # Blank line separates sentences in the output.
            self.writer.write("\n")

    # Element end handling
    def endElement(self, tag):
        """Emit buffered text, then drop the label of a closing ``<cons>``."""
        self._flush()
        # Pop exactly the label this <cons> pushed (if it pushed one), so the
        # labels of still-open outer <cons> elements stay in effect.
        if tag == "cons" and self._pushed and self._pushed.pop():
            if self.totag:
                self.totag.pop()

    # Character data handling
    def characters(self, content):
        """Buffer character data until the next element boundary.

        The parser may deliver one run of text in several chunks; tokenising
        each chunk separately would cut words apart, so chunks are joined
        first and split in ``_flush``.
        """
        self._pending.append(content)

    def endDocument(self):
        """Write any remaining text and close the output file."""
        self.close()

    def close(self):
        """Flush buffered text and close the output file (safe to repeat)."""
        if not self.writer.closed:
            self._flush()
            self.writer.close()

    def _flush(self):
        """Write the buffered text as token lines tagged with active labels."""
        if not self._pending:
            return
        text = "".join(self._pending)
        self._pending = []
        labels = "|".join(self.totag)
        # split() with no argument drops empty strings and treats spaces,
        # tabs and newlines alike as separators.
        for word in text.split():
            self.writer.write(' '.join([str(self.indexer), word, labels]) + '\n')
            self.indexer += 1
if __name__ == "__main__":
    # Parse the GENIA XML file with a non-namespace-aware SAX parser and let
    # SentenceHandler write the flattened token lines.
    parser = xml.sax.make_parser()
    parser.setFeature(xml.sax.handler.feature_namespaces, 0)
    handler = SentenceHandler()
    try:
        parser.setContentHandler(handler)
        parser.parse(xml_ds)
    finally:
        # Always close the output file so buffered lines reach disk even
        # when parsing fails part-way; closing an already-closed file is a
        # no-op.
        handler.writer.close()
输出格式
每行一个词，句子之间以空行分隔；词不在任何 cons 标注内时标签字段为空：
index word tag1|tag2|tag3