当我用Stanford CoreNLP和A Python wrapper for the Java Stanford Core NLP tools(NLP的python调用工具)进行句法分析时,遇到一件很讨人厌的事情。
"(ROOT (IP (ADVP (AD \u4f46)) (NP (NN \u52a0\u5de5\u5382)) (VP (ADVP (AD \u8fd8\u662f)) (VP (VV \u7ee7\u7eed) (VP (VP (VV \u71c3\u70e7) (NP (NN \u80f6) (NN \u5236\u54c1))) (VP (VV \u6392\u653e) (NP (NN \u5e9f\u6c14)))))) (PU \u3002)))"
句法分析结果如上所示,但是并没有相关的python包能把这样一个包含层次结构的字符串结构化。(好吧,可能是我了解不深,如果您知道有这样的包,请一定要告诉我)我所说的结构化,是指把上述字符串转换成json结构,方便继续分析。
[{u'IP 2': [{u'ADVP 3': u'AD \u4f46 4'}, {u'NP 5': u'NN \u52a0\u5de5\u5382 6'}, {u'VP 7': [{u'ADVP 8': u'AD \u8fd8\u662f 9'}, {u'VP 10': [u'VV \u7ee7\u7eed 11', {u'VP 12': [{u'VP 13': [u'VV \u71c3\u70e7 14', {u'NP 15': [u'NN \u80f6 16', u'NN \u5236\u54c1 17']}]}, {u'VP 18': [u'VV \u6392\u653e 19', {u'NP 20': u'NN \u5e9f\u6c14 21'}]}]}]}]}, u'PU \u3002 22']}]
#ROOT为最外层ID=1,其他按字符的出现次序递增
#字符后面加个ID 是当字符相同时用ID 区别他们
故,自己写了一个算法,先把算法逻辑po上。
#1.分层及父级计算算法 #1.1分层: #条目所在层数=条目ID值-条目前”)”数目 #1.2条目的父级条目计算公式: #if ∆”)”==0:父ID=self.ID-1 #if ∆”)”>0:父ID=查找self.ID之前的数据,选取最后层数=当前条目层数的父ID为自身的父ID(其中∆”)”为相邻的两个条目的右括号数目之差,如NN的父ID为8、VP的父ID为7)
附上代码:
# -*- coding: utf-8 -*-
"""Turn a bracketed constituency-parse string into nested dicts and lists.

Input looks like ``(ROOT (IP (NP (NN x)) (VP (VV y))))``.  Every label is
given a 1-based id in order of appearance; the id is appended to the label
text so that repeated labels (several ``VP`` nodes, say) stay distinct when
used as dictionary keys.

The code runs on both Python 2 and Python 3.
"""
import re
from functools import reduce  # a builtin on Python 2, functools-only on Python 3
from pprint import pprint

from pandas import DataFrame  # noqa: F401 -- no longer used below; import left in place


class Transtrees(object):
    """Parse a bracketed tree string into per-node depth and parent ids.

    For the node with 0-based index ``i``:

    * ``bk_count`` is the number of ``)`` that occur before the node's ``(``;
    * ``floor`` (depth) is ``i - bk_count``;
    * the parent is node ``i - 1`` when ``bk_count`` equals the previous
      node's ``bk_count``; otherwise it is the parent of the most recent
      earlier node that has the same ``floor``.

    Example for ``u'(ROOT(IP(LCP(IP(NP(NN)) VP(VV)))))'`` (ids shown 1-based,
    as in the original write-up)::

              ROOT IP LCP IP NP NN VP VV
        id      1   2  3   4  5  6  7  8
        ")"     0   0  0   0  0  0  2  2
        floor   1   2  3   4  5  6  5  6
        fid     0   1  2   3  4  5  4  7

    Attributes:
        _nlist: list of ``[index, text, bk_count, floor]`` (0-based index).
        _fid_dict: ``{index: parent_index}`` (0-based; the first node maps
            to ``-1``).
        _tree_data: list of ``[id, "text id", parent_id]``; all three items
            are strings and the ids are 1-based.
    """

    def __init__(self, strtree):
        """Parse *strtree* immediately and populate all three tables."""
        self._strtree = strtree
        self._nlist = self.get_ID_COUNT()
        self._fid_dict = self.get_fid()
        self._tree_data = self.get_tree_data()

    def get_ID_COUNT(self):
        """Build, store and return ``_nlist``.

        The table is rebuilt from scratch on every call, so calling this
        method again does not duplicate rows.
        """
        text = self._strtree
        # Every run of non-parenthesis characters is a node label; a lone
        # space is just the separator between two sibling groups.
        names = [name for name in re.findall(r"[^()]+", text) if name != ' ']

        # closes_before_open[k] == number of ')' seen before the k-th '('.
        closes_before_open = []
        closed = 0
        for char in text:
            if char == '(':
                closes_before_open.append(closed)
            elif char == ')':
                closed += 1

        rows = []
        for index, name in enumerate(names):
            if closes_before_open:
                # When there are more labels than '(' the surplus labels
                # reuse the count of the last '('.
                position = min(index, len(closes_before_open) - 1)
                bk_count = closes_before_open[position]
            else:
                # No '(' at all: count ')' in everything but the last char.
                bk_count = text[:-1].count(')')
            rows.append([index, name, bk_count, index - bk_count])

        self._nlist = rows
        return rows

    def get_fid(self):
        """Build, store and return ``_fid_dict`` from ``_nlist``.

        A node for which neither parent rule applies gets no entry, which
        makes ``get_tree_data`` raise ``KeyError`` for it.
        """
        rows = self._nlist
        parents = {}
        # floor -> parent of the most recent node (index >= 1) on that floor
        parent_at_floor = {}
        for index in range(1, len(rows)):
            bk_count, floor = rows[index][2], rows[index][3]
            if bk_count == rows[index - 1][2]:
                # Nothing was closed since the previous node, so this node
                # is nested directly inside it.
                parents[index] = rows[index - 1][0]
            elif floor in parent_at_floor:
                # Something was closed: share the parent of the latest
                # earlier node on the same floor.
                parents[index] = parent_at_floor[floor]
            if index in parents:
                parent_at_floor[floor] = parents[index]
        parents[0] = -1
        self._fid_dict = parents
        return parents

    def get_tree_data(self):
        """Build, store and return ``_tree_data`` (1-based ids, as strings)."""
        data = []
        for index, row in enumerate(self._nlist):
            node_id = str(index + 1)
            parent_id = str(self._fid_dict[index] + 1)
            data.append([node_id, row[1] + ' ' + node_id, parent_id])
        self._tree_data = data
        return data


class bandModel(object):
    """Helper node that binds parent/child relations between tree entries."""

    def __init__(self, ID, TEXT, FID):
        self._ID = ID
        self._TEXT = TEXT
        self._FID = FID
        self._children = []

    def addChild(self, *child):
        """Append one or more child nodes."""
        self._children.extend(child)

    def printTree(self, blist):
        """Append ``{parent_text: child_text}`` pairs to *blist*, deepest first.

        For ``u'(ROOT(IP(LCP(IP(NP(NN)) VP(VV)))))'`` the resulting list is
        ``[{VP:VV},{NP:NN},{IP:NP},{LCP:IP},{IP:LCP},{ROOT:IP}]``.
        """
        # An explicit loop is required: on Python 3 ``map`` is lazy, so
        # using it only for its side effects would never recurse.
        for child in self._children:
            child.printTree(blist)
        for child in self._children:
            blist.append({self._TEXT: child._TEXT})


def get_tree(a, b):
    """Fold one ``{parent: child}`` pair *b* into the partial tree *a*.

    Intended as the reducer for the list produced by
    ``bandModel.printTree``.  *a* is modified in place and returned, except
    when neither the parent nor the child is already a key of *a*; in that
    case a new merged dict is returned.
    """
    parent, child = next(iter(b.items()))

    if child in a:
        # The child's own subtree is already built: move it under the parent.
        subtree = {child: a.pop(child)}
        if parent not in a:
            a[parent] = subtree
        elif isinstance(a[parent], list):
            a[parent].append(subtree)
        else:
            a[parent] = [a[parent], subtree]
        return a

    if parent in a:
        # The child is a leaf and the parent already has content.
        if isinstance(a[parent], list):
            a[parent].append(child)
        else:
            a[parent] = [a[parent], child]
        return a

    merged = dict(a)
    merged.update(b)
    return merged


def get_id(tree, strs=None):
    """Return the 1-based ids of nodes whose text starts with *strs*.

    *strs* is used as a regular expression anchored at the start of the
    node text.  ``None`` matches nothing and yields an empty list.
    """
    if strs is None:
        return []
    pattern = re.compile("^" + strs)
    return [
        index + 1
        for index, row in enumerate(tree._nlist)
        if len(pattern.findall(row[1])) == 1
    ]


def get_result(tree, strs):
    """Look up *strs* in *tree* and return the structure below each match.

    Returns:
        ``(tree_list, id_list)`` where ``id_list`` holds the ids of the
        matching nodes and ``tree_list`` holds, per match, either the nested
        structure rooted at that node or a message saying it is a leaf.
        When nothing matches, the first item is an error message string and
        ``id_list`` is empty.
    """
    nodes = [bandModel(row[0], row[1], row[2]) for row in tree._tree_data]
    by_id = dict((node._ID, node) for node in nodes)
    for node in nodes:
        parent = by_id.get(node._FID)
        if parent is not None:
            parent.addChild(node)

    id_list = get_id(tree, strs)
    if not id_list:
        return "'%s' is not the nodes of the tree!" % strs, id_list

    tree_list = []
    for node_id in id_list:
        node = nodes[node_id - 1]
        if not node._children:
            tree_list.append("%s is the deepest nodes of the tree!"
                             % tree._nlist[node_id - 1][1])
            continue
        # A fresh list per match: get_tree mutates the dicts it folds, so
        # sharing one list between matches would corrupt later results.
        pairs = []
        node.printTree(pairs)
        subtree = reduce(get_tree, pairs)
        if subtree:
            tree_list.append(subtree)
    return tree_list, id_list


if __name__ == '__main__':
    tree = Transtrees(
        u"(ROOT (IP (ADVP (AD \u4f46)) (NP (NN \u52a0\u5de5\u5382)) "
        u"(VP (ADVP (AD \u8fd8\u662f)) (VP (VV \u7ee7\u7eed) "
        u"(VP (VP (VV \u71c3\u70e7) (NP (NN \u80f6) (NN \u5236\u54c1))) "
        u"(VP (VV \u6392\u653e) (NP (NN \u5e9f\u6c14)))))) (PU \u3002)))"
    )
    tree_list, id_list = get_result(tree, 'IP')
    pprint(tree_list)
完全手工,因为只是一个抽取工具,所以没有做相应的优化。
个人劳动成果,转载请注明。