  • Using NLPIR for Chinese word segmentation and POS tagging

    Background

    In many cases, good text analysis takes more than just segmenting the text and removing stopwords. Besides extracting keywords and new words, we often also need extra information about each token, such as its part-of-speech (POS) tag. In Python, NLPIR handles this task well. If you do not have NLPIR yet, you can refer to the article NLPIR快速搭建, or directly download the Chinese NLP package I have already prepared, NLP源码集合.
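
    Before the full notes below, here is a minimal sketch of the core call, assuming the `nlpir` wrapper from the setup article above is importable: `nlpir.seg` takes a piece of text and returns a list of (word, POS-tag) pairs.

    # -*- coding: utf-8 -*-
    # minimal sketch: segment a sentence and print word/POS pairs
    import nlpir

    for word, tag in nlpir.seg(u'我喜欢自然语言处理'):
        print word, '/', tag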

    The code (these are also my notes)

    # -*- coding: utf-8 -*-
    #
    # Author: 田丰 (FontTian)
    # Created: 2017/7/3
    # Email: fonttian@Gmaill.com
    # CSDN: http://blog.csdn.net/fontthrone
    
    import sys

    # make sure the local nlpir wrapper can be found before importing it
    sys.path.append("../")

    reload(sys)
    sys.setdefaultencoding('utf-8')

    import nltk
    import nlpir
    import jieba
    from jieba import posseg
    
    
    def cutstrpos(txt):
        # segment and POS-tag with jieba.posseg
        cutstr = posseg.cut(txt)
        result = ""
        for word, flag in cutstr:
            result += word + "/" + flag + ' '
        return result
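
    # quick check of the helper above (the expected tags are illustrative):
    #   cutstrpos(u'我爱北京')  ->  我/r 爱/v 北京/ns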
    
    
    def cutstring(txt):
        # plain segmentation with jieba
        cutstr = jieba.cut(txt)
        result = " ".join(cutstr)
        return result
    
    
    # read the input file
    txtfileobject = open('txt/nltest1.txt')
    try:
        filestr = txtfileobject.read()
    finally:
        txtfileobject.close()
    
    
    # segment with NLPIR2016
    def ChineseWordsSegmentationByNLPIR2016(text):
        txt = nlpir.seg(text)
        seg_list = []
    
        for t in txt:
            seg_list.append(t[0].encode('utf-8'))
    
        return seg_list
    
    
    stopwords_path = 'stopwords/stopwords1893.txt'  # stopword list
    
    
    # remove stopwords from a segmented word list
    def ClearStopWordsWithListByNLPIR2016(seg_list):
        mywordlist = []
        liststr = "/ ".join(seg_list)
        f_stop = open(stopwords_path)
        try:
            f_stop_text = f_stop.read()
            f_stop_text = unicode(f_stop_text, 'utf-8')
        finally:
            f_stop.close()
        f_stop_seg_list = f_stop_text.split('\n')
        for myword in liststr.split('/'):
            if not (myword.strip() in f_stop_seg_list) and len(myword.strip()) > 1:
                mywordlist.append(myword)
        return ''.join(mywordlist)
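
    # note: the caller below joins this result and strips all spaces,
    # then re-segments the cleaned text with jieba.posseg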
    
    
    # print filestr
    filestr2 = ClearStopWordsWithListByNLPIR2016(ChineseWordsSegmentationByNLPIR2016(filestr)).replace(' ', '')
    
    # Chinese segmentation with POS tagging (via jieba.posseg)
    posstr = cutstrpos(filestr2)
    
    print '**** show is end ****'
    
    print ' '
    print 'This is posstr'
    print posstr
    
    strtag = [nltk.tag.str2tuple(word) for word in posstr.split()]
    # for item in strtag:
    #     print item
    strsBySeg = nlpir.seg(filestr)
    strsBySeg2 = nlpir.seg(filestr2)
    strsByParagraphProcess = nlpir.ParagraphProcess(filestr, 1)
    strsByParagraphProcessA = nlpir.ParagraphProcessA(filestr, ChineseWordsSegmentationByNLPIR2016(filestr)[0], 1)
    
    print ' '
    print ' '
    print '**** strtag ****'
    
    for word, tag in strtag:
        print word, "/", tag, "|",
    
    print ' '
    print ' '
    print '**** strsBySeg ****'
    for word, tag in strsBySeg:
        print word, "/", tag, "|",
    
    print ' '
    print ' '
    print '**** strsBySeg2 ****'
    for word, tag in strsBySeg2:
        print word, "/", tag, "|",
    
    print ' '
    print ' '
    print '**** strsByParagraphProcess ****'
    print strsByParagraphProcess
    
    # print ' '
    # print ' '
    # print '**** strsByParagraphProcessA ****'
    # 
    # for item in strsByParagraphProcessA:
    #     print item,
    
    print ' '
    print ' '
    print '**** show is end ****'
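
    One detail worth noting in the listing above: `nltk.tag.str2tuple` splits a 'word/TAG' string at the rightmost '/' and upper-cases the tag, which is why jieba's lowercase tags come back capitalized. A minimal sketch:

    # -*- coding: utf-8 -*-
    from nltk.tag import str2tuple

    # str2tuple splits at the rightmost '/' and upper-cases the tag
    word, tag = str2tuple(u'中国/ns')
    print word, tag  # -> 中国 NS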
    

    A practical example

    NLPIR automatically segments and POS-tags person names. With this feature we can collect custom new words, or extract the sentences related to a certain kind of person. Below is test code I wrote recently while building a project demo.

    # -*- coding: utf-8 -*-
    #
    # Author: 田丰 (FontTian)
    # Created: 2017/7/11
    # Email: fonttian@Gmaill.com
    # CSDN: http://blog.csdn.net/fontthrone
    from os import path
    from scipy.misc import imread
    import matplotlib.pyplot as plt
    import jieba
    from nlpir import *
    from wordcloud import WordCloud, ImageColorGenerator
    import sys
    reload(sys)
    sys.setdefaultencoding('utf-8')
    d = path.dirname(__file__)
    
    text = '接待钟世镇院士,筹备杨东奇部长接待事宜。'
    stopwords_path = 'stopwords/CNENstopwords.txt'  # stopword list
    number = 10
    
    def ShowByItem(List):
        print '********* show ', str(List), ' end *********'
        for item in List:
            print item,
        print
        print '********* show ', str(List), ' end *********'
    
    #  use NLPIR2016 to extract academician names
    def FindAcademicianNameByNLPIR2016(text, isAddYuanShi):
        txt = seg(text)
        seg_list = []

        # look ahead one token, so stop at len(txt) - 1
        for i in range(len(txt) - 1):
            if txt[i][1] == 'nr' and txt[i + 1][0] == u'院士':
                if isAddYuanShi == 1:
                    seg_list.append(txt[i][0].encode('utf-8') + '院士')
                else:
                    seg_list.append(txt[i][0].encode('utf-8'))

        return seg_list
    
    str2 = FindAcademicianNameByNLPIR2016(text,1)
    
    ShowByItem(str2)
    
    
    # Output
    ********* show  ['\xe9\x92\x9f\xe4\xb8\x96\xe9\x95\x87\xe9\x99\xa2\xe5\xa3\xab']  end *********
    钟世镇院士
    ********* show  ['\xe9\x92\x9f\xe4\xb8\x96\xe9\x95\x87\xe9\x99\xa2\xe5\xa3\xab']  end *********
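
    The escape sequences in the first and last lines are just Python 2's repr of UTF-8 byte strings inside a list; the items themselves print as Chinese. A quick sketch to display the list readably:

    # join the UTF-8 byte strings into one printable string
    print ', '.join(str2)  # -> 钟世镇院士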

    As used in the demo:

    import pandas as pd

    # use NLPIR2016 to extract academician names
    def FindAcademicianNameByNLPIR2016(text, isAddYuanShi):
        txt = seg(text)
        seg_list = []

        # look ahead one token, so stop at len(txt) - 1
        for i in range(len(txt) - 1):
            if txt[i][1] == 'nr' and txt[i + 1][0] == u'院士':
                if isAddYuanShi == 1:
                    seg_list.append(txt[i][0].encode('utf-8') + '院士')
                else:
                    seg_list.append(txt[i][0].encode('utf-8'))

        return seg_list

    # fullContent holds the full report text, loaded elsewhere in the demo
    strAcademicianName = FindAcademicianNameByNLPIR2016(fullContent, 1)
    strAcademicianName = list(set(strAcademicianName))
    # store with pandas
    dfAcademicianName = pd.DataFrame(strAcademicianName)
    dfAcademicianName.columns = ['AcademicianName']
    dfAcademicianName.to_csv('csv/dfAcademicianName')
    # read back with pandas
    dfNewWords = pd.read_csv("csv/dfNewWords")
    dfAcademicianName = pd.read_csv("csv/dfAcademicianName")

    # you can also add these to the user dictionary as new words
    # add_word(dfAcademicianName['AcademicianName'])
    
    # extract every report that mentions an academician
    def GetAcademicianCSV(df,strColumn,df1):
        dfAcademicianName = pd.read_csv("csv/dfAcademicianName")
        listAcademicianName = list(dfAcademicianName['AcademicianName'])
        print type(listAcademicianName)
    
        mywordlistAcademicianName =[]
        mywordlisttime = []
        mywordAca = []
        df1 = df1.copy()
        numlen = len(df1.index)
        for i in range(numlen):
            for myword in df1.loc[i, strColumn].split():
                if (myword in listAcademicianName) and len(myword) > 1:
                    print myword
                    mywordlistAcademicianName.append(df.loc[i, strColumn])
                    mywordAca.append(myword)
                    mywordlisttime.append(df.loc[i, 'time'])
    
        return mywordlistAcademicianName,mywordlisttime,mywordAca
    
    # the returned values
    mywordlistAcademicianName, mywordlisttime,mywordAca = GetAcademicianCSV(df,'content',df1)
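
    The three returned lists line up index by index, so they can be stored the same way as the names above. A sketch (the path 'csv/dfAcademicianReports' is made up for illustration):

    # one row per matched report: who is mentioned, when, and the full text
    dfReports = pd.DataFrame({
        'AcademicianName': mywordAca,
        'time': mywordlisttime,
        'content': mywordlistAcademicianName,
    })
    dfReports.to_csv('csv/dfAcademicianReports', index=False)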

    The results are as follows

    The extracted academician names (screenshot omitted)

    The extracted academician reports (screenshot omitted)

  • Original post: https://www.cnblogs.com/fonttian/p/9162807.html