  • Using NLPIR for Chinese word segmentation and POS tagging

    Background

    Often, parsing text well takes more than just segmenting it and removing stopwords. Besides extracting keywords and new vocabulary, we also want extra information at each granularity, such as part-of-speech (POS) tags. In Python, NLPIR handles this task well. If you don't have NLPIR yet, see the article "NLPIR快速搭建" (a quick NLPIR setup guide), or download the Chinese NLP bundle I have already prepared (the NLP source collection).
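
    For orientation before the full listing: the nlpir wrapper used throughout this post exposes a seg function that returns (word, POS-tag) pairs, which is the shape all of the code below relies on. A minimal sketch of the call, assuming NLPIR is already set up as in the linked article:

    # -*- coding: utf-8 -*-
    import nlpir

    # nlpir.seg returns a list of (word, POS-tag) pairs
    for word, tag in nlpir.seg('我毕业于北京大学'):
        print word, '/', tag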

    The code (which doubles as my notes)

    # -*- coding: utf-8 -*-
    #
    # Author: 田丰 (FontTian)
    # Created: 2017/7/3
    # Email: fonttian@Gmaill.com
    # CSDN:http://blog.csdn.net/fontthrone
    
    import sys

    sys.path.append("../")  # make the local nlpir wrapper importable

    import nltk
    import nlpir

    # Python 2: set the process default encoding to UTF-8
    reload(sys)
    sys.setdefaultencoding('utf-8')

    import jieba
    from jieba import posseg
    
    
    def cutstrpos(txt):
        # Segmentation plus POS tagging (jieba)
        cutstr = posseg.cut(txt)
        result = ""
        for word, flag in cutstr:
            result += word + "/" + flag + ' '
        return result
    
    
    def cutstring(txt):
        # Segmentation only (jieba)
        cutstr = jieba.cut(txt)
        result = " ".join(cutstr)
        return result
    
    
    # Read the input file
    txtfileobject = open('txt/nltest1.txt')
    try:
        filestr = txtfileobject.read()
    finally:
        txtfileobject.close()
    
    
    # Word segmentation with NLPIR2016
    def ChineseWordsSegmentationByNLPIR2016(text):
        # nlpir.seg returns (word, POS) pairs; keep only the words
        txt = nlpir.seg(text)
        seg_list = []

        for t in txt:
            seg_list.append(t[0].encode('utf-8'))

        return seg_list
    
    
    stopwords_path = 'stopwords/stopwords1893.txt'  # stopword list
    
    
    # Remove stopwords
    def ClearStopWordsWithListByNLPIR2016(seg_list):
        mywordlist = []
        liststr = "/ ".join(seg_list)
        f_stop = open(stopwords_path)
        try:
            f_stop_text = f_stop.read()
            f_stop_text = unicode(f_stop_text, 'utf-8')
        finally:
            f_stop.close()
        # one stopword per line
        f_stop_seg_list = f_stop_text.split('\n')
        for myword in liststr.split('/'):
            if not (myword.strip() in f_stop_seg_list) and len(myword.strip()) > 1:
                mywordlist.append(myword)
        return ''.join(mywordlist)
    
    
    # print filestr
    filestr2 = ClearStopWordsWithListByNLPIR2016(ChineseWordsSegmentationByNLPIR2016(filestr)).replace(' ', '')
    
    # Segment the cleaned text with jieba and tag POS
    posstr = cutstrpos(filestr2)
    
    print '**** show is end ****'
    
    print ' '
    print 'This is posstr'
    print posstr
    
    strtag = [nltk.tag.str2tuple(word) for word in posstr.split()]
    # for item in strtag:
    #     print item
    strsBySeg = nlpir.seg(filestr)
    strsBySeg2 = nlpir.seg(filestr2)
    strsByParagraphProcess = nlpir.ParagraphProcess(filestr, 1)
    strsByParagraphProcessA = nlpir.ParagraphProcessA(filestr, ChineseWordsSegmentationByNLPIR2016(filestr)[0], 1)
    
    print ' '
    print ' '
    print '**** strtag ****'
    
    for word, tag in strtag:
        print word, "/", tag, "|",
    
    print ' '
    print ' '
    print '**** strsBySeg ****'
    for word, tag in strsBySeg:
        print word, "/", tag, "|",
    
    print ' '
    print ' '
    print '**** strsBySeg2 ****'
    for word, tag in strsBySeg2:
        print word, "/", tag, "|",
    
    print ' '
    print ' '
    print '**** strsByParagraphProcess ****'
    print strsByParagraphProcess
    
    # print ' '
    # print ' '
    # print '**** strsByParagraphProcessA ****'
    # 
    # for item in strsByParagraphProcessA:
    #     print item,
    
    print ' '
    print ' '
    print '**** show is end ****'
    
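    A note on the nltk.tag.str2tuple call above: it splits a "word/TAG" string at the last '/' and upper-cases the tag, which is why jieba's "word/flag" output can be fed straight into it. A small illustration (the tags here are just jieba's POS codes):

    # -*- coding: utf-8 -*-
    import nltk

    # str2tuple splits on the last '/' and upper-cases the tag part
    word, tag = nltk.tag.str2tuple('你好/l')
    print word, tag   # prints: 你好 L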

    Practical example

    NLPIR automatically segments and POS-tags person names (tag 'nr'). With this we can harvest custom new words, or extract the sentences that mention a certain class of people. Below is test code I wrote recently while building a project demo.

    # -*- coding: utf-8 -*-
    #
    # Author: 田丰 (FontTian)
    # Created: 2017/7/11
    # Email: fonttian@Gmaill.com
    # CSDN:http://blog.csdn.net/fontthrone
    from os import path
    from scipy.misc import imread
    import matplotlib.pyplot as plt
    import jieba
    from nlpir import *
    from wordcloud import WordCloud, ImageColorGenerator
    import sys
    reload(sys)
    sys.setdefaultencoding('utf-8')
    d = path.dirname(__file__)
    
    text = '接待钟世镇院士,筹备杨东奇部长接待事宜。'
    stopwords_path = 'stopwords/CNENstopwords.txt'  # stopword list
    number = 10
    
    def ShowByItem(List):
        print '********* show ', str(List), ' end *********'
        for item in List:
            print item,
        print
        print '********* show ', str(List), ' end *********'
    
    # Use NLPIR2016 to extract academician names
    def FindAcademicianNameByNLPIR2016(text, isAddYuanShi):
        txt = seg(text)
        seg_list = []

        # A person name ('nr') immediately followed by '院士' (academician);
        # stop one token early so txt[i + 1] never goes out of range
        for i in range(len(txt) - 1):
            if txt[i][1] == 'nr' and txt[i + 1][0] == '院士':
                if isAddYuanShi == 1:
                    seg_list.append(txt[i][0].encode('utf-8') + '院士')
                else:
                    seg_list.append(txt[i][0].encode('utf-8'))

        return seg_list
    
    str2 = FindAcademicianNameByNLPIR2016(text, 1)
    
    ShowByItem(str2)
    
    
    # Output
    ********* show  ['\xe9\x92\x9f\xe4\xb8\x96\xe9\x95\x87\xe9\x99\xa2\xe5\xa3\xab']  end *********
    钟世镇院士
    ********* show  ['\xe9\x92\x9f\xe4\xb8\x96\xe9\x95\x87\xe9\x99\xa2\xe5\xa3\xab']  end *********
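
    The list repr above shows raw UTF-8 bytes because the names were encoded to Python 2 byte strings with encode('utf-8'); printing an element directly renders the characters, as the middle line shows. A quick illustration:

    # -*- coding: utf-8 -*-
    name = '\xe9\x92\x9f\xe4\xb8\x96\xe9\x95\x87\xe9\x99\xa2\xe5\xa3\xab'  # UTF-8 bytes of 钟世镇院士

    print [name]                # the list repr escapes the bytes
    print name                  # the terminal decodes them: 钟世镇院士
    print name.decode('utf-8')  # explicit decode to a unicode object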

    As used in the demo:

    import pandas as pd

    # Use NLPIR2016 to extract academician names (fragment from the demo)
    def FindAcademicianNameByNLPIR2016(text, isAddYuanShi):
        txt = seg(text)
        seg_list = []

        for i in range(len(txt) - 1):
            if txt[i][1] == 'nr' and txt[i + 1][0] == '院士':
                if isAddYuanShi == 1:
                    seg_list.append(txt[i][0].encode('utf-8') + '院士')
                else:
                    seg_list.append(txt[i][0].encode('utf-8'))

        return seg_list
    
    
    # fullContent holds the full text of the reports (loaded elsewhere in the demo)
    strAcademicianName = FindAcademicianNameByNLPIR2016(fullContent, 1)
    strAcademicianName = list(set(strAcademicianName))  # de-duplicate
    # Store with pandas
    dfAcademicianName = pd.DataFrame(strAcademicianName)
    dfAcademicianName.columns = ['AcademicianName']
    dfAcademicianName.to_csv('csv/dfAcademicianName')
    # Load back with pandas
    dfNewWords = pd.read_csv("csv/dfNewWords")
    dfAcademicianName = pd.read_csv("csv/dfAcademicianName")

    # You could also add these to the user dictionary as new words
    # add_word(dfAcademicianName['AcademicianName'])
    
    # Extract every report that mentions an academician
    def GetAcademicianCSV(df, strColumn, df1):
        dfAcademicianName = pd.read_csv("csv/dfAcademicianName")
        listAcademicianName = list(dfAcademicianName['AcademicianName'])
        print type(listAcademicianName)

        mywordlistAcademicianName = []
        mywordlisttime = []
        mywordAca = []
        df1 = df1.copy()
        numlen = len(df1.index)
        # df1 is scanned word by word; matching rows are read back from df
        for i in range(numlen):
            for myword in df1.loc[i, strColumn].split():
                if (myword in listAcademicianName) and len(myword) > 1:
                    print myword
                    mywordlistAcademicianName.append(df.loc[i, strColumn])
                    mywordAca.append(myword)
                    mywordlisttime.append(df.loc[i, 'time'])

        return mywordlistAcademicianName, mywordlisttime, mywordAca
    
    # The returned data
    mywordlistAcademicianName, mywordlisttime, mywordAca = GetAcademicianCSV(df, 'content', df1)
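
    As the commented-out add_word line above hints, the harvested names can be registered as user-dictionary entries so that later segmentation keeps "X院士" together. A hedged sketch, assuming the nlpir wrapper's add_word takes one word per call (its exact signature is not shown in this post):

    # -*- coding: utf-8 -*-
    from nlpir import *
    import pandas as pd

    dfAcademicianName = pd.read_csv("csv/dfAcademicianName")
    for name in dfAcademicianName['AcademicianName']:
        add_word(name)  # assumed per-word registration API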

    The results:

    (figure: the extracted academician names)

    (figure: the extracted academician reports)
