zoukankan      html  css  js  c++  java
  • python对文章词频的统计

    import os
    import re
    
    from nltk import ne_chunk, pos_tag, word_tokenize
    import nltk
    from docx import Document
    import langid
    import pandas as pd
    
    
    def readWord(rootdir=r'C:\Users\Administrator\Desktop\一季度'):
        """Return the English paragraphs of every .docx file in a directory.

        Each paragraph that ``langid`` classifies as English is appended to the
        result followed by a newline; all other paragraphs are dropped.

        Args:
            rootdir: Directory to scan (not recursive). Defaults to the folder
                the script was originally written for.

        Returns:
            The concatenated English text, or ``""`` when the directory holds
            no .docx files or no English paragraphs.

        Raises:
            OSError: If ``rootdir`` does not exist or cannot be listed.
        """
        parts = []
        # Sorted so the order of the collected text does not depend on the OS.
        for name in sorted(os.listdir(rootdir)):
            path = os.path.join(rootdir, name)
            # python-docx only opens .docx packages: skip sub-directories,
            # other file types and the "~$" lock files Word leaves behind.
            if (not os.path.isfile(path)
                    or not name.lower().endswith('.docx')
                    or name.startswith('~$')):
                continue
            print(path)
            document = Document(path)
            for paragraph in document.paragraphs:
                if langid.classify(paragraph.text)[0] == 'en':
                    parts.append(paragraph.text + "\n")
        # Join once instead of growing a string with += inside the loop.
        return "".join(parts)
    
    
    # POS tags that NLTK's default tagger assigns to singular / plural proper nouns.
    _PROPER_NOUN_TAGS = ("NNP", "NNPS")

    # Tokens that are tagged as proper nouns but are really punctuation or
    # tokenizer debris; they are excluded from the frequency table.
    _EXCLUDED_TOKENS = frozenset([
        "", "’", "”", "—", "…", "[", "]", "@", "/", "|",
        "s", "P", "II", "R", "A",
    ])


    def _count_proper_nouns(tagged_tokens):
        """Count the proper-noun words in an iterable of ``(word, tag)`` pairs.

        Returns a dict mapping each word to its number of occurrences, in order
        of first appearance. Words listed in ``_EXCLUDED_TOKENS`` are ignored.
        """
        counts = {}
        for word, tag in tagged_tokens:
            if tag in _PROPER_NOUN_TAGS and word not in _EXCLUDED_TOKENS:
                counts[word] = counts.get(word, 0) + 1
        return counts


    def get_entities():
        """Count proper nouns in the text from ``readWord()`` and save them as CSV.

        The text is tokenized, POS-tagged and chunked with NLTK. Every token
        tagged ``NNP`` or ``NNPS`` is counted, and the result is written to
        ``c4i.csv`` in the current directory with the columns ``word`` and
        ``num``. The file is encoded as UTF-8 with a BOM.
        """
        sentence = readWord()

        tagged = pos_tag(word_tokenize(sentence))
        # ne_chunk groups named entities into sub-trees. leaves() flattens the
        # tree back into (word, tag) pairs so that every token is inspected,
        # not only the first token of each multi-word entity.
        leaves = ne_chunk(tagged).leaves() if tagged else []
        counts = _count_proper_nouns(leaves)

        # Convert the mapping into a list of row dicts so pandas can build a
        # table (DataFrame) from it and write it out.
        rows = [{"word": word, "num": num} for word, num in counts.items()]
        frame = pd.DataFrame(rows, columns=["word", "num"])
        frame.to_csv('c4i.csv', encoding='utf_8_sig')


    if __name__ == '__main__':
        get_entities()

     使用的依赖库如下所示(代码中还导入了 nltk、langid 和 pandas,同样需要安装):

    python-docx==0.8.11
  • 相关阅读:
    如何做实时监控?—— 参考 Spring Boot 实现
    如何做实时监控?—— 参考 Spring Boot 实现
    spring boot application properties配置详解
    Jrebel 6.2.1破解
    智能社-JS -wiki
    hibernate.properties
    Tomcat 的 socket bind failed的解决方法
    js 排序 SORT 各种方法
    java EE 如何使用Eclipse启动一个项目
    2016-06-06 数组的几个重要方法
  • 原文地址:https://www.cnblogs.com/lxz123/p/15137721.html
Copyright © 2011-2022 走看看