zoukankan      html  css  js  c++  java
  • python对文章词频的统计

    import os
    import re
    
    from nltk import ne_chunk, pos_tag, word_tokenize
    import nltk
    from docx import Document
    import langid
    import pandas as pd
    
    
    def readWord(rootdir=r'C:\Users\Administrator\Desktop\一季度'):
        """Collect the English paragraphs of every Word document in a folder.

        Each entry of ``rootdir`` is opened with ``docx.Document``; paragraphs
        that ``langid`` classifies as English are kept, each followed by a
        newline.

        Args:
            rootdir: Folder to scan. Defaults to the original hard-coded
                desktop folder. The default is a raw string because in a
                normal literal ``\\U`` starts a unicode escape and is a
                SyntaxError in Python 3.

        Returns:
            One string holding all English paragraphs, newline-terminated.
            Empty when no paragraph qualifies.
        """
        english_paragraphs = []
        # os.listdir returns every entry of the folder (files and directories).
        for entry in os.listdir(rootdir):
            path = os.path.join(rootdir, entry)
            print(path)
            document = Document(path)
            for paragraph in document.paragraphs:
                # langid.classify returns (language_code, score); keep English only.
                if langid.classify(paragraph.text)[0] == 'en':
                    english_paragraphs.append(paragraph.text + "\n")
        # Join once instead of growing a string with += inside the loops.
        return "".join(english_paragraphs)
    
    
    # Tokens that the tagger may label as proper nouns but that must not be
    # counted. Collected from the literals of the original filter conditions.
    # NOTE(review): the original chain also compared against several empty
    # strings, possibly characters lost when the code was published - confirm
    # whether more symbols belong here.
    _EXCLUDED_TOKENS = frozenset({
        "", "’", "”", "—", "…", "[", "]", "@", "/", "|",
        "s", "P", "II", "R", "A",
    })


    def _is_counted_proper_noun(word, tag):
        """Return True when ``word`` is tagged NNP/NNPS and is not excluded."""
        return tag in ("NNP", "NNPS") and word not in _EXCLUDED_TOKENS


    def get_entities():
        """Count proper nouns in the text from ``readWord`` and export them.

        The text is tokenized, POS-tagged and passed through ``ne_chunk``.
        Every token tagged ``NNP`` or ``NNPS`` that is not in
        ``_EXCLUDED_TOKENS`` is counted, whether it is a plain leaf or sits
        inside a named-entity subtree. The counts are written to ``c4i.csv``
        with the columns ``word`` and ``num``.

        Returns:
            None. The result is the CSV file written to the working directory.
        """
        sentence = readWord()
        chunked = ne_chunk(pos_tag(word_tokenize(sentence)))

        counts = {}
        for node in chunked:
            # Plain tokens are (word, tag) tuples; named entities are subtrees
            # whose leaves are such tuples. Checking the type instead of
            # len(node) == 2 keeps two-word entities from being mistaken for
            # a single (word, tag) pair.
            leaves = [node] if isinstance(node, tuple) else node.leaves()
            for word, tag in leaves:
                if _is_counted_proper_noun(word, tag):
                    counts[word] = counts.get(word, 0) + 1

        # Turn the mapping into a list of records so pandas can build a table
        # (a DataFrame has both a row index and named columns) for export.
        rows = [{"word": word, "num": num} for word, num in counts.items()]
        # Explicit columns keep the CSV header stable even with no matches.
        frame = pd.DataFrame(rows, columns=["word", "num"])
        # utf_8_sig writes a BOM at the start of the file.
        frame.to_csv('c4i.csv', encoding='utf_8_sig')


    if __name__ == '__main__':
        get_entities()

     使用的依赖库如下所示:

    python-docx==0.8.11
  • 相关阅读:
    填坑总结:python内存泄漏排查小技巧
    springMVC注解中@RequestMapping中常用参数value params 以及@RequestParam 详解
    springMVC 自定义类型转换器
    为什么Java需要lambda 表达式? 上帝爱吃苹果
    利器| Cypress 强大全新的 Web UI 测试框架应用尝鲜
    缺少锻炼面试的机会?城市群之北上广杭一起来了!
    实战 | 基于JMeter 完成典型电商场景(首页浏览)的性能压测
    一文搞定 pytest 自动化测试框架(一)
    测试面试 | Java 经典面试题汇总
    软件测试工程师成长痛点和职业发展建议
  • 原文地址:https://www.cnblogs.com/lxz123/p/15137721.html
Copyright © 2011-2022 走看看