zoukankan      html  css  js  c++  java
  • 文件方式实现完整的英文词频统计实例

    1.读入待分析的字符串

    2.分解提取单词 

    3.计数字典

    4.排除语法型词汇

    5.排序

    6.输出TOP(20)

    from collections import Counter

    # Grammatical (function) words that are left out of the ranking.
    # They are lower-case, so the text must be lower-cased before filtering.
    EXCLUDED_WORDS = frozenset({'the', 'of', 'and', 'on', 'a', 'in', 'by', 'since'})

    # Punctuation characters that are turned into spaces before splitting.
    PUNCTUATION = ',.?-_!'
    _PUNCTUATION_TO_SPACE = str.maketrans(PUNCTUATION, ' ' * len(PUNCTUATION))


    def count_words(text, excluded=EXCLUDED_WORDS):
        """Count how often each word occurs in *text*.

        The text is lower-cased, the characters in ``PUNCTUATION`` are replaced
        by spaces, and the result is split on whitespace. Words contained in
        *excluded* are not counted.

        Returns a ``collections.Counter`` mapping word -> number of occurrences,
        with keys in order of first appearance.
        """
        # str.lower() returns a new string; the result has to be kept, otherwise
        # "The"/"In" slip past the lower-case exclusion set.
        normalized = text.lower().translate(_PUNCTUATION_TO_SPACE)
        # A single pass over the words instead of list.count() per distinct word.
        return Counter(word for word in normalized.split() if word not in excluded)


    def top_words(text, limit=20, excluded=EXCLUDED_WORDS):
        """Return up to *limit* ``(word, count)`` pairs, most frequent first.

        Words with equal counts keep the order of their first appearance in
        *text*. Fewer than *limit* pairs are returned when the text does not
        contain that many distinct words.
        """
        return count_words(text, excluded).most_common(limit)


    def main(path='test.py'):
        """Read the text file at *path* and print its 20 most frequent words."""
        # The context manager closes the file even if reading fails.
        with open(path, 'r') as source:
            text = source.read()
        for entry in top_words(text):
            print(entry)


    if __name__ == '__main__':
        main()
    

      结果

    #单词列表
    ["Myanmar's", 'Aung', 'San', 'Suu', 'Kyi', 'is', 'facing', 'mounting', 'international', 'pressure', 'for', 'her', 'handling', 'of', 'violence', 'in', 'Rakhine', 'state', 'and', 'the', 'Rohingya', 'refugee', 'crisis', 'In', 'a', 'speech', 'on', 'Tuesday', 'the', 'de', 'facto', 'leader', 'condemned', 'rights', 'abuses', 'but', 'did', 'not', 'blame', 'the', 'army', 'or', 'address', 'allegations', 'of', 'ethnic', 'cleansing', 'Leaders', 'and', 'diplomats', 'from', 'several', 'countries', 'have', 'since', 'expressed', 'strong', 'disappointment', 'with', 'her', 'stance', 'More', 'than', '400', '000', 'Rohingya', 'have', 'fled', 'to', 'Bangladesh', 'since', 'late', 'August', 'The', 'latest', 'unrest', 'in', 'troubled', 'Rakhine', 'was', 'sparked', 'by', 'deadly', 'attacks', 'on', 'police', 'stations', 'across', 'the', 'state', 'last', 'month', 'blamed', 'on', 'a', 'newly', 'emerged', 'militant', 'group', 'the', 'Arakan', 'Rohingya', 'Salvation', 'Army', '(Arsa)', 'Scores', 'of', 'people', 'were', 'killed', 'in', 'an', 'ensuing', 'military', 'crackdown', 'and', 'there', 'are', 'widespread', 'allegations', 'of', 'villages', 'being', 'burned', 'and', 'Rohingya', 'being', 'driven', 'out']
    #去重并排除语法型词汇后的单词集合(字典的key)
    {'Aung', 'facto', 'condemned', 'Rohingya', 'killed', 'is', 'with', 'More', '000', 'stance', 'latest', 'unrest', 'several', 'month', 'international', 'sparked', 'ethnic', 'group', 'her', 'blame', 'last', 'speech', 'to', 'Rakhine', 'out', 'facing', 'widespread', 'de', 'Leaders', 'violence', 'late', 'Tuesday', "Myanmar's", 'army', 'stations', 'or', 'San', 'but', 'did', 'Suu', 'handling', 'Army', 'an', 'burned', 'blamed', 'allegations', 'address', 'disappointment', 'there', 'In', 'not', 'mounting', 'police', 'are', '(Arsa)', 'Arakan', 'deadly', 'military', 'pressure', 'have', 'crisis', 'being', 'across', 'August', 'leader', 'from', 'for', 'The', 'Kyi', 'than', 'abuses', 'people', 'Scores', 'Bangladesh', 'cleansing', 'fled', 'rights', 'militant', 'emerged', '400', 'driven', 'refugee', 'troubled', 'expressed', 'were', 'crackdown', 'ensuing', 'strong', 'state', 'countries', 'was', 'diplomats', 'Salvation', 'villages', 'newly', 'attacks'}
    #计数
    {'Aung': 1, 'facto': 1, 'condemned': 1, 'Rohingya': 4, 'killed': 1, 'is': 1, 'with': 1, 'More': 1, '000': 1, 'stance': 1, 'latest': 1, 'unrest': 1, 'several': 1, 'month': 1, 'international': 1, 'sparked': 1, 'ethnic': 1, 'group': 1, 'her': 2, 'blame': 1, 'last': 1, 'speech': 1, 'to': 1, 'Rakhine': 2, 'out': 1, 'facing': 1, 'widespread': 1, 'de': 1, 'Leaders': 1, 'violence': 1, 'late': 1, 'Tuesday': 1, "Myanmar's": 1, 'army': 1, 'stations': 1, 'or': 1, 'San': 1, 'but': 1, 'did': 1, 'Suu': 1, 'handling': 1, 'Army': 1, 'an': 1, 'burned': 1, 'blamed': 1, 'allegations': 2, 'address': 1, 'disappointment': 1, 'there': 1, 'In': 1, 'not': 1, 'mounting': 1, 'police': 1, 'are': 1, '(Arsa)': 1, 'Arakan': 1, 'deadly': 1, 'military': 1, 'pressure': 1, 'have': 2, 'crisis': 1, 'being': 2, 'across': 1, 'August': 1, 'leader': 1, 'from': 1, 'for': 1, 'The': 1, 'Kyi': 1, 'than': 1, 'abuses': 1, 'people': 1, 'Scores': 1, 'Bangladesh': 1, 'cleansing': 1, 'fled': 1, 'rights': 1, 'militant': 1, 'emerged': 1, '400': 1, 'driven': 1, 'refugee': 1, 'troubled': 1, 'expressed': 1, 'were': 1, 'crackdown': 1, 'ensuing': 1, 'strong': 1, 'state': 2, 'countries': 1, 'was': 1, 'diplomats': 1, 'Salvation': 1, 'villages': 1, 'newly': 1, 'attacks': 1}
    #排序
    [('Rohingya', 4), ('her', 2), ('Rakhine', 2), ('allegations', 2), ('have', 2), ('being', 2), ('state', 2), ('Aung', 1), ('facto', 1), ('condemned', 1), ('killed', 1), ('is', 1), ('with', 1), ('More', 1), ('000', 1), ('stance', 1), ('latest', 1), ('unrest', 1), ('several', 1), ('month', 1), ('international', 1), ('sparked', 1), ('ethnic', 1), ('group', 1), ('blame', 1), ('last', 1), ('speech', 1), ('to', 1), ('out', 1), ('facing', 1), ('widespread', 1), ('de', 1), ('Leaders', 1), ('violence', 1), ('late', 1), ('Tuesday', 1), ("Myanmar's", 1), ('army', 1), ('stations', 1), ('or', 1), ('San', 1), ('but', 1), ('did', 1), ('Suu', 1), ('handling', 1), ('Army', 1), ('an', 1), ('burned', 1), ('blamed', 1), ('address', 1), ('disappointment', 1), ('there', 1), ('In', 1), ('not', 1), ('mounting', 1), ('police', 1), ('are', 1), ('(Arsa)', 1), ('Arakan', 1), ('deadly', 1), ('military', 1), ('pressure', 1), ('crisis', 1), ('across', 1), ('August', 1), ('leader', 1), ('from', 1), ('for', 1), ('The', 1), ('Kyi', 1), ('than', 1), ('abuses', 1), ('people', 1), ('Scores', 1), ('Bangladesh', 1), ('cleansing', 1), ('fled', 1), ('rights', 1), ('militant', 1), ('emerged', 1), ('400', 1), ('driven', 1), ('refugee', 1), ('troubled', 1), ('expressed', 1), ('were', 1), ('crackdown', 1), ('ensuing', 1), ('strong', 1), ('countries', 1), ('was', 1), ('diplomats', 1), ('Salvation', 1), ('villages', 1), ('newly', 1), ('attacks', 1)]
    #输出前20
    ('Rohingya', 4)
    ('her', 2)
    ('Rakhine', 2)
    ('allegations', 2)
    ('have', 2)
    ('being', 2)
    ('state', 2)
    ('Aung', 1)
    ('facto', 1)
    ('condemned', 1)
    ('killed', 1)
    ('is', 1)
    ('with', 1)
    ('More', 1)
    ('000', 1)
    ('stance', 1)
    ('latest', 1)
    ('unrest', 1)
    ('several', 1)
    ('month', 1)
    

      

  • 相关阅读:
    docker下安装mysql数据库
    asp.net core3.0 mvc 用 autofac
    遍历Map的方式
    JAVA 每次从List中取出100条记录
    JAVA 必须掌握技能(三)-Java 基础知识
    JAVA 必须掌握技能-Java 知识结构图
    JAVA 必须掌握技能(二)-Java IO流学习之输入输出流
    JAVA 必须掌握技能(一)-集合类型那么多,如何选择使用List, Set, Map?
    JavaScript 开发必须掌握技能(四)- 更好的使用jQuery attr方法
    JavaScript 开发必须掌握技能(三)- 更好的使用for循环方法
  • 原文地址:https://www.cnblogs.com/YyYyYy11/p/7595106.html
Copyright © 2011-2022 走看看