1.读入待分析的字符串
2.分解提取单词
3.计数字典
4.排除语法型词汇
5.排序
6.输出TOP(20)
# Word-frequency report: read a text file, split it into lowercase words,
# drop common grammatical words, and print the most frequent remaining words
# as (word, count) tuples, one per line.
from collections import Counter

# 1. Input file to analyse.
SOURCE_PATH = 'test.py'
# 4. Grammatical words excluded from the ranking (compared after lowercasing).
STOP_WORDS = frozenset({'the', 'of', 'and', 'on', 'a', 'in', 'by', 'since'})
# Punctuation characters that are treated as word separators.
SEPARATORS = ',.?-_!'
# 6. How many entries the report prints.
TOP_N = 20


def split_words(text):
    """Return the lowercase words of *text* (step 2).

    Every character in SEPARATORS is replaced by a space before splitting on
    whitespace, so "cat-sat" yields two words.
    """
    # str.lower() returns a new string; the result must be used, not discarded.
    table = str.maketrans(SEPARATORS, ' ' * len(SEPARATORS))
    return text.lower().translate(table).split()


def count_words(text, exclude=STOP_WORDS):
    """Return a Counter mapping each non-excluded word to its frequency (steps 3-4)."""
    return Counter(word for word in split_words(text) if word not in exclude)


def top_words(text, limit=TOP_N, exclude=STOP_WORDS):
    """Return up to *limit* (word, count) pairs, most frequent first (step 5).

    Words with equal counts keep the order in which they first appear in the
    text. Fewer than *limit* pairs are returned when the text does not contain
    that many distinct words.
    """
    return count_words(text, exclude).most_common(limit)


def main():
    """Read SOURCE_PATH and print its TOP_N most frequent words."""
    # The context manager closes the file even if reading fails.
    with open(SOURCE_PATH, 'r') as fo:
        text = fo.read()
    for pair in top_words(text):
        print(pair)


if __name__ == '__main__':
    main()
结果
#单词列表 ["Myanmar's", 'Aung', 'San', 'Suu', 'Kyi', 'is', 'facing', 'mounting', 'international', 'pressure', 'for', 'her', 'handling', 'of', 'violence', 'in', 'Rakhine', 'state', 'and', 'the', 'Rohingya', 'refugee', 'crisis', 'In', 'a', 'speech', 'on', 'Tuesday', 'the', 'de', 'facto', 'leader', 'condemned', 'rights', 'abuses', 'but', 'did', 'not', 'blame', 'the', 'army', 'or', 'address', 'allegations', 'of', 'ethnic', 'cleansing', 'Leaders', 'and', 'diplomats', 'from', 'several', 'countries', 'have', 'since', 'expressed', 'strong', 'disappointment', 'with', 'her', 'stance', 'More', 'than', '400', '000', 'Rohingya', 'have', 'fled', 'to', 'Bangladesh', 'since', 'late', 'August', 'The', 'latest', 'unrest', 'in', 'troubled', 'Rakhine', 'was', 'sparked', 'by', 'deadly', 'attacks', 'on', 'police', 'stations', 'across', 'the', 'state', 'last', 'month', 'blamed', 'on', 'a', 'newly', 'emerged', 'militant', 'group', 'the', 'Arakan', 'Rohingya', 'Salvation', 'Army', '(Arsa)', 'Scores', 'of', 'people', 'were', 'killed', 'in', 'an', 'ensuing', 'military', 'crackdown', 'and', 'there', 'are', 'widespread', 'allegations', 'of', 'villages', 'being', 'burned', 'and', 'Rohingya', 'being', 'driven', 'out'] #提取单词 {'Aung', 'facto', 'condemned', 'Rohingya', 'killed', 'is', 'with', 'More', '000', 'stance', 'latest', 'unrest', 'several', 'month', 'international', 'sparked', 'ethnic', 'group', 'her', 'blame', 'last', 'speech', 'to', 'Rakhine', 'out', 'facing', 'widespread', 'de', 'Leaders', 'violence', 'late', 'Tuesday', "Myanmar's", 'army', 'stations', 'or', 'San', 'but', 'did', 'Suu', 'handling', 'Army', 'an', 'burned', 'blamed', 'allegations', 'address', 'disappointment', 'there', 'In', 'not', 'mounting', 'police', 'are', '(Arsa)', 'Arakan', 'deadly', 'military', 'pressure', 'have', 'crisis', 'being', 'across', 'August', 'leader', 'from', 'for', 'The', 'Kyi', 'than', 'abuses', 'people', 'Scores', 'Bangladesh', 'cleansing', 'fled', 'rights', 'militant', 'emerged', '400', 'driven', 'refugee', 
'troubled', 'expressed', 'were', 'crackdown', 'ensuing', 'strong', 'state', 'countries', 'was', 'diplomats', 'Salvation', 'villages', 'newly', 'attacks'} #计数 {'Aung': 1, 'facto': 1, 'condemned': 1, 'Rohingya': 4, 'killed': 1, 'is': 1, 'with': 1, 'More': 1, '000': 1, 'stance': 1, 'latest': 1, 'unrest': 1, 'several': 1, 'month': 1, 'international': 1, 'sparked': 1, 'ethnic': 1, 'group': 1, 'her': 2, 'blame': 1, 'last': 1, 'speech': 1, 'to': 1, 'Rakhine': 2, 'out': 1, 'facing': 1, 'widespread': 1, 'de': 1, 'Leaders': 1, 'violence': 1, 'late': 1, 'Tuesday': 1, "Myanmar's": 1, 'army': 1, 'stations': 1, 'or': 1, 'San': 1, 'but': 1, 'did': 1, 'Suu': 1, 'handling': 1, 'Army': 1, 'an': 1, 'burned': 1, 'blamed': 1, 'allegations': 2, 'address': 1, 'disappointment': 1, 'there': 1, 'In': 1, 'not': 1, 'mounting': 1, 'police': 1, 'are': 1, '(Arsa)': 1, 'Arakan': 1, 'deadly': 1, 'military': 1, 'pressure': 1, 'have': 2, 'crisis': 1, 'being': 2, 'across': 1, 'August': 1, 'leader': 1, 'from': 1, 'for': 1, 'The': 1, 'Kyi': 1, 'than': 1, 'abuses': 1, 'people': 1, 'Scores': 1, 'Bangladesh': 1, 'cleansing': 1, 'fled': 1, 'rights': 1, 'militant': 1, 'emerged': 1, '400': 1, 'driven': 1, 'refugee': 1, 'troubled': 1, 'expressed': 1, 'were': 1, 'crackdown': 1, 'ensuing': 1, 'strong': 1, 'state': 2, 'countries': 1, 'was': 1, 'diplomats': 1, 'Salvation': 1, 'villages': 1, 'newly': 1, 'attacks': 1} #排序 [('Rohingya', 4), ('her', 2), ('Rakhine', 2), ('allegations', 2), ('have', 2), ('being', 2), ('state', 2), ('Aung', 1), ('facto', 1), ('condemned', 1), ('killed', 1), ('is', 1), ('with', 1), ('More', 1), ('000', 1), ('stance', 1), ('latest', 1), ('unrest', 1), ('several', 1), ('month', 1), ('international', 1), ('sparked', 1), ('ethnic', 1), ('group', 1), ('blame', 1), ('last', 1), ('speech', 1), ('to', 1), ('out', 1), ('facing', 1), ('widespread', 1), ('de', 1), ('Leaders', 1), ('violence', 1), ('late', 1), ('Tuesday', 1), ("Myanmar's", 1), ('army', 1), ('stations', 1), ('or', 1), ('San', 1), 
('but', 1), ('did', 1), ('Suu', 1), ('handling', 1), ('Army', 1), ('an', 1), ('burned', 1), ('blamed', 1), ('address', 1), ('disappointment', 1), ('there', 1), ('In', 1), ('not', 1), ('mounting', 1), ('police', 1), ('are', 1), ('(Arsa)', 1), ('Arakan', 1), ('deadly', 1), ('military', 1), ('pressure', 1), ('crisis', 1), ('across', 1), ('August', 1), ('leader', 1), ('from', 1), ('for', 1), ('The', 1), ('Kyi', 1), ('than', 1), ('abuses', 1), ('people', 1), ('Scores', 1), ('Bangladesh', 1), ('cleansing', 1), ('fled', 1), ('rights', 1), ('militant', 1), ('emerged', 1), ('400', 1), ('driven', 1), ('refugee', 1), ('troubled', 1), ('expressed', 1), ('were', 1), ('crackdown', 1), ('ensuing', 1), ('strong', 1), ('countries', 1), ('was', 1), ('diplomats', 1), ('Salvation', 1), ('villages', 1), ('newly', 1), ('attacks', 1)] #输出前20 ('Rohingya', 4) ('her', 2) ('Rakhine', 2) ('allegations', 2) ('have', 2) ('being', 2) ('state', 2) ('Aung', 1) ('facto', 1) ('condemned', 1) ('killed', 1) ('is', 1) ('with', 1) ('More', 1) ('000', 1) ('stance', 1) ('latest', 1) ('unrest', 1) ('several', 1) ('month', 1)