zoukankan html css js c++ java

Python中文词频统计

以下是关于小说的中文词频统计

这里有三个文件，分别为novel.txt、punctuation.txt、meaningless.txt。
这三个是小说文本、特殊符号和无意义词

Python代码统计词频如下：

import jieba # jieba中文分词库
# 从文件读入小说
with open('novel.txt', 'r', encoding='UTF-8') as novelFile:
    novel = novelFile.read()

# 将小说中的特殊符号过滤
with open('punctuation.txt', 'r', encoding='UTF-8') as punctuationFile:
    for punctuation in punctuationFile.readlines():
        novel = novel.replace(punctuation[0], ' ')

# 添加特定词到词库
jieba.add_word('凤十')
jieba.add_word('林胖子')
jieba.add_word('黑道')
jieba.add_word('饿狼帮')
# 从文件独处无意义词
with open('meaningless.txt', 'r', encoding='UTF-8') as meaninglessFile:
    mLessSet = set(meaninglessFile.read().split('
'))
mLessSet.add(' ')

novelList = list(jieba.cut(novel))
novelSet = set(novelList) - mLessSet # 将无意义词从词语集合中删除
novelDict = {}
# 统计出词频字典
for word in novelSet:
    novelDict[word] = novelList.count(word)

# 对词频进行排序
novelListSorted = list(novelDict.items())
novelListSorted.sort(key=lambda e: e[1], reverse=True)

# 打印前20词频
topWordNum = 0
for topWordTup in novelListSorted:
    if topWordNum == 20:
        break
    print(topWordTup)
    topWordNum += 1
    
# 打印记录： 
# ('杨易', 906)
# ('说道', 392)
# ('一个', 349)
# ('林胖子', 338)
# ('知道', 295)
# ('和', 218)
# ('心里', 217)
# ('已经', 217)
# ('没有', 217)
# ('这个', 206)
# ('有点', 198)
# ('道', 195)
# ('徐明', 194)
# ('就是', 192)
# ('看', 191)
# ('走', 185)
# ('有', 178)
# ('上', 176)
# ('好', 176)
# ('来', 170)