"""Count word frequencies in an English text and check them against Zipf's law.

The script reads the text, counts words of 3 to 10 letters, prints the most
frequent ones, and plots frequency against rank on log-log axes (where Zipf's
law predicts a roughly straight line) and again as a bar chart.
"""

import re
from collections import Counter
from operator import itemgetter

# Input text (English).
TEXT_PATH = "Alice's adventures in wonderland.txt"

# How many of the most frequent words are printed and plotted.
TOP_N = 100

# A word is one letter of either case followed by 2-9 lowercase letters.
# The lookarounds require the match to be a whole run of letters, so a longer
# word is skipped instead of being chopped into pieces ("extraordinary" would
# otherwise be counted as "extraordin" and "ary").
WORD_PATTERN = re.compile(r"(?<![A-Za-z])[A-Za-z][a-z]{2,9}(?![A-Za-z])")


def extract_words(text: str) -> list:
    """Return every 3-10 letter word found in *text*, in order of appearance.

    Matching is case-sensitive, so "The" and "the" are distinct words.
    """
    return WORD_PATTERN.findall(text)


def count_words(text: str) -> Counter:
    """Return a Counter mapping each word in *text* to its number of occurrences."""
    return Counter(extract_words(text))


def top_words(frequency, limit: int = TOP_N) -> list:
    """Return up to *limit* ``(word, count)`` pairs, most frequent first.

    The sort is stable, so words with equal counts keep the order in which
    they were first inserted into *frequency*.
    """
    return sorted(frequency.items(), key=itemgetter(1), reverse=True)[:limit]


def zipf_points(frequency, limit: int = TOP_N) -> tuple:
    """Return ``(ranks, counts)`` for the *limit* most frequent words.

    Ranks start at 1 because a logarithmic axis has no position for 0, and
    both lists always have the same length, even when *frequency* holds fewer
    than *limit* distinct words.
    """
    counts = [count for _, count in top_words(frequency, limit)]
    ranks = list(range(1, len(counts) + 1))
    return ranks, counts


def plot_zipf(ranks, counts) -> None:
    """Show the rank/frequency data as a log-log line plot, then as a bar chart."""
    # Imported here so the counting helpers can be used without matplotlib.
    import matplotlib.pyplot as plt

    # Check Zipf's law with matplotlib: on log-log axes the curve should be
    # close to a straight line.
    plt.title("Zipf-Law")
    plt.xlabel("rank")
    plt.ylabel("freq")
    plt.loglog(ranks, counts)
    plt.show()

    # Bar chart of the same data on linear axes.
    plt.bar(ranks, counts)
    plt.show()


def main() -> None:
    """Read the text, print the top words with their counts, and draw the plots."""
    # Only ASCII letters are counted, so undecodable bytes are replaced rather
    # than allowed to abort the run; the explicit encoding keeps the result
    # independent of the platform's default locale.
    with open(TEXT_PATH, encoding="utf-8", errors="replace") as f:
        file_to_string = f.read()

    frequency = count_words(file_to_string)

    # Print the top 100 words.
    for key, value in top_words(frequency):
        print(key, value)

    ranks, counts = zipf_points(frequency)
    plot_zipf(ranks, counts)


if __name__ == "__main__":
    main()