python统计文档中词频的小程序
python版本2.7
效果如下:
程序如下，测试文件与完整程序在我的 GitHub 中。
# Word-frequency counter for a text file.
#
# Written for Python 2.7; the single-argument print(...) calls used below
# produce the same output on Python 3 as well.
import collections
import re


# Punctuation removed before counting words: double quote, comma, period.
# A character class is used so that "." is a literal dot; an unescaped "."
# in an alternation would match (and delete) every character of the text.
_PUNCTUATION = re.compile(r'[",.]')


def count_space(path):
    """Return the number of space characters in the file at *path*.

    Each line is stripped first, so leading/trailing spaces (and the line
    terminator) are not counted; only spaces between the first and last
    non-whitespace characters of a line contribute.

    Raises IOError if the file cannot be opened.
    """
    space_counts = 0
    with open(path, 'r') as f:
        for line in f:
            # Same value as len(line.split(' ')) - 1 on the stripped line.
            space_counts += line.strip().count(' ')
    return space_counts


def count_word(path):
    """Return a dict mapping each word in the file at *path* to its count.

    The text is lower-cased, the characters '"', ',' and '.' are removed,
    and the remainder is split on runs of whitespace.

    Raises IOError if the file cannot be opened.
    """
    with open(path) as fileread:
        alltext = fileread.read()

    alltext = _PUNCTUATION.sub("", alltext.lower())

    result = {}
    for word in alltext.split():
        result[word] = result.get(word, 0) + 1
    return result


def sort_by_count(d):
    """Return an OrderedDict of *d*'s items ordered by descending count.

    The sort is stable, so items with equal counts keep the relative order
    in which ``d.items()`` yields them.
    """
    return collections.OrderedDict(sorted(d.items(), key=lambda t: -t[1]))


def main(filename='read.txt'):
    """Print the space count and the word frequencies of *filename*."""
    try:
        # Only the two file reads can raise IOError; keep the try narrow.
        dword = sort_by_count(count_word(filename))
        countspace = count_space(filename)
    except IOError:
        print('cannot open file %s for read' % filename)
        return

    print("space_counts %s" % countspace)
    for key, value in dword.items():
        print(key + ":%d" % value)


if __name__ == '__main__':
    main()