需求:
1、在某随机序列(例如[12,5,6,4,6,8,9,5])中,如何找到出现次数最高的三个元素?它们出现的次数分别是多少?
2、对某篇英文文章进行词频统计,找出出现次数最高的十个单词,它们出现的次数是多少?
思路:
1、将序列转换成字典{'元素':'频度'},根据其中的值进行排序
2、使用collections中的Counter对象
代码:
# 方法一:
In [16]: from random import randint
In [17]: data = [ randint(0,20) for _ in range(30)]
In [18]: data
Out[18]:
[10,
17,
1,
17,
6,
17,
12,
19,
13,
5,
18,
19,
14,
8,
12,
19,
9,
9,
11,
1,
0,
4,
6,
0,
4,
9,
17,
2,
8,
4]
In [19]: d = dict.fromkeys(data,0)
In [20]: d
Out[20]:
{10: 0,
17: 0,
1: 0,
6: 0,
12: 0,
19: 0,
13: 0,
5: 0,
18: 0,
14: 0,
8: 0,
9: 0,
11: 0,
0: 0,
4: 0,
2: 0}
In [21]: for x in data:
    ...:     d[x] += 1
    ...:
In [22]: d
Out[22]:
{10: 1,
17: 4,
1: 2,
6: 2,
12: 2,
19: 3,
13: 1,
5: 1,
18: 1,
14: 1,
8: 2,
9: 3,
11: 1,
0: 2,
4: 3,
2: 1}
In [23]: sorted([(v,k) for k,v in d.items()],reverse=True)
Out[23]:
[(4, 17),
(3, 19),
(3, 9),
(3, 4),
(2, 12),
(2, 8),
(2, 6),
(2, 1),
(2, 0),
(1, 18),
(1, 14),
(1, 13),
(1, 11),
(1, 10),
(1, 5),
(1, 2)]
In [24]: sorted([(v,k) for k,v in d.items()],reverse=True)[:3]
Out[24]: [(4, 17), (3, 19), (3, 9)]
In [25]: sorted(((v,k) for k,v in d.items()),reverse=True)[:3] # 使用生成器表达式代替列表解析,无需先构建中间列表
Out[25]: [(4, 17), (3, 19), (3, 9)]
In [26]: import heapq
In [27]: heapq.nlargest(3,((v,k) for k,v in d.items()))
Out[27]: [(4, 17), (3, 19), (3, 9)]
# 方法二:
In [28]: from collections import Counter
In [29]: data
Out[29]:
[10,
17,
1,
17,
6,
17,
12,
19,
13,
5,
18,
19,
14,
8,
12,
19,
9,
9,
11,
1,
0,
4,
6,
0,
4,
9,
17,
2,
8,
4]
In [30]: Counter(data)
Out[30]:
Counter({10: 1,
17: 4,
1: 2,
6: 2,
12: 2,
19: 3,
13: 1,
5: 1,
18: 1,
14: 1,
8: 2,
9: 3,
11: 1,
0: 2,
4: 3,
2: 1})
In [31]: c = Counter(data)
In [32]: c.most_common(3)
Out[32]: [(17, 4), (19, 3), (9, 3)]
# 统计词数:
>>> txt = open('./example.txt').read()
>>> import re
>>> word_list = re.split(r'\W+',txt) # 以一个或多个连续的非单词字符作为分隔符进行切割
>>> from collections import Counter
>>> c2 = Counter(word_list)
>>> c2.most_common(10)
[('the', 18),
('to', 17),
('a', 16),
('it', 12),
('of', 12),
('and', 11),
('is', 11),
('that', 8),
('we', 8),
('enough', 7)]
>>>