pandas-08 pd.cut()的功能和作用
pd.cut()的作用,有点类似给成绩设定优良中差,比如:0-59分为差,60-70分为中,71-80分为优秀等等,在pandas中,也提供了这样一个方法来处理这些事儿。直接上代码:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
np.random.seed(666)
score_list = np.random.randint(25, 100, size=20)
print(score_list)
# [27 70 55 87 95 98 55 61 86 76 85 53 39 88 41 71 64 94 38 94]
# 指定多个区间
bins = [0, 59, 70, 80, 100]
score_cut = pd.cut(score_list, bins)
print(type(score_cut)) # <class 'pandas.core.arrays.categorical.Categorical'>
print(score_cut)
'''
[(0, 59], (59, 70], (0, 59], (80, 100], (80, 100], ..., (70, 80], (59, 70], (80, 100], (0, 59], (80, 100]]
Length: 20
Categories (4, interval[int64]): [(0, 59] < (59, 70] < (70, 80] < (80, 100]]
'''
print(pd.value_counts(score_cut)) # 统计每个区间人数
'''
(80, 100] 8
(0, 59] 7
(59, 70] 3
(70, 80] 2
dtype: int64
'''
df = DataFrame()
df['score'] = score_list
df['student'] = [pd.util.testing.rands(3) for i in range(len(score_list))]
print(df)
'''
score student
0 27 1ul
1 70 yuK
2 55 WWK
3 87 EU6
4 95 Vqn
5 98 KAf
6 55 QNT
7 61 HaE
8 86 aBo
9 76 MMa
10 85 Ctc
11 53 5BI
12 39 wBp
13 88 WMB
14 41 q5t
15 71 MjZ
16 64 nTc
17 94 Kyx
18 38 Rlh
19 94 2uV
'''
# 使用cut方法进行分箱
print(pd.cut(df['score'], bins))
'''
0 (0, 59]
1 (59, 70]
2 (0, 59]
3 (80, 100]
4 (80, 100]
5 (80, 100]
6 (0, 59]
7 (59, 70]
8 (80, 100]
9 (70, 80]
10 (80, 100]
11 (0, 59]
12 (0, 59]
13 (80, 100]
14 (0, 59]
15 (70, 80]
16 (59, 70]
17 (80, 100]
18 (0, 59]
19 (80, 100]
Name: score, dtype: category
Categories (4, interval[int64]): [(0, 59] < (59, 70] < (70, 80] < (80, 100]]
'''
df['Categories'] = pd.cut(df['score'], bins)
print(df)
'''
score student Categories
0 27 1ul (0, 59]
1 70 yuK (59, 70]
2 55 WWK (0, 59]
3 87 EU6 (80, 100]
4 95 Vqn (80, 100]
5 98 KAf (80, 100]
6 55 QNT (0, 59]
7 61 HaE (59, 70]
8 86 aBo (80, 100]
9 76 MMa (70, 80]
10 85 Ctc (80, 100]
11 53 5BI (0, 59]
12 39 wBp (0, 59]
13 88 WMB (80, 100]
14 41 q5t (0, 59]
15 71 MjZ (70, 80]
16 64 nTc (59, 70]
17 94 Kyx (80, 100]
18 38 Rlh (0, 59]
19 94 2uV (80, 100]
'''
# 但是这样的方法不是很适合阅读,可以使用cut方法中的label参数
# 为每个区间指定一个label
df['Categories'] = pd.cut(df['score'], bins, labels=['low', 'middle', 'good', 'perfect'])
print(df)
'''
score student Categories
0 27 1ul low
1 70 yuK middle
2 55 WWK low
3 87 EU6 perfect
4 95 Vqn perfect
5 98 KAf perfect
6 55 QNT low
7 61 HaE middle
8 86 aBo perfect
9 76 MMa good
10 85 Ctc perfect
11 53 5BI low
12 39 wBp low
13 88 WMB perfect
14 41 q5t low
15 71 MjZ good
16 64 nTc middle
17 94 Kyx perfect
18 38 Rlh low
19 94 2uV perfect
'''