通过聚合运算,可以得到我们比较感兴趣的数据,方便后续处理。
import pandas as pd
import numpy as np

# Build a sample DataFrame: two label columns and two random integer columns.
df = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randint(1, 10, 5),
    'data2': np.random.randint(1, 10, 5),
})
# Both key1 and key2 contain repeated labels, so rows can be grouped on them.

df.groupby('key1').describe()
# Shows summary statistics for each key1 group:
# count mean std min 25% 50% 75% max

grouped = df.groupby('key1')
# NOTE: this is a GroupBy object, not a DataFrame.


def peak_range(s):
    """Return the spread (max minus min) of the values in *s*."""
    # Printed on purpose: shows what kind of object agg passes to the function.
    print(type(s))
    return s.max() - s.min()


# Aggregation in practice.
# key2 holds strings, so 'std' and 'mean' cannot be computed for it;
# aggregate only the numeric data columns.
value_columns = ['data1', 'data2']
grouped[value_columns].agg(['std', 'mean', 'sum', ('range', peak_range)])
# This gathers several of the statistics that describe() displays.
# In ('range', peak_range), 'range' is the column name given to the
# result of peak_range.


# Use a dict to aggregate only the parts of the data we care about.
d = {
    'data1': 'mean',
    'data2': 'sum',
}
grouped.agg(d)

d = {
    'data1': ['mean', ('range', peak_range)],
    'data2': 'sum',
}
grouped.agg(d)
grouped.agg(d).reset_index()  # do not keep key1 as the index
df.groupby('key1', as_index=False).agg(d)  # same effect as the previous line
import pandas as pd
import numpy as np

# Build a sample DataFrame.
df = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randint(1, 10, 5),
    'data2': np.random.randint(1, 10, 5),
})
# key2 holds strings, so the mean is taken over the numeric columns only.
value_columns = ['data1', 'data2']

# Approach 1: aggregate, then merge back.
k1_mean = df.groupby('key1')[value_columns].mean().add_prefix('mean_')
# Group by key1, take the mean, and prefix data1/data2 with 'mean_'.
pd.merge(df, k1_mean, left_on='key1', right_index=True)  # combine via merge

# Approach 2: transform.
k1_mean = df.groupby('key1')[value_columns].transform('mean').add_prefix('mean_')
# transform computes the per-group mean but keeps the original index
# positions and row count, so the result can be assigned straight back:
df[k1_mean.columns] = k1_mean
import pandas as pd
import numpy as np

# A 5x5 frame of random integers with named rows and columns.
df = pd.DataFrame(
    np.random.randint(1, 10, (5, 5)),
    columns=['a', 'b', 'c', 'd', 'e'],
    index=['Alice', 'Bob', 'Candy', 'Dark', 'Emily'],
)


def demean(s):
    """Return *s* with its own mean subtracted from every value."""
    return s - s.mean()


# One group label per row of df.
key = ['one', 'one', 'two', 'one', 'two']
demeaned = df.groupby(key).transform(demean)

demeaned.groupby(key).mean()  # every value is 0 or very close to 0
import pandas as pd
import numpy as np

df = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a', 'a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one', 'one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randint(1, 10, 10),
    'data2': np.random.randint(1, 10, 10),
})


def top(g, n=2, column='data1'):
    """Return the first *n* rows of *g* after sorting by *column*, descending."""
    return g.sort_values(by=column, ascending=False).head(n)


df.groupby('key1').apply(top, n=3, column='data2')
# Within each key1 group, sort by data2 and keep the top three rows.
import pandas as pd
import numpy as np

# The example below shows one way to fill NaN values: per-group means.

states = ['Ohio', 'New York', 'Vermont', 'Florida',
          'Oregon', 'Nevada', 'California', 'Idaho']
group_key = ['East'] * 4 + ['West'] * 4
data = pd.Series(np.random.randn(8), index=states)
data[['Vermont', 'Nevada', 'Idaho']] = np.nan
# Example output (values are random and differ on every run):
# Ohio          0.133410
# New York      2.147483
# Vermont            NaN
# Florida      -0.608754
# Oregon        0.978375
# Nevada             NaN
# California   -1.297183
# Idaho              NaN
# dtype: float64

group_means = data.groupby(group_key).mean()
# Example output:
# East    0.557380
# West   -0.159404
# dtype: float64


def fill_with_group_mean(g):
    """Return *g* with its missing values replaced by the mean of *g*."""
    return g.fillna(g.mean())


# group_keys=False keeps the original state index on the result instead of
# prepending the group label as an extra index level.
filled = data.groupby(group_key, group_keys=False).apply(fill_with_group_mean)
# Example output:
# Ohio          0.133410
# New York      2.147483
# Vermont       0.557380
# Florida      -0.608754
# Oregon        0.978375
# Nevada       -0.159404
# California   -1.297183
# Idaho        -0.159404
# dtype: float64