# -*- coding: utf-8 -*-
"""
Created on Tue Feb 21 22:07:13 2017
@author: zzpp220
"""
from pandas import DataFrame,Series
from numpy import nan as NaN
import pandas as pd
import numpy as np
df=DataFrame({'k1':list('aabba'),'k2':['one','two','one','two','one'],'data1':np.random.randn(5),'data2':np.random.randn(5)})
#分组键为Series对象
grouped=df.groupby('k1')
desc_group=grouped.describe()
grouped_1=df['data1'].groupby([df['k1'],df['k2']]).sum()##group 是一个series对象
## 有多重索引的Series和DataFrame中的相互转变
##分组键为groupby前面对象中的列名
df.groupby(['k1','k2']).sum().unstack()
df.groupby(['k1','k2']).sum().unstack().stack([0,1])
##粉组件是任何长度适当的数组
states=np.array(['sd','gd','sd','fj','fj'])
year=np.array([2005,2006,2005,2005,2006])
df.groupby([states,year]).mean()
##GroupBy 对象的数据聚合 按照不同的的聚合函数聚合
tips=pd.read_csv('/media/zzpp220/Data/Linux_Documents/DOWNLOAD/python-DataAnalysis/pydata-book-master/ch08/tips.csv')
tips['tip_pct']=tips['tip']/tips['total_bill']
gro_tips=tips.groupby(['sex','smoker'])
#对聚合后的1列应用1个聚合函数:
gro_tips['tip_pct'].mean()
gro_tips['tip_pct'].agg('mean')##同上,只不过是把函数名用字符串的形似传入
#对聚合后的1列应用多个聚合函数:
gro_tips['tip_pct'].agg(['mean','std','var'])##传入一组函数名,得到DataFrame的列就会以相应的函数命名
gro_tips['tip_pct'].agg([('avg','mean'),('vag','std'),('gav','var')])#按照(name.funciton)传入二元元祖列表,返回的表的各列名是元祖第一个元素
'''
gro_tips['tip_pct'].mean()
Out[94]:
sex smoker
Female No 0.156921
Yes 0.182150
Male No 0.160669
Yes 0.152771
Name: tip_pct, dtype: float64
gro_tips['tip_pct'].agg(['mean','std','var'])
Out[98]:
mean std var
sex smoker
Female No 0.156921 0.036421 0.001327
Yes 0.182150 0.071595 0.005126
Male No 0.160669 0.041849 0.001751
Yes 0.152771 0.090588 0.008206
gro_tips['tip_pct'].agg([('avg','mean'),('vag','std'),('gav','var')])
Out[99]:
avg vag gav
sex smoker
Female No 0.156921 0.036421 0.001327
Yes 0.182150 0.071595 0.005126
Male No 0.160669 0.041849 0.001751
Yes 0.152771 0.090588 0.008206
'''
##设计对表中的多个列应用同样的多个函数
## 对'tip_pct','total_bill' 分别计算functions中的三个统计信息
functions=['count','mean','max']
asix=[('name1','count'),('name2','mean'),('name3','max')]
result=gro_tips['tip_pct','total_bill'].agg(functions)
asix_result=gro_tips['tip_pct','total_bill'].agg(asix)
'''
result
Out[102]:
tip_pct total_bill
count mean max count mean max
sex smoker
Female No 54 0.156921 0.252672 54 18.105185 35.83
Yes 33 0.182150 0.416667 33 17.977879 44.30
Male No 97 0.160669 0.291990 97 19.791237 48.33
Yes 60 0.152771 0.710345 60 22.284500 50.81
asix_result
Out[105]:
tip_pct total_bill
name1 name2 name3 name1 name2 name3
sex smoker
Female No 54 0.156921 0.252672 54 18.105185 35.83
Yes 33 0.182150 0.416667 33 17.977879 44.30
Male No 97 0.160669 0.291990 97 19.791237 48.33
Yes 60 0.152771 0.710345 60 22.284500 50.81
如上表,result拥有层次化的列,这相当于对各列应用不同的聚合函数后,再分别进行聚合,然后用concat进行聚合。
'''
##`对多个不同的列应用不同的函数,传入列名和带应用函数名的字典
map={'tip_pct':['min','max','mean','std'],'size':'sum',}
diff_to_diff=gro_tips.agg(map)
''' diff_to_diff
Out[109]:
tip_pct size
min max mean std sum
sex smoker
Female No 0.056797 0.252672 0.156921 0.036421 140
Yes 0.056433 0.416667 0.182150 0.071595 74
Male No 0.071804 0.291990 0.160669 0.041849 263
Yes 0.035638 0.710345 0.152771 0.090588 150
'''
#-------------apply 函数,返回一个pandas对象或者标量值-----------------------------------
'''定义函数得到表df按照列名为column的升序排序后的结果,选出最大5个tip_pct所在的行‘'''
def top(df,n=5,column='tip_pct'):#按照
return df.sort_index(by=column)[-n:]
top_col=top(tips,n=6)
tips.groupby('smoker').apply(top,n=3)##返回一个层次化索引的dataframe
tips.groupby('smoker')['tip_pct'].describe().unstack(0)
test2=DataFrame(np.arange(24).reshape(6,4),index=['row_1', 'row_2', 'row_3', 'row_4', 'row_5', 'row_6'],columns=['col_1','col_2','col_3', 'col_4'])
test2['smoker']=Series(['no','yes','no','yes','yes','yes'],index=['row_1', 'row_2', 'row_3', 'row_4', 'row_5', 'row_6'])
test2=test2.rename_axis({'col_1':'tip_pct'},axis=1)
test2['tip_pct']=[2,5,4,1,2,6]#
test2['col_2']=[7,8,5,-1,4,6]
test2.groupby('smoker').apply(top,n=3)##在groupby对象上调用apply,返回一个层次化索引的dataframe
test2.groupby('smoker').apply(top,n=3).unstack()
##如果不想要层次化的索引,可以把分组键禁止掉
test2.groupby('smoker',group_keys=False).apply(top,n=3)
'''
将tips按找smoker分组,对于每一个组内都应用top 函数,应用完成后,将各个组的应用结果concat起来
返回的结果就有了一个层次化的索引,内层的索引就是tip中原来的 索引值
total_bill tip sex smoker day time size tip_pct
smoker
No 51 10.29 2.60 Female No Sun Dinner 2 0.252672
149 7.51 2.00 Male No Thur Lunch 2 0.266312
232 11.61 3.39 Male No Sat Dinner 2 0.291990
-------------------------------------------------------------------------------concat---------------
Yes 67 3.07 1.00 Female Yes Sat Dinner 1 0.325733
178 9.60 4.00 Female Yes Sun Dinner 2 0.416667
172 7.25 5.15 Male Yes Sun Dinner 2 0.710345
test2.groupby('smoker').apply(top,n=3):
tip_pct col_2 col_3 col_4 smoker
smoker
no row_1 2 7 2 3 no
row_3 4 5 10 11 no
-----------------------------------------------------------concat-------------------------
yes row_5 2 4 18 19 yes
row_2 5 8 6 7 yes
row_6 6 6 22 23 yes
test2.groupby('smoker',group_keys=False).apply(top,n=3):
tip_pct col_2 col_3 col_4 smoker
row_1 2 7 2 3 no
row_3 4 5 10 11 no
row_5 2 4 18 19 yes
row_2 5 8 6 7 yes
row_6 6 6 22 23 yes
用特定于分组的值填充缺失值: '''
s=Series(np.random.rand(6))
s[::2]=np.nan#步长为2
'''
s:
0 NaN
1 0.654905
2 NaN
3 0.475528
4 NaN
5 0.269656
dtype: float64
s.fillna(s.mean())
Out[208]:
0 0.466696
1 0.654905
2 0.466696
3 0.475528
4 0.466696
5 0.269656
dtype: float64
'''
s.fillna(s.mean())