zoukankan      html  css  js  c++  java
  • Pandas分组级运算和转换

    分组级运算和转换

    假设要添加一列的各索引分组平均值

    第一种方法
    import pandas as pd
    from pandas import Series
    import numpy as np
    
    df = pd.DataFrame([[-2.04708,1.393406,'a','one'],
                  [0.478943,0.092908,'a','two'],
                  [-0.519439,0.281746,'b','one'],
                  [-0.555730,0.769023,'b','two'],
                  [1.965781,1.246435,'a','one'],
                 ], columns=['data1','data2','key1','key2'])
    df
    
            data1	  data2	   key1	key2
    0	-2.047080	1.393406	a	one
    1	0.478943	0.092908	a	two
    2	-0.519439	0.281746	b	one
    3	-0.555730	0.769023	b	two
    4	1.965781	1.246435	a	one
    
    # 先聚合求出平均值
    key1_means = df.groupby('key1').mean().add_prefix('mean_')
    key1_means
    
           mean_data1  mean_data2
    key1		
    a	   0.132548	  0.910916
    b	  -0.537584	  0.525385
    
    # 在通过聚合函数加到DataFrame
    pd.merge(df, key1_means,left_on='key1', right_index=True)
    
           data1	  data2	  key1	key2	mean_data1	mean_data2
    0	-2.047080	1.393406	a	one	     0.132548	0.910916
    1	0.478943	0.092908	a	two	     0.132548	0.910916
    4	1.965781	1.246435	a	one	     0.132548	0.910916
    2	-0.519439	0.281746	b	one	    -0.537584	0.525385
    3	-0.555730	0.769023	b	two	    -0.537584	0.525385
    
    第二种方法 transform,会将一个函数应用到各个分组,有严格条件,要么传入可以广播的标量,要么产生一个相同大小的结果数组
    df_mean = df.groupby('key2').transform(np.mean).add_prefix('mean_')
    df_mean
    
        mean_data1	mean_data2
    0	-0.200246	0.973862
    1	-0.038393	0.430966
    2	-0.200246	0.973862
    3	-0.038393	0.430966
    4	-0.200246	0.973862
    
    pd.concat([df,df_mean],axis=1)
    
          data1	      data2	  key1	key2  data1	     data2
    0	-2.047080	1.393406	a	one	-0.200246	0.973862
    1	0.478943	0.092908	a	two	-0.038393	0.430966
    2	-0.519439	0.281746	b	one	-0.200246	0.973862
    3	-0.555730	0.769023	b	two	-0.038393	0.430966
    4	1.965781	1.246435	a	one	-0.200246	0.973862
    

    apply一般性的'拆分-应用-合并'

    apply会将待处理的对象拆分成多个片段,然后对各片段调用传入的函数,最后尝试将各片段组合到一起

    # 选取指定列具有最大值的行的函数
    def top(df, n=3, column='tip_pct'):
        return df.sort_index(by=column)[-n:]
    
    tips = pd.read_csv('C:/Users/1/Desktop/tips.csv')
    tips['tip_pct'] = tips['tip']/tips['total_bill']
    tips.head()
    
     total_bill	tip	      sex	smoker	day	 time	size	tip_pct
    0	16.99	 1.01	Female	 No	    Sun	Dinner	2	   0.059447
    1	10.34	 1.66	Male	 No	    Sun	Dinner	3	   0.160542
    2	21.01	 3.50	Male	 No	    Sun	Dinner	3	   0.166587
    3	23.68	 3.31	Male	 No	    Sun	Dinner	2	   0.139780
    4	24.59	 3.61	Female	 No	    Sun	Dinner	4	   0.146808
    
    # 选取前三个最大值
    top(tips,n=3)
    
    total_bill	tip	     sex	smoker	day	time	size	tip_pct
    67	3.07	1.00	Female	Yes	    Sat	Dinner	 1	   0.325733
    178	9.60	4.00	Female	Yes	    Sun	Dinner	 2	   0.416667
    172	7.25	5.15	Male	Yes	    Sun	Dinner	 2	   0.710345
    
    # 按是否吸烟分组,选前三个最大的值
    # 过程是top函数在各个片段上调用后,结果由pandas.concat组装到一起
    tips.groupby('smoker').apply(top)
    
                 total_bill	tip	     sex  smoker  day	time	size	 tip_pct
    smoker									
    No	   51	    10.29	2.60	Female	No	  Sun	Dinner	 2	     0.252672
           149	    7.51	2.00	Male	No	  Thur	Lunch	 2	     0.266312
           232	    11.61	3.39	Male	No	  Sat	Dinner	 2	     0.291990
    Yes	   67	    3.07	1.00	Female	Yes	  Sat	Dinner	 1	     0.325733
           178	    9.60	4.00	Female	Yes	  Sun	Dinner	 2	     0.416667
           172	    7.25	5.15	Male	Yes	  Sun	Dinner	 2	     0.710345
    
    # 如果传给apply的函数能够接受其他参数或关键字,则可以将这些一并传入
    # 总花费的钱,按是否吸烟和每周的天数来找出每天其中价格最高的,n代表返回的数据前几个
    tips.groupby(['smoker','day']).apply(top, n=1, column='total_bill')
    
    	          total_bill	tip	     sex	smoker	   day	time	size	tip_pct
    smoker day									
    No	   Fri	94	22.75	    3.25	Female	No	       Fri	Dinner	 2	    0.142857
           Sat	212	48.33	    9.00	Male	No	       Sat	Dinner	 4	    0.186220
           Sun	156	48.17	    5.00	Male	No	       Sun	Dinner	 6	    0.103799
           Thur	142	41.19	    5.00	Male	No	       Thur	Lunch	 5	    0.121389
    Yes	   Fri	95	40.17	    4.73	Male	Yes	       Fri	Dinner	 4	    0.117750
           Sat	170	50.81	   10.00	Male	Yes	       Sat	Dinner	 3	    0.196812
           Sun	182	45.35	    3.50	Male	Yes	       Sun	Dinner	 3	    0.077178
           Thur	197	43.11	    5.00	Female	Yes	       Thur	Lunch	 4	    0.115982
    
    # 分组调用describe的方法
    tips.groupby('smoker')['tip_pct'].describe().T
    
    smoker	   No	       Yes
    count	151.000000	93.000000
    mean	0.159328	0.163196
    std	    0.039910	0.085119
    min	    0.056797	0.035638
    25%	    0.136906	0.106771
    50%	    0.155625	0.153846
    75%	    0.185014	0.195059
    max	    0.291990	0.710345
    
    # 本质是,下面两行代码的快捷键而已
    f = lambda x:x.describe()
    tips.groupby('smoker')['tip_pct'].apply(f).unstack('smoker')
    
    smoker	    No	      Yes
    count	151.000000	93.000000
    mean	0.159328	0.163196
    std	    0.039910	0.085119
    min	    0.056797	0.035638
    25%	    0.136906	0.106771
    50%	    0.155625	0.153846
    75%	    0.185014	0.195059
    max	    0.291990	0.710345
    
    # 禁用层次化索引
    tips.groupby('smoker',group_keys=False).apply(top)
    
        total_bill	tip	     sex	smoker	day	    time	  size	 tip_pct
    51	   10.29	2.60	Female	  No	Sun	   Dinner	   2	0.252672
    149	    7.51	2.00	Male	  No	Thur   Lunch	   2	0.266312
    232	   11.61	3.39	Male	  No	Sat	   Dinner	   2	0.291990
    67	   3.07	    1.00	Female	  Yes	Sat	   Dinner	   1	0.325733
    178	   9.60	    4.00	Female	  Yes	Sun	   Dinner	   2	0.416667
    172	   7.25	    5.15	Male	  Yes	Sun	   Dinner	   2	0.710345
    

    分位数和桶分析

    
    frame = pd.DataFrame({'data1':np.random.randn(1000),
                          'data2':np.random.randn(1000)})
    factor = pd.cut(frame['data1'],4)
    factor[:5]
    
    0    (-1.573, 0.112]
    1    (-1.573, 0.112]
    2    (-1.573, 0.112]
    3    (-1.573, 0.112]
    4    (-1.573, 0.112]
    Name: data1, dtype: category
    Categories (4, interval[float64]): [(-3.264, -1.573] < (-1.573, 0.112] < (0.112, 1.797] < (1.797, 3.482]]
    
    def get_stats(group):
        return {'min':group.min(),'max':group.max(),'count':group.count(),'mean':group.mean()}
    
    # 长度即每个区间相等的桶(区间大小相等)
    frame.data2.groupby(factor).apply(get_stats).unstack()
    
                     count	  max	      mean	        min
    data1				
    (-3.264, -1.573]  57.0	3.236024	0.100749	-2.149984
    (-1.573, 0.112]	  484.0	2.843239	-0.058549	-3.606913
    (0.112, 1.797]	  425.0	2.614935	0.065693	-3.463799
    (1.797, 3.482]	  34.0	1.791511	-0.049641	-1.756306
    
    
    
    # 大小相等的桶,labels关闭区间名称(数据点数量相等)
    ppp = pd.qcut(frame['data1'],4,labels=False)
    frame.data2.groupby(ppp).apply(get_stats).unstack()
    
        count	   max	      mean	       min
    data1				
    0	250.0	3.236024	-0.032592	-2.750112
    1	250.0	2.843239	-0.068005	-3.606913
    2	250.0	2.614935	0.103220	-2.380858
    3	250.0	2.612170	0.011922	-3.463799
    
  • 相关阅读:
    HTML中超出的内容显示为省略号
    JS-复习整理
    js-实现鼠标滑轮滚动实现换页
    CSS 控制滚动条样式
    面向对象、继承、抽象方法重载知识点整理
    面向对象知识点
    复习HTML、CSS、JS练习题
    数据库---创建函数,存储函数,触发器实例
    ajax编写购物车遇到的问题
    ajax应用
  • 原文地址:https://www.cnblogs.com/lishi-jie/p/10103648.html
Copyright © 2011-2022 走看看