zoukankan      html  css  js  c++  java
  • Pandas聚合

    数据聚合

    import pandas as pd
    from pandas import Series
    import numpy as np
    
    # 准备数据
    
    df = pd.DataFrame([[-0.204708,1.393406,'a','one'],
                       [0.478943,0.092908,'a','two'],
                       [-0.519439,0.281746,'b','one'],
                       [-0.555730,0.769023,'b','two'],
                       [1.965781,1.246435,'a','one']],columns=['data1','data2','key1','key2'])
    df
    
        data1	      data2	   key1	key2
    0	-0.204708	1.393406	a	one
    1	0.478943	0.092908	a	two
    2	-0.519439	0.281746	b	one
    3	-0.555730	0.769023	b	two
    4	1.965781	1.246435	a	one
    
    # 先生成分组对象
    grouped = df.groupby('key1')
    # 执行具体的运算
    grouped.quantile(0.9)
    
    0.9	  data1	      data2
    key1		
    a	 1.668413	 1.364012
    b	-0.523068	 0.720295
    
    # 自定义的方法(agg接收自定义函数)这种方式速度要慢的多,不太推荐。
    def peak_to_peak(arr):
        return arr.max() - arr.min()
    grouped.agg(peak_to_peak)
    
           data1	  data2
    key1		
    a	2.170489	1.300498
    b	0.036291	0.487277
    
    # 聚合运算也可以用describe()
    grouped.describe().T
    
             key1	    a	       b
    data1	count	3.000000	 2.000000
             mean	0.746672	-0.537584
              std	1.109736	 0.025662
              min	-0.204708	-0.555730
              25%	0.137118	-0.546657
              50%	0.478943	-0.537584
              75%	1.222362	-0.528512
              max	1.965781	-0.519439
    data2	count	3.000000	 2.000000
             mean	0.910916	 0.525385
              std	0.712217	 0.344557
              min	0.092908	 0.281746
              25%	0.669671	 0.403565
              50%	1.246435	 0.525385
              75%	1.319920	 0.647204
              max	1.393406	 0.769023
    
    
    经过优化的groupby方法
    函数名 说明
    count 分组中非NA的数量
    sum 非NA的和
    mean 非NA值的平均值
    median 非NA的算术中位数
    std、var 无偏(分母n-1)标准差和方差
    min、max 非NA值的最小值和最大值
    prod 非NA值的积
    first、last 第一个和最后一个非NA值
    餐馆小费的数据
    tips = pd.read_csv('C:/Users/1/Desktop/tips.csv')
    tips[:6]
    
      total_bill tip	 sex smoker day   time	      size
    0	16.99	1.01	Female	No	Sun	  Dinner	    2
    1	10.34	1.66	Male	No	Sun	  Dinner	    3
    2	21.01	3.50	Male	No	Sun	  Dinner	    3
    3	23.68	3.31	Male	No	Sun	  Dinner	    2
    4	24.59	3.61	Female	No	Sun	  Dinner	    4
    5	25.29	4.71	Male	No	Sun	  Dinner	    4
    
    # 小费占总额百分比的列
    tips['tip_pct'] = tips['tip']/tips['total_bill']
    tips[:6]
    
      total_bill	tip	sex	  smoker	day	time	size	tip_pct
    0	16.99	   1.01	Female	No	    Sun	Dinner	 2	   0.059447
    1	10.34	   1.66	Male	No	    Sun	Dinner	 3	   0.160542
    2	21.01	   3.50	Male	No	    Sun	Dinner	 3	   0.166587
    3	23.68	   3.31	Male	No	    Sun	Dinner	 2	   0.139780
    4	24.59	   3.61	Female	No	    Sun	Dinner	 4	   0.146808
    5	25.29	   4.71	Male	No	    Sun	Dinner	 4	   0.186240
    
    # 根据sex和smoker对tips进行分组
    grouped1 = tips.groupby(['sex','smoker'])
    # 直接将讲过优化的groupby方法传入
    grouped1['tip_pct'].agg('mean')
    
    sex     smoker
    Female  No        0.156921
            Yes       0.182150
    Male    No        0.160669
            Yes       0.152771
    Name: tip_pct, dtype: float64
    
    # 如果传入一组函数或函数名,对应的列名就是函数名
    grouped1['tip_pct'].agg(['mean','std',peak_to_peak])
    
                      mean	       std	  peak_to_peak
    sex	    smoker			
    Female	  No	0.156921	0.036421	0.195876
             Yes	0.182150	0.071595	0.360233
    Male	  No	0.160669	0.041849	0.220186
             Yes	0.152771	0.090588	0.674707
    
    # 传入的是一个由(name,function)元组组成的列表,则元组的第一个元素会被当做列名
    grouped1['tip_pct'].agg([('foo','mean'),('bar',np.std)])
    
    
    	              foo	     bar
    sex	 smoker		
    Female	No  	0.156921	0.036421
            Yes	    0.182150	0.071595
    Male	No 	    0.160669	0.041849
            Yes	    0.152771	0.090588
    
    grouped['tip_pct','total_bill'].agg(['count','mean','max'])
    
                   tip_pct	                          total_bill
                    count	mean	     max	       count	mean	 max
    sex	    smoker						
    Female	  No	  54	0.156921	0.252672	   54	18.105185	35.83
              Yes	  33	0.182150	0.416667	   33	17.977879	44.30
    Male	  No	  97	0.160669	0.291990	   97	19.791237	48.33
              Yes	  60	0.152771	0.710345	   60	22.284500	50.81
    
    # 这里也可以传入带有自定义名称的元组列表,
    ftuples = [('new1','mean'),('new2',np.var)]
    grouped['tip_pct','total_bill'].agg(ftuples)
    
                         tip_pct	total_bill
                    new1	  new2	      new1	     new2
    sex	  smoker				
    Female	No	   0.156921	  0.001327	18.105185	53.092422
            Yes	   0.182150	  0.005126	17.977879	84.451517
    Male	No	   0.160669	  0.001751	19.791237	76.152961
            Yes	   0.152771   0.008206	22.284500	98.244673
    
    #传入字典
    grouped.agg({'tip':np.max,'size':'sum'})
    
    	        tip	    size
    sex	 smoker		
    Female	No	5.2	    140
            Yes	6.5	    74
    Male	No	9.0	    263
            Yes	10.0	150
    
    # 只有将多个函数应用到至少一列时,DataFrame才会拥有层次化的列
    grouped.agg({'tip_pct':['min','max','mean','std'],'size':'sum'})
    
    	                        tip_pct	                         size
                    min	         max	    mean	     std	  sum
    sex	smoker					
    Female	No	  0.056797	  0.252672	  0.156921	  0.036421	  140
            Yes	  0.056433	  0.416667	  0.182150	  0.071595	  74
    Male	No	  0.071804	  0.291990	  0.160669	  0.041849	  263
            Yes	  0.035638	  0.710345	  0.152771	  0.090588	  150
    
    # 以''无索引''的形式返回聚合数据
    # 不需要的时候可以关闭所有的索引,包括层次化索引
    tips.groupby(['sex','smoker'],as_index=False).mean()
    
         sex	smoker	total_bill	  tip	       size	     tip_pct
    0	Female	No	    18.105185	2.773519	2.592593	0.156921
    1	Female	Yes	    17.977879	2.931515	2.242424	0.182150
    2	Male	No	    19.791237	3.113402	2.711340	0.160669
    3	Male	Yes	    22.284500	3.051167	2.500000	0.152771
    
  • 相关阅读:
    vivado操作基本问题
    IIC通信控制的AD5259------在调试过程中遇到的奇葩问题
    FPGA基础架构总结
    PLL到底是个啥么东西呢?
    CSS-3 Transform 的使用
    CSS-3 box-shadow 的使用
    一些CSS3的乐趣
    CSS-3 文字阴影—text-shadow 的使用
    Jquery 较好的效果
    如何关闭输入法
  • 原文地址:https://www.cnblogs.com/lishi-jie/p/10097904.html
Copyright © 2011-2022 走看看