zoukankan      html  css  js  c++  java
  • Pandas分组

    GroupBy技术

    分组运算的过程可以用下面的流程图表示出来

    gai

    import pandas as pd
    from pandas import Series
    import numpy as np
    
    df = pd.DataFrame({
        'key1':['a','a','b','b','a'],
        'key2':['one','two','one','two','one'],
        'data1':np.random.randn(5),
        'data2':np.random.randn(5)
    })
    df
    
       key1	key2	data1	    data2
    0	a	one	  -0.687740	   1.536286
    1	a	two	   0.177156	  -1.132658
    2	b	one	   2.024931	   0.755856
    3	b	two	  -0.304281	  -0.629296
    4	a	one	   0.202405	   0.911700
    
    # 访问key1调用groupby
    grouped = df['data1'].groupby(df['key1'])
    # grouped是一个GroupBy对象,它实际上还没有进行任何计算。但是它已经包含了执行分组运算的一切信息
    grouped
    
    <pandas.core.groupby.groupby.SeriesGroupBy object at 0x0000020473C51978>
    
    # 例如,算平均值
    grouped.mean()
    
          key1
    a   -0.102726
    b    0.860325
    Name: data1, dtype: float64
    
    means = df['data1'].groupby([df['key1'],df['key2']]).mean()
    means
    
          key1      key2
    a     one    -0.242668
          two     0.177156
    b     one     2.024931
          two    -0.304281
    Name: data1, dtype: float64
    
    # 默认操作的是最内层的
    means.unstack()
    
    key2	one	      two
    key1		
    a	 -0.242668	 0.177156
    b	  2.024931	-0.304281
    
    # 自定义分组键
    states = np.array(['Ohio','California','California','Ohio','Ohio'])
    years = np.array([2005,2005,2006,2005,2006])
    df['data1'].groupby([states,years]).mean()
    
    California  2005    0.177156
                2006    2.024931
    Ohio        2005   -0.496010
                2006    0.202405
    Name: data1, dtype: float64
    
    # 将列名用作分组键
    
    df.groupby('key1').mean()
    
             data1	   data2
    key1		
    a	  -0.102726	  0.438443
    b	   0.860325	  0.063280
    
    df.groupby(['key1','key2']).mean()
    
                      data1	          data2
    key1	key2		
    a	    one	    -0.242668	     1.223993
            two	     0.177156	    -1.132658
    b	    one	     2.024931	     0.755856
            two	    -0.304281	    -0.629296
    
    # groupby.size方法,返回一个含有分组大小的Series,统计分组的数量
    df.groupby(['key1','key2']).size()
    
          key1  key2
    a     one     2
          two     1
    b     one     1
          two     1
    dtype: int64
    
    df.groupby('key1').size()
        key1
    a    3
    b    2
    dtype: int64
    

    对分组进行迭代

    for name,group in df.groupby('key1'):
        print(name)
        print(group)
    
    a
      key1 key2     data1     data2
    0    a  one -0.687740  1.536286
    1    a  two  0.177156 -1.132658
    4    a  one  0.202405  0.911700
    b
      key1 key2     data1     data2
    2    b  one  2.024931  0.755856
    3    b  two -0.304281 -0.629296
    
    # 对于多重键的情况
    for k1,group in df.groupby(['key1','key2']):
        print(k1)
        print(group)
    
    ('a', 'one')
      key1 key2     data1     data2
    0    a  one -0.687740  1.536286
    4    a  one  0.202405  0.911700
    ('a', 'two')
      key1 key2     data1     data2
    1    a  two  0.177156 -1.132658
    ('b', 'one')
      key1 key2     data1     data2
    2    b  one  2.024931  0.755856
    ('b', 'two')
      key1 key2     data1     data2
    3    b  two -0.304281 -0.629296
    
    pieces = dict(list(df.groupby('key1')))
    pieces['a']
    
       key1	key2	data1	  data2
    0	a	one	 -0.687740	 1.536286
    1	a	two	  0.177156	-1.132658
    4	a	one	  0.202405	 0.911700
    
    # 按数据类型对列尽心分组
    df.dtypes
    
    key1      object
    key2      object
    data1    float64
    data2    float64
    dtype: object
    
    grouped1 = df.groupby(df.dtypes, axis=1)
    dict(list(grouped1))
    
    {dtype('float64'):       data1     data2
     0 -0.687740  1.536286
     1  0.177156 -1.132658
     2  2.024931  0.755856
     3 -0.304281 -0.629296
     4  0.202405  0.911700, dtype('O'):   key1 key2
     0    a  one
     1    a  two
     2    b  one
     3    b  two
     4    a  one}
    

    选取一个或一个组列

    # 分组通常和运算是一起的,只分组,只能是出来一个分组对象
    df['data1'].groupby(df['key1'])
    <pandas.core.groupby.groupby.SeriesGroupBy object at 0x000002047407C0F0>
    
    df.groupby(['key1','key2'])[['data2']].mean()
    
                  data2
    key1 key2	
    a	 one	 1.223993
         two	-1.132658
    b	 one	 0.755856
         two	-0.629296
    

    通过字典或Series进行分组

    people = pd.DataFrame(np.random.randn(5,5),
                          columns = ['a','b','c','d','e'],
                          index = ['Joe','Steve','Wes','Jim','Travis']
                         )
    people
    
               a	        b	        c	        d	        e
    Joe	   -0.721168	-0.332402	-0.512183	 0.765475	-0.119330
    Steve	0.166230	 2.095133	-0.298735	-2.167574	-0.273929
    Wes	   -1.428962	-2.109915	-0.686155	-1.337891	 0.020153
    Jim	    1.041952	 0.130567	 0.776067	 0.508666	 0.100419
    Travis -0.215110	 0.611383	-0.011443	-0.552580	-0.274047
    
    people.ix['Steve':'Wes',['b','c']] = np.nan
    people
    
                  a	       b	       c	       d	        e
    Joe	    -0.721168	-0.332402	-0.512183	0.765475	-0.119330
    Steve	0.166230	NaN	          NaN	    -2.167574	-0.273929
    Wes	    -1.428962	NaN	          NaN	    -1.337891	 0.020153
    Jim	    1.041952	0.130567	 0.776067	 0.508666	 0.100419
    Travis	-0.215110	0.611383	-0.011443	-0.552580	-0.274047
    
    # 根据分组计算列的总计
    mapping = {'a':'red','b':'red','c':'blue',
              'd':'blue','e':'red','f':'orange'}
    by_column = people.groupby(mapping,axis=1).sum()
    by_column
    
             blue	       red
    Joe	     0.253291	-1.172899
    Steve	-2.167574	-0.107699
    Wes	    -1.337891	-1.408809
    Jim	     1.284733	 1.272937
    Travis	-0.564023	 0.122226
    
    # 检查映射的对应关系
    map_series = Series(mapping)
    map_series
    
    a       red
    b       red
    c      blue
    d      blue
    e       red
    f    orange
    dtype: object
    
    people.groupby(map_series,axis=1).count()
    
          blue	red
    Joe	    2	3
    Steve	1	2
    Wes	    1	2
    Jim	    2	3
    Travis	2	3
    

    通过函数进行分组

    # 仅仅传入len,就可以实现按名字长度分组
    people.groupby(len).sum()
    
           a	       b	      c	           d	      e
    3	-1.108178	-0.201835	0.263884	-0.063751	0.001242
    5	0.166230	0.000000	0.000000	-2.167574	-0.273929
    6	-0.215110	0.611383	-0.011443	-0.552580	-0.274047
    
    key_list = ['one','one','one','two','two']
    people.groupby([len,key_list]).min()
    
               a	      b	          c	           d	       e
    3	one	-1.428962	-0.332402	-0.512183	-1.337891	-0.119330
        two	 1.041952	0.130567	 0.776067	 0.508666	 0.100419
    5	one	 0.166230	NaN	         NaN	    -2.167574	-0.273929
    6	two	-0.215110	0.611383	-0.011443	-0.552580	-0.274047
    
    

    根据索引级别分组

    columns = pd.MultiIndex.from_arrays([['US','US','US','JP','JP'],[1,3,5,1,3]],names=['cty','tenor'])
    hier_df = pd.DataFrame(np.random.randn(4,5),columns=columns)
    hier_df
    
    cty	     US	                                    JP
    tenor	1	        3	         5	         1	        3
    0	-0.637036	 0.230604	 0.425606	 0.579673	-0.408535
    1	 0.987018	-1.313152	-0.322781	-1.393266	-0.393867
    2	-2.343672	 0.385606	 0.992302	-0.791528	-0.774870
    3	-0.809241	-0.654219	-0.860250	 0.222547	 0.049766
    
    hier_df.groupby(level='cty',axis=1).count()
    
    cty	JP	US
    0	2	3
    1	2	3
    2	2	3
    3	2	3
    
  • 相关阅读:
    uWSGI 是一種協議,同時它也是一個程式。
    影像数据的裁切和保存
    Android Studio、IDEA 获取开发版SHA1值和发布版SHA1值,详细过程
    Android Studio的Signature Versions选择,分别是什么意思
    Android Studio更改项目SDK的版本
    c#+ArcGIS Engine-获取矢量图层的空间参考
    64。node.js 中间件express-session使用详解
    63.note.js之 Mongodb在Nodejs上的配置及session会话机制的实现
    62.nodejs中的ejs模板学习
    61.node.js开发错误——Error: Connection strategy not found
  • 原文地址:https://www.cnblogs.com/lishi-jie/p/10083675.html
Copyright © 2011-2022 走看看