zoukankan      html  css  js  c++  java
  • groupby

    In [1]:
    import warnings
    import math
    import pandas as pd
    import numpy as np
    import matplotlib
    
    warnings.filterwarnings('ignore')
    pd.options.display.max_rows = 100
    pd.options.display.max_columns = 100
    pd.set_option('max_colwidth', 500)
    
    get_ipython().magic(u'matplotlib inline')
    matplotlib.style.use('ggplot')
    
    from matplotlib import pyplot as plt
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    
    myfont = matplotlib.font_manager.FontProperties(fname=u'simsun.ttc', size=14)
    
    In [15]:
    data = pd.DataFrame({
        'age' : np.random.randint(15, 100, 100),
        'height':np.random.randint(140, 180, 100),
        'weight':np.random.randint(40, 80, 100),
        'gender':np.random.randint(0,2, 100),
        'salary':np.random.randint(3000, 30000, 100)
    })
    data.head()
    
    Out[15]:
     
     ageheightweightgendersalary
    0 70 153 76 0 28492
    1 52 167 60 0 13457
    2 60 152 56 0 19341
    3 56 148 46 0 22948
    4 30 171 53 1 27829
    In [16]:
    data.gender = data.gender.map({0:'man', 1:'women'})
    data.head()
    
    Out[16]:
     
     ageheightweightgendersalary
    0 70 153 76 man 28492
    1 52 167 60 man 13457
    2 60 152 56 man 19341
    3 56 148 46 man 22948
    4 30 171 53 women 27829
    In [28]:
    # group 对象
    group = data.groupby('gender', as_index=False)
    list(group)[0]
    
    Out[28]:
    ('man',     age  height  weight gender  salary
     0    70     153      76    man   28492
     1    52     167      60    man   13457
     2    60     152      56    man   19341
     3    56     148      46    man   22948
     5    43     169      78    man   24664
     10   53     155      68    man   18598
     11   78     172      67    man    7968
     12   60     148      42    man    9037
     13   29     164      71    man   18313
     14   46     166      66    man   25126
     17   61     174      51    man    3431
     18   96     159      52    man   10823
     21   96     161      78    man    4995
     26   32     140      41    man    7146
     27   98     168      59    man    5033
     30   67     155      50    man   24194
     35   84     151      78    man   19993
     36   44     148      69    man   18338
     37   79     166      54    man   11029
     39   37     175      52    man    8755
     41   90     175      47    man   15473
     42   23     147      53    man   25314
     43   73     167      73    man   17872
     44   26     168      45    man   27260
     45   50     173      40    man    5016
     46   53     142      78    man   12550
     48   94     174      53    man    7372
     49   65     151      50    man   11583
     53   84     141      79    man   26520
     56   65     147      50    man   21603
     57   94     168      61    man   13765
     58   17     159      60    man    3645
     59   78     140      44    man   19553
     60   42     144      49    man   27545
     61   50     140      59    man   18159
     62   83     179      69    man   11343
     65   47     175      59    man   17985
     66   65     171      65    man   14097
     67   82     154      74    man   15888
     69   58     155      67    man   23449
     70   98     178      40    man   11743
     73   49     165      77    man   15365
     77   74     159      46    man   28667
     79   15     144      55    man   10374
     84   19     142      41    man   21732
     86   16     143      78    man   11782
     87   91     152      57    man    8086
     91   99     147      52    man    5697
     93   29     160      54    man    3031)
    In [25]:
    # agg
    group.agg({'age':'mean','height':'mean'})
    
    Out[25]:
     
     genderageheight
    0 man 60.612245 158.183673
    1 women 59.490196 159.156863
    In [33]:
    # transform
    data['avg_age'] = group['age'].transform('mean')
    data.head()
    
    Out[33]:
     
     ageheightweightgendersalaryavg_age
    0 70 153 76 man 28492 60.612245
    1 52 167 60 man 13457 60.612245
    2 60 152 56 man 19341 60.612245
    3 56 148 46 man 22948 60.612245
    4 30 171 53 women 27829 59.490196
    In [35]:
    # apply
    def oldest(x):
        df = x.sort_values(by='age', ascending=False)
        return df.iloc[-1,:]
    group.apply(oldest)
    
    Out[35]:
     
     ageheightweightgendersalaryavg_age
    0 15 144 55 man 10374 60.612245
    1 16 168 65 women 14140 59.490196
    In [53]:
    def age_level(age):
        return 'young' if age < 30 else ('middle' if age < 60 else 'senior')
    data['level'] = data.age.map(age_level)
    data.head()
    
    Out[53]:
     
     ageheightweightgendersalaryavg_agelevel
    0 70 153 76 man 28492 60.612245 senior
    1 52 167 60 man 13457 60.612245 middle
    2 60 152 56 man 19341 60.612245 senior
    3 56 148 46 man 22948 60.612245 middle
    4 30 171 53 women 27829 59.490196 middle
    In [68]:
    # 分组百分比
    age_dist = data.groupby(['gender', 'level']).agg({'age':'count'})
    age_dist
    
    Out[68]:
     
      age
    genderlevel 
    manmiddle 15
    senior 26
    young 8
    womenmiddle 20
    senior 26
    young 5
    In [69]:
    # gender_pcts 
    age_dist.groupby(level = 0).apply(lambda x:x/float(x.sum()))
    
    Out[69]:
     
      age
    genderlevel 
    manmiddle 0.306122
    senior 0.530612
    young 0.163265
    womenmiddle 0.392157
    senior 0.509804
    young 0.098039
    In [70]:
    age_dist.groupby(level = 'level').apply(lambda x:x/float(x.sum()))
    
    Out[70]:
     
      age
    genderlevel 
    manmiddle 0.428571
    senior 0.500000
    young 0.615385
    womenmiddle 0.571429
    senior 0.500000
    young 0.384615
    In [64]:
    age_dist.groupby(level = 1).apply(lambda x:x/float(x.sum()))
    
    Out[64]:
     
      age
    genderlevel 
    manmiddle 0.428571
    senior 0.500000
    young 0.615385
    womenmiddle 0.571429
    senior 0.500000
    young 0.384615
  • 相关阅读:
    C#语法造成的小问题(编译原理知识)
    COM套间对.NET程序使用COM对象的影响
    为什么连接字符串一定要用StringBuilder(介绍CLR Profiler)
    编译原理系列文章
    .NET与COM互操作系列
    Windows XP SidebySide功能对VC程序的影响
    引起FileNotFoundException原因通用分析过程
    Flex组件的项目渲染器(ItemRenderer)使用总结
    Flex组件开发总结20090209
    如何去掉超链接图片外蓝色的边框
  • 原文地址:https://www.cnblogs.com/qwj-sysu/p/12303757.html
Copyright © 2011-2022 走看看