zoukankan      html  css  js  c++  java
  • pandas 使用总结

    import pandas as pd
    import numpy as np
    
    ## 从字典初始化df
    ipl_data = {'Team': ['Riders', 'Riders', 'Devils', 'Devils', 'Kings',
             'Kings', 'Kings', 'Kings', 'Riders', 'Royals', 'Royals', 'Riders'],
             'Rank': [1, 2, 2, 3, 3,4 ,1 ,1,2 , 4,1,2],
             'Year': [2014,2015,2014,2015,2014,2015,2016,2017,2016,2014,2015,2017],
             'Points':[876,789,863,673,741,812,756,788,694,701,804,690]}
    df = pd.DataFrame(ipl_data)
    print(df)
    
          Team  Rank  Year  Points
    0   Riders     1  2014     876
    1   Riders     2  2015     789
    2   Devils     2  2014     863
    3   Devils     3  2015     673
    4    Kings     3  2014     741
    5    Kings     4  2015     812
    6    Kings     1  2016     756
    7    Kings     1  2017     788
    8   Riders     2  2016     694
    9   Royals     4  2014     701
    10  Royals     1  2015     804
    11  Riders     2  2017     690
    
    print(df.groupby('Team')) ## groupby 返回的对象
    
    <pandas.core.groupby.groupby.DataFrameGroupBy object at 0x7fcbff80a240>
    
    print(df.groupby('Team').groups) ##用groups属性来进行查看每个分组
    
    {'Devils': Int64Index([2, 3], dtype='int64'), 'Kings': Int64Index([4, 5, 6, 7], dtype='int64'), 'Riders': Int64Index([0, 1, 8, 11], dtype='int64'), 'Royals': Int64Index([9, 10], dtype='int64')}
    
    ## 对groupby 后的结果进行遍历
    grouped = df.groupby('Year')
    for name,group in grouped:
        print(name)
        print(group)
    
    2014
         Team  Rank  Year  Points
    0  Riders     1  2014     876
    2  Devils     2  2014     863
    4   Kings     3  2014     741
    9  Royals     4  2014     701
    2015
          Team  Rank  Year  Points
    1   Riders     2  2015     789
    3   Devils     3  2015     673
    5    Kings     4  2015     812
    10  Royals     1  2015     804
    2016
         Team  Rank  Year  Points
    6   Kings     1  2016     756
    8  Riders     2  2016     694
    2017
          Team  Rank  Year  Points
    7    Kings     1  2017     788
    11  Riders     2  2017     690
    
    ## 从多个groups中获取单个group
    grouped = df.groupby('Year')
    print(grouped.get_group(2014))
    
         Team  Rank  Year  Points
    0  Riders     1  2014     876
    2  Devils     2  2014     863
    4   Kings     3  2014     741
    9  Royals     4  2014     701
    
    ## 使用agg聚合函数计算均值
    grouped = df.groupby('Year')
    print(grouped['Points'].agg('mean'))
    
    Year
    2014    795.25
    2015    769.50
    2016    725.00
    2017    739.00
    Name: Points, dtype: float64
    
    ## 使用agg聚合函数计算数据条数
    grouped = df.groupby('Team')
    print(grouped.agg(np.size))
    
            Rank  Year  Points
    Team                      
    Devils     2     2       2
    Kings      4     4       4
    Riders     4     4       4
    Royals     2     2       2
    
    ## 使用多个agg聚合函数进行计算
    grouped = df.groupby('Team')
    print(grouped.agg([np.sum, np.mean, np.std]))
    print(grouped['Points'].agg([np.sum, np.mean, np.std]))
    print(grouped['Points'].agg({'Points':[np.sum, np.mean, np.std],'Rank':[np.mean]}))  ## 分别指定不同的聚合函数
    
           Rank                  Year                   Points                    
            sum  mean       std   sum    mean       std    sum    mean         std
    Team                                                                          
    Devils    5  2.50  0.707107  4029  2014.5  0.707107   1536  768.00  134.350288
    Kings     9  2.25  1.500000  8062  2015.5  1.290994   3097  774.25   31.899582
    Riders    7  1.75  0.500000  8062  2015.5  1.290994   3049  762.25   88.567771
    Royals    5  2.50  2.121320  4029  2014.5  0.707107   1505  752.50   72.831998
             sum    mean         std
    Team                            
    Devils  1536  768.00  134.350288
    Kings   3097  774.25   31.899582
    Riders  3049  762.25   88.567771
    Royals  1505  752.50   72.831998
           Points                        Rank
              sum    mean         std    mean
    Team                                     
    Devils   1536  768.00  134.350288  768.00
    Kings    3097  774.25   31.899582  774.25
    Riders   3049  762.25   88.567771  762.25
    Royals   1505  752.50   72.831998  752.50
    
    
    /home/disk1/data/tangshengyu_dxm/tools/env_py36/lib/python3.6/site-packages/ipykernel_launcher.py:5: FutureWarning: using a dict on a Series for aggregation
    is deprecated and will be removed in a future version
      """
    
    ## grouped数据重新生成dataframe
    print(df.groupby('Year')['Team'].apply(len).reset_index())   ## 一级列名
    print(df.groupby('Year')['Team'].apply(len).to_frame())      ## 多级列名,列变为索引
    
       Year  Team
    0  2014     4
    1  2015     4
    2  2016     2
    3  2017     2
          Team
    Year      
    2014     4
    2015     4
    2016     2
    2017     2
    
    ## 更改聚合后的列名
    grouped_df = grouped.agg({'Points':['min','max','mean']})
    print(grouped_df.columns)
    print(grouped_df.columns.values)
    grouped_df.columns = ['_'.join(col_tuple) for col_tuple in grouped_df.columns.values]
    grouped_df.reset_index()
    
    MultiIndex(levels=[['Points'], ['min', 'max', 'mean']],
               labels=[[0, 0, 0], [0, 1, 2]])
    [('Points', 'min') ('Points', 'max') ('Points', 'mean')]
    
    Team Points_min Points_max Points_mean
    0 Devils 673 863 768.00
    1 Kings 741 812 774.25
    2 Riders 690 876 762.25
    3 Royals 701 804 752.50
    ## group 后的数据进行transform
    grouped = df.groupby('Team')
    score = lambda x: (x - x.mean()) 
    print(grouped.transform(score))
    
        Rank  Year  Points
    0  -0.75  -1.5  113.75
    1   0.25  -0.5   26.75
    2  -0.50  -0.5   95.00
    3   0.50   0.5  -95.00
    4   0.75  -1.5  -33.25
    5   1.75  -0.5   37.75
    6  -1.25   0.5  -18.25
    7  -1.25   1.5   13.75
    8   0.25   0.5  -68.25
    9   1.50  -0.5  -51.50
    10 -1.50   0.5   51.50
    11  0.25   1.5  -72.25
    
    ## filter 过滤 (返回满足条件的)
    grouped = df.groupby('Team')
    print(grouped.filter(lambda x: len(x)>3))
    
          Team  Rank  Year  Points
    0   Riders     1  2014     876
    1   Riders     2  2015     789
    4    Kings     3  2014     741
    5    Kings     4  2015     812
    6    Kings     1  2016     756
    7    Kings     1  2017     788
    8   Riders     2  2016     694
    11  Riders     2  2017     690
    
    ## 每个分组的数据量
    grouped = df.groupby('Team')
    print(grouped.apply(lambda x: len(x)))
    print(type(grouped.apply(lambda x: len(x))))
    
    Team
    Devils    2
    Kings     4
    Riders    4
    Royals    2
    dtype: int64
    <class 'pandas.core.series.Series'>
    
    ## 多行字符串组合成一行
    print(df)
    df_grouped = df.groupby(['Year'])['Team'].apply(';'.join).reset_index()
    print(df_grouped)
    
          Team  Rank  Year  Points
    0   Riders     1  2014     876
    1   Riders     2  2015     789
    2   Devils     2  2014     863
    3   Devils     3  2015     673
    4    Kings     3  2014     741
    5    Kings     4  2015     812
    6    Kings     1  2016     756
    7    Kings     1  2017     788
    8   Riders     2  2016     694
    9   Royals     4  2014     701
    10  Royals     1  2015     804
    11  Riders     2  2017     690
       Year                        Team
    0  2014  Riders;Devils;Kings;Royals
    1  2015  Riders;Devils;Kings;Royals
    2  2016                Kings;Riders
    3  2017                Kings;Riders
    
    ## 一行变多行
    def explode(df,tar_col_name):
        tar_col_list = [tar_col_name]
        rem_col_list = df.columns.difference(tar_col_list)
        rem_col_list = list(rem_col_list)
        df_new = df.set_index(rem_col_list)
        df_explode = pd.DataFrame(df_new[tar_col_name].tolist(),index=df_new.index)
        df_explode = df_explode.stack().to_frame()
        df_explode.columns = tar_col_list
        df_explode = df_explode.reset_index(level= rem_col_list)
        return df_explode
    
    df_grouped['Team'] = df_grouped['Team'].apply(lambda s:s.split(';')) ## 先split得到list
    print(df_grouped)
    explode(df_grouped,'Team')
    
       Year                             Team
    0  2014  [Riders, Devils, Kings, Royals]
    1  2015  [Riders, Devils, Kings, Royals]
    2  2016                  [Kings, Riders]
    3  2017                  [Kings, Riders]
    
    Year Team
    0 2014 Riders
    1 2014 Devils
    2 2014 Kings
    3 2014 Royals
    0 2015 Riders
    1 2015 Devils
    2 2015 Kings
    3 2015 Royals
    0 2016 Kings
    1 2016 Riders
    0 2017 Kings
    1 2017 Riders
    # 将多列合并成一列
    data = [['Alex', 10, 150], ['Bob',  12, 153], ['Clarke', 13, 160], ['Tom', 12, 160]]
    df = pd.DataFrame(data, columns=['Name', 'Age', 'Stature'])
    print(df)
    df_new = df['Age'].astype(str) +'-'+ df['Stature'].astype(str)
    print(df_new)
    
         Name  Age  Stature
    0    Alex   10      150
    1     Bob   12      153
    2  Clarke   13      160
    3     Tom   12      160
    0    10-150
    1    12-153
    2    13-160
    3    12-160
    dtype: object
    
    ## 一列拆分成多列
    ipl_data = {'Team': ['Riders', 'Riders', 'Devils', 'Devils', 'Kings',
             'Kings', 'Kings', 'Kings', 'Riders', 'Royals', 'Royals', 'Riders'],
             'Rank': [1, 2, 2, 3, 3,4 ,1 ,1,2 , 4,1,2],
             'Year': [2014,2015,2014,2015,2014,2015,2016,2017,2016,2014,2015,2017],
             'Points':[876,789,863,673,741,812,756,788,694,701,804,690]}
    df = pd.DataFrame(ipl_data)
    df_grouped = df.groupby(['Year'])['Team'].apply(';'.join).reset_index()
    print(df_grouped)
    df_grouped['Team'].str.split(';', expand=True)
    
       Year                        Team
    0  2014  Riders;Devils;Kings;Royals
    1  2015  Riders;Devils;Kings;Royals
    2  2016                Kings;Riders
    3  2017                Kings;Riders
    
    0 1 2 3
    0 Riders Devils Kings Royals
    1 Riders Devils Kings Royals
    2 Kings Riders None None
    3 Kings Riders None None
    def df2libsvm(df,missing_value='-9999'):
        re_list=[]
        length=len(df)
        for i in range(length):
            row_i=df.iloc[i]
            row_dict=row_i.to_dict()
            row_list=[]
            for key in row_dict:
                if row_dict[key]==missing_value:
                    continue
                row_list.append('%s:%s'%(key,str(row_dict[key])))
            re_list.append(row_list)
        return re_list
    
    def libsvm2df():
        """
        mydict = [{'b': 2, 'c': 3, 'd': 4},
    ...           {'a': 100, 'c': 300, 'd': 400},
    ...           {'a': 1000, 'b': 2000, 'c': 3000}]
        df=pd.DataFrame(mydict)
        """
    
    def calcu_iv(df,feat_col,label_col,good,bad):
        import numpy as np
        def f(x,label_col,good,bad):
            d = {}
            d['bin_bad_cnt'] = (x[label_col]==bad).sum()
            d['bin_good_cnt'] = (x[label_col]==good).sum()
            return pd.Series(d, index=['bin_good_cnt', 'bin_bad_cnt'])
    
        df_woe = df.groupby(feat_col).apply(f,label_col=label_col,good=good,bad=bad).reset_index()
        all_good_cnt = df_woe.bin_good_cnt.sum()
        all_bad_cnt = df_woe.bin_bad_cnt.sum()
        if all_bad_cnt==0:
            all_bad_cnt=1
        if all_good_cnt==0:
            all_good_cnt=1
        df_woe = df_woe.replace({'bin_bad_cnt': {0: 0.1}})
        df_woe = df_woe.replace({'bin_good_cnt': {0: 0.1}})
        df_woe['distribution_good'] = df_woe['bin_good_cnt']/float(all_good_cnt)
        df_woe['distribution_bad'] = df_woe['bin_bad_cnt']/float(all_bad_cnt)
        df_woe['WoE'] = np.log(df_woe['distribution_good']/df_woe['distribution_bad'])
        df_woe['IV'] = df_woe['WoE'] * (df_woe['distribution_good'] - df_woe['distribution_bad'])
        df_woe_inf = df_woe[df_woe['WoE']==np.inf]
        iv = df_woe['IV'].sum()
        return iv,df_woe
    
    

    pandas groupby agg 应用分位数函数

    def percentile(n):
        def percentile_(x):
            return np.percentile(x, n)
        percentile_.__name__ = 'p%s' % n
        return percentile_
    
    df_group = df.groupby(['start_site','end_site','mean_mileage','std_mileage','cv'])
    df_route = df_group[['route_cnt']].agg('sum')
    df_route['diff_mean'] = df_group[['diff']].agg('mean')
    df_route['diff_std'] = df_group[['diff']].agg(np.std)
    df_route['diff_p10'] = df_group[['diff']].agg(percentile(10))
    df_route['diff_p30'] = df_group[['diff']].agg(percentile(30))
    
  • 相关阅读:
    Redis 主从复制架构配置及原理
    IPTABLES详解(10):IPTABLES自定义链
    ipset
    Atitit 数据库核心技术index索引技术 btree hash lsm fulltxt目录1.1. HASH
    Atitit 存储引擎核心技术 总结目录1. 表的存储有三个文件:结构+数据+索引 12. 页式管理
    Atitit 为什么oracle这类大型数据库比mysql的性能机制目录1. 分区机制差别 11.1. Join算
    Atitit 存储与数据库性能调优流程目录1. 数据库出现性能瓶颈,对外表现有几个方面:
    ATITIT db perf enhs 数据库性能优化 目录 第一章 Cache类 1 第一节 查询cache 1 第二节 Update cache 2 第三节 内存表机制 零时表 2 第四节 雾
    Atitit 核心技术有哪些一般 目录 第一章 Rest调用交互 2 第二章 2 第三章 Cmd调用交互 2 第四章 2 第五章 爬虫技术 2 第一节 Httpclient 2 第二节 Html
    Atitit保证架构超前性 前瞻性 目录 第一章 为什么需要修改代码 1 第一节 业务增加功能 1 第二节 增加字段 1 第三节 增加表数据需要查询 修改 1 第四节 类库升级 1 第二章 简单抽象
  • 原文地址:https://www.cnblogs.com/sandy-t/p/9929181.html
Copyright © 2011-2022 走看看