zoukankan      html  css  js  c++  java
  • Pandas学习笔记——综合练习

    在这里插入图片描述
    Task06:综合练习(1天)

    import numpy as np
    import pandas as pd
    
    # 加上这两行可以一次性输出多个变量而不用print
    from IPython.core.interactiveshell import InteractiveShell
    InteractiveShell.ast_node_interactivity = "all"
    
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    

    一、2002 年——2018 年上海机动车拍照拍卖

    问题

    (1) 哪一次拍卖的中标率首次小于5%?(考察点:创建列,索引)

    path = 'C:/Users/86187/Desktop/第12期组队学习/组队学习Pandas/'
    df = pd.read_csv(path + '2002年-2018年上海机动车拍照拍卖.csv')
    df.head()
    df.shape
    
    Date Total number of license issued lowest price avg price Total number of applicants
    0 2-Jan 1400 13600 14735 3718
    1 2-Feb 1800 13100 14057 4590
    2 2-Mar 2000 14300 14662 5190
    3 2-Apr 2300 16000 16334 4806
    4 2-May 2350 17800 18357 4665
    (203, 5)
    
    df_1 = df.copy()
    df_1['New'] = df_1['Total number of license issued'] / df_1[
        'Total number of applicants']
    for i in range(df.shape[0]):
        if df_1['New'][i] < 0.05:
            df_1.loc[i]
            break
    
    Date                                 15-May
    Total number of license issued         7482
    lowest price                          79000
    avg price                             79099
    Total number of applicants           156007
    New                               0.0479594
    Name: 159, dtype: object
    

    (2) 将第一列时间列拆分成两个列,一列为年份(格式为20××),另一列为月份(英语缩写),添加到列表作为第一第二列,并将原表第一列删除,其他列依次向后顺延。
    (考察点:拆分列)

    df_2 = df.copy()
    date = pd.MultiIndex.from_tuples(df_2.Date.apply(lambda x: x.split('-')))
    date
    
    MultiIndex([( '2', 'Jan'),
                ( '2', 'Feb'),
                ( '2', 'Mar'),
                ( '2', 'Apr'),
                ( '2', 'May'),
                ( '2', 'Jun'),
                ( '2', 'Jul'),
                ( '2', 'Aug'),
                ( '2', 'Sep'),
                ( '2', 'Oct'),
                ...
                ('18', 'Mar'),
                ('18', 'Apr'),
                ('18', 'May'),
                ('18', 'Jun'),
                ('18', 'Jul'),
                ('18', 'Aug'),
                ('18', 'Sep'),
                ('18', 'Oct'),
                ('18', 'Nov'),
                ('18', 'Dec')],
               length=203)
    
    df_2.index = date
    df_2 = df_2.reset_index().drop(columns='Date')
    df_2.head()
    
    level_0 level_1 Total number of license issued lowest price avg price Total number of applicants
    0 2 Jan 1400 13600 14735 3718
    1 2 Feb 1800 13100 14057 4590
    2 2 Mar 2000 14300 14662 5190
    3 2 Apr 2300 16000 16334 4806
    4 2 May 2350 17800 18357 4665
    df_2['level_0'] = df_2.level_0.apply(lambda x: str(2000+int(x)))
    df_2.head()
    
    level_0 level_1 Total number of license issued lowest price avg price Total number of applicants
    0 2002 Jan 1400 13600 14735 3718
    1 2002 Feb 1800 13100 14057 4590
    2 2002 Mar 2000 14300 14662 5190
    3 2002 Apr 2300 16000 16334 4806
    4 2002 May 2350 17800 18357 4665
    df_2 = df_2.rename({'level_0':'Year', 'level_1':'Month'},axis=1)
    df_2.head()
    
    Year Month Total number of license issued lowest price avg price Total number of applicants
    0 2002 Jan 1400 13600 14735 3718
    1 2002 Feb 1800 13100 14057 4590
    2 2002 Mar 2000 14300 14662 5190
    3 2002 Apr 2300 16000 16334 4806
    4 2002 May 2350 17800 18357 4665

    (3) 按年统计拍卖最低价的下列统计量:最大值、均值、0.75 分位数,要求显示在同一张表上。(考察点:创建列、索引、分组、统计量、分列)

    df_3 = df_2.copy()
    df_3.head()
    df_3.index
    df_3.columns
    
    Year Month Total number of license issued lowest price avg price Total number of applicants
    0 2002 Jan 1400 13600 14735 3718
    1 2002 Feb 1800 13100 14057 4590
    2 2002 Mar 2000 14300 14662 5190
    3 2002 Apr 2300 16000 16334 4806
    4 2002 May 2350 17800 18357 4665
    RangeIndex(start=0, stop=203, step=1)
    
    Index(['Year', 'Month', 'Total number of license issued', 'lowest price ',
           'avg price', 'Total number of applicants'],
          dtype='object')
    
    from collections import OrderedDict
    grouped = df_3.groupby('Year')
    
    def f(df_3):
        data = OrderedDict()
        data['Max'] = df_3['lowest price '].max()
        data['Mean'] = df_3['lowest price '].mean()
        data['quantile_0.75'] = df_3['lowest price '].quantile(q=0.75)
        return pd.Series(data)
    
    grouped.apply(f)
    
    Max Mean quantile_0.75
    Year
    2002 30800.0 20316.666667 24300.0
    2003 38500.0 31983.333333 36300.0
    2004 44200.0 29408.333333 38400.0
    2005 37900.0 31908.333333 35600.0
    2006 39900.0 37058.333333 39525.0
    2007 53800.0 45691.666667 48950.0
    2008 37300.0 29945.454545 34150.0
    2009 36900.0 31333.333333 34150.0
    2010 44900.0 38008.333333 41825.0
    2011 53800.0 47958.333333 51000.0
    2012 68900.0 61108.333333 65325.0
    2013 90800.0 79125.000000 82550.0
    2014 74600.0 73816.666667 74000.0
    2015 85300.0 80575.000000 83450.0
    2016 88600.0 85733.333333 87475.0
    2017 93500.0 90616.666667 92350.0
    2018 89000.0 87825.000000 88150.0

    (4) 现在将表格行索引设为多级索引,外层为年份,内层为原表格第二至第五列的变量名,列索引为月份。(考察点:多级索引)

    df_4 = df_2.copy()
    df_4.head()
    df_4.columns
    
    Year Month Total number of license issued lowest price avg price Total number of applicants
    0 2002 Jan 1400 13600 14735 3718
    1 2002 Feb 1800 13100 14057 4590
    2 2002 Mar 2000 14300 14662 5190
    3 2002 Apr 2300 16000 16334 4806
    4 2002 May 2350 17800 18357 4665
    Index(['Year', 'Month', 'Total number of license issued', 'lowest price ',
           'avg price', 'Total number of applicants'],
          dtype='object')
    
    # 貌似不是想要的结果
    df_4 = df_4.set_index([
        'Year', 'Total number of license issued', 'lowest price ', 'avg price',
        'Total number of applicants'
    ])
    df_4.head()
    
    Month
    Year Total number of license issued lowest price avg price Total number of applicants
    2002 1400 13600 14735 3718 Jan
    1800 13100 14057 4590 Feb
    2000 14300 14662 5190 Mar
    2300 16000 16334 4806 Apr
    2350 17800 18357 4665 May
    df_4 = pd.pivot_table(df_4,
                         index='Year',
                         columns='Month',
                         values=[
                             'Total number of license issued', 'lowest price ',
                             'avg price', 'Total number of applicants'
                         ])
    df_4.head()
    
    Total number of applicants Total number of license issued avg price lowest price
    Month Apr Aug Dec Feb Jan Jul Jun Mar May Nov Oct Sep Apr Aug Dec Feb Jan Jul Jun Mar May Nov Oct Sep Apr Aug Dec Feb Jan Jul Jun Mar May Nov Oct Sep Apr Aug Dec Feb Jan Jul Jun Mar May Nov Oct Sep
    Year
    2002 4806.0 4640.0 3525.0 4590.0 3718.0 3774.0 4502.0 5190.0 4665.0 4021.0 4661.0 4393.0 2300.0 3000.0 3600.0 1800.0 1400.0 3000.0 2800.0 2000.0 2350.0 3200.0 3200.0 3200.0 16334.0 21601.0 27848.0 14057.0 14735.0 20904.0 20178.0 14662.0 18357.0 31721.0 27040.0 24040.0 16000.0 21000.0 27800.0 13100.0 13600.0 19800.0 19600.0 14300.0 17800.0 30800.0 26400.0 23600.0
    2003 8794.0 9315.0 10491.0 12030.0 9442.0 11929.0 15507.0 11219.0 14634.0 9849.0 9383.0 8532.0 3300.0 4500.0 4776.0 3000.0 3000.0 6000.0 5500.0 3000.0 3800.0 5042.0 4500.0 6650.0 34845.0 39369.0 38054.0 25254.0 24267.0 38269.0 37667.0 29551.0 36903.0 34284.0 34842.0 38728.0 34100.0 38500.0 37100.0 23800.0 18800.0 36900.0 36100.0 28800.0 35000.0 33100.0 32800.0 28800.0
    2004 8150.0 15506.0 9005.0 10156.0 8663.0 14464.0 19233.0 9950.0 8114.0 9188.0 9519.0 10634.0 5500.0 6800.0 5500.0 4800.0 5000.0 6600.0 6233.0 4800.0 6527.0 6600.0 6600.0 6640.0 45492.0 25991.0 30282.0 40053.0 39516.0 23544.0 21001.0 43333.0 34226.0 27620.0 29768.0 30033.0 44200.0 25100.0 29300.0 39600.0 38000.0 21800.0 17800.0 43000.0 10800.0 26000.0 28000.0 29300.0
    2005 8113.0 7520.0 8351.0 8949.0 6208.0 8777.0 8409.0 9117.0 9673.0 13633.0 11167.0 10972.0 5000.0 6829.0 5700.0 3800.0 5500.0 6326.0 5690.0 4000.0 5833.0 5700.0 6000.0 6700.0 37355.0 35905.0 36749.0 32425.0 32520.0 38378.0 37479.0 34684.0 35661.0 30320.0 26385.0 28927.0 36800.0 25000.0 35200.0 31700.0 28500.0 37900.0 37000.0 34300.0 35000.0 29800.0 25200.0 26500.0
    2006 7888.0 9190.0 9477.0 12367.0 5907.0 8966.0 8478.0 8904.0 8301.0 110234.0 11237.0 7064.0 5000.0 6200.0 6500.0 3800.0 5000.0 5500.0 4500.0 4500.0 4500.0 6000.0 6500.0 6500.0 38326.0 40459.0 40518.0 34887.0 31220.0 39966.0 39752.0 38932.0 38139.0 38460.0 37899.0 41601.0 37500.0 39900.0 39800.0 34200.0 26900.0 39600.0 39500.0 38500.0 37700.0 37800.0 36300.0 37000.0
    df_4 = df_4.stack(level=0).fillna('-')
    df_4.head()
    # 不知道结果对不对,个人感觉还行,除了月份顺序不定
    
    Month Apr Aug Dec Feb Jan Jul Jun Mar May Nov Oct Sep
    Year
    2002 Total number of applicants 4806.0 4640.0 3525.0 4590 3718.0 3774.0 4502.0 5190.0 4665.0 4021.0 4661.0 4393.0
    Total number of license issued 2300.0 3000.0 3600.0 1800 1400.0 3000.0 2800.0 2000.0 2350.0 3200.0 3200.0 3200.0
    avg price 16334.0 21601.0 27848.0 14057 14735.0 20904.0 20178.0 14662.0 18357.0 31721.0 27040.0 24040.0
    lowest price 16000.0 21000.0 27800.0 13100 13600.0 19800.0 19600.0 14300.0 17800.0 30800.0 26400.0 23600.0
    2003 Total number of applicants 8794.0 9315.0 10491.0 12030 9442.0 11929.0 15507.0 11219.0 14634.0 9849.0 9383.0 8532.0

    (5) 一般而言某个月最低价与上月最低价的差额,会与该月均值与上月均值的差额具有相同的正负号,哪些拍卖时间不具有这个特点?(考察点:统计量,分组,合并,索引排序)

    df_5 = df_2[['Year','Month','lowest price ','avg price']].copy()
    df_5.head()
    df_5.columns
    
    Year Month lowest price avg price
    0 2002 Jan 13600 14735
    1 2002 Feb 13100 14057
    2 2002 Mar 14300 14662
    3 2002 Apr 16000 16334
    4 2002 May 17800 18357
    Index(['Year', 'Month', 'lowest price ', 'avg price'], dtype='object')
    
    df_5 = df_5.set_index(['Year', 'Month'])
    df_5.head()
    
    lowest price avg price
    Year Month
    2002 Jan 13600 14735
    Feb 13100 14057
    Mar 14300 14662
    Apr 16000 16334
    May 17800 18357
    index = df_5.index
    for ind in range(len(index)-1):
        if (df_5.loc[index[ind + 1]][0] - df_5.loc[index[ind]][0]) * (
                df_5.loc[index[ind + 1]][1] - df_5.loc[index[ind]][1]) < 0:
            print(df_2.loc[ind+1,['Year','Month']])
            print('
    ')
    
    Year     2003
    Month     Oct
    Name: 21, dtype: object
    


    Year 2003
    Month Nov
    Name: 22, dtype: object


    Year 2004
    Month Jun
    Name: 29, dtype: object


    Year 2005
    Month Jan
    Name: 36, dtype: object


    Year 2005
    Month Feb
    Name: 37, dtype: object


    Year 2005
    Month Sep
    Name: 44, dtype: object


    Year 2006
    Month May
    Name: 52, dtype: object


    Year 2006
    Month Sep
    Name: 56, dtype: object


    Year 2007
    Month Jan
    Name: 60, dtype: object


    Year 2007
    Month Feb
    Name: 61, dtype: object


    Year 2007
    Month Dec
    Name: 71, dtype: object


    Year 2012
    Month Oct
    Name: 128, dtype: object


    (6) 将某一个月牌照发行量与其前两个月发行量均值的差额定义为发行增益,最初的两个月用0 填充,求发行增益极值出现的时间。(考察点:统计量,索引,排序,差分)

    df_6 = df_2[['Year','Month','Total number of license issued']].copy()
    df_6['Gain'] = 0
    df_6.head()
    df_6.columns
    
    Year Month Total number of license issued Gain
    0 2002 Jan 1400 0
    1 2002 Feb 1800 0
    2 2002 Mar 2000 0
    3 2002 Apr 2300 0
    4 2002 May 2350 0
    Index(['Year', 'Month', 'Total number of license issued', 'Gain'], dtype='object')
    
    df_6 = df_6.set_index(['Year', 'Month'])
    df_6.head()
    
    Total number of license issued Gain
    Year Month
    2002 Jan 1400 0
    Feb 1800 0
    Mar 2000 0
    Apr 2300 0
    May 2350 0
    index = df_6.index
    for ind in range(2, len(index) - 1):
        mean = (df_6.loc[index[ind - 1]][0] + df_6.loc[index[ind - 2]][0]) / 2
        df_6['Gain'][ind] = df_6.loc[index[ind]][0] - mean
    df_6.head()
    
    Total number of license issued Gain
    Year Month
    2002 Jan 1400 0
    Feb 1800 0
    Mar 2000 400
    Apr 2300 400
    May 2350 200
    %matplotlib inline
    import matplotlib.pyplot as plt
    plt.figure(figsize=(20,10))
    df_6['Gain'].plot()
    

    在这里插入图片描述

    max = df_6['Gain'].max()
    min = df_6['Gain'].min()
    max
    min
    
    8500
    -3650
    
    for gain in range(len(df_6['Gain'])):
        if df_6['Gain'][gain] == max:
            print('极大值出现时间:', df_6.index[gain])
        if df_6['Gain'][gain] == min:
            print('极小值出现时间:', df_6.index[gain])
    
    极大值出现时间: ('2008', 'Jan')
    极小值出现时间: ('2008', 'Apr')
    

    二、2007 年——2019 年俄罗斯机场货运航班运载量

    问题

    path = 'C:/Users/86187/Desktop/第12期组队学习/组队学习Pandas/'
    df = pd.read_csv(path + '2007年-2019年俄罗斯货运航班运载量.csv')
    df.head()
    df.shape
    df.columns
    
    Airport name Year January February March April May June July August September October November December Whole year Airport coordinates
    0 Abakan 2019 44.70 66.21 72.7 75.82 100.34 78.38 63.88 73.06 66.74 75.44 110.5 89.8 917.57 (Decimal('91.399735'), Decimal('53.751351'))
    1 Aikhal 2019 0.00 0.00 0.0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.0 0.0 0.00 (Decimal('111.543324'), Decimal('65.957161'))
    2 Loss 2019 0.00 0.00 0.0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.0 0.0 0.00 (Decimal('125.398355'), Decimal('58.602489'))
    3 Amderma 2019 0.00 0.00 0.0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.0 0.0 0.00 (Decimal('61.577429'), Decimal('69.759076'))
    4 Anadyr (Carbon) 2019 81.63 143.01 260.9 304.36 122.00 106.87 84.99 130.00 102.00 118.00 94.0 199.0 1746.76 (Decimal('177.738273'), Decimal('64.713433'))
    (3711, 16)
    Index(['Airport name', 'Year', 'January', 'February', 'March', 'April', 'May',
           'June', 'July', 'August', 'September', 'October', 'November',
           'December', 'Whole year', 'Airport coordinates'],
          dtype='object')
    

    (1) 求每年货运航班总运量。(考察点:统计量,分组)

    df_1 = df.copy()
    df_1.groupby('Year')['Whole year'].sum()
    
    Year
    2007    659438.23
    2008    664682.46
    2009    560809.77
    2010    693033.98
    2011    818691.71
    2012    846388.03
    2013    792337.08
    2014    729457.12
    2015    630208.97
    2016    679370.15
    2017    773662.28
    2018    767095.28
    2019    764606.27
    Name: Whole year, dtype: float64
    

    (2) 每年记录的机场都是相同的吗?(考察点:分组,查看类别值)

    df_2 = df.copy()
    df_2.groupby('Year')['Airport name'].count()
    
    Year
    2007    292
    2008    292
    2009    292
    2010    292
    2011    292
    2012    292
    2013    292
    2014    292
    2015    292
    2016    292
    2017    292
    2018    248
    2019    251
    Name: Airport name, dtype: int64
    

    (3) 按年计算2010 年-2015 年全年货运量记录为0 的机场航班比例。(考察点:分组,统计量,筛选)

    df_3 = df.copy()
    df_3.head()
    
    Airport name Year January February March April May June July August September October November December Whole year Airport coordinates
    0 Abakan 2019 44.70 66.21 72.7 75.82 100.34 78.38 63.88 73.06 66.74 75.44 110.5 89.8 917.57 (Decimal('91.399735'), Decimal('53.751351'))
    1 Aikhal 2019 0.00 0.00 0.0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.0 0.0 0.00 (Decimal('111.543324'), Decimal('65.957161'))
    2 Loss 2019 0.00 0.00 0.0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.0 0.0 0.00 (Decimal('125.398355'), Decimal('58.602489'))
    3 Amderma 2019 0.00 0.00 0.0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.0 0.0 0.00 (Decimal('61.577429'), Decimal('69.759076'))
    4 Anadyr (Carbon) 2019 81.63 143.01 260.9 304.36 122.00 106.87 84.99 130.00 102.00 118.00 94.0 199.0 1746.76 (Decimal('177.738273'), Decimal('64.713433'))
    df_3 = df_3[(df_3.Year<=2015) & (df_3.Year>=2010)]
    df_3.head()
    
    Airport name Year January February March April May June July August September October November December Whole year Airport coordinates
    1083 Abakan 2015 37.70 47.97 54.67 82.12 68.81 112.95 55.83 95.20 137.79 72.13 63.67 78.30 907.14 (Decimal('91.399735'), Decimal('53.751351'))
    1084 Aikhal 2015 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 (Decimal('111.543324'), Decimal('65.957161'))
    1085 Loss 2015 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 (Decimal('125.398355'), Decimal('58.602489'))
    1086 Amderma 2015 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 (Decimal('61.577429'), Decimal('69.759076'))
    1087 Anadyr 2015 124.31 254.19 340.37 286.06 156.55 124.95 89.05 72.29 118.16 92.89 158.39 316.94 2134.15 (Decimal('177.738273'), Decimal('64.713433'))
    df_3.groupby('Year').apply(lambda x: len(x[x['Whole year'] == 0]) / len(x)
                               ).apply(lambda x: format(x, '.2%'))
    
    Year
    2010    76.71%
    2011    77.05%
    2012    77.05%
    2013    77.05%
    2014    77.05%
    2015    77.05%
    dtype: object
    

    (4) 若某机场至少存在5 年或以上满足所有月运量记录都为0,则将其所有年份的记录信息从表中删除,并返回处理后的表格(考察点:数据删除,分组,筛选,索引)

    df_4 = df.copy()
    df_4.head()
    df_4.describe()
    
    Airport name Year January February March April May June July August September October November December Whole year Airport coordinates
    0 Abakan 2019 44.70 66.21 72.7 75.82 100.34 78.38 63.88 73.06 66.74 75.44 110.5 89.8 917.57 (Decimal('91.399735'), Decimal('53.751351'))
    1 Aikhal 2019 0.00 0.00 0.0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.0 0.0 0.00 (Decimal('111.543324'), Decimal('65.957161'))
    2 Loss 2019 0.00 0.00 0.0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.0 0.0 0.00 (Decimal('125.398355'), Decimal('58.602489'))
    3 Amderma 2019 0.00 0.00 0.0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.0 0.0 0.00 (Decimal('61.577429'), Decimal('69.759076'))
    4 Anadyr (Carbon) 2019 81.63 143.01 260.9 304.36 122.00 106.87 84.99 130.00 102.00 118.00 94.0 199.0 1746.76 (Decimal('177.738273'), Decimal('64.713433'))
    Year January February March April May June July August September October November December Whole year
    count 3711.000000 3711.000000 3711.000000 3711.000000 3711.000000 3711.000000 3711.000000 3711.000000 3711.000000 3711.000000 3711.000000 3711.000000 3711.000000 3711.000000
    mean 2012.874427 143.047750 174.173236 205.743428 209.632010 207.959469 204.179264 207.930881 220.353824 227.028739 239.991000 239.022280 250.341396 2527.561663
    std 3.689772 1013.203152 1125.253840 1404.358066 1386.452459 1389.496213 1376.645217 1407.243660 1467.573705 1510.842715 1591.031855 1613.188413 1597.294536 16803.633543
    min 2007.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
    25% 2010.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
    50% 2013.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
    75% 2016.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
    max 2019.000000 21304.800000 22765.700000 27617.900000 28634.200000 27026.400000 30202.600000 28158.600000 28298.200000 29620.100000 30608.600000 32723.900000 32533.800000 329817.200000
    for name, group in df_4.groupby('Airport name'):
        num = group[group['Whole year'] == 0].shape[0]
        if num > 4:
            df_4 = df_4[df_4['Airport name'] != name]
    df_4.head()
    df_4.describe()
    
    Airport name Year January February March April May June July August September October November December Whole year Airport coordinates
    0 Abakan 2019 44.70 66.21 72.70 75.82 100.34 78.38 63.88 73.06 66.74 75.44 110.50 89.80 917.57 (Decimal('91.399735'), Decimal('53.751351'))
    4 Anadyr (Carbon) 2019 81.63 143.01 260.90 304.36 122.00 106.87 84.99 130.00 102.00 118.00 94.00 199.00 1746.76 (Decimal('177.738273'), Decimal('64.713433'))
    5 Anapa (Vitjazevo) 2019 45.92 53.15 54.00 54.72 52.00 67.45 172.31 72.57 70.00 63.00 69.00 82.10 856.22 (Decimal('37.341511'), Decimal('45.003748'))
    8 Arkhangelsk (Talagy) 2019 85.61 118.70 131.39 144.82 137.95 140.18 128.56 135.68 124.75 139.60 210.27 307.10 1804.61 (Decimal('40.714892'), Decimal('64.596138'))
    9 Astrakhan (Narimanovo) 2019 51.75 61.08 65.60 71.84 71.38 63.95 164.86 79.46 85.21 87.23 79.06 99.16 980.58 (Decimal('47.999896'), Decimal('46.287344'))
    Year January February March April May June July August September October November December Whole year
    count 862.000000 862.000000 862.000000 862.000000 862.000000 862.000000 862.000000 862.000000 862.000000 862.000000 862.000000 862.000000 862.000000 862.000000
    mean 2013.027842 614.009327 747.470093 882.867088 899.167599 892.588608 876.229327 892.193979 945.517935 974.562053 1029.921369 1025.466415 1073.369617 10848.347332
    std 3.757352 2033.238687 2242.107332 2810.670604 2768.077529 2776.264104 2752.586894 2814.629864 2931.593539 3017.721843 3177.001678 3225.895008 3179.534606 33560.997245
    min 2007.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
    25% 2010.000000 39.180000 53.335000 58.975000 63.495000 61.470000 62.392500 59.977500 65.795000 64.827500 68.045000 65.820000 79.150000 780.210000
    50% 2013.000000 117.370000 157.265000 186.725000 195.110000 171.350000 157.325000 154.060000 166.100000 176.570000 193.770000 193.000000 246.125000 2161.020000
    75% 2016.000000 397.075000 515.927500 558.567500 606.100000 620.500000 582.527500 618.047500 621.970000 629.922500 699.000000 658.585000 733.175000 7442.140000
    max 2019.000000 21304.800000 22765.700000 27617.900000 28634.200000 27026.400000 30202.600000 28158.600000 28298.200000 29620.100000 30608.600000 32723.900000 32533.800000 329817.200000

    (5) 采用一种合理的方式将所有机场划分为东南西北四个分区,并给出2017年-2019 年货运总量最大的区域。(考察点:分组)

    (6) 在统计学中常常用秩代表排名,现在规定某个机场某年某个月的秩为该机场该月在当年所有月份中货运量的排名(例如*** 机场19 年1 月运量在整个19 年12 个月中排名第一,则秩为1),那么判断某月运量情况的相对大小的秩方法为将所有机场在该月的秩排名相加,并将这个量定义为每一个月的秩综合指数,请根据上述定义计算2016 年12 个月的秩综合指数。(分组,合并,排序)

    df_6 = df.copy()
    df_6 = df_6[(df_6.Year==2016)]
    df_6.head()
    df_6.columns
    
    Airport name Year January February March April May June July August September October November December Whole year Airport coordinates
    791 Abakan 2016 34.1 45.41 58.97 72.71 91.66 78.49 64.31 127.29 89.07 74.99 78.66 94.01 909.67 (Decimal('91.399735'), Decimal('53.751351'))
    792 Aikhal 2016 0.0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 (Decimal('111.543324'), Decimal('65.957161'))
    793 Loss 2016 0.0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 (Decimal('125.398355'), Decimal('58.602489'))
    794 Amderma 2016 0.0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 (Decimal('61.577429'), Decimal('69.759076'))
    795 Anadyr 2016 146.5 232.97 245.75 429.50 188.14 126.57 71.93 117.67 109.68 108.43 133.43 108.43 2019.00 (Decimal('177.738273'), Decimal('64.713433'))
    Index(['Airport name', 'Year', 'January', 'February', 'March', 'April', 'May',
           'June', 'July', 'August', 'September', 'October', 'November',
           'December', 'Whole year', 'Airport coordinates'],
          dtype='object')
    
    month = df_6.columns[2:-2]
    for i in month:
        df_6[i+'_rank']=0    
    

    未完,待续。。。。。。

    参考内容

    1. 教程仓库连接
    2. 《利用Python进行数据分析》
  • 相关阅读:
    洛谷 P2700 逐个击破
    洛谷 P1503 鬼子进村
    洛谷 P1556 幸福的路
    洛谷 P1490 买蛋糕
    洛谷 P2507 [SCOI2008]配对
    code vs 3305 水果姐逛水果街Ⅱ
    通过idea远程调试
    【Cocos2d-x JavaScript Binding】
    ☀【SeaJS】SeaJS Grunt构建
    -_-#【Better Code】throttle / debounce
  • 原文地址:https://www.cnblogs.com/Jack-Tim-TYJ/p/12831894.html
Copyright © 2011-2022 走看看