zoukankan      html  css  js  c++  java
  • 二、pandas入门

    
    
    import numpy as np
    import pandas as pd
    
    
    
    Series:
    #创建Series方法1
    s1=pd.Series([1,2,3,4])
    s1
    # 0    1
    # 1    2
    # 2    3
    # 3    4
    # dtype: int64
    View Code
    s1.values#array([1, 2, 3, 4], dtype=int64)
    s1.index#RangeIndex(start=0, stop=4, step=1)
    

      

    #创建Series方法2
    s2=pd.Series(np.arange(5,10))
    print(s2)
    # 0    5
    # 1    6
    # 2    7
    # 3    8
    # 4    9
    # dtype: int32
    View Code
    #创建Series方法3
    s3=pd.Series({'5':1,'6':3,'7':9})
    print(s3)
    # 5    1
    # 6    3
    # 7    9
    # dtype: int64
    View Code
    print(s3.index)#Index(['5', '6', '7'], dtype='object')
    #创建Series方法4
    s4=pd.Series([1,2,3,4],index=['A','B','C','D'])
    print(s4)
    # A    1
    # B    2
    # C    3
    # D    4
    # dtype: int64
    
    #取值
    print(s4['A'])#1
    print(s4[s4>2])
    # C    3
    # D    4
    # dtype: int64
    
    #将Series转换成字典
    dict=s4.to_dict()
    print(dict)#{'A': 1, 'B': 2, 'C': 3, 'D': 4}
    
    #将字典转换为Series
    seri=pd.Series(dict)
    print(seri)
    # A    1
    # B    2
    # C    3
    # D    4
    # dtype: int64
    
    #改变Series的index
    index_1=['z','A','B','v','C']
    s5=pd.Series(s4,index=index_1)
    print(s5)
    # z    NaN
    # A    1.0
    # B    2.0
    # v    NaN
    # C    3.0
    # dtype: float64
    
    #判断是不是null
    print(pd.isnull(s5))
    # z     True
    # A    False
    # B    False
    # v     True
    # C    False
    # dtype: bool
    print(pd.notnull(s5))
    # z    False
    # A     True
    # B     True
    # v    False
    # C     True
    # dtype: bool
    
    #给Series起名字
    s5.name='demo'
    print(s5)
    # z    NaN
    # A    1.0
    # B    2.0
    # v    NaN
    # C    3.0
    # Name: demo, dtype: float64
    
    s5.index.name='demo index'
    print(s5.index)#Index(['z', 'A', 'B', 'v', 'C'], dtype='object', name='demo index')

    DataFrame:
    from pandas import Series,DataFrame
    import webbrowser
    link='https://www.tiobe.com/tiobe-index/'
    webbrowser.open(link)#打开该网站
    #复制网站中一下内容内容
    '''
       Jan 2019  Jan 2018  Change Programming Language  Ratings Change.1
    0         1         1     NaN                 Java  16.904%   +2.69%
    1         2         2     NaN                    C  13.337%   +2.30%
    2         3         4  change               Python   8.294%   +3.62%
    3         4         3  change                  C++   8.158%   +2.55%
    4         5         7  change    Visual Basic .NET   6.459%   +3.20%
    '''
    df=pd.read_clipboard()#从剪切板里创建DataFrame
    type(df)#pandas.core.frame.DataFrame
    print(df)#打印出和上述内容一样的DataFrame
    #    Jan 2019  Jan 2018  Change Programming Language  Ratings Change.1
    # 0         1         1     NaN                 Java  16.904%   +2.69%
    # 1         2         2     NaN                    C  13.337%   +2.30%
    # 2         3         4  change               Python   8.294%   +3.62%
    # 3         4         3  change                  C++   8.158%   +2.55%
    # 4         5         7  change    Visual Basic .NET   6.459%   +3.20%

      

    #获取列名
    print(df.columns)#Index(['Jan 2019', 'Jan 2018', 'Change', 'Programming Language', 'Ratings','Change.1'],dtype='object')
    

      

    #获取某一列的value
    print(df.Ratings)#获取Ratings列
    # 0    16.904%
    # 1    13.337%
    # 2     8.294%
    # 3     8.158%
    # 4     6.459%
    # Name: Ratings, dtype: object
    print(df['Jan 2019'])#获取'Jan 2019'列,因为两个单词,所以不能用上式   获取两列则用print(df[['Jan 2019',Ratings]]),得到的类型为DataFrame
    # 0    1
    # 1    2
    # 2    3
    # 3    4
    # 4    5
    # Name: Jan 2019, dtype: int64
    print(type(df.Ratings),type(df['Jan 2019']))#<class 'pandas.core.series.Series'> <class 'pandas.core.series.Series'>
    

      

    # 提取旧的DataFrame某些列生成新的DataFrame
    df_new=DataFrame(df,columns=['Programming Language','Jan 2019'])
    print(df_new)
    #   Programming Language  Jan 2019
    # 0                 Java         1
    # 1                    C         2
    # 2               Python         3
    # 3                  C++         4
    # 4    Visual Basic .NET         5
    

      

    #提取旧的DataFrame某些列生成新的DataFrame,但新的DataFrame中有的列在旧的没有,会生成新的列
    df_new2=DataFrame(df,columns=['new lie','Jan 2019'])
    print(df_new2)
    #    new lie  Jan 2019
    # 0      NaN         1
    # 1      NaN         2
    # 2      NaN         3
    # 3      NaN         4
    # 4      NaN         5
    
    #可以给new lie赋值
    df_new2['new lie']=range(5,10)
    df_new2['new lie']=np.arange(5,10)#也可以通过numpy赋值
    df_new2['new lie']=pd.Series(np.arange(5,10))#也可以通过Series赋值
    print(df_new2)
    #    new lie  Jan 2019
    # 0        5         1
    # 1        6         2
    # 2        7         3
    # 3        8         4
    # 4        9         5
    
    df_new2['new lie']=pd.Series([200,200],index=[2,3])#指定某一列某一两个元素值的更改
    print(df_new2)
    #    new lie  Jan 2019
    # 0      NaN         1
    # 1      NaN         2
    # 2    200.0         3
    # 3    200.0         4
    # 4      NaN         5

    Series与DataFrame:

    import numpy as np
    import pandas as pd
    from pandas import Series,DataFrame
    data={'country':['Belgium','India','Brazil'],
          'Capital':['Brussels','New Delhi','Brasillia'],
          'Population':[11190846,1303171035,207847528]}
    
    #Seiries
    s1=pd.Series(data['country'])
    # 0    Belgium
    # 1      India
    # 2     Brazil
    # dtype: object
    s1.values#array(['Belgium', 'India', 'Brazil'], dtype=object)
    s1.index#RangeIndex(start=0, stop=3, step=1)
    
    #DataFrame
    df1=pd.DataFrame(data)#通过字典创建DataFrame
    # 	country	Capital	Population
    # 0	Belgium	Brussels	11190846
    # 1	India	New Delhi	1303171035
    # 2	Brazil	Brasillia	207847528
    df1['country']#访问某一列
    df1.country##访问某一列的另一种方式,效果同上
    # 0    Belgium
    # 1      India
    # 2     Brazil
    # Name: country, dtype: object
    type(df1['country'])#pandas.core.series.Series
    
    #访问DataFrame的行
    df1.iterrows()#<generator object DataFrame.iterrows at 0x0000000004E8F888>
    for row in  df1.iterrows():
        print(row)
        print('类型:',type(row))
        print('长度:',len(row),'
    ')
    '''
    (0, country        Belgium
    Capital       Brussels
    Population    11190846
    Name: 0, dtype: object)
    类型: <class 'tuple'>
    长度: 2 
    
    (1, country            India
    Capital        New Delhi
    Population    1303171035
    Name: 1, dtype: object)
    类型: <class 'tuple'>
    长度: 2 
    
    (2, country          Brazil
    Capital       Brasillia
    Population    207847528
    Name: 2, dtype: object)
    类型: <class 'tuple'>
    长度: 2 
    '''
    for row in  df1.iterrows():
        print('第一个:',row[0])
        print('第二个:', row[1],'
    ')
        print('类型:',type(row[0]),type(row[1]))
        break
    '''
    第一个: 0
    第二个: country        Belgium
    Capital       Brussels
    Population    11190846
    Name: 0, dtype: object 
    
    类型: <class 'int'> <class 'pandas.core.series.Series'>
    '''
    
    #通过Series创建DataFrame
    s1=pd.Series(data['Capital'])
    s2=pd.Series(data['country'])
    s3=pd.Series(data['Population'])
    df_new1=pd.DataFrame([s1,s2,s3])
    print(df_new1)
    '''
              0           1          2
    0  Brussels   New Delhi  Brasillia
    1   Belgium       India     Brazil
    2  11190846  1303171035  207847528
    '''
    print(df_new1.T)# 转置
    '''
               0        1           2
    0   Brussels  Belgium    11190846
    1  New Delhi    India  1303171035
    2  Brasillia   Brazil   207847528
    '''
    df_new2=pd.DataFrame([s1,s2,s3],index=['Capital','country','Population']).T
    print(df_new2)
    '''
         Capital  country  Population
    0   Brussels  Belgium    11190846
    1  New Delhi    India  1303171035
    2  Brasillia   Brazil   207847528
    '''

     pandas中的DateFrame的IO操作:

    import numpy as np
    import pandas as pd
    from pandas import Series,DataFrame
    import webbrowser
    link='http://pandas.pydata.org/pandas-docs/version/0.20/io.html'
    webbrowser.open(link)#打开该网站
    #复制网站中一下内容内容
    '''
    Format Type Data Description    Reader  Writer
    text    CSV read_csv    to_csv
    text    JSON    read_json   to_json
    text    HTML    read_html   to_html
    text    Local clipboard read_clipboard  to_clipboard
    binary  MS Excel    read_excel  to_excel
    binary  HDF5 Format read_hdf    to_hdf
    binary  Feather Format  read_feather    to_feather
    binary  Msgpack read_msgpack    to_msgpack
    binary  Stata   read_stata  to_stata
    binary  SAS read_sas    
    binary  Python Pickle Format    read_pickle to_pickle
    SQL SQL read_sql    to_sql
    SQL Google Big Query    read_gbq    to_gbq
    '''
    df1=pd.read_clipboard()
    print(df1)
    '''
       Format Type      Data Description          Reader        Writer
    0         text                   CSV        read_csv        to_csv
    1         text                  JSON       read_json       to_json
    2         text                  HTML       read_html       to_html
    3         text       Local clipboard  read_clipboard  to_clipboard
    4       binary              MS Excel      read_excel      to_excel
    5       binary           HDF5 Format        read_hdf        to_hdf
    6       binary        Feather Format    read_feather    to_feather
    7       binary               Msgpack    read_msgpack    to_msgpack
    8       binary                 Stata      read_stata      to_stata
    9       binary                   SAS        read_sas              
    10      binary  Python Pickle Format     read_pickle     to_pickle
    11         SQL                   SQL        read_sql        to_sql
    12         SQL      Google Big Query        read_gbq        to_gbq
    '''
    df1.to_clipboard()#将df1的内容复制到粘贴板
    df1.to_csv('df1.csv')#将df1的内容输出到df1.csv文件中,包括index
    df1.to_csv('df11.csv',index=False)#将df1的内容输出到df2.csv文件中,但不包括index
    df2=pd.read_csv('df11.csv')#读取csv文件
    print(df2)
    '''
       Format Type      Data Description          Reader        Writer
    0         text                   CSV        read_csv        to_csv
    1         text                  JSON       read_json       to_json
    2         text                  HTML       read_html       to_html
    3         text       Local clipboard  read_clipboard  to_clipboard
    4       binary              MS Excel      read_excel      to_excel
    5       binary           HDF5 Format        read_hdf        to_hdf
    6       binary        Feather Format    read_feather    to_feather
    7       binary               Msgpack    read_msgpack    to_msgpack
    8       binary                 Stata      read_stata      to_stata
    9       binary                   SAS        read_sas              
    10      binary  Python Pickle Format     read_pickle     to_pickle
    11         SQL                   SQL        read_sql        to_sql
    12         SQL      Google Big Query        read_gbq        to_gbq
    '''
    df3=df1.to_json()#输出为json格式
    print(df3)
    '''
    {"Format Type":{"0":"text","1":"text","2":"text","3":"text","4":"binary","5":"binary","6":"binary","7":"binary","8":"binary","9":"binary","10":"binary","11":"SQL","12":"SQL"},
    "Data Description":{"0":"CSV","1":"JSON","2":"HTML","3":"Local clipboard","4":"MS Excel","5":"HDF5 Format","6":"Feather Format","7":"Msgpack","8":"Stata","9":"SAS","10":"Python Pickle Format","11":"SQL","12":"Google Big Query"},
    "Reader":{"0":"read_csv","1":"read_json","2":"read_html","3":"read_clipboard","4":"read_excel","5":"read_hdf","6":"read_feather","7":"read_msgpack","8":"read_stata","9":"read_sas","10":"read_pickle","11":"read_sql","12":"read_gbq"},
    "Writer":{"0":"to_csv","1":"to_json","2":"to_html","3":"to_clipboard","4":"to_excel","5":"to_hdf","6":"to_feather","7":"to_msgpack","8":"to_stata","9":" ","10":"to_pickle","11":"to_sql","12":"to_gbq"}}
    '''
    print(pd.read_json(df3))#读json格式
    '''
       Format Type      Data Description          Reader        Writer
    0         text                   CSV        read_csv        to_csv
    1         text                  JSON       read_json       to_json
    10      binary  Python Pickle Format     read_pickle     to_pickle
    11         SQL                   SQL        read_sql        to_sql
    12         SQL      Google Big Query        read_gbq        to_gbq
    2         text                  HTML       read_html       to_html
    3         text       Local clipboard  read_clipboard  to_clipboard
    4       binary              MS Excel      read_excel      to_excel
    5       binary           HDF5 Format        read_hdf        to_hdf
    6       binary        Feather Format    read_feather    to_feather
    7       binary               Msgpack    read_msgpack    to_msgpack
    8       binary                 Stata      read_stata      to_stata
    9       binary                   SAS        read_sas              
    '''
    df1.to_json('df1.json')#生成json文件
    print(pd.read_json('df1.json'))#读取json文件
    '''
       Format Type      Data Description          Reader        Writer
    0         text                   CSV        read_csv        to_csv
    1         text                  JSON       read_json       to_json
    10      binary  Python Pickle Format     read_pickle     to_pickle
    11         SQL                   SQL        read_sql        to_sql
    12         SQL      Google Big Query        read_gbq        to_gbq
    2         text                  HTML       read_html       to_html
    3         text       Local clipboard  read_clipboard  to_clipboard
    4       binary              MS Excel      read_excel      to_excel
    5       binary           HDF5 Format        read_hdf        to_hdf
    6       binary        Feather Format    read_feather    to_feather
    7       binary               Msgpack    read_msgpack    to_msgpack
    8       binary                 Stata      read_stata      to_stata
    9       binary                   SAS        read_sas              
    '''
    df1.to_html('df1.html')#生成html文件
    print(pd.read_html('df1.html'))#读取html文件
    '''
    [    Unnamed: 0 Format Type      Data Description          Reader        Writer
    0            0        text                   CSV        read_csv        to_csv
    1            1        text                  JSON       read_json       to_json
    2            2        text                  HTML       read_html       to_html
    3            3        text       Local clipboard  read_clipboard  to_clipboard
    4            4      binary              MS Excel      read_excel      to_excel
    5            5      binary           HDF5 Format        read_hdf        to_hdf
    6            6      binary        Feather Format    read_feather    to_feather
    7            7      binary               Msgpack    read_msgpack    to_msgpack
    8            8      binary                 Stata      read_stata      to_stata
    9            9      binary                   SAS        read_sas           NaN
    10          10      binary  Python Pickle Format     read_pickle     to_pickle
    11          11         SQL                   SQL        read_sql        to_sql
    12          12         SQL      Google Big Query        read_gbq        to_gbq]
    '''
    df1.to_excel('df1.xlsx')#生成excell文件
    print(pd.read_excel('df1.xlsx'))#读取excell文件
    '''
       Format Type      Data Description          Reader        Writer
    0         text                   CSV        read_csv        to_csv
    1         text                  JSON       read_json       to_json
    2         text                  HTML       read_html       to_html
    3         text       Local clipboard  read_clipboard  to_clipboard
    4       binary              MS Excel      read_excel      to_excel
    5       binary           HDF5 Format        read_hdf        to_hdf
    6       binary        Feather Format    read_feather    to_feather
    7       binary               Msgpack    read_msgpack    to_msgpack
    8       binary                 Stata      read_stata      to_stata
    9       binary                   SAS        read_sas              
    10      binary  Python Pickle Format     read_pickle     to_pickle
    11         SQL                   SQL        read_sql        to_sql
    12         SQL      Google Big Query        read_gbq        to_gbq
    '''

    Series和DataFrame的indexing

    import numpy as np
    import pandas as pd
    imdb=pd.read_csv(r'C:UsersAdministratorDesktoppy_workcodespresidential_polls.csv')
    print(imdb.shape)#查看行列数   (10236, 27)
    print(imdb.head())#默认打印前五行
    '''
       cycle     branch        type                        matchup forecastdate  
    0   2016  President  polls-plus  Clinton vs. Trump vs. Johnson      11/1/16   
    1   2016  President  polls-plus  Clinton vs. Trump vs. Johnson      11/1/16   
    2   2016  President  polls-plus  Clinton vs. Trump vs. Johnson      11/1/16   
    3   2016  President  polls-plus  Clinton vs. Trump vs. Johnson      11/1/16   
    4   2016  President  polls-plus  Clinton vs. Trump vs. Johnson      11/1/16   
    
          state   startdate     enddate                  pollster grade  
    0      U.S.  10/25/2016  10/31/2016   Google Consumer Surveys     B   
    1      U.S.  10/27/2016  10/30/2016  ABC News/Washington Post    A+   
    2  Virginia  10/27/2016  10/30/2016  ABC News/Washington Post    A+   
    3   Florida  10/20/2016  10/24/2016                 SurveyUSA     A   
    4      U.S.  10/20/2016  10/25/2016       Pew Research Center    B+   
    
               ...           adjpoll_clinton adjpoll_trump  adjpoll_johnson  
    0          ...                  42.64140      40.86509         5.675099   
    1          ...                  43.29659      44.72984         3.401513   
    2          ...                  46.29779      40.72604         6.401513   
    3          ...                  46.35931      45.30585         1.777730   
    4          ...                  45.32744      42.20888         3.618320   
    
       adjpoll_mcmullin  multiversions  
    0               NaN            NaN   
    1               NaN            NaN   
    2               NaN            NaN   
    3               NaN            NaN   
    4               NaN            NaN   
    
                                                     url  poll_id  question_id  
    0  https://datastudio.google.com/u/0/#/org//repor...    47940        74999   
    1  http://www.langerresearch.com/wp-content/uploa...    47881        74936   
    2  https://www.washingtonpost.com/local/virginia-...    47880        74934   
    3  http://www.baynews9.com/content/news/baynews9/...    47465        74252   
    4  http://www.people-press.org/2016/10/27/as-elec...    47616        74519   
    
       createddate             timestamp  
    0      11/1/16  15:09:38  1 Nov 2016  
    1      11/1/16  15:09:38  1 Nov 2016  
    2      11/1/16  15:09:38  1 Nov 2016  
    3     10/25/16  15:09:38  1 Nov 2016  
    4     10/27/16  15:09:38  1 Nov 2016  
    
    [5 rows x 27 columns]
    '''
    print(imdb.tail())#默认打印后5行,与head用法相同
    '''
           cycle     branch        type                        matchup  
    10231   2016  President  polls-only  Clinton vs. Trump vs. Johnson   
    10232   2016  President  polls-only  Clinton vs. Trump vs. Johnson   
    10233   2016  President  polls-only  Clinton vs. Trump vs. Johnson   
    10234   2016  President  polls-only  Clinton vs. Trump vs. Johnson   
    10235   2016  President  polls-only  Clinton vs. Trump vs. Johnson   
    
          forecastdate           state  startdate     enddate  
    10231      11/1/16         Alabama  9/30/2016  10/13/2016   
    10232      11/1/16        Virginia  9/30/2016   10/6/2016   
    10233      11/1/16        Virginia  9/16/2016   9/22/2016   
    10234      11/1/16  North Carolina  6/20/2016   6/21/2016   
    10235      11/1/16            Utah  7/29/2016   8/18/2016   
    
                        pollster grade          ...           adjpoll_clinton  
    10231                  Ipsos    A-          ...                  37.30964   
    10232                  Ipsos    A-          ...                  49.13094   
    10233                  Ipsos    A-          ...                  45.97130   
    10234  Public Policy Polling    B+          ...                  45.29390   
    10235                  Ipsos    A-          ...                  31.62721   
    
          adjpoll_trump  adjpoll_johnson  adjpoll_mcmullin  multiversions  
    10231      54.76821              NaN               NaN            NaN   
    10232      39.41588              NaN               NaN            NaN   
    10233      39.97518              NaN               NaN            NaN   
    10234      46.66175         1.596946               NaN            NaN   
    10235      44.65947              NaN               NaN            NaN   
    
                                                         url  poll_id  
    10231              http://reuters.com/statesofthenation/    46817   
    10232          http://www.reuters.com/statesofthenation/    46675   
    10233          http://www.reuters.com/statesofthenation/    46096   
    10234  http://www.publicpolicypolling.com/pdf/2015/PP...    44400   
    10235           http://www.reuters.com/statesofthenation    44978   
    
           question_id  createddate             timestamp  
    10231        73263     10/15/16  14:57:58  1 Nov 2016  
    10232        72969     10/10/16  14:57:58  1 Nov 2016  
    10233        72088      9/26/16  14:57:58  1 Nov 2016  
    10234        67363      6/23/16  14:57:58  1 Nov 2016  
    10235        69011      8/24/16  14:57:58  1 Nov 2016  
    
    [5 rows x 27 columns]
    '''

      

    print(imdb.iloc[10:13,0:5])#查看第10到12行,0到4列(iloc通过index搜索的,基于位置信息,类似切片,不包含末尾位置)
    '''
        cycle     branch        type                        matchup forecastdate
    10   2016  President  polls-plus  Clinton vs. Trump vs. Johnson      11/1/16
    11   2016  President  polls-plus  Clinton vs. Trump vs. Johnson      11/1/16
    12   2016  President  polls-plus  Clinton vs. Trump vs. Johnson      11/1/16
    '''
    df=imdb.iloc[10:13,0:5]
    print(df.iloc[1:3,1:3])
    '''
           branch        type
    11  President  polls-plus
    12  President  polls-plus
    '''
    print(df.loc[10:12,:'type'])#loc是通过lable查询的,基于lable信息查询,包含末尾位置
    '''
        cycle     branch        type
    10   2016  President  polls-plus
    11   2016  President  polls-plus
    12   2016  President  polls-plus
    '''
    print(imdb['adjpoll_clinton'].head())#查看某列
    '''
    0    42.64140
    1    43.29659
    2    46.29779
    3    46.35931
    4    45.32744
    Name: adjpoll_clinton, dtype: float64
    '''
    print(imdb['adjpoll_clinton'][10])#查看某个元素  44.53217
    print(imdb[['adjpoll_trump','adjpoll_johnson']])#通过某(些)列生成新的DataFrame
    '''
           adjpoll_trump  adjpoll_johnson
    0           40.86509         5.675099
    1           44.72984         3.401513
    2           40.72604         6.401513
    3           45.30585         1.777730
    4           42.20888         3.618320
    5           42.26663         6.114222
    6           43.56017         3.153590
    7           43.50333         3.466432
    8           37.24948         6.420006
    9           41.69540         4.220173
    10          43.84845              NaN
    11          47.92262         2.676897
    12          29.50605         3.170510
    13          40.34972         5.823322
    14          42.01937         6.499082
    15          45.07725         3.499082
    16          39.33826         5.044833
    17          46.11255         3.054228
    18          39.80679         6.359501
    19          41.34735         4.421316
    20          39.99571         6.272840
    21          50.75720              NaN
    22          38.87231         8.359501
    23          41.55637         4.964521
    24          43.84806         5.359501
    25          45.03370         2.193952
    26          44.78595         4.359501
    27          44.18040         5.160502
    28          40.41809         3.333669
    29          49.47709         4.308866
    ...              ...              ...
    10206       36.75014         9.152230
    10207       40.08237              NaN
    10208       43.67710              NaN
    10209       43.40106              NaN
    10210       35.52956              NaN
    10211       35.03328              NaN
    10212       44.77681              NaN
    10213       38.24798              NaN
    10214       41.25978              NaN
    10215       41.59738              NaN
    10216       41.64499         1.974752
    10217       36.15054              NaN
    10218       38.65057              NaN
    10219       29.49314         9.007062
    10220       37.87221              NaN
    10221       39.42957              NaN
    10222       53.95455              NaN
    10223       33.07150         3.328916
    10224       41.88533         1.974752
    10225       36.82408         9.741756
    10226       47.80848              NaN
    10227       42.01089         3.671217
    10228       45.06726              NaN
    10229       40.16534        12.889780
    10230       41.56030         2.872088
    10231       54.76821              NaN
    10232       39.41588              NaN
    10233       39.97518              NaN
    10234       46.66175         1.596946
    10235       44.65947              NaN
    
    [10236 rows x 2 columns]
    '''  

    Series和DataFrame的Reindexing

    import numpy as np
    import pandas as pd
    s1=pd.Series([1,2,3,4],index=['A','B','C','D'])
    print(s1)
    A    1
    B    2
    C    3
    D    4
    dtype: int64
    View Code
    print(s1.reindex(['A','C','E']))
    A    1.0
    C    3.0
    E    NaN
    dtype: float64
    View Code
    print(s1.reindex(['A','C','m'],fill_value=11))#通过fille_value填充数值
    A     1
    C     3
    m    11
    dtype: int64
    View Code
    s2=pd.Series(['a','b','c'],index=[1,5,10])
    print(s2)
    1     a
    5     b
    10    c
    dtype: object
    View Code
    print(s2.reindex(index=range(15)))
    0     NaN
    1       a
    2     NaN
    3     NaN
    4     NaN
    5       b
    6     NaN
    7     NaN
    8     NaN
    9     NaN
    10      c
    11    NaN
    12    NaN
    13    NaN
    14    NaN
    dtype: object
    View Code
    print(s2.reindex(index=range(15),method='ffill'))#自动填充,第0个是NaN,第1到4用a填充(<=4),第5到9用b填充(大于等于5小于10),大于等于10用c填充
    0     NaN
    1       a
    2       a
    3       a
    4       a
    5       b
    6       b
    7       b
    8       b
    9       b
    10      c
    11      c
    12      c
    13      c
    14      c
    dtype: object
    View Code
    print(s2)
    1     a
    5     b
    10    c
    dtype: object
    View Code
    df1=pd.DataFrame(np.random.rand(25).reshape([5,5]))
    print(df1)
              0         1         2         3         4
    0  0.499115  0.244375  0.849224  0.348352  0.472657
    1  0.676503  0.769790  0.479774  0.468003  0.703029
    2  0.153982  0.699009  0.379184  0.151905  0.921860
    3  0.904037  0.196925  0.421180  0.384442  0.642122
    4  0.641124  0.748790  0.824351  0.101550  0.412564
    View Code
    df2=pd.DataFrame(np.random.rand(25).reshape([5,5]),index=['A','B','D','E','F'],columns=['c1','c2','c3','c4','c5'])
    print(df2)
             c1        c2        c3        c4        c5
    A  0.279563  0.267224  0.077868  0.080046  0.528182
    B  0.660053  0.088954  0.512298  0.259552  0.108562
    D  0.734865  0.776419  0.581695  0.578712  0.157753
    E  0.926365  0.729410  0.328161  0.531319  0.550878
    F  0.849754  0.770988  0.537104  0.833631  0.062303
    View Code
    print(df2.reindex(index=['A','B','C','D','E','F']))
             c1        c2        c3        c4        c5
    A  0.279563  0.267224  0.077868  0.080046  0.528182
    B  0.660053  0.088954  0.512298  0.259552  0.108562
    C       NaN       NaN       NaN       NaN       NaN
    D  0.734865  0.776419  0.581695  0.578712  0.157753
    E  0.926365  0.729410  0.328161  0.531319  0.550878
    F  0.849754  0.770988  0.537104  0.833631  0.062303
    View Code
    print(df2.reindex(columns=['c1','c2','c3','c4','c5','c6']))
             c1        c2        c3        c4        c5  c6
    A  0.279563  0.267224  0.077868  0.080046  0.528182 NaN
    B  0.660053  0.088954  0.512298  0.259552  0.108562 NaN
    D  0.734865  0.776419  0.581695  0.578712  0.157753 NaN
    E  0.926365  0.729410  0.328161  0.531319  0.550878 NaN
    F  0.849754  0.770988  0.537104  0.833631  0.062303 NaN
    View Code
    print(df2.reindex(index=['A','B','C','D','E','F'],columns=['c1','c2','c3','c4','c5','c6']))
             c1        c2        c3        c4        c5  c6
    A  0.279563  0.267224  0.077868  0.080046  0.528182 NaN
    B  0.660053  0.088954  0.512298  0.259552  0.108562 NaN
    C       NaN       NaN       NaN       NaN       NaN NaN
    D  0.734865  0.776419  0.581695  0.578712  0.157753 NaN
    E  0.926365  0.729410  0.328161  0.531319  0.550878 NaN
    F  0.849754  0.770988  0.537104  0.833631  0.062303 NaN
    View Code
    s1=pd.Series([1,2,3,4],index=['A','B','C','D'])
    print(s1)
    A    1
    B    2
    C    3
    D    4
    dtype: int64
    View Code
    print(s1.reindex(['A','C']))#也可写成print(s1.reindex(index=['A','C']))
    A    1
    C    3
    dtype: int64
    View Code
    print(df2.reindex(index=['A','C']))
             c1        c2        c3        c4        c5
    A  0.279563  0.267224  0.077868  0.080046  0.528182
    C       NaN       NaN       NaN       NaN       NaN
    View Code
    print(s1.drop(['B','C']))
    A    1
    D    4
    dtype: int64
    View Code
    print(s1.drop('A'))
    B    2
    C    3
    D    4
    dtype: int64
    View Code
    print(df2.drop(['A'],axis=0))
             c1        c2        c3        c4        c5
    B  0.660053  0.088954  0.512298  0.259552  0.108562
    D  0.734865  0.776419  0.581695  0.578712  0.157753
    E  0.926365  0.729410  0.328161  0.531319  0.550878
    F  0.849754  0.770988  0.537104  0.833631  0.062303
    View Code
    print(df2.drop(['c1'],axis=1))
             c2        c3        c4        c5
    A  0.267224  0.077868  0.080046  0.528182
    B  0.088954  0.512298  0.259552  0.108562
    D  0.776419  0.581695  0.578712  0.157753
    E  0.729410  0.328161  0.531319  0.550878
    F  0.770988  0.537104  0.833631  0.062303
    View Code

    谈一谈NaN-means Not a Number

    n=np.nan
    print(type(n))#<class 'float'>
    print(1+n)#结果:nan   任何一个numuber与nan做运算结果永远都是not a nunmber

      

    s1=pd.Series([1,2,np.nan,3,4],index=['A','B','C','D','E'])
    print(s1)
    A    1.0
    B    2.0
    C    NaN
    D    3.0
    E    4.0
    dtype: float64
    View Code
    print(s1.isnull())
    A    False
    B    False
    C     True
    D    False
    E    False
    dtype: bool
    View Cod
    print(s1.notnull())
    A     True
    B     True
    C    False
    D     True
    E     True
    dtype: bool
    View Code
    print(s1.dropna())#drop掉value为nan的
    A    1.0
    B    2.0
    D    3.0
    E    4.0
    dtype: float64
    View Code

    NaN in DataFrame

    dframe=pd.DataFrame([[1,2,3],[np.nan,5,6],[7,np.nan,9],[np.nan,np.nan,np.nan]])
    print(dframe)
         0    1    2
    0  1.0  2.0  3.0
    1  NaN  5.0  6.0
    2  7.0  NaN  9.0
    3  NaN  NaN  NaN
    View Code
    print(dframe.isnull())
           0      1      2
    0  False  False  False
    1   True  False  False
    2  False   True  False
    3   True   True   True
    View Code
    print(dframe.notnull())
           0      1      2
    0   True   True   True
    1  False   True   True
    2   True  False   True
    3  False  False  False
    View Code
    print(dframe.dropna())#默认axis=0,相当于print(dframe.dropna(axis=0))    默认how='any'
         0    1    2
    0  1.0  2.0  3.0
    View Code
    print(dframe.dropna(how='any'))#any指的是凡是含有nan的都会drop掉
         0    1    2
    0  1.0  2.0  3.0
    View Code
    print(dframe.dropna(how='all'))#all指的是所有都是all的都会drop掉
         0    1    2
    0  1.0  2.0  3.0
    1  NaN  5.0  6.0
    2  7.0  NaN  9.0
    View Code
    print(dframe.dropna(axis=1))#只剩下index了
    Empty DataFrame
    Columns: []
    Index: [0, 1, 2, 3]
    View Code
    dframe2=pd.DataFrame([[1,2,3,np.nan],[2,np.nan,5,6],[np.nan,7,np.nan,9],[1,np.nan,np.nan,np.nan]])
    print(dframe2)
         0    1    2    3
    0  1.0  2.0  3.0  NaN
    1  2.0  NaN  5.0  6.0
    2  NaN  7.0  NaN  9.0
    3  1.0  NaN  NaN  NaN
    View Code
    df2=dframe2.dropna()#默认thresh=None,相当于df2=dframe2.dropna(thresh=None)
    print(df2)
    Empty DataFrame
    Columns: [0, 1, 2, 3]
    Index: []
    View Code
    df3=dframe2.dropna(thresh=2)#只要一行中NaN个数大于2,就删除该行
    print(df3)
         0    1    2    3
    0  1.0  2.0  3.0  NaN
    1  2.0  NaN  5.0  6.0
    2  NaN  7.0  NaN  9.0
    View Code
    print(dframe2.fillna(value=10))#将NaN填充为10
          0     1     2     3
    0   1.0   2.0   3.0  10.0
    1   2.0  10.0   5.0   6.0
    2  10.0   7.0  10.0   9.0
    3   1.0  10.0  10.0  10.0
    View Code
    print(dframe2.fillna(value={0:'A',1:'16',2:'中国',3:'k'}))#将每列各自的NaN赋值,即:第0列用A填充,第1列用16填充。。。。。。
    #注意:fillna和dropna不会改变原本的Series和DataFrame
       0   1   2  3
    0  1   2   3  k
    1  2  16   5  6
    2  A   7  中国  9
    3  1  16  中国  k
    View Code

    多级index

    s1=pd.Series(np.random.rand(6),index=[['1','1','1','2','2','2'],['a','b','c','a','b','c']])
    print(s1)
    1  a    0.973831
       b    0.762415
       c    0.135763
    2  a    0.974687
       b    0.471638
       c    0.573157
    dtype: float64
    View Code
    print(type(s1))#<class 'pandas.core.series.Series'>
    print(s1['1'])
    a    0.973831
    b    0.762415
    c    0.135763
    dtype: float64
    View Code
    print(type(s1['1']))#<class 'pandas.core.series.Series'>
    print(s1['1']['a'])#0.9738309965219155
    print(s1[:,'a'])
    1    0.973831
    2    0.974687
    dtype: float64
    View Code
    #二级的series转换成dataframe(两种方法)
    df1=s1.unstack()
    print(df1)
    
    df2=pd.DataFrame([s1['1'],s1['2']])
    print(df2)
              a         b         c
    1  0.973831  0.762415  0.135763
    2  0.974687  0.471638  0.573157
              a         b         c
    0  0.973831  0.762415  0.135763
    1  0.974687  0.471638  0.573157
    View Code
    #dataframe转换成二级series
    s2=df1.unstack()
    print(s2)
    a  1    0.973831
       2    0.974687
    b  1    0.762415
       2    0.471638
    c  1    0.135763
       2    0.573157
    dtype: float64
    View Code
    print(df1.T.unstack())
    1  a    0.973831
       b    0.762415
       c    0.135763
    2  a    0.974687
       b    0.471638
       c    0.573157
    dtype: float64
    View Code
    df=pd.DataFrame(np.arange(16).reshape(4,4),index=[['a','a','b','b'],[1,2,1,2]],columns=[['BJ','BJ','上海','广州'],[111,222,111,222]])
    print(df)
         BJ      上海  广州
        111 222 111 222
    a 1   0   1   2   3
      2   4   5   6   7
    b 1   8   9  10  11
      2  12  13  14  15
    View Code
    print(df['BJ'])
         111  222
    a 1    0    1
      2    4    5
    b 1    8    9
      2   12   13
    View Code
    print(type(df['BJ']))#<class 'pandas.core.frame.DataFrame'>
    print(df['BJ',111])#print(df['BJ'][111])效果相同
    a  1     0
       2     4
    b  1     8
       2    12
    Name: (BJ, 111), dtype: int32
    View Code

    Mapping

    df1=pd.DataFrame({'城市':['北京','上海','广州'],'人口':[1000,2000,1500]})
    print(df1)
       城市    人口
    0  北京  1000
    1  上海  2000
    2  广州  1500
    View Code
    #增加一列
    df1['GDP']=pd.Series([9999,8888,7777])#方法一
    print(df1)
       城市    人口   GDP
    0  北京  1000  9999
    1  上海  2000  8888
    2  广州  1500  7777
    View Code
    salary={'北京':10,'上海':20,'广州':30}#方法二,尽量用此方法,原因看df2
    df1['工资']=df1['城市'].map(salary)
    print(df1)
       城市    人口   GDP  工资
    0  北京  1000  9999  10
    1  上海  2000  8888  20
    2  广州  1500  7777  30
    View Code
    df2=pd.DataFrame({'城市':['北京','上海','广州'],'人口':[1000,2000,1500]},index=['A','B','C'])
    print(df2)
       城市    人口
    A  北京  1000
    B  上海  2000
    C  广州  1500
    View Code
    df2['GDP']=pd.Series([9999,8888,7777])
    print(df2)
       城市    人口  GDP
    A  北京  1000  NaN
    B  上海  2000  NaN
    C  广州  1500  NaN
    View Code
    df2['GGDDPP']=pd.Series([9999,8888,7777],index=['A','B','C'])
    print(df2)
       城市    人口  GDP  GGDDPP
    A  北京  1000  NaN    9999
    B  上海  2000  NaN    8888
    C  广州  1500  NaN    7777
    View Code

    Replace

    s1=pd.Series(np.arange(100,105))
    print(s1)
    0    100
    1    101
    2    102
    3    103
    4    104
    dtype: int32
    View Code
    print(s1.replace(101,np.nan))
    0    100.0
    1      NaN
    2    102.0
    3    103.0
    4    104.0
    dtype: float64
    View Code
    print(s1.replace({101:np.nan}))
    0    100.0
    1      NaN
    2    102.0
    3    103.0
    4    104.0
    dtype: float64
    View Code
    print(s1.replace([100,103,104],['中','eng','s']))
    0      中
    1    101
    2    102
    3    eng
    4      s
    dtype: object
    View Code
    print(s1)#s1并没有发生变化
    0    100
    1    101
    2    102
    3    103
    4    104
    dtype: int32
    View Code

     

      

  • 相关阅读:
    qq
    构造方法
    Java模块化开发
    q
    qqq
    qq
    qqq
    Git服务器搭建及SSH无密码登录设置
    php面向对象中的魔术方法中文说明
    计算机中丢失 msvcr110.dll 怎么办
  • 原文地址:https://www.cnblogs.com/tianqizhi/p/8723567.html
Copyright © 2011-2022 走看看