zoukankan      html  css  js  c++  java
  • Pandas 合并、连接、去重、替换

    1.merge合并 → 类似excel的vlookup

    # merge: join two DataFrames on key column(s), similar to Excel's VLOOKUP

    df1 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                        'A': ['A0', 'A1', 'A2', 'A3'],
                        'B': ['B0', 'B1', 'B2', 'B3']})
    df2 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                        'C': ['C0', 'C1', 'C2', 'C3'],
                        'D': ['D0', 'D1', 'D2', 'D3']})
    df3 = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                        'key2': ['K0', 'K1', 'K0', 'K1'],
                        'A': ['A0', 'A1', 'A2', 'A3'],
                        'B': ['B0', 'B1', 'B2', 'B3']})
    df4 = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                        'key2': ['K0', 'K0', 'K0', 'K0'],
                        'C': ['C0', 'C1', 'C2', 'C3'],
                        'D': ['D0', 'D1', 'D2', 'D3']})

    # The '\n' arguments only add blank lines between the printed frames.
    print(df1, '\n', df2, '\n')
    print(pd.merge(df1, df2, on='key'))  # join on the 'key' column
    print('------', '\n')
    # left:  the first DataFrame
    # right: the second DataFrame
    # on:    the column name(s) used as the join key

    print(df3, '\n', df4, '\n')
    print(pd.merge(df3, df4, on=['key1', 'key2']))
    # Several join keys: rows are paired only when key1 AND key2 both match.

    输出结果:

        A   B key
    0  A0  B0  K0
    1  A1  B1  K1
    2  A2  B2  K2
    3  A3  B3  K3 
         C   D key
    0  C0  D0  K0
    1  C1  D1  K1
    2  C2  D2  K2
    3  C3  D3  K3 
    
        A   B key   C   D
    0  A0  B0  K0  C0  D0
    1  A1  B1  K1  C1  D1
    2  A2  B2  K2  C2  D2
    3  A3  B3  K3  C3  D3
    ------ 
    
        A   B key1 key2
    0  A0  B0   K0   K0
    1  A1  B1   K0   K1
    2  A2  B2   K1   K0
    3  A3  B3   K2   K1 
         C   D key1 key2
    0  C0  D0   K0   K0
    1  C1  D1   K1   K0
    2  C2  D2   K1   K0
    3  C3  D3   K2   K0 
    
        A   B key1 key2   C   D
    0  A0  B0   K0   K0  C0  D0
    1  A2  B2   K1   K0  C1  D1
    2  A2  B2   K1   K0  C2  D2

    2. 参数how → 合并方式

    # The `how` argument chooses the kind of join performed by merge.
    #   inner: the default; keep only key pairs present in both frames
    #   outer: keep key pairs from either frame; missing cells become NaN
    #   left:  keep every row of df3; missing cells become NaN
    #   right: keep every row of df4; missing cells become NaN

    join_keys = ['key1', 'key2']
    for join_type in ('inner', 'outer', 'left'):
        print(pd.merge(df3, df4, on=join_keys, how=join_type))
        print('------')
    # The last result is printed without a trailing separator line.
    print(pd.merge(df3, df4, on=join_keys, how='right'))

    输出结果:

        A   B key1 key2   C   D
    0  A0  B0   K0   K0  C0  D0
    1  A2  B2   K1   K0  C1  D1
    2  A2  B2   K1   K0  C2  D2
    ------
         A    B key1 key2    C    D
    0   A0   B0   K0   K0   C0   D0
    1   A1   B1   K0   K1  NaN  NaN
    2   A2   B2   K1   K0   C1   D1
    3   A2   B2   K1   K0   C2   D2
    4   A3   B3   K2   K1  NaN  NaN
    5  NaN  NaN   K2   K0   C3   D3
    ------
        A   B key1 key2    C    D
    0  A0  B0   K0   K0   C0   D0
    1  A1  B1   K0   K1  NaN  NaN
    2  A2  B2   K1   K0   C1   D1
    3  A2  B2   K1   K0   C2   D2
    4  A3  B3   K2   K1  NaN  NaN
    ------
         A    B key1 key2   C   D
    0   A0   B0   K0   K0  C0  D0
    1   A2   B2   K1   K0  C1  D1
    2   A2   B2   K1   K0  C2  D2
    3  NaN  NaN   K2   K0  C3  D3

    3.参数 left_on, right_on, left_index, right_index → 当两个表的连接键列名不同,或者键位于索引上时,可以分别设置左键与右键

    # left_on / right_on / left_index / right_index: choose the join key of
    # each side separately when the two frames do not share one key column.

    # Case 1: df1 is keyed by its 'lkey' column, df2 by its 'rkey' column.
    df1 = pd.DataFrame({'lkey': list('bbacaab'), 'data1': range(7)})
    df2 = pd.DataFrame({'rkey': list('abd'), 'date2': range(3)})
    print(df1.merge(df2, left_on='lkey', right_on='rkey'))
    print('------')

    # Case 2: df1 is keyed by its 'key' column, df2 by its index labels.
    df1 = pd.DataFrame({'key': list('abcdfeg'), 'data1': range(7)})
    df2 = pd.DataFrame({'date2': range(100, 105)}, index=list('abcde'))
    print(df1.merge(df2, left_on='key', right_index=True))
    # left_index:  when True the first frame is keyed by its index (default False)
    # right_index: when True the second frame is keyed by its index (default False)

    # The four options can therefore be combined freely:
    # left_on + right_on, left_on + right_index,
    # left_index + right_on, left_index + right_index
    # merge is a DataFrame operation; a Series has no merge method.

    输出结果:

       data1 lkey  date2 rkey
    0      0    b      1    b
    1      1    b      1    b
    2      6    b      1    b
    3      2    a      0    a
    4      4    a      0    a
    5      5    a      0    a
    ------
       data1 key  date2
    0      0   a    100
    1      1   b    101
    2      2   c    102
    3      3   d    103
    5      5   e    104

    4. 连接:concat

    (1)

    # concat: glue pandas objects together along an axis

    s1 = pd.Series([1, 2, 3])
    s2 = pd.Series([2, 3, 4])
    print(pd.concat([s1, s2]))
    print('-----')
    # Default axis=0: rows are appended to rows; each Series keeps its own
    # index labels, so labels 0, 1, 2 appear twice in the result.

    s3 = pd.Series([1, 2, 3], index=['a', 'c', 'h'])
    s4 = pd.Series([2, 3, 4], index=['b', 'e', 'd'])
    # The '\n' arguments only add a blank line after each printed object.
    print(s3, '\n')
    print(s4, '\n')
    print(pd.concat([s3, s4]), '\n')
    print(pd.concat([s3, s4]).sort_index())
    print(pd.concat([s3, s4], axis=1))
    print('-----')
    # axis=1: column + column, the result is a DataFrame; labels missing
    # from one Series show up as NaN in that column.

    输出结果:

    0    1
    1    2
    2    3
    0    2
    1    3
    2    4
    dtype: int64
    -----
    a    1
    c    2
    h    3
    dtype: int64 
    
    b    2
    e    3
    d    4
    dtype: int64 
    
    a    1
    c    2
    h    3
    b    2
    e    3
    d    4
    dtype: int64 
    
    a    1
    b    2
    c    2
    d    4
    e    3
    h    3
    dtype: int64
         0    1
    a  1.0  NaN
    b  NaN  2.0
    c  2.0  NaN
    d  NaN  4.0
    e  NaN  3.0
    h  3.0  NaN
    -----

    (2)

    # Put s3 and s4 side by side as the two columns of a single DataFrame.
    snew = pd.concat((s3, s4), axis=1)
    print(snew)
    # Alternative left disabled by the author: reset_index without drop=True
    # (per the pandas docs, the old labels are then kept as a new column).
    # snew.reset_index(inplace = True)
    # print(snew)
    # drop=True discards the old labels; inplace=True modifies snew itself.
    snew.reset_index(drop=True, inplace=True)
    # Bare expression: only displays a value in a notebook/REPL cell.
    snew

    输出结果:

    5.去重 .duplicated

    # De-duplication with .duplicated() and .drop_duplicates()

    s = pd.Series([1, 1, 1, 1, 2, 2, 2, 3, 4, 5, 5, 5, 5])
    print(s.duplicated())
    print(s[~s.duplicated()])
    print('-----')
    # duplicated() flags every repeat of an earlier value as True.
    # Negating that boolean mask with ~ keeps only the first occurrences.

    s_re = s.drop_duplicates()
    print(s_re)
    print('-----')
    # drop_duplicates() removes the repeated values.
    # inplace parameter: whether to modify the original object, default False.

    df = pd.DataFrame({'key1': ['a', 'a', 3, 4, 5],
                       'key2': ['a', 'a', 'b', 'b', 'c']})
    # The '\n' argument only adds a blank line after the printed frame.
    print(df, '\n')
    print(df.duplicated())
    print(df['key2'].duplicated())
    # On a DataFrame duplicated() compares whole rows; on a single column
    # it compares only that column's values.

    输出结果:

    0     False
    1      True
    2      True
    3      True
    4     False
    5      True
    6      True
    7     False
    8     False
    9     False
    10     True
    11     True
    12     True
    dtype: bool
    0    1
    4    2
    7    3
    8    4
    9    5
    dtype: int64
    -----
    0    1
    4    2
    7    3
    8    4
    9    5
    dtype: int64
    -----
      key1 key2
    0    a    a
    1    a    a
    2    3    b
    3    4    b
    4    5    c 
    
    0    False
    1     True
    2    False
    3    False
    4    False
    dtype: bool
    0    False
    1     True
    2    False
    3     True
    4    False
    Name: key2, dtype: bool

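    6. 替换 .replace

    # 替换 .replace(原文此处的示例代码缺失,以下代码根据下方输出结果还原)
    
    s = pd.Series(list('ascaazsd'))
    print(s, '\n')
    print(s.replace('a', np.nan))
    print(s.replace(['a', 's'], np.nan))
    print(s.replace({'a': 'hello world!', 's': 123}))
    # 可一次性替换一个值或多个值
    # 可传入列表或字典

    输出结果: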

    0    a
    1    s
    2    c
    3    a
    4    a
    5    z
    6    s
    7    d
    dtype: object 
    
    0    NaN
    1      s
    2      c
    3    NaN
    4    NaN
    5      z
    6      s
    7      d
    dtype: object
    0    NaN
    1    NaN
    2      c
    3    NaN
    4    NaN
    5      z
    6    NaN
    7      d
    dtype: object
    0    hello world!
    1             123
    2               c
    3    hello world!
    4    hello world!
    5               z
    6             123
    7               d
    dtype: object

     练习题:

    作业1:按要求创建Dataframe df1、df2,并合并成df3

    import numpy as np
    import pandas as pd
    
    # Two frames sharing the column 'key'; they overlap on 'b' and 'c' only.
    df1 = pd.DataFrame({'key': list('abc'),
                        'values1': np.random.rand(3)})
    df2 = pd.DataFrame({'key': list('bcd'),
                        'values2': np.random.rand(3)})
    # The '\n' argument only adds a blank line after each printed frame.
    print(df1, '\n')
    print(df2, '\n')
    # Outer join: keys from both frames are kept, gaps are filled with NaN.
    print(pd.merge(df1, df2, on='key', how='outer'))

    作业2:按要求创建Dataframe df1、df2,并合并成df3

    # Exercise 2: the join key has a different column name in each frame.
    df3 = pd.DataFrame({'key1': ['a', 'b', 'c'],
                        'values1': np.random.rand(3)})
    df4 = pd.DataFrame({'key2': ['b', 'c', 'd'],
                        'values2': np.random.rand(3)})
    # The '\n' argument only adds a blank line after each printed frame.
    print(df3, '\n')
    print(df4, '\n')

    # left_on / right_on name the key column of each side; the outer join
    # keeps unmatched rows from both frames.
    print(pd.merge(df3, df4, left_on='key1', right_on='key2', how='outer'))

     作业3:按要求创建Dataframe df1、df2,并合并成df3

    #练习3
    # df3 = pd.DataFrame({'key':['a','b','c'],
    #                    'values1':np.random.rand(3)})
    # df4 = pd.DataFrame({'values2':np.random.rand(3),
    #                    'values3':[5,6,7]},index=['b','c','d'])
    # print(df3,'\n')
    # print(df4,'\n')
    # print(pd.merge(df3, df4,left_on='key',right_index= True))

    作业4:按要求创建Dataframe df1、df2,并连接成df3

    #练习4
    # df3 = pd.DataFrame({'values1':np.random.rand(4),
    #                   'values2':np.random.rand(4)},index = ['a','b','c','d'])
    # df4 = pd.DataFrame({'values1':np.random.rand(4),
    #                     'values2':np.random.rand(4)},index=['e','f','g','h'])
    # print(df3,'\n')
    # print(df4,'\n')
    # print(pd.concat([df3,df4]))
  • 相关阅读:
    Nginx的Mainline version、Stable version、Legacy version的版本区别
    十个程序员必备的网站推荐
    各大OJ题目分类
    ubuntu 12 安装bcm 43142无线网卡驱动
    unp.h
    Linux优秀软件整理
    陈皓一起写Makefile 概述
    开源资源目录
    (三)鸟哥Linux读书笔记
    CSS3实现选项卡
  • 原文地址:https://www.cnblogs.com/carlber/p/9923025.html
Copyright © 2011-2022 走看看