zoukankan      html  css  js  c++  java
  • DataFrame合并数据集 pandas.merge

    from pandas import DataFrame
    import pandas as pd
    
    df1 = DataFrame({'key':['b','b','a','c','a','a','b'],
                     'data1':range(7)})
    df2 = DataFrame({'key':['a','b','d'],
                     'data2':range(3)})
    print(df1)
    '''
       data1 key
    0      0   b
    1      1   b
    2      2   a
    3      3   c
    4      4   a
    5      5   a
    6      6   b
    '''
    print(df2)
    '''
       data2 key
    0      0   a
    1      1   b
    2      2   d
    '''
    
    # merge 根据一个或多个键将不同的dataframe中的行连接起来,类似数据库的连接
    # 没有指定列名时,默认用重叠的列名做键
    pd.merge(df1,df2)
    
    # inner:交集,merge默认做的inner连接
    print(pd.merge(df1,df2,how='inner',on='key'))
    '''
       data1 key  data2
    0      0   b      1
    1      1   b      1
    2      6   b      1
    3      2   a      0
    4      4   a      0
    5      5   a      0
    '''
    # outer:并集
    print(pd.merge(df1,df2,how='outer',on='key'))
    '''
       data1 key  data2
    0    0.0   b    1.0
    1    1.0   b    1.0
    2    6.0   b    1.0
    3    2.0   a    0.0
    4    4.0   a    0.0
    5    5.0   a    0.0
    6    3.0   c    NaN
    7    NaN   d    2.0
    '''
    
    # left:左连接
    print(pd.merge(df1,df2,how='left',on='key'))
    '''
       data1 key  data2
    0      0   b    1.0
    1      1   b    1.0
    2      2   a    0.0
    3      3   c    NaN
    4      4   a    0.0
    5      5   a    0.0
    6      6   b    1.0
    '''
    
    # right:右连接
    print(pd.merge(df1,df2,how='right',on='key'))
    '''
       data1 key  data2
    0    0.0   b      1
    1    1.0   b      1
    2    6.0   b      1
    3    2.0   a      0
    4    4.0   a      0
    5    5.0   a      0
    6    NaN   d      2
    '''
    
    # 如果两个对象列名不同也可以分别进行指定
    df3 = DataFrame({'key1':['b','b','a','c','a','a','b'],
                     'data1':range(7)})
    df4 = DataFrame({'key2':['a','b','d'],
                     'data2':range(3)})
    df_merge2 = pd.merge(df3,df4,left_on='key1',right_on='key2')
    print(df_merge2)
    '''
       data1 key1  data2 key2
    0      0    b      1    b
    1      1    b      1    b
    2      6    b      1    b
    3      2    a      0    a
    4      4    a      0    a
    5      5    a      0    a
    '''
    
    # 多对多的合并
    df5 = DataFrame({'key':['b','a','a','c'],
                     'data1':range(4)})
    df6 = DataFrame({'key':['a','a','a','b','d'],
                     'data2':range(5)})
    print(df5)
    '''
       data1 key
    0      0   b
    1      1   a
    2      2   a
    3      3   c
    '''
    print(df6)
    '''
       data2 key
    0      0   a
    1      1   a
    2      2   a
    3      3   b
    4      4   d
    '''
    print(pd.merge(df5,df6,on='key',how='inner')) # 多对多连接产生的行是笛卡尔积,左边2个a,右边3个a,最终6个a。
    '''
       data1 key  data2
    0      0   b      3
    1      1   a      0
    2      1   a      1
    3      1   a      2
    4      2   a      0
    5      2   a      1
    6      2   a      2
    '''
    
    # 要根据多个键进行合并,传入一个由列名组成的列表
    left = DataFrame({'key1':['foo','foo','bar'],
                      'key2':['one','two','one'],
                      'lval':[1,2,3]})
    right = DataFrame({'key1':['foo','foo','bar','bar'],
                      'key2':['one','one','one','two'],
                      'rval':[4,5,6,7]})
    
    print(left)
    '''
      key1 key2  lval
    0  foo  one     1
    1  foo  two     2
    2  bar  one     3
    '''
    print(right)
    '''
      key1 key2  rval
    0  foo  one     4
    1  foo  one     5
    2  bar  one     6
    3  bar  two     7
    '''
    print(pd.merge(left,right,on=['key1','key2'],how='outer'))
    '''
      key1 key2  lval  rval
    0  foo  one   1.0   4.0
    1  foo  one   1.0   5.0
    2  foo  two   2.0   NaN
    3  bar  one   3.0   6.0
    4  bar  two   NaN   7.0
    '''
    # 对于合并运算后需要处理重复列名,suffixes用于指定附加到左右两个dataframe对象的重叠列名上的字符串
    print(pd.merge(left,right,on='key1'))
    '''
      key1 key2_x  lval key2_y  rval
    0  foo    one     1    one     4
    1  foo    one     1    one     5
    2  foo    two     2    one     4
    3  foo    two     2    one     5
    4  bar    one     3    one     6
    5  bar    one     3    two     7
    '''
    print(pd.merge(left,right,on='key1',suffixes=['_left','_right']))
    '''
      key1 key2_left  lval key2_right  rval
    0  foo       one     1        one     4
    1  foo       one     1        one     5
    2  foo       two     2        one     4
    3  foo       two     2        one     5
    4  bar       one     3        one     6
    5  bar       one     3        two     7
    '''
  • 相关阅读:
    shell日志重定向到null
    nginx用户权限问题
    找不到 libgtk-x11-2.0.so.0
    OSError: libgfortran.so.3: cannot open shared object file: No such file or directory
    macos不能打开windows samba共享问题(转载)
    centos 磁盘分区格式化与挂载
    冒泡排序java代码
    二分查找java代码
    java基础复习
    python第二天
  • 原文地址:https://www.cnblogs.com/nicole-zhang/p/14446789.html
Copyright © 2011-2022 走看看