from pandas import DataFrame import pandas as pd df1 = DataFrame({'key':['b','b','a','c','a','a','b'], 'data1':range(7)}) df2 = DataFrame({'key':['a','b','d'], 'data2':range(3)}) print(df1) ''' data1 key 0 0 b 1 1 b 2 2 a 3 3 c 4 4 a 5 5 a 6 6 b ''' print(df2) ''' data2 key 0 0 a 1 1 b 2 2 d ''' # merge 根据一个或多个键将不同的dataframe中的行连接起来,类似数据库的连接 # 没有指定列名时,默认用重叠的列名做键 pd.merge(df1,df2) # inner:交集,merge默认做的inner连接 print(pd.merge(df1,df2,how='inner',on='key')) ''' data1 key data2 0 0 b 1 1 1 b 1 2 6 b 1 3 2 a 0 4 4 a 0 5 5 a 0 ''' # outer:并集 print(pd.merge(df1,df2,how='outer',on='key')) ''' data1 key data2 0 0.0 b 1.0 1 1.0 b 1.0 2 6.0 b 1.0 3 2.0 a 0.0 4 4.0 a 0.0 5 5.0 a 0.0 6 3.0 c NaN 7 NaN d 2.0 ''' # left:左连接 print(pd.merge(df1,df2,how='left',on='key')) ''' data1 key data2 0 0 b 1.0 1 1 b 1.0 2 2 a 0.0 3 3 c NaN 4 4 a 0.0 5 5 a 0.0 6 6 b 1.0 ''' # right:右连接 print(pd.merge(df1,df2,how='right',on='key')) ''' data1 key data2 0 0.0 b 1 1 1.0 b 1 2 6.0 b 1 3 2.0 a 0 4 4.0 a 0 5 5.0 a 0 6 NaN d 2 ''' # 如果两个对象列名不同也可以分别进行指定 df3 = DataFrame({'key1':['b','b','a','c','a','a','b'], 'data1':range(7)}) df4 = DataFrame({'key2':['a','b','d'], 'data2':range(3)}) df_merge2 = pd.merge(df3,df4,left_on='key1',right_on='key2') print(df_merge2) ''' data1 key1 data2 key2 0 0 b 1 b 1 1 b 1 b 2 6 b 1 b 3 2 a 0 a 4 4 a 0 a 5 5 a 0 a ''' # 多对多的合并 df5 = DataFrame({'key':['b','a','a','c'], 'data1':range(4)}) df6 = DataFrame({'key':['a','a','a','b','d'], 'data2':range(5)}) print(df5) ''' data1 key 0 0 b 1 1 a 2 2 a 3 3 c ''' print(df6) ''' data2 key 0 0 a 1 1 a 2 2 a 3 3 b 4 4 d ''' print(pd.merge(df5,df6,on='key',how='inner')) # 多对多连接产生的行是笛卡尔积,左边2个a,右边3个a,最终6个a。 ''' data1 key data2 0 0 b 3 1 1 a 0 2 1 a 1 3 1 a 2 4 2 a 0 5 2 a 1 6 2 a 2 ''' # 要根据多个键进行合并,传入一个由列名组成的列表 left = DataFrame({'key1':['foo','foo','bar'], 'key2':['one','two','one'], 'lval':[1,2,3]}) right = DataFrame({'key1':['foo','foo','bar','bar'], 'key2':['one','one','one','two'], 'rval':[4,5,6,7]}) print(left) ''' key1 key2 lval 0 foo one 1 1 foo two 2 2 bar one 3 ''' print(right) ''' key1 key2 rval 0 foo one 4 1 foo one 5 2 bar one 6 3 bar two 7 ''' print(pd.merge(left,right,on=['key1','key2'],how='outer')) ''' key1 key2 lval rval 0 foo one 1.0 4.0 1 foo one 1.0 5.0 2 foo two 2.0 NaN 3 bar one 3.0 6.0 4 bar two NaN 7.0 ''' # 对于合并运算后需要处理重复列名,suffixes用于指定附加到左右两个dataframe对象的重叠列名上的字符串 print(pd.merge(left,right,on='key1')) ''' key1 key2_x lval key2_y rval 0 foo one 1 one 4 1 foo one 1 one 5 2 foo two 2 one 4 3 foo two 2 one 5 4 bar one 3 one 6 5 bar one 3 two 7 ''' print(pd.merge(left,right,on='key1',suffixes=['_left','_right'])) ''' key1 key2_left lval key2_right rval 0 foo one 1 one 4 1 foo one 1 one 5 2 foo two 2 one 4 3 foo two 2 one 5 4 bar one 3 one 6 5 bar one 3 two 7 '''