zoukankan html css js c++ java

Pandas合并数据集之concat、combine_first方法

轴向连接(concat)

Numpy

import numpy as np
import pandas as pd
from pandas import Series

arr = np.arange(12).reshape(3,4)
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

# axis默认为行，想合并列可以设置axis=1
np.concatenate([arr,arr])

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

pandas对象的轴向连接

# 三个没有重叠的索引合在一起
s1 = Series([0,1],index=['a','b'])
s2 = Series([2,3,4],index=['c','d','e'])
s3 = Series([5,6], index=['f','g'])
# concat的第一个参数需要是一个由待合并对象组成的可迭代对象(例如列表)
pd.concat([s1,s2,s3])

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

# 如果传入axis=1，则多一个轴方向，会变成DataFrame
pd.concat([s1,s2,s3],axis=1)

	0	1	2
a	0.0	NaN	NaN
b	1.0	NaN	NaN
c	NaN	2.0	NaN
d	NaN	3.0	NaN
e	NaN	4.0	NaN
f	NaN	NaN	5.0
g	NaN	NaN	6.0

s4 = pd.concat([s1*5,s3])
s4

a    0
b    5
f    5
g    6
dtype: int64


# 默认合并的还是outer
pd.concat([s1,s4],axis=1)

    0	1
a	0.0	0
b	1.0	5
f	NaN	5
g	NaN	6

# 默认合并的还是outer,如果想得到合并的交集，则指定join = 'inner'
pd.concat([s1,s4],axis=1,join='inner')

    0	1
a	0	0
b	1	5

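# 指定结果要使用的索引，索引在原数据中不存在时对应的值为NaN
# 注意：join_axes参数在pandas 0.25中已弃用，并在1.0中移除；新版本可写成 pd.concat([s1,s4],axis=1).reindex(['a','b','c','e'])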
pd.concat([s1,s4],axis=1,join_axes=[['a','b','c','e']])

	0	1
a	0.0	0.0
b	1.0	5.0
c	NaN	NaN
e	NaN	NaN

#在合并行索引上创建一个层次化索引，keys参数
pd.concat([s1,s4],keys=['one','two','three'])

one  a    0
     b    1
two  a    0
     b    5
     f    5
     g    6
dtype: int64

pd.concat([s1,s2,s3],axis=1)

    0	1	2
a	0.0	NaN	NaN
b	1.0	NaN	NaN
c	NaN	2.0	NaN
d	NaN	3.0	NaN
e	NaN	4.0	NaN
f	NaN	NaN	5.0
g	NaN	NaN	6.0

#如果沿着axis=1对Series进行合并，则keys就会成为DataFrame的列名
pd.concat([s1,s2,s3],axis=1,keys=['one','two','three'])

    one	two	three
a	0.0	NaN	NaN
b	1.0	NaN	NaN
c	NaN	2.0	NaN
d	NaN	3.0	NaN
e	NaN	4.0	NaN
f	NaN	NaN	5.0
g	NaN	NaN	6.0

DataFrame的concat操作

df1 = pd.DataFrame(np.arange(6).reshape(3,2),index=['a','b','c'],columns=['one','two'])
df1

    one	two
a	0	1
b	2	3
c	4	5

df2 = pd.DataFrame(5 + np.arange(4).reshape(2,2),index=['a','c'],columns=['three','four'])
df2

    three	four
a	5	6
c	7	8

# 合并列
pd.concat([df1,df2],axis=1)

    one	two	three	four
a	0	1	5.0	6.0
b	2	3	NaN	NaN
c	4	5	7.0	8.0

# 如果传入的不是列表而是一个字典，则字典的键就是层次化索引列名
pd.concat({'level1':df1,'level2':df2},axis=1)

    level1	level2
    one	two	three	four
a	0	1	5.0	6.0
b	2	3	NaN	NaN
c	4	5	7.0	8.0

# names用于给层次化列索引的各个层级命名：upper层对应level1、level2，lower层对应原来的列名
pd.concat([df1,df2],axis=1,keys=['level1','level2'],names=['upper','lower'])

upper	level1	level2
lower	one	two	three	four
a	0	1	5.0	6.0
b	2	3	NaN	NaN
c	4	5	7.0	8.0

df3 = pd.DataFrame(np.random.randn(3,4),columns=['a','b','c','d'])
df4 = pd.DataFrame(np.random.randn(2,3),columns=['b','d','a'])

# 直接合并会保留各自原来的行索引，导致行索引重复；如果不需要保留原索引，可以传入ignore_index=True
pd.concat([df3,df4])

    a	b	c	d
0	0.649869	-0.332470	0.918562	-1.781167
1	-0.271012	0.702998	-2.164433	0.185556
2	0.279104	-0.846209	-0.366614	0.444451
0	-0.204010	-0.974424	NaN	-2.215621
1	0.504930	0.490877	NaN	0.332790


#ignore_index启用后，行索引就会自增

pd.concat([df3,df4],ignore_index=True)

    a	b	c	d
0	0.649869	-0.332470	0.918562	-1.781167
1	-0.271012	0.702998	-2.164433	0.185556
2	0.279104	-0.846209	-0.366614	0.444451
3	-0.204010	-0.974424	NaN	-2.215621
4	0.504930	0.490877	NaN	0.332790

合并重叠数据(combine_first)

a = Series([np.nan,2.5,np.nan,3.5,4.5,np.nan],index=['f','e','d','c','b','a'])
a

f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64

b = Series(np.arange(len(a),dtype=np.float64),index=['f','e','d','c','b','a'])
b

f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    5.0
dtype: float64

# where(条件，真值，假值)：a中为null的位置条件成立，取b中对应的值；其余位置取a自身的值
np.where(pd.isnull(a),b,a)

array([0. , 2.5, 2. , 3.5, 4.5, 5. ])

a[2:]
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64

b[:-2]
f    0.0
e    1.0
d    2.0
c    3.0
dtype: float64

# 用a的数据填补b，如果有重复的以b为准
b[:-2].combine_first(a[2:])

a    NaN
b    4.5
c    3.0
d    2.0
e    1.0
f    0.0
dtype: float64

# 用b的数据填补a，如果有重复的，以a为准
a[2:].combine_first(b[:-2])

a    NaN
b    4.5
c    3.5
d    2.0
e    1.0
f    0.0
dtype: float64

查看全文

相关阅读:
MySQL开启general_log并设置路径
 mysql日志文件开启及详解：General_log 和 Binlog
mysql binary like_MYSQL的binary解决mysql数据大小写敏感问题的方法
 分布式系统回滚机制
 ubuntu 后台运行的几种方法！
ubuntu磁盘分配和挂载
 .NET Core SDK在Windows系统安装后出现Failed to load the hostfxr.dll等问题的解决方法
 数组.html
温习
 for练习.html

原文地址：https://www.cnblogs.com/lishi-jie/p/10021288.html