参考链接:莫烦python [https://mofanpy.com/tutorials/data-manipulation/np-pd/]
1 pandas 基本介绍
import pandas as pd
import numpy as np
s = pd.Series([1,3,6,np.nan,44,1])
print(s)
0 1.0
1 3.0
2 6.0
3 NaN
4 44.0
5 1.0
dtype: float64
dates = pd.date_range('20210823',periods=6)
print(dates)
DatetimeIndex(['2021-08-23', '2021-08-24', '2021-08-25', '2021-08-26',
'2021-08-27', '2021-08-28'],
dtype='datetime64[ns]', freq='D')
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=['a','b','c','d']) # randn产生随机数
print(df)
a b c d
2021-08-23 -0.292332 -1.249332 1.661923 -0.174345
2021-08-24 0.385871 -0.241205 -0.975572 -0.227036
2021-08-25 0.608249 0.525965 1.420664 0.362081
2021-08-26 -1.767358 -0.479192 0.900635 -0.508770
2021-08-27 -0.108412 -0.722863 0.387900 0.956791
2021-08-28 0.427214 -0.134833 0.152999 -0.922964
df1 = pd.DataFrame(np.arange(12).reshape((3,4)))
print(df1)
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
df2 = pd.DataFrame({
'A':1.,
'B':pd.Timestamp('20210823'),
'C':pd.Series(1,index=list(range(4)),dtype='float32'),
'D':np.array([3]*4,dtype='int32'),
'E':pd.Categorical(['test','train','test','train']),
'F':'foo'
})
print(df2,end="
")
print(df2.dtypes,end="
")
print("行名",df2.index,end="
")
print("列名",df2.columns,end="
")
print("值",df2.values,end="
")
A B C D E F
0 1.0 2021-08-23 1.0 3 test foo
1 1.0 2021-08-23 1.0 3 train foo
2 1.0 2021-08-23 1.0 3 test foo
3 1.0 2021-08-23 1.0 3 train foo
A float64
B datetime64[ns]
C float32
D int32
E category
F object
dtype: object
行名 Int64Index([0, 1, 2, 3], dtype='int64')
列名 Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
值 [[1.0 Timestamp('2021-08-23 00:00:00') 1.0 3 'test' 'foo']
[1.0 Timestamp('2021-08-23 00:00:00') 1.0 3 'train' 'foo']
[1.0 Timestamp('2021-08-23 00:00:00') 1.0 3 'test' 'foo']
[1.0 Timestamp('2021-08-23 00:00:00') 1.0 3 'train' 'foo']]
print("描述性信息",df2.describe(),end="
")
print(df2.T)
描述性信息 A C D
count 4.0 4.0 4.0
mean 1.0 1.0 3.0
std 0.0 0.0 0.0
min 1.0 1.0 3.0
25% 1.0 1.0 3.0
50% 1.0 1.0 3.0
75% 1.0 1.0 3.0
max 1.0 1.0 3.0
0 1 2
A 1 1 1
B 2021-08-23 00:00:00 2021-08-23 00:00:00 2021-08-23 00:00:00
C 1 1 1
D 3 3 3
E test train test
F foo foo foo
3
A 1
B 2021-08-23 00:00:00
C 1
D 3
E train
F foo
print(df2.sort_index(axis=1,ascending=False),end="
") #按行排序
print(df2.sort_index(axis=1,ascending=False),end="
") #按列排序
print(df2.sort_values(by='E'),end="
") #对某一列排序
F E D C B A
0 foo test 3 1.0 2021-08-23 1.0
1 foo train 3 1.0 2021-08-23 1.0
2 foo test 3 1.0 2021-08-23 1.0
3 foo train 3 1.0 2021-08-23 1.0
F E D C B A
0 foo test 3 1.0 2021-08-23 1.0
1 foo train 3 1.0 2021-08-23 1.0
2 foo test 3 1.0 2021-08-23 1.0
3 foo train 3 1.0 2021-08-23 1.0
A B C D E F
0 1.0 2021-08-23 1.0 3 test foo
2 1.0 2021-08-23 1.0 3 test foo
1 1.0 2021-08-23 1.0 3 train foo
3 1.0 2021-08-23 1.0 3 train foo
2 pandas 选择数据
dates = pd.date_range('20210101',periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A','B','C','D'])
print(df)
A B C D
2021-01-01 0 1 2 3
2021-01-02 4 5 6 7
2021-01-03 8 9 10 11
2021-01-04 12 13 14 15
2021-01-05 16 17 18 19
2021-01-06 20 21 22 23
print(df['A'],end="
")
print(df.A)
2021-01-01 0
2021-01-02 4
2021-01-03 8
2021-01-04 12
2021-01-05 16
2021-01-06 20
Freq: D, Name: A, dtype: int32
2021-01-01 0
2021-01-02 4
2021-01-03 8
2021-01-04 12
2021-01-05 16
2021-01-06 20
Freq: D, Name: A, dtype: int32
print(df[0:3],end="
")
print(df["20210102":"20210104"])
A B C D
2021-01-01 0 1 2 3
2021-01-02 4 5 6 7
2021-01-03 8 9 10 11
A B C D
2021-01-02 4 5 6 7
2021-01-03 8 9 10 11
2021-01-04 12 13 14 15
# loc 以标签index 来选择
print(df.loc['20210104'],end="
")
print(df.loc[:,['A','B']],end="
")
print(df.loc['2021-01-02',['A','B']])
A 12
B 13
C 14
D 15
Name: 2021-01-04 00:00:00, dtype: int32
A B
2021-01-01 0 1
2021-01-02 4 5
2021-01-03 8 9
2021-01-04 12 13
2021-01-05 16 17
2021-01-06 20 21
A 4
B 5
Name: 2021-01-02 00:00:00, dtype: int32
# iloc 以位置来选择
print(df.iloc[3],end="
") #第三行
print(df.iloc[3,1],end="
")
print(df.iloc[3:5,1:3],end="
")
print(df.iloc[[1,3,5],1:3])
A 12
B 13
C 14
D 15
Name: 2021-01-04 00:00:00, dtype: int32
13
B C
2021-01-04 13 14
2021-01-05 17 18
B C
2021-01-02 5 6
2021-01-04 13 14
2021-01-06 21 22
# ix 混合筛选
print(df.ix[:3,['A','D']])
A D
2021-01-01 0 3
2021-01-02 4 7
2021-01-03 8 11
E:ProgramDataAnaconda3libsite-packagesipykernel_launcher.py:2: DeprecationWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
# bool 筛选
print(df,end="
")
print(df[df.A>8])
A B C D
2021-01-01 0 1 2 3
2021-01-02 4 5 6 7
2021-01-03 8 9 10 11
2021-01-04 12 13 14 15
2021-01-05 16 17 18 19
2021-01-06 20 21 22 23
A B C D
2021-01-04 12 13 14 15
2021-01-05 16 17 18 19
2021-01-06 20 21 22 23
3 设置值
dates = pd.date_range('20210101',periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A','B','C','D'])
print(df,end="
")
df.iloc[1,3]=1111
df.loc["20210102","B"]=2222
print(df)
A B C D
2021-01-01 0 1 2 3
2021-01-02 4 5 6 7
2021-01-03 8 9 10 11
2021-01-04 12 13 14 15
2021-01-05 16 17 18 19
2021-01-06 20 21 22 23
A B C D
2021-01-01 0 1 2 3
2021-01-02 4 2222 6 1111
2021-01-03 8 9 10 11
2021-01-04 12 13 14 15
2021-01-05 16 17 18 19
2021-01-06 20 21 22 23
df[df.A>4]=0
print(df,end="
")
dates = pd.date_range('20210101',periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A','B','C','D'])
df.A[df.A>4]=0 #防止更改其它列的值 df.B[df.A>4]=0
print(df,end="
")
df['F']=np.nan
print(df,end="
")
df['E']=pd.Series([1,2,3,4,5,6],index=pd.date_range("20210101",periods=6))
print(df)
A B C D
2021-01-01 0 1 2 3
2021-01-02 4 2222 6 1111
2021-01-03 0 0 0 0
2021-01-04 0 0 0 0
2021-01-05 0 0 0 0
2021-01-06 0 0 0 0
A B C D
2021-01-01 0 1 2 3
2021-01-02 4 5 6 7
2021-01-03 0 9 10 11
2021-01-04 0 13 14 15
2021-01-05 0 17 18 19
2021-01-06 0 21 22 23
A B C D F
2021-01-01 0 1 2 3 NaN
2021-01-02 4 5 6 7 NaN
2021-01-03 0 9 10 11 NaN
2021-01-04 0 13 14 15 NaN
2021-01-05 0 17 18 19 NaN
2021-01-06 0 21 22 23 NaN
A B C D F E
2021-01-01 0 1 2 3 NaN 1
2021-01-02 4 5 6 7 NaN 2
2021-01-03 0 9 10 11 NaN 3
2021-01-04 0 13 14 15 NaN 4
2021-01-05 0 17 18 19 NaN 5
2021-01-06 0 21 22 23 NaN 6
4 处理丢失数据
dates = pd.date_range('20210101',periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A','B','C','D'])
df.iloc[0,1]=np.nan
df.iloc[1,2]=np.nan
print(df)
A B C D
2021-01-01 0 NaN 2.0 3
2021-01-02 4 5.0 NaN 7
2021-01-03 8 9.0 10.0 11
2021-01-04 12 13.0 14.0 15
2021-01-05 16 17.0 18.0 19
2021-01-06 20 21.0 22.0 23
print(df.dropna(),end="
") #丢弃缺失值
print(df.dropna(axis=0,how='any'),end="
") #how={'any','all'},axis=0表示丢弃行
print(df.dropna(axis=1,how='any'))
A B C D
2021-01-03 8 9.0 10.0 11
2021-01-04 12 13.0 14.0 15
2021-01-05 16 17.0 18.0 19
2021-01-06 20 21.0 22.0 23
A B C D
2021-01-03 8 9.0 10.0 11
2021-01-04 12 13.0 14.0 15
2021-01-05 16 17.0 18.0 19
2021-01-06 20 21.0 22.0 23
A D
2021-01-01 0 3
2021-01-02 4 7
2021-01-03 8 11
2021-01-04 12 15
2021-01-05 16 19
2021-01-06 20 23
print(df.fillna(value=0)) #填充缺失值为value
print(df.isnull()) #是否为缺失值
print(np.any(df.isnull())==True) #是否有缺失值
A B C D
2021-01-01 0 0.0 2.0 3
2021-01-02 4 5.0 0.0 7
2021-01-03 8 9.0 10.0 11
2021-01-04 12 13.0 14.0 15
2021-01-05 16 17.0 18.0 19
2021-01-06 20 21.0 22.0 23
A B C D
2021-01-01 False True False False
2021-01-02 False False True False
2021-01-03 False False False False
2021-01-04 False False False False
2021-01-05 False False False False
2021-01-06 False False False False
A False
B True
C True
D False
dtype: bool
5 导入导出
#官网链接 https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html
data = pd.read_csv("student.csv")
print(data)
data.to_pickle('student.pickle')
Student ID name age gender
0 1100 Kelly 22 Female
1 1101 Clo 21 Female
2 1102 Tilly 22 Female
3 1103 Tony 24 Male
4 1104 David 20 Male
5 1105 Catty 22 Female
6 1106 M 3 Female
7 1107 N 43 Male
8 1108 A 13 Male
9 1109 S 12 Male
10 1110 David 33 Male
11 1111 Dw 3 Female
12 1112 Q 23 Male
13 1113 W 21 Female
6 合并 concat
#定义资料集
df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
df3 = pd.DataFrame(np.ones((3,4))*2, columns=['a','b','c','d'])
#concat纵向合并
res = pd.concat([df1, df2, df3], axis=0) #行合并
print(res,end="
")
res = pd.concat([df1, df2, df3], axis=0,ignore_index=True) #行索引重拍
print(res,end="
")
res = pd.concat([df1, df2, df3], axis=1) #列合并
print(res)
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
0 2.0 2.0 2.0 2.0
1 2.0 2.0 2.0 2.0
2 2.0 2.0 2.0 2.0
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
6 2.0 2.0 2.0 2.0
7 2.0 2.0 2.0 2.0
8 2.0 2.0 2.0 2.0
a b c d a b c d a b c d
0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0
1 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0
# join,['inner','outer']
df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'],index=[1,2,3])
df2 = pd.DataFrame(np.ones((3,4))*1, columns=['b','c','d','e'],index=[2,3,4])
print(df1)
print(df2,end="
")
res = pd.concat([df1,df2]) # join='inner'
print(res,end="
") #默认外连接 outer 不同的也会留下,用 NaN 填充 并集
res = pd.concat([df1,df2],join='inner') # 也可以加 ingore_index=True
print(res,end="
") #内连接 inner 只有相同的留下 交集
a b c d
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0
b c d e
2 1.0 1.0 1.0 1.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
a b c d e
1 0.0 0.0 0.0 0.0 NaN
2 0.0 0.0 0.0 0.0 NaN
3 0.0 0.0 0.0 0.0 NaN
2 NaN 1.0 1.0 1.0 1.0
3 NaN 1.0 1.0 1.0 1.0
4 NaN 1.0 1.0 1.0 1.0
b c d
1 0.0 0.0 0.0
2 0.0 0.0 0.0
3 0.0 0.0 0.0
2 1.0 1.0 1.0
3 1.0 1.0 1.0
4 1.0 1.0 1.0
E:ProgramDataAnaconda3libsite-packagesipykernel_launcher.py:7: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.
To accept the future behavior, pass 'sort=True'.
To retain the current behavior and silence the warning, pass sort=False
import sys
res = pd.concat([df1, df2], axis=1)
print(res,end="
")
#依照`df1.index`进行横向合并
res = pd.concat([df1, df2], axis=1, join_axes=[df1.index])
print(res,end="
")
#依照`df2.index`进行横向合并
res = pd.concat([df1, df2], axis=1, join_axes=[df2.index])
print(res)
a b c d b c d e
1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
4 NaN NaN NaN NaN 1.0 1.0 1.0 1.0
a b c d b c d e
1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
a b c d b c d e
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
4 NaN NaN NaN NaN 1.0 1.0 1.0 1.0
df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
df3 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
#将df2合并到df1的下面,以及重置index,并打印出结果
res = df1.append(df2, ignore_index=True)
print(res,end="
")
#合并多个df,将df2与df3合并至df1的下面,以及重置index,并打印出结果
res = df1.append([df2, df3], ignore_index=True)
print(res,end="
")
#合并series,将s1合并至df1,以及重置index,并打印出结果
s1 = pd.Series([1,2,3,4], index=['a','b','c','d'])
res = df1.append(s1, ignore_index=True)
print(res)
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
6 1.0 1.0 1.0 1.0
7 1.0 1.0 1.0 1.0
8 1.0 1.0 1.0 1.0
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 2.0 3.0 4.0
7 合并 merge
7.1 只有一个key
left = pd.DataFrame({'key':['K0','K1','K2','K3'],
'A': ['A0', 'A1', 'A2','A3'],
'B': ['B0', 'B1', 'B2','b3']})
right = pd.DataFrame({'key':['K0','K1','K2','K3'],
'C': ['C0', 'C1', 'C2','C3'],
'D': ['D0', 'D1', 'D2','D3']})
print(left,end="
")
print(right)
key A B
0 K0 A0 B0
1 K1 A1 B1
2 K2 A2 B2
3 K3 A3 b3
key C D
0 K0 C0 D0
1 K1 C1 D1
2 K2 C2 D2
3 K3 C3 D3
# # select a,b while (a.key1==b.key1)
res = pd.merge(left,right,on='key') # 根据关键字合并
print(res)
key A B C D
0 K0 A0 B0 C0 D0
1 K1 A1 B1 C1 D1
2 K2 A2 B2 C2 D2
3 K3 A3 b3 C3 D3
7.2 有多个key
# 有两个key
left = pd.DataFrame({'key1':['K0','K1','K2','K3'],
'key2':['K0','K1','K0','K1'],
'A': ['A0', 'A1', 'A2','A3'],
'B': ['B0', 'B1', 'B2','b3']})
right = pd.DataFrame({'key1':['K0','K1','K2','K3'],
'key2':['K0','K0','K0','K0'],
'C': ['C0', 'C1', 'C2','C3'],
'D': ['D0', 'D1', 'D2','D3']})
print(left,end="
")
print(right)
key1 key2 A B
0 K0 K0 A0 B0
1 K1 K1 A1 B1
2 K2 K0 A2 B2
3 K3 K1 A3 b3
key1 key2 C D
0 K0 K0 C0 D0
1 K1 K0 C1 D1
2 K2 K0 C2 D2
3 K3 K0 C3 D3
# select a,b while (a.key1==b.key1) and (a.key2==b.key2)
res = pd.merge(left,right,on=['key1','key2']) #默认是inner。how=["inner",'outer','left','right']
print(res)
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K2 K0 A2 B2 C2 D2
df1 = pd.DataFrame({'col1':[0,1],'col_left':['a','b']})
df2 = pd.DataFrame({'col1':[1,2,3],'col_right':[2,2,2]})
print(df1)
print(df2,end="
")
# 显示是不是合并的两个数据都有值
res = pd.merge(df1,df2,on='col1',how='outer',indicator=True)#改名字可以indicator="AA"
print(res)
col1 col_left
0 0 a
1 1 b
col1 col_right
0 1 2
1 2 2
2 3 2
col1 col_left col_right _merge
0 0 a NaN left_only
1 1 b 2.0 both
2 2 NaN 2.0 right_only
3 3 NaN 2.0 right_only
7.3 用 index
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
'B': ['B0', 'B1', 'B2']},
index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
'D': ['D0', 'D2', 'D3']},
index=['K0', 'K2', 'K3'])
print(left)
print(right,end="
")
#依据左右资料集的index进行合并,how='outer',并打印出
res = pd.merge(left, right, left_index=True, right_index=True, how='outer') #how=["inner",'outer']
print(res)
A B
K0 A0 B0
K1 A1 B1
K2 A2 B2
C D
K0 C0 D0
K2 C2 D2
K3 C3 D3
A B C D
K0 A0 B0 C0 D0
K1 A1 B1 NaN NaN
K2 A2 B2 C2 D2
K3 NaN NaN C3 D3
7.4 解决 overlapping 的问题
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
#使用suffixes解决overlapping的问题
# 两数据中 age 列名相同,但实际是不同的两组数据。用 suffixex 来重命名
res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='inner')
print(res)
k age_boy age_girl
0 K0 1 4
1 K0 1 5
plot 画图
import matplotlib.pyplot as plt
%matplotlib notebook
# 随机生成1000个数据
data = pd.Series(np.random.randn(1000),index=np.arange(1000))
# 为了方便观看效果, 我们累加这个数据
data = data.cumsum()
data.plot()
plt.show()
<IPython.core.display.Javascript object>
data = pd.DataFrame(
np.random.randn(100,4),
index=np.arange(100),
columns=list("ABCD")
)
data = data.cumsum()
print(data.head())
data.plot()
plt.show()
A B C D
0 -0.224539 -0.622456 0.213368 0.251068
1 -0.889679 -1.999832 2.133443 2.974004
2 -1.514201 -2.619678 1.099911 2.486326
3 -2.715098 -2.983646 2.484683 0.996700
4 -3.335238 -3.886706 1.337085 0.672863
<IPython.core.display.Javascript object>
# 'bar','hist','box','area','scatter','hexbin,'pie
# plt.scatter(x=.y=)
# data.plot.scatter(x='A',y='B'),color='DarkBlue'label='Class1')
ax = data.plot.scatter(x='A',y='B',color='DarkBlue',label='Class1')
# 将之下这个 data 画在上一个 ax 上面
data.plot.scatter(x='A',y='C',color='LightGreen',label='Class2',ax=ax)
plt.show()
<IPython.core.display.Javascript object>