pandas模块
pandas更多的是excel/csv文件处理,excel文件, 对numpy+xlrd模块做了一层封装
pandas的数据类型
import pandas as pd
import numpy as np
serise(现在一般不使用(一维))
df = pd.Series(np.array([1,2,3,4]))
print(df)
DataFrame多维
dates = pd.date_range('20190101', periods=6, freq='M')
print(dates)
values = np.random.rand(6, 4) * 10
print(values)
columns = ['c1','c2','c3','c3']
df = pd.DataFrame(values,index=dates,columns=columns)
print(df)
DatetimeIndex(['2019-01-31', '2019-02-28', '2019-03-31', '2019-04-30',
'2019-05-31', '2019-06-30'],
dtype='datetime64[ns]', freq='M')
[[1.16335011 4.48613539 7.68543002 2.28527564]
[8.93474708 7.31073142 8.61253719 2.50506357]
[4.88797902 6.81381968 9.0847644 8.34332396]
[6.74341716 9.32192571 9.01122189 2.93191827]
[3.83096571 3.27206377 7.25800888 1.30570883]
[2.87592228 0.17123983 4.97889883 5.4085225 ]]
c1 c2 c3 c3
2019-01-31 1.163350 4.486135 7.685430 2.285276
2019-02-28 8.934747 7.310731 8.612537 2.505064
2019-03-31 4.887979 6.813820 9.084764 8.343324
2019-04-30 6.743417 9.321926 9.011222 2.931918
2019-05-31 3.830966 3.272064 7.258009 1.305709
2019-06-30 2.875922 0.171240 4.978899 5.408523
DataFrame属性
dtype 查看数据类型
index 查看行序列或者索引
columns 查看各列的标签
values 查看数据框内的数据,也即不含表头索引的数据
describe 查看数据每一列的极值,均值,中位数,只可用于数值型数据
transpose 转置,也可用T来操作
sort_index 排序,可按行或列index排序输出
sort_values 按数据值来排序
print(df.dtypes)
print(df.index)
print(df.columns)
print(df.describe())
print(df.T)
import pandas as pd
import numpy as np
dates = pd.date_range('20190101', periods=6, freq='M')
print(dates)
values = np.random.rand(6, 4) * 10
print(values)
columns = ['c4','c2','c3','c1']
df = pd.DataFrame(values,index=dates,columns=columns)
print(df)
|
c4 |
c2 |
c3 |
c1 |
2019-01-31 |
5.820943 |
8.551214 |
4.164049 |
1.268047 |
2019-02-28 |
6.809855 |
3.161353 |
1.934861 |
3.639872 |
2019-03-31 |
0.679617 |
6.166411 |
3.264278 |
3.919507 |
2019-04-30 |
2.634395 |
8.825472 |
2.345733 |
0.301147 |
2019-05-31 |
9.859531 |
9.294794 |
4.025121 |
3.545862 |
2019-06-30 |
5.566927 |
0.043362 |
5.301493 |
0.214879 |
df.T
|
2019-01-31 |
2019-02-28 |
2019-03-31 |
2019-04-30 |
2019-05-31 |
2019-06-30 |
c4 |
5.820943 |
6.809855 |
0.679617 |
2.634395 |
9.859531 |
5.566927 |
c2 |
8.551214 |
3.161353 |
6.166411 |
8.825472 |
9.294794 |
0.043362 |
c3 |
4.164049 |
1.934861 |
3.264278 |
2.345733 |
4.025121 |
5.301493 |
c1 |
1.268047 |
3.639872 |
3.919507 |
0.301147 |
3.545862 |
0.214879 |
df = df.sort_index(axis=1) # 0列,1是行
df
|
c1 |
c2 |
c3 |
c4 |
2019-01-31 |
1.268047 |
8.551214 |
4.164049 |
5.820943 |
2019-02-28 |
3.639872 |
3.161353 |
1.934861 |
6.809855 |
2019-03-31 |
3.919507 |
6.166411 |
3.264278 |
0.679617 |
2019-04-30 |
0.301147 |
8.825472 |
2.345733 |
2.634395 |
2019-05-31 |
3.545862 |
9.294794 |
4.025121 |
9.859531 |
2019-06-30 |
0.214879 |
0.043362 |
5.301493 |
5.566927 |
df.sort_values('c3')
|
c1 |
c2 |
c3 |
c4 |
2019-02-28 |
3.639872 |
3.161353 |
1.934861 |
6.809855 |
2019-04-30 |
0.301147 |
8.825472 |
2.345733 |
2.634395 |
2019-03-31 |
3.919507 |
6.166411 |
3.264278 |
0.679617 |
2019-05-31 |
3.545862 |
9.294794 |
4.025121 |
9.859531 |
2019-01-31 |
1.268047 |
8.551214 |
4.164049 |
5.820943 |
2019-06-30 |
0.214879 |
0.043362 |
5.301493 |
5.566927 |
取值
2019-01-31 1.268047
2019-02-28 3.639872
2019-03-31 3.919507
2019-04-30 0.301147
2019-05-31 3.545862
2019-06-30 0.214879
Freq: M, Name: c1, dtype: float64
|
c1 |
c3 |
2019-01-31 |
1.268047 |
4.164049 |
2019-02-28 |
3.639872 |
1.934861 |
2019-03-31 |
3.919507 |
3.264278 |
2019-04-30 |
0.301147 |
2.345733 |
2019-05-31 |
3.545862 |
4.025121 |
2019-06-30 |
0.214879 |
5.301493 |
- df.loc['2019-01-31':'2019-02-28']
|
c1 |
c2 |
c3 |
c4 |
2019-01-31 |
1.268047 |
8.551214 |
4.164049 |
5.820943 |
2019-02-28 |
3.639872 |
3.161353 |
1.934861 |
6.809855 |
3.1613533123062734
|
c1 |
c2 |
c3 |
c4 |
2019-01-31 |
1.268047 |
8.551214 |
4.164049 |
5.820943 |
2019-02-28 |
3.639872 |
3.161353 |
1.934861 |
6.809855 |
2019-03-31 |
3.919507 |
6.166411 |
3.264278 |
0.679617 |
2019-04-30 |
0.301147 |
8.825472 |
2.345733 |
2.634395 |
2019-05-31 |
3.545862 |
9.294794 |
4.025121 |
9.859531 |
2019-06-30 |
0.214879 |
0.043362 |
5.301493 |
5.566927 |
|
c1 |
c2 |
c3 |
c4 |
2019-01-31 |
1.268047 |
8.551214 |
4.164049 |
5.820943 |
2019-02-28 |
3.639872 |
3.161353 |
1.934861 |
6.809855 |
2019-03-31 |
3.919507 |
6.166411 |
3.264278 |
0.679617 |
2019-04-30 |
0.301147 |
8.825472 |
2.345733 |
2.634395 |
2019-05-31 |
3.545862 |
9.294794 |
4.025121 |
9.859531 |
2019-06-30 |
0.214879 |
0.043362 |
5.301493 |
5.566927 |
|
c1 |
c2 |
c3 |
c4 |
2019-02-28 |
3.639872 |
3.161353 |
1.934861 |
6.809855 |
2019-03-31 |
3.919507 |
6.166411 |
3.264278 |
0.679617 |
2019-05-31 |
3.545862 |
9.294794 |
4.025121 |
9.859531 |
值替换
df.iloc[1,1] = 1
df
|
c1 |
c2 |
c3 |
c4 |
2019-01-31 |
1.268047 |
8.551214 |
4.164049 |
5.820943 |
2019-02-28 |
3.639872 |
1.000000 |
1.934861 |
6.809855 |
2019-03-31 |
3.919507 |
6.166411 |
3.264278 |
0.679617 |
2019-04-30 |
0.301147 |
8.825472 |
2.345733 |
2.634395 |
2019-05-31 |
3.545862 |
9.294794 |
4.025121 |
9.859531 |
2019-06-30 |
0.214879 |
0.043362 |
5.301493 |
5.566927 |
pandas操作表格
from io import StringIO
test_data = '''
5.1,,1.4,0.2
4.9,3.0,1.4,0.2
4.7,3.2,,0.2
7.0,3.2,4.7,1.4
6.4,3.2,4.5,1.5
6.9,3.1,4.9,
,,,
'''
print(test_data)
test_data = StringIO(test_data) # 把test_data读入内存,相当于变成文件
print(test_data)
# 把数据读入内存,变成csv文件
5.1,,1.4,0.2
4.9,3.0,1.4,0.2
4.7,3.2,,0.2
7.0,3.2,4.7,1.4
6.4,3.2,4.5,1.5
6.9,3.1,4.9,
,,,
<_io.StringIO object at 0x000001665DD6A828>
df = pd.read_csv('test.csv', header=None) #读取文件 # header没有columns
df.columns =['c1','c2','c3','c4']
df
|
c1 |
c2 |
c3 |
c4 |
0 |
5.1 |
NaN |
1.4 |
0.2 |
1 |
4.9 |
3.0 |
1.4 |
0.2 |
2 |
4.7 |
3.2 |
NaN |
0.2 |
3 |
7.0 |
3.2 |
4.7 |
1.4 |
4 |
6.4 |
3.2 |
4.5 |
1.5 |
5 |
6.9 |
3.1 |
4.9 |
NaN |
6 |
NaN |
NaN |
NaN |
NaN |
缺失值处理
df = df.dropna(axis=0) # 1列,0行
df
|
c1 |
c2 |
c3 |
c4 |
1 |
4.9 |
3.0 |
1.4 |
0.2 |
3 |
7.0 |
3.2 |
4.7 |
1.4 |
4 |
6.4 |
3.2 |
4.5 |
1.5 |
df = df.dropna(thresh=3) # 必须得有4个值
df
|
c1 |
c2 |
c3 |
c4 |
0 |
5.1 |
NaN |
1.4 |
0.2 |
1 |
4.9 |
3.0 |
1.4 |
0.2 |
2 |
4.7 |
3.2 |
NaN |
0.2 |
3 |
7.0 |
3.2 |
4.7 |
1.4 |
4 |
6.4 |
3.2 |
4.5 |
1.5 |
5 |
6.9 |
3.1 |
4.9 |
NaN |
合并处理
df1 = pd.DataFrame(np.zeros((2,3)))
df1
0 |
1 |
2 |
|
0 |
0.0 |
0.0 |
0.0 |
1 |
0.0 |
0.0 |
0.0 |
df2 = pd.DataFrame(np.ones((2,3)))
df2
|
0 |
1 |
2 |
0 |
1.0 |
1.0 |
1.0 |
1 |
1.0 |
1.0 |
1.0 |
pd.concat((df1,df2),axis=1) # 默认按列0,1行
|
0 |
1 |
2 |
0 |
1 |
2 |
0 |
0.0 |
0.0 |
0.0 |
1.0 |
1.0 |
1.0 |
1 |
0.0 |
0.0 |
0.0 |
1.0 |
1.0 |
1.0 |
df1.append(df2)
|
0 |
1 |
2 |
0 |
0.0 |
0.0 |
0.0 |
1 |
0.0 |
0.0 |
0.0 |
0 |
1.0 |
1.0 |
1.0 |
1 |
1.0 |
1.0 |
1.0 |
导入数据
df = pd.read_csv('test.csv', header=None) #读取文件 # header没有columns
# df = pd.read_excel('test.excel',)
df.columns =['c1','c2','c3','c4']
df
|
c1 |
c2 |
c3 |
c4 |
0 |
5.1 |
NaN |
1.4 |
0.2 |
1 |
4.9 |
3.0 |
1.4 |
0.2 |
2 |
4.7 |
3.2 |
NaN |
0.2 |
3 |
7.0 |
3.2 |
4.7 |
1.4 |
4 |
6.4 |
3.2 |
4.5 |
1.5 |
5 |
6.9 |
3.1 |
4.9 |
NaN |
6 |
NaN |
NaN |
NaN |
NaN |
df = df.dropna(thresh=4)
df
|
c1 |
c2 |
c3 |
c4 |
1 |
4.9 |
3.0 |
1.4 |
0.2 |
3 |
7.0 |
3.2 |
4.7 |
1.4 |
4 |
6.4 |
3.2 |
4.5 |
1.5 |
df.index = ['nick','jason','tank']
df.to_csv('test1.csv')
pandas基础中的基础,一定要学会, <奥卡姆剃刀>
df
|
ttery |
issue |
code |
code1 |
code2 |
time |
0 |
min |
20130801-3391 |
8,4,5,2,9 |
297734529 |
NaN |
1013395466000 |
1 |
min |
20130801-3390 |
7,8,2,1,2 |
298058212 |
NaN |
1013395406000 |
2 |
min |
20130801-3389 |
5,9,1,2,9 |
298329129 |
NaN |
1013395346000 |
3 |
min |
20130801-3388 |
3,8,7,3,3 |
298588733 |
NaN |
1013395286000 |
4 |
min |
20130801-3387 |
0,8,5,2,7 |
298818527 |
NaN |
1013395226000 |