pandas 基础
Series
import pandas as pd
from pandas import Series, DataFrame
obj = Series([4, -7, 5, 3])
obj
0 4
1 -7
2 5
3 3
dtype: int64
obj.values
array([ 4, -7, 5, 3], dtype=int64)
obj.index
RangeIndex(start=0, stop=4, step=1)
obj[[1,3]]
# 跳着选取数据
1 -7
3 3
dtype: int64
obj[1:3]
1 -7
2 5
dtype: int64
pd.isnull(obj)
0 False
1 False
2 False
3 False
dtype: bool
obj.reindex(range(5), method = 'ffill')
0 4
1 -7
2 5
3 3
4 3
dtype: int64
DataFrame
data = {'state': ['asd','qwe','sdf','ert'],
'year': [2000, 2001, 2002, 2003],
'pop': [1.5,1.7,3.6,2.4]}
data = DataFrame(data)
data
|   | pop | state | year |
|---|---|---|---|
| 0 | 1.5 | asd | 2000 |
| 1 | 1.7 | qwe | 2001 |
| 2 | 3.6 | sdf | 2002 |
| 3 | 2.4 | ert | 2003 |
data.year
# 比R里提取列要方便点
0 2000
1 2001
2 2002
3 2003
Name: year, dtype: int64
data['debt'] = range(4)
data
|   | pop | state | year | debt |
|---|---|---|---|---|
| 0 | 1.5 | asd | 2000 | 0 |
| 1 | 1.7 | qwe | 2001 | 1 |
| 2 | 3.6 | sdf | 2002 | 2 |
| 3 | 2.4 | ert | 2003 | 3 |
a = data.index
a[1] = 6
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-9-57677294f950> in <module>()
1 a = data.index
----> 2 a[1] = 6
F:\Anaconda\lib\site-packages\pandas\core\indexes\base.py in __setitem__(self, key, value)
1668
1669 def __setitem__(self, key, value):
-> 1670 raise TypeError("Index does not support mutable operations")
1671
1672 def __getitem__(self, key):
TypeError: Index does not support mutable operations
data.columns
Index(['pop', 'state', 'year', 'debt'], dtype='object')
- .ix标签索引功能,可同时输入行和列进行选取(注意:.ix 自 pandas 0.20 起已弃用,并在 1.0 中移除,新版本请改用 .loc / .iloc)
- 不加.ix只能选取其中的某列或某行,不能列与行同时选取
data[:3]
|   | pop | state | year | debt |
|---|---|---|---|---|
| 0 | 1.5 | asd | 2000 | 0 |
| 1 | 1.7 | qwe | 2001 | 1 |
| 2 | 3.6 | sdf | 2002 | 2 |
data.ix[:,:3]
|   | pop | state | year |
|---|---|---|---|
| 0 | 1.5 | asd | 2000 |
| 1 | 1.7 | qwe | 2001 |
| 2 | 3.6 | sdf | 2002 |
| 3 | 2.4 | ert | 2003 |
- 删除某行或某列用drop,axis = 0表示行,1表示列
- drop返回一个新对象,删除后原数据不变
data.drop(0,axis=0)
|   | pop | state | year | debt |
|---|---|---|---|---|
| 1 | 1.7 | qwe | 2001 | 1 |
| 2 | 3.6 | sdf | 2002 | 2 |
| 3 | 2.4 | ert | 2003 | 3 |
data.drop('year', axis=1)
|   | pop | state | debt |
|---|---|---|---|
| 0 | 1.5 | asd | 0 |
| 1 | 1.7 | qwe | 1 |
| 2 | 3.6 | sdf | 2 |
| 3 | 2.4 | ert | 3 |
data
|   | pop | state | year | debt |
|---|---|---|---|---|
| 0 | 1.5 | asd | 2000 | 0 |
| 1 | 1.7 | qwe | 2001 | 1 |
| 2 | 3.6 | sdf | 2002 | 2 |
| 3 | 2.4 | ert | 2003 | 3 |
import numpy as np
df = DataFrame(np.arange(9).reshape(3, 3))
df
|   | 0 | 1 | 2 |
|---|---|---|---|
| 0 | 0 | 1 | 2 |
| 1 | 3 | 4 | 5 |
| 2 | 6 | 7 | 8 |
- applymap()可以对dataframe每一个元素运用函数(pandas 2.1 起该方法更名为 DataFrame.map)
- apply()可以对每一行或每一列(即一维数组)运用函数
df.applymap(lambda x: '%.2f' % x)
|   | 0 | 1 | 2 |
|---|---|---|---|
| 0 | 0.00 | 1.00 | 2.00 |
| 1 | 3.00 | 4.00 | 5.00 |
| 2 | 6.00 | 7.00 | 8.00 |
data.sort_values(by='pop')
# 对某一列排序
|   | pop | state | year | debt |
|---|---|---|---|---|
| 0 | 1.5 | asd | 2000 | 0 |
| 1 | 1.7 | qwe | 2001 | 1 |
| 3 | 2.4 | ert | 2003 | 3 |
| 2 | 3.6 | sdf | 2002 | 2 |
data.describe()
|   | pop | year | debt |
|---|---|---|---|
| count | 4.000000 | 4.000000 | 4.000000 |
| mean | 2.300000 | 2001.500000 | 1.500000 |
| std | 0.948683 | 1.290994 | 1.290994 |
| min | 1.500000 | 2000.000000 | 0.000000 |
| 25% | 1.650000 | 2000.750000 | 0.750000 |
| 50% | 2.050000 | 2001.500000 | 1.500000 |
| 75% | 2.700000 | 2002.250000 | 2.250000 |
| max | 3.600000 | 2003.000000 | 3.000000 |
df.isin([1])
|   | 0 | 1 | 2 |
|---|---|---|---|
| 0 | False | True | False |
| 1 | False | False | False |
| 2 | False | False | False |
- None、NaN都会被当作缺失值(NA)处理
- df.shape是属性,不加括号,相当于R里的dim()
df.shape
(3, 3)
df.ix[:1, :1] = None
df
|   | 0 | 1 | 2 |
|---|---|---|---|
| 0 | NaN | NaN | 2 |
| 1 | NaN | NaN | 5 |
| 2 | 6.0 | 7.0 | 8 |
df.fillna({0:11, 1:22})
|   | 0 | 1 | 2 |
|---|---|---|---|
| 0 | 11.0 | 22.0 | 2 |
| 1 | 11.0 | 22.0 | 5 |
| 2 | 6.0 | 7.0 | 8 |
df
|   | 0 | 1 | 2 |
|---|---|---|---|
| 0 | NaN | NaN | 2 |
| 1 | NaN | NaN | 5 |
| 2 | 6.0 | 7.0 | 8 |
df.fillna({0:11, 1:22}, inplace=True)
|   | 0 | 1 | 2 |
|---|---|---|---|
| 0 | 11.0 | 22.0 | 2 |
| 1 | 11.0 | 22.0 | 5 |
| 2 | 6.0 | 7.0 | 8 |
df
|   | 0 | 1 | 2 |
|---|---|---|---|
| 0 | 11.0 | 22.0 | 2 |
| 1 | 11.0 | 22.0 | 5 |
| 2 | 6.0 | 7.0 | 8 |