import pandas as pd
from pandas import Series
# Load the complete CSV file into a DataFrame and show it.
result = pd.read_csv(filepath_or_buffer="examples/ex6.csv")
print(result)
'''
one two three four key
0 0.467976 -0.038649 -0.295344 -1.824726 L
1 -0.358893 1.404453 0.704965 -0.200638 B
……
9999 -0.096376 -1.012999 -0.657431 -0.573315 0
[10000 rows x 5 columns]
'''
# To load only the first few rows, cap the number of rows with nrows.
result = pd.read_csv(
    filepath_or_buffer="examples/ex6.csv",
    nrows=5,
)
print(result)
'''
one two three four key
0 0.467976 -0.038649 -0.295344 -1.824726 L
1 -0.358893 1.404453 0.704965 -0.200638 B
2 -0.501840 0.659254 -0.421691 -0.057688 G
3 0.204886 1.074134 1.388361 -0.982404 R
4 0.354628 -0.133116 0.283763 -0.837063 Q
'''
# Read the file in chunks: passing chunksize (a number of rows) makes
# read_csv return an iterable reader instead of a single DataFrame.
chunker = pd.read_csv("examples/ex6.csv", chunksize=1000)
'''
<pandas.io.parsers.TextFileReader object at 0x007570F0>
'''
# Running total of how often each value of the 'key' column occurs.
# The dtype is given explicitly: an empty Series without a dtype triggers a
# warning on older pandas versions and defaults to object dtype on newer ones,
# whereas the accumulated counts are expected to be float64.
tot = Series([], dtype="float64")
for piece in chunker:
    # With chunksize=1000 each piece is a DataFrame holding up to 1000 rows.
    # value_counts() is a shortcut that lists the distinct values in a column
    # together with how many times each one occurs.
    # NaN plus anything is NaN, so a key that is present on only one side of
    # the addition would end up as NaN; fill_value=0 substitutes 0 for the
    # missing side before adding the values that share the same index label.
    tot = tot.add(piece['key'].value_counts(), fill_value=0)
# Sort once after all chunks are accumulated, most frequent key first.
tot = tot.sort_values(ascending=False)
print(tot[:10])
'''
E 368.0
X 364.0
L 346.0
O 343.0
Q 340.0
M 338.0
J 337.0
F 335.0
K 334.0
H 330.0
dtype: float64
'''
# Demonstration of a.add(b, fill_value=0) compared with plain a.add(b).
import pandas as pd
import numpy as np

a = pd.Series([1, 2, 3, np.nan], index=['a', 'b', 'c', 'd'])
b = pd.Series([1.1, np.nan, 3.1, np.nan], index=['a', 'b', 'd', 'e'])

# With fill_value=0, a value that is missing (or NaN) on exactly one side is
# replaced by 0 before adding, so 'b', 'c' and 'd' keep the value from the
# side that has one.  Label 'e' is missing from a and NaN in b, i.e. missing
# on both sides, so its result is still NaN.
print(a.add(b, fill_value=0))
'''
a 2.1
b 2.0
c 3.0
d 3.1
e NaN
dtype: float64
'''
# Without fill_value, any label that is missing or NaN on either side
# produces NaN; only 'a' has a real value in both Series.
print(a.add(b))
'''
a 2.1
b NaN
c NaN
d NaN
e NaN
dtype: float64
'''