pandas是基于numpy构建的库,在数据处理方面可以把它理解为numpy的加强版。numpy主要用于科学计算,并不擅长处理带标签的数据;而我们平常处理的数据一般都带有列标签和index索引,因此pandas作为数据分析包被开发出来。
pandas数据结构(Series/DataFrame)
一、Series
In [2]:
import pandas as pd
import numpy as np
In [3]:
# Create a Series
a1 = pd.Series([1, 2, 3]) # build a Series from a Python list
a1
Out[3]:
In [4]:
a2 = pd.Series(np.array([1, 2, 3])) # build a Series from a NumPy array
a2
Out[4]:
In [5]:
a3 = pd.Series([1, 2, 3], index=["index1", "index2", "index3"]) # build a Series with explicit index labels
a3
Out[5]:
In [6]:
a4 = pd.Series({"index1": 1, "index2": 2, "index3": 3}) # build a Series from a dict; the keys become the index labels
a4
Out[6]:
In [8]:
# Build a Series from a dict while also passing an explicit index: labels that
# are missing from the dict ("index1" here) come out as NaN.
a5 = pd.Series(
    data={"index": 1, "index2": 2, "index3": 3},
    index=["index1", "index2", "index3"],
)
a5
Out[8]:
In [9]:
# A scalar value is repeated for every label in the given index.
a6 = pd.Series(10, index=["index1", "index2", "index3"])
a6
Out[9]:
In [10]:
a1 = pd.Series([1, 2, 3])
a1.index # the index of the Series
Out[10]:
In [12]:
a1.values # the values of the Series
Out[12]:
In [13]:
a1.name = "population" # set the name of the Series
a1.index.name = "state" # set the name of the Series index
a1
Out[13]:
In [14]:
a1.shape
Out[14]:
In [15]:
a1.size
Out[15]:
访问单个元素
s[indexname]
s.loc[indexname] 推荐
s[loc]
s.iloc[loc] 推荐
访问多个元素
s[[indexname1,indexname2]]
s.loc[[indexname1,indexname2]] 推荐
s[[loc1,loc2]]
s.iloc[[loc1,loc2]] 推荐
In [17]:
a3 = pd.Series([1, 2, 3], index=["index1", "index2", "index3"])
a3
Out[17]:
In [18]:
a3["index1"]
Out[18]:
In [19]:
a3.loc['index1']
Out[19]:
In [20]:
# Positional lookup through plain []; the access notes above recommend .iloc for this.
# NOTE(review): newer pandas releases deprecate integer-position [] on a
# label-indexed Series — confirm against the pandas version in use.
a3[1]
Out[20]:
In [22]:
a3.iloc[1]
Out[22]:
In [23]:
a3[['index1','index2']]
Out[23]:
In [24]:
a3.loc[['index1','index2']]
Out[24]:
In [25]:
# Positional lookup of several elements through plain []; the access notes above recommend .iloc for this.
# NOTE(review): newer pandas releases deprecate integer-position [] on a
# label-indexed Series — confirm against the pandas version in use.
a3[[1,2]]
Out[25]:
In [26]:
a3.iloc[[1,2]]
Out[26]:
In [27]:
a3[a3 > np.mean(a3)] # boolean-mask selection: keep the elements greater than the mean
Out[27]:
In [28]:
a3[0:2] # slice by position
Out[28]:
In [30]:
a3["index1":"index2"] # slice by index label
Out[30]:
In [32]:
# Modify elements
a3["index3"] = 100 # modify an element by index label
a3
Out[32]:
In [33]:
# Modify an element by position. .iloc is used because plain [] with an
# integer key on a label-indexed Series only works through a deprecated
# positional fallback.
a3.iloc[2] = 1000
a3
Out[33]:
In [34]:
# Add elements
a3["index4"] = 10 # assigning to a new index label adds an element
a3
Out[34]:
In [35]:
a3.drop(["index4", "index3"], inplace=True) # inplace=True modifies this Series itself
a3
Out[35]:
In [36]:
# Create the Series as float64 up front so that storing NaN below needs no
# implicit int -> float upcast.
a3 = pd.Series([1, 2, 3], index=["index1", "index2", "index3"], dtype="float64")
# Overwrite an existing element with a missing value. np.nan is used because
# the np.NaN alias was removed in NumPy 2.0.
a3["index3"] = np.nan
a3
Out[36]:
In [37]:
a3.isnull() # element-wise check for missing values
Out[37]:
In [38]:
a3.notnull() # element-wise check for non-missing values
Out[38]:
In [39]:
"index1" in a3 # check whether an index label exists in the Series
Out[39]:
In [47]:
a3.isin([1,2]) # element-wise check: is each value one of [1, 2]?
Out[47]:
In [48]:
a3.unique() # the distinct values in the Series
Out[48]:
In [49]:
a3.value_counts() # the distinct values and how often each occurs
Out[49]:
In [50]:
# Build a DataFrame from a dict that maps each column label to its values.
data = {
    "color": ["green", "red", "blue", "black", "yellow"],
    "price": [1, 2, 3, 4, 5],
}
dataFrame1 = pd.DataFrame(data)
dataFrame1
Out[50]:
In [51]:
# Same data as above, this time with explicit row labels.
dataFrame2 = pd.DataFrame(data=data, index=["index1", "index2", "index3", "index4", "index5"])
dataFrame2
Out[51]:
In [52]:
dataFrame3 = pd.DataFrame(data=data, index=["index1", "index2", "index3", "index4", "index5"],
columns=["price"]) # restrict the frame to the listed column labels
dataFrame3
Out[52]:
In [53]:
dataFrame4 = pd.DataFrame(data=np.arange(12).reshape(3, 4)) # build a DataFrame from a NumPy array
dataFrame4
Out[53]:
In [54]:
# Column-oriented input: each dict key is a column label and each list holds
# that column's values; index supplies the row labels.
dic = {
'张三':[150,150,150,300],
'李四':[0,0,0,0]
}
pd.DataFrame(data=dic,index=['语文','数学','英语','理综'])
Out[54]:
In [56]:
# Row-oriented input: each inner list is one row; index and columns supply
# the row and column labels.
data = [[0,150],[0,150],[0,150],[0,300]]
index = ['语文','数学','英语','理综']
columns = ['李四','张三']
pd.DataFrame(data=data,index=index,columns=columns)
Out[56]:
1.2通过Series创建
In [59]:
# Series built from a dict: the city names become the index labels.
cars = pd.Series({"Beijing": 300000, "Shanghai": 350000, "Shenzhen": 300000, "Tianjian": 200000, "Guangzhou": 250000,
"Chongqing": 150000})
cars
Out[59]:
In [60]:
# The None entry for "Lanzhou" ends up as a missing value in the Series.
cities = {"Shanghai": 90000, "Foshan": 4500, "Dongguan": 5500, "Beijing": 6600, "Nanjing": 8000, "Lanzhou": None}
apts = pd.Series(cities, name="price")
apts
Out[60]:
In [61]:
# Each Series becomes one column; rows are aligned on the union of the two
# indexes, and a city present in only one Series gets NaN in the other column.
df = pd.DataFrame({"apts": apts, "cars": cars})
df
Out[61]:
1.3通过dicts的list来构建DataFrame
In [62]:
# Each dict in the list becomes one row; the dict keys become the column labels.
data = [{"Beijing": 1000, "Shanghai": 2500, "Nanjing": 9850}, {"Beijing": 5000, "Shanghai": 4600, "Nanjing": 7000}]
pd.DataFrame(data)
Out[62]:
In [66]:
dataFrame2.columns # all column labels of the DataFrame
Out[66]:
In [67]:
dataFrame2.index # all row labels of the DataFrame
Out[67]:
In [68]:
dataFrame2.values # all values of the DataFrame
Out[68]:
In [72]:
dataFrame2["color"]["index1"] # look up one value with chained []: column first, then row label (the other order raises an error)
Out[72]:
In [73]:
dataFrame2.at["index1", "color"] # look up one value by label: row first, then column (the other order raises an error)
Out[73]:
In [79]:
dataFrame2.iat[0, 1] # look up one value by integer position
Out[79]:
In [89]:
data = {"color": ["green", "red", "blue", "black", "yellow"], "price": [1, 2, 3, 4, 5]}
dataFrame2 = pd.DataFrame(data=data, index=["index1", "index2", "index3", "index4", "index5"])
dataFrame2
Out[89]:
In [91]:
dataFrame2.loc["index1"] # select one row by label
Out[91]:
In [92]:
dataFrame2.iloc[0] # select one row by position
Out[92]:
In [96]:
dataFrame2.iloc[0:2] # iloc selects rows and columns purely by position, much like indexing a 2-D NumPy ndarray
Out[96]:
In [100]:
dataFrame2.loc[:, "price"] # select one column by label
Out[100]:
In [101]:
dataFrame2.iloc[:, 0] # select one column by position
Out[101]:
In [102]:
dataFrame2.values[0] # first row of the underlying value array
Out[102]:
In [103]:
dataFrame2["price"] # select one column by name; plain [] with a name looks up columns, not rows
Out[103]:
In [104]:
dataFrame2["color"]
Out[104]:
In [106]:
dataFrame2.head(5) # show the first 5 rows
Out[106]:
In [107]:
dataFrame2.tail(5) # show the last 5 rows
Out[107]:
In [108]:
dataFrame2["index1":"index4"] # slice several rows by index label
Out[108]:
In [109]:
dataFrame2[0:4] # slice several rows by position
Out[109]:
In [111]:
dataFrame2.loc[["index1", "index2"]] # select several rows by label
Out[111]:
In [113]:
dataFrame2.iloc[[0, 1]] # select several rows by position
Out[113]:
In [114]:
dataFrame2.loc[:, ["price"]] # select columns with a list of labels (a single column here)
Out[114]:
In [115]:
dataFrame2.iloc[:, [0, 1]] # select several columns by position
Out[115]:
In [116]:
dataFrame2.loc[["index1", "index3"], ["price"]] # select by row labels and column labels
Out[116]:
In [117]:
dataFrame2.iloc[[1, 2], [0]] # select by row positions and column positions
Out[117]:
In [120]:
# Assigning to a row label that does not exist yet appends a new row; every
# column of that row gets the value 10.
dataFrame2.loc["index6"]=10
dataFrame2
Out[120]:
In [123]:
# Overwrite the row at position 5 (the sixth row) with 10 in every column.
dataFrame2.iloc[5] = 10
dataFrame2
Out[123]:
In [125]:
dataFrame2.loc["index7"] = 100
dataFrame2
Out[125]:
In [129]:
# Assigning to a column label that does not exist yet adds the column; every
# row gets the value "small".
dataFrame2.loc[:, "size"] = "small"
dataFrame2
Out[129]:
In [130]:
dataFrame2.iloc[:, 2] = 10
dataFrame2
Out[130]:
In [132]:
dataFrame2.iloc[0, 1] = 10
dataFrame2
Out[132]:
In [133]:
dataFrame2.at["index1", "price"] = 100
dataFrame2
Out[133]:
In [135]:
dataFrame2.iat[0, 1] = 1000
dataFrame2
Out[135]:
In [141]:
# With inplace=False, drop returns a new frame without the "price" column and
# leaves dataFrame2 itself unchanged.
a=dataFrame2.drop(["price"], axis=1, inplace=False)
dataFrame2
Out[141]:
In [142]:
a
Out[142]:
8.1删除NaN数据
In [151]:
re=df.dropna(axis=1, inplace=False) # drop every column that contains NaN; inplace defaults to False, so df itself is unchanged
df
Out[151]:
In [152]:
re
Out[152]:
In [157]:
# df1 is concatenated with df2 and df3 in the cells below, but no earlier cell
# creates it. Build it here with the same pattern as its siblings: a 3x4
# frame filled with a constant, columns a-d.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df2
Out[157]:
In [158]:
df3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])
df3
Out[158]:
In [159]:
# ignore_index=True discards the original row labels and renumbers the result
pd.concat([df1, df2, df3], axis=0, ignore_index=True)
Out[159]:
In [160]:
# ignore_index=False keeps each frame's original row labels in the result
pd.concat([df1, df2, df3], axis=0, ignore_index=False)
Out[160]:
join参数用法
In [164]:
# Two 3x4 frames that overlap only partly: they share columns b, c, d and
# row labels 2, 3.
df1 = pd.DataFrame(
    np.full((3, 4), 0.0),
    index=[1, 2, 3],
    columns=['a', 'b', 'c', 'd'],
)
df2 = pd.DataFrame(
    np.full((3, 4), 1.0),
    index=[2, 3, 4],
    columns=['b', 'c', 'd', 'e'],
)
# join defaults to 'outer': columns that are not shared are filled with NaN
pd.concat([df1, df2], join='outer', sort=False)
Out[164]:
In [166]:
# join='inner' keeps only the columns shared by both frames
pd.concat([df1, df2], sort=False, join='inner',ignore_index=True)
Out[166]:
join_axes参数用法(注:join_axes参数已在pandas 1.0中移除,可对合并结果调用reindex实现相同效果)
In [167]:
# Concatenate column-wise and keep only the rows of df1's index. The join_axes
# argument was removed from pd.concat in pandas 1.0; reindexing the result is
# the documented replacement.
pd.concat([df1, df2], axis=1).reindex(df1.index)
Out[167]:
9.2 append函数(注:DataFrame.append已在pandas 2.0中移除,推荐使用pd.concat)
In [169]:
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the
# same way, and ignore_index=True renumbers the result from 0.
re = pd.concat([df1, df2], ignore_index=True)
re
Out[169]:
append一组数据
In [170]:
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
s = pd.Series([4, 4, 4, 4], index=['a', 'b', 'c', 'd'])
# DataFrame.append was removed in pandas 2.0. Turn the Series into a one-row
# frame (its labels become the columns) and concatenate that row instead.
re = pd.concat([df1, s.to_frame().T], ignore_index=True)
re
Out[170]:
In [172]:
df2
Out[172]:
In [173]:
# NOTE(review): no visible cell builds df1/df2 with a 'KEY' column; the frames
# defined in the preceding cells only have columns a-d, so this merge appears
# to rely on a defining cell that is missing here — confirm.
re = pd.merge(df1, df2, on='KEY')
re
Out[173]:
基于某两列进行合并
In [175]:
# Merge on a pair of key columns: rows match only when both KEY1 and KEY2 agree.
df1 = pd.DataFrame({
    'A': ['A1', 'A2', 'A3'],
    'B': ['B1', 'B2', 'B3'],
    'KEY1': ['K1', 'K2', 'K0'],
    'KEY2': ['K0', 'K1', 'K3'],
})
df2 = pd.DataFrame({
    'C': ['C1', 'C2', 'C3'],
    'D': ['D1', 'D2', 'D3'],
    'KEY1': ['K0', 'K2', 'K1'],
    'KEY2': ['K1', 'K1', 'K0'],
})
# how accepts 'left', 'right', 'outer' or 'inner'
re = df1.merge(df2, how='inner', on=['KEY1', 'KEY2'])
re
Out[175]:
按index合并
In [176]:
# Merge on the row labels instead of on a column; the outer join keeps the
# labels that appear in either frame.
df1 = pd.DataFrame(
    {
        'A': ['A1', 'A2', 'A3'],
        'B': ['B1', 'B2', 'B3'],
    },
    index=['K0', 'K1', 'K2'],
)
df2 = pd.DataFrame(
    {
        'C': ['C1', 'C2', 'C3'],
        'D': ['D1', 'D2', 'D3'],
    },
    index=['K0', 'K1', 'K3'],
)
re = df1.merge(df2, how='outer', left_index=True, right_index=True)
re
Out[176]:
为列加后缀
In [177]:
# Both frames have an 'age' column; suffixes tells merge how to rename the
# clashing columns coming from the left and the right frame.
df_boys = pd.DataFrame({
    'id': ['1', '2', '3'],
    'age': ['23', '25', '18'],
})
df_girls = pd.DataFrame({
    'id': ['1', '2', '3'],
    'age': ['18', '18', '18'],
})
re = df_boys.merge(df_girls, on='id', suffixes=('_boys', '_girls'))
re
Out[177]: