1. pandas 的 Series 使用介绍
1.1 Series数据结构
import pandas as pd

# pd.Series?   (IPython-only help lookup; use help(pd.Series) in plain Python)

# A list of strings becomes a Series with dtype ``object``.
animals = ['Tiger', 'Bear', 'Moose']
pd.Series(animals)
# output:
"""
0    Tiger
1     Bear
2    Moose
dtype: object
"""

# A None inside a numeric list shows up as NaN, and the dtype is float64.
numbers = [1, 2, None]
pd.Series(numbers)
# output:
"""
0    1.0
1    2.0
2    NaN
dtype: float64
"""
1.2 numpy使用
import numpy as np

# NaN is not equal to None ...
np.nan == None  # noqa: E711 -- the equality comparison is the point of the example
# output: False

# ... and NaN is not even equal to itself.
np.nan == np.nan
# output: False

# Use np.isnan() to test for NaN instead of ``==``.
np.isnan(np.nan)
# output: True
1.3 字典和Series使用
# Building a Series from a dict: the keys become the index.
sports = {'Archery': 'Bhutan',
          'Golf': 'Scotland',
          'Sumo': 'Japan',
          'Taekwondo': 'South Korea'}
s = pd.Series(sports)
s
# output:
"""
Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object
"""

# The index
s.index
"""
Index(['Archery', 'Golf', 'Sumo', 'Taekwondo'], dtype='object')
"""

# The index can also be passed explicitly alongside a list of values.
s = pd.Series(['Tiger', 'Bear', 'Moose'], index=['India', 'America', 'Canada'])
s
# output:
"""
India      Tiger
America     Bear
Canada     Moose
dtype: object
"""

# When both a dict and an index are given, only the listed keys are kept;
# a label that is missing from the dict ('Hockey') gets NaN.
sports = {'Archery': 'Bhutan',
          'Golf': 'Scotland',
          'Sumo': 'Japan',
          'Taekwondo': 'South Korea'}
s = pd.Series(sports, index=['Golf', 'Sumo', 'Hockey'])
s
# output:
"""
Golf      Scotland
Sumo         Japan
Hockey         NaN
dtype: object
"""
2. 对Series的索引操作
# Series used by the indexing examples that follow.
sports = {'Archery': 'Bhutan',
          'Golf': 'Scotland',
          'Sumo': 'Japan',
          'Taekwondo': 'South Korea'}
s = pd.Series(sports)
s
# output:
"""
Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object
"""
2.1 iloc[] 和 loc[] 索引器
# .iloc and .loc are indexers: they take square brackets, not call parentheses.
s.iloc[3]  # lookup by integer position
# output: 'South Korea'

s.loc['Golf']  # lookup by index label (key)
# output: 'Scotland'
2.2 关于向量化操作
# This creates a big series of 10000 random integers drawn from [0, 1000).
s = pd.Series(np.random.randint(0, 1000, 10000))
s.head()
# output (values are random and differ on every run):
"""
0    396
1    779
2    752
3     30
4    493
dtype: int64
"""
时间对比 :
import numpy as np

# NOTE: ``%%timeit`` is an IPython cell magic, not Python syntax, and it must
# be the first line of its own notebook cell. The magics are kept below as
# comments so this snippet stays valid Python; to reproduce the timings, run
# each section as a separate Jupyter cell with the magic line uncommented.

# --- cell 1: explicit Python loop ---
# %%timeit -n 100
summary = 0
for item in s:
    summary += item
# 100 loops, best of 3: 1.87 ms per loop

# --- cell 2: vectorized operation ---
# %%timeit -n 100
summary = np.sum(s)
# 100 loops, best of 3: 100 µs per loop

# --- broadcasting ---
s += 2  # adds two to each item in s using broadcasting
s.head()
"""
0    398
1    781
2    754
3     32
4    495
dtype: int64
"""
迭代:
# Element-by-element update, shown for contrast with broadcasting.
# Series.iteritems() and Series.set_value() no longer exist in current pandas;
# Series.items() and the scalar accessor .at are their replacements.
for label, value in s.items():
    s.at[label] = value + 2
s.head()
# output:
"""
0    400
1    783
2    756
3     34
4    497
dtype: int64
"""
时间对比:
# NOTE: ``%%timeit`` is an IPython cell magic, not Python syntax, and it must
# be the first line of its own notebook cell. The magics are kept below as
# comments so this snippet stays valid Python; to reproduce the timings, run
# each section as a separate Jupyter cell with the magic line uncommented.

# --- cell 1: the iteration approach ---
# %%timeit -n 10
s = pd.Series(np.random.randint(0, 1000, 10000))
# Series.iteritems() was removed from pandas; Series.items() replaces it.
for label, value in s.items():
    s.loc[label] = value + 2
# time: 10 loops, best of 3: 1.62 s per loop

# --- cell 2: broadcasting ---
# %%timeit -n 10
s = pd.Series(np.random.randint(0, 1000, 10000))
s += 2
# 10 loops, best of 3: 472 µs per loop
2.3 元素操作
s = pd.Series([1, 2, 3])
# Assigning to a label that does not exist yet appends a new element; the
# index and the values now mix integers and strings, so the dtype is object.
s.loc['Animal'] = 'Bears'
s
# output:
"""
0             1
1             2
2             3
Animal    Bears
dtype: object
"""
2.4 Series的合并:pd.concat()(Series.append() 已在 pandas 2.0 中移除)
original_sports = pd.Series({'Archery': 'Bhutan',
                             'Golf': 'Scotland',
                             'Sumo': 'Japan',
                             'Taekwondo': 'South Korea'})
# Index labels do not have to be unique: all four entries share 'Cricket'.
cricket_loving_countries = pd.Series(['Australia',
                                      'Barbados',
                                      'Pakistan',
                                      'England'],
                                     index=['Cricket',
                                            'Cricket',
                                            'Cricket',
                                            'Cricket'])
# Series.append() was removed in pandas 2.0; pd.concat() is the replacement.
# Like append(), it returns a new Series and leaves both inputs untouched.
all_countries = pd.concat([original_sports, cricket_loving_countries])
original_sports未发生改变
# The original Series is unchanged.
print(original_sports)
"""
Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object
"""
cricket_loving_countries的值:
print(cricket_loving_countries)
"""
Cricket    Australia
Cricket     Barbados
Cricket     Pakistan
Cricket      England
dtype: object
"""
all_countries的值
print(all_countries)
"""
Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
Cricket        Australia
Cricket         Barbados
Cricket         Pakistan
Cricket          England
dtype: object
"""

# Selecting a duplicated label returns every matching entry as a Series.
print(all_countries.loc['Cricket'])
"""
Cricket    Australia
Cricket     Barbados
Cricket     Pakistan
Cricket      England
dtype: object
"""
3. DataFrame数据结构
3.1 可以看做是多维的Series.
import pandas as pd

# Each purchase is a Series; its keys become the DataFrame's column labels.
purchase_1 = pd.Series({'Name': 'Chris',
                        'Item Purchased': 'Dog Food',
                        'Cost': 22.50})
purchase_2 = pd.Series({'Name': 'Kevyn',
                        'Item Purchased': 'Kitty Litter',
                        'Cost': 2.50})
purchase_3 = pd.Series({'Name': 'Vinod',
                        'Item Purchased': 'Bird Seed',
                        'Cost': 5.00})
# One row per Series; the row labels may repeat ('Store 1' appears twice).
df = pd.DataFrame([purchase_1, purchase_2, purchase_3],
                  index=['Store 1', 'Store 1', 'Store 2'])
print(df.head())
# NOTE: recorded output. The columns appear alphabetically sorted here; newer
# pandas versions may instead print them in insertion order
# (Name, Item Purchased, Cost).
"""
         Cost Item Purchased   Name
Store 1  22.5       Dog Food  Chris
Store 1   2.5   Kitty Litter  Kevyn
Store 2   5.0      Bird Seed  Vinod
"""
3.2 loc[] 操作
# A label that matches a single row comes back as a Series.
print(df.loc['Store 2'])
"""
Cost                      5
Item Purchased    Bird Seed
Name                  Vinod
Name: Store 2, dtype: object
"""

# A label that matches several rows comes back as a DataFrame.
print(df.loc['Store 1'])
"""
         Cost Item Purchased   Name
Store 1  22.5       Dog Food  Chris
Store 1   2.5   Kitty Litter  Kevyn
"""

# Row label and column label together.
df.loc['Store 1', 'Cost']
"""
Store 1    22.5
Store 1     2.5
Name: Cost, dtype: float64
"""
3.3 转置
# .T swaps rows and columns.
print(df.T)
"""
                 Store 1       Store 1    Store 2
Cost                22.5           2.5          5
Item Purchased  Dog Food  Kitty Litter  Bird Seed
Name               Chris         Kevyn      Vinod
"""

# After transposing, 'Cost' is a row label, so .loc can select it.
print(df.T.loc['Cost'])
"""
Store 1    22.5
Store 1     2.5
Store 2       5
Name: Cost, dtype: object
"""

# Selecting the column directly keeps the float64 dtype.
print(df['Cost'])
"""
Store 1    22.5
Store 1     2.5
Store 2     5.0
Name: Cost, dtype: float64
"""

# Chained selection: rows first, then the column.
print(df.loc['Store 1']['Cost'])
"""
Store 1    22.5
Store 1     2.5
Name: Cost, dtype: float64
"""

# All rows, a chosen subset of columns.
print(df.loc[:, ['Name', 'Cost']])
"""
          Name  Cost
Store 1  Chris  22.5
Store 1  Kevyn   2.5
Store 2  Vinod   5.0
"""
3.4 关于drop()方法
# drop() returns a new DataFrame without the rows labelled 'Store 1'.
print(df.drop('Store 1'))
"""
         Cost Item Purchased   Name
Store 2   5.0      Bird Seed  Vinod
"""

# ... but the original df is unchanged.
print(df)
"""
         Cost Item Purchased   Name
Store 1  22.5       Dog Food  Chris
Store 1   2.5   Kitty Litter  Kevyn
Store 2   5.0      Bird Seed  Vinod
"""
3.5 copy()方法
# Work on a copy so that df itself is left untouched.
copy_df = df.copy()
copy_df = copy_df.drop('Store 1')
print(copy_df)
# No 'Location' column appears here: nothing has added one to df or copy_df
# at this point.
"""
         Cost Item Purchased   Name
Store 2   5.0      Bird Seed  Vinod
"""

# copy_df.drop?   (IPython-only help lookup; use help(copy_df.drop) in plain Python)
3.6 del 操作和加列操作
# ``del`` removes a column in place.
del copy_df['Name']
print(copy_df)
"""
         Cost Item Purchased
Store 2   5.0      Bird Seed
"""

# Assigning a scalar to a new column label adds that column, with the value
# repeated in every row. Only df gets the column; copy_df does not.
df['Location'] = None
print(df)
"""
         Cost Item Purchased   Name Location
Store 1  22.5       Dog Food  Chris     None
Store 1   2.5   Kitty Litter  Kevyn     None
Store 2   5.0      Bird Seed  Vinod     None
"""