一.numpy二维数组
1.声明
import numpy as np

# Each inner list is one row of the 2-D array (10 rows x 5 columns).
ridership = np.array([
    [0, 0, 2, 5, 0],
    [1478, 3877, 3674, 2328, 2539],
    [1613, 4088, 3991, 6461, 2691],
    [1560, 3392, 3826, 4787, 2613],
    [1608, 4802, 3932, 4477, 2705],
    [1576, 3933, 3909, 4979, 2685],
    [95, 229, 255, 496, 201],
    [2, 0, 1, 27, 0],
    [1438, 3785, 3589, 4174, 2215],
    [1342, 4043, 4009, 4665, 3033],
])
2.取值
# print(...) with a single argument works on both Python 2 and Python 3.
print(ridership[1, 3])                    # value at second row, fourth column
print(ridership[1:3, 3:5])                # sub-array: rows 1-2, columns 3-4
print(ridership[1, :])                    # the whole second row
print(ridership[0, :] + ridership[1, :])  # element-wise sum of the first and second rows
print(ridership[:, 0] + ridership[:, 1])  # element-wise sum of the first and second columns
3.numpy二维数组之间相加
两个形状相同的数组相加时,对应位置的元素逐个相加(与矩阵加法的规则一致)
a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
b = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
# Arrays of the same shape are added element by element.
print(a + b)
4.numpy二维数组的求和
a = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
])

print(a.sum())        # sum of every value in the array
print(a.sum(axis=0))  # collapses the rows: one total per column -> [12 15 18]
print(a.sum(axis=1))  # collapses the columns: one total per row -> [ 6 15 24]
例:
1.求出最大车流的车站的平均数
def mean_riders_for_max_station(ridership):
    """Return (overall mean, mean of the column that is largest in the first row).

    ridership is a 2-D numpy array. The column is chosen by looking only at
    row 0; its mean is then taken over all rows.
    """
    max_station = ridership[0, :].argmax()            # column position of the largest value in row 0
    overall_mean = ridership.mean()                   # mean of every value in the array
    mean_for_max = ridership[:, max_station].mean()   # mean of the selected column over all rows
    return (overall_mean, mean_for_max)
2.求出各车站日均客流量中的最大值和最小值
def min_and_max_riders_per_day(ridership):
    """Return (largest, smallest) of the per-column means of a 2-D array.

    Note the order of the returned tuple: the maximum comes first, even
    though the function name mentions "min" first.
    """
    # axis=0 averages over the rows, giving one mean per column.
    column_means = ridership.mean(axis=0)
    max_daily_ridership = column_means.max()
    min_daily_ridership = column_means.min()
    return (max_daily_ridership, min_daily_ridership)
二.pandas二维数组
1.声明
import pandas as pd

# Same values as a labelled table: the index holds date strings and the
# columns hold 'R0xx' identifiers.
ridership_df = pd.DataFrame(
    data=[[0, 0, 2, 5, 0],
          [1478, 3877, 3674, 2328, 2539],
          [1613, 4088, 3991, 6461, 2691],
          [1560, 3392, 3826, 4787, 2613],
          [1608, 4802, 3932, 4477, 2705],
          [1576, 3933, 3909, 4979, 2685],
          [95, 229, 255, 496, 201],
          [2, 0, 1, 27, 0],
          [1438, 3785, 3589, 4174, 2215],
          [1342, 4043, 4009, 4665, 3033]],
    index=['05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11',
           '05-06-11', '05-07-11', '05-08-11', '05-09-11', '05-10-11'],
    columns=['R003', 'R004', 'R005', 'R006', 'R007'],
)
2.取值
# print(...) with a single argument works on both Python 2 and Python 3.
print(ridership_df)                    # show the whole table
print(ridership_df.iloc[0])            # first row, selected by position
print(ridership_df.loc['05-05-11'])    # row whose index label is '05-05-11'
print(ridership_df['R003'])            # column labelled 'R003'
print(ridership_df.iloc[1, 3])         # single value at row position 1, column position 3
print(ridership_df[['R003', 'R005']])  # several columns, selected with a list of labels
3.行列展示
# From a dict: each key becomes a column and its list holds that column's values.
df_1 = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})
print(df_1)

# From a list of lists: each inner list becomes a row; columns= names the columns.
df_2 = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=['A', 'B', 'C'])
print(df_2)
例:
1.求出最大车流的车站的平均数
def mean_riders_for_max_station(ridership):
    """Return (overall mean, mean of the column that is largest in the first row).

    ridership is a pandas DataFrame. The column is picked by label so that
    the result can be used directly for column selection.
    """
    # idxmax() returns the column *label* of the largest value in the first row.
    # argmax() returns an integer position in current pandas, which would make
    # the ridership[...] lookup below fail for string column labels.
    max_station = ridership.iloc[0].idxmax()
    overall_mean = ridership.values.mean()          # mean of every value in the table
    mean_for_max = ridership[max_station].mean()    # mean of the selected column
    return (overall_mean, mean_for_max)
2.相关性
概念:相关系数的取值范围为[-1,+1]:越接近+1表示两个变量呈正相关,越接近-1表示呈负相关,越接近0表示两者越不相关
subway_df = pd.read_csv('nyc-subway-weather.csv')  # the CSV is available in the course download area


def correlation(x, y):
    """Return the correlation between two pandas Series.

    Both inputs are standardized (subtract the mean, divide by the
    population standard deviation, ddof=0); the result is the mean of the
    element-wise product of the standardized values.
    """
    std_x = (x - x.mean()) / x.std(ddof=0)
    std_y = (y - y.mean()) / y.std(ddof=0)
    return (std_x * std_y).mean()
# Pull out the columns to compare.
entries = subway_df['ENTRIESn_hourly']
cum_entries = subway_df['ENTRIESn']
rain = subway_df['meanprecipi']
temp = subway_df['meantempi']

# Correlation between each pair of variables.
print(correlation(entries, rain))
print(correlation(entries, temp))
print(correlation(rain, temp))
print(correlation(entries, cum_entries))
3.DataFrame相关操作
运算会先按行、列标签对齐,再对应位置逐元素计算;如果两个DataFrame的行标签或列标签不一致,无法对齐的位置用NaN值填充
# Same row and column labels: values are added element by element.
df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
df2 = pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60], 'c': [70, 80, 90]})
print(df1 + df2)

# Different column labels: only the shared columns 'b' and 'c' get values;
# 'a' and 'd' exist in just one frame and come out as NaN.
df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
df2 = pd.DataFrame({'d': [10, 20, 30], 'c': [40, 50, 60], 'b': [70, 80, 90]})
print(df1 + df2)

# Explicit, matching row labels: rows are paired up by label.
df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]},
                   index=['row1', 'row2', 'row3'])
df2 = pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60], 'c': [70, 80, 90]},
                   index=['row1', 'row2', 'row3'])
print(df1 + df2)
例:
1.求出每小时的地铁的流量
entries_and_exits = pd.DataFrame({
    'ENTRIESn': [3144312, 3144335, 3144353, 3144424, 3144594,
                 3144808, 3144895, 3144905, 3144941, 3145094],
    'EXITSn': [1088151, 1088159, 1088177, 1088231, 1088275,
               1088317, 1088328, 1088331, 1088420, 1088753],
})


def get_hourly_entries_and_exits(entries_and_exits):
    """Return each row minus the previous row, column by column.

    The input columns hold running totals, so the differences are the
    counts accumulated between consecutive rows. The first row has no
    predecessor and is therefore NaN.
    """
    # diff() is the built-in equivalent of `df - df.shift(1)`.
    return entries_and_exits.diff()
4.applymap方法,将DataFrame的所有值通过自定义方法得以修改
df = pd.DataFrame({
    'a': [1, 2, 3],
    'b': [10, 20, 30],
    'c': [5, 10, 15],
})


def add_one(x):
    """Return x + 1; applied below to every single value of the DataFrame."""
    return x + 1


# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1; pick
# whichever the installed pandas provides so the example runs on both.
elementwise = df.map if hasattr(df, 'map') else df.applymap
print(elementwise(add_one))
例:
1.将学生的成绩转换为等级(A,B,C,D,F)
grades_df = pd.DataFrame(
    data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
          'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
    index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio',
           'Fred', 'Greta', 'Humbert', 'Ivan', 'James'],
)


def convert_grade(grade):
    """Map a numeric score to a letter.

    >= 90 -> 'A', >= 80 -> 'B', >= 70 -> 'C', >= 60 -> 'D', otherwise 'F'.
    """
    if grade >= 90:
        return 'A'
    elif grade >= 80:
        return 'B'
    elif grade >= 70:
        return 'C'
    elif grade >= 60:
        return 'D'
    else:
        return 'F'


def convert_grades(grades):
    """Return a DataFrame of letter grades with the same shape and labels as grades."""
    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1; use
    # whichever the installed pandas provides.
    if hasattr(grades, 'map'):
        return grades.map(convert_grade)
    return grades.applymap(convert_grade)
2.对每一列进行标准化(减去均值后除以标准差)
def standardize_column(df):
    """Standardize one column: subtract its mean and divide by its standard deviation."""
    # ddof=0 selects the population standard deviation, i.e. Bessel's
    # correction is NOT applied (pandas' default is ddof=1).
    return (df - df.mean()) / df.std(ddof=0)


def standardize(df):
    """Standardize every column of df independently."""
    return df.apply(standardize_column)


print(standardize(grades_df))
3.获取数组中第二大的值
df = pd.DataFrame({
    'a': [4, 5, 3, 1, 2],
    'b': [20, 10, 40, 50, 30],
    'c': [25, 20, 5, 15, 10],
})


def second_largest_column(column):
    """Return the second-largest value of a single column (a Series)."""
    # After a descending sort, position 1 holds the second-largest value.
    sorted_column = column.sort_values(ascending=False)
    return sorted_column.iloc[1]


def second_largest(df):
    """Return the second-largest value of every column of df, as a Series."""
    return df.apply(second_largest_column)


def senond_large(df):
    """Misspelled original name, kept so existing callers still work."""
    return second_largest(df)
5.DataFrame和Series操作
s = pd.Series([1, 2, 3, 4])
df = pd.DataFrame({
    0: [10, 20, 30, 40],
    1: [50, 60, 70, 80],
    2: [90, 100, 110, 120],
    3: [130, 140, 150, 160],
})
# The Series index (0-3) is matched against the column labels (0-3):
# s[i] is added to every value in column i.
print(df + s)

# Single-row frame with the same column labels: same alignment, one row.
df = pd.DataFrame({0: [10], 1: [20], 2: [30], 3: [40]})
print(df + s)

# Single-column frame: only column 0 matches the Series index, so column 0
# gets s[0] added to each value; labels 1-3 appear as all-NaN columns.
df = pd.DataFrame({0: [10, 20, 30, 40]})
print(df + s)
s = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
df = pd.DataFrame({
    'a': [10, 20, 30, 40],
    'b': [50, 60, 70, 80],
    'c': [90, 100, 110, 120],
    'd': [130, 140, 150, 160],
})
print(df + s)                     # default: Series index is matched against the column labels
print(df.add(s, axis='index'))    # match against the row labels instead; none match here, so all NaN
print(df.add(s, axis='columns'))  # explicit form of the default behaviour
例:
1.计算学生的标准偏差
grades_df = pd.DataFrame(
    data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
          'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
    index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio',
           'Fred', 'Greta', 'Humbert', 'Ivan', 'James'],
)


def standardize(df):
    """Standardize each column: subtract the column mean, divide by its population std (ddof=0)."""
    return (df - df.mean()) / df.std(ddof=0)


def standardize_row(df):
    """Standardize each row: subtract the row mean, divide by the row's population std (ddof=0)."""
    # Step 1: difference between every value and the mean of its own row.
    # axis='index' lines the per-row means up with the row labels.
    mean_diffs = df.sub(df.mean(axis='columns'), axis='index')
    # Step 2: divide each row by that row's standard deviation.
    return mean_diffs.div(df.std(axis='columns', ddof=0), axis='index')
6.groupby分组
values = np.array([1, 3, 2, 4, 1, 6, 4])
example_df = pd.DataFrame({
    'value': values,
    'even': values % 2 == 0,
    'above_three': values > 3,
}, index=['a', 'b', 'c', 'd', 'e', 'f', 'g'])

print(example_df)

# Group by a single column.
grouped_data = example_df.groupby('even')
print(grouped_data.groups)
# Group by several columns at once.
grouped_data = example_df.groupby(['even', 'above_three'])
print(grouped_data.groups)

grouped_data = example_df.groupby('even')
print(grouped_data.sum())           # per-group sum of every remaining column
print(grouped_data.sum()['value'])  # sum everything, then keep only the 'value' column
print(grouped_data['value'].sum())  # select 'value' first, then sum it per group (same result)
例:
1.计算value的标准偏差
def standardize(xs):
    """Subtract the mean of xs and divide by its population standard deviation (ddof=0)."""
    return (xs - xs.mean()) / xs.std(ddof=0)


# Group the rows by the 'even' column.
grouped_data = example_df.groupby('even')

# Standardize 'value' within each group. transform() returns a result with the
# same index as example_df on every pandas version, whereas the index produced
# by groupby.apply() for a same-length result differs between versions.
print(grouped_data['value'].transform(standardize))
2.画出车站每周每小时的使用的平均值
1 %pylab inline 2 import seaborn as sns 3 subway_df = pd.read_csv('nyc-subway-weather.csv') 4 #根据day_week分组,然后获取平均值,最后获取ENTRIESn_hourly列的值 5 ridership_by_day = subway_df.groupby('day_week').mean()['ENTRIESn_hourly'] 6 ridership_by_day.plot()
3.获取每个地铁站每个小时的流量
ridership_df = pd.DataFrame({
    'UNIT': ['R051', 'R079', 'R051', 'R079', 'R051', 'R079', 'R051', 'R079', 'R051'],
    'TIMEn': ['00:00:00', '02:00:00', '04:00:00', '06:00:00', '08:00:00',
              '10:00:00', '12:00:00', '14:00:00', '16:00:00'],
    'ENTRIESn': [3144312, 8936644, 3144335, 8936658, 3144353,
                 8936687, 3144424, 8936819, 3144594],
    'EXITSn': [1088151, 13755385, 1088159, 13755393, 1088177,
               13755598, 1088231, 13756191, 1088275],
})


def hour_by_group(entries_and_exits):
    """Return each row minus the previous row (the first row becomes NaN)."""
    return entries_and_exits - entries_and_exits.shift(1)


def get_hourly_entries_and_exits(entries_and_exits):
    """Return per-UNIT row-to-row differences of ENTRIESn and EXITSn.

    Rows are grouped by 'UNIT' and differenced within each group, so the
    first row of every unit is NaN. The result keeps the index of the input.
    """
    # Columns must be selected with a list; the tuple form
    # ['ENTRIESn', 'EXITSn'] is rejected by current pandas.
    counters = entries_and_exits.groupby('UNIT')[['ENTRIESn', 'EXITSn']]
    # transform() keeps the original row index on every pandas version.
    return counters.transform(hour_by_group)
7.merge组合,将两个结果集根据某些字段进行组合,整合为一个结果集
subway_df = pd.DataFrame({
    'UNIT': ['R003', 'R003', 'R003', 'R003', 'R003', 'R004', 'R004', 'R004',
             'R004', 'R004'],
    'DATEn': ['05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11',
              '05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11'],
    'hour': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    'ENTRIESn': [4388333, 4388348, 4389885, 4391507, 4393043, 14656120,
                 14656174, 14660126, 14664247, 14668301],
    'EXITSn': [2911002, 2911036, 2912127, 2913223, 2914284, 14451774,
               14451851, 14454734, 14457780, 14460818],
    'latitude': [40.689945, 40.689945, 40.689945, 40.689945, 40.689945,
                 40.69132, 40.69132, 40.69132, 40.69132, 40.69132],
    'longitude': [-73.872564, -73.872564, -73.872564, -73.872564, -73.872564,
                  -73.867135, -73.867135, -73.867135, -73.867135, -73.867135],
})

weather_df = pd.DataFrame({
    'DATEn': ['05-01-11', '05-01-11', '05-02-11', '05-02-11', '05-03-11',
              '05-03-11', '05-04-11', '05-04-11', '05-05-11', '05-05-11'],
    'hour': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    'latitude': [40.689945, 40.69132, 40.689945, 40.69132, 40.689945,
                 40.69132, 40.689945, 40.69132, 40.689945, 40.69132],
    'longitude': [-73.872564, -73.867135, -73.872564, -73.867135, -73.872564,
                  -73.867135, -73.872564, -73.867135, -73.872564, -73.867135],
    'pressurei': [30.24, 30.24, 30.32, 30.32, 30.14, 30.14, 29.98, 29.98,
                  30.01, 30.01],
    'fog': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    'rain': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    'tempi': [52., 52., 48.9, 48.9, 54., 54., 57.2, 57.2, 48.9, 48.9],
    'wspdi': [8.1, 8.1, 6.9, 6.9, 3.5, 3.5, 15., 15., 15., 15.],
})


def combine_dfs(subway_df, weather_df):
    """Inner-join the two frames on DATEn, hour, latitude and longitude.

    Only rows whose four key values appear in both frames are kept; the
    result carries the columns of both inputs.
    """
    return subway_df.merge(
        weather_df,
        on=['DATEn', 'hour', 'latitude', 'longitude'],
        how='inner',
    )

# If the key columns are named differently in the two frames, pass
# left_on=[...] and right_on=[...] to merge() instead of on=[...].
例:
1.做出地铁站位置的散点图,通过点的大小展示哪里的车站人流最高
1 %pylab inline 2 import matplotlib.pyplot as plt 3 import numpy as np 4 import pandas as pd 5 import seaborn as sns 6 subway_df = pd.read_csv('nyc-subway-weather.csv') 7 #根据经度和纬度分组,求出平均数,注意as_index会将字段本身不作为索引,避免出错 8 data_for_location = subway_df.groupby(['latitude','longitude'],as_index=False).mean() 9 #求出每小时的标准偏差,作为图片大小 10 scaled_entries = 11 data_for_location['ENTRIESn_hourly']/data_for_location['ENTRIESn_hourly'].std(ddof=0) 12 #根据纬度为x轴,经度为y轴,s的教正系数,做出散点图 13 plt.scatter(data_for_location['latitude'],data_for_location['longitude'],s=scaled_entries)