zoukankan      html  css  js  c++  java
  • Python二维数据分析

    一.numpy二维数组

    1.声明

     import numpy as np

     # Each inner list is one row of the 2-D array (10 rows x 5 columns).
     ridership = np.array([
         [   0,    0,    2,    5,    0],
         [1478, 3877, 3674, 2328, 2539],
         [1613, 4088, 3991, 6461, 2691],
         [1560, 3392, 3826, 4787, 2613],
         [1608, 4802, 3932, 4477, 2705],
         [1576, 3933, 3909, 4979, 2685],
         [  95,  229,  255,  496,  201],
         [   2,    0,    1,   27,    0],
         [1438, 3785, 3589, 4174, 2215],
         [1342, 4043, 4009, 4665, 3033],
     ])
    View Code

    2.取值

    # Indexing is zero-based: [row, column].
    print(ridership[1, 3])  # single element: second row, fourth column
    print(ridership[1:3, 3:5])  # sub-array: rows 1-2, columns 3-4 (stop index excluded)
    print(ridership[1, :])  # the whole second row
    print(ridership[0, :] + ridership[1, :])  # element-wise sum of the first and second rows
    print(ridership[:, 0] + ridership[:, 1])  # element-wise sum of the first and second columns
    View Code

    3.numpy二维数组之间相加

    根据线性代数的规则进行相加

    # Adding two arrays of the same shape adds them element by element.
    a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    b = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    print(a + b)
    View Code

    4.numpy二维数组的求和

    a = np.array([
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
    ])

    print(a.sum())  # sum of every element -> 45
    # axis=0 collapses the rows, giving one total per column -> [12 15 18]
    print(a.sum(axis=0))
    # axis=1 collapses the columns, giving one total per row -> [ 6 15 24]
    print(a.sum(axis=1))
    View Code

    例:

    1.求出最大车流的车站的平均数

    def mean_riders_for_max_station(ridership):
        """Return the overall mean and the mean of the busiest column.

        The "busiest" column is the one holding the largest value in the
        first row of the 2-D NumPy array ``ridership``.

        Returns:
            A tuple ``(overall_mean, mean_for_max)``.
        """
        max_station = ridership[0, :].argmax()  # column index of the first row's maximum
        overall_mean = ridership.mean()  # mean over every element of the array
        mean_for_max = ridership[:, max_station].mean()  # mean of that entire column
        return (overall_mean, mean_for_max)
    View Code

    2.求出平均车流量的最大和最小的平均值

    def min_and_max_riders_per_day(ridership):
        """Return the largest and smallest of the per-column means.

        ``ridership`` is a 2-D NumPy array; ``mean(axis=0)`` averages each
        column over all rows.

        Returns:
            A tuple ``(max_daily_ridership, min_daily_ridership)``. Note the
            order: the maximum comes first, despite the function's name.
        """
        column_means = ridership.mean(axis=0)
        max_daily_ridership = column_means.max()
        min_daily_ridership = column_means.min()
        return (max_daily_ridership, min_daily_ridership)
    View Code

    二.pandas二维数组

    1.声明

     import pandas as pd

     # 2-D labelled table: `index` names the rows, `columns` names the columns.
     ridership_df = pd.DataFrame(
         data=[[   0,    0,    2,    5,    0],
               [1478, 3877, 3674, 2328, 2539],
               [1613, 4088, 3991, 6461, 2691],
               [1560, 3392, 3826, 4787, 2613],
               [1608, 4802, 3932, 4477, 2705],
               [1576, 3933, 3909, 4979, 2685],
               [  95,  229,  255,  496,  201],
               [   2,    0,    1,   27,    0],
               [1438, 3785, 3589, 4174, 2215],
               [1342, 4043, 4009, 4665, 3033]],
         index=['05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11',
                '05-06-11', '05-07-11', '05-08-11', '05-09-11', '05-10-11'],
         columns=['R003', 'R004', 'R005', 'R006', 'R007'],
     )
    View Code

    2.取值

    print(ridership_df)  # the whole table
    print(ridership_df.iloc[0])  # first row, selected by position
    print(ridership_df.loc['05-05-11'])  # the row with this index label
    print(ridership_df['R003'])  # the column with this name
    print(ridership_df.iloc[1, 3])  # single value at row position 1, column position 3
    print(ridership_df[['R003', 'R005']])  # several columns at once (pass a list of names)
    View Code

    3.行列展示

    # From a dict: each key becomes a column and its list holds that column's values.
    df_1 = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})
    print(df_1)
    # From a list of lists: each inner list becomes a row; `columns` names the columns.
    df_2 = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=['A', 'B', 'C'])
    print(df_2)
    View Code

    例:

    1.求出最大车流的车站的平均数

    def mean_riders_for_max_station(ridership):
        """Return the overall mean and the mean of the busiest column.

        The "busiest" column is the one holding the largest value in the
        first row of the DataFrame ``ridership``.

        Returns:
            A tuple ``(overall_mean, mean_for_max)``.
        """
        # idxmax() returns the column *label* of the maximum. argmax() returns an
        # integer position in current pandas, which is not a valid column key here.
        max_station = ridership.iloc[0].idxmax()
        overall_mean = ridership.values.mean()  # mean over every cell of the table
        mean_for_max = ridership[max_station].mean()  # mean of that entire column
        return (overall_mean, mean_for_max)
    View Code

    2.相关性

    概念:相关系数的取值范围为[-1,+1]:越接近+1表示两个量呈正相关,越接近-1表示呈负相关,越接近0表示两者越不相关

    subway_df = pd.read_csv('nyc-subway-weather.csv')  # the CSV file is provided in the download section
    def correlation(x, y):
        """Return the correlation coefficient (Pearson's r) between ``x`` and ``y``.

        Both inputs are standardized with the population standard deviation
        (``ddof=0``); the result is the mean of the element-wise product of
        the standardized values.
        """
        std_x = (x - x.mean()) / x.std(ddof=0)
        std_y = (y - y.mean()) / y.std(ddof=0)
        return (std_x * std_y).mean()
    View Code
     # Pull out the columns to compare.
     entries = subway_df['ENTRIESn_hourly']
     cum_entries = subway_df['ENTRIESn']
     rain = subway_df['meanprecipi']
     temp = subway_df['meantempi']

     # Correlation between each pair of variables.
     print(correlation(entries, rain))
     print(correlation(entries, temp))
     print(correlation(rain, temp))
     print(correlation(entries, cum_entries))
    View Code

    3.DataFrame相关操作

    DataFrame之间先按行索引和列名对齐,再逐元素计算;如果某个行索引或列名只出现在其中一个DataFrame中,结果中对应的位置用NaN填充

     # Same column names and same (default) index: plain element-wise addition.
     df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
     df2 = pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60], 'c': [70, 80, 90]})
     print(df1 + df2)

     # Columns are matched by name: 'b' and 'c' are added, while 'a' and 'd'
     # exist in only one frame and come out as NaN.
     df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
     df2 = pd.DataFrame({'d': [10, 20, 30], 'c': [40, 50, 60], 'b': [70, 80, 90]})
     print(df1 + df2)

     # Rows are matched by index label in the same way.
     df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]},
                        index=['row1', 'row2', 'row3'])
     df2 = pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60], 'c': [70, 80, 90]},
                        index=['row1', 'row2', 'row3'])
     print(df1 + df2)
    View Code

    例:

    1.求出每小时的地铁的流量

    entries_and_exits = pd.DataFrame({
        'ENTRIESn': [3144312, 3144335, 3144353, 3144424, 3144594,
                     3144808, 3144895, 3144905, 3144941, 3145094],
        'EXITSn': [1088151, 1088159, 1088177, 1088231, 1088275,
                   1088317, 1088328, 1088331, 1088420, 1088753],
    })


    def get_hourly_entries_and_exits(entries_and_exits):
        """Turn cumulative counts into per-row differences.

        Each row of the result is that row minus the previous row, column by
        column. The first row has no predecessor, so it is NaN.
        """
        return entries_and_exits - entries_and_exits.shift(1)
    View Code

    4.applymap方法,将DataFrame的所有值通过自定义方法得以修改

     df = pd.DataFrame({
         'a': [1, 2, 3],
         'b': [10, 20, 30],
         'c': [5, 10, 15],
     })


     def add_one(x):
         """Return ``x`` increased by one."""
         return x + 1


     # Apply add_one to every cell and print the resulting new DataFrame.
     # DataFrame.applymap is deprecated in recent pandas; mapping each column
     # with Series.map gives the same element-wise result in every version.
     print(df.apply(lambda column: column.map(add_one)))
    View Code

    例:

    1.将学生的成绩转换为等级(A,B,C,D,F)

     grades_df = pd.DataFrame(
         data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
               'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
         index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio',
                'Fred', 'Greta', 'Humbert', 'Ivan', 'James'],
     )


     def convert_grade(grade):
         """Map a numeric score to a letter: >=90 A, >=80 B, >=70 C, >=60 D, else F."""
         if grade >= 90:
             return 'A'
         elif grade >= 80:
             return 'B'
         elif grade >= 70:
             return 'C'
         elif grade >= 60:
             return 'D'
         else:
             return 'F'


     def convert_grades(grades):
         """Return a new DataFrame with every score replaced by its letter grade."""
         # Per-column Series.map is the element-wise equivalent of the
         # deprecated DataFrame.applymap.
         return grades.apply(lambda column: column.map(convert_grade))
    View Code

    2.对每一列进行标准化(减去均值后除以标准差)

    def standardize_column(df):
        """Standardize one column: subtract its mean, divide by its standard deviation.

        ``ddof=0`` selects the population standard deviation (divide by N),
        i.e. Bessel's correction is NOT applied.
        """
        return (df - df.mean()) / df.std(ddof=0)


    def standardize(df):
        """Standardize every column of ``df`` independently."""
        return df.apply(standardize_column)
    print(standardize(grades_df))
    View Code

    3.获取数组中第二大的值

     df = pd.DataFrame({
         'a': [4, 5, 3, 1, 2],
         'b': [20, 10, 40, 50, 30],
         'c': [25, 20, 5, 15, 10],
     })


     def second_largest_column(column):
         """Return the second-largest value of one column."""
         # After a descending sort, position 1 holds the second-largest value.
         sorted_column = column.sort_values(ascending=False)
         return sorted_column.iloc[1]


     def senond_large(df):
         """Return the second-largest value of every column of ``df``."""
         return df.apply(second_largest_column)


     # Correctly spelled alias; the original (misspelled) name is kept for callers.
     second_largest = senond_large
    View Code

    5.DataFrame和Series操作

    # DataFrame + Series matches the Series' index against the DataFrame's
    # *column* labels, so s[k] is added to every row of column k.
    s = pd.Series([1, 2, 3, 4])
    df = pd.DataFrame({0: [10, 20, 30, 40], 1: [50, 60, 70, 80],
                       2: [90, 100, 110, 120], 3: [130, 140, 150, 160]})
    print(df + s)  # column 0 gets +1, column 1 gets +2, ...

    df = pd.DataFrame({0: [10], 1: [20], 2: [30], 3: [40]})
    print(df + s)  # single row: [11, 22, 33, 44]

    df = pd.DataFrame({0: [10, 20, 30, 40]})
    # Still aligned on columns: column 0 gets +1; labels 1-3 exist only in
    # the Series, so those columns are NaN.
    print(df + s)
    View Code
     s = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
     df = pd.DataFrame({
         'a': [10, 20, 30, 40],
         'b': [50, 60, 70, 80],
         'c': [90, 100, 110, 120],
         'd': [130, 140, 150, 160],
     })
     print(df + s)  # default: Series index is matched against the column labels
     # axis='index' matches the Series index against the row labels instead;
     # none of 'a'..'d' is a row label here, so every value is NaN.
     print(df.add(s, axis='index'))
     print(df.add(s, axis='columns'))  # explicit form of the default behaviour
    View Code

    例:

    1.计算学生的标准偏差

     grades_df = pd.DataFrame(
         data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
               'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
         index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio',
                'Fred', 'Greta', 'Humbert', 'Ivan', 'James'],
     )


     def standardize(df):
         """Standardize each column: subtract the column mean, divide by its population std."""
         return (df - df.mean()) / df.std(ddof=0)


     def standardize_row(df):
         """Standardize each row: subtract the row mean, divide by the row's population std.

         ``axis='columns'`` makes mean/std run across the columns (one value
         per row); ``axis='index'`` in sub/div matches those per-row values
         against the row labels.
         """
         mean_diffs = df.sub(df.mean(axis='columns'), axis='index')
         return mean_diffs.div(df.std(axis='columns', ddof=0), axis='index')
    View Code

    6.groupby分组

     values = np.array([1, 3, 2, 4, 1, 6, 4])
     example_df = pd.DataFrame({
         'value': values,
         'even': values % 2 == 0,
         'above_three': values > 3,
     }, index=['a', 'b', 'c', 'd', 'e', 'f', 'g'])

     print(example_df)

     # Group by a single column.
     grouped_data = example_df.groupby('even')
     print(grouped_data.groups)
     # Group by several columns.
     grouped_data = example_df.groupby(['even', 'above_three'])
     print(grouped_data.groups)

     grouped_data = example_df.groupby('even')
     print(grouped_data.sum())  # per-group sum of every remaining column
     print(grouped_data.sum()['value'])  # sum every column, then keep only 'value'
     print(grouped_data['value'].sum())  # keep only 'value', then sum it (same numbers, less work)
    View Code

    例:

    1.计算value的标准偏差

    def standardize(xs):
        """Standardize ``xs``: subtract its mean and divide by its population std (``ddof=0``)."""
        return (xs - xs.mean()) / xs.std(ddof=0)
    3 
    # Group the rows by the 'even' column.
    grouped_data = example_df.groupby('even')

    # Standardize 'value' within each group. transform() returns a result
    # indexed like the original frame, so each value lines up with its row.
    print(grouped_data['value'].transform(standardize))
    View Code

    2.画出车站每周每小时的使用的平均值

    # Jupyter only: run ``%pylab inline`` (or ``%matplotlib inline``) in a notebook
    # cell first; IPython magics are not valid syntax in a plain .py file.
    import seaborn as sns
    subway_df = pd.read_csv('nyc-subway-weather.csv')
    # Group by day_week, select ENTRIESn_hourly, then average it per group.
    # Selecting the column first avoids averaging every other column, which
    # fails on non-numeric columns in recent pandas.
    ridership_by_day = subway_df.groupby('day_week')['ENTRIESn_hourly'].mean()
    ridership_by_day.plot()
    View Code

    3.获取每个地铁站每个小时的流量

     ridership_df = pd.DataFrame({
         'UNIT': ['R051', 'R079', 'R051', 'R079', 'R051', 'R079', 'R051', 'R079', 'R051'],
         'TIMEn': ['00:00:00', '02:00:00', '04:00:00', '06:00:00', '08:00:00', '10:00:00', '12:00:00', '14:00:00', '16:00:00'],
         'ENTRIESn': [3144312, 8936644, 3144335, 8936658, 3144353, 8936687, 3144424, 8936819, 3144594],
         'EXITSn': [1088151, 13755385,  1088159, 13755393,  1088177, 13755598, 1088231, 13756191,  1088275],
     })


     def hour_by_group(entries_and_exits):
         """Return each row minus the previous row (the first row becomes NaN)."""
         return entries_and_exits - entries_and_exits.shift(1)


     def get_hourly_entries_and_exits(entries_and_exits):
         """Return per-UNIT row-to-row differences of ENTRIESn and EXITSn.

         Rows are grouped by 'UNIT' so that each difference is taken against
         the previous row of the *same* unit; the first row of every unit is
         NaN. The result keeps the input's index and row order.
         """
         # Select columns with a list: a bare tuple of names is rejected by
         # recent pandas. transform() keeps the result aligned with the input.
         grouped = entries_and_exits.groupby('UNIT')[['ENTRIESn', 'EXITSn']]
         return grouped.transform(hour_by_group)
    View Code

    7.merge组合,将两个结果集根据某些字段进行组合,整合为一个结果集

     subway_df = pd.DataFrame({
         'UNIT': ['R003', 'R003', 'R003', 'R003', 'R003', 'R004', 'R004', 'R004',
                  'R004', 'R004'],
         'DATEn': ['05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11',
                   '05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11'],
         'hour': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         'ENTRIESn': [ 4388333,  4388348,  4389885,  4391507,  4393043, 14656120,
                      14656174, 14660126, 14664247, 14668301],
         'EXITSn': [ 2911002,  2911036,  2912127,  2913223,  2914284, 14451774,
                    14451851, 14454734, 14457780, 14460818],
         'latitude': [ 40.689945,  40.689945,  40.689945,  40.689945,  40.689945,
                       40.69132 ,  40.69132 ,  40.69132 ,  40.69132 ,  40.69132 ],
         'longitude': [-73.872564, -73.872564, -73.872564, -73.872564, -73.872564,
                       -73.867135, -73.867135, -73.867135, -73.867135, -73.867135],
     })

     weather_df = pd.DataFrame({
         'DATEn': ['05-01-11', '05-01-11', '05-02-11', '05-02-11', '05-03-11',
                   '05-03-11', '05-04-11', '05-04-11', '05-05-11', '05-05-11'],
         'hour': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         'latitude': [ 40.689945,  40.69132 ,  40.689945,  40.69132 ,  40.689945,
                       40.69132 ,  40.689945,  40.69132 ,  40.689945,  40.69132 ],
         'longitude': [-73.872564, -73.867135, -73.872564, -73.867135, -73.872564,
                       -73.867135, -73.872564, -73.867135, -73.872564, -73.867135],
         'pressurei': [ 30.24,  30.24,  30.32,  30.32,  30.14,  30.14,  29.98,  29.98,
                        30.01,  30.01],
         'fog': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         'rain': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         'tempi': [ 52. ,  52. ,  48.9,  48.9,  54. ,  54. ,  57.2,  57.2,  48.9,  48.9],
         'wspdi': [  8.1,   8.1,   6.9,   6.9,   3.5,   3.5,  15. ,  15. ,  15. ,  15. ],
     })


     def combine_dfs(subway_df, weather_df):
         """Inner-join the two frames on DATEn, hour, latitude and longitude.

         Only rows whose four key values appear in both frames are kept; the
         result carries the columns of both inputs.
         """
         return subway_df.merge(
             weather_df,
             on=['DATEn', 'hour', 'latitude', 'longitude'],
             how='inner',
         )

     # If the key columns are named differently in the two frames, pass
     # left_on=[...] and right_on=[...] instead of on=[...].
    View Code

    例:

    1.做出地铁站位置的散点图,通过点的大小展示哪里的车站人流最高

     # Jupyter only: run ``%pylab inline`` (or ``%matplotlib inline``) in a notebook
     # cell first; IPython magics are not valid syntax in a plain .py file.
     import matplotlib.pyplot as plt
     import numpy as np
     import pandas as pd
     import seaborn as sns
     subway_df = pd.read_csv('nyc-subway-weather.csv')
     # Average ENTRIESn_hourly per (latitude, longitude) pair. as_index=False keeps
     # latitude/longitude as ordinary columns rather than moving them into the index.
     # Selecting the column first avoids averaging non-numeric columns.
     data_for_location = subway_df.groupby(
         ['latitude', 'longitude'], as_index=False)[['ENTRIESn_hourly']].mean()
     # Scale the means by their population standard deviation to use as marker sizes.
     scaled_entries = (data_for_location['ENTRIESn_hourly']
                       / data_for_location['ENTRIESn_hourly'].std(ddof=0))
     # Scatter plot: latitude on the x axis, longitude on the y axis, s = marker size.
     plt.scatter(data_for_location['latitude'], data_for_location['longitude'], s=scaled_entries)
    View Code
  • 相关阅读:
    机器学习、图像识别方面 书籍推荐 via zhihu
    网络工具 NetCat
    CSharp读取配置文件的类(简单实现)
    about future
    Google's BBR拥塞控制算法模型解析
    对称加密与非对称加密
    windows平台下新网络库RIO ( Winsock high-speed networking Registered I/O)
    在mac os下编译android -相关文章
    [原创] linux 下上传 datapoint数据到yeelink 【golang版本】同时上传2个数据点
    在 树莓派上使用 c++ libsockets library
  • 原文地址:https://www.cnblogs.com/luhuajun/p/7656803.html
Copyright © 2011-2022 走看看