zoukankan      html  css  js  c++  java
  • pandas中的时间序列基础

    时间序列是一种重要的数据形式

    datetime以微秒形式存储日期和时间

    now = datetime.now()
    now
    
    datetime.datetime(2018, 12, 18, 14, 18, 27, 693445)
    
    #now是一个时间对象
    now.year,now.month,now.day
    (2018, 12, 18)
    
    delta = datetime(2011,1,7)-datetime(2008,6,24,8,15)
    delta
    
    datetime.timedelta(days=926, seconds=56700)
    
    delta.days
    926
    
    timedelta表示两个datetime之间的时间差,它的第一个位置参数是天数(days)
    start = datetime(2011,7,7)
    start + timedelta(12)
    
    datetime.datetime(2011, 7, 19, 0, 0)
    
    start - 2*timedelta(12)
    datetime.datetime(2011, 6, 13, 0, 0)
    

    字符串和datetime的相互转化

    stamp = datetime(2011, 1, 3)
    str(stamp)
    
    '2011-01-03 00:00:00'
    
    # strftime将时间变为字符串
    stamp.strftime('%Y-%m-%d')
    
    '2011-01-03'
    
    # strptime将字符串转回去
    value = '2011-01-03'
    datetime.strptime(value,'%Y-%m-%d')
    
    datetime.datetime(2011, 1, 3, 0, 0)
    
    datestrs = ['7/6/2011','8/6/2011']
    [datetime.strptime(x,'%m/%d/%Y') for x in datestrs]
    
    [datetime.datetime(2011, 7, 6, 0, 0), datetime.datetime(2011, 8, 6, 0, 0)]
    

    每次都自己定义格式是很麻烦的事情,尤其是对于一些常见的日期格式。在这种情况下,你可以使用第三方包dateutil中的parser.parse方法

    这个包几乎可以解析人类能够理解的日期表示形式

    from dateutil.parser import parse
    parse('2011-01-03')
    
    datetime.datetime(2011, 1, 3, 0, 0)
    
    parse('Jan 31,1997 10:45 PM')
    
    datetime.datetime(2018, 1, 31, 22, 45)
    
    # 国际通用的格式中,日通常出现在月的前面,传入dayfirst=True即可解决这个问题
    parse('6/12/2011',dayfirst=True)
    
    datetime.datetime(2011, 12, 6, 0, 0)
    
    # to_datetime方法可以解析很多种不同的日期表示形式
    datestrs
    ['7/6/2011', '8/6/2011']
    
    pd.to_datetime(datestrs)
    DatetimeIndex(['2011-07-06', '2011-08-06'], dtype='datetime64[ns]', freq=None)
    
    # 它还可以处理缺失值(None,空字符串),NaT是时间戳中的缺失值
    idx = pd.to_datetime(datestrs+[None])
    idx
    
    DatetimeIndex(['2011-07-06', '2011-08-06', 'NaT'], dtype='datetime64[ns]', freq=None)
    
    pd.isnull(idx)
    array([False, False,  True])
    

    时间序列基础

    from datetime import datetime
    
    # pandas 最基本的时间序列类型就是以时间戳为索引的Series
    dates =[datetime(2011,1,2),datetime(2011,1,5),datetime(2011,1,7),
           datetime(2011,1,8),datetime(2011,1,10),datetime(2011,1,12)]
    ts = pd.Series([1,2,3,4,5,6],index=dates)
    ts
    
    
    2011-01-02    1
    2011-01-05    2
    2011-01-07    3
    2011-01-08    4
    2011-01-10    5
    2011-01-12    6
    dtype: int64
    
    ts + ts[::2]
    
    2011-01-02     2.0
    2011-01-05     NaN
    2011-01-07     6.0
    2011-01-08     NaN
    2011-01-10    10.0
    2011-01-12     NaN
    dtype: float64
    

    时间的索引、选取、子集构造

    # 对于较长的时间序列,只需传入'年'或'年月'即可轻松选取数据的切片
    import numpy as np
    # periods参数指定要生成的日期个数(这里是1000个),必须与前面randn生成的随机数个数一致
    longer_ts = pd.Series(np.random.randn(1000),index=pd.date_range('1/1/2000',periods=1000))
    longer_ts
    
    2000-01-01    1.134719
    2000-01-02    0.135780
    2000-01-03    0.678652
    2000-01-04   -0.751968
    2000-01-05    0.429753
    2000-01-06    1.107126
    2000-01-07   -0.235910
    2000-01-08    1.119085
    2000-01-09   -0.150530
    2000-01-10    0.831567
    2000-01-11    0.525492
    2000-01-12    1.369756
    2000-01-13   -1.353343
    2000-01-14    0.748277
    2000-01-15    0.292153
    2000-01-16   -0.782864
    2000-01-17    1.698936
    2000-01-18   -1.355965
    2000-01-19   -0.562581
    2000-01-20   -1.333895
    2000-01-21   -0.679781
    2000-01-22    0.568681
    2000-01-23   -0.440312
    2000-01-24    0.045437
    2000-01-25    1.589143
    2000-01-26    0.284029
    2000-01-27    0.597105
    2000-01-28    0.585111
    2000-01-29   -1.011877
    2000-01-30    1.594290
                    ...   
    2002-08-28   -0.052543
    2002-08-29    1.233685
    2002-08-30    0.522945
    2002-08-31    1.145214
    2002-09-01    0.434717
    2002-09-02    0.346381
    2002-09-03   -0.286138
    2002-09-04    0.300973
    2002-09-05    0.220466
    2002-09-06    0.991901
    2002-09-07   -0.194287
    2002-09-08    0.498222
    2002-09-09   -0.760105
    2002-09-10   -0.230607
    2002-09-11    0.464191
    2002-09-12   -0.707616
    2002-09-13   -0.309575
    2002-09-14    2.273895
    2002-09-15   -0.640137
    2002-09-16   -0.416139
    2002-09-17    0.898827
    2002-09-18    0.316116
    2002-09-19   -0.067657
    2002-09-20   -1.296407
    2002-09-21    1.228108
    2002-09-22    0.227808
    2002-09-23   -0.550351
    2002-09-24   -0.378321
    2002-09-25   -0.170426
    2002-09-26   -0.397266
    Freq: D, Length: 1000, dtype: float64
    
    # 直接输入年份,可以取出这一年的
    longer_ts['2001']
    
    2001-01-01    0.698442
    2001-01-02    1.289272
    2001-01-03   -0.644030
    2001-01-04    2.075233
    2001-01-05   -0.815118
    2001-01-06   -0.693868
    2001-01-07    0.599281
    2001-01-08    0.443403
    2001-01-09    1.877780
    2001-01-10   -0.764040
    2001-01-11    0.451113
    2001-01-12   -1.426837
    2001-01-13    1.005724
    2001-01-14   -1.965532
    2001-01-15    0.052981
    2001-01-16   -0.367127
    2001-01-17    2.841093
    2001-01-18    0.451022
    2001-01-19   -0.826358
    2001-01-20    0.241916
    2001-01-21    2.213636
    2001-01-22   -0.870844
    2001-01-23   -0.626682
    2001-01-24   -1.516729
    2001-01-25    0.045325
    2001-01-26   -1.106228
    2001-01-27    0.681209
    2001-01-28    1.833933
    2001-01-29   -1.502188
    2001-01-30   -1.162823
                    ...   
    2001-12-02    0.903314
    2001-12-03    1.338822
    2001-12-04    1.326302
    2001-12-05    0.964913
    2001-12-06   -0.165172
    2001-12-07   -0.690804
    2001-12-08    0.381124
    2001-12-09    2.526006
    2001-12-10   -1.127983
    2001-12-11   -1.162128
    2001-12-12    0.461497
    2001-12-13   -0.830332
    2001-12-14    0.379069
    2001-12-15   -0.800934
    2001-12-16    1.524858
    2001-12-17    0.749656
    2001-12-18    0.922253
    2001-12-19   -1.220435
    2001-12-20    0.513252
    2001-12-21    2.233032
    2001-12-22    0.151856
    2001-12-23   -0.481607
    2001-12-24    0.737862
    2001-12-25   -0.637651
    2001-12-26    0.163501
    2001-12-27   -0.720798
    2001-12-28    0.029192
    2001-12-29   -0.773972
    2001-12-30   -2.377855
    2001-12-31    0.086702
    Freq: D, Length: 365, dtype: float64
    
    longer_ts['2001-07']
    
    2001-07-01   -0.868169
    2001-07-02    1.109987
    2001-07-03   -0.889585
    2001-07-04   -0.568596
    2001-07-05    0.749743
    2001-07-06    0.019171
    2001-07-07   -0.348141
    2001-07-08   -0.222702
    2001-07-09    0.294682
    2001-07-10   -1.780858
    2001-07-11    1.166257
    2001-07-12   -0.167143
    2001-07-13   -0.424275
    2001-07-14    1.393253
    2001-07-15   -1.485840
    2001-07-16    0.980488
    2001-07-17    1.018981
    2001-07-18    0.907556
    2001-07-19    0.105748
    2001-07-20   -0.201183
    2001-07-21    0.867441
    2001-07-22   -0.951957
    2001-07-23   -0.716637
    2001-07-24   -0.995653
    2001-07-25    0.439383
    2001-07-26   -0.927410
    2001-07-27   -1.997120
    2001-07-28   -1.022692
    2001-07-29    0.179568
    2001-07-30    0.586362
    2001-07-31    0.057300
    Freq: D, dtype: float64
    
    ts
    2011-01-02    1
    2011-01-05    2
    2011-01-07    3
    2011-01-08    4
    2011-01-10    5
    2011-01-12    6
    dtype: int64
    
    # 切片取数
    ts[datetime(2011,1,7):]
    
    2011-01-07    3
    2011-01-08    4
    2011-01-10    5
    2011-01-12    6
    dtype: int64
    
    ts['01/09/2011':'01/11/2011']
    
    2011-01-10    5
    dtype: int64
    
    dates = pd.date_range('1/1/2000',periods=100,freq='W-WED')
    dates
    
    DatetimeIndex(['2000-01-05', '2000-01-12', '2000-01-19', '2000-01-26',
                   '2000-02-02', '2000-02-09', '2000-02-16', '2000-02-23',
                   '2000-03-01', '2000-03-08', '2000-03-15', '2000-03-22',
                   '2000-03-29', '2000-04-05', '2000-04-12', '2000-04-19',
                   '2000-04-26', '2000-05-03', '2000-05-10', '2000-05-17',
                   '2000-05-24', '2000-05-31', '2000-06-07', '2000-06-14',
                   '2000-06-21', '2000-06-28', '2000-07-05', '2000-07-12',
                   '2000-07-19', '2000-07-26', '2000-08-02', '2000-08-09',
                   '2000-08-16', '2000-08-23', '2000-08-30', '2000-09-06',
                   '2000-09-13', '2000-09-20', '2000-09-27', '2000-10-04',
                   '2000-10-11', '2000-10-18', '2000-10-25', '2000-11-01',
                   '2000-11-08', '2000-11-15', '2000-11-22', '2000-11-29',
                   '2000-12-06', '2000-12-13', '2000-12-20', '2000-12-27',
                   '2001-01-03', '2001-01-10', '2001-01-17', '2001-01-24',
                   '2001-01-31', '2001-02-07', '2001-02-14', '2001-02-21',
                   '2001-02-28', '2001-03-07', '2001-03-14', '2001-03-21',
                   '2001-03-28', '2001-04-04', '2001-04-11', '2001-04-18',
                   '2001-04-25', '2001-05-02', '2001-05-09', '2001-05-16',
                   '2001-05-23', '2001-05-30', '2001-06-06', '2001-06-13',
                   '2001-06-20', '2001-06-27', '2001-07-04', '2001-07-11',
                   '2001-07-18', '2001-07-25', '2001-08-01', '2001-08-08',
                   '2001-08-15', '2001-08-22', '2001-08-29', '2001-09-05',
                   '2001-09-12', '2001-09-19', '2001-09-26', '2001-10-03',
                   '2001-10-10', '2001-10-17', '2001-10-24', '2001-10-31',
                   '2001-11-07', '2001-11-14', '2001-11-21', '2001-11-28'],
                  dtype='datetime64[ns]', freq='W-WED')
    
    long_df = pd.DataFrame(np.random.randn(100,4),index=dates,columns=['Colorado','Texas','New York','Ohio'])
    long_df.loc['2001-05']
    
                  Colorado	   Texas	 New York	  Ohio
    2001-05-02	-1.380726	-0.411279	0.153217	1.494666
    2001-05-09	2.554090	1.930090	-0.181046	0.866642
    2001-05-16	1.068669	1.494460	-1.386345	0.839434
    2001-05-23	0.988561	-1.986414	0.681924	0.939525
    2001-05-30	0.349177	1.213020	0.432394	-0.223059
    

    带有重复索引的时间序列

    dates = pd.DatetimeIndex(['1/1/2000','1/2/2000','1/2/2000','1/3/2000'])
    dyp_tus = pd.Series([1,2,3,4],index=dates)
    dyp_tus
    
    2000-01-01    1
    2000-01-02    2
    2000-01-02    3
    2000-01-03    4
    dtype: int64
    
    # is_unique为False说明索引不唯一(存在重复的时间戳),但仅凭它看不出具体是哪个时间戳重复
    dyp_tus.index.is_unique
    False
    
    # 按索引分组并计数,计数大于1的时间戳就是重复的索引
    grouped = dyp_tus.groupby(level=0)
    grouped.count()
    
    2000-01-01    1
    2000-01-02    2
    2000-01-03    1
    dtype: int64
    
  • 相关阅读:
    软工试水日报 3/7
    软工试水日报 3/6
    软工试水日报 3/5
    软工试水日报 3/4
    软工试水日报 3/3
    大二下学期每日总结之第一次个人作业(第二阶段:生成excel)
    大二下学期每日总结之第一次个人作业(第一阶段)
    大二下学期每日总结之第一次个人作业(第一阶段)
    大二下学期每日总结
    大二下学期每日总结
  • 原文地址:https://www.cnblogs.com/lishi-jie/p/10138588.html
Copyright © 2011-2022 走看看