ARIMA预测
# -*- coding: utf-8 -*- """ Created on Fri Mar 22 21:03:34 2019 @author: Administrator """ import pandas as pd import numpy as np import matplotlib.pyplot as plt from datetime import datetime from datetime import timedelta num = 14 filenames = [] basepath = 'D:\pworkspace\data\Metro_train\' for i in range(1, num+1): if i < 10: filenames.append(basepath + 'record_2019-01-0' + str(i) + '.csv') else: filenames.append(basepath + 'record_2019-01-' + str(i) + '.csv') flag = True for filename in filenames: df = pd.read_csv(filename) df['time'] = df['time'].str[:-4] + '0:00' df['time'] = pd.to_datetime(df['time']) df0 = df[df['stationID'] == 0].copy() del df user_in = df0[df0['status'] == 1] user_out = df0[df0['status'] == 0] user_in = user_in.groupby('time') user_out = user_out.groupby('time') user_in = user_in.count() user_out = user_out.count() user_in['count'] = user_in['userID'] user_out['count'] = user_out['userID'] user_in = user_in.drop(['lineID', 'stationID', 'deviceID', 'status', 'payType', 'userID'], axis=1) user_out = user_out.drop(['lineID', 'stationID', 'deviceID', 'status', 'payType', 'userID'], axis=1) if flag: user_in_all = user_in #user_out_all = user_out flag = False else: user_in_all = pd.concat([user_in_all,user_in], axis=0) #user_out_all = pd.concat([user_out_all,user_out], axis=0) #start = datetime(2019,1,1,0,0,0) #timelist = [ str(start + timedelta(seconds=600*i)) for i in range(24 * 6 * 2)] startdate = datetime(2019,1,1,0,0,0) enddate = startdate + timedelta(days=num-1, minutes=50, hours=23) all_time_data = pd.DataFrame({'time' : pd.date_range(start=str(startdate), end=str(enddate), freq='10T')}) all_time_data['count'] = 0 all_time_data.index = all_time_data['time'] all_time_data = all_time_data.drop('time', axis=1) user_in_all = pd.merge(all_time_data, user_in_all, right_on='time', left_index=True, how='outer') user_in_all[np.isnan(user_in_all['count_y'])] = 0 user_in_all['count_x'] = user_in_all['count_x'] + user_in_all['count_y'] user_in_all['count'] = user_in_all['count_x'] user_in_all = user_in_all.drop(['count_x', 'count_y'], axis=1) user_in_all.plot(figsize=(15,8)) plt.show() ts = user_in_all['count'] ts_ewma = pd.DataFrame(ts).ewm(span=60).mean() ts_ewma.plot(figsize=(15,8)) plt.show() from statsmodels.tsa.stattools import acf, pacf, adfuller from statsmodels.stats.diagnostic import acorr_ljungbox from statsmodels.tsa.arima_model import ARIMA #import statsmodels.api as sm ts_diff_1 = ts_ewma.diff(1).dropna(axis=0, how='any') ts_diff_1 = ts_diff_1['count'] # ADF平稳性检验 adfuller(ts_diff_1, autolag='AIC') # 白噪声检验 acorr_ljungbox(ts_diff_1, 1) # ACF PACF lag_acf = acf(ts_diff_1, nlags=50) lag_pacf = pacf(ts_diff_1, nlags=50) plt.figure(facecolor='white', figsize=(15, 8)) plt.plot(lag_acf) plt.show() plt.figure(facecolor='white', figsize=(15, 8)) plt.plot(lag_pacf) plt.axhline(y=-1.9/np.sqrt(len(ts_diff_1)), linestyle='--', color='gray') plt.axhline(y=1.9/np.sqrt(len(ts_diff_1)), linestyle='--', color='gray') plt.show() model = ARIMA(ts_diff_1, order=(6, 0, 0)) ts_predict = model.fit().predict() rmse = np.sqrt(sum((ts_predict - ts_diff_1)**2) / ts_diff_1.size) plt.figure(facecolor='white', figsize=(15, 8)) plt.plot(ts_predict, lw=0.5, color='blue', label='Predict') plt.plot(ts_diff_1, lw=0.5, color='red', label='Original') plt.legend(loc='lower right') #plt.ylim((-1000, 1000)) plt.show()
运行结果