import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split

# Font used for sans-serif text so the Chinese column names render in plots.
plt.rcParams['font.sans-serif'] = ['stxiHei']


def plot_risiduals(model, X_train, X_test, y_train, y_test):
    """Scatter the residuals of *model* against the true target values.

    A residual is computed as ``model.predict(X) - y``. Training points are
    drawn in red, test points in black, and a dashed blue line marks zero.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_train: Feature matrix of the training split.
        X_test: Feature matrix of the test split.
        y_train: True target values of the training split.
        y_test: True target values of the test split.
    """
    fig, ax = plt.subplots(figsize=(7, 4))

    residuals_train = model.predict(X_train) - y_train
    ax.scatter(y_train, residuals_train, label='Train', color='r')

    residuals_test = model.predict(X_test) - y_test
    ax.scatter(y_test, residuals_test, label='Test', color='k')

    ax.axhline(y=0.0, c="b", ls="--", lw=2)
    ax.set_title('Risiduals')
    ax.set_xlabel('True')
    ax.set_ylabel('Risiduals')
    # The scatters carry labels; without a legend they are never displayed.
    ax.legend()


data = pd.read_csv(r'Statistics汽车销售数据.csv', encoding='utf-8')
# Per the original author's note only one row has a missing value here,
# so that row is simply dropped.
data = data[data['公路里程数'].notna()]
data = data[['传统汽车销量', '国内生产总值当季值(亿元)x1', '汽油价格(元/吨)x2',
             '人民币贷款基准利率%x3', '汽车总产量(万辆)x4', '公路里程数',
             '汽车整车股票指数', '消费者信心指数']]
data.head()  # notebook display

# Pairwise correlations; multiplying by the transposed strict lower-triangular
# ones matrix zeroes the diagonal and lower triangle, leaving the upper part.
cormatrix = data.corr()
cormatrix *= np.tri(*cormatrix.values.shape, k=-1).T
cormatrix  # notebook display

# Correlation coefficients for the heatmap.
corr_all = data.corr()
# Heatmap mask. Builtin ``bool`` replaces ``np.bool``, which was removed in
# NumPy 1.24. The mask stays all-False, so the full matrix is drawn; the
# upper-triangle masking below was disabled in the original.
mask = np.zeros_like(corr_all, dtype=bool)
# mask[np.triu_indices_from(mask)] = True
# Create the figure and draw the heatmap.
g, ax = plt.subplots(figsize=(9, 7))
sns.heatmap(corr_all, mask=mask, square=True, linewidths=.5, ax=ax,
            cmap='BuPu')
plt.title('Correlation of Feactures')
plt.show()

# Feature selection: scatter every candidate feature against the target.
X = data[['国内生产总值当季值(亿元)x1', '汽油价格(元/吨)x2', '人民币贷款基准利率%x3',
          '公路里程数', '汽车整车股票指数', '消费者信心指数']]
y = data['传统汽车销量']
plt.figure()
n_features = len(X.columns)
for i, column in enumerate(X.columns):
    plt.scatter(X[column], y,
                color=np.array(plt.cm.tab10(i / n_features)),
                label=column)
plt.legend()
plt.show()

# Same feature set as above without the loan-rate column.
X = data[['国内生产总值当季值(亿元)x1', '汽油价格(元/吨)x2', '公路里程数',
          '汽车整车股票指数', '消费者信心指数']]
y = data['传统汽车销量']
X.head()  # notebook display

# Split into training and test sets; random_state=666 is fixed so later runs
# are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)

# Model
lin_reg0 = LinearRegression()
lin_reg0.fit(X_train, y_train)
y_predict = lin_reg0.predict(X_test)
print(f'r2_score:{r2_score(y_test, y_predict)}')
print(f'MSE:{mean_squared_error(y_test, y_predict)}')

# Residual plot for the first model.
plot_risiduals(lin_reg0, X_train, X_test, y_train, y_test)

# Reduce the number of features and refit.
selected_columns = ['国内生产总值当季值(亿元)x1', '汽车整车股票指数', '消费者信心指数']
X_train = X_train[selected_columns]
X_test = X_test[selected_columns]
lin_reg1 = LinearRegression()
lin_reg1.fit(X_train, y_train)
y_predict = lin_reg1.predict(X_test)
print(f'r2_score:{r2_score(y_test, y_predict)}')
print(f'MSE:{mean_squared_error(y_test, y_predict)}')
# Output recorded in the original notebook run:
#   r2_score:0.9167941097031658
#   MSE:3878.5666590026112

plot_risiduals(lin_reg1, X_train, X_test, y_train, y_test)

# Prediction: true values next to predictions.
# NOTE: the residual here is true - predicted, the opposite sign of the one
# plotted by plot_risiduals.
True_Predict = {'True': y_test, 'Predict': y_predict,
                'Risiduals': y_test - y_predict}
pd.DataFrame(True_Predict)  # notebook display