"""Online least-mean-squares (LMS) linear regression demo.

Loads ``ccpp.csv``, standardises the first four columns (features), prepends
a bias column, splits the rows into train/test sets, runs a single LMS pass
over the leading training rows while recording the test-set mean squared
error after every weight update, and finally plots measured vs. predicted
targets.

The numeric helpers only need NumPy; matplotlib and scikit-learn are imported
when the file is executed as a script (or notebook cell).
"""
import numpy as np
import pandas as pd

# The original script trains on only the first ``len(X_train) - 6062`` rows.
N_UNUSED_TRAIN_ROWS = 6062
# Step size of every LMS weight update.
LEARNING_RATE = 0.01


def standardize(features):
    """Z-score every column of *features*.

    Args:
        features: 2-D array-like, one sample per row.

    Returns:
        Tuple ``(scaled, mu, sigma)``: the standardised copy, the per-column
        means and the per-column (population) standard deviations.
    """
    features = np.asarray(features, dtype=float)
    mu = features.mean(axis=0)      # mean of each column (axis 0 = columns)
    sigma = features.std(axis=0)    # standard deviation of each column
    # A constant column has sigma == 0; dividing by it would yield NaN/inf,
    # so such a column is only centred.
    safe_sigma = np.where(sigma == 0, 1.0, sigma)
    return (features - mu) / safe_sigma, mu, sigma


def add_bias_column(features):
    """Return *features* with a leading column of ones (the intercept term)."""
    features = np.asarray(features, dtype=float)
    ones = np.ones((features.shape[0], 1))
    return np.hstack((ones, features))


def _mse(y_true, y_pred):
    """Mean squared error between two equally sized arrays."""
    diff = np.asarray(y_true, dtype=float).ravel() - np.asarray(y_pred, dtype=float).ravel()
    return float(np.mean(diff ** 2))


def lms_fit(X_train, y_train, X_eval, y_eval, n_steps=None,
            learning_rate=LEARNING_RATE, loss=_mse):
    """Run one pass of single-sample LMS updates and track the evaluation loss.

    Weights start at all ones. For training row ``x`` with target ``y`` the
    update is ``w <- w + learning_rate * x * (y - x.w)``.

    Args:
        X_train: 2-D array-like of training rows (bias column included).
        y_train: training targets, one per row (any shape that flattens to 1-D).
        X_eval: 2-D array-like of rows used to score the weights after each step.
        y_eval: targets matching ``X_eval``.
        n_steps: number of leading training rows to consume; ``None`` means all.
            Values outside ``[0, len(X_train)]`` are clamped.
        learning_rate: step size of each update.
        loss: callable ``loss(y_true, y_pred) -> float`` applied to 1-D arrays.

    Returns:
        Tuple ``(w, history)``: ``w`` is the final weight column vector of
        shape ``(n_features, 1)``; ``history`` holds the evaluation loss after
        each of the ``n_steps`` updates.
    """
    X_train = np.asarray(X_train, dtype=float)
    y_train = np.asarray(y_train, dtype=float).reshape(-1)
    X_eval = np.asarray(X_eval, dtype=float)
    y_eval = np.asarray(y_eval, dtype=float).reshape(-1)

    if n_steps is None:
        n_steps = len(X_train)
    n_steps = max(0, min(int(n_steps), len(X_train)))

    weights = np.ones(X_train.shape[1])
    history = np.ones(n_steps)
    for step in range(n_steps):
        row = X_train[step]
        residual = y_train[step] - row.dot(weights)
        weights = weights + learning_rate * residual * row
        # Score all evaluation rows at once instead of looping row by row.
        history[step] = loss(y_eval, X_eval.dot(weights))
    return weights.reshape(-1, 1), history


if __name__ == "__main__":
    # NOTE: in a Jupyter notebook, run the "%matplotlib inline" magic in its
    # own cell first; it is IPython syntax and not valid in a .py file.
    import matplotlib.pyplot as plt
    from sklearn import datasets, linear_model, metrics
    # train_test_split lives in sklearn.model_selection; the old
    # sklearn.cross_validation module no longer exists.
    from sklearn.model_selection import train_test_split

    data = np.loadtxt('ccpp.csv', delimiter=',', skiprows=1)
    X = data[:, 0:4]    # columns 0-3 are used as features
    y = data[:, 4:5]    # column 4 is the target, kept as an (m, 1) column

    X, mu, sigma = standardize(X)
    m, n = X.shape
    X = add_bias_column(X)

    # Split the (X, y) samples into a training set and a test set.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

    # Clamp at zero so a small data set does not produce a negative count.
    n_steps = max(len(X_train) - N_UNUSED_TRAIN_ROWS, 0)
    w, ss = lms_fit(X_train, y_train, X_test, y_test, n_steps=n_steps,
                    learning_rate=LEARNING_RATE,
                    loss=metrics.mean_squared_error)
    print(w)

    y_tt = X_test.dot(w)    # predictions for the test rows, shape (n_test, 1)
    measured = y_test.ravel()
    predicted = y_tt.ravel()

    fig, ax = plt.subplots()
    ax.scatter(measured, predicted)
    # Diagonal reference line: points on it are predicted exactly.
    ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k', lw=4)
    ax.set_xlabel('Measured')
    ax.set_ylabel('Predicted')
    plt.show()
# Dataset: http://archive.ics.uci.edu/ml/datasets/Combined+Cycle+Power+Plant