About Multivariate Linear Regression
Simple linear regression: assumes each sample has only a single feature;
Multivariate linear regression: handles samples with many features.
$ \hat{y}^{(i)} = \theta_{0} + \theta_{1}X_1^{(i)} + \theta_{2}X_2^{(i)} + \dots + \theta_{n}X_n^{(i)} $
Solving for the parameters
Goal: find $ \theta_{0}, \theta_{1}, \theta_{2}, \dots, \theta_{n} $ that make the objective (loss) function $ \sum_{i=1}^m ( y^{(i)} - \hat{y}^{(i)} )^2 $ as small as possible.
Collect the individual $ \theta $ values above into a single vector, transposed into a column vector.
$ \theta = ( \theta_{0}, \theta_{1}, \theta_{2}, \dots, \theta_{n} )^T $
Give $ \theta_{0} $ a dummy multiplier $ X_0^{(i)} \equiv 1 $, which yields: $ \hat{y}^{(i)} = \theta_{0}X_0^{(i)} + \theta_{1}X_1^{(i)} + \theta_{2}X_2^{(i)} + \dots + \theta_{n}X_n^{(i)} $
$ X^{(i)} = ( X_0^{(i)}, X_1^{(i)}, X_2^{(i)}, \dots, X_n^{(i)} ) $
$ \hat{y}^{(i)} = X^{(i)} \cdot \theta $
Generalizing this to all samples:
Compared with the earlier $ X $ there is one extra column of 1s; to distinguish it, write it as $ X_b $.
$ \theta_0 $ is called the intercept and represents an offset;
$ \theta_1 $ through $ \theta_n $ are called the coefficients; each corresponds to one feature of the original samples and describes how much that feature contributes to the final prediction;
$ \hat{y} = X_b \cdot \theta $
So the objective function can be rewritten as:
$ (y - X_b \cdot \theta)^T (y - X_b \cdot \theta) $
Minimizing it gives the closed-form solution $ \theta = ( X_b^T X_b )^{-1} X_b^T y $
This expression is called the normal equation solution of multivariate linear regression
(Normal Equation)
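As a quick illustration (separate from the class implementation below), the normal equation can be evaluated directly with NumPy; the synthetic X_train / y_train here are only placeholders:
import numpy as np

X_train = np.random.random((100, 3))                       # (m, n) feature matrix
y_train = X_train.dot(np.array([1.0, 2.0, 3.0])) + 4.0     # (m,) target vector

X_b = np.hstack([np.ones((len(X_train), 1)), X_train])     # prepend the column of 1s
theta = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y_train)
# theta[0] is the intercept, theta[1:] are the coefficients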
Advantage: no normalization (feature scaling) of the data is needed;
because the estimated $ \theta $ is the result of purely algebraic operations on the raw data, and units/scales play no role in those operations; each $ \theta $ is simply the coefficient in front of an x in a linear equation, so there is no dimensionality issue.
Disadvantage: high time complexity, roughly O(n^3); even with optimized algorithms it is still around O(n^2.4).
Whether the sample size or the number of features is large, this method is slow, so other methods are generally used instead.
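A rough way to feel this cost (a hedged sketch, with arbitrary sizes) is to time the normal equation as the number of features grows:
import time
import numpy as np

m = 1000
for n in [100, 200, 400, 800]:
    X = np.random.random((m, n))
    y = np.random.random(m)
    X_b = np.hstack([np.ones((m, 1)), X])
    start = time.time()
    np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y)
    print(n, time.time() - start)   # the time grows much faster than linearly in n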
Encapsulating the algorithm
import numpy as np
from .metrics import r2_score


class LinearRegression:

    def __init__(self):
        """Initialize the Linear Regression model"""
        self.coef_ = None
        self.intercept_ = None
        self._theta = None

    def fit_normal(self, X_train, y_train):
        """Train the Linear Regression model on X_train, y_train with the normal equation"""
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"

        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        self._theta = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y_train)

        self.intercept_ = self._theta[0]
        self.coef_ = self._theta[1:]
        return self

    def fit_gd(self, X_train, y_train, eta=0.01, n_iters=1e4):
        """Train the Linear Regression model on X_train, y_train with (batch) gradient descent"""
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"

        def J(theta, X_b, y):
            # loss: mean squared error; fall back to +inf on numerical problems
            try:
                return np.sum((y - X_b.dot(theta)) ** 2) / len(y)
            except:
                return float('inf')

        def dJ(theta, X_b, y):
            # vectorized gradient; the commented loop below is the element-wise equivalent
            # res = np.empty(len(theta))
            # res[0] = np.sum(X_b.dot(theta) - y)
            # for i in range(1, len(theta)):
            #     res[i] = (X_b.dot(theta) - y).dot(X_b[:, i])
            # return res * 2 / len(X_b)
            return X_b.T.dot(X_b.dot(theta) - y) * 2. / len(X_b)

        def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):
            theta = initial_theta
            cur_iter = 0
            while cur_iter < n_iters:
                gradient = dJ(theta, X_b, y)
                last_theta = theta
                theta = theta - eta * gradient
                # stop when the loss no longer decreases noticeably
                if abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon:
                    break
                cur_iter += 1
            return theta

        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        initial_theta = np.zeros(X_b.shape[1])
        self._theta = gradient_descent(X_b, y_train, initial_theta, eta, n_iters)

        self.intercept_ = self._theta[0]
        self.coef_ = self._theta[1:]
        return self

    def fit_sgd(self, X_train, y_train, n_iters=5, t0=5, t1=50):
        """Train the Linear Regression model on X_train, y_train with stochastic gradient descent"""
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"
        assert n_iters >= 1

        def dJ_sgd(theta, X_b_i, y_i):
            # gradient estimated from a single sample
            return X_b_i * (X_b_i.dot(theta) - y_i) * 2.

        def sgd(X_b, y, initial_theta, n_iters, t0=5, t1=50):
            def learning_rate(t):
                # decaying learning rate (simulated-annealing style)
                return t0 / (t + t1)

            theta = initial_theta
            m = len(X_b)
            for cur_iter in range(n_iters):
                # shuffle the samples so each epoch visits them in a different order
                indexes = np.random.permutation(m)
                X_b_new = X_b[indexes]
                y_new = y[indexes]
                for i in range(m):
                    gradient = dJ_sgd(theta, X_b_new[i], y_new[i])
                    theta = theta - learning_rate(cur_iter * m + i) * gradient
            return theta

        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        initial_theta = np.random.randn(X_b.shape[1])
        self._theta = sgd(X_b, y_train, initial_theta, n_iters, t0, t1)

        self.intercept_ = self._theta[0]
        self.coef_ = self._theta[1:]
        return self

    def predict(self, X_predict):
        """Given a dataset X_predict, return the vector of predictions for X_predict"""
        assert self.intercept_ is not None and self.coef_ is not None, \
            "must fit before predict!"
        assert X_predict.shape[1] == len(self.coef_), \
            "the feature number of X_predict must be equal to X_train"

        X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict])
        return X_b.dot(self._theta)

    def score(self, X_test, y_test):
        """Evaluate the current model (R^2) on the test set X_test and y_test"""
        y_predict = self.predict(X_test)
        return r2_score(y_test, y_predict)

    def __repr__(self):
        return "LinearRegression()"
Using sklearn on the Boston housing regression problem
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
boston = datasets.load_boston()
boston.keys()
# dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])
boston.DESCR
# the dataset has 13 features
boston.feature_names
# array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')
X = boston.data
y = boston.target
X = X[y < 50.0]
y = y[y < 50.0]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42)
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(X_train, y_train)
# LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
reg.coef_
'''
array([-1.22096140e-01, 3.24934065e-02, -4.51945652e-02, 6.32525034e-02,
-1.17444911e+01, 3.61793376e+00, -2.00394486e-02, -1.21059188e+00,
2.47235697e-01, -1.31042159e-02, -8.35556922e-01, 8.18076684e-03,
-3.81616606e-01])
'''
reg.intercept_
# 32.73189970831903
reg.score(X_test, y_test)
# 0.7640047258028569
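As a small aside (not in the original notebook), sorting the features by their learned coefficients gives a rough sense of which features push the predicted price up or down:
sorted_idx = np.argsort(reg.coef_)
print(boston.feature_names[sorted_idx])
# features with the most negative coefficients come first, the most positive come last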
Using kNN on the same regression problem
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
knn_reg = KNeighborsRegressor()
knn_reg.fit(X_train, y_train)
# KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=None, n_neighbors=5, p=2, weights='uniform')
knn_reg.score(X_test, y_test)
# 0.5812004118756823
# use grid search to find hyperparameters
param_grid = [
    {
        'weights': ['uniform'],
        'n_neighbors': [i for i in range(1, 11)]
    },
    {
        'weights': ['distance'],
        'n_neighbors': [i for i in range(1, 11)],
        'p': [i for i in range(1, 6)]
    }
]
from sklearn.model_selection import GridSearchCV
# CV stands for Cross Validation
grid_search = GridSearchCV(knn_reg, param_grid, n_jobs=-1, verbose=2)
%time
grid_search.fit(X_train, y_train)
'''
CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 4.77 µs
Fitting 3 folds for each of 60 candidates, totalling 180 fits
GridSearchCV(cv='warn', error_score='raise-deprecating',
estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30,
metric='minkowski',
metric_params=None, n_jobs=None,
n_neighbors=5, p=2,
weights='uniform'),
iid='warn', n_jobs=-1,
param_grid=[{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
'weights': ['uniform']},
{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
'p': [1, 2, 3, 4, 5], 'weights': ['distance']}],
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring=None, verbose=2)
'''
grid_search.best_params_
# {'n_neighbors': 10, 'p': 1, 'weights': 'distance'}
grid_search.best_score_
# 0.6343537550346618
grid_search.best_estimator_
# KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=None, n_neighbors=10, p=1, weights='distance')
grid_search.best_estimator_.score(X_test, y_test)
# Better than kNN with the default hyperparameters, but still worse than linear regression.
# Note that best_score_ above comes from cross validation, so its criterion is not directly comparable with the linear-regression test score computed earlier.
# 0.6717251762406973
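To put the comparison with grid_search.best_score_ on the same cross-validation footing, one could also cross-validate the sklearn linear regression on the training set (a quick sketch using the default R^2 scoring):
from sklearn.model_selection import cross_val_score
cross_val_score(LinearRegression(), X_train, y_train, cv=3).mean()
# a mean CV R^2 that is directly comparable with grid_search.best_score_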