  • Linear Regression with scikit-learn: A Complete Walkthrough

    1. Model Optimization

    1.1 Polynomial Features and Linear Regression

    If a linear regression model is too simple and underfits the data, we can add polynomial features to make it fit better. For example, given two features x1 and x2, we can add their product as a new feature x3 = x1 * x2, and also add x1^2 as another new feature x4.
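
    To see concretely what this expansion looks like, here is a minimal sketch for two features (the sample values are made up for illustration):

    #Sketch: expand two features [x1, x2] into degree-2 polynomial features
    import numpy as np
    from sklearn.preprocessing import PolynomialFeatures
    
    X = np.array([[2.0, 3.0]])            #one sample: x1 = 2, x2 = 3
    poly = PolynomialFeatures(degree = 2, include_bias = False)
    print(poly.fit_transform(X))          #[[2. 3. 4. 6. 9.]] -> x1, x2, x1^2, x1*x2, x2^2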

    In scikit-learn, linear regression is implemented by the class sklearn.linear_model.LinearRegression, and polynomial features by the class sklearn.preprocessing.PolynomialFeatures. To add polynomial features, we chain the two together with a pipeline using sklearn.pipeline.Pipeline:

    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.pipeline import Pipeline
    
    def polynomial_model(degree = 1):
        polynomial_features = PolynomialFeatures(degree = degree,
                                                include_bias = False)
        linear_regression = LinearRegression()
        #A pipeline: first expand the features to the given polynomial degree,
        #then fit the expanded features with linear regression
        pipeline = Pipeline([("polynomial_features",polynomial_features),
                           ("linear_regression",linear_regression)])
        return pipeline
    

    For a detailed explanation of the differences between fit, fit_transform and transform, see: https://blog.csdn.net/weixin_38278334/article/details/82971752
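
    In short: fit learns parameters from the training data, transform applies them, and fit_transform does both in one step. A minimal sketch with StandardScaler (example values made up):

    import numpy as np
    from sklearn.preprocessing import StandardScaler
    
    X_train = np.array([[1.0],[2.0],[3.0]])
    X_test = np.array([[4.0]])
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)   #fit (learn mean/std) + transform
    X_test_scaled = scaler.transform(X_test)         #apply the training mean/std to new data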

    In scikit-learn, when running linear regression with LinearRegression you can pass normalize = True to normalize the input data. (Note: this parameter was deprecated in scikit-learn 1.0 and removed in 1.2; see the StandardScaler pipeline sketch in section 3.2 for the modern replacement.)

    2. Example: Fitting a Sine Function with Linear Regression

    #Generate 200 points on a sine curve over [-2π, 2π] and add random noise to them
    import numpy as np
    n_dots = 200
    
    X = np.linspace(-2 * np.pi,2 * np.pi,n_dots)
    Y = np.sin(X) + 0.2 * np.random.rand(n_dots) - 0.1
    X = X.reshape(-1,1)
    Y = Y.reshape(-1,1)
    
    #Fit the dataset with polynomials of degree 2, 3, 5 and 10
    from sklearn.metrics import mean_squared_error
    
    degrees = [2,3,5,10]
    results = []
    for d in degrees:
        model = polynomial_model(degree=d)
        model.fit(X,Y)
        train_score = model.score(X,Y)
        mse = mean_squared_error(Y,model.predict(X))
        results.append({"model":model,"degree":d,"score":
                           train_score,"mse":mse})
    for r in results:
        print("degree: {};train score: {};mean squared error: {}".format(
            r["degree"],r["score"],r["mse"]))
    
    degree: 2;train score: 0.14691964884268827;mean squared error: 0.4337561603823593
    degree: 3;train score: 0.2725519790368923;mean squared error: 0.3698773040811927
    degree: 5;train score: 0.8949982058380093;mean squared error: 0.053389079946778877
    degree: 10;train score: 0.9936659355081904;mean squared error: 0.0032206104499468945
    
    results
    
    [{'model': Pipeline(steps=[('polynomial_features', PolynomialFeatures(include_bias=False)),
                      ('linear_regression', LinearRegression())]),
      'degree': 2,
      'score': 0.14691964884268827,
      'mse': 0.4337561603823593},
     {'model': Pipeline(steps=[('polynomial_features',
                       PolynomialFeatures(degree=3, include_bias=False)),
                      ('linear_regression', LinearRegression())]),
      'degree': 3,
      'score': 0.2725519790368923,
      'mse': 0.3698773040811927},
     {'model': Pipeline(steps=[('polynomial_features',
                       PolynomialFeatures(degree=5, include_bias=False)),
                      ('linear_regression', LinearRegression())]),
      'degree': 5,
      'score': 0.8949982058380093,
      'mse': 0.053389079946778877},
     {'model': Pipeline(steps=[('polynomial_features',
                       PolynomialFeatures(degree=10, include_bias=False)),
                      ('linear_regression', LinearRegression())]),
      'degree': 10,
      'score': 0.9936659355081904,
      'mse': 0.0032206104499468945}]
    

    mean_squared_error computes the mean squared error, i.e. the average squared distance between the actual points and the model's predictions; the smaller it is, the better the model fits the data.
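
    As a quick sanity check (a sketch with made-up numbers), mean_squared_error is just the average of the squared residuals:

    import numpy as np
    from sklearn.metrics import mean_squared_error
    
    y_true = np.array([1.0, 2.0, 3.0])
    y_pred = np.array([1.1, 1.9, 3.2])
    print(mean_squared_error(y_true, y_pred))   #0.02
    print(np.mean((y_true - y_pred) ** 2))      #same value: (0.01 + 0.01 + 0.04) / 3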

    #Plot the fit of each model
    from matplotlib.figure import SubplotParams
    import matplotlib.pyplot as plt
    
    plt.figure(figsize = (12,6),dpi = 200, subplotpars = SubplotParams(hspace = 0.3))
    for i,r in enumerate(results):
        plt.subplot(2,2,i+1)
        plt.xlim(-8,8)
        plt.title("LinearRegression degree={}".format(r['degree']))
        plt.scatter(X,Y,s = 5,c = 'b',alpha = 0.5)
        plt.plot(X,r['model'].predict(X),'r-')
    


    3. Example: Predicting House Prices

    We use the Boston house-price dataset that ships with scikit-learn to train a model, then use that model to predict house prices.

    The dataset contains 13 features:

    • CRIM: per-capita crime rate by town.
    • ZN: proportion of residential land zoned for lots over 25,000 square feet.
    • INDUS: proportion of non-retail business land per town.
    • CHAS: whether the tract bounds the Charles River (1 = yes, 0 = no).
    • NOX: nitric oxide concentration.
    • RM: average number of rooms per dwelling.
    • AGE: proportion of owner-occupied units built before 1940.
    • DIS: weighted distance to Boston's employment centers.
    • RAD: index of accessibility to radial highways.
    • TAX: property-tax rate per $10,000.
    • PTRATIO: pupil-teacher ratio by town.
    • B: proportion of Black residents by town.
    • LSTAT: percentage of lower-status population.
    #Load the data
    from sklearn.datasets import load_boston
    
    boston = load_boston()
    X = boston.data
    y = boston.target
    X.shape
    
    (506, 13)
    
    X[0]
    
    array([6.320e-03, 1.800e+01, 2.310e+00, 0.000e+00, 5.380e-01, 6.575e+00,
           6.520e+01, 4.090e+00, 1.000e+00, 2.960e+02, 1.530e+01, 3.969e+02,
           4.980e+00])
    
    #Inspect the feature names
    boston.feature_names
    
    array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
           'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')
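
    Note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2. On a recent version, one possible workaround (a sketch, assuming network access to OpenML) is:

    from sklearn.datasets import fetch_openml
    
    boston = fetch_openml(name = "boston", version = 1, as_frame = False)
    X = boston.data.astype(float)     #some columns may arrive as strings from OpenML
    y = boston.target.astype(float)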
    

    3.1 模型训练

    #Split the dataset into a training set and a test set
    from sklearn.model_selection import train_test_split
    
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size =0.2
                                                     ,random_state = 2)
    
    #Train the model and evaluate its scores on the training and test sets
    import time
    from sklearn.linear_model import LinearRegression
    
    model = LinearRegression()
    
    start = time.perf_counter()
    model.fit(X_train,y_train)
    #train_score must be computed here; the original cell skipped this line, so the
    #value it printed below (0.993666) was left over from the degree-10 sine fit above
    train_score = model.score(X_train,y_train)
    cv_score = model.score(X_test,y_test)
    print('elapsed: {0:.6f};train_score: {1:0.6f};cv_score: {2:.6f}'.format(
        time.perf_counter() - start,train_score,cv_score))
    
    elapsed: 0.001908;train_score: 0.993666;cv_score: 0.778921
    

    3.2 Model Optimization

    #Normalize the data (note: normalize was removed from LinearRegression in scikit-learn 1.2)
    model = LinearRegression(normalize = True)
    

    Normalizing the data only speeds up the algorithm's convergence and makes training more efficient; it does not improve the model's accuracy.
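
    Since normalize = True is gone from recent scikit-learn, a common replacement (a sketch; note that normalize= scaled by the l2-norm while StandardScaler scales by the standard deviation, though plain LinearRegression's predictions are unaffected either way) is to put a StandardScaler in front of the regressor inside the pipeline:

    from sklearn.linear_model import LinearRegression
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    
    model = Pipeline([("scaler",StandardScaler()),
                      ("linear_regression",LinearRegression())])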

    #Add polynomial features to increase the model's complexity
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.pipeline import Pipeline
    
    def polynomial_model(degree = 1):
        polynomial_features = PolynomialFeatures(degree = degree,
                                                 include_bias = False)
        linear_regression = LinearRegression(normalize = True)
        pipeline = Pipeline([("polynomial_features",polynomial_features),(
            "linear_regression",linear_regression)])
        return pipeline
    
    #Fit the data with a degree-2 polynomial
    model = polynomial_model(degree = 2)
    
    start = time.perf_counter()
    model.fit(X_train,y_train)
    
    train_score = model.score(X_train,y_train)
    cv_score = model.score(X_test,y_test)
    print('elapsed: {0:.6f};train_score: {1:0.6f};cv_score: {2:.6f}'.format(
            time.perf_counter() - start,train_score,cv_score))
    
    elapsed: 0.034632;train_score: 0.929593;cv_score: 0.896364
    
    #Fit the data with a degree-3 polynomial
    model = polynomial_model(degree = 3)
    
    start = time.perf_counter()
    model.fit(X_train,y_train)
    
    train_score = model.score(X_train,y_train)
    cv_score = model.score(X_test,y_test)
    print('elapsed: {0:.6f};train_score: {1:0.6f};cv_score: {2:.6f}'.format(
            time.perf_counter() - start,train_score,cv_score))
    
    elapsed: 0.161353;train_score: 1.000000;cv_score: -318.549144
    

    The degree-3 polynomial overfits: the training score is a perfect 1.000000, while the cross-validation score is strongly negative, i.e. worse than simply predicting the mean.

    There are 13 input features in total. How many features are added when going from degree 1 to degree 2? With include_bias = False, a degree-2 expansion yields the 13 original features, their 13 squares and C(13,2) = 78 pairwise products, 104 features in all, i.e. 91 more than before; this can be verified directly, as in the sketch below.
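
    A quick check of that count (a sketch; the zeros matrix is just a stand-in with the shape of the Boston data):

    import numpy as np
    from sklearn.preprocessing import PolynomialFeatures
    
    poly = PolynomialFeatures(degree = 2, include_bias = False).fit(np.zeros((1,13)))
    print(poly.n_output_features_)   #104 = 13 originals + 13 squares + 78 pairwise products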

    3.3 Learning Curves

    from common.utils import plot_learning_curve
    from sklearn.model_selection import ShuffleSplit
    
    cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
    title = 'Learning Curves (degree={0})'
    degrees = [1, 2, 3]
    
    #time.clock() was removed in Python 3.8; use time.perf_counter() instead
    start = time.perf_counter()
    plt.figure(figsize=(18, 4), dpi=200)
    for i in range(len(degrees)):
        plt.subplot(1, 3, i + 1)
        plot_learning_curve(plt, polynomial_model(degrees[i]), title.format(degrees[i]), X, y, ylim=(0.01, 1.01), cv=cv)
    
    print('elapsed: {0:.6f}'.format(time.perf_counter()-start))
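
    common.utils here is the author's own helper module, not part of scikit-learn. If you don't have it, a minimal stand-in (a sketch built on sklearn.model_selection.learning_curve, not the original implementation) could look like this:

    import numpy as np
    from sklearn.model_selection import learning_curve
    
    def plot_learning_curve(plt, estimator, title, X, y, ylim=None, cv=None):
        #Plot mean training and cross-validation scores against training-set size
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel("Training examples")
        plt.ylabel("Score")
        train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=cv)
        plt.grid()
        plt.plot(train_sizes, np.mean(train_scores, axis=1), 'o-', color='r',
                 label='Training score')
        plt.plot(train_sizes, np.mean(test_scores, axis=1), 'o-', color='g',
                 label='Cross-validation score')
        plt.legend(loc='best')
        return plt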
    
