  • Linear Regression with scikit-learn: A Complete Walkthrough

    1. Model Optimization

    1.1 Polynomial Features and Linear Regression

    If a linear regression model is too simple and underfits the data, we can add polynomial features to make it fit better. For example, given two features x1 and x2, we can add their product as a new feature x3 = x1 * x2, and also add x1^2 as another new feature x4.
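
    To see concretely what this expansion looks like, here is a minimal sketch for two features (the sample values are made up for illustration):

    #Sketch: expand two features [x1, x2] into degree-2 polynomial features
    import numpy as np
    from sklearn.preprocessing import PolynomialFeatures
    
    X = np.array([[2.0, 3.0]])            #one sample: x1 = 2, x2 = 3
    poly = PolynomialFeatures(degree = 2, include_bias = False)
    print(poly.fit_transform(X))          #[[2. 3. 4. 6. 9.]] -> x1, x2, x1^2, x1*x2, x2^2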

    In scikit-learn, linear regression is implemented by the class sklearn.linear_model.LinearRegression, and polynomial features by the class sklearn.preprocessing.PolynomialFeatures. To add polynomial features, we chain the two together with a pipeline using sklearn.pipeline.Pipeline:

    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.pipeline import Pipeline
    
    def polynomial_model(degree = 1):
        polynomial_features = PolynomialFeatures(degree = degree,
                                                include_bias = False)
        linear_regression = LinearRegression()
        #A pipeline: first expand the features to the given polynomial degree,
        #then fit the expanded features with linear regression
        pipeline = Pipeline([("polynomial_features",polynomial_features),
                           ("linear_regression",linear_regression)])
        return pipeline
    

    For a detailed explanation of the differences between fit, fit_transform and transform, see: https://blog.csdn.net/weixin_38278334/article/details/82971752
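
    In short: fit learns parameters from the training data, transform applies them, and fit_transform does both in one step. A minimal sketch with StandardScaler (example values made up):

    import numpy as np
    from sklearn.preprocessing import StandardScaler
    
    X_train = np.array([[1.0],[2.0],[3.0]])
    X_test = np.array([[4.0]])
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)   #fit (learn mean/std) + transform
    X_test_scaled = scaler.transform(X_test)         #apply the training mean/std to new data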

    In scikit-learn, when running linear regression with LinearRegression you can pass normalize = True to normalize the input data. (Note: this parameter was deprecated in scikit-learn 1.0 and removed in 1.2; see the StandardScaler pipeline sketch in section 3.2 for the modern replacement.)

    2. Example: Fitting a Sine Function with Linear Regression

    #Generate 200 points on a sine curve over [-2π, 2π] and add random noise to them
    import numpy as np
    n_dots = 200
    
    X = np.linspace(-2 * np.pi,2 * np.pi,n_dots)
    Y = np.sin(X) + 0.2 * np.random.rand(n_dots) - 0.1
    X = X.reshape(-1,1)
    Y = Y.reshape(-1,1)
    
    #Fit the dataset with polynomials of degree 2, 3, 5 and 10
    from sklearn.metrics import mean_squared_error
    
    degrees = [2,3,5,10]
    results = []
    for d in degrees:
        model = polynomial_model(degree=d)
        model.fit(X,Y)
        train_score = model.score(X,Y)
        mse = mean_squared_error(Y,model.predict(X))
        results.append({"model":model,"degree":d,"score":
                           train_score,"mse":mse})
    for r in results:
        print("degree: {};train score: {};mean squared error: {}".format(
            r["degree"],r["score"],r["mse"]))
    
    degree: 2;train score: 0.14691964884268827;mean squared error: 0.4337561603823593
    degree: 3;train score: 0.2725519790368923;mean squared error: 0.3698773040811927
    degree: 5;train score: 0.8949982058380093;mean squared error: 0.053389079946778877
    degree: 10;train score: 0.9936659355081904;mean squared error: 0.0032206104499468945
    
    results
    
    [{'model': Pipeline(steps=[('polynomial_features', PolynomialFeatures(include_bias=False)),
                      ('linear_regression', LinearRegression())]),
      'degree': 2,
      'score': 0.14691964884268827,
      'mse': 0.4337561603823593},
     {'model': Pipeline(steps=[('polynomial_features',
                       PolynomialFeatures(degree=3, include_bias=False)),
                      ('linear_regression', LinearRegression())]),
      'degree': 3,
      'score': 0.2725519790368923,
      'mse': 0.3698773040811927},
     {'model': Pipeline(steps=[('polynomial_features',
                       PolynomialFeatures(degree=5, include_bias=False)),
                      ('linear_regression', LinearRegression())]),
      'degree': 5,
      'score': 0.8949982058380093,
      'mse': 0.053389079946778877},
     {'model': Pipeline(steps=[('polynomial_features',
                       PolynomialFeatures(degree=10, include_bias=False)),
                      ('linear_regression', LinearRegression())]),
      'degree': 10,
      'score': 0.9936659355081904,
      'mse': 0.0032206104499468945}]
    

    mean_squared_error computes the mean squared error, i.e. the average squared distance between the actual points and the model's predictions; the smaller it is, the better the model fits the data.
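
    As a quick sanity check (a sketch with made-up numbers), mean_squared_error is just the average of the squared residuals:

    import numpy as np
    from sklearn.metrics import mean_squared_error
    
    y_true = np.array([1.0, 2.0, 3.0])
    y_pred = np.array([1.1, 1.9, 3.2])
    print(mean_squared_error(y_true, y_pred))   #0.02
    print(np.mean((y_true - y_pred) ** 2))      #same value: (0.01 + 0.01 + 0.04) / 3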

    #Plot the fit of each model
    from matplotlib.figure import SubplotParams
    import matplotlib.pyplot as plt
    
    plt.figure(figsize = (12,6),dpi = 200, subplotpars = SubplotParams(hspace = 0.3))
    for i,r in enumerate(results):
        plt.subplot(2,2,i+1)
        plt.xlim(-8,8)
        plt.title("LinearRegression degree={}".format(r['degree']))
        plt.scatter(X,Y,s = 5,c = 'b',alpha = 0.5)
        plt.plot(X,r['model'].predict(X),'r-')
    


    3. Example: Predicting House Prices

    We use the Boston house-price dataset that ships with scikit-learn to train a model, then use that model to predict house prices.

    The dataset contains 13 features:

    • CRIM: per-capita crime rate by town.
    • ZN: proportion of residential land zoned for lots over 25,000 square feet.
    • INDUS: proportion of non-retail business land per town.
    • CHAS: whether the tract bounds the Charles River (1 = yes, 0 = no).
    • NOX: nitric oxide concentration.
    • RM: average number of rooms per dwelling.
    • AGE: proportion of owner-occupied units built before 1940.
    • DIS: weighted distance to Boston's employment centers.
    • RAD: index of accessibility to radial highways.
    • TAX: property-tax rate per $10,000.
    • PTRATIO: pupil-teacher ratio by town.
    • B: proportion of Black residents by town.
    • LSTAT: percentage of lower-status population.
    #Load the data
    from sklearn.datasets import load_boston
    
    boston = load_boston()
    X = boston.data
    y = boston.target
    X.shape
    
    (506, 13)
    
    X[0]
    
    array([6.320e-03, 1.800e+01, 2.310e+00, 0.000e+00, 5.380e-01, 6.575e+00,
           6.520e+01, 4.090e+00, 1.000e+00, 2.960e+02, 1.530e+01, 3.969e+02,
           4.980e+00])
    
    #Inspect the feature names
    boston.feature_names
    
    array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
           'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')
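
    Note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2. On a recent version, one possible workaround (a sketch, assuming network access to OpenML) is:

    from sklearn.datasets import fetch_openml
    
    boston = fetch_openml(name = "boston", version = 1, as_frame = False)
    X = boston.data.astype(float)     #some columns may arrive as strings from OpenML
    y = boston.target.astype(float)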
    

    3.1 模型训练

    #Split the dataset into a training set and a test set
    from sklearn.model_selection import train_test_split
    
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size =0.2
                                                     ,random_state = 2)
    
    #Train the model and evaluate its scores on the training and test sets
    import time
    from sklearn.linear_model import LinearRegression
    
    model = LinearRegression()
    
    start = time.perf_counter()
    model.fit(X_train,y_train)
    #train_score must be computed here; the original cell skipped this line, so the
    #value it printed below (0.993666) was left over from the degree-10 sine fit above
    train_score = model.score(X_train,y_train)
    cv_score = model.score(X_test,y_test)
    print('elapsed: {0:.6f};train_score: {1:0.6f};cv_score: {2:.6f}'.format(
        time.perf_counter() - start,train_score,cv_score))
    
    elapsed: 0.001908;train_score: 0.993666;cv_score: 0.778921
    

    3.2 Model Optimization

    #Normalize the data (note: normalize was removed from LinearRegression in scikit-learn 1.2)
    model = LinearRegression(normalize = True)
    

    Normalizing the data only speeds up the algorithm's convergence and makes training more efficient; it does not improve the model's accuracy.
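
    Since normalize = True is gone from recent scikit-learn, a common replacement (a sketch; note that normalize= scaled by the l2-norm while StandardScaler scales by the standard deviation, though plain LinearRegression's predictions are unaffected either way) is to put a StandardScaler in front of the regressor inside the pipeline:

    from sklearn.linear_model import LinearRegression
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    
    model = Pipeline([("scaler",StandardScaler()),
                      ("linear_regression",LinearRegression())])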

    #Add polynomial features to increase the model's complexity
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.pipeline import Pipeline
    
    def polynomial_model(degree = 1):
        polynomial_features = PolynomialFeatures(degree = degree,
                                                 include_bias = False)
        linear_regression = LinearRegression(normalize = True)
        pipeline = Pipeline([("polynomial_features",polynomial_features),(
            "linear_regression",linear_regression)])
        return pipeline
    
    #Fit the data with a degree-2 polynomial
    model = polynomial_model(degree = 2)
    
    start = time.perf_counter()
    model.fit(X_train,y_train)
    
    train_score = model.score(X_train,y_train)
    cv_score = model.score(X_test,y_test)
    print('elapsed: {0:.6f};train_score: {1:0.6f};cv_score: {2:.6f}'.format(
            time.perf_counter() - start,train_score,cv_score))
    
    elapsed: 0.034632;train_score: 0.929593;cv_score: 0.896364
    
    #Fit the data with a degree-3 polynomial
    model = polynomial_model(degree = 3)
    
    start = time.perf_counter()
    model.fit(X_train,y_train)
    
    train_score = model.score(X_train,y_train)
    cv_score = model.score(X_test,y_test)
    print('elapsed: {0:.6f};train_score: {1:0.6f};cv_score: {2:.6f}'.format(
            time.perf_counter() - start,train_score,cv_score))
    
    elapsed: 0.161353;train_score: 1.000000;cv_score: -318.549144
    

    The degree-3 polynomial overfits: the training score is a perfect 1.000000, while the cross-validation score is strongly negative, i.e. worse than simply predicting the mean.

    There are 13 input features in total. How many features are added when going from degree 1 to degree 2? With include_bias = False, a degree-2 expansion yields the 13 original features, their 13 squares and C(13,2) = 78 pairwise products, 104 features in all, i.e. 91 more than before; this can be verified directly, as in the sketch below.
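
    A quick check of that count (a sketch; the zeros matrix is just a stand-in with the shape of the Boston data):

    import numpy as np
    from sklearn.preprocessing import PolynomialFeatures
    
    poly = PolynomialFeatures(degree = 2, include_bias = False).fit(np.zeros((1,13)))
    print(poly.n_output_features_)   #104 = 13 originals + 13 squares + 78 pairwise products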

    3.3 Learning Curves

    from common.utils import plot_learning_curve
    from sklearn.model_selection import ShuffleSplit
    
    cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
    title = 'Learning Curves (degree={0})'
    degrees = [1, 2, 3]
    
    #time.clock() was removed in Python 3.8; use time.perf_counter() instead
    start = time.perf_counter()
    plt.figure(figsize=(18, 4), dpi=200)
    for i in range(len(degrees)):
        plt.subplot(1, 3, i + 1)
        plot_learning_curve(plt, polynomial_model(degrees[i]), title.format(degrees[i]), X, y, ylim=(0.01, 1.01), cv=cv)
    
    print('elapsed: {0:.6f}'.format(time.perf_counter()-start))
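
    common.utils here is the author's own helper module, not part of scikit-learn. If you don't have it, a minimal stand-in (a sketch built on sklearn.model_selection.learning_curve, not the original implementation) could look like this:

    import numpy as np
    from sklearn.model_selection import learning_curve
    
    def plot_learning_curve(plt, estimator, title, X, y, ylim=None, cv=None):
        #Plot mean training and cross-validation scores against training-set size
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel("Training examples")
        plt.ylabel("Score")
        train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=cv)
        plt.grid()
        plt.plot(train_sizes, np.mean(train_scores, axis=1), 'o-', color='r',
                 label='Training score')
        plt.plot(train_sizes, np.mean(test_scores, axis=1), 'o-', color='g',
                 label='Cross-validation score')
        plt.legend(loc='best')
        return plt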
    
