  • Data Mining in Practice (25): Hands-on Building Energy Score Prediction Report (Part 3)

    5 Build baseline models and try multiple algorithms

    # The earlier parts focused on data preparation; this part focuses on modeling.
    # Import the required packages.
    # Data analysis libraries
    import pandas as pd
    import numpy as np
    
    # Silence pandas chained-assignment warnings; show up to 60 columns
    pd.options.mode.chained_assignment = None
    pd.set_option('display.max_columns', 60)
    
    # Visualization
    import matplotlib.pyplot as plt
    %matplotlib inline
    
    # Font size
    plt.rcParams['font.size'] = 24
    
    from IPython.core.pylabtools import figsize
    
    # Seaborn: statistical visualization
    import seaborn as sns
    sns.set(font_scale = 2)
    
    # Preprocessing: missing-value imputation and min-max scaling
    from sklearn.preprocessing import Imputer, MinMaxScaler
    
    # Machine learning algorithms
    from sklearn.linear_model import LinearRegression
    from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
    from sklearn.svm import SVR
    from sklearn.neighbors import KNeighborsRegressor
    
    # Hyperparameter tuning tools
    from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
    
    
    import warnings
    warnings.filterwarnings("ignore")
    • Load the data saved in the previous part
    # Read in data into dataframes 
    train_features = pd.read_csv('data/training_features.csv')
    test_features = pd.read_csv('data/testing_features.csv')
    train_labels = pd.read_csv('data/training_labels.csv')
    test_labels = pd.read_csv('data/testing_labels.csv')
    
    # Display sizes of data
    print('Training Feature Size: ', train_features.shape)
    print('Testing Feature Size:  ', test_features.shape)
    print('Training Labels Size:  ', train_labels.shape)
    print('Testing Labels Size:   ', test_labels.shape)
    Training Feature Size:  (6622, 67)
    Testing Feature Size:   (2839, 67)
    Training Labels Size:   (6622, 1)
    Testing Labels Size:    (2839, 1)
    train_features.head()

    5.1 Missing-value imputation

    imputer = Imputer(strategy='median') # the data has large outliers in both directions, so the median is a safer fill value than the mean
    # Fit on the training features only
    imputer.fit(train_features)
    
    # Transform both splits with the medians learned from the training data
    X = imputer.transform(train_features)      # training set with missing values filled
    X_test = imputer.transform(test_features)  # test set with missing values filled
    # np.isnan performs an element-wise NaN check
    print('Missing values in training features: ', np.sum(np.isnan(X)))  # 0 means imputation is complete
    print('Missing values in testing features:  ', np.sum(np.isnan(X_test)))
    Missing values in training features:  0
    Missing values in testing features:   0
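
    Note: sklearn.preprocessing.Imputer was removed in scikit-learn 0.22, so the cell above requires an older version. On newer versions, an equivalent sketch (assuming the same dataframes) uses sklearn.impute.SimpleImputer:

    # Sketch for scikit-learn >= 0.22, where Imputer no longer exists
    from sklearn.impute import SimpleImputer
    
    imputer = SimpleImputer(strategy='median')   # median is robust to the outliers noted above
    imputer.fit(train_features)                  # learn medians from the training data only
    X = imputer.transform(train_features)
    X_test = imputer.transform(test_features)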

    5.2 Feature scaling (min-max normalization)

    scaler = MinMaxScaler(feature_range=(0, 1))
    
    # Fit the scaler on the training data only
    scaler.fit(X)
    
    # Scale both splits into the (0, 1) range
    X = scaler.transform(X)
    X_test = scaler.transform(X_test)
    # The labels are single-column dataframes; reshape((-1,)) flattens each
    # into the 1-D array that scikit-learn estimators expect
    y = np.array(train_labels).reshape((-1,))
    y_test = np.array(test_labels).reshape((-1,))
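
    The imputation and scaling steps can also be chained with a scikit-learn Pipeline, which guarantees both are fit on the training split only. A minimal sketch, assuming scikit-learn >= 0.22 (on older versions substitute Imputer for SimpleImputer):

    from sklearn.pipeline import Pipeline
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import MinMaxScaler
    
    preprocess = Pipeline([
        ('impute', SimpleImputer(strategy='median')),  # fill NaNs with training medians
        ('scale', MinMaxScaler(feature_range=(0, 1))), # then rescale to (0, 1)
    ])
    
    X = preprocess.fit_transform(train_features)  # fit statistics on train only
    X_test = preprocess.transform(test_features)  # reuse them on the test split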

    6 Build baseline models and try multiple algorithms (regression)

    1. Linear Regression
    2. Support Vector Machine Regression
    3. Random Forest Regression
    4. Gradient Boosting Regression
    5. K-Nearest Neighbors Regression

    6.1 Define the evaluation metric (MAE)

    # The evaluation metric here is MAE; abs() takes the absolute value
    def mae(y_true, y_pred):
        return np.mean(abs(y_true - y_pred))
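
    This hand-rolled metric matches scikit-learn's built-in version, so either can be used:

    from sklearn.metrics import mean_absolute_error
    # mean_absolute_error(y_true, y_pred) computes the same np.mean(abs(...)) quantity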
    
    
    # Helper: train a model, then evaluate it on the test set
    def fit_and_evaluate(model):
        
        # Train the model on the training set
        model.fit(X, y)
        
        # Predict on the test set and score with MAE
        model_pred = model.predict(X_test)
        model_mae = mae(y_test, model_pred)
        
        
        return model_mae

    6.2 Choosing a machine learning algorithm

    lr = LinearRegression() # linear regression
    lr_mae = fit_and_evaluate(lr)
    
    print('Linear Regression Performance on the test set: MAE = %0.4f' % lr_mae)
    Linear Regression Performance on the test set: MAE = 12.9817
    svm = SVR(C = 1000, gamma = 0.1) # support vector machine
    svm_mae = fit_and_evaluate(svm)
    
    print('Support Vector Machine Regression Performance on the test set: MAE = %0.4f' % svm_mae)
    Support Vector Machine Regression Performance on the test set: MAE = 10.9336
    random_forest = RandomForestRegressor(random_state=60) # random forest ensemble
    random_forest_mae = fit_and_evaluate(random_forest)
    
    print('Random Forest Regression Performance on the test set: MAE = %0.4f' % random_forest_mae)
    Random Forest Regression Performance on the test set: MAE = 10.0941
    gradient_boosted = GradientBoostingRegressor(random_state=60) # gradient boosted trees
    gradient_boosted_mae = fit_and_evaluate(gradient_boosted)
    
    print('Gradient Boosted Regression Performance on the test set: MAE = %0.4f' % gradient_boosted_mae)
    Gradient Boosted Regression Performance on the test set: MAE = 9.9893
    knn = KNeighborsRegressor(n_neighbors=10) # k-nearest neighbors
    knn_mae = fit_and_evaluate(knn)
    
    print('K-Nearest Neighbors Regression Performance on the test set: MAE = %0.4f' % knn_mae)
    K-Nearest Neighbors Regression Performance on the test set: MAE = 12.6722
    plt.style.use('fivethirtyeight') 
    figsize(8, 6)
    
    
    model_comparison = pd.DataFrame({'model': ['Linear Regression', 'Support Vector Machine',
                                               'Random Forest', 'Gradient Boosted',
                                                'K-Nearest Neighbors'],
                                     'mae': [lr_mae, svm_mae, random_forest_mae, 
                                             gradient_boosted_mae, knn_mae]})
    
    # ascending=False sorts MAE from largest to smallest, so the worst model is drawn first
    # kind='barh' draws a horizontal bar chart
    model_comparison.sort_values('mae', ascending = False).plot(x = 'model', y = 'mae', kind = 'barh',
                                                               color = 'red', edgecolor = 'black')
    
    # y-axis lists the model names, x-axis shows test MAE; xticks/yticks set the tick label sizes
    plt.ylabel(''); plt.yticks(size = 14); plt.xlabel('Mean Absolute Error'); plt.xticks(size = 14)
    plt.title('Model Comparison on Test MAE', size = 20);
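
    The five fit-and-evaluate cells above can also be driven by a single loop over a model dictionary; a sketch using the same models and the fit_and_evaluate helper defined earlier:

    models = {
        'Linear Regression': LinearRegression(),
        'Support Vector Machine': SVR(C=1000, gamma=0.1),
        'Random Forest': RandomForestRegressor(random_state=60),
        'Gradient Boosted': GradientBoostingRegressor(random_state=60),
        'K-Nearest Neighbors': KNeighborsRegressor(n_neighbors=10),
    }
    maes = {name: fit_and_evaluate(m) for name, m in models.items()}
    # Print best to worst
    for name, score in sorted(maes.items(), key=lambda kv: kv[1]):
        print('%-25s MAE = %0.4f' % (name, score))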

    7 Hyperparameter tuning

    7.1 Tuning

    # Loss functions to try ('ls' = least squares, 'lad' = least absolute deviation;
    # these are older scikit-learn names, renamed in 1.0 to 'squared_error'/'absolute_error')
    loss = ['ls', 'lad', 'huber']
    
    # Number of weak learners (decision trees) in the ensemble
    n_estimators = [100, 500, 900, 1100, 1500]
    
    # Maximum depth of each tree
    max_depth = [2, 3, 5, 10, 15]
    
    # Minimum number of samples required at a leaf node
    min_samples_leaf = [1, 2, 4, 6, 8]
    
    # Minimum number of samples required to split an internal node
    min_samples_split = [2, 4, 6, 10]
    
    # Combine everything into one search grid
    hyperparameter_grid = {'loss': loss,
                           'n_estimators': n_estimators,
                           'max_depth': max_depth,
                           'min_samples_leaf': min_samples_leaf,
                           'min_samples_split': min_samples_split}
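
    The full grid covers 3 × 5 × 5 × 5 × 4 = 1500 parameter combinations. RandomizedSearchCV below samples only 25 of them (n_iter=25), which at 4 CV folds means 100 fits instead of 6000.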
    model = GradientBoostingRegressor(random_state = 42)
    
    
    random_cv = RandomizedSearchCV(estimator=model,
                                   param_distributions=hyperparameter_grid,
                                   cv=4, n_iter=25,
                                   scoring = 'neg_mean_absolute_error', # MAE, negated so that higher is better
                                   n_jobs = -1, verbose = 1,
                                   return_train_score = True,
                                   random_state=42)
    # Note: this search is slow, roughly 10 minutes on this machine
    random_cv.fit(X, y)
    Fitting 4 folds for each of 25 candidates, totalling 100 fits
    
     
    [Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
    [Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.4min
    [Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  9.7min finished

    Output:
    RandomizedSearchCV(cv=4, error_score='raise-deprecating',
                       estimator=GradientBoostingRegressor(alpha=0.9,
                                                           criterion='friedman_mse',
                                                           init=None,
                                                           learning_rate=0.1,
                                                           loss='ls', max_depth=3,
                                                           max_features=None,
                                                           max_leaf_nodes=None,
                                                           min_impurity_decrease=0.0,
                                                           min_impurity_split=None,
                                                           min_samples_leaf=1,
                                                           min_samples_split=2,
                                                           min_weight_fraction_leaf=0.0,
                                                           n_estimators=100,...
                                                           verbose=0,
                                                           warm_start=False),
                       iid='warn', n_iter=25, n_jobs=-1,
                       param_distributions={'loss': ['ls', 'lad', 'huber'],
                                            'max_depth': [2, 3, 5, 10, 15],
                                            'min_samples_leaf': [1, 2, 4, 6, 8],
                                            'min_samples_split': [2, 4, 6, 10],
                                            'n_estimators': [100, 500, 900, 1100,
                                                             1500]},
                       pre_dispatch='2*n_jobs', random_state=42, refit=True,
                       return_train_score=True, scoring='neg_mean_absolute_error',
                       verbose=1)
    random_cv.best_estimator_ # best estimator found by the random search
    Output: GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                              learning_rate=0.1, loss='lad', max_depth=5,
                              max_features=None, max_leaf_nodes=None,
                              min_impurity_decrease=0.0, min_impurity_split=None,
                              min_samples_leaf=4, min_samples_split=2,
                              min_weight_fraction_leaf=0.0, n_estimators=500,
                              n_iter_no_change=None, presort='auto',
                              random_state=42, subsample=1.0, tol=0.0001,
                              validation_fraction=0.1, verbose=0, warm_start=False)
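
    Besides best_estimator_, the search object also exposes the winning parameter dict and its cross-validated score (negated back into an MAE, since the scoring was neg_mean_absolute_error):

    print(random_cv.best_params_)                    # the dict behind best_estimator_ above
    print('CV MAE: %0.4f' % -random_cv.best_score_)  # flip the sign to recover MAE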

    # Grid of tree counts (n_estimators) to search over
    trees_grid = {'n_estimators': [100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800]}
    
    # Build the model with the other hyperparameters fixed near the random-search winner
    # 'lad' = least absolute deviation loss
    model = GradientBoostingRegressor(loss = 'lad', max_depth = 5,
                                      min_samples_leaf = 6,
                                      min_samples_split = 6,
                                      max_features = None,
                                      random_state = 42)
    
    # Grid-search over the number of trees only
    grid_search = GridSearchCV(estimator = model, param_grid=trees_grid, cv = 4, 
                               scoring = 'neg_mean_absolute_error', verbose = 1,
                               n_jobs = -1, return_train_score = True)
    # Takes a couple of minutes
    grid_search.fit(X, y)
    Fitting 4 folds for each of 15 candidates, totalling 60 fits
    
     
    [Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
    [Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.0min
    [Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  2.2min finished

    Output:

    GridSearchCV(cv=4, error_score='raise-deprecating',
                 estimator=GradientBoostingRegressor(alpha=0.9,
                                                     criterion='friedman_mse',
                                                     init=None, learning_rate=0.1,
                                                     loss='lad', max_depth=5,
                                                     max_features=None,
                                                     max_leaf_nodes=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=6,
                                                     min_samples_split=6,
                                                     min_weight_fraction_leaf=0.0,
                                                     n_estimators=100,
                                                     n_iter_no_change=None,
                                                     presort='auto',
                                                     random_state=42, subsample=1.0,
                                                     tol=0.0001,
                                                     validation_fraction=0.1,
                                                     verbose=0, warm_start=False),
                 iid='warn', n_jobs=-1,
                 param_grid={'n_estimators': [100, 150, 200, 250, 300, 350, 400,
                                              450, 500, 550, 600, 650, 700, 750,
                                              800]},
                 pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
                 scoring='neg_mean_absolute_error', verbose=1)

    7.2 Comparing training and testing error

    # Put the cross-validation results into a DataFrame
    results = pd.DataFrame(grid_search.cv_results_)
    
    # Plot training and testing error against the number of trees
    figsize(8, 8)
    plt.style.use('fivethirtyeight')
    
    plt.plot(results['param_n_estimators'], -1 * results['mean_test_score'], label = 'Testing Error')
    plt.plot(results['param_n_estimators'], -1 * results['mean_train_score'], label = 'Training Error')
    # x-axis: number of trees; y-axis: MAE
    plt.xlabel('Number of Trees'); plt.ylabel('Mean Absolute Error'); plt.legend();
    plt.title('Performance vs Number of Trees');
    # Training error keeps falling while testing error flattens out, so the gap
    # between the two curves widens as trees are added: a sign of overfitting
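
    One way to curb this overfitting, besides capping n_estimators, is the early stopping built into GradientBoostingRegressor on scikit-learn >= 0.20. A sketch with illustrative values, keeping the pre-1.0 'lad' loss name used throughout this post:

    # Stop adding trees once a held-out slice of the training data stops improving
    early_stop_model = GradientBoostingRegressor(
        loss='lad', max_depth=5, min_samples_leaf=6, min_samples_split=6,
        n_estimators=800,         # upper bound on the number of trees
        validation_fraction=0.1,  # hold out 10% of the training data
        n_iter_no_change=10,      # stop after 10 rounds without improvement
        random_state=42)
    early_stop_model.fit(X, y)
    print('Trees actually used:', early_stop_model.n_estimators_)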

    8 Evaluation and testing: the gap between predictions and true values

    # Baseline: the default (untuned) model
    default_model = GradientBoostingRegressor(random_state = 42)
    default_model.fit(X, y)
    # The best estimator refit by the grid search
    final_model = grid_search.best_estimator_
    
    final_model

    Output:

    GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                              learning_rate=0.1, loss='lad', max_depth=5,
                              max_features=None, max_leaf_nodes=None,
                              min_impurity_decrease=0.0, min_impurity_split=None,
                              min_samples_leaf=6, min_samples_split=6,
                              min_weight_fraction_leaf=0.0, n_estimators=800,
                              n_iter_no_change=None, presort='auto',
                              random_state=42, subsample=1.0, tol=0.0001,
                              validation_fraction=0.1, verbose=0, warm_start=False)
    default_pred = default_model.predict(X_test)
    final_pred = final_model.predict(X_test)
    print('Default model performance on the test set: MAE = %0.4f.' % mae(y_test, default_pred))
    print('Final model performance on the test set:   MAE = %0.4f.' % mae(y_test, final_pred))
    Default model performance on the test set: MAE = 9.9864.
    Final model performance on the test set:   MAE = 9.1172.
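
    Tuning lowers the test MAE from 9.9864 to 9.1172, a relative improvement of about 8.7% ((9.9864 - 9.1172) / 9.9864).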

    figsize(6, 6)
    
    # Residuals = predictions - true values; most fall within roughly +/-25 points
    residuals = final_pred - y_test
    
    plt.hist(residuals, color = 'red', bins = 20,
             edgecolor = 'black')
    plt.xlabel('Error'); plt.ylabel('Count')
    plt.title('Distribution of Residuals');
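
    The rough "within +/-25 points" reading off the histogram can be checked numerically from the same residuals array:

    # Quantify the spread of the residuals
    print('Median residual: %0.2f' % np.median(residuals))
    print('2.5th / 97.5th percentiles: %0.2f / %0.2f'
          % (np.percentile(residuals, 2.5), np.percentile(residuals, 97.5)))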

    9 Interpreting the model: feature selection based on importance

    import pandas as pd
    import numpy as np
    
    
    pd.options.mode.chained_assignment = None
    pd.set_option('display.max_columns', 60)
    
    
    import matplotlib.pyplot as plt
    %matplotlib inline
    
    
    plt.rcParams['font.size'] = 24
    
    from IPython.core.pylabtools import figsize
    
    import seaborn as sns
    
    sns.set(font_scale = 2)
    
    
    from sklearn.preprocessing import Imputer, MinMaxScaler
    
    from sklearn.linear_model import LinearRegression
    from sklearn.ensemble import GradientBoostingRegressor
    
    from sklearn import tree
    
    
    
    import warnings
    warnings.filterwarnings("ignore")
    • Reload the saved data and rebuild the final model from the previous section
    train_features = pd.read_csv('data/training_features.csv')
    test_features = pd.read_csv('data/testing_features.csv')
    train_labels = pd.read_csv('data/training_labels.csv')
    test_labels = pd.read_csv('data/testing_labels.csv')
    # Impute missing values with the median
    imputer = Imputer(strategy='median')
    
    # Fit on the training features
    imputer.fit(train_features)
    
    
    X = imputer.transform(train_features)
    X_test = imputer.transform(test_features)
    
    
    y = np.array(train_labels).reshape((-1,))
    y_test = np.array(test_labels).reshape((-1,))
    def mae(y_true, y_pred):
        return np.mean(abs(y_true - y_pred))
    model = GradientBoostingRegressor(loss='lad', max_depth=5, max_features=None,
                                      min_samples_leaf=6, min_samples_split=6, 
                                      n_estimators=800, random_state=42)
    
    model.fit(X, y)

    Output:

    GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                              learning_rate=0.1, loss='lad', max_depth=5,
                              max_features=None, max_leaf_nodes=None,
                              min_impurity_decrease=0.0, min_impurity_split=None,
                              min_samples_leaf=6, min_samples_split=6,
                              min_weight_fraction_leaf=0.0, n_estimators=800,
                              n_iter_no_change=None, presort='auto',
                              random_state=42, subsample=1.0, tol=0.0001,
                              validation_fraction=0.1, verbose=0, warm_start=False)
    # Use the tuned GBDT model as the final model
    model_pred = model.predict(X_test)
    
    print('Final Model Performance on the test set: MAE = %0.4f' % mae(y_test, model_pred))
    Final Model Performance on the test set: MAE = 9.1485
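
    Note that this MAE (9.1485) differs slightly from the 9.1172 obtained in section 8 even though the hyperparameters are identical; the likely cause is that this section skips the MinMaxScaler step, which can shift tree split thresholds marginally.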
    # Feature importances
    feature_results = pd.DataFrame({'feature': list(train_features.columns),  # all training feature names
                                    'importance': model.feature_importances_})
    
    # Sort by importance in descending order
    feature_results = feature_results.sort_values('importance', ascending = False).reset_index(drop=True)
    
    feature_results.head(10)

    figsize(12, 10)
    plt.style.use('fivethirtyeight')
    
    # Plot the ten most important features
    feature_results.loc[:9, :].plot(x = 'feature', y = 'importance', 
                                     edgecolor = 'k',
                                     kind='barh', color = 'blue'); # barh draws horizontal bars
    plt.xlabel('Relative Importance', size = 20); plt.ylabel('')
    plt.title('Feature Importances from Gradient Boosting', size = 30);

    most_important_features = feature_results['feature'][:10] # top 10 feature names
    # Column indices of those features in the original feature matrix (list comprehension)
    indices = [list(train_features.columns).index(x) for x in most_important_features]
    
    
    X_reduced = X[:, indices]
    X_test_reduced = X_test[:, indices]
    
    print('Most important training features shape: ', X_reduced.shape)
    print('Most important testing  features shape: ', X_test_reduced.shape)
    Most important training features shape:  (6622, 10)
    Most important testing  features shape:  (2839, 10)
    lr = LinearRegression()
    
    
    lr.fit(X, y)
    lr_full_pred = lr.predict(X_test)
    
    
    lr.fit(X_reduced, y)
    lr_reduced_pred = lr.predict(X_test_reduced)
    
    
    print('Linear Regression Full Results: MAE =    %0.4f.' % mae(y_test, lr_full_pred))
    print('Linear Regression Reduced Results: MAE = %0.4f.' % mae(y_test, lr_reduced_pred))
    Linear Regression Full Results: MAE =    12.9817.
    Linear Regression Reduced Results: MAE = 14.4826.
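
    Linear regression clearly loses accuracy with only the top 10 features. The same check can be run for the gradient boosting model itself; a sketch reusing the reduced matrices from above (refitting takes a few minutes):

    model_reduced = GradientBoostingRegressor(loss='lad', max_depth=5, max_features=None,
                                              min_samples_leaf=6, min_samples_split=6,
                                              n_estimators=800, random_state=42)
    model_reduced.fit(X_reduced, y)
    reduced_pred = model_reduced.predict(X_test_reduced)
    print('GBDT Reduced Results: MAE = %0.4f.' % mae(y_test, reduced_pred))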