zoukankan      html  css  js  c++  java
  • sklearn:随机森林_回归树_波士顿房价_填补缺失值

    • 分类树和回归树参数差别:
    1. criterion
      • 分类:使用基尼系数(gini,默认)或信息熵(entropy,即信息增益)来衡量不纯度。
      • 回归:
        • 均方误差MSE,使用均值。父节点与叶子节点之间均方误差的差额被用来选择特征(分枝)。MSE同时也是用于衡量模型质量的指标。均方误差本身恒为非负,但sklearn把它视为一种损失(loss),因此交叉验证的scoring使用neg_mean_squared_error,返回的是取负后的值。
        • 绝对误差mae,使用中值。
        • 注意:回归树的接口score默认返回的是R方(负无穷到1,越接近1越好),不是mse
    from sklearn.datasets import load_boston
    from sklearn.model_selection import cross_val_score
    from sklearn.ensemble import RandomForestRegressor
    
    # Load the Boston housing data (features in .data, target in .target).
    # NOTE(review): load_boston was removed in scikit-learn 1.2, so this
    # tutorial needs an older release -- confirm the installed version.
    boston = load_boston()

    # Import the submodule explicitly: a bare `import sklearn` does not by
    # itself guarantee that `sklearn.metrics` has been loaded.
    import sklearn.metrics
    # List every name accepted by the `scoring` parameter.
    # NOTE(review): SCORERS was deprecated in scikit-learn 1.1 and removed in
    # 1.3 in favour of sklearn.metrics.get_scorer_names() -- confirm before
    # running on a newer release.
    sorted(sklearn.metrics.SCORERS.keys())
    
    ['accuracy',
     'adjusted_mutual_info_score',
     'adjusted_rand_score',
     'average_precision',
     'balanced_accuracy',
     'brier_score_loss',
     'completeness_score',
     'explained_variance',
     'f1',
     'f1_macro',
     'f1_micro',
     'f1_samples',
     'f1_weighted',
     'fowlkes_mallows_score',
     'homogeneity_score',
     'jaccard',
     'jaccard_macro',
     'jaccard_micro',
     'jaccard_samples',
     'jaccard_weighted',
     'max_error',
     'mutual_info_score',
     'neg_log_loss',
     'neg_mean_absolute_error',
     'neg_mean_squared_error',
     'neg_mean_squared_log_error',
     'neg_median_absolute_error',
     'normalized_mutual_info_score',
     'precision',
     'precision_macro',
     'precision_micro',
     'precision_samples',
     'precision_weighted',
     'r2',
     'recall',
     'recall_macro',
     'recall_micro',
     'recall_samples',
     'recall_weighted',
     'roc_auc',
     'v_measure_score']
    
    # 10-fold cross-validation of a random forest regressor on the Boston data.
    # Valid `scoring` names are the keys of sklearn.metrics.SCORERS; when
    # `scoring` is omitted the default metric is R^2.
    regressor = RandomForestRegressor(n_estimators=100, random_state=0)
    cross_val_score(
        regressor,
        boston.data,
        boston.target,
        cv=10,
        scoring="neg_mean_squared_error",
    )
    # The call returns one score per fold, i.e. 10 values.
    
    array([-10.72900447,  -5.36049859,  -4.74614178, -20.84946337,
           -12.23497347, -17.99274635,  -6.8952756 , -93.78884428,
           -29.80411702, -15.25776814])
    
    
    

    用随机森林回归填补缺失值

    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from sklearn.datasets import load_boston
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import cross_val_score
    
    # Load the Boston housing data again for the imputation experiment.
    dataset = load_boston()
    dataset.data.shape  # shape of the feature matrix; the recorded output is (506, 13)
    
    (506, 13)
    
    # Keep the complete data as the reference to compare against.
    x_full = dataset.data
    y_full = dataset.target
    # The feature matrix is 2-D: rows are samples, columns are features.
    n_samples, n_features = x_full.shape
    n_samples, n_features
    
    (506, 13)
    
    # Decide what proportion of the entries should be turned into missing values.
    missing_rate = 0.5
    # Seeded generator so the same entries are blanked out on every run.
    rng = np.random.RandomState(0)
    # Number of entries to blank out: total entry count times the missing rate, rounded down.
    n_entries = n_samples * n_features
    n_missing_samples = int(np.floor(n_entries * missing_rate))
    n_missing_samples
    
    3289
    
    # Build the dataset with missing entries.

    # Work on copies so the complete data stays untouched.
    x_missing = x_full.copy()
    y_missing = y_full.copy()

    # Draw one random column index and one random row index per entry to blank out
    # (randint's upper bound is exclusive). The draws are independent, so the same
    # (row, column) pair can come up more than once and the share of NaN entries
    # can end up below missing_rate.
    missing_features = rng.randint(0, n_features, size=n_missing_samples)
    missing_samples = rng.randint(0, n_samples, size=n_missing_samples)

    # Blank out the selected entries, then wrap the array in a DataFrame.
    x_missing[(missing_samples, missing_features)] = np.nan
    x_missing = pd.DataFrame(x_missing)
    x_missing
    
    0 1 2 3 4 5 6 7 8 9 10 11 12
    0 NaN 18.0 NaN NaN 0.538 NaN 65.2 4.0900 1.0 296.0 NaN NaN 4.98
    1 0.02731 0.0 NaN 0.0 0.469 NaN 78.9 4.9671 2.0 NaN NaN 396.90 9.14
    2 0.02729 NaN 7.07 0.0 NaN 7.185 61.1 NaN 2.0 242.0 NaN NaN NaN
    3 NaN NaN NaN 0.0 0.458 NaN 45.8 NaN NaN 222.0 18.7 NaN NaN
    4 NaN 0.0 2.18 0.0 NaN 7.147 NaN NaN NaN NaN 18.7 NaN 5.33
    ... ... ... ... ... ... ... ... ... ... ... ... ... ...
    501 NaN NaN NaN 0.0 0.573 NaN 69.1 NaN 1.0 NaN 21.0 NaN 9.67
    502 0.04527 0.0 11.93 0.0 0.573 6.120 76.7 2.2875 1.0 273.0 NaN 396.90 9.08
    503 NaN NaN 11.93 NaN 0.573 6.976 91.0 NaN NaN NaN 21.0 NaN 5.64
    504 0.10959 0.0 11.93 NaN 0.573 NaN 89.3 NaN 1.0 NaN 21.0 393.45 6.48
    505 0.04741 0.0 11.93 0.0 0.573 6.030 NaN NaN 1.0 NaN NaN 396.90 7.88

    506 rows × 13 columns

    from sklearn.impute import SimpleImputer  # 专门用于填补缺失值的类
    
    # Baseline 1: replace every NaN with the mean of its column.
    imp_mean = SimpleImputer(strategy='mean', missing_values=np.nan)
    x_missing_mean = pd.DataFrame(imp_mean.fit_transform(x_missing))
    x_missing_mean
    
    0 1 2 3 4 5 6 7 8 9 10 11 12
    0 3.627579 18.000000 11.163464 0.066007 0.538000 6.305921 65.2 4.090000 1.000000 296.000000 18.521192 352.741952 4.980000
    1 0.027310 0.000000 11.163464 0.000000 0.469000 6.305921 78.9 4.967100 2.000000 405.935275 18.521192 396.900000 9.140000
    2 0.027290 10.722951 7.070000 0.000000 0.564128 7.185000 61.1 3.856371 2.000000 242.000000 18.521192 352.741952 12.991767
    3 3.627579 10.722951 11.163464 0.000000 0.458000 6.305921 45.8 3.856371 9.383871 222.000000 18.700000 352.741952 12.991767
    4 3.627579 0.000000 2.180000 0.000000 0.564128 7.147000 67.4 3.856371 9.383871 405.935275 18.700000 352.741952 5.330000
    ... ... ... ... ... ... ... ... ... ... ... ... ... ...
    501 3.627579 10.722951 11.163464 0.000000 0.573000 6.305921 69.1 3.856371 1.000000 405.935275 21.000000 352.741952 9.670000
    502 0.045270 0.000000 11.930000 0.000000 0.573000 6.120000 76.7 2.287500 1.000000 273.000000 18.521192 396.900000 9.080000
    503 3.627579 10.722951 11.930000 0.066007 0.573000 6.976000 91.0 3.856371 9.383871 405.935275 21.000000 352.741952 5.640000
    504 0.109590 0.000000 11.930000 0.066007 0.573000 6.305921 89.3 3.856371 1.000000 405.935275 21.000000 393.450000 6.480000
    505 0.047410 0.000000 11.930000 0.000000 0.573000 6.030000 67.4 3.856371 1.000000 405.935275 18.521192 396.900000 7.880000

    506 rows × 13 columns

    # Baseline 2: replace every NaN with the constant 0.
    imp_0 = SimpleImputer(strategy='constant', fill_value=0, missing_values=np.nan)
    x_missing_0 = pd.DataFrame(imp_0.fit_transform(x_missing))
    x_missing_0
    
    0 1 2 3 4 5 6 7 8 9 10 11 12
    0 0.00000 18.0 0.00 0.0 0.538 0.000 65.2 4.0900 1.0 296.0 0.0 0.00 4.98
    1 0.02731 0.0 0.00 0.0 0.469 0.000 78.9 4.9671 2.0 0.0 0.0 396.90 9.14
    2 0.02729 0.0 7.07 0.0 0.000 7.185 61.1 0.0000 2.0 242.0 0.0 0.00 0.00
    3 0.00000 0.0 0.00 0.0 0.458 0.000 45.8 0.0000 0.0 222.0 18.7 0.00 0.00
    4 0.00000 0.0 2.18 0.0 0.000 7.147 0.0 0.0000 0.0 0.0 18.7 0.00 5.33
    ... ... ... ... ... ... ... ... ... ... ... ... ... ...
    501 0.00000 0.0 0.00 0.0 0.573 0.000 69.1 0.0000 1.0 0.0 21.0 0.00 9.67
    502 0.04527 0.0 11.93 0.0 0.573 6.120 76.7 2.2875 1.0 273.0 0.0 396.90 9.08
    503 0.00000 0.0 11.93 0.0 0.573 6.976 91.0 0.0000 0.0 0.0 21.0 0.00 5.64
    504 0.10959 0.0 11.93 0.0 0.573 0.000 89.3 0.0000 1.0 0.0 21.0 393.45 6.48
    505 0.04741 0.0 11.93 0.0 0.573 6.030 0.0 0.0000 1.0 0.0 0.0 396.90 7.88

    506 rows × 13 columns

    # Impute with a random forest: use the available feature values together
    # with the label to predict the missing entries by regression.
    # Columns with fewer missing values are filled first.

    x_missing_reg = x_missing.copy()
    # Count the NaNs per column, then take the column positions ordered by that count.
    null_counts = x_missing_reg.isnull().sum(axis=0)
    sortindex = np.argsort(null_counts).values
    sortindex
    
    array([ 6, 12,  8,  7,  9,  0,  2,  1,  5,  4,  3, 10, 11], dtype=int64)
    
    # Fill the missing values column by column, in the order given by sortindex.
    # x_missing_reg is updated in place, so columns filled in earlier rounds
    # serve as complete predictors in later rounds.
    for i in sortindex:
        # Column i is the regression target for this round.
        fillc = x_missing_reg.iloc[:, i]
        missing_mask = fillc.isnull().values

        # Nothing to impute in this column; skipping also avoids calling
        # predict() on an empty array, which scikit-learn rejects.
        if not missing_mask.any():
            continue

        # Predictors: every other feature column plus the label y_full.
        df = pd.concat([x_missing_reg.drop(i, axis=1), pd.DataFrame(y_full)], axis=1)

        # NaNs still present in the predictor columns are replaced with 0
        # for this round only; x_missing_reg itself is not touched here.
        df_0 = SimpleImputer(missing_values=np.nan
                            , strategy='constant'
                            , fill_value=0
                            ).fit_transform(df)

        # Rows where the target is known train the model; rows where it is
        # missing are predicted. Boolean masks select rows by position, so the
        # split does not depend on the DataFrame index labels.
        y_train = fillc[~missing_mask]
        x_train = df_0[~missing_mask, :]
        x_test = df_0[missing_mask, :]

        # Fixed seed so the imputed values are reproducible, matching the
        # other estimators in this tutorial.
        rfr = RandomForestRegressor(n_estimators=100, random_state=0)
        rfr.fit(x_train, y_train)
        y_predict = rfr.predict(x_test)

        # Write the predictions into the missing slots of column i.
        x_missing_reg.loc[missing_mask, i] = y_predict
    
    # Fit the same model on each version of the data and compare the errors.

    # Order: complete data, mean-imputed, zero-imputed, regressor-imputed.
    X = [x_full, x_missing_mean, x_missing_0, x_missing_reg]

    mse = []
    std = []  # declared here but never filled in this section
    for x in X:
        estimator = RandomForestRegressor(n_estimators=100, random_state=0)
        fold_scores = cross_val_score(
            estimator, x, y_full, cv=5, scoring='neg_mean_squared_error'
        )
        # The scorer yields negated MSE; average over the folds and flip the sign.
        scores = fold_scores.mean()
        mse.append(-1 * scores)
    
    # Plot the results as a horizontal bar chart.

    # Labels must follow the order of the datasets in X:
    # full data, mean imputation, zero imputation, regressor imputation.
    x_labels = ['Full data'
                , 'Mean Imputation'
                , 'Zero Imputation'
                , 'Regressor Imputation'
               ]
    colors = ['r', 'g', 'b', 'orange']

    plt.figure(figsize=(12, 6))
    ax = plt.subplot(111)
    # One horizontal bar per dataset, drawn at y position i.
    for i, (score, color) in enumerate(zip(mse, colors)):
        ax.barh(i, score, color=color, alpha=0.6, align='center')

    ax.set_title('Imputation Techniques with Boston Data')
    # Narrow the x axis to the range of the observed MSE values so the
    # differences between the bars are easier to see.
    ax.set_xlim(left=np.min(mse) * 0.9,
                right=np.max(mse) * 1.1
               )
    ax.set_yticks(range(len(mse)))
    ax.set_xlabel('MSE')
    ax.set_yticklabels(x_labels)
    plt.show()
    

    png

  • 相关阅读:
    可执行
    创建
    可能的加分项
    给老师的建议
    周总结
    今日总结
    今日总结
    今日总结
    今日总结
    今日总结
  • 原文地址:https://www.cnblogs.com/jaysonteng/p/14226334.html
Copyright © 2011-2022 走看看