  • The complete workflow of a machine learning project

    A complete ML Project

    %matplotlib inline
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    

    Get data

    data = pd.read_csv(path)
    # Take a quick look at the columns
    data.head()
    
    # Check the dtypes and whether any values are missing
    data.info()
    
    # For object (or other non-numeric) columns, inspect the categories with value_counts()
    data["xx"].value_counts()
    
    data.describe()
    
    # Histograms: check whether the data has been scaled or capped and whether the units make sense
    # (especially for important features and the target); also look at skewness, kurtosis, etc.
    data.hist(bins=50, figsize=(20,15));
    

    Create a Training/Test Set

    If the training set is very large, you may want to sample an exploration set to make manipulations easy and fast.
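    For example, a minimal sketch of sampling an exploration set (assuming the DataFrame is called data as above; the fraction is illustrative):

    # Keep a smaller random sample for plotting and quick experiments
    explore_set = data.sample(frac=0.1, random_state=42)
    explore_set.describe()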

    random sampling methods: This is generally fine if your dataset is large enough (especially relative to the number of attributes); otherwise use stratified sampling.

    from sklearn.model_selection import train_test_split
    train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)
    
    # The following is a from-scratch version of the split, for understanding what it does
    def split_train_test(data, test_ratio):
        shuffled_indices = np.random.permutation(len(data))
        test_set_size = int(len(data) * test_ratio)
        test_indices = shuffled_indices[:test_set_size]
        train_indices = shuffled_indices[test_set_size:]
        return data.iloc[train_indices], data.iloc[test_indices]
    
    # Hash each instance's identifier so the split stays stable when the dataset is updated
    import hashlib
    
    def test_set_check(identifier, test_ratio, hash):
        return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio
    def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):
        ids = data[id_column]
        in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio, hash))
        return data.loc[~in_test_set], data.loc[in_test_set]
    # If the data has no identifier column, the row index can be used as an ID
    # (make sure new data is only appended at the end and no existing row is ever deleted):
    housing_with_id = housing.reset_index()   # adds an `index` column
    train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")
    # Alternatively, build an ID from the most stable features:
    housing_with_id["id"] = housing["longitude"] * 1000 + housing["latitude"]
    train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")
    

    stratified sampling: It is important to have a sufficient number of instances in your dataset for each stratum. This means that you should not have too many strata.

    # The histogram shows that most median incomes cluster around 2-5
    housing["median_income"].hist(bins=50)
    # The category proportions (after income_cat is created below) also show the concentration
    housing["income_cat"].value_counts() / len(housing)
    
    # To make the sample more representative, stratify on income. Compress the feature first
    # to limit the number of strata: divide by 1.5 and round up with ceil to get discrete categories
    housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
    # Note: DataFrame.where works differently from np.where; here everything above 5 is merged into category 5.0
    housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True)
    
    # With the strata defined, sample with StratifiedShuffleSplit. n_splits is the number of
    # train/test splits to generate; split() yields pairs of index ndarrays
    from sklearn.model_selection import StratifiedShuffleSplit
    
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    for train_index, test_index in split.split(housing, housing["income_cat"]):
        strat_train_set = housing.loc[train_index]
        strat_test_set = housing.loc[test_index]
    
    # Check that the category proportions in the test set match the overall proportions
    strat_test_set["income_cat"].value_counts() / len(strat_test_set)
    housing["income_cat"].value_counts() / len(housing)
    
    # Finally, drop the income_cat attribute that was only created for sampling
    for set_ in (strat_train_set, strat_test_set):
        set_.drop("income_cat", axis=1, inplace=True)
    

    When it is unclear which sampling method to use, implement both and compare them

    # Proportion of each income_cat category within a given dataset
    def income_cat_proportions(data):
        return data["income_cat"].value_counts() / len(data)
    
    # Build a DataFrame comparing the category proportions of the stratified and the random test set
    # with the overall data (test_set is the random split produced by train_test_split above)
    compare_props = pd.DataFrame({
        "Overall": income_cat_proportions(housing),
        "Stratified": income_cat_proportions(strat_test_set),
        "Random": income_cat_proportions(test_set),
    }).sort_index()
    compare_props["Rand. %error"] = 100 * compare_props["Random"] / compare_props["Overall"] - 100
    compare_props["Strat. %error"] = 100 * compare_props["Stratified"] / compare_props["Overall"] - 100
    

    Discover and visualize the data to gain insights

    # Work on a copy of the training set
    housing = strat_train_set.copy()
    

    Visualizing Geographical Data

    # A first quick look; alpha makes the points semi-transparent so density is visible
    housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)
    
    # Then add more features: s is the marker size (scale population down, otherwise the circles get huge);
    # c is the marker color, cmap=plt.get_cmap("jet") sets the colormap, colorbar toggles the color bar,
    # and sharex=False works around a display issue with the x-axis label
    housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
        s=housing["population"]/100, label="population", figsize=(10,7),
        c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,
        sharex=False)
    plt.legend()  # show the legend
    save_fig("housing_prices_scatterplot")  # save_fig is a small helper from the book's notebook
    # The plot shows that location and population are strongly related to the target (house value).
    # Clustering could be used to find population centers and add a feature measuring the distance to them;
    # distance to the ocean may also be useful (though in the north the relationship with the ocean is weak).
    

    Looking for Correlations

    # Linear correlation of each feature with the target
    corr_matrix = housing.corr()
    corr_matrix["median_house_value"].sort_values(ascending=False)
    
    # corr() only captures linear relationships, so it is worth looking at scatter plots too.
    # With many features, focus on a few of the most promising ones:
    from pandas.plotting import scatter_matrix
    
    attributes = ["median_house_value", "median_income", "total_rooms",
                  "housing_median_age"]
    scatter_matrix(housing[attributes], figsize=(12, 8))
    save_fig("scatter_matrix_plot")
    
    # Then zoom in on the most promising attribute
    housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1)
    plt.axis([0, 16, 0, 550000])
    save_fig("income_vs_house_value_scatterplot")
    

    Experimenting with Attribute Combinations

    After creating new attributes, check the correlations again; this is an iterative process

    # Build new features from what you know about the existing ones and their relationships
    housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
    housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
    housing["population_per_household"]=housing["population"]/housing["households"]
    
    # This step can be wrapped in a small transformer class
    from sklearn.base import BaseEstimator, TransformerMixin
    
    # column index
    rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6
    
    class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
        def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
            self.add_bedrooms_per_room = add_bedrooms_per_room
        def fit(self, X, y=None):
            return self  # nothing else to do
        def transform(self, X, y=None):
            rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
            population_per_household = X[:, population_ix] / X[:, household_ix]
            if self.add_bedrooms_per_room:
                bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
                return np.c_[X, rooms_per_household, population_per_household,
                             bedrooms_per_room]
            else:
                return np.c_[X, rooms_per_household, population_per_household]
    
    attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
    housing_extra_attribs = attr_adder.transform(housing.values)
    

    Prepare the Data for Machine Learning Algorithms

    # Separate the predictors from the labels in the training set
    housing = strat_train_set.drop("median_house_value", axis=1) # drop labels for training set
    housing_labels = strat_train_set["median_house_value"].copy()
    
    # Three options for handling missing values:
    # 1. Get rid of the corresponding districts.
    housing.dropna(subset=["total_bedrooms"])  # drops every row with NaN in that column
    # 2. Get rid of the whole attribute.
    housing.drop("total_bedrooms", axis=1)
    # 3. Set the missing values to some value (zero, the mean, the median, ...)
    median = housing["total_bedrooms"].median()
    housing["total_bedrooms"].fillna(median)
    # The third option is better done with scikit-learn.
    # Apply it to all numerical attributes, in case other attributes have missing values in future data.
    # The median only exists for numerical attributes, so drop the text attribute first
    housing_num = housing.drop('ocean_proximity', axis=1)
    
    from sklearn.preprocessing import Imputer  # renamed SimpleImputer (in sklearn.impute) in scikit-learn >= 0.20
    
    imputer = Imputer(strategy="median")
    imputer.fit(housing_num)
    imputer.statistics_  # the median of each attribute; imputer.strategy shows the strategy
    X = imputer.transform(housing_num)  # X is a NumPy array with the same shape as housing_num; convert back to a DataFrame
    housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=list(housing.index.values))
    
    # Handling Text and Categorical Attributes
    # Once scikit-learn 0.20 is released these encoders can be imported from sklearn.preprocessing
    # Convert text categories to numbers
    from future_encoders import OrdinalEncoder
    housing_cat = housing["ocean_proximity"]  # the text attribute
    ordinal_encoder = OrdinalEncoder()
    housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat.values.reshape(-1, 1))  # encoders expect 2-D input
    ordinal_encoder.categories_  # list the learned categories
    # After converting to integers, convert to one-hot vectors
    from sklearn.preprocessing import OneHotEncoder
    
    encoder = OneHotEncoder()
    housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))  # OneHotEncoder also expects a 2-D array
    housing_cat_1hot  # by default the output is a SciPy sparse matrix
    housing_cat_1hot.toarray()  # convert to a dense NumPy array, or set OneHotEncoder(sparse=False) up front
    
    # LabelBinarizer does both conversions (text -> integer -> one-hot) in one step;
    # CategoricalEncoder is still waiting to be released
    from sklearn.preprocessing import LabelBinarizer
    encoder = LabelBinarizer()
    housing_cat_1hot = encoder.fit_transform(housing_cat)
    housing_cat_1hot  # output is a dense NumPy array by default
    
    # Planned for a future version (in scikit-learn 0.20 this API was split into OrdinalEncoder and OneHotEncoder)
    from sklearn.preprocessing import CategoricalEncoder
    cat_encoder = CategoricalEncoder(encoding="onehot-dense")  # "onehot-dense" returns a dense NumPy array
    housing_cat_reshaped = housing_cat.values.reshape(-1, 1)
    housing_cat_1hot = cat_encoder.fit_transform(housing_cat_reshaped)
    housing_cat_1hot
    
    
    

    Feature Scaling

    It is important to fit the scalers to the training data only, not to the full dataset (including the test set). Only then can you use them to transform the training set and the test set (and new data).

    sklearn: MinMaxScaler and StandardScaler
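    A minimal sketch of that rule (the test-set variable name is illustrative; in this project the scaling is actually done inside the pipeline below):

    from sklearn.preprocessing import StandardScaler  # or MinMaxScaler
    
    scaler = StandardScaler()
    housing_num_scaled = scaler.fit_transform(housing_num)   # fit on the training data only
    # later, reuse the same fitted scaler:
    # X_test_num_scaled = scaler.transform(X_test_num)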

    Transformation Pipelines

    # Pipeline: all but the last estimator must be transformers. The pipeline exposes the methods of its
    # last estimator, so if that step is not a transformer there is no fit_transform(); calling fit()
    # fit_transforms every step except the last and then fits the last one.
    # FeatureUnion runs several pipelines in parallel and concatenates their outputs
    from sklearn.pipeline import Pipeline
    from sklearn.pipeline import FeatureUnion
    from sklearn.preprocessing import StandardScaler
    
    # Separate the numerical and the categorical attributes
    num_attribs = list(housing_num)
    cat_attribs = ["ocean_proximity"]
    
    num_pipeline = Pipeline([
            ('selector', DataFrameSelector(num_attribs)),
            ('imputer', Imputer(strategy="median")),
            ('attribs_adder', CombinedAttributesAdder()),
            ('std_scaler', StandardScaler()),
        ])
    
    cat_pipeline = Pipeline([
            ('selector', DataFrameSelector(cat_attribs)),
            ('cat_encoder', CategoricalEncoder(encoding="onehot-dense")),
        ])
    
    full_pipeline = FeatureUnion(transformer_list=[
            ("num_pipeline", num_pipeline),
            ("cat_pipeline", cat_pipeline),
        ])
    
    housing_prepared = full_pipeline.fit_transform(housing)
    housing_prepared
    
    # A small transformer that selects DataFrame columns and returns a NumPy array
    # (it must be defined before building the pipelines above)
    from sklearn.base import BaseEstimator, TransformerMixin
    
    class DataFrameSelector(BaseEstimator, TransformerMixin):
        def __init__(self, attribute_names):
            self.attribute_names = attribute_names
        def fit(self, X, y=None):
            return self
        def transform(self, X):
            return X[self.attribute_names].values
    

    Select and Train a Model

    Training and Evaluating on the Training Set

    First shortlist two to five promising models before tweaking the hyperparameters.
    
    1. Supervised learning
    2. Unsupervised learning
    # get_params() shows a model's current settings
    # Ordinary linear regression
    from sklearn.linear_model import LinearRegression
    lin_reg = LinearRegression()
    
    # SGD regression (default learning schedule; the fitted estimator's repr shows all parameters)
    from sklearn.linear_model import SGDRegressor
    sgd_reg = SGDRegressor(max_iter=50, penalty=None, eta0=0.1, random_state=42)
    # Batch and mini-batch GD are covered in the additional materials below
    
    # Polynomial regression
    from sklearn.preprocessing import PolynomialFeatures
    poly_features = PolynomialFeatures(degree=2, include_bias=False)
    X_poly = poly_features.fit_transform(X)  # then feed X_poly to ordinary linear regression
    
    # Ridge regression with an approximate closed-form solution using a matrix factorization technique (Cholesky)
    from sklearn.linear_model import Ridge
    ridge_reg = Ridge(alpha=1, solver="cholesky", random_state=42)
    
    # Logistic regression
    from sklearn.linear_model import LogisticRegression
    log_reg = LogisticRegression(random_state=42)
    log_reg.predict_proba()  # gives the probability of each class
    # For softmax regression, set multi_class="multinomial" and solver="lbfgs"
    
    # Decision trees
    from sklearn.tree import DecisionTreeClassifier
    tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42)
    tree_clf.predict_proba()
    from sklearn.tree import DecisionTreeRegressor
    tree_reg = DecisionTreeRegressor(random_state=42)
    
    # Random forest (decision trees + bagging)
    from sklearn.ensemble import RandomForestRegressor
    forest_reg = RandomForestRegressor(random_state=42)
    # Differences from bagging plain trees: no splitter (splits are random), no presort (off), no max_samples (fixed at 1.0)
    # Feature importances can be extracted:
    for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
        print(name, score)
    
    # KNN
    from sklearn.neighbors import KNeighborsClassifier
    knn_clf = KNeighborsClassifier()
    
    # SVM
    from sklearn.svm import LinearSVR
    svm_reg = LinearSVR(epsilon=1.5, random_state=42)
    from sklearn.svm import SVR
    svm_poly_reg = SVR(kernel="poly", degree=2, C=100, epsilon=0.1)  # for regression, a larger C means less regularization
    
    # LinearSVC regularizes the bias term, so the data should be centered first
    # For non-linearly separable data, apply PolynomialFeatures and then the model below;
    # if that creates too many features, use the kernel trick instead
    from sklearn.svm import LinearSVC
    svm_clf = LinearSVC(C=1, loss="hinge", random_state=42)  # does not provide per-class probabilities
    # For very large datasets (out-of-core training), SGDClassifier(loss="hinge", alpha=1/(m*C)) can be
    # used instead; it is roughly equivalent to SVC(kernel="linear", C=1) but converges more slowly.
    
    # Kernels
    from sklearn.svm import SVC
    poly_kernel_svm_clf = SVC(kernel="poly", degree=3, coef0=1, C=5)
    # coef0 controls how much the model is influenced by high-degree versus low-degree terms
    rbf_kernel_svm_clf = SVC(kernel="rbf", gamma=5, C=0.001)
    # SVC can be created with probability=True; it then uses cross-validation to estimate class
    # probabilities and gains a predict_proba() method, which slows training down
    
    # SGDClassifier
    from sklearn.linear_model import SGDClassifier
    sgd_clf = SGDClassifier(max_iter=5, random_state=42)
    # sgd_clf.decision_function([some_digit]) returns a score (one score per class in the multiclass case);
    # sgd_clf.classes_ shows which class each score corresponds to
    
    # Unsupervised learning
    # k-means
    from sklearn.cluster import KMeans
    kmeans = KMeans(n_clusters=4)  # defaults: n_init=10, init="k-means++"
    # with the default algorithm setting, dense data uses "elkan" and sparse data uses "full"
    # predict() on the training data gives the same result as labels_
    kmeans.transform(X_new)  # distance from each instance to every centroid
    # Choosing K
    kmeans_per_k = [KMeans(n_clusters=k, random_state=42).fit(X)
                    for k in range(1, 10)]
    inertias = [model.inertia_ for model in kmeans_per_k]
    
    # plt.figure(figsize=(8, 3.5))
    # plt.plot(range(1, 10), inertias, "bo-")
    # plt.xlabel("$k$", fontsize=14)
    # plt.ylabel("Inertia", fontsize=14)
    # plt.annotate('Elbow',
    #              xy=(4, inertias[3]),
    #              xytext=(0.55, 0.55),
    #              textcoords='figure fraction',
    #              fontsize=16,
    #              arrowprops=dict(facecolor='black', shrink=0.1)
    #             )
    # plt.axis([1, 8.5, 0, 1300])
    # save_fig("inertia_vs_k_diagram")
    # plt.show()
    
    from sklearn.metrics import silhouette_score
    silhouette_scores = [silhouette_score(X, model.labels_)
                         for model in kmeans_per_k[1:]]
    # plt.figure(figsize=(8, 3))
    # plt.plot(range(2, 10), silhouette_scores, "bo-")
    # plt.xlabel("$k$", fontsize=14)
    # plt.ylabel("Silhouette score", fontsize=14)
    # plt.axis([1.8, 8.5, 0.55, 0.7])
    # save_fig("silhouette_score_vs_k_diagram")
    # plt.show()
    
    #SpectralClustering
    from sklearn.cluster import SpectralClustering
    model = SpectralClustering(n_clusters=2, affinity='nearest_neighbors',
                               assign_labels='kmeans')
    
    #Gaussian Mixtures
    from sklearn.mixture import GaussianMixture
    gm = GaussianMixture(n_components=3, n_init=10, random_state=42)
    # Choosing the number of components
    gms_per_k = [GaussianMixture(n_components=k, n_init=10, random_state=42).fit(X)
                 for k in range(1, 11)]
    bics = [model.bic(X) for model in gms_per_k]
    aics = [model.aic(X) for model in gms_per_k]
    # Or use variational Bayesian Gaussian mixtures directly:
    # set n_components to a value that is certainly large enough and the model effectively selects the number itself
    from sklearn.mixture import BayesianGaussianMixture
    bgm = BayesianGaussianMixture(n_components=10, n_init=10)
    

    Evaluation

    1. Accuracy
    2. L2 and L1 errors
    3. P/R and ROC scores
    4. Cross-validation scores
    # Accuracy score
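    # A minimal sketch, assuming a classifier and labels like the sgd_clf / X_train / y_train_5 used
    # in the P/R and ROC examples below (they are not defined in these notes):
    # from sklearn.model_selection import cross_val_score
    # cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring="accuracy")
    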
    # L2 error (RMSE)
    from sklearn.metrics import mean_squared_error
    
    housing_predictions = lin_reg.predict(housing_prepared)
    lin_mse = mean_squared_error(housing_labels, housing_predictions)
    lin_rmse = np.sqrt(lin_mse)
    lin_rmse
    
    # L1 error (MAE)
    from sklearn.metrics import mean_absolute_error
    
    lin_mae = mean_absolute_error(housing_labels, housing_predictions)
    lin_mae
    
    # P/R and ROC scores
    from sklearn.model_selection import cross_val_predict
    y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)
    # (in some scikit-learn versions, method="decision_function" has a minor bug for binary classification)
    y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3, method="decision_function")
    # Random forests have no decision_function, so use method="predict_proba", which returns the
    # probability of each class for every instance; the probability of the positive class can then be
    # used directly as the score (in binary classification that is the second column):
    #y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv=3, method="predict_proba")
    #y_scores_forest = y_probas_forest[:, 1] # score = proba of positive class
    
    
    # P/R
    # from sklearn.metrics import confusion_matrix
    # from sklearn.metrics import precision_score, recall_score
    # from sklearn.metrics import f1_score # harmonic mean of P and R, dominated by the lower one; use average="macro" or "weighted" for multiclass
    # confusion_matrix(y_train_5, y_train_pred)
    
    # Plot
    # from sklearn.metrics import precision_recall_curve
    # precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)
    
    # def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    #     plt.plot(thresholds, precisions[:-1], "b--", label="Precision", linewidth=2)
    #     plt.plot(thresholds, recalls[:-1], "g-", label="Recall", linewidth=2)
    #     plt.xlabel("Threshold", fontsize=16)
    #     plt.legend(loc="upper left", fontsize=16)
    #     plt.ylim([0, 1])
    
    # plt.figure(figsize=(8, 4))
    # plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
    # plt.xlim([-700000, 700000])
    # save_fig("precision_recall_vs_threshold_plot")
    # plt.show()
    
    # ROC 
    # from sklearn.metrics import roc_curve
    # fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)
    
    # Plot
    # def plot_roc_curve(fpr, tpr, label=None):
    #     plt.plot(fpr, tpr, linewidth=2, label=label)
    #     plt.plot([0, 1], [0, 1], 'k--')
    #     plt.axis([0, 1, 0, 1])
    #     plt.xlabel('False Positive Rate', fontsize=16)
    #     plt.ylabel('True Positive Rate', fontsize=16)
    
    # plt.figure(figsize=(8, 6))
    # plot_roc_curve(fpr, tpr)
    # save_fig("roc_curve_plot")
    # plt.show()
    
    # Score
    # from sklearn.metrics import roc_auc_score
    # roc_auc_score(y_train_5, y_scores)
    
    # Cross-validation implemented from scratch
    from sklearn.model_selection import StratifiedKFold
    from sklearn.base import clone
    
    skfolds = StratifiedKFold(n_splits=3, random_state=42)
    
    for train_index, test_index in skfolds.split(X_train, y_train_5):
        clone_clf = clone(sgd_clf)
        X_train_folds = X_train[train_index]
        y_train_folds = (y_train_5[train_index])
        X_test_fold = X_train[test_index]
        y_test_fold = (y_train_5[test_index])
    
        clone_clf.fit(X_train_folds, y_train_folds)
        y_pred = clone_clf.predict(X_test_fold)
        n_correct = sum(y_pred == y_test_fold)
        print(n_correct / len(y_pred))
    
    # Cross-validation scores
    # Comparing the training-set score above with the CV scores shows whether the model is overfitting
    from sklearn.model_selection import cross_val_score
    
    # Linear regression
    lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                                 scoring="neg_mean_squared_error", cv=10)
    lin_rmse_scores = np.sqrt(-lin_scores)  # scoring is a utility function (greater is better), hence the minus sign
    # Decision tree
    scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
                             scoring="neg_mean_squared_error", cv=10)
    tree_rmse_scores = np.sqrt(-scores)
    # Random forest
    forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                                    scoring="neg_mean_squared_error", cv=10)
    forest_rmse_scores = np.sqrt(-forest_scores)
    
    def display_scores(scores):
        print("Scores:", scores)
        print("Mean:", scores.mean())
        print("Standard deviation:", scores.std())
    
    display_scores(lin_rmse_scores)
    display_scores(tree_rmse_scores)
    display_scores(forest_rmse_scores)
    

    Fine-Tune Your Model

    1. Grid Search
    2. Randomized Search
    #Grid Search
    from sklearn.model_selection import GridSearchCV
    
    param_grid = [
        # try 12 (3×4) combinations of hyperparameters
        {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
        # then try 6 (2×3) combinations with bootstrap set as False
        {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
      ]
    
    forest_reg = RandomForestRegressor(random_state=42)
    # train across 5 folds, that's a total of (12+6)*5=90 rounds of training 
    grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                               scoring='neg_mean_squared_error', return_train_score=True)
    grid_search.fit(housing_prepared, housing_labels)
    
    grid_search.best_params_
    
    grid_search.best_estimator_
    
    cvres = grid_search.cv_results_
    for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        print(np.sqrt(-mean_score), params)
    
    pd.DataFrame(grid_search.cv_results_)
    
    #Randomized Search
    from sklearn.model_selection import RandomizedSearchCV
    from scipy.stats import randint
    
    param_distribs = {
            'n_estimators': randint(low=1, high=200),
            'max_features': randint(low=1, high=8),
        }
    
    forest_reg = RandomForestRegressor(random_state=42)
    rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                    n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)
    rnd_search.fit(housing_prepared, housing_labels)
    
    cvres = rnd_search.cv_results_
    for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        print(np.sqrt(-mean_score), params)
    

    Analyze the Best Models and Their Errors

    • Identify the important features and drop the uninformative ones
    • adding extra features or, on the contrary, getting rid of uninformative ones, cleaning up outliers, etc.
    # Get the feature importances from the best model found by grid search
    feature_importances = grid_search.best_estimator_.feature_importances_  # importance of each attribute
    # Assemble the matching attribute names: the original numerical attributes, the combined attributes, and the one-hot categories
    extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
    cat_encoder = cat_pipeline.named_steps["cat_encoder"]
    cat_one_hot_attribs = list(cat_encoder.categories_[0])
    attributes = num_attribs + extra_attribs + cat_one_hot_attribs
    sorted(zip(feature_importances, attributes), reverse=True)
    

    Evaluate Your System on the Test Set

    # Further tweaks at this point would be unlikely to generalize to new data, so evaluate on the test set only once
    final_model = grid_search.best_estimator_
    
    X_test = strat_test_set.drop("median_house_value", axis=1)
    y_test = strat_test_set["median_house_value"].copy()
    
    X_test_prepared = full_pipeline.transform(X_test)  # note: only transform(), never fit_transform(), on the test set
    final_predictions = final_model.predict(X_test_prepared)
    
    final_mse = mean_squared_error(y_test, final_predictions)
    final_rmse = np.sqrt(final_mse)
    
    final_rmse
    
    # compute a 95% confidence interval for the test RMSE
    from scipy import stats
    
    confidence = 0.95
    squared_errors = (final_predictions - y_test) ** 2
    mean = squared_errors.mean()
    m = len(squared_errors)
    
    np.sqrt(stats.t.interval(confidence, m - 1,
                             loc=np.mean(squared_errors),
                             scale=stats.sem(squared_errors)))
    

    Additional materials

    1. Mini-batch and Batch gradient descent
    2. Learning curves
    3. Early Stopping
    4. Ensemble System
    5. Dimensionality Reduction

    Mini-batch and Batch gradient descent
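    The snippets below assume a small synthetic linear dataset with the bias column already added, as in the book's linear-regression example; a minimal setup sketch with illustrative values:

    np.random.seed(42)
    m = 100                                   # number of instances
    X = 2 * np.random.rand(m, 1)
    y = 4 + 3 * X + np.random.randn(m, 1)     # y = 4 + 3x + Gaussian noise
    X_b = np.c_[np.ones((m, 1)), X]           # add x0 = 1 to each instance
    X_new_b = np.c_[np.ones((2, 1)), np.array([[0.0], [2.0]])]  # two new points for predictions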

    # Mini-batch gradient descent
    theta_path_mgd = []
    
    n_iterations = 50
    minibatch_size = 20
    
    np.random.seed(42)
    theta = np.random.randn(2,1)  # random initialization
    
    t0, t1 = 200, 1000
    def learning_schedule(t):
        return t0 / (t + t1)
    
    t = 0
    for epoch in range(n_iterations):
        shuffled_indices = np.random.permutation(m)
        X_b_shuffled = X_b[shuffled_indices]
        y_shuffled = y[shuffled_indices]
        for i in range(0, m, minibatch_size):
            t += 1
            xi = X_b_shuffled[i:i+minibatch_size]
            yi = y_shuffled[i:i+minibatch_size]
            gradients = 2/minibatch_size * xi.T.dot(xi.dot(theta) - yi)
            eta = learning_schedule(t)
            theta = theta - eta * gradients
            theta_path_mgd.append(theta)
    
    # Batch GD
    eta = 0.1
    n_iterations = 1000
    m = 100
    theta = np.random.randn(2,1)
    
    for iteration in range(n_iterations):
        gradients = 2/m * X_b.T.dot(X_b.dot(theta) - y)
        theta = theta - eta * gradients
        
    # Make predictions with the learned parameters
    X_new_b.dot(theta)
    

    Learning curves

    from sklearn.pipeline import Pipeline
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import train_test_split
    
    def plot_learning_curves(model, X, y):
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=10)
        train_errors, val_errors = [], []
        for m in range(1, len(X_train)):
            model.fit(X_train[:m], y_train[:m])
            y_train_predict = model.predict(X_train[:m])
            y_val_predict = model.predict(X_val)
            train_errors.append(mean_squared_error(y_train[:m], y_train_predict))
            val_errors.append(mean_squared_error(y_val, y_val_predict))
    
        plt.plot(np.sqrt(train_errors), "r-+", linewidth=2, label="train")
        plt.plot(np.sqrt(val_errors), "b-", linewidth=3, label="val")
        plt.legend(loc="upper right", fontsize=14)   
        plt.xlabel("Training set size", fontsize=14)
        plt.ylabel("RMSE", fontsize=14)              
        
    
    polynomial_regression = Pipeline([
            ("poly_features", PolynomialFeatures(degree=10, include_bias=False)),
            ("lin_reg", LinearRegression()),
        ])
    
    plot_learning_curves(polynomial_regression, X, y)
    plt.axis([0, 80, 0, 3])           
    save_fig("learning_curves_plot")  
    plt.show()                        
    

    Early Stopping
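    The loop below assumes training and validation sets that have already been expanded with polynomial features and scaled; a minimal sketch of that preparation (the degree and the X_train/X_val/y_train/y_val split are assumptions, not given in these notes):

    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import PolynomialFeatures, StandardScaler
    
    poly_scaler = Pipeline([
            ("poly_features", PolynomialFeatures(degree=90, include_bias=False)),
            ("std_scaler", StandardScaler()),
        ])
    X_train_poly_scaled = poly_scaler.fit_transform(X_train)  # fit on the training split only
    X_val_poly_scaled = poly_scaler.transform(X_val)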

    from sklearn.base import clone
    sgd_reg = SGDRegressor(max_iter=1, warm_start=True, penalty=None,
                           learning_rate="constant", eta0=0.0005, random_state=42)
    
    minimum_val_error = float("inf")
    best_epoch = None
    best_model = None
    for epoch in range(1000):
        sgd_reg.fit(X_train_poly_scaled, y_train)  # continues where it left off
        y_val_predict = sgd_reg.predict(X_val_poly_scaled)
        val_error = mean_squared_error(y_val, y_val_predict)
        if val_error < minimum_val_error:
            minimum_val_error = val_error
            best_epoch = epoch
            best_model = clone(sgd_reg)
    

    Ensemble System
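    The classifiers in this subsection need a small two-class training set; in the book it is the moons dataset, so a minimal setup sketch might be:

    from sklearn.datasets import make_moons
    from sklearn.model_selection import train_test_split
    
    X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)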

    # Voting
    from sklearn.ensemble import VotingClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC
    from sklearn.metrics import accuracy_score
    
    log_clf = LogisticRegression(random_state=42)
    rnd_clf = RandomForestClassifier(random_state=42)
    svm_clf = SVC(random_state=42)
    
    voting_clf = VotingClassifier(
        estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
        voting='hard')  # 'soft' is possible, but then every estimator must provide predict_proba
    voting_clf.fit(X_train, y_train)
    
    
    for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        print(clf.__class__.__name__, accuracy_score(y_test, y_pred))
    
    # Bagging and pasting
    # BaggingClassifier automatically uses soft voting if the base estimator can estimate class probabilities
    from sklearn.ensemble import BaggingClassifier
    from sklearn.tree import DecisionTreeClassifier
    
    bag_clf = BaggingClassifier(
        DecisionTreeClassifier(random_state=42), n_estimators=500,
        max_samples=100, bootstrap=True, n_jobs=-1, oob_score=True, random_state=42)
    # bootstrap=True means bagging, False means pasting; max_samples is the number of samples per
    # estimator (or a fraction between 0.0 and 1.0)
    # n_jobs is the number of CPU cores to use (-1 for all); oob_score=True gives a free validation
    # estimate, available afterwards as bag_clf.oob_score_
    # max_features and bootstrap_features can be set analogously for sampling the features
    bag_clf.fit(X_train, y_train)
    y_pred = bag_clf.predict(X_test)
    
    # AdaBoost
    from sklearn.ensemble import AdaBoostClassifier
    
    ada_clf = AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=1), n_estimators=200,
        algorithm="SAMME.R", learning_rate=0.5, random_state=42)
    ada_clf.fit(X_train, y_train)
    # algorithm="SAMME.R"是当base estimator产生概率时才能用
    
    # Gradient Boosting
    from sklearn.ensemble import GradientBoostingRegressor
    
    gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0, random_state=42)
    gbrt.fit(X, y)
    # Use staged_predict to find the best number of trees
    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import mean_squared_error
    
    X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=49)
    
    gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120, random_state=42)
    gbrt.fit(X_train, y_train)
    
    errors = [mean_squared_error(y_val, y_pred)
              for y_pred in gbrt.staged_predict(X_val)]
    bst_n_estimators = np.argmin(errors)
    
    gbrt_best = GradientBoostingRegressor(max_depth=2,n_estimators=bst_n_estimators, random_state=42)
    gbrt_best.fit(X_train, y_train)
    # Early stopping with warm_start (stop when the validation error has not improved for 5 iterations)
    gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True, random_state=42)
    
    min_val_error = float("inf")
    error_going_up = 0
    for n_estimators in range(1, 120):
        gbrt.n_estimators = n_estimators
        gbrt.fit(X_train, y_train)
        y_pred = gbrt.predict(X_val)
        val_error = mean_squared_error(y_val, y_pred)
        if val_error < min_val_error:
            min_val_error = val_error
            error_going_up = 0
        else:
            error_going_up += 1
            if error_going_up == 5:
                break  # early stopping
    

    Dimensionality Reduction

    # PCA
    from sklearn.decomposition import PCA
    pca = PCA(n_components = 2)  # n_components can be a float in (0, 1) to keep that fraction of variance; svd_solver="randomized" is also available
    X2D = pca.fit_transform(X)  # pca.inverse_transform reverses the projection
    pca.components_.T[:,0]  # the first principal component
    pca.explained_variance_ratio_  # fraction of the variance explained by each PC
    
    from sklearn.decomposition import IncrementalPCA
    n_batches = 100
    inc_pca = IncrementalPCA(n_components=154)
    for X_batch in np.array_split(X_train, n_batches):
        inc_pca.partial_fit(X_batch)
    
    from sklearn.decomposition import KernelPCA
    rbf_pca = KernelPCA(n_components = 2, kernel="rbf", gamma=0.04)
    X_reduced = rbf_pca.fit_transform(X)
    # Use a pipeline plus grid search to find the best kernel and gamma
    from sklearn.model_selection import GridSearchCV
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline
    
    clf = Pipeline([
            ("kpca", KernelPCA(n_components=2)),
            ("log_reg", LogisticRegression())
        ])
    
    param_grid = [{
            "kpca__gamma": np.linspace(0.03, 0.05, 10),
            "kpca__kernel": ["rbf", "sigmoid"]
        }]
    
    grid_search = GridSearchCV(clf, param_grid, cv=3)
    grid_search.fit(X, y)
    

    Reference:
    Hands-On Machine Learning with Scikit-Learn and TensorFlow
