zoukankan      html  css  js  c++  java
  • 集成学习-Adaboost 参数选择

    先看下 AdaBoost 和单棵决策树的效果对比

    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.model_selection import learning_curve
    
    def plot_learning_curve(estimator,title,X,y,ylim=None,cv=None,
                            n_jobs=None,train_sizes=np.linspace(.1,1.0,10)):
        """Plot training and cross-validation score curves for ``estimator``.

        A new figure is opened, ``learning_curve`` is run on ``(X, y)`` and the
        mean score at each training-set size is drawn as a line, with a shaded
        band of one standard deviation on either side.

        Args:
            estimator: Model forwarded to ``learning_curve``.
            title: Figure title.
            X: Feature data forwarded to ``learning_curve``.
            y: Target data forwarded to ``learning_curve``.
            ylim: Optional ``(low, high)`` pair unpacked into ``plt.ylim``.
            cv: Cross-validation setting forwarded to ``learning_curve``.
            n_jobs: Parallelism setting forwarded to ``learning_curve``.
            train_sizes: Training-set sizes forwarded to ``learning_curve``.

        Returns:
            The ``matplotlib.pyplot`` module, so the caller can call ``show()``.
        """
        plt.figure()
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel("Training examples")
        plt.ylabel("Score")
    
        sizes, fit_scores, cv_scores = learning_curve(
            estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    
        # One entry per curve: (color, legend label, mean, std), with the
        # statistics taken along axis 1 of each score matrix.
        curves = (
            ("r", "Training score",
             np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1)),
            ("g", "Cross-validation score",
             np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1)),
        )
    
        plt.grid()
    
        # Shaded +/- one standard deviation bands first, then the mean lines.
        for shade, _, center, spread in curves:
            plt.fill_between(sizes, center - spread, center + spread,
                             alpha=0.1, color=shade)
        for shade, caption, center, _ in curves:
            plt.plot(sizes, center, 'o-', color=shade, label=caption)
    
        plt.legend(loc="best")
        return plt
    
    from sklearn.datasets import  make_gaussian_quantiles
    from sklearn.model_selection import learning_curve
    from sklearn.model_selection import ShuffleSplit
    import numpy as np
    
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import AdaBoostClassifier
    # ##########################
    # Sample 5000 points with 50 features each from an isotropic Gaussian
    # (cov=2.0) and label them into two classes by quantile.
    X,y = make_gaussian_quantiles(cov=2.0,n_samples=5000,n_features=50,n_classes=2,random_state=1)
    # Cross-validation: 10 random shuffle-splits, each holding out 20% of the data.
    cv = ShuffleSplit(n_splits=10,test_size=0.2,random_state=1)
    # A depth-1 CART tree, used on its own and as the AdaBoost weak learner.
    estimatorCart = DecisionTreeClassifier(max_depth=1)
    # The weak learner is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` in newer scikit-learn releases, and the
    # positional form works with both.
    estimatorBoost = AdaBoostClassifier(estimatorCart,n_estimators=270)
    # Draw one learning curve per model: the single tree, then the ensemble.
    estimatorTuple = (estimatorCart,estimatorBoost)
    titleTuple =("decision learning curve","adaBoost learning curve")
    for estimator,title in zip(estimatorTuple,titleTuple):
        plot_learning_curve(estimator,title, X, y, cv=cv)
        plt.show()

    输出学习曲线

    分析:随着样本数的增加,单决策树的预测精度稳定在0.5左右,是个弱分类器,而adaboost预测精度在0.85左右,明显高于单决策树,是个强分类器。

    参数选择

    上面的模型使用的是默认参数,其实还有优化的空间。

    在集成学习中,参数调优一般是先选择框架的参数,再选择基学习器的参数

    框架参数调优

    以基学习器个数为例

    from sklearn.model_selection import GridSearchCV
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.datasets import make_gaussian_quantiles
    
    
    # Weak learner shared by every candidate ensemble: a depth-1 decision tree.
    estimatorCart = DecisionTreeClassifier(max_depth=1)
    X,y = make_gaussian_quantiles(cov=2.0,n_samples=5000,n_features=50,n_classes=2,random_state=1)
    
    ### Round 1
    # Coarse search over the number of weak learners: 150, 200, 250.
    param_test1 = {"n_estimators":range(150,300,50)}
    # 5-fold grid search scored by ROC AUC.
    gsearch1 = GridSearchCV(estimator=AdaBoostClassifier(estimatorCart),param_grid=param_test1,scoring="roc_auc",cv=5)
    gsearch1.fit(X,y)
    print(gsearch1.best_params_,gsearch1.best_score_)       # ({'n_estimators': 250}, 0.9360103999999999)
    
    
    ### Round 2
    # Finer search in steps of 10, from (best - 30) up to (best + 20), around
    # the round-1 winner. The centre is read from the round-1 result instead of
    # being hard-coded, so the window stays correct if round 1 picks another value.
    n_estimator1 = gsearch1.best_params_["n_estimators"]
    param_test2 = {"n_estimators":range(n_estimator1-30,n_estimator1+30,10)}
    gsearch2 = GridSearchCV(estimator=AdaBoostClassifier(estimatorCart),param_grid=param_test2,scoring="roc_auc",cv=5)
    gsearch2.fit(X,y)
    print(gsearch2.best_params_,gsearch2.best_score_)           # ({'n_estimators': 270}, 0.9387719999999999)

    基学习器参数调优

    以max_depth和min_samples_split为例

    import numpy as np
    from sklearn.model_selection import cross_validate
    # Ensemble size carried over from the n_estimators grid search.
    n_estimators2 = 270
    # Best mean cross-validation score seen so far.
    score = 0
    for depth in (1, 2):  # candidate max_depth values for the weak learner
        print(depth)
        for min_split in (18, 19, 20, 21):  # candidate min_samples_split values
            print(min_split)
            weak_learner = DecisionTreeClassifier(max_depth=depth,min_samples_split=min_split)
            bdt = AdaBoostClassifier(weak_learner,n_estimators=n_estimators2)
            # 5-fold cross-validation; only the per-fold test scores are used.
            fold_scores = cross_validate(bdt,X,y,return_train_score=False,cv=5)["test_score"]
            cv_mean = np.mean(fold_scores)
            print(cv_mean)
            if cv_mean < score:
                continue
            # New best (ties go to the later combination): remember its settings.
            score, tree_depth, samples_split = cv_mean, depth, min_split

    用最优参数构建模型

    from sklearn.model_selection import train_test_split
    # Weak-learner depth used for the final model.
    tree_depth = 1
    # train_test_split returns its pieces in the order
    # (X_train, X_test, y_train, y_test); unpacking them in any other order
    # swaps features and labels.
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=tree_depth),
                             n_estimators=n_estimators2)
    bdt.fit(X_train,y_train)
    # Mean accuracy on the held-out split.
    print(bdt.score(X_test,y_test))

    85.6%,略有提高

    学习率与基学习器个数的探索

    import matplotlib.pyplot as plt
    
    from sklearn.ensemble import  AdaBoostClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.datasets import  make_gaussian_quantiles
    from sklearn.model_selection import learning_curve
    from sklearn.model_selection import ShuffleSplit
    import numpy as np
    
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import  accuracy_score
    from sklearn.metrics import zero_one_loss
    
    
    # Number of boosting rounds requested for both models.
    n_estimators = 200
    # Sample 5000 points with 50 features each from an isotropic Gaussian
    # (cov=2.0) and label them into two classes by quantile.
    X,y = make_gaussian_quantiles(cov=2.0,n_samples=5000,n_features=50,n_classes=2,random_state=1)
    # Hold out 20% of the data as a test set.
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)
    # Two AdaBoost models on the same depth-1 tree that differ only in learning
    # rate. The weak learner is passed positionally: its keyword was renamed
    # from `base_estimator` to `estimator` in newer scikit-learn releases, and
    # the positional form works with both.
    estimatorCart = DecisionTreeClassifier(max_depth=1)
    dt_stump1 = AdaBoostClassifier(estimatorCart,n_estimators=n_estimators,learning_rate=0.8)
    dt_stump2 = AdaBoostClassifier(estimatorCart,n_estimators=n_estimators,learning_rate=0.1)
    dt_stump1.fit(X_train,y_train)
    # Test error of the full ensemble = 1 - accuracy.
    dt_stump_err1 = 1.0 - dt_stump1.score(X_test,y_test)
    #
    dt_stump2.fit(X_train,y_train)
    dt_stump_err2 = 1.0 - dt_stump2.score(X_test,y_test)
    
    ############
    # Test error after each boosting iteration, one slot per iteration.
    ada_discrete_err1 = np.zeros((n_estimators,))
    ada_discrete_err2 = np.zeros((n_estimators,))
    for model,errors in ((dt_stump1,ada_discrete_err1),(dt_stump2,ada_discrete_err2)):
        # staged_predict yields the ensemble's predictions after each iteration.
        for i,ypred in enumerate(model.staged_predict(X_test)):
            # zero_one_loss takes (y_true, y_pred), in that order.
            errors[i] = zero_one_loss(y_test,ypred)
    
    # Plot test error rate against the number of boosting iterations.
    fig = plt.figure()
    ax = fig.add_subplot(111)
    
    ax.plot(np.arange(n_estimators) + 1, ada_discrete_err1,label='learning rate = 0.8',color='red')
    ax.plot(np.arange(n_estimators) + 1, ada_discrete_err2,label='learning rate = 0.1',color='green')
    ax.set_ylim((0.0, 1))
    ax.set_xlabel('n_estimators')
    ax.set_ylabel('error rate')
    leg = ax.legend(loc='upper right', fancybox=True)
    # Make the legend box semi-transparent so the curves stay visible behind it.
    leg.get_frame().set_alpha(0.7)
    plt.show()

    输出

    针对当前数据,在基学习器个数相同的情况下,学习率较大(0.8)时的测试错误率低于学习率较小(0.1)时

    总结

    基学习器的复杂度尽量低,可以通过增加学习器个数提高泛化能力,

    但是当数据噪声较大或者基学习器复杂度较高时,增加基学习器个数很难提高泛化能力

    这只是大致方向,不绝对。

    参考资料:

    https://zhuanlan.zhihu.com/p/57319411

  • 相关阅读:
    FLV视频转换的利器 ffmpeg.exe
    ffmpeg参数设定解说
    SQL里加减日期
    SQL Server 获得影响行数
    CheckBoxList RadioButtonList 不生成table 表示
    SQL语句 从一个表读取数据,写入到另一个表的相同字段中
    ffmpeg和Mencoder使用实例小全
    执行存储过程出现:"不是有效的标识符。"
    SQL 格式化超长的字段
    js遍历选中的dom元素
  • 原文地址:https://www.cnblogs.com/yanshw/p/10721996.html
Copyright © 2011-2022 走看看