  • Stacking fusion



    import pandas as pd
    import numpy as np
    from sklearn.linear_model import Lasso
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import RobustScaler
    from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
    from sklearn.model_selection import KFold, cross_val_score, train_test_split
    import xgboost as xgb
    import lightgbm as lgb

    ## Train and predict with stacking; the final submission file is stacking_submit.csv
    all_train = pd.read_csv('all_train.csv', sep=' ')
    test_set = pd.read_csv('test_set.csv', sep=' ')
    result_name = test_set[['USRID']]
    train = all_train.drop(['USRID', 'FLAG'], axis=1)
    y_train = all_train['FLAG'].values
    test = test_set.drop(['USRID'], axis=1)

    # Offline cross-validation helper
    n_folds = 5

    def auc_cv(model):
        # Pass the KFold object itself (not get_n_splits) so that shuffle and random_state take effect
        kf = KFold(n_folds, shuffle=True, random_state=42)
        auc = cross_val_score(model, train.values, y_train, scoring="roc_auc", cv=kf)
        return auc

    lasso = make_pipeline(RobustScaler(),
                          Lasso(max_iter=1000, alpha=0.0005, fit_intercept=True, random_state=1))
    GBoost = GradientBoostingRegressor(n_estimators=500, learning_rate=0.01,
                                       max_depth=18, max_features='sqrt',
                                       min_samples_leaf=16, min_samples_split=10,
                                       random_state=5)
    model_xgb = xgb.XGBRegressor(colsample_bytree=0.9, objective='binary:logistic',
                                 learning_rate=0.02, max_depth=6, eval_metric='auc',
                                 min_child_weight=10, n_estimators=842, subsample=0.7,
                                 silent=1, random_state=0, nthread=-1)
    model_lgb = lgb.LGBMRegressor(objective='binary', metric='auc', num_leaves=35,
                                  learning_rate=0.01, n_estimators=842, max_bin=55,
                                  bagging_fraction=0.8, bagging_freq=3,
                                  feature_fraction=0.9, feature_fraction_seed=9,
                                  bagging_seed=9, min_data_in_leaf=370,
                                  min_sum_hessian_in_leaf=11)

    # Offline scores of the single models
    score_lasso = auc_cv(lasso)
    print("Lasso score: {:.4f} ({:.4f})".format(score_lasso.mean(), score_lasso.std()))
    score_GBoost = auc_cv(GBoost)
    print("Gradient Boosting score: {:.4f} ({:.4f})".format(score_GBoost.mean(), score_GBoost.std()))
    score_lgb = auc_cv(model_lgb)
    print("LightGBM score: {:.4f} ({:.4f})".format(score_lgb.mean(), score_lgb.std()))
    score_xgb = auc_cv(model_xgb)
    print("XGBoost score: {:.4f} ({:.4f})".format(score_xgb.mean(), score_xgb.std()))

    ## Define the stacking class
    class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
        def __init__(self, base_models, meta_model, n_folds=5):
            self.base_models = base_models
            self.meta_model = meta_model
            self.n_folds = n_folds

        def fit(self, X, y):
            self.base_models_ = [list() for x in self.base_models]
            self.meta_model_ = clone(self.meta_model)
            kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)
            # Matrix of out-of-fold predictions: one column per base model
            out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
            for i, model in enumerate(self.base_models):
                for train_index, holdout_index in kfold.split(X, y):
                    instance = clone(model)
                    # With 5-fold CV, each base model ends up with 5 fitted instances
                    self.base_models_[i].append(instance)
                    instance.fit(X[train_index], y[train_index])
                    y_pred = instance.predict(X[holdout_index])
                    out_of_fold_predictions[holdout_index, i] = y_pred
            # Train the meta model on the out-of-fold predictions
            self.meta_model_.fit(out_of_fold_predictions, y)
            return self

        def predict(self, X):
            meta_features = np.column_stack([
                np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
                for base_models in self.base_models_])
            return self.meta_model_.predict(meta_features)

    stacked_averaged_models = StackingAveragedModels(base_models=(GBoost, model_xgb, model_lgb),
                                                     meta_model=lasso)

    # Offline score of the stacking model
    score = auc_cv(stacked_averaged_models)
    print("Stacking Averaged models score: {:.4f} ({:.4f})".format(score.mean(), score.std()))

    stacked_averaged_models.fit(train.values, y_train)
    stacked_pred = stacked_averaged_models.predict(test.values)
    result_name['RST'] = stacked_pred
    result_name.to_csv('stacking_submit.csv', index=None, sep=' ')

    Simple weighted fusion: the weights are decided mainly by each model's online (leaderboard) score. (Sometimes, though, equal weights actually help the online score more.) A minimal sketch follows.
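    For example, a sketch of a score-based weighted blend, reusing the models and variables defined in the stacking code above; the weights 0.5/0.3/0.2 and the file name weighted_submit.csv are hypothetical, chosen only for illustration:

    # Hypothetical weights, roughly proportional to each model's online score
    pred_lgb = model_lgb.fit(train.values, y_train).predict(test.values)
    pred_xgb = model_xgb.fit(train.values, y_train).predict(test.values)
    weighted_pred = 0.5 * pred_lgb + 0.3 * pred_xgb + 0.2 * stacked_pred
    result_name['RST'] = weighted_pred
    result_name.to_csv('weighted_submit.csv', index=None, sep=' ')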
    Weighted averaging combined with the inverse sigmoid function
        Main steps: first feed each model's predictions through the inverse sigmoid (logit) function, average the transformed values, and then apply the sigmoid function to that average. Compared with plain weighted averaging, this method is better suited to predictions that differ from each other only slightly.
    # Sigmoid function
    def f(x):
        res = 1 / (1 + np.e ** (-x))
        return res

    # Inverse sigmoid (logit) function
    def f_ver(x):
        res = np.log(x / (1 - x))
        return res
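    A minimal usage sketch of this blend; pred_a and pred_b are hypothetical arrays of predicted probabilities standing in for the individual models' outputs:

    # Hypothetical predicted probabilities from two different models
    pred_a = np.array([0.61, 0.12, 0.83])
    pred_b = np.array([0.58, 0.15, 0.86])
    # Clip away from 0 and 1 so that f_ver (the logit) stays finite
    pred_a = np.clip(pred_a, 1e-6, 1 - 1e-6)
    pred_b = np.clip(pred_b, 1e-6, 1 - 1e-6)
    # Average in logit space, then map back to a probability with the sigmoid
    blend = f((f_ver(pred_a) + f_ver(pred_b)) / 2)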

    Reference: https://www.sohu.com/a/196885191_116235

  • Original post: https://www.cnblogs.com/liuyicai/p/10669584.html