k折交叉验证
第一步,不重复抽样将原始数据随机分为 k 份。
第二步,每一次挑选其中 1 份作为测试集,剩余 k-1 份作为训练集用于模型训练。
第三步,重复第二步 k 次,这样每个子集都恰好有一次机会作为测试集,在其余 k-1 次中则作为训练集的一部分。
在每个训练集上训练后得到一个模型,
用这个模型在相应的测试集上测试,计算并保存模型的评估指标。
第四步,计算 k 组测试结果的平均值作为模型精度的估计,并作为当前 k 折交叉验证下模型的性能指标。
在这里我们采用 5 折交叉验证(即下文 GridSearchCV 中的 cv=5)。
网格搜索
GridSearchCV,它存在的意义就是自动调参:只要把每个参数的候选值输进去,它就会穷举所有参数组合,并通过交叉验证给出最优的结果和对应的参数。但是这个方法只适合于小数据集,因为需要训练的模型数量等于参数组合数乘以折数,一旦数据的量级上去了,计算开销会急剧增大,很难在可接受的时间内得出结果。
# Grid-search hyper-parameter tuning for seven classifiers.
#
# Each model is tuned with GridSearchCV (5-fold cross-validation, ROC-AUC
# scoring) on a standardized 70/30 train/test split of the input CSV.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm


def _tune_and_report(estimator, param_grid, X_train, y_train, X_test, y_test):
    """Run a 5-fold ROC-AUC grid search and print its results.

    The search is always fitted here before any result attribute is read;
    reading ``best_params_`` from an unfitted GridSearchCV raises.

    Args:
        estimator: Unfitted scikit-learn compatible classifier.
        param_grid: Mapping of parameter name to the candidate values to try.
        X_train: Training features.
        y_train: Training labels.
        X_test: Held-out features.
        y_test: Held-out labels.

    Returns:
        The fitted GridSearchCV instance.

    Prints, in order: the best parameter combination, the best mean
    cross-validated ROC-AUC, and the ROC-AUC on the held-out test split.
    """
    grid = GridSearchCV(estimator=estimator, param_grid=param_grid,
                        scoring='roc_auc', cv=5)
    grid.fit(X_train, y_train)
    print(grid.best_params_)
    print(grid.best_score_)
    print(grid.score(X_test, y_test))
    return grid


# Raw string: '\d' is not a valid escape sequence, so keep the backslash literal.
data_all = pd.read_csv(r'D:\data_all.csv', encoding='gbk')

# The 'status' column is the label; every other column is used as a feature.
X = data_all.drop(['status'], axis=1)
y = data_all['status']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=2018)

# Standardize features; the scaler is fitted on the training split only so
# that no statistics from the test split leak into training.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# LR
# solver='liblinear' supports both 'l1' and 'l2'; the default solver in
# current scikit-learn ('lbfgs') rejects 'l1', which would make half of this
# grid fail to fit.
lr = LogisticRegression(random_state=2018, solver='liblinear')
param = {'C': [1e-3, 0.01, 0.1, 1, 10, 100, 1e3],
         'penalty': ['l1', 'l2']}
grid = _tune_and_report(lr, param, X_train, y_train, X_test, y_test)

# DecisionTree
dt = DecisionTreeClassifier(random_state=2018)
param = {'criterion': ['gini', 'entropy'],
         'splitter': ['best', 'random'],
         'max_depth': [2, 4, 6, 8],
         'max_features': ['sqrt', 'log2', None]}
grid = _tune_and_report(dt, param, X_train, y_train, X_test, y_test)

# SVM
svc = svm.SVC(random_state=2018)
param = {'C': [1e-2, 1e-1, 1, 10],
         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}
grid = _tune_and_report(svc, param, X_train, y_train, X_test, y_test)

# RandomForest
# NOTE(review): no random_state is set for this and the following models, so
# their results can vary between runs.
rft = RandomForestClassifier()
param = {'n_estimators': [10, 20, 50, 100],
         'criterion': ['gini', 'entropy'],
         'max_depth': [2, 4, 6, 8, 10, None],
         'max_features': ['sqrt', 'log2', None]}
grid = _tune_and_report(rft, param, X_train, y_train, X_test, y_test)

# GBDT
gb = GradientBoostingClassifier()
param = {'max_features': ['sqrt', 'log2', None],
         'learning_rate': [0.01, 0.1, 0.5, 1],
         'n_estimators': range(20, 200, 20),
         'subsample': [0.2, 0.5, 0.7, 1.0]}
grid = _tune_and_report(gb, param, X_train, y_train, X_test, y_test)

# XGBoost
xgb_c = XGBClassifier()
param = {'n_estimators': range(20, 200, 20),
         'max_depth': [2, 6, 10],
         'reg_lambda': [0.2, 0.5, 1]}
grid = _tune_and_report(xgb_c, param, X_train, y_train, X_test, y_test)

# LightGBM
lgbm_c = LGBMClassifier()
param = {'learning_rate': [0.2, 0.5, 0.7],
         'max_depth': range(1, 10, 2),
         'n_estimators': range(20, 100, 10)}
grid = _tune_and_report(lgbm_c, param, X_train, y_train, X_test, y_test)