# -*- coding: utf-8 -*-
"""
Created on Wed Oct 31 20:59:39 2018
脚本描述:采用boosting思想开发一个解决二分类样本不平衡的多估计器模型
@author: WZD
"""
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
import pandas as pd
from seaborn import load_dataset
from sklearn.externals import joblib
################### Prepare training and test data ###########################
# Titanic data set shipped with seaborn; keep only the label column and the
# four feature columns used by the models.
df = load_dataset(name="titanic")
df = df[["survived", "pclass", "age", "sibsp", "parch"]]
feature_name = ["pclass", "age", "sibsp", "parch"]
label_name = ["survived"]
# NOTE: the split (and the sampling below) is unseeded, so every run differs.
train, test = train_test_split(df, test_size=0.2)
# Prediction columns are appended to `train` below. Work on an explicit copy so
# those writes can never target a slice of `df` (this is what triggers pandas'
# SettingWithCopyWarning on older pandas / scikit-learn combinations).
train = train.copy()
################## Training ##################################################
# L1: fit the first model on the full training split.
model_1 = XGBClassifier(max_depth=5, n_estimators=10)
# `label_name` is a list, so train[label_name] is a one-column DataFrame;
# flatten it so the estimator receives a 1-D target vector.
model_1.fit(train[feature_name], train[label_name].values.ravel())
# Predict the training data with model_1.
y_model_1_pred = model_1.predict(train[feature_name])
# Persist the L1 model.
# NOTE(review): `joblib` comes from this file's `sklearn.externals` import,
# which was removed in scikit-learn 0.23; newer environments need the
# standalone `joblib` package instead.
model_1_path = joblib.dump(model_1, filename="./model_1.pkl")
# Attach the L1 predictions to the training data.
train["y_model_1_pred"] = y_model_1_pred
# Split the training data into rows model_1 got right and rows it got wrong.
train_1_right = train[train["survived"] == train["y_model_1_pred"]]
train_1_error = train[train["survived"] != train["y_model_1_pred"]]
# Compare the sizes of the two groups: the smaller one is used in full and the
# larger one is down-sampled to the same size, giving a 50/50 mix.
num_min = min(len(train_1_error), len(train_1_right))
if num_min == 0:
    # One of the groups is empty, so no balanced second-stage set can be built.
    raise ValueError(
        "model_1 classified the training data entirely right or entirely "
        "wrong; cannot build the balanced train_2 set"
    )
train_2 = pd.concat([train_1_error.sample(n=num_min), train_1_right.sample(n=num_min)])
# L2: fit a single decision stump on train_2.
model_2 = XGBClassifier(max_depth=1, n_estimators=1)
model_2.fit(train_2[feature_name], train_2[label_name].values.ravel())
# Predict train_2 with model_2.
y_model_2_pred = model_2.predict(train_2[feature_name])
# Persist the L2 model.
model_2_path = joblib.dump(model_2, filename="./model_2.pkl")
# Attach the L2 predictions to train_2.
train_2["y_model_2_pred"] = y_model_2_pred
# Third-stage training set: rows of train_2 on which model_1 and model_2
# disagree. NOTE(review): the original author was unsure whether to use this
# filter or the rows model_2 misclassifies (survived != y_model_2_pred).
# Training the third learner on the disagreements matches the classic
# three-learner boosting scheme, which is presumably the intent - confirm.
train_3 = train_2[train_2["y_model_1_pred"] != train_2["y_model_2_pred"]]
if train_3.empty:
    # Without any disagreement there is nothing to train the third model on.
    raise ValueError(
        "model_1 and model_2 agree on every row of train_2; "
        "train_3 is empty and model_3 cannot be trained"
    )
# L3: fit a single decision stump on train_3.
model_3 = XGBClassifier(max_depth=1, n_estimators=1)
model_3.fit(train_3[feature_name], train_3[label_name].values.ravel())
# Predict train_3 with model_3 (kept for inspection; the evaluation below
# does not read it).
y_model_3_pred = model_3.predict(train_3[feature_name])
# Persist the L3 model.
model_3_path = joblib.dump(model_3, filename="./model_3.pkl")
############## Evaluate the models on the test split #########################
# One column of test-set predictions per model.
y_result = pd.DataFrame()
y_result["model_1"] = model_1.predict(test[feature_name])
y_result["model_2"] = model_2.predict(test[feature_name])
y_result["model_3"] = model_3.predict(test[feature_name])
def vote(x, y, z):
    """Return the majority label among three binary predictions.

    Each argument counts as a vote for label 0 when it equals 0 and as a
    vote for label 1 otherwise (any non-zero value).

    Returns:
        0 if at least two of the three votes are for label 0, otherwise 1.
    """
    zero_votes = sum(1 for prediction in (x, y, z) if prediction == 0)
    # With exactly three voters, "zeros >= ones" means "at least two zeros".
    return 0 if zero_votes >= 2 else 1
# Combine the three per-model predictions of every test row by majority vote.
y_result["result"] = y_result.apply(
    lambda row: vote(row["model_1"], row["model_2"], row["model_3"]),
    axis=1,
)
# Confusion matrix of true test labels against the voted predictions. It used
# to be computed and thrown away; keep it and print it so that running the
# script actually reports the evaluation.
test_confusion = confusion_matrix(test[label_name], y_result["result"])
print(test_confusion)