zoukankan      html  css  js  c++  java
  • 采用boosting思想开发一个解决二分类样本不平衡的多估计器模型

    # -*- coding: utf-8 -*-
    """
    Created on Wed Oct 31 20:59:39 2018
    脚本描述:采用boosting思想开发一个解决二分类样本不平衡的多估计器模型
    @author: WZD
    """
    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split
    import numpy as np
    from xgboost import XGBClassifier
    from sklearn.metrics import confusion_matrix
    import pandas as pd
    from seaborn import load_dataset
    from sklearn.externals import joblib
    
    ################### Prepare training and test data ###################
    # Load seaborn's "titanic" dataset and keep only the label column plus
    # the four feature columns used by every model below.
    df = load_dataset(name="titanic")
    df = df[["survived","pclass","age","sibsp","parch"]]
    feature_name = ["pclass","age","sibsp","parch"]
    label_name   = ["survived"]

    # Hold out 20% of the rows as the test set.
    train,test = train_test_split(df,test_size=0.2)
    # Later stages add prediction columns to `train`. Taking explicit copies
    # makes the split frames independent of `df`, so those column assignments
    # do not trigger pandas' SettingWithCopyWarning.
    train = train.copy()
    test = test.copy()
    
    ################## Training ##################
    # Level 1: fit on the complete training split.
    # `label_name` is a list, so the target passed to fit() is a one-column
    # DataFrame.
    train_features_1 = train[feature_name]
    model_1 = XGBClassifier(n_estimators=10, max_depth=5)
    model_1.fit(train_features_1, train[label_name])

    # Score the same training rows with model_1 and keep the predictions as
    # an extra column on `train`.
    y_model_1_pred = model_1.predict(train_features_1)
    train["y_model_1_pred"] = y_model_1_pred

    # Persist the level-1 model; joblib.dump's return value is kept in
    # model_1_path.
    model_1_path = joblib.dump(model_1, filename="./model_1.pkl")
    
    # Partition the training rows by whether model_1 predicted them correctly.
    model_1_correct = train["survived"] == train["y_model_1_pred"]
    train_1_right = train[model_1_correct]
    train_1_error = train[~model_1_correct]

    # Compare the sizes of the two groups: the smaller group is taken in
    # full and the larger one is down-sampled to the same number of rows.
    num_min = min(train_1_error.shape[0], train_1_right.shape[0])
    # Misclassified rows are sampled first, then correctly classified rows.
    balanced_parts = [
        part.sample(n=num_min) for part in (train_1_error, train_1_right)
    ]
    train_2 = pd.concat(balanced_parts)
    # Level 2: one boosting round with depth-1 trees, fitted on train_2.
    train_2_features = train_2[feature_name]
    model_2 = XGBClassifier(n_estimators=1, max_depth=1)
    model_2.fit(train_2_features, train_2[label_name])

    # Score train_2 with model_2 and keep the predictions as an extra column
    # on train_2.
    y_model_2_pred = model_2.predict(train_2_features)
    train_2["y_model_2_pred"] = y_model_2_pred

    # Persist the level-2 model.
    model_2_path = joblib.dump(model_2, filename="./model_2.pkl")
    
    
    
    # Build the level-3 training set from train_2: keep the rows on which
    # model_1 and model_2 disagree.
    # Author's note: it is unclear whether this selection or the rows that
    # model_2 misclassified should be used, i.e.
    #   train_2[train_2["survived"]!=train_2["y_model_2_pred"]]
    models_disagree = train_2["y_model_1_pred"] != train_2["y_model_2_pred"]
    train_3 = train_2.loc[models_disagree]
    # Level 3: one boosting round with depth-1 trees, fitted on train_3.
    model_3 = XGBClassifier(max_depth=1,n_estimators=1)
    model_3.fit(train_3[feature_name],train_3[label_name])
    # Score train_3 with model_3.
    y_model_3_pred = model_3.predict(train_3[feature_name])
    # Persist the level-3 model.
    model_3_path = joblib.dump(model_3,filename="./model_3.pkl")

    ############## Evaluate the ensemble on the test set ##############
    # One column of test-set predictions per model.
    y_result = pd.DataFrame()
    y_result["model_1"] = model_1.predict(test[feature_name])
    y_result["model_2"] = model_2.predict(test[feature_name])
    y_result["model_3"] = model_3.predict(test[feature_name])


    def vote(x,y,z):
        """Return the majority label (0 or 1) among three predictions.

        A value equal to 0 counts as a vote for label 0; any other value
        counts as a vote for label 1. Label 0 is returned when it has at
        least as many votes as label 1.
        """
        zero_votes = sum(1 for pred in (x, y, z) if pred == 0)
        one_votes = 3 - zero_votes
        return 0 if zero_votes >= one_votes else 1


    # Combine the three per-model predictions row by row.
    y_result["result"] = y_result.apply(
        lambda row: vote(row["model_1"],row["model_2"],row["model_3"]),
        axis=1,
    )
    # Confusion matrix of the true test labels against the voted result;
    # printed so the outcome is visible when the file is run as a script.
    test_confusion = confusion_matrix(test[label_name],y_result["result"])
    print(test_confusion)
  • 相关阅读:
    零售定价(最终价格计算)(三)
    SAP数据表(一)商品表
    BizTalk 2006 R2 如何实现EDI报文的接收处理
    Simulate a Windows Service using ASP.NET to run scheduled jobs
    看曾士强评胡雪岩
    Smart Client Software Factory 初试
    Asp.net Dynamic Data之四定义字段的显示/编辑模板和自定义验证逻辑
    To set a 64bit mode IIS installation to 32bit mode
    集中日志查询平台方案(Draft)
    .net开发框架比较
  • 原文地址:https://www.cnblogs.com/wzdLY/p/9889639.html
Copyright © 2011-2022 走看看