zoukankan      html  css  js  c++  java
  • 机器学习sklearn(92):算法实例(49)分类(30)XGBoost(六)XGBoost应用中的其他问题

    1 过拟合:剪枝参数与回归模型调参

     

     

     

    # Baseline: cross-validate XGBoost with its default hyper-parameters
    # (spelled out explicitly) and plot the train/test learning curves.
    dfull = xgb.DMatrix(X, y)
    param1 = {
        # NOTE(review): newer XGBoost releases use "verbosity" instead of
        # "silent" -- confirm against the installed version.
        'silent': True,
        # The booster key is "objective"; "obj" is not a booster parameter
        # (it is the name of the custom-objective argument of xgb.cv/xgb.train).
        'objective': 'reg:linear',
        "subsample": 1,
        "max_depth": 6,
        "eta": 0.3,
        "gamma": 0,
        "lambda": 1,
        "alpha": 0,
        "colsample_bytree": 1,
        "colsample_bylevel": 1,
        "colsample_bynode": 1,
    }
    num_round = 200
    time0 = time()
    # nfold is an argument of xgb.cv, not a booster parameter: left inside the
    # params dict it is not used and the cross-validation silently runs with
    # the default of 3 folds.
    cvresult1 = xgb.cv(param1, dfull, num_round, nfold=5)
    # Elapsed wall time, printed as minutes:seconds:microseconds.
    print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
    fig, ax = plt.subplots(1, figsize=(15, 10))
    #ax.set_ylim(top=5)  # uncomment to zoom in on the lower part of the curves
    ax.grid()
    # One x position per boosting round, derived from num_round so the plot
    # stays in sync when the number of rounds changes.
    rounds = range(1, num_round + 1)
    # Columns 0 and 2 are plotted as the train and test curves.
    ax.plot(rounds, cvresult1.iloc[:, 0], c="red", label="train,original")
    ax.plot(rounds, cvresult1.iloc[:, 2], c="orange", label="test,original")
    ax.legend(fontsize="xx-large")
    plt.show()

    # Tuning template: plot the default-parameter curves (param1) next to two
    # further parameter sets. Judging by the plot labels, param2 is meant to
    # hold the previous tuning step ("last") and param3 the one currently
    # being tried ("this"); both start out with no tuned values.
    param1 = {
        'silent': True,
        # The booster key is "objective"; "obj" is not a booster parameter.
        'objective': 'reg:linear',
        "subsample": 1,
        "max_depth": 6,
        "eta": 0.3,
        "gamma": 0,
        "lambda": 1,
        "alpha": 0,
        "colsample_bytree": 1,
        "colsample_bylevel": 1,
        "colsample_bynode": 1,
    }
    param2 = {'silent': True, 'objective': 'reg:linear'}
    param3 = {'silent': True, 'objective': 'reg:linear'}
    num_round = 200
    # nfold belongs to xgb.cv itself; inside the params dict it is not used
    # and the cross-validation falls back to the default of 3 folds.
    n_fold = 5
    # One x position per boosting round, derived from num_round.
    rounds = range(1, num_round + 1)

    time0 = time()
    cvresult1 = xgb.cv(param1, dfull, num_round, nfold=n_fold)
    # Elapsed wall time, printed as minutes:seconds:microseconds.
    print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))

    fig, ax = plt.subplots(1, figsize=(15, 8))
    ax.set_ylim(top=5)
    ax.grid()
    # Columns 0 and 2 are plotted as the train and test curves.
    ax.plot(rounds, cvresult1.iloc[:, 0], c="red", label="train,original")
    ax.plot(rounds, cvresult1.iloc[:, 2], c="orange", label="test,original")

    time0 = time()
    cvresult2 = xgb.cv(param2, dfull, num_round, nfold=n_fold)
    print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))

    time0 = time()
    cvresult3 = xgb.cv(param3, dfull, num_round, nfold=n_fold)
    print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))

    ax.plot(rounds, cvresult2.iloc[:, 0], c="green", label="train,last")
    ax.plot(rounds, cvresult2.iloc[:, 2], c="blue", label="test,last")
    ax.plot(rounds, cvresult3.iloc[:, 0], c="gray", label="train,this")
    ax.plot(rounds, cvresult3.iloc[:, 2], c="pink", label="test,this")
    ax.legend(fontsize="xx-large")
    plt.show()
    在这里,为大家提供我调出来的结果,供大家参考:
    # Default settings
    param1 = {
        'silent': True,
        # The booster key is "objective"; "obj" is not a booster parameter.
        'objective': 'reg:linear',
        "subsample": 1,
        "max_depth": 6,
        "eta": 0.3,
        "gamma": 0,
        "lambda": 1,
        "alpha": 0,
        "colsample_bytree": 1,
        "colsample_bylevel": 1,
        "colsample_bynode": 1,
    }
    num_round = 200
    # nfold belongs to xgb.cv itself; inside the params dict it is not used
    # and the cross-validation falls back to the default of 3 folds.
    n_fold = 5
    # One x position per boosting round, derived from num_round.
    rounds = range(1, num_round + 1)

    time0 = time()
    cvresult1 = xgb.cv(param1, dfull, num_round, nfold=n_fold)
    # Elapsed wall time, printed as minutes:seconds:microseconds.
    print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
    fig, ax = plt.subplots(1, figsize=(15, 8))
    ax.set_ylim(top=5)
    ax.grid()
    # Columns 0 and 2 are plotted as the train and test curves.
    ax.plot(rounds, cvresult1.iloc[:, 0], c="red", label="train,original")
    ax.plot(rounds, cvresult1.iloc[:, 2], c="orange", label="test,original")

    # Tuning result 1
    param2 = {
        'silent': True,
        'objective': 'reg:linear',
        "subsample": 1,
        "eta": 0.05,
        "gamma": 20,
        "lambda": 3.5,
        "alpha": 0.2,
        "max_depth": 4,
        "colsample_bytree": 0.4,
        "colsample_bylevel": 0.6,
        "colsample_bynode": 1,
    }

    # Tuning result 2 (listed for reference; only param2 is cross-validated
    # and plotted below)
    param3 = {
        'silent': True,
        'objective': 'reg:linear',
        "max_depth": 2,
        "eta": 0.05,
        "gamma": 0,
        "lambda": 1,
        "alpha": 0,
        "colsample_bytree": 1,
        "colsample_bylevel": 0.4,
        "colsample_bynode": 1,
    }

    time0 = time()
    cvresult2 = xgb.cv(param2, dfull, num_round, nfold=n_fold)
    print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
    ax.plot(rounds, cvresult2.iloc[:, 0], c="green", label="train,final")
    ax.plot(rounds, cvresult2.iloc[:, 2], c="blue", label="test,final")
    ax.legend(fontsize="xx-large")
    plt.show()

     

    2 XGBoost模型的保存和调用

    2.1 使用Pickle保存和调用模型

    import os
    import pickle

    # Train a booster with the tuned parameters.
    dtrain = xgb.DMatrix(Xtrain, Ytrain)
    param = {
        'silent': True,
        # The booster key is "objective"; "obj" is not a booster parameter.
        'objective': 'reg:linear',
        "subsample": 1,
        "eta": 0.05,
        "gamma": 20,
        "lambda": 3.5,
        "alpha": 0.2,
        "max_depth": 4,
        "colsample_bytree": 0.4,
        "colsample_bylevel": 0.6,
        "colsample_bynode": 1,
    }
    num_round = 180
    bst = xgb.train(param, dtrain, num_round)

    # Save the model.
    # Note: "w" and "r" are the usual modes for open(), but they only suit
    # text files. A pickled model is not text, so it is written with "wb"
    # (binary write) and read back with "rb" (binary read).
    # The with-block closes the file even if pickling fails.
    with open("xgboostonboston.dat", "wb") as model_file:
        pickle.dump(bst, model_file)

    # Where was the model saved? The relative file name above resolves against
    # the current working directory (sys.path is the module search path and
    # does not answer this question).
    os.getcwd()
    # Restart Jupyter Lab: from here on this is a fresh session, so everything
    # is imported and rebuilt from scratch.
    # NOTE(review): load_boston was removed in scikit-learn 1.2; this snippet
    # needs an older scikit-learn or a different data source.
    from sklearn.datasets import load_boston
    from sklearn.model_selection import train_test_split as TTS
    from sklearn.metrics import mean_squared_error as MSE, r2_score
    import pickle
    import xgboost as xgb

    data = load_boston()
    X = data.data
    y = data.target
    Xtrain, Xtest, Ytrain, Ytest = TTS(X, y, test_size=0.3, random_state=420)

    # Note: the saved model was built with the native xgboost API, so the data
    # passed to it must also be an xgboost data type (DMatrix).
    dtest = xgb.DMatrix(Xtest, Ytest)

    # Load the model. Only unpickle files you created yourself: unpickling
    # untrusted data can execute arbitrary code.
    with open("xgboostonboston.dat", "rb") as model_file:
        loaded_model = pickle.load(model_file)
    print("Loaded model from: xgboostonboston.dat")

    # Predict and evaluate.
    ypreds = loaded_model.predict(dtest)
    MSE(Ytest, ypreds)
    r2_score(Ytest, ypreds)
    2.2 使用Joblib保存和调用模型 

    import joblib
    from xgboost import XGBRegressor as XGBR

    # Both models below are written to, and read back from, this same file.
    joblib_path = "xgboost-boston.dat"

    # --- native xgboost booster -------------------------------------------
    bst = xgb.train(param, dtrain, num_round)
    # As before, you can check where the model file ended up.
    joblib.dump(bst, joblib_path)
    loaded_model = joblib.load(joblib_path)
    # A native booster predicts on a DMatrix.
    ypreds = loaded_model.predict(dtest)
    MSE(Ytest, ypreds)
    r2_score(Ytest, ypreds)

    # --- scikit-learn style estimator -------------------------------------
    sk_settings = dict(
        n_estimators=200,
        eta=0.05,
        gamma=20,
        reg_lambda=3.5,
        reg_alpha=0.2,
        max_depth=4,
        colsample_bytree=0.4,
        colsample_bylevel=0.6,
    )
    bst = XGBR(**sk_settings).fit(Xtrain, Ytrain)
    joblib.dump(bst, joblib_path)
    loaded_model = joblib.load(joblib_path)
    # With the sklearn wrapper, Xtest can be passed in directly.
    ypreds = loaded_model.predict(Xtest)
    MSE(Ytest, ypreds)

    3 分类案例:XGB中的样本不均衡问题 

    1. 导库,创建样本不均衡的数据集 
    import numpy as np
    import xgboost as xgb
    import matplotlib.pyplot as plt
    from xgboost import XGBClassifier as XGBC
    from sklearn.datasets import make_blobs
    from sklearn.model_selection import train_test_split as TTS
    # Parenthesised so the import can span several lines; a bare line break
    # before "as auc" is a syntax error.
    from sklearn.metrics import (
        confusion_matrix as cm,
        recall_score as recall,
        roc_auc_score as auc,
    )

    class_1 = 500  # class 1 has 500 samples
    class_2 = 50   # class 2 has only 50
    centers = [[0.0, 0.0], [2.0, 2.0]]  # centres of the two classes
    # Spread (cluster_std, i.e. standard deviation) of each class; the class
    # with more samples is usually the looser one.
    clusters_std = [1.5, 0.5]
    X, y = make_blobs(n_samples=[class_1, class_2],
                      centers=centers,
                      cluster_std=clusters_std,
                      random_state=0, shuffle=False)
    Xtrain, Xtest, Ytrain, Ytest = TTS(X, y, test_size=0.3, random_state=420)

    # Share of samples labelled 1 in the full data set.
    (y == 1).sum() / y.shape[0]
    2. 在数据集上建模:sklearn模式 
    # Modelling through the sklearn-style API.
    # First an unweighted classifier, evaluated on the held-out split.
    clf = XGBC()
    clf = clf.fit(Xtrain, Ytrain)
    ypred = clf.predict(Xtest)
    clf.score(Xtest, Ytest)
    cm(Ytest, ypred, labels=[1, 0])
    recall(Ytest, ypred)
    auc(Ytest, clf.predict_proba(Xtest)[:, 1])

    # Same model with scale_pos_weight set to the negative/positive ratio.
    clf_ = XGBC(scale_pos_weight=10)
    clf_ = clf_.fit(Xtrain, Ytrain)
    ypred_ = clf_.predict(Xtest)
    clf_.score(Xtest, Ytest)
    cm(Ytest, ypred_, labels=[1, 0])
    recall(Ytest, ypred_)
    auc(Ytest, clf_.predict_proba(Xtest)[:, 1])

    # How do accuracy, recall and AUC change as the positive-class weight
    # grows?
    for weight in (1, 5, 10, 20, 30):
        clf_ = XGBC(scale_pos_weight=weight).fit(Xtrain, Ytrain)
        ypred_ = clf_.predict(Xtest)
        report = (
            ("Accuracy", clf_.score(Xtest, Ytest)),
            ("Recall", recall(Ytest, ypred_)),
            ("AUC", auc(Ytest, clf_.predict_proba(Xtest)[:, 1])),
        )
        print(weight)
        for metric_name, metric_value in report:
            print("\t{}:{}".format(metric_name, metric_value))
    3. 在数据集上建模:xgboost模式 
    # Evaluation metrics. Parenthesised so the import can span several lines;
    # a trailing comma followed by a bare line break is a syntax error.
    from sklearn.metrics import (
        accuracy_score as accuracy,
        recall_score as recall,
        roc_auc_score as auc,
    )

    dtrain = xgb.DMatrix(Xtrain, Ytrain)
    dtest = xgb.DMatrix(Xtest, Ytest)

    # Look at the predict interface of the native xgboost API.
    param = {'silent': True, 'objective': 'binary:logistic', "eta": 0.1, "scale_pos_weight": 1}
    num_round = 100
    bst = xgb.train(param, dtrain, num_round)
    preds = bst.predict(dtest)
    # What does predict return? Continuous scores rather than class labels
    # (binary:logistic outputs probabilities), so they are thresholded below.
    preds

    # Apply our own threshold: 1 where the score exceeds 0.5, otherwise 0,
    # keeping the dtype of the scores.
    ypred = (preds > 0.5).astype(preds.dtype)

    # Weights to compare, with a readable label for each.
    scale_pos_weight = [1, 5, 10]
    names = ["negative vs positive: {}".format(w) for w in scale_pos_weight]

    for name, i in zip(names, scale_pos_weight):
        param = {'silent': True, 'objective': 'binary:logistic',
                 "eta": 0.1, "scale_pos_weight": i}
        clf = xgb.train(param, dtrain, num_round)
        preds = clf.predict(dtest)
        ypred = (preds > 0.5).astype(preds.dtype)
        print(name)
        print("\tAccuracy:{}".format(accuracy(Ytest, ypred)))
        print("\tRecall:{}".format(recall(Ytest, ypred)))
        # AUC is computed on the raw scores, not on the thresholded labels.
        print("\tAUC:{}".format(auc(Ytest, preds)))
    # We can of course also try different decision thresholds.
    for name, i in zip(names, scale_pos_weight):
        # The model and its scores depend only on the class weight, not on the
        # threshold, so train and predict once per weight instead of once per
        # (weight, threshold) pair.
        param = {'silent': True, 'objective': 'binary:logistic',
                 "eta": 0.1, "scale_pos_weight": i}
        clf = xgb.train(param, dtrain, num_round)
        preds = clf.predict(dtest)
        # AUC is computed on the raw scores, so it is identical for every
        # threshold of the same model.
        auc_value = auc(Ytest, preds)
        for thres in [0.3, 0.5, 0.7, 0.9]:
            # 1 where the score exceeds the threshold, otherwise 0.
            ypred = (preds > thres).astype(preds.dtype)
            print("{},thresholds:{}".format(name, thres))
            print("\tAccuracy:{}".format(accuracy(Ytest, ypred)))
            print("\tRecall:{}".format(recall(Ytest, ypred)))
            print("\tAUC:{}".format(auc_value))

     

    4 XGBoost类中的其他参数和功能

     

     

     

     

  • 相关阅读:
    Hasura GraphQL schema 生成是如何工作的
    一张方便的graphql schema 语言手册
    使用lua graphql 模块让openresty 支持graphql api
    PostgREST docker-compose 试用
    subzero 基于postgrest && openresty && rabbitmq 的快速rest/graphql 开发平台
    使用blessed 开发丰富的cli 应用
    一个方便查看数据库转换rest/graphql api 的开源软件的github 项目
    treeql 基于rest 标准的接口开发协议
    graphql-modules 企业级别的graphql server 工具
    hangfire docker-compose 运行
  • 原文地址:https://www.cnblogs.com/qiu-hua/p/14968463.html
Copyright © 2011-2022 走看看