zoukankan      html  css  js  c++  java
  • python机器学习模型参数调优

    #机器学习模型选择与参数调优
    #三种集成学习算法-GBDT/XGBoost/lightGBM
    #1-1 GBDT算法:梯度决策树,加强型模型,构建多个决策树进行合并
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn import ensemble
    from sklearn import datasets
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import mean_squared_error
    # Disabled reference demos: everything between the triple quotes below is a
    # bare string literal that is never assigned or called, so none of it runs.
    # It sketches three boosting set-ups: sklearn's GradientBoostingRegressor on
    # the Boston housing data, then XGBoost and LightGBM on random binary labels.
    # NOTE(review): the body of the `for ... staged_predict` loop inside the
    # string is not indented; it has to be re-indented before this code can be
    # re-enabled.
    '''
    #导入数据集
    bost=datasets.load_boston()
    x,y=bost.data,bost.target
    x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=666)
    #构建模型
    params={"n_estimators":800,"max_depth":4,"min_samples_split":2,"learning_rate":0.01,"loss":"ls"}
    clf=ensemble.GradientBoostingRegressor(**params)
    clf.fit(x_train,y_train)
    y_pre=clf.predict(x_test)
    mse=mean_squared_error(y_test,y_pre)
    print("MSE:",mse)
    #绘制测试误差-每一次所输出的迭代误差
    test_score=np.zeros((params["n_estimators"],),dtype=np.float64)
    for i ,y_pre in enumerate(clf.staged_predict(x_test)):
    test_score[i]=clf.loss_(y_test,y_pre)
    plt.figure(figsize=(12,6))
    plt.subplot(1,2,1)
    plt.title("Deviance")
    plt.plot(np.arange(params["n_estimators"])+1,clf.train_score_,"b-",label="Training set deviance")
    plt.plot(np.arange(params["n_estimators"])+1,test_score,"r-",label="Test set deviance")
    plt.legend(loc="upper right")
    plt.xlabel("Boostng Iterations")
    plt.ylabel("Deviance")
    #绘制特征重要性图
    feature_importance=clf.feature_importances_
    feature_importance=100.0*(feature_importance/feature_importance.max())
    sort_index=np.argsort(feature_importance)
    pos=np.arange(sort_index.shape[0])+0.5
    plt.subplot(1,2,2)
    plt.barh(pos,feature_importance[sort_index],align="center")
    plt.yticks(pos,bost.feature_names[sort_index])
    plt.xlabel("Relative Importance")
    plt.title("Veriable Importance")
    plt.show()

    #1-2 XGboost扩展的GBDT
    import xgboost as xgb
    data=np.random.rand(100000,10)
    label=np.random.randint(2,size=100000)
    dtrain=xgb.DMatrix(data,label=label,missing=-999.0)

    data2=np.random.rand(5000,10)
    label2=np.random.randint(2,size=5000)
    dtest=xgb.DMatrix(data2,label=label2,missing=-999.0)

    params={"bst:max_depth":2,"bst:eta":1,"silent":1,"objective":"binary:logistic"}
    params["nthread"]=4
    params["eval_metric"]="auc"
    evallist=[(dtrain,"train"),(dtest,"eval")] #监控效果
    num_round=10 #训练迭代的次数
    bst=xgb.train(params,dtrain,num_round,evallist)
    #设置一种早停,输出最好的效果
    bst=xgb.train(params,dtrain,num_round,evallist,early_stopping_rounds=10)

    #1-3 lightGBM 轻量级梯度提升机
    import lightgbm as lgb
    data=np.random.rand(100000,10)
    label=np.random.randint(2,size=100000)
    train=lgb.Dataset(data,label=label)

    data2=np.random.rand(5000,10)
    label2=np.random.randint(2,size=5000)
    test=lgb.Dataset(data2,label=label2)
    params={"num_leaves":31,"num_trees":100,"objective":"binary","metrics":"binary_error"}
    num_round=10 #10论训练
    bst=lgb.train(params,train,num_round,valid_sets=[test])
    #交叉验证
    num_round=10
    params={"num_leaves":50,"num_trees":100,"objective":"binary"}
    print(lgb.cv(params,train,num_round,nfold=5))

    bst=lgb.train(params,train,20,valid_sets=test,early_stopping_rounds=10)
    '''
    # Real project: salary regression on the recruitment (lagou) feature table.
    # 2-1 Train and evaluate a GBDT model.
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt

    # NOTE(review): this path has no separators after the drive letter; they
    # look like they were lost when the snippet was copied. Confirm the real
    # location of lagou_featured.csv before running.
    df=pd.read_csv("D:Byrbt2018StudyPython机器学习全流程项目实战精讲配套课件第七讲 机器学习建模lagou_featured.csv",encoding="gbk")
    print(df.shape)
    pd.options.display.max_columns=999  # print up to 999 columns instead of eliding them
    print(df.head())

    # Histogram of the regression target to inspect its distribution.
    plt.hist(df["salary"])
    plt.show()

    # Split the table into the feature matrix and the target vector.
    x=df.drop(["salary"],axis=1).values
    # Keep the target one-dimensional: a (n, 1) column vector makes sklearn
    # estimators emit a column-vector warning on fit(), and the LightGBM
    # section further down already builds y as a flat array.
    y=df["salary"].values
    print(x.shape,y.shape)
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.metrics import mean_squared_error,max_error,mean_absolute_error

    # Hold out a test split with a fixed seed so runs are comparable.
    x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=666)
    # Flatten the targets: the regressor and the metrics expect 1-D arrays.
    # np.ravel is a no-op when y is already 1-D.
    y_train,y_test=np.ravel(y_train),np.ravel(y_test)
    print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)

    clf=GradientBoostingRegressor(n_estimators=100,max_depth=5)
    clf.fit(x_train,y_train)
    y_pre=clf.predict(x_test)

    # Only the squared error needs a square root to get back to salary units;
    # MAE and max error are already in those units, so they are reported as-is.
    print("RMSE:",np.sqrt(mean_squared_error(y_test,y_pre)))
    print("MAE:",mean_absolute_error(y_test,y_pre))
    print("Max error:",max_error(y_test,y_pre))
    print("R^2:",clf.score(x_test,y_test))

    # Overlay predictions and ground truth for a visual check.
    plt.plot(y_pre)
    plt.plot(y_test)
    plt.legend(["y_pre","y_test"])
    plt.show()
    # Log-transform the regression target to reduce the effect of its skew,
    # then refit the same model.
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.metrics import mean_squared_error,max_error,mean_absolute_error

    x_train,x_test,y_train,y_test=train_test_split(x,np.log(y),random_state=666)
    # Flatten the targets: the regressor and the metrics expect 1-D arrays.
    y_train,y_test=np.ravel(y_train),np.ravel(y_test)
    print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)

    clf=GradientBoostingRegressor(n_estimators=100,max_depth=5)
    clf.fit(x_train,y_train)
    y_pre=clf.predict(x_test)

    # Undo the log with exp() so the errors are on the original salary scale.
    salary_true,salary_pre=np.exp(y_test),np.exp(y_pre)
    # Only the squared error needs a square root; MAE and max error are
    # already in salary units.
    print("RMSE:",np.sqrt(mean_squared_error(salary_true,salary_pre)))
    print("MAE:",mean_absolute_error(salary_true,salary_pre))
    print("Max error:",max_error(salary_true,salary_pre))
    # score() is evaluated on the log-scale targets the model was trained on.
    print("R^2 (log scale):",clf.score(x_test,y_test))
    print()

    plt.plot(salary_pre)
    plt.plot(salary_true)
    plt.legend(["y_pre","y_test"])
    plt.show()

    #2-2 XGBoost模型训练和预测
    from sklearn.model_selection import KFold #五划分的交叉验证的方式模块
    import xgboost as xgb
    from sklearn.metrics import mean_squared_error
    import time

    # Shuffled 5-fold splitter; the fixed seed keeps the folds reproducible.
    kf=KFold(
        n_splits=5,
        shuffle=True,
        random_state=123,
    )
    def evalereor(pre,train):
        """Custom evaluation metric: MSE after mapping both sides through exp().

        Args:
            pre: Predicted values. They are exponentiated here, so they are
                presumably log-scale predictions -- confirm against the caller.
            train: The dataset being evaluated; must expose ``get_label()``
                (the accessor xgboost's DMatrix provides).

        Returns:
            A ``("mse", value)`` pair, the (name, score) shape used for a
            custom xgboost metric.
        """
        # DMatrix has no getlabel(); the accessor is get_label().
        labels=train.get_label()
        return "mse",mean_squared_error(np.exp(pre),np.exp(labels))

    # Train on log(salary). np.ravel keeps the label vector 1-D regardless of
    # how y was shaped earlier in the script.
    y=np.ravel(np.log(y))
    # NOTE(review): 330 is hard-coded, presumably the size of one validation
    # fold; the only line that would fill this array is commented out below.
    valid_pre=np.zeros((330,5))
    time_start=time.time()

    # The parameters do not change between folds, so build them once.
    # - "eval_metric" was misspelled as "eval_metgric", so the setting was never
    #   picked up.
    # - "reg:squarederror" is the current name of the deprecated "reg:linear".
    # - "verbosity": 0 replaces the deprecated "silent": True.
    xgb_params={
        "eta":0.01,
        "max_depth":6,
        "subsample":0.9,
        "colsample_bytree":0.9,
        "objective":"reg:squarederror",
        "seed":99,
        "eval_metric":"rmse",
        "verbosity":0,
    }

    for i,(train_ind,valid_ind) in enumerate(kf.split(x)):
        print("FOLD",i+1,"out of",5)
        x_train,y_train=x[train_ind],y[train_ind]
        x_valid,y_valid=x[valid_ind],y[valid_ind]
        d_train=xgb.DMatrix(x_train,y_train)
        d_valid=xgb.DMatrix(x_valid,y_valid)
        # Metrics are reported for both sets; early stopping watches the last one.
        watchlist=[(d_train,"train"),(d_valid,"valid")]
        model=xgb.train(
            xgb_params,
            d_train,
            2000,
            watchlist,
            verbose_eval=100,
            #feval=evalereor,
            early_stopping_rounds=1000
        )
        # valid_pre[:,i]=np.exp(model.predict(d_valid))
    # Total wall-clock time for all folds (placed after the loop; the original
    # indentation was lost, so confirm this was not meant to print per fold).
    print("cv training time{} second".format(time.time()-time_start))

    # Built-in 5-fold cross-validation via xgb.cv. The original heading called
    # this "grid search", but only a single parameter set is evaluated.
    import xgboost as xgb
    xg_train=xgb.DMatrix(x,y)
    # "eval_metric" was misspelled as "eval_metgric"; "reg:squarederror" and
    # "verbosity" replace the deprecated "reg:linear" and "silent".
    params={
        "eta":0.01,
        "max_depth":6,
        "subsample":0.9,
        "colsample_bytree":0.9,
        "objective":"reg:squarederror",
        "seed":99,
        "eval_metric":"rmse",
        "verbosity":0,
    }
    # Up to 1000 boosting rounds, logging every 100, stopping once the held-out
    # metric has not improved for 800 rounds.
    cv=xgb.cv(params,xg_train,1000,nfold=5,early_stopping_rounds=800,verbose_eval=100)
    print(cv)

    #2-3 lightGBM算法模型
    import time
    import lightgbm as lgb
    from sklearn.model_selection import KFold
    from sklearn.metrics import mean_squared_error
    from lightgbm import LGBMRegressor
    # Rebuild the feature matrix and the log-scaled target straight from the
    # dataframe for the LightGBM run.
    x=df.drop(columns=["salary"]).values
    y=np.log(df["salary"].values)
    def evalerror(pre,train):
        """Custom LightGBM evaluation metric: MSE after mapping through exp().

        Args:
            pre: Predicted values. They are exponentiated here, so they are
                presumably log-scale predictions -- confirm against the caller.
            train: The dataset being evaluated; must expose ``get_label()``
                (the accessor lightgbm's Dataset provides).

        Returns:
            ``("mse", value, False)``. LightGBM's ``feval`` contract is
            (eval_name, eval_result, is_higher_better); lower MSE is better,
            hence ``False``.
        """
        # Dataset has no getlabel(); the accessor is get_label().
        labels=train.get_label()
        return "mse",mean_squared_error(np.exp(pre),np.exp(labels)),False
    # LightGBM training parameters for the salary regression.
    params={
        "learning_rate":0.01,
        "boosting_type":"gbdt",
        "objective": "regression",
        "metric":"mse",
        "num_leaves":17,
        # "sub_feature" and "colsample_bytree" are aliases of feature_fraction.
        # All three were set to 0.7, which only produces alias-conflict
        # warnings, so the canonical key alone is kept.
        "feature_fraction":0.7,
        "min_data":100,    # alias of min_data_in_leaf
        "min_hessian":1,   # alias of min_sum_hessian_in_leaf
        "verbose":-1
    }
    print("begin cv 5-fold training...")
    scores=[]  # declared for per-fold scores; nothing appends to it yet
    time_start=time.time()
    # Shuffled 5-fold splitter with its own fixed seed.
    kf=KFold(n_splits=5,shuffle=True,random_state=27)
    for i,(train_ind,valid_ind) in enumerate(kf.split(x)):
        print("FOLD",i+1,"out of",5)
        x_train,y_train=x[train_ind],y[train_ind]
        x_valid,y_valid=x[valid_ind],y[valid_ind]
        d_train=lgb.Dataset(x_train,y_train)
        d_valid=lgb.Dataset(x_valid,y_valid)
        # NOTE(review): verbose_eval / early_stopping_rounds are keyword
        # arguments of older lightgbm.train(); newer releases expect the
        # equivalent callbacks instead -- confirm against the installed version.
        model=lgb.train(
            params,
            d_train,
            num_boost_round=2000,
            valid_sets=[d_valid],
            verbose_eval=200,
            # To use the custom metric, pass the function itself, not a call:
            #feval=evalerror,
            early_stopping_rounds=1000)
        # Booster.predict takes raw features rather than a Dataset:
        #valid_pre[:,i]=np.exp(model.predict(x_valid))
    # Total wall-clock time for all folds (placed after the loop; the original
    # indentation was lost, so confirm this was not meant to print per fold).
    print("cv training time{} second".format(time.time()-time_start))


  • 相关阅读:
    51 张图助你彻底掌握 HTTP
    Nginx从原理到实战
    vu3.0 + ts + swiper6 的问题
    使用 react-router-dom v5 查询query 参数的方法
    visual studio 2015配置SVN
    SVN使用教程总结
    C#与SAP进行数据交互
    shell csv/txt文件对比
    persto array_join(array_agg(),',')
    shell 拼接html table 发送邮件
  • 原文地址:https://www.cnblogs.com/Yanjy-OnlyOne/p/12591097.html
Copyright © 2011-2022 走看看