  • XGBoost Tutorial (Advanced), Part 3

    一、Importing all the libraries

    import pandas as pd
    import numpy as np
    from matplotlib import pyplot as plt

    from sklearn.model_selection import cross_val_score
    from sklearn import metrics
    from sklearn.metrics import accuracy_score
    二、Reading the file

    We again use the mushroom dataset, taking the 22 categorical features directly from the Kaggle competition: https://www.kaggle.com/uciml/mushroom-classification

    Dataset download: http://download.csdn.net/download/u011630575/10266626

    # path to where the data lies
    dpath = './data/'
    data = pd.read_csv(dpath+"mushrooms.csv")
    data.head(6)
    三、Let us check if there are any null values
    data.isnull().sum() # check for missing values in each column
    四、Check that we have exactly two classes: each mushroom is either poisonous or edible
    data['class'].unique() # the label takes two values: p (poisonous) and e (edible)
    print(data.dtypes)
    五、Check the shape: 8124 instances and 23 columns (the 1st is the label, the other 22 are features)
    data.shape # (8124, 23): label column plus 22 features
    六、The dataset has string values. We need to convert all the unique values to integers, so we perform label encoding on the data.

    from sklearn.preprocessing import LabelEncoder
    labelencoder = LabelEncoder() # encodes each column's values as integers in range(number of distinct values)
    for col in data.columns:
        data[col] = labelencoder.fit_transform(data[col])

    data.head()
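
    For intuition, a minimal sketch (with toy values, not rows from the dataset) of what LabelEncoder does to a single column:

    from sklearn.preprocessing import LabelEncoder
    print(LabelEncoder().fit_transform(['p', 'e', 'e', 'p'])) # -> [1 0 0 1]; classes are sorted, so 'e'->0, 'p'->1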
    Separating features and label

    X = data.iloc[:, 1:23] # columns 1-22: the 22 features
    y = data.iloc[:, 0] # column 0: the class label
    X.head()
    y.head()
    Splitting the data into training and testing datasets
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=4)
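
    As an aside (our suggestion, not in the original), passing stratify=y would keep the poisonous/edible ratio the same in both splits:

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4, stratify=y)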
    七、Default Logistic Regression

    from sklearn.linear_model import LogisticRegression
    model_LR= LogisticRegression()
    model_LR.fit(X_train,y_train)
    y_prob = model_LR.predict_proba(X_test)[:,1] # This will give you positive class prediction probabilities
    y_pred = np.where(y_prob > 0.5, 1, 0) # This will threshold the probabilities to give class predictions.
    model_LR.score(X_test, y_test) # accuracy on the test set; score expects the true labels, not the predictions
    Note: np.where(condition, x, y) is a ternary operation: where the condition holds, the result is x; otherwise it is y.
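
    A quick self-contained sketch of that thresholding step:

    import numpy as np
    probs = np.array([0.2, 0.7, 0.5])
    print(np.where(probs > 0.5, 1, 0)) # -> [0 1 0]; 0.5 is not strictly greater than 0.5, so it maps to 0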

    ROC AUC on the test set:

    auc_roc=metrics.roc_auc_score(y_test,y_pred)
    print(auc_roc)
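
    Note that hard 0/1 predictions discard ranking information; ROC AUC is normally computed from the predicted probabilities directly, as in this sketch:

    print(metrics.roc_auc_score(y_test, y_prob)) # AUC from probabilities rather than hard labels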
    八、Logistic Regression (tuned model)

    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_val_score
    from sklearn import metrics

    LR_model= LogisticRegression()

    tuned_parameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                        'penalty': ['l1', 'l2']}
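
    One caveat: in scikit-learn 0.22+ the default solver (lbfgs) does not support the 'l1' penalty, so this grid would raise an error there. A sketch of one fix, assuming a newer scikit-learn, is to pin a solver that handles both penalties:

    LR_model = LogisticRegression(solver='liblinear') # liblinear supports both 'l1' and 'l2'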
    九、Grid search with cross-validation (CV)

    from sklearn.model_selection import GridSearchCV

    LR= GridSearchCV(LR_model, tuned_parameters,cv=10)
    LR.fit(X_train,y_train)
    print(LR.best_params_)
    y_prob = LR.predict_proba(X_test)[:,1] # This will give you positive class prediction probabilities
    y_pred = np.where(y_prob > 0.5, 1, 0) # This will threshold the probabilities to give class predictions.
    LR.score(X_test, y_test) # accuracy of the best estimator on the test set
    auc_roc=metrics.roc_auc_score(y_test,y_pred)
    print(auc_roc)
    十、Default Decision Tree model

    from sklearn.tree import DecisionTreeClassifier

    model_tree = DecisionTreeClassifier()
    model_tree.fit(X_train, y_train)
    y_prob = model_tree.predict_proba(X_test)[:,1] # This will give you positive class prediction probabilities
    y_pred = np.where(y_prob > 0.5, 1, 0) # This will threshold the probabilities to give class predictions.
    model_tree.score(X_test, y_test) # accuracy on the test set
    auc_roc=metrics.roc_auc_score(y_test,y_pred)
    auc_roc
    十一、Let us tune the hyperparameters of the Decision tree model

    from sklearn.tree import DecisionTreeClassifier

    model_DD = DecisionTreeClassifier()

    tuned_parameters = {'max_features': ["auto", "sqrt", "log2"],
                        'min_samples_leaf': range(1, 100, 1),
                        'max_depth': range(1, 50, 1)}
    #tuned_parameters= { 'max_features': ["auto","sqrt","log2"] }


    # If "auto", then max_features=sqrt(n_features).
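
    The grid above has 99 x 49 x 3 = 14,553 parameter combinations, i.e. roughly 145,530 fits with 10-fold CV. A lighter alternative (a sketch on our part, not in the original) samples a fixed number of combinations instead:

    from sklearn.model_selection import RandomizedSearchCV
    DD_rand = RandomizedSearchCV(model_DD, tuned_parameters, n_iter=50, cv=10, random_state=4)
    DD_rand.fit(X_train, y_train)
    print(DD_rand.best_params_)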
    from sklearn.model_selection import GridSearchCV
    DD = GridSearchCV(model_DD, tuned_parameters,cv=10)
    DD.fit(X_train, y_train)
    print(DD.cv_results_) # grid_scores_ was removed in scikit-learn 0.20; cv_results_ replaces it
    print(DD.best_score_)
    print(DD.best_params_)
    y_prob = DD.predict_proba(X_test)[:,1] # This will give you positive class prediction probabilities
    y_pred = np.where(y_prob > 0.5, 1, 0) # This will threshold the probabilities to give class predictions.
    DD.score(X_test, y_test) # accuracy of the best estimator on the test set
    report = metrics.classification_report(y_test, y_pred) # precision/recall/F1 per class, not an AUC value
    print(report)
    十二、Default Random Forest
    from sklearn.ensemble import RandomForestClassifier

    model_RR=RandomForestClassifier()
    model_RR.fit(X_train,y_train)
    y_prob = model_RR.predict_proba(X_test)[:,1] # This will give you positive class prediction probabilities
    y_pred = np.where(y_prob > 0.5, 1, 0) # This will threshold the probabilities to give class predictions.
    model_RR.score(X_test, y_test) # accuracy on the test set
    auc_roc=metrics.roc_auc_score(y_test,y_pred)
    auc_roc
    十三、Let us tune the hyperparameters of the Random Forest, for illustration
    1) max_features

    2) n_estimators (the number of trees)

    3) min_samples_leaf

    from sklearn.ensemble import RandomForestClassifier

    model_RR=RandomForestClassifier()

    tuned_parameters = {'min_samples_leaf': range(10, 100, 10),
                        'n_estimators': range(10, 100, 10),
                        'max_features': ['auto', 'sqrt', 'log2']}

    from sklearn.model_selection import GridSearchCV
    RR = GridSearchCV(model_RR, tuned_parameters,cv=10)

    RR.fit(X_train,y_train)

    print(RR.cv_results_) # grid_scores_ was removed in scikit-learn 0.20; cv_results_ replaces it

    print(RR.best_score_)

    print(RR.best_params_)

    y_prob = RR.predict_proba(X_test)[:,1] # This will give you positive class prediction probabilities
    y_pred = np.where(y_prob > 0.5, 1, 0) # This will threshold the probabilities to give class predictions.
    RR.score(X_test, y_test) # accuracy of the best estimator on the test set

    auc_roc=metrics.roc_auc_score(y_test,y_pred)
    auc_roc
    十四、Default XGBoost

    from xgboost import XGBClassifier
    model_XGB=XGBClassifier()
    model_XGB.fit(X_train,y_train)
    y_prob = model_XGB.predict_proba(X_test)[:,1] # This will give you positive class prediction probabilities
    y_pred = np.where(y_prob > 0.5, 1, 0) # This will threshold the probabilities to give class predictions.
    model_XGB.score(X_test, y_test) # accuracy on the test set
    auc_roc=metrics.roc_auc_score(y_test,y_pred)
    auc_roc
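
    The defaults already do very well on this easily separable dataset; for reference, a sketch of commonly tuned XGBClassifier parameters (the values here are illustrative assumptions, not tuned for this data):

    model_XGB = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)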
    十五、Feature importance
    XGBoost computes feature importances automatically during training; they are stored in the feature_importances_ attribute.

    print(model_XGB.feature_importances_)
    from matplotlib import pyplot
    pyplot.bar(range(len(model_XGB.feature_importances_)), model_XGB.feature_importances_)
    pyplot.show()
    # plot feature importance using built-in function
    from xgboost import plot_importance
    plot_importance(model_XGB)
    pyplot.show()
    Feature selection can be performed based on feature importance:
    from numpy import sort
    from sklearn.feature_selection import SelectFromModel

    # Fit model using each importance as a threshold
    thresholds = sort(model_XGB.feature_importances_)
    for thresh in thresholds:
        # select the features whose importance is >= the current threshold
        selection = SelectFromModel(model_XGB, threshold=thresh, prefit=True)
        select_X_train = selection.transform(X_train)
        # train a fresh model on the reduced feature set
        selection_model = XGBClassifier()
        selection_model.fit(select_X_train, y_train)
        # evaluate it on the correspondingly reduced test set
        select_X_test = selection.transform(X_test)
        y_pred = selection_model.predict(select_X_test)
        predictions = [round(value) for value in y_pred]
        accuracy = accuracy_score(y_test, predictions)
        print("Thresh=%.3f, n=%d, Accuracy: %.2f%%" % (thresh, select_X_train.shape[1], accuracy*100.0))

  • Original article: https://www.cnblogs.com/tan2810/p/11154658.html