  • XGBoost in Practice

    Dataset address
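
    A minimal sketch for fetching the file, assuming the standard UCI repository copy of iris.data (an assumption; the post's own download link is not reproduced here):

    # Sketch: download iris.data, assuming the UCI machine-learning repository copy
    from urllib.request import urlretrieve

    IRIS_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
    urlretrieve(IRIS_URL, 'iris.data')  # writes the comma-separated file used by the scripts below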

    Classification with the sklearn interface

    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import OneHotEncoder
    import joblib  # sklearn.externals.joblib has been removed; use the standalone joblib package
    import numpy as np
    from xgboost.sklearn import XGBClassifier
    
    
    # Read the file with ',' as the delimiter; the result is a 2-D array of strings
    iris = np.loadtxt('iris.data', dtype=str, delimiter=',', unpack=False, encoding='utf-8')
    
    # The first 4 columns are the features
    data = iris[:, :4].astype(float)
    # The last column is the label; reshape it into a 2-D array
    target = iris[:, -1][:, np.newaxis]
    
    # One-hot encode the string labels, then map each row back to its integer class index
    enc = OneHotEncoder()
    target = enc.fit_transform(target).astype(int).toarray()
    target = [list(oh).index(1) for oh in target]
    
    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=1)
    
    # Model training
    params = {
        'n_estimators': 100,
        'max_depth': 5,
        'min_child_weight': 1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'reg_alpha': 0,
        'reg_lambda': 1,
        'learning_rate': 0.1}
    
    xgb = XGBClassifier(random_state=1, **params)
    xgb.fit(X_train, y_train)
    
    # Save the model
    joblib.dump(xgb, 'xgb_model.pkl')
    # Load the model
    gbdt = joblib.load('xgb_model.pkl')
    
    # Predict on the test set
    y_pred = xgb.predict(X_test)
    
    # Evaluate the model
    print('The accuracy of prediction is:', accuracy_score(y_test, y_pred))
    
    # Feature importances
    print('Feature importances:', list(xgb.feature_importances_))
    

    Results

    The accuracy of prediction is: 0.9666666666666667
    Feature importances: [0.002148238569679191, 0.0046703830672789074, 0.33366676380518245, 0.6595146145578594]
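
    The importances above can also be visualized; a minimal sketch, assuming matplotlib is installed and reusing the fitted xgb estimator from the script above:

    import matplotlib.pyplot as plt
    from xgboost import plot_importance

    # Bar chart of per-feature scores from the fitted booster;
    # importance_type can be 'weight', 'gain', or 'cover'
    plot_importance(xgb, importance_type='gain')
    plt.tight_layout()
    plt.show()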

    Regression with the sklearn interface

    from sklearn.datasets import make_regression
    
    from sklearn.model_selection import train_test_split
    from xgboost.sklearn import XGBRegressor
    from sklearn.metrics import mean_absolute_error
    
    # Generate a synthetic regression dataset
    X, y = make_regression(n_samples=100, n_features=1, noise=20)
    
    # Split into training and test sets
    train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.25, random_state=1)
    
    # Build the XGBoost regressor and fit it on the training data.
    # Note: with booster='gblinear' the tree-specific parameters
    # (max_depth, min_child_weight, subsample, colsample_*) are ignored.
    my_model = XGBRegressor(
        max_depth=30,
        learning_rate=0.01,
        n_estimators=5,
        verbosity=0,                   # 0 = silent
        objective='reg:squarederror',  # squared-error regression ('reg:linear' is the deprecated name)
        booster='gblinear',
        n_jobs=50,
        gamma=0,
        min_child_weight=1,
        max_delta_step=0,
        subsample=1,
        colsample_bytree=1,
        colsample_bylevel=1,
        reg_alpha=0,
        reg_lambda=1,
        scale_pos_weight=1,
        base_score=0.5,
        random_state=0,
        importance_type='gain')
    
    my_model.fit(train_X, train_y)
    
    # Predict on the test set
    predictions = my_model.predict(test_X)
    
    # Evaluate the predictions with mean absolute error
    print("Mean Absolute Error : " + str(mean_absolute_error(test_y, predictions)))
    

    Result:

    Mean Absolute Error : 47.98486383348952
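
    Because XGBRegressor follows the scikit-learn estimator API, it also plugs directly into utilities such as cross_val_score; a minimal sketch reusing my_model, X and y from above:

    from sklearn.model_selection import cross_val_score

    # 5-fold cross-validated MAE; scikit-learn reports it as a negative score,
    # so flip the sign to get the usual positive error
    scores = -cross_val_score(my_model, X, y, cv=5, scoring='neg_mean_absolute_error')
    print('CV MAE per fold:', scores)
    print('Mean CV MAE:', scores.mean())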

  • Original article: https://www.cnblogs.com/xiximayou/p/14421804.html