GBDT(Gradient Boosting Decision Tree,梯度提升决策树)可以解决分类和回归问题
回归问题(sklearn.ensemble.GradientBoostingRegressor),构造函数签名如下:
def __init__(self, loss='ls', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0., max_depth=3, min_impurity_decrease=0., min_impurity_split=None, init=None, random_state=None, max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto')
示例
import numpy as np
from sklearn.datasets import make_friedman1
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Regression demo: fit a gradient-boosted regressor on the synthetic
# Friedman #1 data set and report the mean squared error on held-out data.
# The first 200 samples are used for training, the remaining 1000 for testing.
X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
X_train, X_test = X[:200], X[200:]
y_train, y_test = y[:200], y[200:]

### Loss functions
# - Absolute error, L = |y - f(x)|: the negative gradient is sign(y - f(x)),
#   i.e. either 1 or -1. In sklearn this corresponds to loss='lad'.
# - Huber loss: in sklearn this corresponds to loss='huber'.
# - Mean squared error: in sklearn this corresponds to loss='ls'.
# NOTE: 'ls' and 'lad' are the names used by older scikit-learn releases;
# from scikit-learn 1.0 on they are called 'squared_error' and
# 'absolute_error'. 'huber' is unchanged.
est = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=1,
    random_state=0,
    loss='huber',
).fit(X_train, y_train)

pred = est.predict(X_test)
# mean_squared_error is documented as (y_true, y_pred); pass the ground truth
# first so the call stays correct if options such as sample weights are added.
error = mean_squared_error(y_test, pred)

# Range of the test targets, to put the error in context.
print(max(y_test), min(y_test))  # (27.214332670044374, 0.8719243023544349)
print(error)
# Test MSE noted by the author for each loss setting:
#   loss='ls'     5.009154859960321
#   loss='lad'    5.817510629608294
#   loss='huber'  4.690823542377095
分类问题(sklearn.ensemble.GradientBoostingClassifier),构造函数签名如下:
def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0., max_depth=3, min_impurity_decrease=0., min_impurity_split=None, init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto')
示例
from time import time

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler

import mnist

if __name__ == "__main__":
    # Load the MNIST data set and use it to test the GBDT classification model.
    mnistSet = mnist.loadLecunMnistSet()
    train_X, train_Y, test_X, test_Y = mnistSet[0], mnistSet[1], mnistSet[2], mnistSet[3]
    m, _ = np.shape(train_X)

    # Shuffled row indices of the training set. A real list is required:
    # on Python 3 `range(m)` is immutable and np.random.shuffle rejects it.
    idx = list(range(m))
    np.random.shuffle(idx)

    # Dimensionality reduction with PCA (disabled; PCA is not imported here).
    # num = 30000
    # pca = PCA(n_components=0.9, whiten=True, random_state=0)
    # for i in range(int(np.ceil(1.0 * m / num))):
    #     minEnd = min((i + 1) * num, m)
    #     sub_idx = idx[i * num:minEnd]
    #     train_pca_X = pca.fit_transform(train_X[sub_idx])
    #     print(np.shape(train_pca_X))

    print(" **********测试GradientBoostingClassifier类**********")
    t = time()

    # Hyper-parameter search (disabled). The grids appear to have been tried
    # one at a time; the model below uses fixed values.
    # param_grid1 = {"n_estimators": range(1000, 2001, 100)}
    # param_grid2 = {'max_depth': range(30, 71, 10), 'min_samples_split': range(4, 9, 2)}
    # param_grid3 = {'min_samples_split': range(4, 9, 2), 'min_samples_leaf': range(3, 12, 2)}
    # param_grid4 = {'subsample': np.arange(0.6, 1.0, 0.05)}
    # model = GridSearchCV(
    #     estimator=GradientBoostingClassifier(max_features=90, max_depth=40, min_samples_split=8, learning_rate=0.1,
    #                                          n_estimators=1800),
    #     param_grid=param_grid4, cv=3)
    # # Fit the training data set
    # model.fit(train_X, train_Y)
    # print("最好的参数是:%s, 此时的得分是:%0.2f" % (model.best_params_, model.best_score_))

    model = GradientBoostingClassifier(
        max_features=90,
        max_depth=40,
        min_samples_split=8,
        min_samples_leaf=3,
        n_estimators=1200,
        learning_rate=0.05,
        subsample=0.95,
    )
    # Fit the training data set.
    model.fit(train_X, train_Y)

    # Predict on the (shuffled) training set; labels are indexed with the same
    # permutation so predictions and targets stay aligned.
    train_Y_hat = model.predict(train_X[idx])
    print("训练集精确度: ", accuracy_score(train_Y[idx], train_Y_hat))

    # Predict on the test set.
    test_Y_hat = model.predict(test_X)
    print("测试集精确度: ", accuracy_score(test_Y, test_Y_hat))

    # Elapsed wall-clock time since the timer was started, in seconds.
    print("总耗时:", time() - t, "秒")
参考资料:
https://github.com/haidawyl/Mnist 各种模型的用法