  • LightGBM Modeling

    LightGBM

    1. Read CSV data and train a model with specified parameters

    # coding: utf-8
    import json
    import lightgbm as lgb
    import pandas as pd
    from sklearn.metrics import mean_squared_error
    
    # Load data
    print('Load data...')
    df_train = pd.read_csv('./data/regression.train.txt', header=None, sep='\t')
    df_test = pd.read_csv('./data/regression.test.txt', header=None, sep='\t')
    
    # Set up the training and test sets
    y_train = df_train[0].values
    y_test = df_test[0].values
    X_train = df_train.drop(0, axis=1).values
    X_test = df_test.drop(0, axis=1).values
    
    # Build the LightGBM Dataset objects (the counterpart of xgboost's DMatrix)
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
    
    # Parameters
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'l2', 'auc'},
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': 0
    }
    
    print('Starting training...')
    # Train
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=20,
                    valid_sets=lgb_eval,
                    early_stopping_rounds=5)
    
    # Save the model
    print('Saving model...')
    # Write the model to a text file
    gbm.save_model('model.txt')
    
    print('Starting prediction...')
    # Predict
    y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
    # Evaluate
    print('The RMSE of the predictions is:')
    print(mean_squared_error(y_test, y_pred) ** 0.5)
    
    Load data...
    Starting training...
    [1]	valid_0's auc: 0.764496	valid_0's l2: 0.24288
    Training until validation scores don't improve for 5 rounds.
    [2]	valid_0's auc: 0.766173	valid_0's l2: 0.239307
    [3]	valid_0's auc: 0.785547	valid_0's l2: 0.235559
    [4]	valid_0's auc: 0.797786	valid_0's l2: 0.230771
    [5]	valid_0's auc: 0.805155	valid_0's l2: 0.226297
    [6]	valid_0's auc: 0.803083	valid_0's l2: 0.22359
    [7]	valid_0's auc: 0.809622	valid_0's l2: 0.220982
    [8]	valid_0's auc: 0.808114	valid_0's l2: 0.218316
    [9]	valid_0's auc: 0.805671	valid_0's l2: 0.215884
    [10]	valid_0's auc: 0.805365	valid_0's l2: 0.213232
    [11]	valid_0's auc: 0.804857	valid_0's l2: 0.211087
    [12]	valid_0's auc: 0.805453	valid_0's l2: 0.20914
    Early stopping, best iteration is:
    [7]	valid_0's auc: 0.809622	valid_0's l2: 0.220982
    Saving model...
    Starting prediction...
    The RMSE of the predictions is:
    0.4700869286041175
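
    Note: in LightGBM 4.0 and later, early_stopping_rounds is no longer a keyword argument of lgb.train; early stopping is configured through callbacks instead. A minimal sketch of the equivalent call, assuming the params, lgb_train, and lgb_eval objects defined above:

    # Equivalent call for LightGBM >= 4.0: early stopping and per-round
    # logging are passed as callbacks rather than keyword arguments.
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=20,
                    valid_sets=[lgb_eval],
                    callbacks=[lgb.early_stopping(stopping_rounds=5),
                               lgb.log_evaluation(period=1)])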
    

    2. Training with sample weights

    # coding: utf-8
    import json
    import lightgbm as lgb
    import pandas as pd
    import numpy as np
    from sklearn.metrics import mean_squared_error
    import warnings
    warnings.filterwarnings("ignore")
    
    # Load the datasets
    print('Loading data...')
    df_train = pd.read_csv('./data/binary.train', header=None, sep='\t')
    df_test = pd.read_csv('./data/binary.test', header=None, sep='\t')
    W_train = pd.read_csv('./data/binary.train.weight', header=None)[0]
    W_test = pd.read_csv('./data/binary.test.weight', header=None)[0]
    
    y_train = df_train[0].values
    y_test = df_test[0].values
    X_train = df_train.drop(0, axis=1).values
    X_test = df_test.drop(0, axis=1).values
    
    num_train, num_feature = X_train.shape
    
    # Attach per-sample weights when building the Datasets
    lgb_train = lgb.Dataset(X_train, y_train,
                            weight=W_train, free_raw_data=False)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train,
                           weight=W_test, free_raw_data=False)
    
    # Set parameters
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'binary_logloss',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': 0
    }
    
    # Generate feature names
    feature_name = ['feature_' + str(col) for col in range(num_feature)]
    
    print('Starting training...')
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=10,
                    valid_sets=lgb_train,  # evaluate on the training set
                    feature_name=feature_name,
                    categorical_feature=[21])
    
    Loading data...
    Starting training...
    [1]	training's binary_logloss: 0.680298
    [2]	training's binary_logloss: 0.672021
    [3]	training's binary_logloss: 0.664444
    [4]	training's binary_logloss: 0.655536
    [5]	training's binary_logloss: 0.647375
    [6]	training's binary_logloss: 0.64095
    [7]	training's binary_logloss: 0.63514
    [8]	training's binary_logloss: 0.628769
    [9]	training's binary_logloss: 0.622774
    [10]	training's binary_logloss: 0.616895
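
    If separate weight files are not available, the per-sample weights can also be constructed in code. A minimal sketch, where up-weighting the positive class by a factor of 2.0 is purely an illustrative assumption:

    import numpy as np

    # Hypothetical weighting: positive samples get twice the weight of
    # negative ones (the factor 2.0 is an assumption for illustration).
    W_train = np.where(y_train == 1, 2.0, 1.0)
    lgb_train = lgb.Dataset(X_train, y_train,
                            weight=W_train, free_raw_data=False)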
    

    3. Loading the model and predicting

    # Inspect feature names
    print('Finished 10 rounds of training...')
    print('The 7th feature is:')
    print(repr(lgb_train.feature_name[6]))
    
    # Save the model
    gbm.save_model('./model/lgb_model.txt')
    
    # Feature names
    print('Feature names:')
    print(gbm.feature_name())
    
    # Feature importances
    print('Feature importances:')
    print(list(gbm.feature_importance()))
    
    # Load the model with lgb.Booster
    print('Loading model for prediction')
    bst = lgb.Booster(model_file='./model/lgb_model.txt')
    
    # Predict
    y_pred = bst.predict(X_test)
    
    # Evaluate on the test set
    print('RMSE on the test set:')
    print(mean_squared_error(y_test, y_pred) ** 0.5)
    
    Finished 10 rounds of training...
    The 7th feature is:
    'feature_6'
    Feature names:
    ['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9', 'feature_10', 'feature_11', 'feature_12', 'feature_13', 'feature_14', 'feature_15', 'feature_16', 'feature_17', 'feature_18', 'feature_19', 'feature_20', 'feature_21', 'feature_22', 'feature_23', 'feature_24', 'feature_25', 'feature_26', 'feature_27']
    Feature importances:
    [9, 6, 1, 15, 5, 40, 3, 0, 0, 8, 2, 1, 0, 9, 2, 0, 0, 6, 2, 6, 0, 0, 37, 2, 30, 50, 37, 29]
    Loading model for prediction
    RMSE on the test set:
    0.4624111763226729
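
    Besides the text format written by save_model, a trained Booster can also be pickled as a regular Python object. A sketch using joblib (assuming it is installed):

    import joblib

    # Persist the Booster as a pickle; loading restores a full Booster
    # object that can predict directly.
    joblib.dump(gbm, './model/lgb_model.pkl')
    bst = joblib.load('./model/lgb_model.pkl')
    y_pred = bst.predict(X_test)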
    

    4. Continue training from a previous model

    # Continue training
    # Initialize from the model saved at ./model/lgb_model.txt
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=10,
                    init_model='./model/lgb_model.txt',
                    valid_sets=lgb_eval)
    
    print('Finished rounds 10-20, initialized from the old model...')
    
    # Adjust hyperparameters during training
    # here, for example, the learning rate is decayed each iteration
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=10,
                    init_model=gbm,
                    learning_rates=lambda iter: 0.05 * (0.99 ** iter),
                    valid_sets=lgb_eval)
    
    print('Finished rounds 20-30 with a gradually decaying learning rate...')
    
    # Adjust other hyperparameters
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=10,
                    init_model=gbm,
                    valid_sets=lgb_eval,
                    callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)])
    
    print('Finished rounds 30-40 while adjusting the bagging fraction...')
    
    [11]	valid_0's binary_logloss: 0.614214
    [12]	valid_0's binary_logloss: 0.609777
    [13]	valid_0's binary_logloss: 0.605236
    [14]	valid_0's binary_logloss: 0.601523
    [15]	valid_0's binary_logloss: 0.598256
    [16]	valid_0's binary_logloss: 0.595957
    [17]	valid_0's binary_logloss: 0.591773
    [18]	valid_0's binary_logloss: 0.588163
    [19]	valid_0's binary_logloss: 0.585106
    [20]	valid_0's binary_logloss: 0.582878
    Finished rounds 10-20, initialized from the old model...
    [21]	valid_0's binary_logloss: 0.614214
    [22]	valid_0's binary_logloss: 0.60982
    [23]	valid_0's binary_logloss: 0.605366
    [24]	valid_0's binary_logloss: 0.601754
    [25]	valid_0's binary_logloss: 0.598598
    [26]	valid_0's binary_logloss: 0.596394
    [27]	valid_0's binary_logloss: 0.59243
    [28]	valid_0's binary_logloss: 0.58903
    [29]	valid_0's binary_logloss: 0.586164
    [30]	valid_0's binary_logloss: 0.583693
    Finished rounds 20-30 with a gradually decaying learning rate...
    [31]	valid_0's binary_logloss: 0.613881
    [32]	valid_0's binary_logloss: 0.608822
    [33]	valid_0's binary_logloss: 0.604746
    [34]	valid_0's binary_logloss: 0.600465
    [35]	valid_0's binary_logloss: 0.596407
    [36]	valid_0's binary_logloss: 0.593572
    [37]	valid_0's binary_logloss: 0.589196
    [38]	valid_0's binary_logloss: 0.586633
    [39]	valid_0's binary_logloss: 0.583136
    [40]	valid_0's binary_logloss: 0.579651
    Finished rounds 30-40 while adjusting the bagging fraction...
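
    Note: the learning_rates argument used above was removed from lgb.train in LightGBM 4.0; the same per-iteration schedule can be expressed with the reset_parameter callback. A minimal sketch:

    # Equivalent learning-rate decay for LightGBM >= 4.0, as a callback.
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=10,
                    init_model=gbm,
                    valid_sets=lgb_eval,
                    callbacks=[lgb.reset_parameter(
                        learning_rate=lambda iter: 0.05 * (0.99 ** iter))])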
    

    5. Custom loss function

    # Similar in form to xgboost:
    # a custom loss function must return the gradient and the hessian
    def loglikelihood(preds, train_data):
        labels = train_data.get_label()
        preds = 1. / (1. + np.exp(-preds))
        grad = preds - labels
        hess = preds * (1. - preds)
        return grad, hess
    
    
    # Custom evaluation function
    def binary_error(preds, train_data):
        labels = train_data.get_label()
        return 'error', np.mean(labels != (preds > 0.5)), False
    
    
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=10,
                    init_model=gbm,
                    fobj=loglikelihood,
                    feval=binary_error,
                    valid_sets=lgb_eval)
    
    print('Finished rounds 40-50 with a custom loss function and eval metric...')
    
    [41]	valid_0's binary_logloss: 4.61573	valid_0's error: 0.394
    [42]	valid_0's binary_logloss: 4.66615	valid_0's error: 0.386
    [43]	valid_0's binary_logloss: 4.58473	valid_0's error: 0.388
    [44]	valid_0's binary_logloss: 4.63403	valid_0's error: 0.388
    [45]	valid_0's binary_logloss: 4.81468	valid_0's error: 0.38
    [46]	valid_0's binary_logloss: 4.86387	valid_0's error: 0.366
    [47]	valid_0's binary_logloss: 4.71095	valid_0's error: 0.37
    [48]	valid_0's binary_logloss: 4.81772	valid_0's error: 0.358
    [49]	valid_0's binary_logloss: 4.87924	valid_0's error: 0.358
    [50]	valid_0's binary_logloss: 4.86966	valid_0's error: 0.352
    Finished rounds 40-50 with a custom loss function and eval metric...
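
    Note: fobj was also removed from lgb.train in LightGBM 4.0; a callable objective is passed through params instead, while feval is still accepted. A minimal sketch under that assumption:

    # LightGBM >= 4.0: pass the custom objective via params, not fobj.
    params_custom = dict(params, objective=loglikelihood)
    gbm = lgb.train(params_custom,
                    lgb_train,
                    num_boost_round=10,
                    init_model=gbm,
                    feval=binary_error,
                    valid_sets=lgb_eval)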
    

    Using LightGBM with sklearn

    1. Model with LightGBM, evaluate with sklearn

    # coding: utf-8
    import lightgbm as lgb
    import pandas as pd
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import GridSearchCV
    
    # Load data
    print('Loading data...')
    df_train = pd.read_csv('./data/regression.train.txt', header=None, sep='\t')
    df_test = pd.read_csv('./data/regression.test.txt', header=None, sep='\t')
    
    # Extract features and labels
    y_train = df_train[0].values
    y_test = df_test[0].values
    X_train = df_train.drop(0, axis=1).values
    X_test = df_test.drop(0, axis=1).values
    
    print('Starting training...')
    # Initialize an LGBMRegressor directly
    # Its interface is essentially the same as other sklearn regressors
    gbm = lgb.LGBMRegressor(objective='regression',
                            num_leaves=31,
                            learning_rate=0.05,
                            n_estimators=20)
    
    # Fit the model with the fit method
    gbm.fit(X_train, y_train,
            eval_set=[(X_test, y_test)],
            eval_metric='l1',
            early_stopping_rounds=5)
    
    # Predict
    print('Starting prediction...')
    y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
    # Evaluate the predictions
    print('The RMSE of the predictions is:')
    print(mean_squared_error(y_test, y_pred) ** 0.5)
    
    Loading data...
    Starting training...
    [1]	valid_0's l1: 0.491735	valid_0's l2: 0.242763
    Training until validation scores don't improve for 5 rounds.
    [2]	valid_0's l1: 0.486563	valid_0's l2: 0.237895
    [3]	valid_0's l1: 0.481489	valid_0's l2: 0.233277
    [4]	valid_0's l1: 0.476848	valid_0's l2: 0.22925
    [5]	valid_0's l1: 0.47305	valid_0's l2: 0.226155
    [6]	valid_0's l1: 0.469049	valid_0's l2: 0.222963
    [7]	valid_0's l1: 0.465556	valid_0's l2: 0.220364
    [8]	valid_0's l1: 0.462208	valid_0's l2: 0.217872
    [9]	valid_0's l1: 0.458676	valid_0's l2: 0.215328
    [10]	valid_0's l1: 0.454998	valid_0's l2: 0.212743
    [11]	valid_0's l1: 0.452047	valid_0's l2: 0.210805
    [12]	valid_0's l1: 0.449158	valid_0's l2: 0.208945
    [13]	valid_0's l1: 0.44608	valid_0's l2: 0.206986
    [14]	valid_0's l1: 0.443554	valid_0's l2: 0.205513
    [15]	valid_0's l1: 0.440643	valid_0's l2: 0.203728
    [16]	valid_0's l1: 0.437687	valid_0's l2: 0.201865
    [17]	valid_0's l1: 0.435454	valid_0's l2: 0.200639
    [18]	valid_0's l1: 0.433288	valid_0's l2: 0.199522
    [19]	valid_0's l1: 0.431297	valid_0's l2: 0.198552
    [20]	valid_0's l1: 0.428946	valid_0's l2: 0.197238
    Did not meet early stopping. Best iteration is:
    [20]	valid_0's l1: 0.428946	valid_0's l2: 0.197238
    Starting prediction...
    The RMSE of the predictions is:
    0.4441153344254208
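
    The sklearn interface changed in the same way: in LightGBM 4.0 and later, early_stopping_rounds is no longer a fit keyword, and callbacks are used instead. A minimal sketch:

    # LightGBM >= 4.0 sklearn API: early stopping via callbacks.
    gbm.fit(X_train, y_train,
            eval_set=[(X_test, y_test)],
            eval_metric='l1',
            callbacks=[lgb.early_stopping(stopping_rounds=5)])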
    

    2. Grid search for the best hyperparameters

    # Use scikit-learn's grid search with cross-validation to pick the best hyperparameters
    estimator = lgb.LGBMRegressor(num_leaves=31)
    
    param_grid = {
        'learning_rate': [0.01, 0.1, 1],
        'n_estimators': [20, 40]
    }
    
    gbm = GridSearchCV(estimator, param_grid)
    
    gbm.fit(X_train, y_train)
    
    print('Best hyperparameters found by grid search:')
    print(gbm.best_params_)
    
    Best hyperparameters found by grid search:
    {'learning_rate': 0.1, 'n_estimators': 40}
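
    For larger search spaces, scikit-learn's RandomizedSearchCV samples a fixed number of candidate combinations instead of enumerating the full grid. A sketch with a hypothetical, slightly wider parameter space:

    from sklearn.model_selection import RandomizedSearchCV

    # Sample 5 random combinations from a wider (illustrative) space.
    param_dist = {
        'learning_rate': [0.01, 0.05, 0.1, 0.5, 1],
        'n_estimators': [20, 40, 60, 80]
    }
    rs = RandomizedSearchCV(estimator, param_dist, n_iter=5, random_state=42)
    rs.fit(X_train, y_train)
    print('Best hyperparameters found by random search:')
    print(rs.best_params_)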
    

    3. Plotting and interpretation

    # coding: utf-8
    import lightgbm as lgb
    import pandas as pd
    
    try:
        import matplotlib.pyplot as plt
    except ImportError:
        raise ImportError('You need to install matplotlib for plotting.')
    
    # Load the datasets
    print('Loading data...')
    df_train = pd.read_csv('./data/regression.train.txt', header=None, sep='\t')
    df_test = pd.read_csv('./data/regression.test.txt', header=None, sep='\t')
    
    # Extract features and labels
    y_train = df_train[0].values
    y_test = df_test[0].values
    X_train = df_train.drop(0, axis=1).values
    X_test = df_test.drop(0, axis=1).values
    
    # Build the LightGBM Dataset objects
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train)
    
    # Set parameters
    params = {
        'num_leaves': 5,
        'metric': ('l1', 'l2'),
        'verbose': 0
    }
    
    evals_result = {}  # to record eval results for plotting
    
    print('Starting training...')
    # Train
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=100,
                    valid_sets=[lgb_train, lgb_test],
                    feature_name=['f' + str(i + 1) for i in range(28)],
                    categorical_feature=[21],
                    evals_result=evals_result,
                    verbose_eval=10)
    
    print('Plotting metrics recorded during training...')
    ax = lgb.plot_metric(evals_result, metric='l1')
    plt.show()
    
    print('Plotting feature importances...')
    ax = lgb.plot_importance(gbm, max_num_features=10)
    plt.show()
    
    print('Plotting tree 84...')
    ax = lgb.plot_tree(gbm, tree_index=83, figsize=(20, 8), show_info=['split_gain'])
    plt.show()
    
    # print('Rendering tree 84 with graphviz...')
    # graph = lgb.create_tree_digraph(gbm, tree_index=83, name='Tree84')
    # graph.render(view=True)
    
    Loading data...
    Starting training...
    [10]	training's l1: 0.457448	training's l2: 0.217995	valid_1's l1: 0.456464	valid_1's l2: 0.21641
    [20]	training's l1: 0.436869	training's l2: 0.205099	valid_1's l1: 0.434057	valid_1's l2: 0.201616
    [30]	training's l1: 0.421302	training's l2: 0.197421	valid_1's l1: 0.417019	valid_1's l2: 0.192514
    [40]	training's l1: 0.411107	training's l2: 0.192856	valid_1's l1: 0.406303	valid_1's l2: 0.187258
    [50]	training's l1: 0.403695	training's l2: 0.189593	valid_1's l1: 0.398997	valid_1's l2: 0.183688
    [60]	training's l1: 0.398704	training's l2: 0.187043	valid_1's l1: 0.393977	valid_1's l2: 0.181009
    [70]	training's l1: 0.394876	training's l2: 0.184982	valid_1's l1: 0.389805	valid_1's l2: 0.178803
    [80]	training's l1: 0.391147	training's l2: 0.1828	valid_1's l1: 0.386476	valid_1's l2: 0.176799
    [90]	training's l1: 0.388101	training's l2: 0.180817	valid_1's l1: 0.384404	valid_1's l2: 0.175775
    [100]	training's l1: 0.385174	training's l2: 0.179171	valid_1's l1: 0.382929	valid_1's l2: 0.175321
    Plotting metrics recorded during training...
    [figure: L1 metric curves for the training and validation sets]

    Plotting feature importances...
    [figure: top-10 feature importances]

    Plotting tree 84...
    [figure: structure of tree 84]

  • Original article: https://www.cnblogs.com/chenxiangzhen/p/10962983.html