zoukankan      html  css  js  c++  java
  • LightGBM代码模板

    一、二分类代码模板

    二分类也就是我们平时的0和1二类,和逻辑回归的label一样

    import lightgbm as lgb
    import pandas as pd
    import numpy as np
    import pickle
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import train_test_split

    print("Loading Data ... ")

    # Load the data. load_data() is a user-supplied placeholder that must
    # return (training features, training labels, test features).
    train_x, train_y, test_x = load_data()

    # Hold out a stratified validation set.
    # NOTE: test_size=0.05 gives a 95:5 split (the original comment claimed
    # 7:3) — adjust to taste.
    X_train, X_test, y_train, y_test = train_test_split(
        train_x,
        train_y,
        test_size=0.05,
        random_state=1,
        stratify=train_y,  # keep the label distribution identical in both splits
    )

    # Wrap the splits in LightGBM Dataset objects.
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

    # Model configuration.
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': {'binary_logloss', 'auc'},  # binary log-loss and AUC
        'num_leaves': 5,
        'max_depth': 6,
        'min_data_in_leaf': 450,
        'learning_rate': 0.1,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.95,
        'bagging_freq': 5,
        'lambda_l1': 1,
        'lambda_l2': 0.001,  # larger values mean STRONGER L2 regularization
        'min_gain_to_split': 0.2,
        'verbose': 5,
        'is_unbalance': True,
    }

    # Train with early stopping on the validation set.
    print('Start training...')
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=10000,
                    valid_sets=lgb_eval,
                    early_stopping_rounds=500)

    print('Start predicting...')

    # predict() returns positive-class probabilities.
    preds = gbm.predict(test_x, num_iteration=gbm.best_iteration)

    # Binarize the probabilities.
    # BUG FIX: the original loop overwrote a single `result` variable on every
    # iteration, keeping only the LAST prediction; collect all of them instead.
    threshold = 0.5
    results = [1 if pred > threshold else 0 for pred in preds]

    # Export feature importances.
    # BUG FIX: the original '\n' literal was split by a raw line break,
    # which is a SyntaxError; use the escape sequence.
    importance = gbm.feature_importance()
    names = gbm.feature_name()
    with open('./feature_importance.txt', 'w+') as file:
        for index, im in enumerate(importance):
            file.write(names[index] + ', ' + str(im) + '\n')
    View Code

    里面的参数可以自己更改,又或者是配合着网格搜索法调参

    下面我使用kaggle的信用卡欺诈识别数据尝试一下效果如何

    # -*- coding: utf-8 -*-
    """
    LightGBM binary-classification demo on the Kaggle credit-card-fraud data.

    Created on Thu Feb 25 15:08:00 2021

    @author: Administrator
    """

    #%% Imports
    import pandas as pd
    import numpy as np
    from scipy import stats
    import seaborn as sns
    import matplotlib.pyplot as plt
    # %matplotlib inline  # IPython magic — only valid inside a notebook;
    # a bare '%' line is a SyntaxError in a plain .py file, so it is commented out.
    plt.rc("font", family="SimHei", size="12")  # enable Chinese glyphs in plots


    #%% Load data
    creditcard = pd.read_csv('D:/信用卡欺诈检测/creditcard.csv/creditcard.csv')


    #%% Modelling
    import lightgbm as lgb
    import pandas as pd
    import numpy as np
    import pickle
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import train_test_split

    print("Loading Data ... ")

    # Features: every column except the first and the last ('Class' label).
    train_x = creditcard.iloc[:, 1:-1]
    train_y = creditcard['Class']

    # Hold out a stratified validation set.
    # NOTE: test_size=0.05 is a 95:5 split (the original comment claimed 7:3).
    X_train, X_test, y_train, y_test = train_test_split(
        train_x,
        train_y,
        test_size=0.05,
        random_state=1,
        stratify=train_y,  # keep the label distribution identical in both splits
    )

    # Wrap the splits in LightGBM Dataset objects.
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

    # Model configuration.
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': {'binary_logloss', 'auc'},  # binary log-loss and AUC
        'num_leaves': 5,
        'max_depth': 6,
        'min_data_in_leaf': 450,
        'learning_rate': 0.1,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.95,
        'bagging_freq': 5,
        'lambda_l1': 1,
        'lambda_l2': 0.001,  # larger values mean STRONGER L2 regularization
        'min_gain_to_split': 0.2,
        'verbose': 5,
        'is_unbalance': True,
    }

    # Train with early stopping on the validation set.
    print('Start training...')
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=10000,
                    valid_sets=lgb_eval,
                    early_stopping_rounds=500)

    print('Start predicting...')

    # predict() returns positive-class probabilities for the validation set.
    preds = gbm.predict(X_test, num_iteration=gbm.best_iteration)

    # Binarize the probabilities at 0.5.
    preds = pd.DataFrame(preds)
    preds[preds > 0.5] = 1
    preds[preds <= 0.5] = 0

    #%% Evaluation
    from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
    matrix = confusion_matrix(y_test, preds)
    # BUG FIX: the original string literal contained a raw line break,
    # which is a SyntaxError; use the '\n' escape instead.
    print("混淆矩阵:\n", matrix)
    print("精度:", precision_score(y_test, preds))
    print("召回率:", recall_score(y_test, preds))
    print("f1分数:", f1_score(y_test, preds))
    '''
    混淆矩阵:
     [[13899   317]
     [    7    18]]
    精度: 0.05373134328358209
    召回率: 0.72
    f1分数: 0.1
    '''


    # Export feature importances.
    # BUG FIX: the original '\n' literal was split by a raw line break; rejoined.
    importance = gbm.feature_importance()
    names = gbm.feature_name()
    with open('./feature_importance.txt', 'w+') as file:
        for index, im in enumerate(importance):
            file.write(names[index] + ', ' + str(im) + '\n')
    View Code

    好像也不是特别好

    补充:

    1.LightGBM可以自动处理缺失值

  • 相关阅读:
    你现在是否在高效地使用时间?
    关于不使用web服务实现文本框自动完成扩展
    SpringBoot(一) -- SpringBoot入门
    微信小程序(三)--小程序UI开发
    微信小程序(二)--逻辑层与界面层
    微信小程序(一)--微信小程序的介绍
    Spring学习(七)--Spring MVC的高级技术
    Spring学习(四)--面向切面的Spring
    Spring学习(三)--高级装配
    django源码分析 LazySetting对象
  • 原文地址:https://www.cnblogs.com/cgmcoding/p/14447144.html
Copyright © 2011-2022 走看看