一、二分类代码模板
二分类是指标签只有 0 和 1 两个类别,与逻辑回归使用的 label 形式相同。
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
# Template: binary classification with LightGBM.
#
# Workflow: load data -> stratified train/validation split -> train with
# early stopping -> predict probabilities -> threshold to 0/1 labels ->
# export feature importances to a text file.
import pickle

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

print("Loading Data ... ")

# Load the data. NOTE: `load_data` is not defined in this template; provide
# your own loader returning (train features, train labels, test features).
train_x, train_y, test_x = load_data()

# Hold out 5% of the training data as a validation set (95:5 split; adjust
# `test_size` as needed). `stratify` keeps the class ratio of the labels the
# same in both parts, which matters for imbalanced targets.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_x,
    train_y,
    test_size=0.05,
    random_state=1,
    stratify=train_y,
)

# Create the LightGBM datasets; the validation set references the training
# set so that both share the same feature binning.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

# Training configuration.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    # A list (not a set) keeps the metric order deterministic.
    'metric': ['binary_logloss', 'auc'],
    'num_leaves': 5,
    'max_depth': 6,
    'min_data_in_leaf': 450,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5,
    'lambda_l1': 1,
    'lambda_l2': 0.001,  # larger values mean stronger L2 regularization
    'min_gain_to_split': 0.2,
    'verbose': 5,
    'is_unbalance': True,
}

# Train. Early stopping is passed as a callback: the `early_stopping_rounds`
# keyword of `lgb.train` was removed in LightGBM 4.x, while the callback
# works on both old and new versions.
print('Start training...')
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=10000,
    valid_sets=[lgb_eval],
    callbacks=[lgb.early_stopping(stopping_rounds=500)],
)

print('Start predicting...')
# `predict` returns the probability of the positive class for each row.
preds = gbm.predict(test_x, num_iteration=gbm.best_iteration)

# Convert probabilities to hard 0/1 labels. The original loop overwrote a
# single variable on every iteration, so no result was actually kept.
threshold = 0.5
results = (np.asarray(preds) > threshold).astype(int)

# Export feature importances, one "name, importance" pair per line.
importance = gbm.feature_importance()
names = gbm.feature_name()
with open('./feature_importance.txt', 'w', encoding='utf-8') as file:
    file.writelines(
        f"{name}, {score}\n" for name, score in zip(names, importance)
    )
模板中的参数可以根据需要自行修改,也可以配合网格搜索(grid search)进行调参。
下面我使用kaggle的信用卡欺诈识别数据尝试一下效果如何
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 25 15:08:00 2021

@author: Administrator
"""
#%% Imports
import pickle

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.metrics import (
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split

# `%matplotlib inline` is an IPython magic and a syntax error in a plain
# .py file; uncomment it only when running inside Jupyter/IPython.
# %matplotlib inline
plt.rc("font", family="SimHei", size="12")  # lets matplotlib render Chinese text

#%% Load the data
creditcard = pd.read_csv('D:/信用卡欺诈检测/creditcard.csv/creditcard.csv')

#%% Modelling
print("Loading Data ... ")

# Features: every column except the first and the last one; the target is
# the 'Class' column.
train_x = creditcard.iloc[:, 1:-1]
train_y = creditcard['Class']

# Hold out 5% of the data (95:5 split; adjust `test_size` as needed).
# `stratify` keeps the class ratio identical in both parts.
# NOTE(review): the same hold-out set drives early stopping and the final
# evaluation below, so the reported metrics are somewhat optimistic.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_x,
    train_y,
    test_size=0.05,
    random_state=1,
    stratify=train_y,
)

# Create the LightGBM datasets; the validation set references the training
# set so that both share the same feature binning.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

# Training configuration.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    # A list (not a set) keeps the metric order deterministic.
    'metric': ['binary_logloss', 'auc'],
    'num_leaves': 5,
    'max_depth': 6,
    'min_data_in_leaf': 450,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5,
    'lambda_l1': 1,
    'lambda_l2': 0.001,  # larger values mean stronger L2 regularization
    'min_gain_to_split': 0.2,
    'verbose': 5,
    'is_unbalance': True,
}

# Train. Early stopping is passed as a callback: the `early_stopping_rounds`
# keyword of `lgb.train` was removed in LightGBM 4.x, while the callback
# works on both old and new versions.
print('Start training...')
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=10000,
    valid_sets=[lgb_eval],
    callbacks=[lgb.early_stopping(stopping_rounds=500)],
)

print('Start predicting...')
# `predict` returns the probability of the positive class for each row.
pred_proba = gbm.predict(X_valid, num_iteration=gbm.best_iteration)

# Convert probabilities to hard 0/1 labels in one vectorized step.
preds = (np.asarray(pred_proba) > 0.5).astype(int)

#%% Model evaluation
matrix = confusion_matrix(y_valid, preds)
print("混淆矩阵: ", matrix)
print("精度:", precision_score(y_valid, preds))
print("召回率:", recall_score(y_valid, preds))
print("f1分数:", f1_score(y_valid, preds))
# Recorded output of one run:
#   混淆矩阵:  [[13899   317]
#              [    7    18]]
#   精度: 0.05373134328358209
#   召回率: 0.72
#   f1分数: 0.1

# Export feature importances, one "name, importance" pair per line.
importance = gbm.feature_importance()
names = gbm.feature_name()
with open('./feature_importance.txt', 'w', encoding='utf-8') as file:
    file.writelines(
        f"{name}, {score}\n" for name, score in zip(names, importance)
    )
效果并不理想:召回率为 0.72,但精度只有约 0.054,f1 分数仅为 0.1,说明误报(假阳性)很多。
补充:
1.LightGBM可以自动处理缺失值