  • [Datawhale Session 17 | Financial Risk Control for Beginners: Loan Default Prediction] Task05: Model Fusion (3 days): stacking + a GPU-acceleration example

    Reference link

    https://www.zhihu.com/column/exuding (Zhihu column: financial risk control for beginners, loan default modeling)

    Acceleration uses the 16 GB GPU on Baidu AI Studio.
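
    The training code below actually runs on CPU ('tree_method': 'exact' for xgboost). A minimal sketch of the GPU switches, assuming GPU-enabled builds of xgboost and lightgbm as on the AI Studio GPU image; these are the libraries' standard GPU parameters, not settings taken from the original article:

    # Hypothetical GPU variants of the tree-booster parameters used later in this post.
    xgb_gpu_overrides = {
        'tree_method': 'gpu_hist',   # GPU histogram algorithm, replaces 'exact'
        'gpu_id': 0,                 # index of the GPU to use
    }
    lgb_gpu_overrides = {
        'device': 'gpu',             # LightGBM's OpenCL GPU backend
        'gpu_platform_id': 0,
        'gpu_device_id': 0,
    }
    # e.g. params.update(xgb_gpu_overrides) before calling xgb.train(...)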

    TODO: to be continued

    The model fusion code follows:

    # # If a persistent installation is required, use a persistent path, as in the following example:
    # # !mkdir /home/aistudio/external-libraries
    # # !pip install --upgrade pip -t /home/aistudio/external-libraries
    # # !pip install pandas -t /home/aistudio/external-libraries
    # # !pip install pandas_profiling 
    
    # !pip install lightgbm  -t /home/aistudio/external-libraries
    # !pip install xgboost  -t /home/aistudio/external-libraries
    # !pip install heamy  -t /home/aistudio/external-libraries
    import sys
    sys.path.append('/home/aistudio/external-libraries')
    
    
    
    ####  import
    import datetime
    import warnings
    
    import numpy as np
    import pandas as pd
    import xgboost as xgb
    import lightgbm as lgb
    from sklearn import metrics
    from sklearn.model_selection import train_test_split
    warnings.filterwarnings('ignore')
    import seaborn as sns
    import matplotlib.pyplot as plt
    
    # model fusion
    from heamy.dataset import Dataset
    from heamy.estimator import Classifier
    from sklearn.linear_model import LogisticRegression
    from heamy.pipeline import ModelsPipeline
    


    ####  Preliminary settings
    """
    Seaborn-related settings
    @return:
    """
    # Use the default Seaborn theme
    sns.set()
    # Seaborn has five styles: darkgrid, whitegrid, dark, white, ticks; darkgrid is the default.
    sns.set_style("whitegrid")
    # Four preset contexts, from smallest to largest: paper, notebook, talk, poster; notebook is the default.
    sns.set_context('talk')
    # Chinese font setting: SimHei
    plt.rcParams['font.sans-serif'] = ['SimHei']
    # Fix the minus sign '-' rendering as a box when saving figures
    plt.rcParams['axes.unicode_minus'] = False
    # Fix Chinese rendering in Seaborn; note that sns.set() here resets the style/context chosen above
    sns.set(font='SimHei')
    
    print('import done, sns & plt preset done ', datetime.datetime.now())
    
    ####  0.1 Common variables
    
    ali_file_path = './user_data/'
    linux_file_path = '/plus/阿里云开发者-天池比赛/02_零基础入门金融风控_贷款违约预测/'
    win_file_path = 'E:\\阿里云开发者-天池比赛\\02_零基础入门金融风控_贷款违约预测\\'
    baidu_file_path = 'data/data54049/'
    
    # env_name = 'win'
    # env_name = 'centos'
    env_name = 'baidu'
    
    now = datetime.datetime.now().strftime('%Y-%m-%d_%H_%M_%S')
    train_file_path = ''
    testA_file_path = ''
    out_path = './'
    if env_name == 'win':
        out_path = 'E:\\PycharmProjects\\TianChiProject\\00_山枫叶纷飞\\competitions\\002_financial_risk\\predict_result\\' + 'model_fusion_a_predict_{}.csv'.format(now)
        train_file_path = win_file_path + 'train.csv'
        testA_file_path = win_file_path + 'testA.csv'
    elif env_name == 'centos':
        # branch added for completeness: 'centos' is offered above but was left unhandled
        out_path = linux_file_path + 'model_fusion_a_predict_{}.csv'.format(now)
        train_file_path = linux_file_path + 'train.csv'
        testA_file_path = linux_file_path + 'testA.csv'
    elif env_name == 'baidu':
        out_path = 'work/predict_result/test_a_{}.csv'.format(now)
        train_file_path = baidu_file_path + 'train.csv'
        testA_file_path = baidu_file_path + 'testA.csv'
    
    # %% Data loading
    # reduce_memory_usage downcasts column dtypes to shrink the dataframe's memory footprint
    def reduce_memory_usage(df):
        """ iterate through all the columns of a dataframe and modify the data type
            to reduce memory usage.
        """
        start_mem = df.memory_usage().sum()  / 1024**2
        print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
        for col in df.columns:
            col_type = df[col].dtype
    
            if col_type != object:
                c_min = df[col].min()
                c_max = df[col].max()
                if str(col_type)[:3] == 'int':
                    if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif c_min >= np.iinfo(np.int64).min and c_max <= np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)
                else:
                    if c_min >= np.finfo(np.float16).min and c_max <= np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float64)
            else:
                df[col] = df[col].astype('category')
        end_mem = df.memory_usage().sum()  / 1024**2
        print('>>>>>>>>> Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('>>>>>>>>> Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))  # about a 75.6% reduction on this data
        return df
    
    train = pd.read_csv(train_file_path)
    testA = pd.read_csv(testA_file_path)
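
    # Note: reduce_memory_usage is defined above but never called in this listing.
    # Wiring it in is one line per frame; a sketch only, since the function casts
    # object columns to 'category', which can interfere with the string
    # preprocessing of employmentLength below:
    # train = reduce_memory_usage(train)
    # testA = reduce_memory_usage(testA)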
    
    train.head()
    
    data = pd.concat([train, testA], axis=0, ignore_index=True)
    
    ####  Data preview

    # - Many variables cannot be fed to training directly (e.g. grade, subGrade, employmentLength, issueDate, earliesCreditLine) and need preprocessing first
    
    print(sorted(data['grade'].unique()))
    print(sorted(data['subGrade'].unique()))
    # Expected output:
    #     ['A', 'B', 'C', 'D', 'E', 'F', 'G']
    #     ['A1', 'A2', 'A3', 'A4', 'A5', 'B1', 'B2', 'B3', 'B4', 'B5', 'C1', 'C2', 'C3', 'C4', 'C5', 'D1', 'D2', 'D3', 'D4', 'D5', 'E1', 'E2', 'E3', 'E4', 'E5', 'F1', 'F2', 'F3', 'F4', 'F5', 'G1', 'G2', 'G3', 'G4', 'G5']
    
    data['employmentLength'].value_counts(dropna=False).sort_index()
    
    
    ####  Data preprocessing

    # TODO: missing-value handling

    # First, convert employmentLength to a numeric value
    
    data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True)
    data['employmentLength'].replace('< 1 year', '0 years', inplace=True)
    
    def employmentLength_to_int(s):
        if pd.isnull(s):
            return s
        else:
            return np.int8(s.split()[0])
    
    data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)
    
    data['employmentLength'].value_counts(dropna=False).sort_index()
    
    
    # Preprocess earliesCreditLine
    
    
    data['earliesCreditLine'].sample(5)
    
    
    data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:]))  # keep the last four characters: the year
    data['earliesCreditLine'].describe()
    
    
    data.head()
    
    
    
    # Categorical feature processing
    
    
    # A subset of the categorical features
    cate_features = ['grade', 'subGrade', 'employmentTitle', 'homeOwnership', 'verificationStatus', 'purpose', 'postCode', 'regionCode', 
                     'applicationType', 'initialListStatus', 'title', 'policyCode']
    for f in cate_features:
        print(f, 'number of categories:', data[f].nunique())
    
    
    
    # More than two categories, and not high-dimensional sparse: one-hot encode
    data = pd.get_dummies(data, columns=['grade', 'subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode'], drop_first=True)
    
    
    # High-cardinality categorical features: replace each with a count encoding and a rank encoding
    for f in ['employmentTitle', 'postCode', 'title']:
        data[f+'_cnts'] = data.groupby([f])['id'].transform('count')
        data[f+'_rank'] = data.groupby([f])['id'].rank(ascending=False).astype(int)
        del data[f]
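
    # To make the count/rank encoding above concrete, a small illustrative
    # example on toy data (not from the competition):
    #   toy = pd.DataFrame({'id': [1, 2, 3, 4], 'postCode': ['A', 'A', 'B', 'A']})
    #   toy['postCode_cnts'] = toy.groupby(['postCode'])['id'].transform('count')
    #   toy['postCode_rank'] = toy.groupby(['postCode'])['id'].rank(ascending=False).astype(int)
    # gives:
    #      id postCode  postCode_cnts  postCode_rank
    #   0   1        A              3              3
    #   1   2        A              3              2
    #   2   3        B              1              1
    #   3   4        A              3              1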
    
    ####  Train/test data preparation
    
    
    features = [f for f in data.columns if f not in ['id','issueDate','isDefault']]
    
    train = data[data.isDefault.notnull()].reset_index(drop=True)
    test = data[data.isDefault.isnull()].reset_index(drop=True)
    
    x_train = train[features]
    x_test = test[features]
    y_train = train['isDefault']
    
    
    
    # Output: import done, sns & plt preset done  2020-09-27 18:28:02.821639
    
    ####  Model training
    ####  5.1 Build the models [parameters: xgb --> 鱼佬's baseline, lgb --> Bayesian tuning]
    
    def xgb_model(X_train, y_train, X_test, y_test=None):
        X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2)
        train_matrix = xgb.DMatrix(X_train_split , label=y_train_split)
        valid_matrix = xgb.DMatrix(X_val , label=y_val)
        test_matrix = xgb.DMatrix(X_test)
    
        params = {
            'booster': 'gbtree',
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
            'gamma': 1,
            'min_child_weight': 1.5,
            'max_depth': 5,
            'lambda': 10,
            'subsample': 0.7,
            'colsample_bytree': 0.7,
            'colsample_bylevel': 0.7,
            'eta': 0.04,
            'tree_method': 'exact',
            'seed': 2020,
            'n_jobs': -1,
            "silent": True,
        }
        watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]
    
        model = xgb.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=200, early_stopping_rounds=200)
        """Score on the validation set"""
        val_pred = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
        fpr, tpr, threshold = metrics.roc_curve(y_val, val_pred)
        roc_auc = metrics.auc(fpr, tpr)
        print('AUC of the tuned xgboost single model on the validation set: {}'.format(roc_auc))
        """Predict on the test set"""
        test_pred = model.predict(test_matrix, ntree_limit=model.best_ntree_limit)
    
        return test_pred
    
    
    def lgb_model(X_train, y_train, X_test, y_test=None):
        X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2)
        train_matrix = lgb.Dataset(X_train_split, label=y_train_split)
        valid_matrix = lgb.Dataset(X_val, label=y_val)
    
        # Optimal parameters after tuning
        params = {
            'boosting_type': 'gbdt',
            'objective': 'binary',
            'metric': 'auc',
            'learning_rate': 0.01,
            'min_child_weight': 0.32,
            'num_leaves': 14,
            'max_depth': 4,
            'feature_fraction': 0.81,
            'bagging_fraction': 0.61,
            'bagging_freq': 9,
            'min_data_in_leaf': 13,
            'min_split_gain': 0.27,
            'reg_alpha': 9.58,
            'reg_lambda': 4.62,
            'seed': 2020,
            'n_jobs':-1,
            'silent': True,
            'verbose': -1,
        }
    
        model = lgb.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=500, early_stopping_rounds=500)
        """Score on the validation set"""
        val_pred = model.predict(X_val, num_iteration=model.best_iteration)
        fpr, tpr, threshold = metrics.roc_curve(y_val, val_pred)
        roc_auc = metrics.auc(fpr, tpr)
        print('AUC of the tuned lightgbm single model on the validation set: {}'.format(roc_auc))
        """Predict on the test set"""
        test_pred = model.predict(X_test, num_iteration=model.best_iteration)
    
        return test_pred
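
    # The heading above notes the lgb parameters came from Bayesian tuning.
    # A minimal sketch of such a search with the bayes_opt package
    # (pip install bayesian-optimization); the search ranges below are
    # illustrative assumptions, not the ones the original author used.
    from sklearn.model_selection import cross_val_score

    def lgb_cv(num_leaves, max_depth, bagging_fraction, feature_fraction, min_data_in_leaf):
        """5-fold CV AUC for one candidate parameter set."""
        model = lgb.LGBMClassifier(
            boosting_type='gbdt', objective='binary', learning_rate=0.1, n_estimators=500,
            num_leaves=int(num_leaves), max_depth=int(max_depth),
            bagging_fraction=round(bagging_fraction, 2), feature_fraction=round(feature_fraction, 2),
            min_data_in_leaf=int(min_data_in_leaf), n_jobs=-1)
        return cross_val_score(model, x_train, y_train, cv=5, scoring='roc_auc').mean()

    # from bayes_opt import BayesianOptimization
    # optimizer = BayesianOptimization(lgb_cv, {
    #     'num_leaves': (10, 200),
    #     'max_depth': (3, 20),
    #     'bagging_fraction': (0.5, 1.0),
    #     'feature_fraction': (0.5, 1.0),
    #     'min_data_in_leaf': (10, 100),
    # })
    # optimizer.maximize(init_points=5, n_iter=25)  # 5 random probes, then 25 guided steps
    # print(optimizer.max)                          # best AUC and parameters found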
    
    ####  5.2 Model-level fusion
    
    #   x_train = train[features]
    #   x_test = test[features]
    #   y_train = train['isDefault']
    
    model_dataset = Dataset(X_train=x_train, y_train=y_train, X_test=x_test)
    model_xgb = Classifier(dataset=model_dataset, estimator=xgb_model, name='xgb', use_cache=False)
    model_lgb = Classifier(dataset=model_dataset, estimator=lgb_model, name='lgb', use_cache=False)
    
    ####  5.3 Fuse the models with stacking
    
    
    pipeline = ModelsPipeline(model_xgb, model_lgb)
    pipeline
    
    ####  5.4 Build the first-level features: k defaults to 5, i.e. 5-fold cross-validation; with full_test=True,
    #       each base learner is also retrained on the full training set and used to predict the test set, yielding the new features
    stack_ds = pipeline.stack(k=5, seed=111, full_test=True)
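
    # For readers without heamy: pipeline.stack builds out-of-fold (OOF) first-level
    # features. A hand-rolled sketch of the same idea (illustrative only; heamy's
    # exact fold handling may differ). The model functions defined above already
    # match the (X_tr, y_tr, X_pred) -> predictions shape this helper expects.
    from sklearn.model_selection import KFold

    def stack_features(model_fns, X, y, X_test, k=5, seed=111):
        kf = KFold(n_splits=k, shuffle=True, random_state=seed)
        oof = np.zeros((len(X), len(model_fns)))            # first-level train features
        test_feats = np.zeros((len(X_test), len(model_fns)))
        for j, fn in enumerate(model_fns):
            for tr_idx, val_idx in kf.split(X):
                # out-of-fold: predict each validation fold from a model trained on the rest
                oof[val_idx, j] = fn(X.iloc[tr_idx], y.iloc[tr_idx], X.iloc[val_idx])
            # full_test=True: refit on the full training set, then predict the test set
            test_feats[:, j] = fn(X, y, X_test)
        return oof, test_feats

    # oof, test_feats = stack_features([xgb_model, lgb_model], x_train, y_train, x_test)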
    
    ####  5.5 Second level: stacking with logistic regression
    
    # Second-level learner: logistic regression over the first-level (stacked) predictions
    stacker = Classifier(dataset=stack_ds, estimator=LogisticRegression, parameters={'solver': 'lbfgs'})
    # Predictions for the test set
    test_pred = stacker.predict()
    test_pred
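
    # Sanity check of the second level before exporting: heamy exposes a validate
    # helper for this (per heamy's docs); using roc_auc_score as the scorer is an
    # assumption of ours, not something shown in the original article.
    # from sklearn.metrics import roc_auc_score
    # results = stacker.validate(k=5, scorer=roc_auc_score)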
    
    ####  Build the submission-format DataFrame and export to CSV
    test['isDefault'] = test_pred
    
    print('out_path: ', out_path)
    test[['id','isDefault']].to_csv(out_path, index=False)
    
    print('CSV export done', datetime.datetime.now())
    

    Output notes

    The xgb and lgb parameter num_boost_round was lowered to 5000 (the original article recommends 50000),

    which caused the training results to drop sharply. (With early stopping in place, the lower cap only matters when the best iteration would exceed 5000; at learning rates of 0.01 to 0.04 these models can still be improving past that point.)

    Each base learner is trained on the full training set, then used to predict the test set to obtain the new features:

    stack_ds = pipeline.stack(k=5, seed=233, full_test=True)

    Output:

    • AUC of the tuned xgboost single model on the validation set: 0.7314319707309533

    • AUC of the tuned xgboost single model on the validation set: 0.7328692352003359

    • AUC of the tuned xgboost single model on the validation set: 0.7348644657792799

    • AUC of the tuned xgboost single model on the validation set: 0.7292134605693141

    • AUC of the tuned xgboost single model on the validation set: 0.7273300682262768

    • AUC of the tuned xgboost single model on the validation set: 0.7315962351115055

    • AUC of the tuned lightgbm single model on the validation set: 0.7301180187432554

    • AUC of the tuned lightgbm single model on the validation set: 0.7276946066179788

    • AUC of the tuned lightgbm single model on the validation set: 0.7301011073663606

    • AUC of the tuned lightgbm single model on the validation set: 0.731260767784271

    • AUC of the tuned lightgbm single model on the validation set: 0.7274780648597519

    • AUC of the tuned lightgbm single model on the validation set: 0.7275650245325219

    Print all column names after fusion

    print('After fusion, all columns of x_train: ', x_train.columns)
    print('After fusion, all columns of test: ', test.columns)
    print('After fusion, test columns as np.array: ', np.array(test.columns))

    After fusion, all columns of x_train:  Index(['loanAmnt', 'term', 'interestRate', 'installment', 'employmentLength',
           'annualIncome', 'dti', 'delinquency_2years', 'ficoRangeLow',
           'ficoRangeHigh',
           ...
           'regionCode_47', 'regionCode_48', 'regionCode_49', 'regionCode_50',
           'employmentTitle_cnts', 'employmentTitle_rank', 'postCode_cnts',
           'postCode_rank', 'title_cnts', 'title_rank'],
          dtype='object', length=153)
    After fusion, all columns of test:  Index(['id', 'loanAmnt', 'term', 'interestRate', 'installment',
           'employmentLength', 'annualIncome', 'issueDate', 'isDefault', 'dti',
           ...
           'regionCode_47', 'regionCode_48', 'regionCode_49', 'regionCode_50',
           'employmentTitle_cnts', 'employmentTitle_rank', 'postCode_cnts',
           'postCode_rank', 'title_cnts', 'title_rank'],
          dtype='object', length=156)
    After fusion, test columns as np.array:  ['id' 'loanAmnt' 'term' 'interestRate' 'installment' 'employmentLength'
     'annualIncome' 'issueDate' 'isDefault' 'dti' 'delinquency_2years'
     'ficoRangeLow' 'ficoRangeHigh' 'openAcc' 'pubRec' 'pubRecBankruptcies'
     'revolBal' 'revolUtil' 'totalAcc' 'initialListStatus' 'applicationType'
     'earliesCreditLine' 'policyCode' 'n0' 'n1' 'n2' 'n2.1' 'n4' 'n5' 'n6'
     'n7' 'n8' 'n9' 'n10' 'n11' 'n12' 'n13' 'n14' 'n2.2' 'n2.3' 'grade_B'
     'grade_C' 'grade_D' 'grade_E' 'grade_F' 'grade_G' 'subGrade_A2'
     'subGrade_A3' 'subGrade_A4' 'subGrade_A5' 'subGrade_B1' 'subGrade_B2'
     'subGrade_B3' 'subGrade_B4' 'subGrade_B5' 'subGrade_C1' 'subGrade_C2'
     'subGrade_C3' 'subGrade_C4' 'subGrade_C5' 'subGrade_D1' 'subGrade_D2'
     'subGrade_D3' 'subGrade_D4' 'subGrade_D5' 'subGrade_E1' 'subGrade_E2'
     'subGrade_E3' 'subGrade_E4' 'subGrade_E5' 'subGrade_F1' 'subGrade_F2'
     'subGrade_F3' 'subGrade_F4' 'subGrade_F5' 'subGrade_G1' 'subGrade_G2'
     'subGrade_G3' 'subGrade_G4' 'subGrade_G5' 'homeOwnership_1'
     'homeOwnership_2' 'homeOwnership_3' 'homeOwnership_4' 'homeOwnership_5'
     'verificationStatus_1' 'verificationStatus_2' 'purpose_1' 'purpose_2'
     'purpose_3' 'purpose_4' 'purpose_5' 'purpose_6' 'purpose_7' 'purpose_8'
     'purpose_9' 'purpose_10' 'purpose_11' 'purpose_12' 'purpose_13'
     'regionCode_1' 'regionCode_2' 'regionCode_3' 'regionCode_4'
     'regionCode_5' 'regionCode_6' 'regionCode_7' 'regionCode_8'
     'regionCode_9' 'regionCode_10' 'regionCode_11' 'regionCode_12'
     'regionCode_13' 'regionCode_14' 'regionCode_15' 'regionCode_16'
     'regionCode_17' 'regionCode_18' 'regionCode_19' 'regionCode_20'
     'regionCode_21' 'regionCode_22' 'regionCode_23' 'regionCode_24'
     'regionCode_25' 'regionCode_26' 'regionCode_27' 'regionCode_28'
     'regionCode_29' 'regionCode_30' 'regionCode_31' 'regionCode_32'
     'regionCode_33' 'regionCode_34' 'regionCode_35' 'regionCode_36'
     'regionCode_37' 'regionCode_38' 'regionCode_39' 'regionCode_40'
     'regionCode_41' 'regionCode_42' 'regionCode_43' 'regionCode_44'
     'regionCode_45' 'regionCode_46' 'regionCode_47' 'regionCode_48'
     'regionCode_49' 'regionCode_50' 'employmentTitle_cnts'
     'employmentTitle_rank' 'postCode_cnts' 'postCode_rank' 'title_cnts'
     'title_rank']
    
  • Original post: https://www.cnblogs.com/zhazhaacmer/p/13740311.html