  • [Datawhale Session 17 | Financial Risk Control for Beginners: Loan Default Prediction] Task05: Model Fusion (3 days): stacking + a GPU-acceleration example

    Reference link

    https://www.zhihu.com/column/exuding (Zhihu column: financial risk control for beginners, loan default modeling)

    Acceleration uses the 16 GB GPU on Baidu AI Studio.
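
    The training code below actually runs on CPU ('tree_method': 'exact' for xgboost). A minimal sketch of the GPU switches, assuming GPU-enabled builds of xgboost and lightgbm as on the AI Studio GPU image; these are the libraries' standard GPU parameters, not settings taken from the original article:

    # Hypothetical GPU variants of the tree-booster parameters used later in this post.
    xgb_gpu_overrides = {
        'tree_method': 'gpu_hist',   # GPU histogram algorithm, replaces 'exact'
        'gpu_id': 0,                 # index of the GPU to use
    }
    lgb_gpu_overrides = {
        'device': 'gpu',             # LightGBM's OpenCL GPU backend
        'gpu_platform_id': 0,
        'gpu_device_id': 0,
    }
    # e.g. params.update(xgb_gpu_overrides) before calling xgb.train(...)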

    TODO: to be continued

    The model fusion code follows:

    # # If a persistent installation is required, use a persistent path, as in the following example:
    # # !mkdir /home/aistudio/external-libraries
    # # !pip install --upgrade pip -t /home/aistudio/external-libraries
    # # !pip install pandas -t /home/aistudio/external-libraries
    # # !pip install pandas_profiling 
    
    # !pip install lightgbm  -t /home/aistudio/external-libraries
    # !pip install xgboost  -t /home/aistudio/external-libraries
    # !pip install heamy  -t /home/aistudio/external-libraries
    import sys
    sys.path.append('/home/aistudio/external-libraries')
    
    
    
    ####  import
    import datetime
    import warnings
    
    import numpy as np
    import pandas as pd
    import xgboost as xgb
    import lightgbm as lgb
    from sklearn import metrics
    from sklearn.model_selection import train_test_split
    warnings.filterwarnings('ignore')
    import seaborn as sns
    import matplotlib.pyplot as plt
    
    # model fusion
    from heamy.dataset import Dataset
    from heamy.estimator import Classifier
    from sklearn.linear_model import LogisticRegression
    from heamy.pipeline import ModelsPipeline
    


    ####  Preliminary settings
    """
    Seaborn-related settings
    @return:
    """
    # Use the default Seaborn theme
    sns.set()
    # Seaborn has five styles: darkgrid, whitegrid, dark, white, ticks; darkgrid is the default.
    sns.set_style("whitegrid")
    # Four preset contexts, from smallest to largest: paper, notebook, talk, poster; notebook is the default.
    sns.set_context('talk')
    # Chinese font setting: SimHei
    plt.rcParams['font.sans-serif'] = ['SimHei']
    # Fix the minus sign '-' rendering as a box when saving figures
    plt.rcParams['axes.unicode_minus'] = False
    # Fix Chinese rendering in Seaborn; note that sns.set() here resets the style/context chosen above
    sns.set(font='SimHei')
    
    print('import done, sns & plt preset done ', datetime.datetime.now())
    
    ####  0.1 Common variables
    
    ali_file_path = './user_data/'
    linux_file_path = '/plus/阿里云开发者-天池比赛/02_零基础入门金融风控_贷款违约预测/'
    win_file_path = 'E:\\阿里云开发者-天池比赛\\02_零基础入门金融风控_贷款违约预测\\'
    baidu_file_path = 'data/data54049/'
    
    # env_name = 'win'
    # env_name = 'centos'
    env_name = 'baidu'
    
    now = datetime.datetime.now().strftime('%Y-%m-%d_%H_%M_%S')
    train_file_path = ''
    testA_file_path = ''
    out_path = './'
    if env_name == 'win':
        out_path = 'E:\\PycharmProjects\\TianChiProject\\00_山枫叶纷飞\\competitions\\002_financial_risk\\predict_result\\' + 'model_fusion_a_predict_{}.csv'.format(now)
        train_file_path = win_file_path + 'train.csv'
        testA_file_path = win_file_path + 'testA.csv'
    elif env_name == 'centos':
        # branch added for completeness: 'centos' is offered above but was left unhandled
        out_path = linux_file_path + 'model_fusion_a_predict_{}.csv'.format(now)
        train_file_path = linux_file_path + 'train.csv'
        testA_file_path = linux_file_path + 'testA.csv'
    elif env_name == 'baidu':
        out_path = 'work/predict_result/test_a_{}.csv'.format(now)
        train_file_path = baidu_file_path + 'train.csv'
        testA_file_path = baidu_file_path + 'testA.csv'
    
    # %% Data loading
    # reduce_memory_usage downcasts column dtypes to shrink the dataframe's memory footprint
    def reduce_memory_usage(df):
        """ iterate through all the columns of a dataframe and modify the data type
            to reduce memory usage.
        """
        start_mem = df.memory_usage().sum()  / 1024**2
        print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
        for col in df.columns:
            col_type = df[col].dtype
    
            if col_type != object:
                c_min = df[col].min()
                c_max = df[col].max()
                if str(col_type)[:3] == 'int':
                    if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif c_min >= np.iinfo(np.int64).min and c_max <= np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)
                else:
                    if c_min >= np.finfo(np.float16).min and c_max <= np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float64)
            else:
                df[col] = df[col].astype('category')
        end_mem = df.memory_usage().sum()  / 1024**2
        print('>>>>>>>>> Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('>>>>>>>>> Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))  # about a 75.6% reduction on this data
        return df
    
    train = pd.read_csv(train_file_path)
    testA = pd.read_csv(testA_file_path)
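
    # Note: reduce_memory_usage is defined above but never called in this listing.
    # Wiring it in is one line per frame; a sketch only, since the function casts
    # object columns to 'category', which can interfere with the string
    # preprocessing of employmentLength below:
    # train = reduce_memory_usage(train)
    # testA = reduce_memory_usage(testA)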
    
    train.head()
    
    data = pd.concat([train, testA], axis=0, ignore_index=True)
    
    ####  Data preview

    # - Many variables cannot be fed to training directly (e.g. grade, subGrade, employmentLength, issueDate, earliesCreditLine) and need preprocessing first
    
    print(sorted(data['grade'].unique()))
    print(sorted(data['subGrade'].unique()))
    # Expected output:
    #     ['A', 'B', 'C', 'D', 'E', 'F', 'G']
    #     ['A1', 'A2', 'A3', 'A4', 'A5', 'B1', 'B2', 'B3', 'B4', 'B5', 'C1', 'C2', 'C3', 'C4', 'C5', 'D1', 'D2', 'D3', 'D4', 'D5', 'E1', 'E2', 'E3', 'E4', 'E5', 'F1', 'F2', 'F3', 'F4', 'F5', 'G1', 'G2', 'G3', 'G4', 'G5']
    
    data['employmentLength'].value_counts(dropna=False).sort_index()
    
    
    ####  Data preprocessing

    # TODO: missing-value handling

    # First, convert employmentLength to a numeric value
    
    data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True)
    data['employmentLength'].replace('< 1 year', '0 years', inplace=True)
    
    def employmentLength_to_int(s):
        if pd.isnull(s):
            return s
        else:
            return np.int8(s.split()[0])
    
    data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)
    
    data['employmentLength'].value_counts(dropna=False).sort_index()
    
    
    # Preprocess earliesCreditLine
    
    
    data['earliesCreditLine'].sample(5)
    
    
    data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:]))  # keep the last four characters: the year
    data['earliesCreditLine'].describe()
    
    
    data.head()
    
    
    
    # Categorical feature processing
    
    
    # A subset of the categorical features
    cate_features = ['grade', 'subGrade', 'employmentTitle', 'homeOwnership', 'verificationStatus', 'purpose', 'postCode', 'regionCode', 
                     'applicationType', 'initialListStatus', 'title', 'policyCode']
    for f in cate_features:
        print(f, 'number of categories:', data[f].nunique())
    
    
    
    # More than two categories, and not high-dimensional sparse: one-hot encode
    data = pd.get_dummies(data, columns=['grade', 'subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode'], drop_first=True)
    
    
    # High-cardinality categorical features: replace each with a count encoding and a rank encoding
    for f in ['employmentTitle', 'postCode', 'title']:
        data[f+'_cnts'] = data.groupby([f])['id'].transform('count')
        data[f+'_rank'] = data.groupby([f])['id'].rank(ascending=False).astype(int)
        del data[f]
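
    # To make the count/rank encoding above concrete, a small illustrative
    # example on toy data (not from the competition):
    #   toy = pd.DataFrame({'id': [1, 2, 3, 4], 'postCode': ['A', 'A', 'B', 'A']})
    #   toy['postCode_cnts'] = toy.groupby(['postCode'])['id'].transform('count')
    #   toy['postCode_rank'] = toy.groupby(['postCode'])['id'].rank(ascending=False).astype(int)
    # gives:
    #      id postCode  postCode_cnts  postCode_rank
    #   0   1        A              3              3
    #   1   2        A              3              2
    #   2   3        B              1              1
    #   3   4        A              3              1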
    
    ####  Train/test data preparation
    
    
    features = [f for f in data.columns if f not in ['id','issueDate','isDefault']]
    
    train = data[data.isDefault.notnull()].reset_index(drop=True)
    test = data[data.isDefault.isnull()].reset_index(drop=True)
    
    x_train = train[features]
    x_test = test[features]
    y_train = train['isDefault']
    
    
    
    # Output: import done, sns & plt preset done  2020-09-27 18:28:02.821639
    
    ####  Model training
    ####  5.1 Build the models [parameters: xgb --> 鱼佬's baseline, lgb --> Bayesian tuning]
    
    def xgb_model(X_train, y_train, X_test, y_test=None):
        X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2)
        train_matrix = xgb.DMatrix(X_train_split , label=y_train_split)
        valid_matrix = xgb.DMatrix(X_val , label=y_val)
        test_matrix = xgb.DMatrix(X_test)
    
        params = {
            'booster': 'gbtree',
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
            'gamma': 1,
            'min_child_weight': 1.5,
            'max_depth': 5,
            'lambda': 10,
            'subsample': 0.7,
            'colsample_bytree': 0.7,
            'colsample_bylevel': 0.7,
            'eta': 0.04,
            'tree_method': 'exact',
            'seed': 2020,
            'n_jobs': -1,
            "silent": True,
        }
        watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]
    
        model = xgb.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=200, early_stopping_rounds=200)
        """Score on the validation set"""
        val_pred = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
        fpr, tpr, threshold = metrics.roc_curve(y_val, val_pred)
        roc_auc = metrics.auc(fpr, tpr)
        print('AUC of the tuned xgboost single model on the validation set: {}'.format(roc_auc))
        """Predict on the test set"""
        test_pred = model.predict(test_matrix, ntree_limit=model.best_ntree_limit)
    
        return test_pred
    
    
    def lgb_model(X_train, y_train, X_test, y_test=None):
        X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2)
        train_matrix = lgb.Dataset(X_train_split, label=y_train_split)
        valid_matrix = lgb.Dataset(X_val, label=y_val)
    
        # Optimal parameters after tuning
        params = {
            'boosting_type': 'gbdt',
            'objective': 'binary',
            'metric': 'auc',
            'learning_rate': 0.01,
            'min_child_weight': 0.32,
            'num_leaves': 14,
            'max_depth': 4,
            'feature_fraction': 0.81,
            'bagging_fraction': 0.61,
            'bagging_freq': 9,
            'min_data_in_leaf': 13,
            'min_split_gain': 0.27,
            'reg_alpha': 9.58,
            'reg_lambda': 4.62,
            'seed': 2020,
            'n_jobs':-1,
            'silent': True,
            'verbose': -1,
        }
    
        model = lgb.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=500, early_stopping_rounds=500)
        """Score on the validation set"""
        val_pred = model.predict(X_val, num_iteration=model.best_iteration)
        fpr, tpr, threshold = metrics.roc_curve(y_val, val_pred)
        roc_auc = metrics.auc(fpr, tpr)
        print('AUC of the tuned lightgbm single model on the validation set: {}'.format(roc_auc))
        """Predict on the test set"""
        test_pred = model.predict(X_test, num_iteration=model.best_iteration)
    
        return test_pred
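
    # The heading above notes the lgb parameters came from Bayesian tuning.
    # A minimal sketch of such a search with the bayes_opt package
    # (pip install bayesian-optimization); the search ranges below are
    # illustrative assumptions, not the ones the original author used.
    from sklearn.model_selection import cross_val_score

    def lgb_cv(num_leaves, max_depth, bagging_fraction, feature_fraction, min_data_in_leaf):
        """5-fold CV AUC for one candidate parameter set."""
        model = lgb.LGBMClassifier(
            boosting_type='gbdt', objective='binary', learning_rate=0.1, n_estimators=500,
            num_leaves=int(num_leaves), max_depth=int(max_depth),
            bagging_fraction=round(bagging_fraction, 2), feature_fraction=round(feature_fraction, 2),
            min_data_in_leaf=int(min_data_in_leaf), n_jobs=-1)
        return cross_val_score(model, x_train, y_train, cv=5, scoring='roc_auc').mean()

    # from bayes_opt import BayesianOptimization
    # optimizer = BayesianOptimization(lgb_cv, {
    #     'num_leaves': (10, 200),
    #     'max_depth': (3, 20),
    #     'bagging_fraction': (0.5, 1.0),
    #     'feature_fraction': (0.5, 1.0),
    #     'min_data_in_leaf': (10, 100),
    # })
    # optimizer.maximize(init_points=5, n_iter=25)  # 5 random probes, then 25 guided steps
    # print(optimizer.max)                          # best AUC and parameters found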
    
    ####  5.2 Model-level fusion
    
    #   x_train = train[features]
    #   x_test = test[features]
    #   y_train = train['isDefault']
    
    model_dataset = Dataset(X_train=x_train, y_train=y_train, X_test=x_test)
    model_xgb = Classifier(dataset=model_dataset, estimator=xgb_model, name='xgb', use_cache=False)
    model_lgb = Classifier(dataset=model_dataset, estimator=lgb_model, name='lgb', use_cache=False)
    
    ####  5.3 Fuse the models with stacking
    
    
    pipeline = ModelsPipeline(model_xgb, model_lgb)
    pipeline
    
    ####  5.4 Build the first-level features: k defaults to 5, i.e. 5-fold cross-validation; with full_test=True,
    #       each base learner is also retrained on the full training set and used to predict the test set, yielding the new features
    stack_ds = pipeline.stack(k=5, seed=111, full_test=True)
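
    # For readers without heamy: pipeline.stack builds out-of-fold (OOF) first-level
    # features. A hand-rolled sketch of the same idea (illustrative only; heamy's
    # exact fold handling may differ). The model functions defined above already
    # match the (X_tr, y_tr, X_pred) -> predictions shape this helper expects.
    from sklearn.model_selection import KFold

    def stack_features(model_fns, X, y, X_test, k=5, seed=111):
        kf = KFold(n_splits=k, shuffle=True, random_state=seed)
        oof = np.zeros((len(X), len(model_fns)))            # first-level train features
        test_feats = np.zeros((len(X_test), len(model_fns)))
        for j, fn in enumerate(model_fns):
            for tr_idx, val_idx in kf.split(X):
                # out-of-fold: predict each validation fold from a model trained on the rest
                oof[val_idx, j] = fn(X.iloc[tr_idx], y.iloc[tr_idx], X.iloc[val_idx])
            # full_test=True: refit on the full training set, then predict the test set
            test_feats[:, j] = fn(X, y, X_test)
        return oof, test_feats

    # oof, test_feats = stack_features([xgb_model, lgb_model], x_train, y_train, x_test)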
    
    ####  5.5 Second level: stacking with logistic regression
    
    # Second-level learner: logistic regression over the first-level (stacked) predictions
    stacker = Classifier(dataset=stack_ds, estimator=LogisticRegression, parameters={'solver': 'lbfgs'})
    # Predictions for the test set
    test_pred = stacker.predict()
    test_pred
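
    # Sanity check of the second level before exporting: heamy exposes a validate
    # helper for this (per heamy's docs); using roc_auc_score as the scorer is an
    # assumption of ours, not something shown in the original article.
    # from sklearn.metrics import roc_auc_score
    # results = stacker.validate(k=5, scorer=roc_auc_score)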
    
    ####  Build the submission-format DataFrame and export to CSV
    test['isDefault'] = test_pred
    
    print('out_path: ', out_path)
    test[['id','isDefault']].to_csv(out_path, index=False)
    
    print('CSV export done', datetime.datetime.now())
    

    Output notes

    The xgb and lgb parameter num_boost_round was lowered to 5000 (the original article recommends 50000),

    which caused the training results to drop sharply. (With early stopping in place, the lower cap only matters when the best iteration would exceed 5000; at learning rates of 0.01 to 0.04 these models can still be improving past that point.)

    Each base learner is trained on the full training set, then used to predict the test set to obtain the new features:

    stack_ds = pipeline.stack(k=5, seed=233, full_test=True)

    Output:

    • AUC of the tuned xgboost single model on the validation set: 0.7314319707309533

    • AUC of the tuned xgboost single model on the validation set: 0.7328692352003359

    • AUC of the tuned xgboost single model on the validation set: 0.7348644657792799

    • AUC of the tuned xgboost single model on the validation set: 0.7292134605693141

    • AUC of the tuned xgboost single model on the validation set: 0.7273300682262768

    • AUC of the tuned xgboost single model on the validation set: 0.7315962351115055

    • AUC of the tuned lightgbm single model on the validation set: 0.7301180187432554

    • AUC of the tuned lightgbm single model on the validation set: 0.7276946066179788

    • AUC of the tuned lightgbm single model on the validation set: 0.7301011073663606

    • AUC of the tuned lightgbm single model on the validation set: 0.731260767784271

    • AUC of the tuned lightgbm single model on the validation set: 0.7274780648597519

    • AUC of the tuned lightgbm single model on the validation set: 0.7275650245325219

    Print all column names after fusion

    print('After fusion, all columns of x_train: ', x_train.columns)
    print('After fusion, all columns of test: ', test.columns)
    print('After fusion, test columns as np.array: ', np.array(test.columns))

    After fusion, all columns of x_train:  Index(['loanAmnt', 'term', 'interestRate', 'installment', 'employmentLength',
           'annualIncome', 'dti', 'delinquency_2years', 'ficoRangeLow',
           'ficoRangeHigh',
           ...
           'regionCode_47', 'regionCode_48', 'regionCode_49', 'regionCode_50',
           'employmentTitle_cnts', 'employmentTitle_rank', 'postCode_cnts',
           'postCode_rank', 'title_cnts', 'title_rank'],
          dtype='object', length=153)
    After fusion, all columns of test:  Index(['id', 'loanAmnt', 'term', 'interestRate', 'installment',
           'employmentLength', 'annualIncome', 'issueDate', 'isDefault', 'dti',
           ...
           'regionCode_47', 'regionCode_48', 'regionCode_49', 'regionCode_50',
           'employmentTitle_cnts', 'employmentTitle_rank', 'postCode_cnts',
           'postCode_rank', 'title_cnts', 'title_rank'],
          dtype='object', length=156)
    After fusion, test columns as np.array:  ['id' 'loanAmnt' 'term' 'interestRate' 'installment' 'employmentLength'
     'annualIncome' 'issueDate' 'isDefault' 'dti' 'delinquency_2years'
     'ficoRangeLow' 'ficoRangeHigh' 'openAcc' 'pubRec' 'pubRecBankruptcies'
     'revolBal' 'revolUtil' 'totalAcc' 'initialListStatus' 'applicationType'
     'earliesCreditLine' 'policyCode' 'n0' 'n1' 'n2' 'n2.1' 'n4' 'n5' 'n6'
     'n7' 'n8' 'n9' 'n10' 'n11' 'n12' 'n13' 'n14' 'n2.2' 'n2.3' 'grade_B'
     'grade_C' 'grade_D' 'grade_E' 'grade_F' 'grade_G' 'subGrade_A2'
     'subGrade_A3' 'subGrade_A4' 'subGrade_A5' 'subGrade_B1' 'subGrade_B2'
     'subGrade_B3' 'subGrade_B4' 'subGrade_B5' 'subGrade_C1' 'subGrade_C2'
     'subGrade_C3' 'subGrade_C4' 'subGrade_C5' 'subGrade_D1' 'subGrade_D2'
     'subGrade_D3' 'subGrade_D4' 'subGrade_D5' 'subGrade_E1' 'subGrade_E2'
     'subGrade_E3' 'subGrade_E4' 'subGrade_E5' 'subGrade_F1' 'subGrade_F2'
     'subGrade_F3' 'subGrade_F4' 'subGrade_F5' 'subGrade_G1' 'subGrade_G2'
     'subGrade_G3' 'subGrade_G4' 'subGrade_G5' 'homeOwnership_1'
     'homeOwnership_2' 'homeOwnership_3' 'homeOwnership_4' 'homeOwnership_5'
     'verificationStatus_1' 'verificationStatus_2' 'purpose_1' 'purpose_2'
     'purpose_3' 'purpose_4' 'purpose_5' 'purpose_6' 'purpose_7' 'purpose_8'
     'purpose_9' 'purpose_10' 'purpose_11' 'purpose_12' 'purpose_13'
     'regionCode_1' 'regionCode_2' 'regionCode_3' 'regionCode_4'
     'regionCode_5' 'regionCode_6' 'regionCode_7' 'regionCode_8'
     'regionCode_9' 'regionCode_10' 'regionCode_11' 'regionCode_12'
     'regionCode_13' 'regionCode_14' 'regionCode_15' 'regionCode_16'
     'regionCode_17' 'regionCode_18' 'regionCode_19' 'regionCode_20'
     'regionCode_21' 'regionCode_22' 'regionCode_23' 'regionCode_24'
     'regionCode_25' 'regionCode_26' 'regionCode_27' 'regionCode_28'
     'regionCode_29' 'regionCode_30' 'regionCode_31' 'regionCode_32'
     'regionCode_33' 'regionCode_34' 'regionCode_35' 'regionCode_36'
     'regionCode_37' 'regionCode_38' 'regionCode_39' 'regionCode_40'
     'regionCode_41' 'regionCode_42' 'regionCode_43' 'regionCode_44'
     'regionCode_45' 'regionCode_46' 'regionCode_47' 'regionCode_48'
     'regionCode_49' 'regionCode_50' 'employmentTitle_cnts'
     'employmentTitle_rank' 'postCode_cnts' 'postCode_rank' 'title_cnts'
     'title_rank']
    
  • Original post: https://www.cnblogs.com/zhazhaacmer/p/13740311.html