zoukankan      html  css  js  c++  java
  • 决策树规则挖掘

    import pandas as pd
    import numpy as np
    import os
    
    # Load the raw dataset — one row per transaction record; presumably
    # fuel-card / oil-loan behavior data keyed by uid (verify with source).
    data = pd.read_excel('oil_data_for_tree.xlsx')
    data.head()
    
    uid oil_actv_dt create_dt total_oil_cnt pay_amount_total class_new bad_ind oil_amount discount_amount sale_amount amount pay_amount coupon_amount payment_coupon_amount channel_code oil_code scene source_app call_source
    0 A8217710 2018-08-19 2018-08-17 137.0 24147747.2 B 0 1653.78 880040.0 898000.0 865540.0 4327700.0 0.0 100.0 1 3 2 0 3
    1 A8217710 2018-08-19 2018-08-16 137.0 24147747.2 B 0 2336.84 1243522.0 1268900.0 1218922.0 6094610.0 0.0 100.0 1 3 2 0 3
    2 A8217710 2018-08-19 2018-08-15 137.0 24147747.2 B 0 936.03 488922.0 498900.0 480922.0 2404610.0 0.0 200.0 1 2 2 0 3
    3 A8217710 2018-08-19 2018-08-14 137.0 24147747.2 B 0 2418.39 1263220.0 1289000.0 1242220.0 6211100.0 0.0 300.0 1 2 2 0 3
    4 A8217710 2018-08-19 2018-08-13 137.0 24147747.2 B 0 1292.69 675220.0 689000.0 664220.0 3321100.0 0.0 100.0 1 2 2 0 3

    uid
    oil_actv_dt
    create_dt
    total_oil_cnt
    pay_amount_total
    class_new
    bad_ind oil_amount
    discount_amount
    sale_amount
    amount
    pay_amount
    coupon_amount
    payment_coupon_amount
    channel_code
    oil_code
    scene
    source_app
    call_source

    set(data.class_new)
    
    {'A', 'B', 'C', 'D', 'E', 'F'}
    

    org_lst 时间类别型变量,不需要做特殊变换,直接去重
    agg_lst 数值型变量做聚合
    dstc_lst 文本型变量做cnt

    # Column groups for feature engineering:
    #   org_lst  - id / date / label columns: kept as-is, deduplicated later
    #   agg_lst  - numeric columns: aggregated per uid (count/sum/mean/...)
    #   dstc_lst - categorical columns: distinct-value counts per uid
    org_lst = ['uid', 'create_dt', 'oil_actv_dt', 'class_new', 'bad_ind']
    agg_lst = ['oil_amount', 'discount_amount', 'sale_amount', 'amount', 'pay_amount', 'coupon_amount', 'payment_coupon_amount']
    dstc_lst = ['channel_code', 'oil_code', 'scene', 'source_app', 'call_source']
    

    数据重组

    df = data[org_lst].copy()
    df.head()
    
    uid create_dt oil_actv_dt class_new bad_ind
    0 A8217710 2018-08-17 2018-08-19 B 0
    1 A8217710 2018-08-16 2018-08-19 B 0
    2 A8217710 2018-08-15 2018-08-19 B 0
    3 A8217710 2018-08-14 2018-08-19 B 0
    4 A8217710 2018-08-13 2018-08-19 B 0
    df[agg_lst] = data[agg_lst].copy()
    df[agg_lst].head()
    
    oil_amount discount_amount sale_amount amount pay_amount coupon_amount payment_coupon_amount
    0 1653.78 880040.0 898000.0 865540.0 4327700.0 0.0 100.0
    1 2336.84 1243522.0 1268900.0 1218922.0 6094610.0 0.0 100.0
    2 936.03 488922.0 498900.0 480922.0 2404610.0 0.0 200.0
    3 2418.39 1263220.0 1289000.0 1242220.0 6211100.0 0.0 300.0
    4 1292.69 675220.0 689000.0 664220.0 3321100.0 0.0 100.0
    df[dstc_lst] = data[dstc_lst].copy()
    df[dstc_lst].head()
    
    channel_code oil_code scene source_app call_source
    0 1 3 2 0 3
    1 1 3 2 0 3
    2 1 2 2 0 3
    3 1 2 2 0 3
    4 1 2 2 0 3

    查看缺失情况

    df.isna().sum()
    
    uid                         0
    create_dt                4944
    oil_actv_dt                 0
    class_new                   0
    bad_ind                     0
    oil_amount               4944
    discount_amount          4944
    sale_amount              4944
    amount                   4944
    pay_amount               4944
    coupon_amount            4944
    payment_coupon_amount    4946
    channel_code                0
    oil_code                    0
    scene                       0
    source_app                  0
    call_source                 0
    dtype: int64
    

    查看基础变量的describe()

    df.describe()
    
    bad_ind oil_amount discount_amount sale_amount amount pay_amount coupon_amount payment_coupon_amount channel_code oil_code scene source_app call_source
    count 50609.000000 45665.000000 4.566500e+04 4.566500e+04 4.566500e+04 4.566500e+04 45665.0 45663.000000 50609.000000 50609.000000 50609.000000 50609.000000 50609.000000
    mean 0.017764 212.188054 1.091035e+05 1.121195e+05 1.077312e+05 5.386562e+05 0.0 417.055384 1.476378 1.617894 1.906519 0.306072 2.900729
    std 0.132093 200.298122 1.010993e+05 1.031804e+05 9.953775e+04 4.976888e+05 0.0 968.250273 1.511470 3.074166 0.367280 0.893682 0.726231
    min 0.000000 0.000000 0.000000e+00 0.000000e+00 1.000000e+00 5.000000e+00 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
    25% 0.000000 87.220000 4.854000e+04 5.000000e+04 4.820000e+04 2.410000e+05 0.0 0.000000 1.000000 0.000000 2.000000 0.000000 3.000000
    50% 0.000000 167.580000 8.820000e+04 9.000000e+04 8.709600e+04 4.354800e+05 0.0 100.000000 1.000000 0.000000 2.000000 0.000000 3.000000
    75% 0.000000 278.300000 1.391600e+05 1.430000e+05 1.371150e+05 6.855750e+05 0.0 500.000000 1.000000 0.000000 2.000000 0.000000 3.000000
    max 1.000000 3975.910000 1.958040e+06 1.998000e+06 1.925540e+06 9.627700e+06 0.0 50000.000000 6.000000 9.000000 2.000000 3.000000 4.000000

    对create_dt做补全,用oil_actv_dt来填补,并且截取6个月的数据。
    构造变量的时候不能直接对历史所有数据做累加。
    否则随着时间推移,变量分布会有很大的变化。

    def time_isna(x, y):
        """Fill a missing create_dt value.

        Returns ``y`` (the oil_actv_dt fallback) when ``x`` prints as a
        pandas ``NaT``; otherwise returns ``x`` unchanged.
        """
        return y if str(x) == 'NaT' else x
    
    df2 = df.sort_values(['uid','create_dt'], ascending = False)
    df2.head()
    
    uid create_dt oil_actv_dt class_new bad_ind oil_amount discount_amount sale_amount amount pay_amount coupon_amount payment_coupon_amount channel_code oil_code scene source_app call_source
    50608 B96436391985035703 NaT 2018-10-08 B 0 NaN NaN NaN NaN NaN NaN NaN 6 9 2 3 4
    50607 B96436391984693397 NaT 2018-10-11 E 0 NaN NaN NaN NaN NaN NaN NaN 6 9 2 3 4
    50606 B96436391977217468 NaT 2018-10-17 B 0 NaN NaN NaN NaN NaN NaN NaN 6 9 2 3 4
    50605 B96436391976480892 NaT 2018-09-28 B 0 NaN NaN NaN NaN NaN NaN NaN 6 9 2 3 4
    50604 B96436391972106043 NaT 2018-10-19 A 0 NaN NaN NaN NaN NaN NaN NaN 6 9 2 3 4
    # Fill missing create_dt with oil_actv_dt.  Vectorized fillna is
    # equivalent to the row-wise apply of time_isna (NaT is the only
    # missing value in a datetime column) but runs at C speed.
    df2['create_dt'] = df2['create_dt'].fillna(df2['oil_actv_dt'])
    df2['create_dt'].head()
    
    50608   2018-10-08
    50607   2018-10-11
    50606   2018-10-17
    50605   2018-09-28
    50604   2018-10-19
    Name: create_dt, dtype: datetime64[ns]
    
    # Days between activation date and record date; ``.dt.days`` is the
    # vectorized form of the per-row ``.apply(lambda x: x.days)``.
    df2['dtn'] = (df2.oil_actv_dt - df2.create_dt).dt.days
    df2['dtn']
    
    50608      0
    50607      0
    50606      0
    50605      0
    50604      0
    50603      0
    50602      0
    50601      0
    50600      0
    50599      0
    50598      0
    50597      0
    50596      0
    50595      0
    50594      0
    50593      0
    50592      0
    50591      0
    50590      0
    50589      0
    50588      0
    50587      0
    50586      0
    50585      0
    50584      0
    50583      0
    50582      0
    50581      0
    50580      0
    50579      0
            ... 
    12860    255
    12865    275
    46427      0
    35426      2
    35425      4
    35427      7
    35419      9
    35418     15
    35414     19
    35420     38
    35421     39
    35428     41
    35417     42
    35415     44
    35424     45
    35423     46
    35422     48
    35416     49
    40437    183
    40440    186
    40442    188
    40438    189
    40436    192
    40441    193
    40439    194
    46426      0
    41434     21
    41433     36
    41432     69
    45296     28
    Name: dtn, Length: 50609, dtype: int64
    
    # Keep only records within 6 months (180 days) of the activation date,
    # so derived features stay stable as time passes.
    df = df2[df2['dtn'] < 180]
    df.head()
    
    uid create_dt oil_actv_dt class_new bad_ind oil_amount discount_amount sale_amount amount pay_amount coupon_amount payment_coupon_amount channel_code oil_code scene source_app call_source dtn
    50608 B96436391985035703 2018-10-08 2018-10-08 B 0 NaN NaN NaN NaN NaN NaN NaN 6 9 2 3 4 0
    50607 B96436391984693397 2018-10-11 2018-10-11 E 0 NaN NaN NaN NaN NaN NaN NaN 6 9 2 3 4 0
    50606 B96436391977217468 2018-10-17 2018-10-17 B 0 NaN NaN NaN NaN NaN NaN NaN 6 9 2 3 4 0
    50605 B96436391976480892 2018-09-28 2018-09-28 B 0 NaN NaN NaN NaN NaN NaN NaN 6 9 2 3 4 0
    50604 B96436391972106043 2018-10-19 2018-10-19 A 0 NaN NaN NaN NaN NaN NaN NaN 6 9 2 3 4 0

    对org_lst变量求历史贷款天数的最大间隔,并且去重

    # Base table: identifier, dates and label columns, one row per record.
    base = df[org_lst]
    base.head()
    
    uid create_dt oil_actv_dt class_new bad_ind
    50608 B96436391985035703 2018-10-08 2018-10-08 B 0
    50607 B96436391984693397 2018-10-11 2018-10-11 E 0
    50606 B96436391977217468 2018-10-17 2018-10-17 B 0
    50605 B96436391976480892 2018-09-28 2018-09-28 B 0
    50604 B96436391972106043 2018-10-19 2018-10-19 A 0
    # ``df[org_lst]`` is a slice of df; assigning into it raises pandas'
    # SettingWithCopyWarning.  Take an explicit copy first so the
    # assignment targets an independent frame.
    base = base.copy()
    base['dtn'] = df['dtn']
    base = base.sort_values(['uid', 'create_dt'], ascending = False)
    # Deduplicate: after the descending sort, the first row per uid is
    # the one with the largest create_dt (and hence the max dtn history).
    base = base.drop_duplicates(['uid'], keep = 'first')
    base.shape
    
    E:\Anaconda3\envs\sklearn\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning: 
    A value is trying to be set on a copy of a slice from a DataFrame.
    Try using .loc[row_indexer,col_indexer] = value instead
    
    See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
      """Entry point for launching an IPython kernel.
    
    
    
    
    
    (11099, 6)
    

    变量衍生(可以尝试featuretools)

    # Per-uid aggregate features for every numeric column in agg_lst.
    #
    # BUG FIX: the original assigned the same name ``i + '_var'`` to three
    # different statistics (variance, max-min range, and mean/variance
    # ratio).  The duplicate names forced pandas.merge to emit the
    # ``_var_x`` / ``_var_y`` suffixed columns seen in gn.columns.  Each
    # statistic now gets a distinct suffix (_var, _ran, _avg_var).
    gn = pd.DataFrame()
    # (suffix, aggregation) pairs, applied in the original order.
    _stats = [
        ('_cnt', lambda s: len(s)),                                    # record count
        ('_num', lambda s: np.where(s > 0, 1, 0).sum()),               # count of positive values
        ('_tot', lambda s: np.nansum(s)),                              # NaN-safe sum
        ('_avg', lambda s: np.nanmean(s)),                             # NaN-safe mean
        ('_max', lambda s: np.nanmax(s)),                              # NaN-safe max
        ('_min', lambda s: np.nanmin(s)),                              # NaN-safe min
        ('_var', lambda s: np.nanvar(s)),                              # NaN-safe variance
        ('_ran', lambda s: np.nanmax(s) - np.nanmin(s)),               # range (was mislabelled '_var')
        ('_avg_var', lambda s: np.nanmean(s) / max(np.nanvar(s), 1)),  # mean/variance ratio (was mislabelled '_var')
    ]
    for i in agg_lst:
        for _suffix, _agg in _stats:
            tp = df.groupby('uid')[i].apply(_agg).reset_index()
            tp.columns = ['uid', i + _suffix]
            gn = tp if gn.empty else pd.merge(gn, tp, on = 'uid', how = 'left')
    
    E:Anaconda3envssklearnlibsite-packagesipykernel_launcher.py:24: RuntimeWarning: Mean of empty slice
    E:Anaconda3envssklearnlibsite-packagesipykernel_launcher.py:31: RuntimeWarning: All-NaN axis encountered
    E:Anaconda3envssklearnlibsite-packagesipykernel_launcher.py:38: RuntimeWarning: All-NaN axis encountered
    E:Anaconda3envssklearnlibsite-packagesipykernel_launcher.py:45: RuntimeWarning: Degrees of freedom <= 0 for slice.
    E:Anaconda3envssklearnlibsite-packagesipykernel_launcher.py:52: RuntimeWarning: All-NaN axis encountered
    E:Anaconda3envssklearnlibsite-packagesipykernel_launcher.py:59: RuntimeWarning: Mean of empty slice
    E:Anaconda3envssklearnlibsite-packagesipykernel_launcher.py:59: RuntimeWarning: Degrees of freedom <= 0 for slice.
    
    gn.head()
    
    uid oil_amount_cnt oil_amount_num oil_amount_tot oil_amount_avg oil_amount_max oil_amount_min oil_amount_var_x oil_amount_var_y oil_amount_var ... coupon_amount_var payment_coupon_amount_cnt payment_coupon_amount_num payment_coupon_amount_tot payment_coupon_amount_avg payment_coupon_amount_max payment_coupon_amount_min payment_coupon_amount_var_x payment_coupon_amount_var_y payment_coupon_amount_var
    0 A10000481 1 1 94.76 94.760000 94.76 94.76 0.000000 0.00 94.760000 ... 0.0 1 1 200.0 200.000000 200.0 200.0 0.000000 0.0 200.000000
    1 A1000232 3 3 372.82 124.273333 138.46 109.55 139.442022 28.91 0.891219 ... 0.0 3 2 300.0 100.000000 200.0 0.0 6666.666667 200.0 0.015000
    2 A1000455 1 0 0.00 NaN NaN NaN NaN NaN NaN ... NaN 1 0 0.0 NaN NaN NaN NaN NaN NaN
    3 A10010042 15 15 742.05 49.470000 76.47 31.62 109.291800 44.85 0.452641 ... 0.0 15 12 1900.0 126.666667 500.0 0.0 13955.555556 500.0 0.009076
    4 A1001014 1 0 0.00 NaN NaN NaN NaN NaN NaN ... NaN 1 0 0.0 NaN NaN NaN NaN NaN NaN

    5 rows × 64 columns

    gn.columns
    
    Index(['uid', 'oil_amount_cnt', 'oil_amount_num', 'oil_amount_tot',
           'oil_amount_avg', 'oil_amount_max', 'oil_amount_min',
           'oil_amount_var_x', 'oil_amount_var_y', 'oil_amount_var',
           'discount_amount_cnt', 'discount_amount_num', 'discount_amount_tot',
           'discount_amount_avg', 'discount_amount_max', 'discount_amount_min',
           'discount_amount_var_x', 'discount_amount_var_y', 'discount_amount_var',
           'sale_amount_cnt', 'sale_amount_num', 'sale_amount_tot',
           'sale_amount_avg', 'sale_amount_max', 'sale_amount_min',
           'sale_amount_var_x', 'sale_amount_var_y', 'sale_amount_var',
           'amount_cnt', 'amount_num', 'amount_tot', 'amount_avg', 'amount_max',
           'amount_min', 'amount_var_x', 'amount_var_y', 'amount_var',
           'pay_amount_cnt', 'pay_amount_num', 'pay_amount_tot', 'pay_amount_avg',
           'pay_amount_max', 'pay_amount_min', 'pay_amount_var_x',
           'pay_amount_var_y', 'pay_amount_var', 'coupon_amount_cnt',
           'coupon_amount_num', 'coupon_amount_tot', 'coupon_amount_avg',
           'coupon_amount_max', 'coupon_amount_min', 'coupon_amount_var_x',
           'coupon_amount_var_y', 'coupon_amount_var', 'payment_coupon_amount_cnt',
           'payment_coupon_amount_num', 'payment_coupon_amount_tot',
           'payment_coupon_amount_avg', 'payment_coupon_amount_max',
           'payment_coupon_amount_min', 'payment_coupon_amount_var_x',
           'payment_coupon_amount_var_y', 'payment_coupon_amount_var'],
          dtype='object')
    

    对dstc_lst变量求distinct个数

    # 去重计数
    # Number of distinct values of each categorical column, per uid.
    gc = pd.DataFrame()
    for col in dstc_lst:
        distinct = df.groupby('uid').apply(lambda g: len(set(g[col]))).reset_index()
        distinct.columns = ['uid', col + '_dstc']
        if gc.empty:
            gc = distinct
        else:
            gc = pd.merge(gc, distinct, on = 'uid', how = 'left')
    
    gc.head()
    
    uid channel_code_dstc oil_code_dstc scene_dstc source_app_dstc call_source_dstc
    0 A10000481 1 1 1 1 1
    1 A1000232 1 1 1 1 1
    2 A1000455 1 1 1 1 1
    3 A10010042 1 1 1 1 1
    4 A1001014 1 1 1 1 1

    将变量组合在一起

    # Assemble the modelling table: base labels + numeric aggregates
    # + categorical distinct counts, joined on uid.
    fn = pd.merge(base,gn,on= 'uid')
    fn = pd.merge(fn,gc,on= 'uid') 
    fn.shape
    
    (45039, 73)
    
    fn = fn.fillna(0)
    
    fn.head()
    
    uid create_dt oil_actv_dt class_new bad_ind oil_amount_cnt oil_amount_num oil_amount_tot oil_amount_avg oil_amount_max ... payment_coupon_amount_max payment_coupon_amount_min payment_coupon_amount_var_x payment_coupon_amount_var_y payment_coupon_amount_var channel_code_dstc oil_code_dstc scene_dstc source_app_dstc call_source_dstc
    0 B96436391985035703 2018-10-08 2018-10-08 B 0 1 0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1 1 1 1 1
    1 B96436391984693397 2018-10-11 2018-10-11 E 0 1 0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1 1 1 1 1
    2 B96436391977217468 2018-10-17 2018-10-17 B 0 1 0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1 1 1 1 1
    3 B96436391976480892 2018-09-28 2018-09-28 B 0 1 0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1 1 1 1 1
    4 B96436391972106043 2018-10-19 2018-10-19 A 0 1 0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1 1 1 1 1

    5 rows × 73 columns

    训练决策树模型

    # Features: drop identifier, date and label columns.
    x = fn.drop(['uid','oil_actv_dt','create_dt','bad_ind','class_new'],axis = 1)
    # Label: the bad-loan indicator.
    y = fn.bad_ind.copy()
    
    from sklearn import tree
    # Shallow regression tree on a 0/1 target: each leaf's predicted mean
    # is that segment's bad rate, so the two levels of splits read
    # directly as risk rules.
    dtree = tree.DecisionTreeRegressor(max_depth=2, min_samples_leaf=500, min_samples_split=5000)
    dtree = dtree.fit(x,y)
    

    输出决策树图像,并作出决策

    import pydotplus 
    from IPython.display import Image
    # sklearn.externals.six was removed in scikit-learn 0.23; the
    # standard-library io.StringIO is a drop-in replacement here.
    from io import StringIO
    
    # Plain-text Graphviz dump of the fitted tree, for offline inspection.
    with open("dt.dot", "w") as f:
        tree.export_graphviz(dtree, out_file=f)
    
    dot_data = StringIO()
    
    # Render the tree to PNG for inline notebook display.
    tree.export_graphviz(dtree, out_file=dot_data,
                             feature_names=x.columns,
                             class_names=['bad_ind'],
                             filled=True, rounded=True,
                             special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) 
    Image(graph.create_png())
    

    value = badrate

    sum(fn.bad_ind)/len(fn.bad_ind)
    
    0.04658077304261645
  • 相关阅读:
    Map的迭代操作
    Vector/Arraylist与Linklist的区别
    Notepad++ 快捷键 大全
    集成 Tomcat 插件到 Eclipse 的过程
    十个最好的Java性能故障排除工具
    eclipse删除空行
    java解析xml文件四种方式
    Java解析XML文档(简单实例)——dom解析xml
    Log4j 2.0 使用说明
    根据引用jar包路径查找原JAR包
  • 原文地址:https://www.cnblogs.com/chenxiangzhen/p/10902144.html
Copyright © 2011-2022 走看看