import pandas as pd
import numpy as np
import os
data = pd.read_excel('oil_data_for_tree.xlsx')
data.head()
  | uid | oil_actv_dt | create_dt | total_oil_cnt | pay_amount_total | class_new | bad_ind | oil_amount | discount_amount | sale_amount | amount | pay_amount | coupon_amount | payment_coupon_amount | channel_code | oil_code | scene | source_app | call_source
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | A8217710 | 2018-08-19 | 2018-08-17 | 137.0 | 24147747.2 | B | 0 | 1653.78 | 880040.0 | 898000.0 | 865540.0 | 4327700.0 | 0.0 | 100.0 | 1 | 3 | 2 | 0 | 3 |
1 | A8217710 | 2018-08-19 | 2018-08-16 | 137.0 | 24147747.2 | B | 0 | 2336.84 | 1243522.0 | 1268900.0 | 1218922.0 | 6094610.0 | 0.0 | 100.0 | 1 | 3 | 2 | 0 | 3 |
2 | A8217710 | 2018-08-19 | 2018-08-15 | 137.0 | 24147747.2 | B | 0 | 936.03 | 488922.0 | 498900.0 | 480922.0 | 2404610.0 | 0.0 | 200.0 | 1 | 2 | 2 | 0 | 3 |
3 | A8217710 | 2018-08-19 | 2018-08-14 | 137.0 | 24147747.2 | B | 0 | 2418.39 | 1263220.0 | 1289000.0 | 1242220.0 | 6211100.0 | 0.0 | 300.0 | 1 | 2 | 2 | 0 | 3 |
4 | A8217710 | 2018-08-19 | 2018-08-13 | 137.0 | 24147747.2 | B | 0 | 1292.69 | 675220.0 | 689000.0 | 664220.0 | 3321100.0 | 0.0 | 100.0 | 1 | 2 | 2 | 0 | 3 |
set(data.class_new)
{'A', 'B', 'C', 'D', 'E', 'F'}
org_lst: time and categorical variables; they need no special transformation, just deduplication
agg_lst: numeric variables, to be aggregated per user
dstc_lst: categorical-coded variables, to be distinct-counted per user
org_lst = ['uid', 'create_dt', 'oil_actv_dt', 'class_new', 'bad_ind']
agg_lst = ['oil_amount', 'discount_amount', 'sale_amount', 'amount', 'pay_amount', 'coupon_amount', 'payment_coupon_amount']
dstc_lst = ['channel_code', 'oil_code', 'scene', 'source_app', 'call_source']
Reorganize the data
df = data[org_lst].copy()
df.head()
  | uid | create_dt | oil_actv_dt | class_new | bad_ind
---|---|---|---|---|---|
0 | A8217710 | 2018-08-17 | 2018-08-19 | B | 0 |
1 | A8217710 | 2018-08-16 | 2018-08-19 | B | 0 |
2 | A8217710 | 2018-08-15 | 2018-08-19 | B | 0 |
3 | A8217710 | 2018-08-14 | 2018-08-19 | B | 0 |
4 | A8217710 | 2018-08-13 | 2018-08-19 | B | 0 |
df[agg_lst] = data[agg_lst].copy()
df[agg_lst].head()
  | oil_amount | discount_amount | sale_amount | amount | pay_amount | coupon_amount | payment_coupon_amount
---|---|---|---|---|---|---|---|
0 | 1653.78 | 880040.0 | 898000.0 | 865540.0 | 4327700.0 | 0.0 | 100.0 |
1 | 2336.84 | 1243522.0 | 1268900.0 | 1218922.0 | 6094610.0 | 0.0 | 100.0 |
2 | 936.03 | 488922.0 | 498900.0 | 480922.0 | 2404610.0 | 0.0 | 200.0 |
3 | 2418.39 | 1263220.0 | 1289000.0 | 1242220.0 | 6211100.0 | 0.0 | 300.0 |
4 | 1292.69 | 675220.0 | 689000.0 | 664220.0 | 3321100.0 | 0.0 | 100.0 |
df[dstc_lst] = data[dstc_lst].copy()
df[dstc_lst].head()
  | channel_code | oil_code | scene | source_app | call_source
---|---|---|---|---|---|
0 | 1 | 3 | 2 | 0 | 3 |
1 | 1 | 3 | 2 | 0 | 3 |
2 | 1 | 2 | 2 | 0 | 3 |
3 | 1 | 2 | 2 | 0 | 3 |
4 | 1 | 2 | 2 | 0 | 3 |
Check missing values
df.isna().sum()
uid 0
create_dt 4944
oil_actv_dt 0
class_new 0
bad_ind 0
oil_amount 4944
discount_amount 4944
sale_amount 4944
amount 4944
pay_amount 4944
coupon_amount 4944
payment_coupon_amount 4946
channel_code 0
oil_code 0
scene 0
source_app 0
call_source 0
dtype: int64
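The identical missing counts (4944) suggest that create_dt and the amount fields are missing on the same rows: users who have an application record but no transaction records. A quick check of that assumption:
# are the aggregate fields also NaN wherever create_dt is NaN?
df[df['create_dt'].isna()][agg_lst].isna().all()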
Look at describe() for the base variables
df.describe()
  | bad_ind | oil_amount | discount_amount | sale_amount | amount | pay_amount | coupon_amount | payment_coupon_amount | channel_code | oil_code | scene | source_app | call_source
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 50609.000000 | 45665.000000 | 4.566500e+04 | 4.566500e+04 | 4.566500e+04 | 4.566500e+04 | 45665.0 | 45663.000000 | 50609.000000 | 50609.000000 | 50609.000000 | 50609.000000 | 50609.000000 |
mean | 0.017764 | 212.188054 | 1.091035e+05 | 1.121195e+05 | 1.077312e+05 | 5.386562e+05 | 0.0 | 417.055384 | 1.476378 | 1.617894 | 1.906519 | 0.306072 | 2.900729 |
std | 0.132093 | 200.298122 | 1.010993e+05 | 1.031804e+05 | 9.953775e+04 | 4.976888e+05 | 0.0 | 968.250273 | 1.511470 | 3.074166 | 0.367280 | 0.893682 | 0.726231 |
min | 0.000000 | 0.000000 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 | 5.000000e+00 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 0.000000 | 87.220000 | 4.854000e+04 | 5.000000e+04 | 4.820000e+04 | 2.410000e+05 | 0.0 | 0.000000 | 1.000000 | 0.000000 | 2.000000 | 0.000000 | 3.000000 |
50% | 0.000000 | 167.580000 | 8.820000e+04 | 9.000000e+04 | 8.709600e+04 | 4.354800e+05 | 0.0 | 100.000000 | 1.000000 | 0.000000 | 2.000000 | 0.000000 | 3.000000 |
75% | 0.000000 | 278.300000 | 1.391600e+05 | 1.430000e+05 | 1.371150e+05 | 6.855750e+05 | 0.0 | 500.000000 | 1.000000 | 0.000000 | 2.000000 | 0.000000 | 3.000000 |
max | 1.000000 | 3975.910000 | 1.958040e+06 | 1.998000e+06 | 1.925540e+06 | 9.627700e+06 | 0.0 | 50000.000000 | 6.000000 | 9.000000 | 2.000000 | 3.000000 | 4.000000 |
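Note from describe() that coupon_amount has zero mean and zero standard deviation, so every non-null value is 0 and any feature derived from it will be constant. A quick confirmation:
# every non-null coupon_amount should be exactly 0
(data['coupon_amount'].dropna() == 0).all()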
Backfill create_dt with oil_actv_dt, and keep only the last six months of data (dtn < 180 days).
When constructing features we cannot simply accumulate over a user's entire history;
otherwise the feature distributions would drift substantially as time passes.
def time_isna(x, y):
    # fall back to y (oil_actv_dt) when x (create_dt) is missing (NaT)
    if pd.isna(x):
        return y
    return x
df2 = df.sort_values(['uid','create_dt'], ascending = False)
df2.head()
  | uid | create_dt | oil_actv_dt | class_new | bad_ind | oil_amount | discount_amount | sale_amount | amount | pay_amount | coupon_amount | payment_coupon_amount | channel_code | oil_code | scene | source_app | call_source
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
50608 | B96436391985035703 | NaT | 2018-10-08 | B | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 6 | 9 | 2 | 3 | 4 |
50607 | B96436391984693397 | NaT | 2018-10-11 | E | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 6 | 9 | 2 | 3 | 4 |
50606 | B96436391977217468 | NaT | 2018-10-17 | B | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 6 | 9 | 2 | 3 | 4 |
50605 | B96436391976480892 | NaT | 2018-09-28 | B | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 6 | 9 | 2 | 3 | 4 |
50604 | B96436391972106043 | NaT | 2018-10-19 | A | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 6 | 9 | 2 | 3 | 4 |
df2['create_dt'] = df2.apply(lambda x: time_isna(x.create_dt, x.oil_actv_dt), axis = 1)
df2['create_dt'].head()
50608 2018-10-08
50607 2018-10-11
50606 2018-10-17
50605 2018-09-28
50604 2018-10-19
Name: create_dt, dtype: datetime64[ns]
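For reference, the row-wise apply above has a one-line vectorized equivalent, which is much faster on large frames:
# same effect as apply + time_isna: fill missing create_dt from oil_actv_dt
df2['create_dt'] = df2['create_dt'].fillna(df2['oil_actv_dt'])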
df2['dtn'] = (df2.oil_actv_dt - df2.create_dt).dt.days
df2['dtn'].head()
50608    0
50607    0
50606    0
50605    0
50604    0
Name: dtn, dtype: int64
Further down the series the gaps grow well past 180 days (values such as 255 and 275 appear), which the next step trims away.
df = df2[df2['dtn'] < 180]
df.head()
  | uid | create_dt | oil_actv_dt | class_new | bad_ind | oil_amount | discount_amount | sale_amount | amount | pay_amount | coupon_amount | payment_coupon_amount | channel_code | oil_code | scene | source_app | call_source | dtn
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
50608 | B96436391985035703 | 2018-10-08 | 2018-10-08 | B | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 6 | 9 | 2 | 3 | 4 | 0 |
50607 | B96436391984693397 | 2018-10-11 | 2018-10-11 | E | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 6 | 9 | 2 | 3 | 4 | 0 |
50606 | B96436391977217468 | 2018-10-17 | 2018-10-17 | B | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 6 | 9 | 2 | 3 | 4 | 0 |
50605 | B96436391976480892 | 2018-09-28 | 2018-09-28 | B | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 6 | 9 | 2 | 3 | 4 | 0 |
50604 | B96436391972106043 | 2018-10-19 | 2018-10-19 | A | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 6 | 9 | 2 | 3 | 4 | 0 |
Deduplicate the org_lst variables to one row per uid, keeping each user's most recent application record (create_dt) together with its day gap dtn
base = df[org_lst].copy()  # .copy() so that adding dtn below does not write into a slice of df
base.head()
  | uid | create_dt | oil_actv_dt | class_new | bad_ind
---|---|---|---|---|---|
50608 | B96436391985035703 | 2018-10-08 | 2018-10-08 | B | 0 |
50607 | B96436391984693397 | 2018-10-11 | 2018-10-11 | E | 0 |
50606 | B96436391977217468 | 2018-10-17 | 2018-10-17 | B | 0 |
50605 | B96436391976480892 | 2018-09-28 | 2018-09-28 | B | 0 |
50604 | B96436391972106043 | 2018-10-19 | 2018-10-19 | A | 0 |
base['dtn'] = df['dtn']
base = base.sort_values(['uid', 'create_dt'], ascending=False)
# deduplicate: the descending sort puts the most recent create_dt first per uid
base = base.drop_duplicates(['uid'], keep='first')
base.shape
(11099, 6)
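The sort-then-drop_duplicates idiom keeps, for each uid, the row with the latest create_dt. The same thing as a groupby sketch (ties on create_dt may resolve to a different row):
# last row per uid after an ascending sort = most recent application record
base_alt = df[org_lst + ['dtn']].sort_values('create_dt').groupby('uid').tail(1)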
Feature derivation (a library such as featuretools could automate this)
gn = pd.DataFrame()
for i in agg_lst:
    # cnt: number of records per uid
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df: len(df[i])).reset_index())
    tp.columns = ['uid', i + '_cnt']
    if gn.empty:
        gn = tp
    else:
        gn = pd.merge(gn, tp, on='uid', how='left')
    # num: number of records with a positive value
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df: np.where(df[i] > 0, 1, 0).sum()).reset_index())
    tp.columns = ['uid', i + '_num']
    gn = pd.merge(gn, tp, on='uid', how='left')
    # tot: nansum, total ignoring NaNs
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df: np.nansum(df[i])).reset_index())
    tp.columns = ['uid', i + '_tot']
    gn = pd.merge(gn, tp, on='uid', how='left')
    # avg: nanmean, average ignoring NaNs
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df: np.nanmean(df[i])).reset_index())
    tp.columns = ['uid', i + '_avg']
    gn = pd.merge(gn, tp, on='uid', how='left')
    # max: nanmax
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df: np.nanmax(df[i])).reset_index())
    tp.columns = ['uid', i + '_max']
    gn = pd.merge(gn, tp, on='uid', how='left')
    # min: nanmin
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df: np.nanmin(df[i])).reset_index())
    tp.columns = ['uid', i + '_min']
    gn = pd.merge(gn, tp, on='uid', how='left')
    # var: nanvar, variance ignoring NaNs
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df: np.nanvar(df[i])).reset_index())
    tp.columns = ['uid', i + '_var']
    gn = pd.merge(gn, tp, on='uid', how='left')
    # ran: range, nanmax - nanmin
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df: np.nanmax(df[i]) - np.nanmin(df[i])).reset_index())
    tp.columns = ['uid', i + '_ran']
    gn = pd.merge(gn, tp, on='uid', how='left')
    # avg_var: nanmean / variance, with the variance floored at 1 to avoid division by zero
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df: np.nanmean(df[i]) / max(np.nanvar(df[i]), 1)).reset_index())
    tp.columns = ['uid', i + '_avg_var']
    gn = pd.merge(gn, tp, on='uid', how='left')
E:\Anaconda3\envs\sklearn\lib\site-packages\ipykernel_launcher.py:24: RuntimeWarning: Mean of empty slice
E:\Anaconda3\envs\sklearn\lib\site-packages\ipykernel_launcher.py:31: RuntimeWarning: All-NaN axis encountered
E:\Anaconda3\envs\sklearn\lib\site-packages\ipykernel_launcher.py:38: RuntimeWarning: All-NaN axis encountered
E:\Anaconda3\envs\sklearn\lib\site-packages\ipykernel_launcher.py:45: RuntimeWarning: Degrees of freedom <= 0 for slice.
E:\Anaconda3\envs\sklearn\lib\site-packages\ipykernel_launcher.py:52: RuntimeWarning: All-NaN axis encountered
E:\Anaconda3\envs\sklearn\lib\site-packages\ipykernel_launcher.py:59: RuntimeWarning: Mean of empty slice
E:\Anaconda3\envs\sklearn\lib\site-packages\ipykernel_launcher.py:59: RuntimeWarning: Degrees of freedom <= 0 for slice.
These warnings come from uids whose amount columns are entirely NaN; np.nanmean and friends return NaN for such groups.
gn.head()
  | uid | oil_amount_cnt | oil_amount_num | oil_amount_tot | oil_amount_avg | oil_amount_max | oil_amount_min | oil_amount_var | oil_amount_ran | oil_amount_avg_var | ... | coupon_amount_avg_var | payment_coupon_amount_cnt | payment_coupon_amount_num | payment_coupon_amount_tot | payment_coupon_amount_avg | payment_coupon_amount_max | payment_coupon_amount_min | payment_coupon_amount_var | payment_coupon_amount_ran | payment_coupon_amount_avg_var
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | A10000481 | 1 | 1 | 94.76 | 94.760000 | 94.76 | 94.76 | 0.000000 | 0.00 | 94.760000 | ... | 0.0 | 1 | 1 | 200.0 | 200.000000 | 200.0 | 200.0 | 0.000000 | 0.0 | 200.000000 |
1 | A1000232 | 3 | 3 | 372.82 | 124.273333 | 138.46 | 109.55 | 139.442022 | 28.91 | 0.891219 | ... | 0.0 | 3 | 2 | 300.0 | 100.000000 | 200.0 | 0.0 | 6666.666667 | 200.0 | 0.015000 |
2 | A1000455 | 1 | 0 | 0.00 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | 1 | 0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
3 | A10010042 | 15 | 15 | 742.05 | 49.470000 | 76.47 | 31.62 | 109.291800 | 44.85 | 0.452641 | ... | 0.0 | 15 | 12 | 1900.0 | 126.666667 | 500.0 | 0.0 | 13955.555556 | 500.0 | 0.009076 |
4 | A1001014 | 1 | 0 | 0.00 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | 1 | 0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 64 columns
gn.columns
Index(['uid', 'oil_amount_cnt', 'oil_amount_num', 'oil_amount_tot',
       'oil_amount_avg', 'oil_amount_max', 'oil_amount_min', 'oil_amount_var',
       'oil_amount_ran', 'oil_amount_avg_var', 'discount_amount_cnt',
       'discount_amount_num', 'discount_amount_tot', 'discount_amount_avg',
       'discount_amount_max', 'discount_amount_min', 'discount_amount_var',
       'discount_amount_ran', 'discount_amount_avg_var', 'sale_amount_cnt',
       'sale_amount_num', 'sale_amount_tot', 'sale_amount_avg',
       'sale_amount_max', 'sale_amount_min', 'sale_amount_var',
       'sale_amount_ran', 'sale_amount_avg_var', 'amount_cnt', 'amount_num',
       'amount_tot', 'amount_avg', 'amount_max', 'amount_min', 'amount_var',
       'amount_ran', 'amount_avg_var', 'pay_amount_cnt', 'pay_amount_num',
       'pay_amount_tot', 'pay_amount_avg', 'pay_amount_max', 'pay_amount_min',
       'pay_amount_var', 'pay_amount_ran', 'pay_amount_avg_var',
       'coupon_amount_cnt', 'coupon_amount_num', 'coupon_amount_tot',
       'coupon_amount_avg', 'coupon_amount_max', 'coupon_amount_min',
       'coupon_amount_var', 'coupon_amount_ran', 'coupon_amount_avg_var',
       'payment_coupon_amount_cnt', 'payment_coupon_amount_num',
       'payment_coupon_amount_tot', 'payment_coupon_amount_avg',
       'payment_coupon_amount_max', 'payment_coupon_amount_min',
       'payment_coupon_amount_var', 'payment_coupon_amount_ran',
       'payment_coupon_amount_avg_var'],
      dtype='object')
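Most of the loop above collapses into a single groupby().agg(); a compact sketch (pandas' count counts non-null values, unlike len above, and its var uses ddof=1 where np.nanvar uses ddof=0, so numbers differ slightly; the _num, _ran and _avg_var features would still need their own expressions):
stats = df.groupby('uid')[agg_lst].agg(['count', 'sum', 'mean', 'max', 'min', 'var'])
stats.columns = ['_'.join(col) for col in stats.columns]  # flatten the MultiIndex, e.g. oil_amount_mean
gn_alt = stats.reset_index()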
Count the number of distinct values for each dstc_lst variable
# distinct count per uid
gc = pd.DataFrame()
for i in dstc_lst:
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df: len(set(df[i]))).reset_index())
    tp.columns = ['uid', i + '_dstc']
    if gc.empty:
        gc = tp
    else:
        gc = pd.merge(gc, tp, on='uid', how='left')
gc.head()
  | uid | channel_code_dstc | oil_code_dstc | scene_dstc | source_app_dstc | call_source_dstc
---|---|---|---|---|---|---|
0 | A10000481 | 1 | 1 | 1 | 1 | 1 |
1 | A1000232 | 1 | 1 | 1 | 1 | 1 |
2 | A1000455 | 1 | 1 | 1 | 1 | 1 |
3 | A10010042 | 1 | 1 | 1 | 1 | 1 |
4 | A1001014 | 1 | 1 | 1 | 1 | 1 |
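The same distinct counts in one line with nunique (which, unlike len(set(...)), ignores NaN):
gc_alt = df.groupby('uid')[dstc_lst].nunique().add_suffix('_dstc').reset_index()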
Combine the variables
fn = pd.merge(base, gn, on='uid')
fn = pd.merge(fn, gc, on='uid')
fn.shape
(45039, 73)
fn = fn.fillna(0)
fn.head()
  | uid | create_dt | oil_actv_dt | class_new | bad_ind | oil_amount_cnt | oil_amount_num | oil_amount_tot | oil_amount_avg | oil_amount_max | ... | payment_coupon_amount_max | payment_coupon_amount_min | payment_coupon_amount_var | payment_coupon_amount_ran | payment_coupon_amount_avg_var | channel_code_dstc | oil_code_dstc | scene_dstc | source_app_dstc | call_source_dstc
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | B96436391985035703 | 2018-10-08 | 2018-10-08 | B | 0 | 1 | 0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1 | 1 | 1 | 1 | 1 |
1 | B96436391984693397 | 2018-10-11 | 2018-10-11 | E | 0 | 1 | 0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1 | 1 | 1 | 1 | 1 |
2 | B96436391977217468 | 2018-10-17 | 2018-10-17 | B | 0 | 1 | 0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1 | 1 | 1 | 1 | 1 |
3 | B96436391976480892 | 2018-09-28 | 2018-09-28 | B | 0 | 1 | 0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1 | 1 | 1 | 1 | 1 |
4 | B96436391972106043 | 2018-10-19 | 2018-10-19 | A | 0 | 1 | 0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1 | 1 | 1 | 1 | 1 |
5 rows × 73 columns
Train a decision tree model
# features
x = fn.drop(['uid', 'oil_actv_dt', 'create_dt', 'bad_ind', 'class_new'], axis=1)
# label
y = fn.bad_ind.copy()
from sklearn import tree
# a regression tree on the 0/1 label: each leaf then predicts the mean of bad_ind, i.e. that leaf's bad rate
dtree = tree.DecisionTreeRegressor(max_depth=2, min_samples_leaf=500, min_samples_split=5000)
dtree = dtree.fit(x, y)
Render the decision tree and read the segmentation decision off it
import pydotplus
from IPython.display import Image
from io import StringIO  # sklearn.externals.six was removed from recent sklearn; the stdlib StringIO works here
with open("dt.dot", "w") as f:
tree.export_graphviz(dtree, out_file=f)
dot_data = StringIO()
tree.export_graphviz(dtree, out_file=dot_data,
                     feature_names=x.columns,
                     filled=True, rounded=True,
                     special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
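If Graphviz/pydotplus is unavailable, sklearn's export_text (available since 0.21) prints the same tree as plain text:
print(tree.export_text(dtree, feature_names=list(x.columns)))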
In the rendered tree, each node's value is the mean of bad_ind within that node, i.e. its bad rate. Compare the leaves against the overall bad rate:
sum(fn.bad_ind)/len(fn.bad_ind)
0.04658077304261645
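To read the leaf-level bad rates back out of the fitted tree, dtree.apply returns the leaf index for each row; grouping bad_ind by leaf reproduces the value shown in each leaf of the plot (a sketch):
fn['leaf'] = dtree.apply(x)
fn.groupby('leaf')['bad_ind'].agg(['mean', 'count'])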