业务梳理
逻辑回归的数理原理
-
应用场景
- 逻辑回归被广泛应用在目标变量是二值变量的场合(0,1)
-
公式
-
模型估计
- 极大似然估计
-
模型阐释/评估
- 一个解释变量的阐释图
- C值/AUC,Lift图
得到每个用户的违约概率(信用评分)
目标变量:用户的违约概率
数据清洗
初始数据整理
关联相关表
使用mysql将导入txt数据并且进行合并。
压缩数据
bank_detail 和 bill_detail 和 用户浏览行为表 不能直接进行关联,目前的想法,将这两张表的信息分别压缩到每个 user_id 上。
bank_detail 每个用户的信息压缩为:工资收入、工资外收入、净资产
use test;
-- -----------------------------------------------------
-- Table `user_info`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `user_info` (
`user_id` INT NOT NULL,
`sex` INT NULL,
`jobs` INT NULL,
`edu_status` INT NULL,
`marry_status` INT NULL,
`residence` INT NULL)
ENGINE = InnoDB;
-- -----------------------------------------------------
-- Table `bank_detail`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `bank_detail` (
`user_id` INT NOT NULL,
`timestemp` VARCHAR(45) NULL,
`trade_type` INT NULL,
`trade_amount` decimal(30,8) NULL,
`in_come` INT NULL)
ENGINE = InnoDB;
-- -----------------------------------------------------
-- Table `browse_history`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `browse_history` (
`user_id` INT NOT NULL,
`timestemp` VARCHAR(45) NULL,
`browsing_behavior_data` INT NULL,
`browsing_behavior_number` INT NULL,
`seg` INT NULL)
ENGINE = InnoDB;
-- -----------------------------------------------------
-- Table `bill_detail`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `bill_detail` (
`user_id` INT NOT NULL,
`bill_timestemp` VARCHAR(45) NULL,
`bank_id` INT NULL,
`pre_bill_amount` DECIMAL(30,6) NULL,
`pre_repayment_amount` DECIMAL(30,6) NULL,
`credit_limit` DECIMAL(30,6) NULL,
`curr_bill_balance` DECIMAL(30,6) NULL,
`curr_min_repayment` DECIMAL(30,6) NULL,
`num_of_consumption` INT NULL,
`curr_bill_amount` DECIMAL(30,6) NULL,
`adj_amount` DECIMAL(30,6) NULL,
`cycle_interest` DECIMAL(30,6) NULL,
`available_balance` DECIMAL(30,6) NULL,
`cash_in_advance` DECIMAL(30,6) NULL,
`repayment_status` DECIMAL(30,6) NULL)
ENGINE = InnoDB;
-- -----------------------------------------------------
-- Table `loan_time`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `loan_time` (
`user_id` INT NOT NULL,
`loan_time` VARCHAR(45) NULL)
ENGINE = InnoDB;
-- -----------------------------------------------------
-- Table `overdue`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `overdue` (
`user_id` INT NOT NULL,
`overdue_label` INT NULL)
ENGINE = InnoDB;
####将txt文件导入数据库中####
use test
load data infile "C:\ProgramData\MySQL\MySQL Server 5.7\Uploads\bank_detail_train.txt"
into table bank_detail
fields terminated by ',';
select count(*) from bank_detail;
load data infile "C:\ProgramData\MySQL\MySQL Server 5.7\Uploads\bill_detail_train.txt"
into table bill_detail
fields terminated by ',';
select count(*) from bill_detail;
load data infile "C:\ProgramData\MySQL\MySQL Server 5.7\Uploads\browse_history_train.txt"
into table browse_history
fields terminated by ',';
select count(*) from browse_history;
load data infile "C:\ProgramData\MySQL\MySQL Server 5.7\Uploads\loan_time_train.txt"
into table loan_time
fields terminated by ',';
select * from loan_time;
load data infile "C:\ProgramData\MySQL\MySQL Server 5.7\Uploads\overdue_train.txt"
into table overdue
fields terminated by ',';
select * from overdue;
load data infile "C:\ProgramData\MySQL\MySQL Server 5.7\Uploads\user_info_train.txt"
into table user_info
fields terminated by ',';
select * from user_info;
时间戳无法在mysql中最大为2030年,数据中的时间戳超过了这个时间范围。可以在r中将时间戳转换。
as.POSIXct(5894316387, origin = "1960-01-01", tz = "GMT")
use test;
##各表数据量
select count(1) from bank_detail;
#6070197
select count(1) from bill_detail;
#2338118
#select count(1) from browse_history;
#22919547
select count(1) from loan_time;
#55596
select count(1) from overdue;
#55596
select count(1) from user_info;
#55596
## 压缩bank_detail的信息
select *
from bank_detail t
where user_id = 29165;
信息汇总
我们的目标是将所有的信息汇总到每一个用户,定义关于用户的衍生变量。
我们发现,bank_detail 表中的user_id数据比其它表的user_id少84%;大概只有9k多条。我们认为bank_detail是与最终目标变量相关性很高的数据,所以我们将这部分数据拿出来,单独做一次分析。
数据清洗的语句如下:
use test;
##各表数据量
select count(1) from bank_detail;
#6070197
select count(1) from bill_detail;
#2338118
select count(1) from browse_history;
#22919547
select count(1) from loan_time;
#55596
select count(1) from overdue;
#55596
select count(1) from user_info;
#55596
## 压缩bank_detail的信息
drop table if exists temp_1;
create table temp_1
select t.*,IF(trade_type=1,-1*trade_amount,trade_amount) as is_inout
from bank_detail t;
#where user_id = 29165;
drop table if exists v_bank_detail;
create table v_bank_detail
select user_id,sum(in_come*trade_amount) wage_income,sum((-trade_type+1)*trade_amount)-sum(in_come*trade_amount) exwage_income,
sum(is_inout) property
from temp_1
#where user_id = 6965
group by user_id;
## 压缩browse_history的信息
drop table if exists v_browse_history;
create table v_browse_history
select user_id,count(1) browse_count
from browse_history
group by user_id;
/*
drop table temp_22;
create table temp_22
select user_id,
if(browsing_behavior_number = 1,num,0) num1,
if(browsing_behavior_number = 2,num,0) num2,
if(browsing_behavior_number = 3,num,0) num3,
if(browsing_behavior_number = 4,num,0) num4,
if(browsing_behavior_number = 5,num,0) num5,
if(browsing_behavior_number = 6,num,0) num6,
if(browsing_behavior_number = 7,num,0) num7,
if(browsing_behavior_number = 8,num,0) num8,
if(browsing_behavior_number = 9,num,0) num9,
if(browsing_behavior_number = 10,num,0) num10,
if(browsing_behavior_number = 11,num,0) num11
from temp_2;
drop table browse_history_temp2;
create table browse_history_temp2
select user_id,
count(1) browse_count
from temp_22
group by user_id;
*/
SET GLOBAL innodb_buffer_pool_size=67108864;
#压缩bill_detail表的信息到user_id上。
drop table if exists v_bill_detail;
create table v_bill_detail
select user_id,
count(distinct(bank_id)) count_bank,
sum(pre_bill_amount)-sum(pre_repayment_amount) pre_not_repay,
sum(credit_limit) sum_credit_limit,
sum(curr_bill_balance) sum_curr_bill_balance,
sum(curr_min_repayment) sum_curr_min_repayment,
sum(num_of_consumption) sum_num_of_consumption,
sum(curr_bill_amount) sum_curr_bill_amount,
sum(adj_amount) sum_adj_amount,
sum(cycle_interest) sum_cycle_interest,
sum(available_balance) sum_available_balance,
sum(cash_in_advance) sum_cash_in_advance,
count(*)-sum(repayment_status) repay_num,
sum(repayment_status) not_repay_num
from bill_detail
group by user_id;
##建立索引
create index idx_1 on v_bank_detail (user_id);
create index idx_2 on v_browse_history(user_id);
create index idx_3 on v_bill_detail(user_id);
create index idx_4 on user_info(user_id);
create index idx_5 on loan_time(user_id);
create index idx_6 on overdue(user_id);
# 关联各表,得到一张宽表。
#因为bank_detail表中的数据明显太少,只有总数据的16%。因此我们决定单独拿出来进行分析。
drop table if exists finall_1;
create table finall_1
select
t1.* ,
t2.loan_time loan_time ,
t3.overdue_label overdue_label ,
t4.browse_count browse_count ,
t.wage_income wage_income ,
t.exwage_income exwage_income ,
t.property property ,
t5.count_bank count_bank ,
t5.pre_not_repay pre_not_repay ,
t5.sum_credit_limit sum_credit_limit ,
t5.sum_curr_bill_balance sum_curr_bill_balance ,
t5.sum_curr_min_repayment sum_curr_min_repayment ,
t5.sum_num_of_consumption sum_num_of_consumption ,
t5.sum_curr_bill_amount sum_curr_bill_amount ,
t5.sum_adj_amount sum_adj_amount ,
t5.sum_cycle_interest sum_cycle_interest ,
t5.sum_available_balance sum_available_balance ,
t5.sum_cash_in_advance sum_cash_in_advance ,
t5.repay_num repay_num ,
t5.not_repay_num not_repay_num
from v_bank_detail t
inner join user_info t1 on t.user_id = t1.user_id
inner join loan_time t2 on t.user_id = t2.user_id
inner join overdue t3 on t.user_id = t3.user_id
inner join v_browse_history t4 on t.user_id = t4.user_id
inner join v_bill_detail t5 on t.user_id=t5.user_id;
#第一部分有bank_detail 的6k多条数据在finall_1中。
select count(*) from finall_1;
#导出到csv文件中。
select
*
from finall_2
#order by user_id asc
into outfile 'C:\ProgramData\MySQL\MySQL Server 5.7\Uploads\finall_bank.csv'
FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"'
LINES TERMINATED BY '
';
将这部分数据导入R中:
y<-read.csv("C:/Users/CDAer/Desktop/个人征信预测/finall_bank.csv",header=FALSE,sep=",")
colnames(y1)<-c('user_id',
'count_bank',
'sum_pre_bill_amount',
'sum_pre_repayment_amount',
'sum_credit_limit',
'sum_curr_bill_balance',
'sum_curr_min_repayment',
'sum_num_of_consumption',
'sum_curr_bill_amount',
'sum_adj_amount',
'sum_cycle_interest',
'sum_available_balance',
'sum_cash_in_advance',
'repay_num',
'not_repay_num',
'num1',
'num2',
'num3',
'num4',
'num5',
'num6',
'num7',
'num8',
'num9',
'num10',
'num11',
'wage_income',
'exwage_income',
'property',
'edu_status',
'jobs',
'marry_status',
'residence',
'sex',
'loan_time',
'overdue_label')
#导入的数据多了偶数行,重新取应用数据放入y1 中。
p<-seq(from=1,to=nrow(y),by=2)
p
y1<-y[p,]
缺失信息处理
上限/下限
筛选变量
建立模型
###1.数据处理####
rm(list = ls())
y<-read.csv("C:/Users/Guang/Desktop/work/finall_bank.csv",header=FALSE,sep=",",stringsAsFactors = F)
nrow(y)
table(y[!is.na(y)])
View(y[1])
class(y)
y$V9<-as.integer(y$V9)
y$V10<-as.numeric(y$V10)
y$V11<-as.numeric(y$V11)
y$V12<-as.numeric(y$V12)
y$V13<-as.numeric(y$V13)
y$V27<-as.integer(y$V27)
y$V28<-as.integer(y$V28)
y$V29<-as.integer(y$V29)
y$V30<-as.integer(y$V30)
y$V31<-as.integer(y$V31)
y$V32<-as.integer(y$V32)
y$V33<-as.integer(y$V33)
y$V34<-as.integer(y$V34)
y$V35<-as.integer(y$V35)
y$V36<-as.integer(y$V36)
y$V37<-as.integer(y$V37)
#将0为缺失值的化为缺失值
y<-y[1:53174,]
y$sex<-ifelse(y$sex==0,NA,y$sex)
y$loan_time<-ifelse(y$loan_time==0,NA,y$loan_time)
#变量名
colnames(y)<-c("user_id",
"sex",
"jobs",
"edu_status",
"marry_status",
"residence",
"loan_time",
"overdue_label",
"wage_income",
"exwage_income",
"property",
"income",
"outcome",
"count_bank",
"pre_not_repay",
"sum_credit_limit",
"sum_curr_bill_balance",
"sum_curr_min_repayment",
"sum_num_of_consumption",
"sum_curr_bill_amount",
"sum_adj_amount",
"sum_cycle_interest",
"sum_available_balance",
"sum_cash_in_advance",
"repay_num",
"not_repay_num",
"browse_num1",
"browse_num2",
"browse_num3",
"browse_num4",
"browse_num5",
"browse_num6",
"browse_num7",
"browse_num8",
"browse_num9",
"browse_num10",
"browse_num11"
)
#处理时间戳
y$loan_time<-as.POSIXct(y$loan_time, origin = "1960-01-01", tz = "GMT")
#查看时间,按照月份做违约数量图
#library(plyr)
#library(lubridate)
#y$month<-month(y$loan_time)
#day_plot<-ddply(y,.(month),summarise,yyy=sum(overdue_label),yyyn=length(user_id))
#day_plot
#plot(day_plot$month,day_plot$yyy/day_plot$yyyn,type="b")
###填充sex和browse_count
b<- which(y$sex==0)
set.seed(2)
a<-sample(length(which(y$sex==0)),round(length(y[y$sex==0,"sex"])*0.8))
a2<-(1:length(which(y$sex==0)))[!(1:length(which(y$sex==0)) %in% a)]
b[a]
y[b[a],"sex"]<-1
b[a2]
y[b[a2],"sex"]<-2
y[is.na(y$sex),"sex"]<-1
#y[is.na(y$browse_count),"browse_count"]<-294
##填充browse——num的缺失值
browse<-c(
"browse_num1",
"browse_num2",
"browse_num3",
"browse_num4",
"browse_num5",
"browse_num6",
"browse_num7",
"browse_num8",
"browse_num9",
"browse_num10",
"browse_num11"
)
for(i in browse){
y[is.na(y[,i]),i]<-median(y[,i],na.rm = T)
}
train<-y
###### 2 分割训练集和验证集 ####
table(train$overdue_label)
table(test$overdue_label)
set.seed(1)
a<-sample(nrow(y),round(nrow(y)*0.7))
y[!((1:nrow(y)) %in% a),]
train<-y[a,]
test<-y[!((1:nrow(y)) %in% a),]####
##### 3. profiles图 #############################################################################################
# overall performance
overall_cnt=nrow(train) #calculate the total count
overall_resp=sum(train$overdue_label,na.rm = T) #calculate the total responders count
overall_resp_rate=overall_resp/overall_cnt #calculate the response rate
overall_perf<-c(overall_count=overall_cnt,overall_responders=overall_resp,overall_response_rate=overall_resp_rate) #combine
View(t(overall_perf)) #take a look at the summary
library(plyr) #call plyr
#分类型变量
"sex"
"jobs"
"edu_status"
[5] "marry_status"
[6] "residence"
"month"
prof<-ddply(train,.(sex),summarise,cnt=length(user_id),res=sum(overdue_label)) #group by sex
#View(prof) #check the result
prof
prof1<-within(prof,
{var1="sex"
sex
percent<-cnt/overall_cnt
res_rate<-res/cnt
index<-res_rate/overall_resp_rate*100
}) #add response_rate,index, percentage
View(prof1)
##连续型
[8] "overdue_label"
[9] "browse_count"
[10] "wage_income"
[11] "exwage_income"
[12] "property"
[13] "count_bank"
[14] "pre_not_repay"
[15] "sum_credit_limit"
[16] "sum_curr_bill_balance"
[17] "sum_curr_min_repayment"
[18] "sum_num_of_consumption"
[19] "sum_curr_bill_amount"
[20] "sum_adj_amount"
[21] "sum_cycle_interest"
[22] "sum_available_balance"
[23] "sum_cash_in_advance"
[24] "repay_num"
[25] "not_repay_num"
table(train$browse_count)
nrow(train)
#separate to 2 parts: missing,nomissing
table(is.na(train$overdue_label))
nomissing<-data.frame(var_data[!is.na(train$browse_count),]) #select the no missing value records
missing<-data.frame(var_data[is.na(train$browse_count),])
##numeric Profiling:missing part
missing2<-ddply(missing,.(browse_count),summarise,cnt=length(overdue_label),res=sum(overdue_label)) #group by em_months_last_open
#View(missing2)
missing_perf<-within(missing2,{res_rate<-res/cnt
index<-res_rate/overall_resp_rate*100
percent<-cnt/overall_cnt
var_category<-c('unknown')
}) #summary
View(missing_perf)
##numeric Profiling:Non-missing part
nomissing_value<-train[!is.na(train$browse_count),"browse_count"]
nomissing$var_category<-cut(nomissing_value,unique(quantile(nomissing_value,(0:10)/10)),include.lowest = F)#separte into 10 groups
View(table(nomissing$var_category)) #take a look at the 10 category
prof2<-ddply(nomissing,.(var_category),summarise,cnt=length(overdue_label),res=sum(overdue_label)) #group by the 10 groups
#View(prof2)
nonmissing_perf<-within(prof2,
{res_rate<-res/cnt
index<-res_rate/overall_resp_rate*100
percent<-cnt/overall_cnt
}) #add resp_rate,index,percent
View(nonmissing_perf)
all_perf<-rbind(nonmissing_perf,missing_perf[,-1]) #set 2 data together
View(all_perf)
############################################################ 4: Means #############################################################
train<-y
c_name <- colnames(train)
for(i in c_name){
mean_var1<-train[,i]
mean1<-c(
var=i,
mean=mean(mean_var1,na.rm=T),
median=median(mean_var1,na.rm=T),
quantile(mean_var1,c(0,0.01,0.1,0.25,0.5,0.75,0.9,0.99,1),na.rm=T),
nmiss=sum(is.na(mean_var1))
)
all_mean<-rbind(all_mean,t(mean1)) #set 2 data together
}
#需要给all_mean 一个初始值 然后再跑一次循环。
all_mean<-mean1
for(i in c_name){
mean_var1<-train[,i]
mean1<-c(
var=i,
mean=mean(mean_var1,na.rm=T),
median=median(mean_var1,na.rm=T),
quantile(mean_var1,c(0,0.01,0.1,0.25,0.5,0.75,0.9,0.99,1),na.rm=T),
nmiss=sum(is.na(mean_var1))
)
all_mean<-rbind(all_mean,t(mean1)) #set 2 data together
}
View(all_mean)
### 5消除极值点######
#消除极值点
a<-c(#"browse_count",
"property",
"pre_not_repay",
"sum_credit_limit",
"sum_curr_bill_balance",
"sum_curr_min_repayment",
"sum_num_of_consumption",
"sum_curr_bill_amount",
"sum_adj_amount",
"sum_cycle_interest",
"sum_available_balance",
"sum_cash_in_advance",
"repay_num",
"browse_num1",
"browse_num2",
"browse_num3",
"browse_num4",
"browse_num5",
"browse_num6",
"browse_num7",
"browse_num8",
"browse_num9",
"browse_num10",
"browse_num11"
)
scale(train[,a])
summary(train[,a])
quantile(train[,"repay_num"],0.01)
quantile(train$sum_credit_limit,0.99)
boxplot(scale(train[,a]))
#### 将异常值处理
for(i in a){
train[,i] <-
ifelse(train[,i] <= quantile(train[,i],0.01,na.rm = T), quantile(train[,i],0.01,na.rm = T),
ifelse(train[,i] >=quantile(train[,i],0.99,na.rm = T), quantile(train[,i],0.99,na.rm = T),
train[,i]))
}
######### 6哑变量转换 #########
train_1<-train
train_1$m2_sex<-ifelse(train_1$sex %in% c(2),0,1)
summary(train_1$m2_sex)
summary(train_1$sex)
table(train_1$m2_sex)
#jobs
train_1$m2_jobs<-ifelse(train_1$jobs %in% c(0,1,4),1,0)
#edu_status
train_1$m2_edu_status<-ifelse(train_1$edu_status %in% c(0,4),1,0)
#marry_status
train_1$m2_marry_status<-ifelse(train_1$marry_status %in% c(0,4,5),1,0)
#residence
train_1$m2_residence<-ifelse(train_1$residence %in% c(0,2,4),1,0)
############7 logistic模型############
library(ape)
library(vegan)
library(permute)
library(lattice)
library(nlme)
library(picante) #call picante
# scale
#标准化-
train$scale_browse_count<-scale(train$browse_count)
train$scale_count_bank<-scale(train$count_bank)
train$scale_pre_not_repay<-scale(train$pre_not_repay)
train$scale_sum_credit_limit<-scale(train$sum_credit_limit)
train$scale_sum_curr_bill_balance<-scale(train$sum_curr_bill_balance)
train$scale_sum_curr_min_repayment<-scale(train$sum_curr_min_repayment)
train$scale_sum_num_of_consumption<-scale(train$sum_num_of_consumption)
train$scale_sum_curr_bill_amount<-scale(train$sum_curr_bill_amount)
train$scale_sum_adj_amount<-scale(train$sum_adj_amount)
train$scale_sum_cycle_interest<-scale(train$sum_cycle_interest)
train$scale_sum_available_balance<-scale(train$sum_available_balance)
train$scale_sum_cash_in_advance<-scale(train$sum_cash_in_advance)
train$scale_repay_num<-scale(train$repay_num)
train$scale_not_repay_num<- scale(train$not_repay_num)
var_list1<-c(
"m2_sex",
"m2_edu_status",
"m2_marry_status",
"wage_income",
"exwage_income",
"income",
"pre_not_repay",
"sum_curr_bill_balance",
"sum_num_of_consumption",
"sum_available_balance",
"sum_cash_in_advance",
"repay_num",
"not_repay_num"
)
var_list1<-c(
#"user_id",
"sex",
"jobs",
"edu_status",
"marry_status",
"residence",
#"loan_time",
#"overdue_label",
"wage_income",
"exwage_income",
"property",
"income",
"outcome",
"count_bank",
"pre_not_repay",
"sum_credit_limit",
"sum_curr_bill_balance",
"sum_curr_min_repayment",
"sum_num_of_consumption",
"sum_curr_bill_amount",
"sum_adj_amount",
"sum_cycle_interest",
"sum_available_balance",
"sum_cash_in_advance",
"repay_num",
"not_repay_num",
"browse_num1",
"browse_num2",
"browse_num3",
"browse_num4",
"browse_num5",
"browse_num6",
"browse_num7",
"browse_num8",
"browse_num9",
"browse_num10",
"browse_num11"
)
#View(train[,var_list1])
mods<-train[,c('overdue_label',var_list1)] #select Y and varibales you want to try
str(mods)
(model_glm<-glm(overdue_label~.,data=mods,family =binomial(link ="logit"))) #logistic model
#########Stepwise先建模然后跑递归
library(MASS)
model_sel<-stepAIC(model_glm,direction ="both") #using both backward and forward stepwise selection
summary<-summary(model_sel) #summary
model_summary<-data.frame(var=rownames(summary$coefficients),summary$coefficients) #do the model summary
View(model_summary)
#### 8 模型评估 ####
#预测
train$predict_1<-predict(model_glm,newdata=train,type="response")
summary(train$predict_1)
#train$predict_1<-ifelse(train$predict_1>0.95,1,0)
library(ROCR)
library(gplots)
roc<-prediction(train$predict_1,train$overdue_label)
cc<-performance(roc,"tpr","fpr")
plot(cc)
abline(a=0,b=1)
performance(roc,"auc")
#k-s曲线,越接近1越好,小于0.2不可接受
myks<-function(y,predict_y){
pred <- prediction(predictions=predict_y,labels=y)
perf <- performance(pred,"tpr","fpr")
tmp<-max(attr(perf,"y.values")[[1]]-attr(perf,"x.values")[[1]])
return(tmp)
}
myks(train$overdue_label,train$predict_1)
#预测test
test$m2_sex<-ifelse(test$sex %in% c(2),0,1)
summary(test$m2_sex)
summary(test$sex)
table(test$m2_sex)
#jobs
test$m2_jobs<-ifelse(test$jobs %in% c(0,1,4),1,0)
#edu_status
test$m2_edu_status<-ifelse(test$edu_status %in% c(0,4),1,0)
#marry_status
test$m2_marry_status<-ifelse(test$marry_status %in% c(0,4,5),1,0)
#residence
test$m2_residence<-ifelse(test$residence %in% c(0,2,4),1,0)
test$predict_1<-predict(model_glm,newdata=test,type="response")
summary(test$predict_1)
#test$predict_1<-ifelse(test$predict_1>0.95,1,0)
library(ROCR)
library(gplots)
roc<-prediction(test$predict_1,test$overdue_label)
cc<-performance(roc,"tpr","fpr")
plot(cc)
abline(a=0,b=1)
performance(roc,"auc")
#k-s曲线,越接近1越好,小于0.2不可接受
myks<-function(y,predict_y){
pred <- prediction(predictions=predict_y,labels=y)
perf <- performance(pred,"tpr","fpr")
tmp<-max(attr(perf,"y.values")[[1]]-attr(perf,"x.values")[[1]])
return(tmp)
}
myks(test$overdue_label,test$predict_1)
#### bank_detail取出单独做 因子化#####
train_1<-train
train_1<-train[!(is.na(train$wage_income)),]
write.csv(train_1,"C:/Users/CDAer/Desktop/train_1.csv")
hist(train$income)
nrow(train_1)
colnames(train_1)
train_1$m1_sex<-as.factor(train_1$sex)
train_1$m1_jobs<-as.factor(train_1$jobs)
train_1$m1_edu_status<-as.factor(train_1$edu_status)
train_1$m1_marry_status<-as.factor(train_1$marry_status)
train_1$m1_residence<-as.factor(train_1$residence)
var_list1<-c(
#"user_id",
"m1_sex",
"m1_jobs",
"m1_edu_status",
"m1_marry_status",
"m1_residence",
#"loan_time",
#"overdue_label",
"pre_not_repay",
"sum_credit_limit",
"sum_curr_bill_balance",
"sum_curr_min_repayment",
"sum_num_of_consumption",
"sum_curr_bill_amount",
"sum_adj_amount",
"sum_cycle_interest",
"sum_available_balance",
"sum_cash_in_advance",
"repay_num",
"not_repay_num",
"browse_num1",
"browse_num2",
"browse_num3",
"browse_num4",
"browse_num5",
"browse_num6",
"browse_num7",
"browse_num8",
"browse_num9",
"browse_num10",
"browse_num11"
)
View(train_1[,var_list1])
mods<-train_1[,c('overdue_label',var_list1)] #select Y and varibales you want to try
str(mods)
(model_glm<-glm(overdue_label~.,data=mods,family =binomial(link ="logit"))) #logistic model
#########Stepwise先建模然后跑递归
library(MASS)
model_sel<-stepAIC(model_glm,direction ="both") #using both backward and forward stepwise selection
summary<-summary(model_sel) #summary
model_summary<-data.frame(var=rownames(summary$coefficients),summary$coefficients) #do the model summary
View(model_summary)
#### 8 模型评估 ####
var_list<-c(
"m2_sex",
"m2_jobs",
"m2_edu_status",
"m2_marry_status",
"exwage_income",
"income",
"pre_not_repay",
"sum_credit_limit",
"sum_curr_bill_balance",
"sum_num_of_consumption",
"sum_cycle_interest",
"sum_available_balance",
"sum_cash_in_advance",
"browse_num3",
"browse_num6",
"browse_num11"
)
mods<-train_1[,c('overdue_label',var_list)] #select Y and varibales you want to try
str(mods)
(model_glm<-glm(overdue_label~.,data=mods,family =binomial(link ="logit"))) #logistic model
#预测
train_1$predict_1<-predict(model_glm,newdata=train_1,type="response")
summary(train_1$predict_1)
#train$predict_1<-ifelse(train$predict_1>0.95,1,0)
library(ROCR)
library(gplots)
roc<-prediction(train_1$predict_1,train_1$overdue_label)
cc<-performance(roc,"tpr","fpr")
plot(cc)
abline(a=0,b=1)
performance(roc,"auc")
#k-s曲线,越接近1越好,小于0.2不可接受
myks<-function(y,predict_y){
pred <- prediction(predictions=predict_y,labels=y)
perf <- performance(pred,"tpr","fpr")
tmp<-max(attr(perf,"y.values")[[1]]-attr(perf,"x.values")[[1]])
return(tmp)
}
myks(train_1$overdue_label,train_1$predict_1)