  Using R on a credit card dataset: logit, GBM, knn, and xgboost

    Prepare the data

    The data come from the UCI repository (http://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening): a credit card screening dataset whose variable names and meanings are not disclosed (presumably to protect privacy). This post fits logit, GBM, knn, and xgboost models to classify the data and compares their accuracy.

    The expected ranking of accuracy is:

    xgboost > GBM > logit > knn

    Download the data

    dataset = read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data", sep = ",", header = F, na.strings = "?")

    head(dataset)
      V1    V2    V3 V4 V5 V6 V7   V8 V9 V10 V11 V12 V13 V14 V15 V16
    1  b 30.83 0.000  u  g  w  v 1.25  t   t   1   f   g 202   0   +
    2  a 58.67 4.460  u  g  q  h 3.04  t   t   6   f   g  43 560   +
    3  a 24.50 0.500  u  g  q  h 1.50  t   f   0   f   g 280 824   +
    4  b 27.83 1.540  u  g  w  v 3.75  t   t   5   t   g 100   3   +
    5  b 20.17 5.625  u  g  w  v 1.71  t   f   0   f   s 120   0   +
    6  b 32.08 4.000  u  g  m  v 2.50  t   f   0   t   g 360   0   +
    ## write.csv(dataset,file = "creditCard.csv")

    That is what the data look like. Next, check whether there are missing values and what type each variable is.

    sapply(dataset,function(x) sum(is.na(x)))
     V1  V2  V3  V4  V5  V6  V7  V8  V9 V10 V11 V12 V13 V14 V15 V16 
     12  12   0   6   6   9   9   0   0   0   0   0   0  13   0   0 
    sapply(dataset,class)
           V1        V2        V3        V4        V5        V6        V7        V8        V9       V10 
     "factor" "numeric" "numeric"  "factor"  "factor"  "factor"  "factor" "numeric"  "factor"  "factor" 
          V11       V12       V13       V14       V15       V16 
    "integer"  "factor"  "factor" "integer" "integer"  "factor" 

     

    Train and Test

    Split the data into a training set and a test set: drop the rows with missing values (na.omit), set.seed(123), and use 70% of the data for training and 30% for testing.

    set.seed(123)
    dataset = na.omit(dataset)
    n = dim(dataset)[1]
    index = sample(n,round(0.7*n))
    train = dataset[index,]
    test = dataset[-index,]
    dim(train)
    [1] 457  16
    dim(test)
    [1] 196  16

    Convert the variables into dummy variables

    Sometimes the variables need to be converted into dummy variables, because some mining methods cannot use factor-type data directly (a hedged caret::dummyVars sketch follows these two lists):

    • knn

    • glmnet

    • svm

    • xgboost

    Other mining methods can use factor variables directly, for example:

    • logistic regression

    • rpart

    • GBM

    • randomforest
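
    As an aside, caret's dummyVars can do the same expansion as the hand-rolled into_factor function below; a minimal sketch, assuming caret is loaded and run after the na.omit step above (fullRank = TRUE drops one level per factor, like the model.matrix(...)[,-1] trick used below):

    library(caret)
    ## Build a dummy-variable encoder on the 15 predictors (V16, the target, excluded)
    dv = dummyVars(~ ., data = dataset[, -16], fullRank = TRUE)
    dataset.dummy = as.data.frame(predict(dv, newdata = dataset[, -16]))
    dim(dataset.dummy)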

    dataset2 = dataset
    library(plyr)
    into_factor = function(x){
      
      if(class(x) == "factor"){
      n = length(x)
      data.fac = data.frame(x = x,y = 1:n)
      output = model.matrix(y~x,data.fac)[,-1]
      ## Convert factor into dummy variable matrix
      }else{
        output = x
      ## if x is numeric, output is x
      }
      output
      }
    into_factor(dataset$V4)[1:5,]
      xu xy
    1  1  0
    2  1  0
    3  1  0
    4  1  0
    5  1  0
    dataset2 = colwise(into_factor)(dataset2)
    dataset2 = do.call(cbind,dataset2)
    dataset2 = as.data.frame(dataset2)
    head(dataset2)
      V1    V2    V3 xu xy xgg xp xc xcc xd xe xff xi xj xk xm xq xr xw xx xdd xff xh xj xn xo xv xz
    1  1 30.83 0.000  1  0   0  0  0   0  0  0   0  0  0  0  0  0  0  1  0   0   0  0  0  0  0  1  0
    2  0 58.67 4.460  1  0   0  0  0   0  0  0   0  0  0  0  0  1  0  0  0   0   0  1  0  0  0  0  0
    3  0 24.50 0.500  1  0   0  0  0   0  0  0   0  0  0  0  0  1  0  0  0   0   0  1  0  0  0  0  0
    4  1 27.83 1.540  1  0   0  0  0   0  0  0   0  0  0  0  0  0  0  1  0   0   0  0  0  0  0  1  0
    5  1 20.17 5.625  1  0   0  0  0   0  0  0   0  0  0  0  0  0  0  1  0   0   0  0  0  0  0  1  0
    6  1 32.08 4.000  1  0   0  0  0   0  0  0   0  0  0  0  1  0  0  0  0   0   0  0  0  0  0  1  0
        V8 V9 V10 V11 V12 xp xs V14 V15 V16
    1 1.25  1   1   1   0  0  0 202   0   1
    2 3.04  1   1   6   0  0  0  43 560   1
    3 1.50  1   0   0   0  0  0 280 824   1
    4 3.75  1   1   5   1  0  0 100   3   1
    5 1.71  1   0   0   0  0  1 120   0   1
    6 2.50  1   0   0   1  0  0 360   0   1
    dim(dataset2)
    [1] 653  38
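
    Note that this manual expansion can produce duplicated column names (xff, xj and xp each appear twice in the header above, because different factors share level codes); a quick, optional check and fix in case that ever causes trouble downstream:

    any(duplicated(colnames(dataset2)))   # likely TRUE here, given the repeated names above
    ## colnames(dataset2) = make.names(colnames(dataset2), unique = TRUE)   # uncomment to de-duplicate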

    Logistic Regression

    Fit a logistic regression to model and predict, using the glm function.

    logit.model = glm(V16~.,data = train,family = "binomial")
    logit.response = predict(logit.model,test,type = "response")
    logit.predict = ifelse(logit.response>0.5,"+","-")
    table(logit.predict,test$V16)
    logit.predict  -  +
                - 90 24
                + 13 69
    accurancy1 = mean(logit.predict == test$V16)
    accurancy1
    [1] 0.81122
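
    Beyond raw accuracy at a 0.5 cutoff, the fitted model and its ranking quality can be inspected; a small sketch, assuming the pROC package is installed (it is not used elsewhere in this post):

    summary(logit.model)   # coefficient estimates and significance
    library(pROC)
    logit.roc = roc(response = test$V16, predictor = logit.response)
    auc(logit.roc)         # ranking quality, independent of the 0.5 cutoff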

    GBM

    Use GBM for prediction; here caret with repeated cross-validation (repeatedcv) is used to choose the best model.

    library(caret)

    ctrl = trainControl(method = "repeatedcv", number = 5, repeats = 5)
    set.seed(300)
    m_gbm = train(V16 ~ ., data=train, method = "gbm", metric = "Kappa", trControl = ctrl)

    gbm.predict = predict(m_gbm,test)
    table(gbm.predict,test$V16)
    accurancy2 = mean(gbm.predict == test$V16)
    accurancy2
    [1] 0.85714
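
    caret tunes the GBM grid (number of trees, interaction depth, shrinkage, ...) behind the scenes; the chosen values can be read off the returned train object, assuming m_gbm from above:

    m_gbm$bestTune    # tuning parameters selected by repeated CV
    m_gbm$results     # resampled Accuracy and Kappa for each candidate setting
    plot(m_gbm)       # performance profile across the tuning grid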

    knn method for classification

    knn with k = 5

    This is a model without cross-validation

    First, test a knn model with no CV, no standardization, and no data-type conversion. Here, without converting the data types, the factor variables are discarded and only the numeric variables are kept.

    library(caret)
    knn.model1 = knn3(V16 ~ .,data = train, k = 5)
    knn.response1 = predict(knn.model1,test,type = "prob")
    knn.predict1 = ifelse(knn.response1[,1]<0.5,"+","-")

    table(knn.predict1,test$V16)
                knn.predict1  -  +
               - 78 48
               + 25 45
    mean(knn.predict1 == test$V16)
    [1] 0.62755

     

    knn after scaling

    After standardizing and converting into dummy variables, the accuracy is:

    knn.dataset = cbind(
                    colwise(scale)(dataset2[,-38]),V16 = as.factor(dataset2$V16)
                    )

    set.seed(123)

    index = sample(n,round(0.7*n))

    train.knn = knn.dataset[index,]

    test.knn = knn.dataset[-index,]

    knn.model1 = knn3(V16 ~ .,data = train.knn, k = 5)

     knn.predict1 = predict(knn.model1,test.knn,type = "class")
     table(knn.predict1,test.knn$V16)
                knn.predict1  0  1
               0 89 32
               1 14 61
    mean(knn.predict1 == test.knn$V16)
    [1] 0.76531

     

    knn CV for k

    My own attempt

    Whether with my own function below or with caret, the computation says the error is smallest at k = 2, but that is not what actually happens.

    library(class)
    cv.knn = function(data,n=5,k){
      index = sample(1:5,nrow(data),replace = T)
      acc=0
      for ( i in 1:5){
        ind = index == i
        train = data[!ind,]   # ind is logical, so use !ind (not -ind) to select the training folds
        test = data[ind,]
        knn.model1 = knn3(V16 ~ .,data = train, k = k)  
        knn.predict= predict(knn.model1,test,type = "class") 
        acc[i] = mean(knn.predict == test$V16)
      }
      mean(acc)
    }
    cv.knn(train.knn,3,5)
    [1] 0.8533
    k = 2:20
    set.seed(123)
    acc = sapply(k,function(x) cv.knn(train.knn,3,x))
    plot(k,acc,type = "b")
    k.final = k[which.max(acc)]   # map the position of the best CV accuracy back to the actual k value
    knn.model.f = knn3(V16 ~ .,data = train.knn, k = k.final)
    knn.predict.f = predict(knn.model.f,test.knn,type = "class")
    table(knn.predict.f,test.knn$V16)
                 knn.predict.f  0  1
                0 81 31
                1 22 62
    mean(knn.predict.f == test.knn$V16)
    [1] 0.72959
    library(caret)

    fitControl <- trainControl(method = "cv", number = 10)

    knnTune <- train(x = dataset2[1:37], y = as.factor(dataset2[,38]), method = "knn",
                     preProc = c("center", "scale"),
                     tuneGrid = data.frame(k = 1:20),
                     trControl = fitControl)
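
    The tuning result can then be read off the returned train object; a minimal sketch, assuming knnTune from the call above:

    knnTune$bestTune   # the k selected by 10-fold CV
    knnTune$results    # resampled accuracy for each candidate k
    plot(knnTune)      # accuracy profile over k = 1:20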

     

    Looking directly at a single train/test split:

    k = 5 works best.

    knn_train_test = function(train,test,k =5){
        knn.model.f = knn3(V16 ~ .,data = train, k = k)  
        knn.predict.f = predict(knn.model.f,test,type = "class") 
        mean(knn.predict.f == test$V16)
    }
    x = 1:20
    result = sapply(x, function(x) knn_train_test(train.knn,test.knn,k = x))
    plot(x,result,type = "b")
    k.final = which.max(result)
    accurancy3 = knn_train_test(train.knn,test.knn,k = k.final)
    accurancy3
    [1] 0.75

    xgboost

    Install:

    ## devtools::install_github('dmlc/xgboost',subdir='R-package')
    require(xgboost)

    require(methods)

    require(plyr)

    set.seed(123)

    index = sample(n,round(0.7*n))

    train.xg = dataset2[index,]

    test.xg = dataset2[-index,]

    label <- as.matrix(train.xg[,38,drop =F])

    data <- as.matrix(train.xg[,-38,drop =F])

    data2 <-  as.matrix(test.xg[,-38,drop =F])

    label2 =  as.matrix(test.xg[,38,drop =F])

    # weight <- as.numeric(dtrain[[32]]) * testsize / length(label)

    xgmat <- xgb.DMatrix(data, label = label, missing = -10000)

    param <- list("objective" = "binary:logistic",
                  "bst:eta" = 1,
                  "bst:max_depth" = 2,
                  "eval_metric" = "logloss",
                  "silent" = 1,
                  "nthread" = 16,
                  "min_child_weight" = 1.45)

    nround =275

    bst = xgb.train(param, xgmat, nround )
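
    The number of rounds (275) is fixed by hand here; a hedged sketch of how it could instead be examined with xgboost's built-in cross-validation (the exact return format depends on the xgboost version):

    ## 5-fold CV with the same parameters, to watch how logloss evolves with the number of rounds
    cv = xgb.cv(params = param, data = xgmat, nrounds = nround, nfold = 5, verbose = 0)
    cv   # holds the per-round train/test logloss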

    res1 = predict(bst,data2)

    pre1 = ifelse(res1>0.5,1,0)

    table(pre1,label2)
        label2
    pre1  0  1
       0 91 15
       1 12 78
    accurancy4 = mean(pre1 ==label2)

    accurancy4
    [1] 0.86224
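
    One advantage of xgboost is that variable importance can be extracted from the trained booster; a short sketch, assuming the bst model and the data matrix from above:

    ## Variable importance from the trained booster
    imp = xgb.importance(feature_names = colnames(data), model = bst)
    head(imp)
    ## xgb.plot.importance(imp)   # optional plot; may need extra packages depending on the xgboost version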

    Final Results

    Method                  Accuracy
    logistic regression     0.81122
    GBM                     0.85714
    knn                     0.75
    xgboost                 0.86224
    Data and features set the upper bound on what can be achieved; models and algorithms only determine how closely that bound is approached.
    Original post: https://www.cnblogs.com/payton/p/5340538.html