  • Playing a "character prediction" (credit scoring) competition - code notes

    Training with xgboost; the code is below.

    # set the working directory and load packages
    setwd('/Users/litao/R/eXtreme Gradient Boosting/eXtreme Gradient Boosting/比赛来一发数据集')
    library(xgboost)
    library(magrittr)
    library(Matrix)
    library(dplyr)
    # step 1:loading data
    train=read.csv('train_x.csv')
    test=read.csv('test_x.csv')
    train.y=read.csv('train_y.csv')
    ft=read.csv('features_type.csv')
    # step 2: build row indices for the train and test sets, so the combined data can be split back later
    train.index <- seq(1,nrow(train),1)
    test.index <- seq(nrow(train)+1, nrow(train)+nrow(test), 1)
    #combine train and test
    traintest.combine <- rbind(train,test)%>%cbind(index=c(train.index,test.index),.)
    # store all feature names in the vector fea
    fea <- unique(ft[,1])
    # convert data types: categorical variables become factors
    for(f in fea){
        if(ft[which(ft$feature==f),2]=='category') 
            traintest.combine[,f] <- as.factor(traintest.combine[,f])
    }
    # check that the converted column types match the types declared in ft
    # str(traintest.combine, list.len=ncol(traintest.combine))
    # step 3: expand categorical variables into a sparse one-hot design matrix
    df <- traintest.combine
    res <- do.call('cbind',
                   lapply(names(df), function(x) model.matrix(as.formula(paste0(' ~',x,'-1')), df[x])))
    # drop the dummy columns generated for the -1 level of the categorical variables
    X <- colnames(res)
    ol <- grep(glob2rx("*-1"), X)
    dat <- Matrix(res[,-ol],sparse=T)
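    # (side note, not in the original post: Matrix::sparse.model.matrix(~ . - 1, data=df)
    #  builds a sparse design matrix in one call, although its contrast handling for
    #  factors after the first one differs from the per-column one-hot encoding above)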
    # step 4: modeling
    dtrain=xgb.DMatrix(data=dat[train.index,c(-1,-2)],label=train.y$y)
    dtest=xgb.DMatrix(data=dat[test.index,c(-1,-2)])
    set.seed(1)
    model100=xgboost(  booster='gbtree',
                       objective='binary:logistic',
                       scale_pos_weight=1542/13458,
                       gamma=0,
                       lambda=700,
                       subsample=0.7,
                       colsample_bytree=0.30,
                       min_child_weight=5,
                       max_depth=8,
                       eta=0.01,
                       data=dtrain,
                       nrounds=3820,
                       eval_metric='auc',
                       nthread=4)
    pred=predict(model100,dtest)
    write.csv(data.frame('uid'=test['uid'],'score'=pred),file='submit100.csv',row.names=F)
    head(data.frame('uid'=test[,1],'score'=pred))
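
    The number of boosting rounds above (nrounds=3820) is fixed by hand. One way to arrive at such a value is k-fold cross-validation with xgb.cv. Below is a minimal sketch reusing dtrain and the same parameters; the 5-fold split, seed, round cap and early-stopping window are assumed values, not taken from the original run.

    params <- list(booster='gbtree',
                   objective='binary:logistic',
                   scale_pos_weight=1542/13458,
                   gamma=0,
                   lambda=700,
                   subsample=0.7,
                   colsample_bytree=0.30,
                   min_child_weight=5,
                   max_depth=8,
                   eta=0.01,
                   eval_metric='auc')
    set.seed(1)
    cv <- xgb.cv(params=params, data=dtrain, nrounds=5000, nfold=5,
                 early_stopping_rounds=50, maximize=TRUE, nthread=4, verbose=1)
    # with early stopping, recent versions of xgboost expose cv$best_iteration,
    # which can then be used as nrounds in the final xgboost() call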
    

    Training with a random forest; the code is below.

    # how to calculate AUC in R?
    # http://stackoverflow.com/questions/4903092/calculate-auc-in-r
    if(!'ROCR' %in% installed.packages()[,1]) (install.packages('ROCR'))
    library(ROCR)
    library(randomForest)
    library(e1071)
    library(gbm)
    library(xgboost)
    library(data.table)
    library(magrittr)
    library(stringr)
    library(foreach)
    # randomForest 
    # step 1: load data into R and convert data type by batch
    setwd('/Users/litao/R/eXtreme Gradient Boosting/eXtreme Gradient Boosting/比赛来一发数据集')
    list.files()
    features_type <- read.csv('features_type.csv')
    train_x <- fread('train_x.csv',header = TRUE)%>%as.data.frame()
    train_y <- fread('train_y.csv',header = TRUE)%>%as.data.frame()
    train_y$y <- as.factor(train_y$y)
    test_x <- fread('test_x.csv',header = TRUE)%>%as.data.frame()
    # for train_x, convert category into factor by batch.
    for(i in 1:1138){
        if(features_type[i,2]=='category') 
            train_x[,i+1] <- as.factor(train_x[,i+1])
    }
    # for test_x, convert category into factor by batch
    for(i in 1:1138){
        if(features_type[i,2]=='category') 
            test_x[,i+1] <- as.factor(test_x[,i+1])
    }
    # align the factor levels of test_x with train_x by re-encoding
    # (assigning levels(train_x) directly would silently relabel the categories)
    for(i in 1:1138){
        if(features_type[i,2]=='category') 
            test_x[,i+1] <- factor(test_x[,i+1], levels=levels(train_x[,i+1]))
    }
    # step 2: are there any missing values in train_x?
    ## calculate the missing-value ratio for each column
    missingvalue.ratio <- function(df){
        df <- as.data.frame(df)
        res <- is.na(df)%>%colSums()/length(df[,1])
        return(res)
    }
    missingvalue.ratio(train_x)
    ## stratified sampling with replacement: down-sample the majority class, up-sample the minority class
    dat <- cbind(y=train_y[,2],train_x[,-1])
    set.seed(12)
    #---- 5000 trees
    train.rf.1000 <- randomForest(y~.,data=dat
                              ,mtry=34
                              ,ntree=5000
                              ,sampsize=c(1542,5000)
                              ,strata=dat$y
                              ,do.trace=1
                              ,nodesize=2
    )
    # calculate AUC in randomForest
    library(ROCR)
    calculate.auc <- function(rf_output,target){
        # out-of-bag votes for the positive class
        predictions=as.vector(rf_output$votes[,2])
        pred=prediction(predictions,target)
        
        perf_AUC=performance(pred,"auc") #Calculate the AUC value
        AUC=perf_AUC@y.values[[1]]
        
        perf_ROC=performance(pred,"tpr","fpr") #plot the actual ROC curve
        plot(perf_ROC, main="ROC plot")
        text(0.5,0.5,paste("AUC = ",format(AUC, digits=5, scientific=FALSE)))
        return(AUC)
    }
    calculate.auc(train.rf.1000, dat$y)
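
    The code above only evaluates the forest out of bag. Below is a minimal sketch of scoring the test set with the fitted forest and writing a submission in the same uid/score format as the xgboost model; the file name submit_rf.csv is an assumed name, and test_x must carry the same feature columns with no missing values.

    # predicted probability of class 1 on the test set (drop the uid column)
    rf.pred <- predict(train.rf.1000, newdata=test_x[,-1], type='prob')[,2]
    write.csv(data.frame(uid=test_x[,1], score=rf.pred),
              file='submit_rf.csv', row.names=FALSE)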
  • Original post: https://www.cnblogs.com/litao1105/p/5120211.html