zoukankan      html  css  js  c++  java
  • R 代码积累

    R 代码积累不定期更新

    1.阶乘、递归、reduce、sprintf

    #NO.1
    # 阶乘函数
    fact <- function(n){
      if(n==0) return(1) #基例在这
      else return(n*fact(n-1))
    }
    
    #1+2!+3!+...+20!的和
    #测试 Reduce('+', lapply(1:3,fact)) 结果是9
    Reduce('+', lapply(1:20,fact))
    
    #NO.2
    #判断101-200之间有多少个素数,并输出所有素数
    is.prime <- function(num) {
      if (num == 2) {
        TRUE
      } else if (any(num %% 2:(num-1) == 0)) {
        FALSE
      } else { 
        TRUE
      }
    }
    #101共有多少个素数
    Reduce('+', lapply(101:200,is.prime))
    #打印出每一个素数
    for(i in 101:200){
      if(!is.prime(i)) next
      print(sprintf('%d 是素数 %s',i,is.prime(i)))
    }
    

    2.MD5加密卡号

    library(magrittr)
    library(digest)
    library(data.table)
    library(readxl)
    library(lubridate)
    
    dat=read_excel('data_union.xlsx',sheet=4)
    dim(dat)
    get_len <- function(a){
      return(paste0(nchar(a),a))
    }
    dat$card_len=lapply(dat$bank_card, get_len)%>%unlist()
    dat$card_md5 <- lapply(dat$card_len, digest, algo="md5",serialize=F)%>%unlist()
    dat$card_md5_upper <- toupper(dat$card_md5)
    write.table(
      dat[,c('card_md5_upper')]
      ,'yqb_card_md5.txt',row.names = F
      ,col.names=F
      ,quote = F)
    
    #ym_apply
    dat$ym_apply=dat$time %m-% months(1)%>%
      format("%Y%m")%>%as.numeric()
    
    #write out card_md5 and ym_apply
    write.table(
      dat[c('card_md5_upper','ym_apply')]
      ,sep=','
      ,'yqb_card_ym_md5.txt',row.names = F
      ,col.names=F
      ,quote = F)
    
    unique(dat$ym_apply)
    

    3.时间函数

    https://cran.r-project.org/web/packages/lubridate/vignettes/lubridate.html

     4.随机森林可视化

    library(randomForest)
    library(kernlab) # spam数据集 来源于这个 包
    library(magrittr)
    library(caret)
    #setwd('...')
    #data <- read.csv('...')
    
    data(spam) #加载数据集
    # spam 是一个数据集,一条记录代表一封邮件包含哪些特殊词
    # ,这封邮件是否是垃圾邮件(y变量)等等一些变量
    View(spam)
    table(spam$type) #看一下Y变量的分布,是否存在不均衡(imbalanced data)
    #type 是y变量 也可以是0,1,但注意,如果你的数据集是 df,  要预先 将y变量转换为factor ,df$y <- as.factor(df$y)
    set.seed(2016) #为使模型可再现,这里预先设定随机种子,因为随机森林的 每棵树对行是进行随机有放回的抽样的
    
    rf <- randomForest(type~.
                       ,data=spam
                       ,importance=TRUE #保留变量的重要性 TURE的时候,可通过rf$importance 查看
                       ,proximity=FALSE #在n棵树种,一个矩阵任意两条记录 落在同一个叶子节点的概率,可以表示两条记录的相似程度
                       ,ntree=500 #种多少棵树
                       ,do.trace=20 #每20条显示 误分率
                       ,na.action=na.omit#一行中只要存在一个NA,这条记录就删掉
                       ,strata=spam$type #按照Y变量中(0,1的比例,进行上下采样,对占比少的用oversampling,对占比多的用downsampling)
                       ,sampsize=c(1500,1500) #对0,1采样的个数分别是多少,都是有放回的
                       ,mtry=7 #每一棵决策树,选取几个feature?,对于classification 一般是feature总数的开平方(这个是default)
                       ,keep.forest=FALSE) #保留每一棵数
    
    rf$confusion #混淆矩阵
    
    ##变量重要性,如果你的数据是有上千个变量,可以根据变量的重要性对数据进行降维
    par(mfrow = c(2,2))
    for(i in 1:4){
      plot(sort(rf$importance[,i],decreasing =TRUE)%>%head(20)
           ,type='h'
           ,main=paste0(colnames(rf$importance)[i])
           ,xlab='variable'
           ,ylab='importance')
      
      text(sort(rf$importance[,i],decreasing =TRUE)
           ,labels=names(sort(rf$importance[,i]))%>%head(20)
           ,pos=1
           ,cex=0.9)
    }
    
    ## 下面画ROC 曲线,计算AUC
    library(ROCR)
    predictions=as.vector(rf$votes[,2])
    pred=prediction(predictions,spam$type)
    
    perf_AUC=performance(pred,"auc") #Calculate the AUC value
    AUC=perf_AUC@y.values[[1]]
    
    perf_ROC=performance(pred,"tpr","fpr") #plot the actual ROC curve
    plot(perf_ROC, main="ROC plot")
    text(0.5,0.5,paste("AUC = ",format(AUC, digits=5, scientific=FALSE)))
    
    #cutoff accuracy
    perf <- performance(pred, measure="acc", x.measure="cutoff")
    # Get the cutoff for the best accuracy
    bestAccInd <- which.max(perf@"y.values"[[1]])
    bestMsg <- paste("best accuracy=", perf@"y.values"[[1]][bestAccInd], 
                     " at cutoff=", round(perf@"x.values"[[1]][bestAccInd], 4))
    plot(perf, sub=bestMsg)
    abline(v=round(perf@"x.values"[[1]][bestAccInd], 4))
    
    # calculate the confusion matrix and plot
    cm <- confusionMatrix(rf$predicted, reference = spam$type)
    draw_confusion_matrix(cm)
    
    #confusion matrix visualization
    draw_confusion_matrix <- function(cm) {
      
      layout(matrix(c(1,1,2)))
      par(mar=c(2,2,2,2))
      plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
      title('CONFUSION MATRIX', cex.main=2)
      
      # create the matrix 
      rect(150, 430, 240, 370, col='#3F97D0')
      text(195, 435,  rf$classes[1], cex=1.2)
      rect(250, 430, 340, 370, col='#F7AD50')
      text(295, 435, rf$classes[2], cex=1.2)
      text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
      text(245, 450, 'Actual', cex=1.3, font=2)
      rect(150, 305, 240, 365, col='#F7AD50')
      rect(250, 305, 340, 365, col='#3F97D0')
      text(140, 400,  rf$classes[1], cex=1.2, srt=90)
      text(140, 335, rf$classes[2], cex=1.2, srt=90)
      
      # add in the cm results 
      res <- as.numeric(cm$table)
      text(195, 400, res[1], cex=1.6, font=2, col='white')
      text(195, 335, res[2], cex=1.6, font=2, col='white')
      text(295, 400, res[3], cex=1.6, font=2, col='white')
      text(295, 335, res[4], cex=1.6, font=2, col='white')
      
      # add in the specifics 
      plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
      text(10, 85, names(cm$byClass[1]), cex=1.2, font=2)
      text(10, 70, round(as.numeric(cm$byClass[1]), 3), cex=1.2)
      text(30, 85, names(cm$byClass[2]), cex=1.2, font=2)
      text(30, 70, round(as.numeric(cm$byClass[2]), 3), cex=1.2)
      text(50, 85, names(cm$byClass[5]), cex=1.2, font=2)
      text(50, 70, round(as.numeric(cm$byClass[5]), 3), cex=1.2)
      text(70, 85, names(cm$byClass[6]), cex=1.2, font=2)
      text(70, 70, round(as.numeric(cm$byClass[6]), 3), cex=1.2)
      text(90, 85, names(cm$byClass[7]), cex=1.2, font=2)
      text(90, 70, round(as.numeric(cm$byClass[7]), 3), cex=1.2)
      
      # add in the accuracy information 
      text(30, 35, names(cm$overall[1]), cex=1.5, font=2)
      text(30, 20, round(as.numeric(cm$overall[1]), 3), cex=1.4)
      text(70, 35, names(cm$overall[2]), cex=1.5, font=2)
      text(70, 20, round(as.numeric(cm$overall[2]), 3), cex=1.4)
    }  
    

     5.一个多线程 爬虫

    ##  99健康网 医院抓取  ##
    
    #step1:by city
    url_0='http://yyk.99.com.cn/city.html'
    content_0=url_0%>%read_html()
    city_nm=c(
      content_0%>%html_nodes('.cityarea li a')%>%html_text(trim=T)
      ,content_0%>%html_nodes('.cityarea .background li a')%>%html_text(trim=T)
    )
    city_url=c(
      content_0%>%html_nodes('.cityarea li a')%>%html_attr('href')%>%paste0('http://yyk.99.com.cn',.)
      ,content_0%>%html_nodes('.cityarea .background li a')%>%html_attr('href')%>%paste0('http://yyk.99.com.cn',.)
    )
    length(city_nm)
    length(city_url)
    
    #step2:get hospital by city
    get_hospital_1 <- function(i){
      # i=1
      content_1=city_url[i]%>%read_html(encoding='gbk')
      hospital_nm=content_1%>%html_nodes('.tablist li span')%>%html_text(trim=T)
      hospital_url=content_1%>%html_nodes('.tablist li a')%>%html_attr('href')%>%paste0(.,'jianjie.html')
      return(data.frame(
        stringsAsFactors = F
        ,city_nm=rep(city_nm[i],length(hospital_nm))
        ,hospital_nm=hospital_nm
        ,hospital_url=hospital_url
      ))
    }
    system.time({
      x <- 301:length(city_url)
      res_99jk_3 <- do.call('rbind',lapply(x,get_hospital_1))
    })
    write.table(res_99jk_1,'res_99jk_1.csv',row.names = F,sep='	')
    write.table(res_99jk_2,'res_99jk_2.csv',row.names = F,sep='	')
    write.table(res_99jk_3,'res_99jk_3.csv',row.names = F,sep='	')
    res_99jk <- rbind(res_99jk_1
                      ,res_99jk_2
                      ,res_99jk_3)
    
    #step3 get all fileds level beds_cnt...
    res_99jk <- read.table('res_99jk.csv',header = T)
    hospital_url_pool=res_99jk$hospital_url%>%unique()
    get_detail <- function(hospital_url){
      tryCatch({
        res=hospital_url%>%read_html(encoding='gbk')%>%html_nodes('.tdr')%>%html_text(trim=T)
        df=data.frame(stringsAsFactors = F
                      ,hospital_url=hospital_url
                      ,nm=res[1]
                      ,addr=res[2]
                      ,hp_admin=res[3]
                      ,start_dt=res[4]
                      ,type=res[5]
                      ,level=res[6]
                      ,section_cnt=res[7]
                      ,doc_cnt=res[8]
                      ,bed_cnt=res[9]
                      ,traffic=res[10]
                      ,insurance=res[11]
                      ,web_site=res[12]
                      ,phone=res[13]
                      ,addr=res[14]
                      ,postcode=res[15])
      },error = function(e) {
        rm(df)
        df=data.frame(stringsAsFactors = F
                      ,hospital_url=hospital_url
                      ,nm=NA
                      ,addr=NA
                      ,hp_admin=NA
                      ,start_dt=NA
                      ,type=NA
                      ,level=NA
                      ,section_cnt=NA
                      ,doc_cnt=NA
                      ,bed_cnt=NA
                      ,traffic=NA
                      ,insurance=NA
                      ,web_site=NA
                      ,phone=NA
                      ,addr=NA
                      ,postcode=NA)
      })
      return(df)
    }
    
    op <- pboptions(type = "timer")
    x <- hospital_url_pool[1:5000]
    system.time(rdf_99jk_1 <- do.call(rbind.data.frame,pblapply(x,get_detail)))
    pboptions(op)
    write.table(rdf_99jk_1,'rdf_99jk_1_5000.csv',row.names = F,sep='	')
    
    ##多线程爬虫
    pkg <- c('RMySQL','DBI','xml2','rvest','magrittr','xml2','stringr','httpuv','R2HTML','pbapply','curl')
    for(i in pkg){
      if(!i %in% installed.packages()[,1]) (install.packages(i))
    }
    
    library(magrittr)
    library(bitops)
    library(rvest)
    library(stringr)
    library(DBI)
    library(RCurl)
    library(curl)
    library(sqldf)
    library(gdata)
    library(xml2)
    library(pbapply)
    library(parallel)
    
    res_99jk <- read.table('res_99jk.csv',header = T,stringsAsFactors = F)
    hospital_url_pool=res_99jk$hospital_url%>%unique()
    get_detail <- function(hospital_url){
      tryCatch({
        res=hospital_url%>%read_html(encoding='gbk')%>%html_nodes('.tdr')%>%html_text(trim=T)
        df=data.frame(stringsAsFactors = F
                      ,hospital_url=hospital_url
                      ,nm=res[1]
                      ,addr=res[2]
                      ,hp_admin=res[3]
                      ,start_dt=res[4]
                      ,type=res[5]
                      ,level=res[6]
                      ,section_cnt=res[7]
                      ,doc_cnt=res[8]
                      ,bed_cnt=res[9]
                      ,traffic=res[10]
                      ,insurance=res[11]
                      ,web_site=res[12]
                      ,phone=res[13]
                      ,addr=res[14]
                      ,postcode=res[15])
      },error = function(e) {
        rm(df)
        df=data.frame(stringsAsFactors = F
                      ,hospital_url=hospital_url
                      ,nm=NA
                      ,addr=NA
                      ,hp_admin=NA
                      ,start_dt=NA
                      ,type=NA
                      ,level=NA
                      ,section_cnt=NA
                      ,doc_cnt=NA
                      ,bed_cnt=NA
                      ,traffic=NA
                      ,insurance=NA
                      ,web_site=NA
                      ,phone=NA
                      ,addr=NA
                      ,postcode=NA)
      })
      return(df)
    }
    
    # 用system.time来返回计算所需时间
    system.time({
      x <- hospital_url_pool[5001:length(hospital_url_pool)]
      cl <- makeCluster(4) # 初始化四核心集群
      clusterEvalQ(cl, library(magrittr))
      clusterEvalQ(cl, library(rvest))
      clusterEvalQ(cl, library(RCurl))
      clusterEvalQ(cl, library(curl))
      clusterEvalQ(cl, library(xml2))
      results <- parLapply(cl,x,get_detail) # lapply的并行版本
      res.df <- do.call('rbind',results) # 整合结果
      stopCluster(cl) # 关闭集群
    })
    
    # res2=do.call(rbind.data.frame,results)
    res2=data.frame(stringsAsFactors = F)
    for(i in 1:length(results)){
      print(paste0('this is ',i))
      if (!identical(dim(results[[i]]),dim(results[[1]]))) next
      res2=rbind(res2,results[[i]])
    }
    

     

  • 相关阅读:
    【JAVA基础】String 类
    【Java基础】ArrayList类
    【Java基础】Scanner类
    【Java基础】类--------封装
    spring boot默认访问静态资源
    spring boot整合servlet、filter、Listener等组件方式
    在web项目中搭建一个spring mvc + spring + mybatis的环境
    spring mvc注解版01
    spring mvc简单介绍xml版
    服务器和客户端的理解
  • 原文地址:https://www.cnblogs.com/litao1105/p/6915068.html
Copyright © 2011-2022 走看看