zoukankan      html  css  js  c++  java
  • R 代码积累

    R 代码积累不定期更新

    1.阶乘、递归、reduce、sprintf

    #NO.1
    # 阶乘函数
    fact <- function(n){
      if(n==0) return(1) #基例在这
      else return(n*fact(n-1))
    }
    
    #1+2!+3!+...+20!的和
    #测试 Reduce('+', lapply(1:3,fact)) 结果是9
    Reduce('+', lapply(1:20,fact))
    
    #NO.2
    #判断101-200之间有多少个素数,并输出所有素数
    is.prime <- function(num) {
      if (num == 2) {
        TRUE
      } else if (any(num %% 2:(num-1) == 0)) {
        FALSE
      } else { 
        TRUE
      }
    }
    #101共有多少个素数
    Reduce('+', lapply(101:200,is.prime))
    #打印出每一个素数
    for(i in 101:200){
      if(!is.prime(i)) next
      print(sprintf('%d 是素数 %s',i,is.prime(i)))
    }
    

    2.MD5加密卡号

    library(magrittr)
    library(digest)
    library(data.table)
    library(readxl)
    library(lubridate)
    
    dat=read_excel('data_union.xlsx',sheet=4)
    dim(dat)
    get_len <- function(a){
      return(paste0(nchar(a),a))
    }
    dat$card_len=lapply(dat$bank_card, get_len)%>%unlist()
    dat$card_md5 <- lapply(dat$card_len, digest, algo="md5",serialize=F)%>%unlist()
    dat$card_md5_upper <- toupper(dat$card_md5)
    write.table(
      dat[,c('card_md5_upper')]
      ,'yqb_card_md5.txt',row.names = F
      ,col.names=F
      ,quote = F)
    
    #ym_apply
    dat$ym_apply=dat$time %m-% months(1)%>%
      format("%Y%m")%>%as.numeric()
    
    #write out card_md5 and ym_apply
    write.table(
      dat[c('card_md5_upper','ym_apply')]
      ,sep=','
      ,'yqb_card_ym_md5.txt',row.names = F
      ,col.names=F
      ,quote = F)
    
    unique(dat$ym_apply)
    

    3.时间函数

    https://cran.r-project.org/web/packages/lubridate/vignettes/lubridate.html

     4.随机森林可视化

    library(randomForest)
    library(kernlab) # spam数据集 来源于这个 包
    library(magrittr)
    library(caret)
    #setwd('...')
    #data <- read.csv('...')
    
    data(spam) #加载数据集
    # spam 是一个数据集,一条记录代表一封邮件包含哪些特殊词
    # ,这封邮件是否是垃圾邮件(y变量)等等一些变量
    View(spam)
    table(spam$type) #看一下Y变量的分布,是否存在不均衡(imbalanced data)
    #type 是y变量 也可以是0,1,但注意,如果你的数据集是 df,  要预先 将y变量转换为factor ,df$y <- as.factor(df$y)
    set.seed(2016) #为使模型可再现,这里预先设定随机种子,因为随机森林的 每棵树对行是进行随机有放回的抽样的
    
    rf <- randomForest(type~.
                       ,data=spam
                       ,importance=TRUE #保留变量的重要性 TURE的时候,可通过rf$importance 查看
                       ,proximity=FALSE #在n棵树种,一个矩阵任意两条记录 落在同一个叶子节点的概率,可以表示两条记录的相似程度
                       ,ntree=500 #种多少棵树
                       ,do.trace=20 #每20条显示 误分率
                       ,na.action=na.omit#一行中只要存在一个NA,这条记录就删掉
                       ,strata=spam$type #按照Y变量中(0,1的比例,进行上下采样,对占比少的用oversampling,对占比多的用downsampling)
                       ,sampsize=c(1500,1500) #对0,1采样的个数分别是多少,都是有放回的
                       ,mtry=7 #每一棵决策树,选取几个feature?,对于classification 一般是feature总数的开平方(这个是default)
                       ,keep.forest=FALSE) #保留每一棵数
    
    rf$confusion #混淆矩阵
    
    ##变量重要性,如果你的数据是有上千个变量,可以根据变量的重要性对数据进行降维
    par(mfrow = c(2,2))
    for(i in 1:4){
      plot(sort(rf$importance[,i],decreasing =TRUE)%>%head(20)
           ,type='h'
           ,main=paste0(colnames(rf$importance)[i])
           ,xlab='variable'
           ,ylab='importance')
      
      text(sort(rf$importance[,i],decreasing =TRUE)
           ,labels=names(sort(rf$importance[,i]))%>%head(20)
           ,pos=1
           ,cex=0.9)
    }
    
    ## 下面画ROC 曲线,计算AUC
    library(ROCR)
    predictions=as.vector(rf$votes[,2])
    pred=prediction(predictions,spam$type)
    
    perf_AUC=performance(pred,"auc") #Calculate the AUC value
    AUC=perf_AUC@y.values[[1]]
    
    perf_ROC=performance(pred,"tpr","fpr") #plot the actual ROC curve
    plot(perf_ROC, main="ROC plot")
    text(0.5,0.5,paste("AUC = ",format(AUC, digits=5, scientific=FALSE)))
    
    #cutoff accuracy
    perf <- performance(pred, measure="acc", x.measure="cutoff")
    # Get the cutoff for the best accuracy
    bestAccInd <- which.max(perf@"y.values"[[1]])
    bestMsg <- paste("best accuracy=", perf@"y.values"[[1]][bestAccInd], 
                     " at cutoff=", round(perf@"x.values"[[1]][bestAccInd], 4))
    plot(perf, sub=bestMsg)
    abline(v=round(perf@"x.values"[[1]][bestAccInd], 4))
    
    # calculate the confusion matrix and plot
    cm <- confusionMatrix(rf$predicted, reference = spam$type)
    draw_confusion_matrix(cm)
    
    #confusion matrix visualization
    draw_confusion_matrix <- function(cm) {
      
      layout(matrix(c(1,1,2)))
      par(mar=c(2,2,2,2))
      plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
      title('CONFUSION MATRIX', cex.main=2)
      
      # create the matrix 
      rect(150, 430, 240, 370, col='#3F97D0')
      text(195, 435,  rf$classes[1], cex=1.2)
      rect(250, 430, 340, 370, col='#F7AD50')
      text(295, 435, rf$classes[2], cex=1.2)
      text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
      text(245, 450, 'Actual', cex=1.3, font=2)
      rect(150, 305, 240, 365, col='#F7AD50')
      rect(250, 305, 340, 365, col='#3F97D0')
      text(140, 400,  rf$classes[1], cex=1.2, srt=90)
      text(140, 335, rf$classes[2], cex=1.2, srt=90)
      
      # add in the cm results 
      res <- as.numeric(cm$table)
      text(195, 400, res[1], cex=1.6, font=2, col='white')
      text(195, 335, res[2], cex=1.6, font=2, col='white')
      text(295, 400, res[3], cex=1.6, font=2, col='white')
      text(295, 335, res[4], cex=1.6, font=2, col='white')
      
      # add in the specifics 
      plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
      text(10, 85, names(cm$byClass[1]), cex=1.2, font=2)
      text(10, 70, round(as.numeric(cm$byClass[1]), 3), cex=1.2)
      text(30, 85, names(cm$byClass[2]), cex=1.2, font=2)
      text(30, 70, round(as.numeric(cm$byClass[2]), 3), cex=1.2)
      text(50, 85, names(cm$byClass[5]), cex=1.2, font=2)
      text(50, 70, round(as.numeric(cm$byClass[5]), 3), cex=1.2)
      text(70, 85, names(cm$byClass[6]), cex=1.2, font=2)
      text(70, 70, round(as.numeric(cm$byClass[6]), 3), cex=1.2)
      text(90, 85, names(cm$byClass[7]), cex=1.2, font=2)
      text(90, 70, round(as.numeric(cm$byClass[7]), 3), cex=1.2)
      
      # add in the accuracy information 
      text(30, 35, names(cm$overall[1]), cex=1.5, font=2)
      text(30, 20, round(as.numeric(cm$overall[1]), 3), cex=1.4)
      text(70, 35, names(cm$overall[2]), cex=1.5, font=2)
      text(70, 20, round(as.numeric(cm$overall[2]), 3), cex=1.4)
    }  
    

     5.一个多线程 爬虫

    ##  99健康网 医院抓取  ##
    
    #step1:by city
    url_0='http://yyk.99.com.cn/city.html'
    content_0=url_0%>%read_html()
    city_nm=c(
      content_0%>%html_nodes('.cityarea li a')%>%html_text(trim=T)
      ,content_0%>%html_nodes('.cityarea .background li a')%>%html_text(trim=T)
    )
    city_url=c(
      content_0%>%html_nodes('.cityarea li a')%>%html_attr('href')%>%paste0('http://yyk.99.com.cn',.)
      ,content_0%>%html_nodes('.cityarea .background li a')%>%html_attr('href')%>%paste0('http://yyk.99.com.cn',.)
    )
    length(city_nm)
    length(city_url)
    
    #step2:get hospital by city
    get_hospital_1 <- function(i){
      # i=1
      content_1=city_url[i]%>%read_html(encoding='gbk')
      hospital_nm=content_1%>%html_nodes('.tablist li span')%>%html_text(trim=T)
      hospital_url=content_1%>%html_nodes('.tablist li a')%>%html_attr('href')%>%paste0(.,'jianjie.html')
      return(data.frame(
        stringsAsFactors = F
        ,city_nm=rep(city_nm[i],length(hospital_nm))
        ,hospital_nm=hospital_nm
        ,hospital_url=hospital_url
      ))
    }
    system.time({
      x <- 301:length(city_url)
      res_99jk_3 <- do.call('rbind',lapply(x,get_hospital_1))
    })
    write.table(res_99jk_1,'res_99jk_1.csv',row.names = F,sep='	')
    write.table(res_99jk_2,'res_99jk_2.csv',row.names = F,sep='	')
    write.table(res_99jk_3,'res_99jk_3.csv',row.names = F,sep='	')
    res_99jk <- rbind(res_99jk_1
                      ,res_99jk_2
                      ,res_99jk_3)
    
    #step3 get all fileds level beds_cnt...
    res_99jk <- read.table('res_99jk.csv',header = T)
    hospital_url_pool=res_99jk$hospital_url%>%unique()
    get_detail <- function(hospital_url){
      tryCatch({
        res=hospital_url%>%read_html(encoding='gbk')%>%html_nodes('.tdr')%>%html_text(trim=T)
        df=data.frame(stringsAsFactors = F
                      ,hospital_url=hospital_url
                      ,nm=res[1]
                      ,addr=res[2]
                      ,hp_admin=res[3]
                      ,start_dt=res[4]
                      ,type=res[5]
                      ,level=res[6]
                      ,section_cnt=res[7]
                      ,doc_cnt=res[8]
                      ,bed_cnt=res[9]
                      ,traffic=res[10]
                      ,insurance=res[11]
                      ,web_site=res[12]
                      ,phone=res[13]
                      ,addr=res[14]
                      ,postcode=res[15])
      },error = function(e) {
        rm(df)
        df=data.frame(stringsAsFactors = F
                      ,hospital_url=hospital_url
                      ,nm=NA
                      ,addr=NA
                      ,hp_admin=NA
                      ,start_dt=NA
                      ,type=NA
                      ,level=NA
                      ,section_cnt=NA
                      ,doc_cnt=NA
                      ,bed_cnt=NA
                      ,traffic=NA
                      ,insurance=NA
                      ,web_site=NA
                      ,phone=NA
                      ,addr=NA
                      ,postcode=NA)
      })
      return(df)
    }
    
    op <- pboptions(type = "timer")
    x <- hospital_url_pool[1:5000]
    system.time(rdf_99jk_1 <- do.call(rbind.data.frame,pblapply(x,get_detail)))
    pboptions(op)
    write.table(rdf_99jk_1,'rdf_99jk_1_5000.csv',row.names = F,sep='	')
    
    ##多线程爬虫
    pkg <- c('RMySQL','DBI','xml2','rvest','magrittr','xml2','stringr','httpuv','R2HTML','pbapply','curl')
    for(i in pkg){
      if(!i %in% installed.packages()[,1]) (install.packages(i))
    }
    
    library(magrittr)
    library(bitops)
    library(rvest)
    library(stringr)
    library(DBI)
    library(RCurl)
    library(curl)
    library(sqldf)
    library(gdata)
    library(xml2)
    library(pbapply)
    library(parallel)
    
    res_99jk <- read.table('res_99jk.csv',header = T,stringsAsFactors = F)
    hospital_url_pool=res_99jk$hospital_url%>%unique()
    get_detail <- function(hospital_url){
      tryCatch({
        res=hospital_url%>%read_html(encoding='gbk')%>%html_nodes('.tdr')%>%html_text(trim=T)
        df=data.frame(stringsAsFactors = F
                      ,hospital_url=hospital_url
                      ,nm=res[1]
                      ,addr=res[2]
                      ,hp_admin=res[3]
                      ,start_dt=res[4]
                      ,type=res[5]
                      ,level=res[6]
                      ,section_cnt=res[7]
                      ,doc_cnt=res[8]
                      ,bed_cnt=res[9]
                      ,traffic=res[10]
                      ,insurance=res[11]
                      ,web_site=res[12]
                      ,phone=res[13]
                      ,addr=res[14]
                      ,postcode=res[15])
      },error = function(e) {
        rm(df)
        df=data.frame(stringsAsFactors = F
                      ,hospital_url=hospital_url
                      ,nm=NA
                      ,addr=NA
                      ,hp_admin=NA
                      ,start_dt=NA
                      ,type=NA
                      ,level=NA
                      ,section_cnt=NA
                      ,doc_cnt=NA
                      ,bed_cnt=NA
                      ,traffic=NA
                      ,insurance=NA
                      ,web_site=NA
                      ,phone=NA
                      ,addr=NA
                      ,postcode=NA)
      })
      return(df)
    }
    
    # 用system.time来返回计算所需时间
    system.time({
      x <- hospital_url_pool[5001:length(hospital_url_pool)]
      cl <- makeCluster(4) # 初始化四核心集群
      clusterEvalQ(cl, library(magrittr))
      clusterEvalQ(cl, library(rvest))
      clusterEvalQ(cl, library(RCurl))
      clusterEvalQ(cl, library(curl))
      clusterEvalQ(cl, library(xml2))
      results <- parLapply(cl,x,get_detail) # lapply的并行版本
      res.df <- do.call('rbind',results) # 整合结果
      stopCluster(cl) # 关闭集群
    })
    
    # res2=do.call(rbind.data.frame,results)
    res2=data.frame(stringsAsFactors = F)
    for(i in 1:length(results)){
      print(paste0('this is ',i))
      if (!identical(dim(results[[i]]),dim(results[[1]]))) next
      res2=rbind(res2,results[[i]])
    }
    

     

  • 相关阅读:
    angularjs的$on、$emit、$broadcast
    angularjs中的路由介绍详解 ui-route(转)
    ionic入门教程-ionic路由详解(state、route、resolve)(转)
    Cocos Creator 加载使用protobuf第三方库,因为加载顺序报错
    Cocos Creator 计时器错误 cc.Scheduler: Illegal target which doesn't have uuid or instanceId.
    Cocos Creator 构造函数传参警告 Can not instantiate CCClass 'Test' with arguments.
    Cocos Creator 对象池NodePool
    Cocos Creator 坐标系 (convertToWorldSpaceAR、convertToNodeSpaceAR)
    Cocos Creator 常驻节点addPersistRootNode
    Cocos Creator 配合Tiled地图的使用
  • 原文地址:https://www.cnblogs.com/litao1105/p/6915068.html
Copyright © 2011-2022 走看看