zoukankan      html  css  js  c++  java
  • RandomForest&ROC

      1 # -*- coding: utf-8 -*-
      2 # __author__ = 'JieYao'
      3 from biocluster.agent import Agent
      4 from biocluster.tool import Tool
      5 import os
      6 import types
      7 import subprocess
      8 from biocluster.core.exceptions import OptionError
      9 
     10 
     11 class RandomforestAgent(Agent):
     12     """
     13     需要RandomForest.pl
     14     version v1.0
     15     author: JieYao
     16     last_modified:2016.07.18
     17     """
     18 
     19     def __init__(self, parent):
     20         super(RandomforestAgent, self).__init__(parent)
     21         options = [
     22             {"name": "otutable", "type": "infile", "format": "meta.otu.otu_table, meta.otu.tax_summary_dir"},
     23             {"name": "level", "type": "string", "default": "otu"},
     24             {"name": "envtable", "type": "infile", "format": "meta.otu.group_table"},
     25             {"name": "envlabs", "type": "string", "default": ""},
     26             {"name": "ntree", "type": "int", "default": 500 },
     27             {"name": "problem_type", "type": "int", "default": 2 },
     28             {"name": "top_number", "type": "int", "default": 50}
     29         ]
     30         self.add_option(options)
     31         self.step.add_steps('RandomforestAnalysis')
     32         self.on('start', self.step_start)
     33         self.on('end', self.step_end)
     34 
     35     def step_start(self):
     36         self.step.RandomforestAnalysis.start()
     37         self.step.update()
     38 
     39     def step_end(self):
     40         self.step.RandomforestAnalysis.finish()
     41         self.step.update()
     42 
     43     def gettable(self):
     44         """
     45         根据输入的otu表和分类水平计算新的otu表
     46         :return:
     47         """
     48         if self.option('otutable').format == "meta.otu.tax_summary_dir":
     49             return self.option('otutable').get_table(self.option('level'))
     50         else:
     51             return self.option('otutable').prop['path']
     52         
     53     def check_options(self):
     54         """
     55         重写参数检查
     56         """
     57         if not self.option('otutable').is_set:
     58             raise OptionError('必须提供otu表')
     59         self.option('otutable').get_info()
     60         if self.option('otutable').prop['sample_num'] < 2:
     61             raise OptionError('otu表的样本数目少于2,不可进行随机森林特征分析')
     62         if self.option('envtable').is_set:
     63             self.option('envtable').get_info()
     64             if self.option('envlabs'):
     65                 labs = self.option('envlabs').split(',')
     66                 for lab in labs:
     67                     if lab not in self.option('envtable').prop['group_scheme']:
     68                         raise OptionError('envlabs中有不在物种(环境因子)表中存在的因子:%s' % lab)
     69             else:
     70                 pass
     71             if len(self.option('envtable').prop['sample']) < 2:
     72                 raise OptionError('物种(环境因子)表的样本数目少于2,不可进行随机森林特征分析')
     73         samplelist = open(self.gettable()).readline().strip().split('	')[1:]
     74         if self.option('envtable').is_set:
     75             self.option('envtable').get_info()
     76             if len(self.option('envtable').prop['sample']) > len(samplelist):
     77                 raise OptionError('OTU表中的样本数量:%s少于物种(环境因子)表中的样本数量:%s' % (len(samplelist),
     78                                   len(self.option('envtable').prop['sample'])))
     79             for sample in self.option('envtable').prop['sample']:
     80                 if sample not in samplelist:
     81                     raise OptionError('物种(环境因子)表的样本中存在OTU表中未知的样本%s' % sample)
     82         table = open(self.gettable())
     83         if len(table.readlines()) < 4 :
     84             raise OptionError('数据表信息少于3行')
     85         table.close()
     86         if self.option('top_number') > self.option('otutable').prop['otu_num']:
     87             self.option('top_number', self.option('otutable').prop['otu_num'])
     88         return True
     89     
     90     def set_resource(self):
     91         """
     92         设置所需资源
     93         """
     94         self._cpu = 2
     95         self._memory = ''
     96     
     97     def end(self):
     98         result_dir = self.add_upload_dir(self.output_dir)
     99         result_dir.add_relpath_rules([
    100             [".", "", "RandomForest分析结果和ROC计算数据输出目录"],
    101             ["./randomforest_confusion_table.xls", "xls", "RandomForest样本分组模拟结果"],
    102             ["./randomforest_mds_sites.xls", "xls", "样本点坐标表"],
    103             ["./randomforest_proximity_table.xls", "xls", "样本相似度临近矩阵"],
    104             ["./randomforest_topx_vimp.xls", "xls", "Top-X物种(环境因子)丰度表"],
    105             ["./randomforest_vimp_table.xls", "xls", "所有物种(环境因子)重要度表"],
    106             ["./randomforest_predicted_answer.xls", "xls", "随机森林预测分组结果表"],
    107             ["./randomforest_votes_probably.xls","xls", "随机森林各样本分组投票预测概率表"],
    108             ["./roc_table.xls", "xls", "ROC数据标准化后数据表"],
    109             ["./roc_point.xls", "xls", "ROC作图坐标点数据表"],
    110             ["./auc.xls", "xls", "ROC折线下方面积值"],
    111         ])
    112         print self.get_upload_files()
    113         super(RandomforestAgent, self).end()
    114 
    115         
    116 class RandomforestTool(Tool):
    117     def __init__(self, config):
    118         super(RandomforestTool, self).__init__(config)
    119         self._version = '1.0.1'
    120         self.cmd_path = self.config.SOFTWARE_DIR + '/bioinfo/meta/scripts/RandomForest_perl.pl'
    121         self.env_table = self.get_new_env()
    122         self.otu_table = self.get_otu_table()
    123     
    124     def get_otu_table(self):
    125         """
    126         根据调用的level参数重构otu表
    127         :return:
    128         """
    129         if self.option('otutable').format == "meta.otu.tax_summary_dir":
    130             otu_path = self.option('otutable').get_table(self.option('level'))
    131         else:
    132             otu_path = self.option('otutable').prop['path']
    133         if self.option('envtable').is_set:
    134             return self.filter_otu_sample(otu_path, self.option('envtable').prop['sample'],
    135                                           os.path.join(self.work_dir, 'temp_filter.otutable'))
    136         else:
    137             return otu_path
    138     
    139     def filter_otu_sample(self, otu_path, filter_samples, newfile):
    140         if not isinstance(filter_samples, types.ListType):
    141             raise Exception('过滤otu表样本的样本名称应为列表')
    142         try:
    143             with open(otu_path, 'rb') as f, open(newfile, 'wb') as w:
    144                 one_line = f.readline()
    145                 all_samples = one_line.rstrip().split('	')[1:]
    146                 if not ((set(all_samples) & set(filter_samples)) == set(filter_samples)):
    147                     raise Exception('提供的过滤样本集合中存在otu表中不存在的样本all:%s,filter_samples:%s' % (all_samples, filter_samples))
    148                 if len(all_samples) == len(filter_samples):
    149                     return otu_path
    150                 samples_index = [all_samples.index(i) + 1 for i in filter_samples]
    151                 w.write('OTU	' + '	'.join(filter_samples) + '
    ')
    152                 for line in f:
    153                     all_values = line.rstrip().split('	')
    154                     new_values = [all_values[0]] + [all_values[i] for i in samples_index]
    155                     w.write('	'.join(new_values) + '
    ')
    156                 return newfile
    157         except IOError:
    158             raise Exception('无法打开OTU相关文件或者文件不存在')
    159 
    160     def get_new_env(self):
    161         """
    162         根据envlabs生成新的envtable
    163         """
    164         if self.option('envlabs'):
    165             new_path = self.work_dir + '/temp_env_table.xls'
    166             self.option('envtable').sub_group(new_path, self.option('envlabs').split(','))
    167             return new_path
    168         else:
    169             return self.option('envtable').path
    170     
    171     def run(self):
    172         """
    173         运行
    174         """
    175         super(RandomforestTool, self).run()
    176         self.run_RandomForest_perl()
    177     
    178     def formattable(self, tablepath):
    179         with open(tablepath) as table:
    180             if table.read(1) == '#':
    181                 newtable = os.path.join(self.work_dir, 'temp_format.table')
    182                 with open(newtable, 'w') as w:
    183                     w.write(table.read())
    184                 return newtable
    185         return tablepath
    186     
    187     def run_RandomForest_perl(self):
    188         """
    189         运行RandomForest.pl
    190         """
    191         real_otu_path = self.formattable(self.otu_table)
    192         cmd = self.config.SOFTWARE_DIR + '/program/perl/perls/perl-5.24.0/bin/perl ' + self.cmd_path
    193         cmd += ' -i %s -o %s' % (real_otu_path, self.work_dir + '/RandomForest')
    194         if self.option('envtable').is_set:
    195             cmd += ' -g %s -m %s' % (self.env_table, self.env_table)
    196         cmd += ' -ntree %s' % (str(self.option('ntree')))
    197         cmd += ' -type %s' % (str(self.option('problem_type')))
    198         cmd += ' -top %s' % (str(self.option('top_number')))
    199         self.logger.info('运行RandomForest_perl.pl程序进行RandomForest计算')
    200         
    201         try:
    202             subprocess.check_output(cmd, shell=True)
    203             self.logger.info('生成 cmd.r 文件成功')
    204         except subprocess.CalledProcessError:
    205             self.logger.info('生成 cmd.r 文件失败')
    206             self.set_error('无法生成 cmd.r 文件')
    207         try:
    208             subprocess.check_output(self.config.SOFTWARE_DIR + 
    209                                     '/program/R-3.3.1/bin/R --restore --no-save < %s/cmd.r' % (self.work_dir + '/RandomForest'), shell=True)
    210             self.logger.info('RandomForest计算成功')
    211         except subprocess.CalledProcessError:
    212             self.logger.info('RandomForest计算失败')
    213             self.set_error('R运行计算RandomForest失败')
    214         self.logger.info('运行RandomForest_perl.pl程序进行RandomForest计算完成')
    215         allfiles = self.get_filesname()        
    216         self.linkfile(self.work_dir + '/RandomForest/' + allfiles[1], 'randomforest_mds_sites.xls')
    217         self.linkfile(self.work_dir + '/RandomForest/' + allfiles[2], 'randomforest_proximity_table.xls')
    218         self.linkfile(self.work_dir + '/RandomForest/' + allfiles[3], 'randomforest_topx_vimp.xls')
    219         self.linkfile(self.work_dir + '/RandomForest/' + allfiles[4], 'randomforest_vimp_table.xls')
    220         if self.option('envtable').is_set:
    221             if allfiles[0] and allfiles[5] and allfiles[6]:
    222                 self.linkfile(self.work_dir + '/RandomForest/' + allfiles[0], 'randomforest_confusion_table.xls')
    223                 self.linkfile(self.work_dir + '/RandomForest/' + allfiles[5], 'randomforest_predicted_answer.xls')
    224                 self.linkfile(self.work_dir + '/RandomForest/' + allfiles[6], 'randomforest_votes_probably.xls')
    225             else:
    226                 self.set_error('按分组计算的文件生成出错')
    227         if not self.option('envtable').is_set:
    228             self.end()
    229         if not (allfiles[5] and allfiles[6]):
    230             self.end()
    231         cmd = self.config.SOFTWARE_DIR + '/program/perl/perls/perl-5.24.0/bin/perl ' + self.config.SOFTWARE_DIR + '/bioinfo/meta/scripts/calc_roc.pl'
    232         cmd += ' -i1 %s' %(self.work_dir + '/RandomForest/randomforest_votes_probably.xls')
    233         cmd += ' -i2 %s' %(self.work_dir + '/RandomForest/randomforest_predicted_answer.xls')
    234         cmd += ' -o %s' %(self.work_dir + '/ROC/')
    235         self.logger.info('开始运行calc_roc.pl计算ROC相关数据')
    236         
    237         try:
    238             subprocess.check_output(cmd, shell=True)
    239             self.logger.info('生成 roc_cmd.r 成功')
    240         except subprocess.CalledProcessError:
    241             self.logger.info('生成 roc_cmd.r 失败')
    242             self.set_error('无法生成 roc_cmd.r 文件')
    243         try:
    244             subprocess.check_output(self.config.SOFTWARE_DIR +
    245                                     '/program/R-3.3.1/bin/R --restore --no-save < %s/roc_cmd.r' % (self.work_dir + '/ROC'), shell=True)
    246             self.logger.info('ROC计算成功')
    247         except subprocess.CalledProcessError:
    248             self.logger.info('ROC计算失败')
    249             self.set_error('R运行计算ROC失败')
    250         self.logger.info('运行calc_roc.pl程序进行ROC计算完成')
    251         allfiles = self.get_roc_filesname()
    252         self.linkfile(self.work_dir + '/ROC/' + allfiles[0], 'roc_table.xls')
    253         self.linkfile(self.work_dir + '/ROC/' + allfiles[1], 'roc_point.xls')
    254         self.linkfile(self.work_dir + '/ROC/' + allfiles[2], 'auc.xls')
    255         self.end()
    256 
    257     def linkfile(self, oldfile, newname):
    258         """
    259         link文件到output文件夹
    260         :param oldfile: 资源文件路径
    261         :param newname: 新的文件名
    262         :return:
    263         """
    264         newpath = os.path.join(self.output_dir, newname)
    265         if os.path.exists(newpath):
    266             os.remove(newpath)
    267         os.link(oldfile, newpath)
    268 
    269     def get_roc_filesname(self):
    270         filelist = os.listdir(self.work_dir + '/ROC')
    271         roc_table_file = None
    272         roc_point_file = None
    273         auc_file = None
    274         for name in filelist:
    275             if 'roc_table.xls' in name:
    276                 roc_table_file = name
    277             elif 'roc_point.xls' in name:
    278                 roc_point_file = name
    279             elif 'auc.xls' in name:
    280                 auc_file = name
    281         if (roc_table_file and roc_point_file and auc_file):
    282             return [roc_table_file, roc_point_file, auc_file]
    283         else:
    284             self.set_error("未知原因,ROC计算结果丢失")
    285     
    286     def get_filesname(self):
    287         filelist = os.listdir(self.work_dir + '/RandomForest')
    288         randomforest_confusion_table_file = None
    289         randomforest_mds_sites_file = None
    290         randomforest_proximity_table_file = None
    291         randomforest_topx_vimp_file = None
    292         randomforest_vimp_table_file = None
    293         randomforest_predicted_answer_file = None
    294         randomforest_votes_probably_file = None
    295         for name in filelist:
    296             if 'randomforest_confusion_table.xls' in name:
    297                 randomforest_confusion_table_file = name
    298             elif 'randomforest_mds_sites.xls' in name:
    299                 randomforest_mds_sites_file = name
    300             elif 'randomforest_proximity_table.xls' in name:
    301                 randomforest_proximity_table_file = name
    302             elif 'randomforest_topx_vimp.xls' in name:
    303                 randomforest_topx_vimp_file = name
    304             elif 'randomforest_vimp_table.xls' in name:
    305                 randomforest_vimp_table_file = name
    306             elif 'randomforest_predicted_answer.xls' in name:
    307                 randomforest_predicted_answer_file = name
    308             elif 'randomforest_votes_probably.xls' in name:
    309                 randomforest_votes_probably_file = name
    310         if (randomforest_mds_sites_file and randomforest_proximity_table_file and 
    311             randomforest_topx_vimp_file and randomforest_vimp_table_file):
    312             if self.option('envtable').is_set:
    313                 if not randomforest_confusion_table_file:
    314                     self.set_error('未知原因,样本分组模拟结果丢失或未生成')
    315                 if not randomforest_predicted_answer_file:
    316                     self.set_error('未知原因,样本分组预测结果文件丢失或未生成')
    317                 if not randomforest_votes_probably_file:
    318                     self.set_error('未知原因,样本分组预测概率表丢失或未生成')
    319             return [randomforest_confusion_table_file, randomforest_mds_sites_file,
    320                     randomforest_proximity_table_file, randomforest_topx_vimp_file,
    321                     randomforest_vimp_table_file, randomforest_predicted_answer_file,
    322                     randomforest_votes_probably_file]
    323         else:
    324             self.set_error('未知原因,数据计算结果丢失或者未生成')
    View Code
      1 #!/mnt/ilustre/users/sanger-dev/app/program/perl/perls/perl-5.24.0/bin/perl
      2 use strict;
      3 use warnings;
      4 use Getopt::Long;
      5 my %opts;
      6 my $VERSION = "V2.20160708";
      7 GetOptions( %opts,"i=s","m=s","o=s","g=s","ntree=i","top=i","type=s");
      8 my $usage = <<"USAGE";
      9        Program : $0   
     10        Discription: Program used to caculate randomforest,with mds plot and importance variables given .  
     11        Version : $VERSION
     12        Contact : jie.yao@majorbio.com
     13        Usage :perl $0 [options]         
     14                         -i      * input otu table file 
     15                         -o      * output dir
     16                         -m      input mapping file if you want set points's color and pch by groups. If omitted, randomForest will run in unsupervised mode.
     17                 Default:none
     18                         -g      group name in mapping file .Default:none
     19                         -ntree    Number of trees to grow. This should not be set to too     small a number, to ensure that every input row gets predicted at least a few times.Default:500    
     20                         -top    How many variables to show? 
     21                         -type     either 1,2 or 3, specifying the type of importance measure (1=mean decrease in accuracy, 2=mean decrease in node impurity).
     22             
     23        Example:$0 -i otu_table.xls -o randomForest  -m group -g group  
     24 
     25 USAGE
     26 
     27 die $usage if(!($opts{i}&&$opts{o}));
     28 die $usage if($opts{m}&& !$opts{g});
     29 die $usage if(!$opts{m}&& $opts{g});
     30 
     31 $opts{m}=defined $opts{m}?$opts{m}:"none";
     32 $opts{g}=defined $opts{g}?$opts{g}:"none";
     33 $opts{ntree}=defined $opts{ntree}?$opts{ntree}:"500";
     34 $opts{type}=defined $opts{type}?$opts{type}:"1";
     35 $opts{top}=defined $opts{top}?$opts{top}:"50";
     36 
     37 
     38 if(! -e $opts{o}){
     39                 `mkdir $opts{o}`;
     40 }
     41 
     42 
     43 open CMD,">$opts{o}/cmd.r";
     44 print CMD "
     45 library(sp,warn.conflicts = F)
     46 library(randomForest,warn.conflicts = F)
     47 library(maptools,warn.conflicts = F)
     48 basename="randomforest"
     49 
     50 
     51 # if read otu data
     52 otu <-read.table("$opts{i}",sep="\t",head=T,check.names = F)
     53 rownames(otu) <-as.factor(otu[,1])
     54 otu <-otu[,-1]
     55 rownames(otu) <-sapply(rownames(otu),function(x) gsub("_*{.+}"," ",x,perl = TRUE))
     56 rownames(otu) <-sapply(rownames(otu),function(x) gsub("-","_",x,perl = TRUE))
     57 rownames(otu) <-sapply(rownames(otu),function(x) gsub("\\[","",x,perl = TRUE))
     58 rownames(otu) <-sapply(rownames(otu),function(x) gsub("\\]","",x,perl = TRUE))
     59 rownames(otu) <-sapply(rownames(otu),function(x) gsub("\\(","",x,perl = TRUE))
     60 rownames(otu) <-sapply(rownames(otu),function(x) gsub("\\)","",x,perl = TRUE))
     61 rownames(otu) <-sapply(rownames(otu),function(x) gsub("^[0-9]","X\\1",x,perl = TRUE))
     62 rownames(otu) <-sapply(rownames(otu),function(x) gsub("/","",x,perl = TRUE))
     63 otu <-as.data.frame(t(otu),stringsAsFactors=T)
     64 
     65 map="$opts{m}"
     66 if(map !="none"){
     67                 sd <-read.table("$opts{m}",head=T,sep="\t",comment.char = "",check.names = FALSE)        
     68                 rownames(sd) <- as.character(sd[,1])
     69                 sd[,1] <-as.character(sd[,1])
     70                 sd$group <-as.factor(sd$group )
     71                 legend <- as.matrix(unique(sd$group)) 
     72 }
     73 
     74 set.seed(1)
     75 if(map != "none"){
     76 otu.rf <- randomForest(sd$group ~ .,otu,importance=T,proximity=T,ntree=$opts{ntree})
     77 
     78 
     79 
     80 class_count <-as.matrix(table(sd$group))
     81 class <-data.frame(count=class_count)
     82 
     83 ##randomforest votes probably
     84 votes_probably<- paste("$opts{o}/",basename,"_votes_probably.xls",sep="")
     85 write.table(otu.rf$votes,votes_probably,sep="\t",quote=F)
     86 
     87 ##randomforest predicted answer
     88 predicted_answer <- paste("$opts{o}/",basename,"_predicted_answer.xls",sep="")
     89 write.table(otu.rf$predicted,predicted_answer,sep="\t",quote=F)
     90 
     91 ##randomforest classification table
     92 rf_table <- paste("$opts{o}/",basename,"_confusion_table.xls",sep="")
     93 write.table(otu.rf$confusion,rf_table,sep="\t",quote=F)
     94 mds <- cmdscale(1-otu.rf$proximity)  
     95 }else{
     96 otu.rf <- randomForest(otu,importance=T,proximity=T,ntree=$opts{ntree})
     97 mds <- cmdscale(1-otu.rf$proximity)
     98 }
     99 
    100 
    101 ##mds points
    102 mds_points <- paste("$opts{o}/",basename,"_mds_sites.xls",sep="")
    103 write.table(mds,mds_points,sep="\t",quote=F)
    104 
    105 ##proximity table
    106 proximity <- paste("$opts{o}/",basename,"_proximity_table.xls",sep="")
    107 write.table(otu.rf$proximity,proximity,sep="\t",quote=F)
    108 
    109 ## importance table
    110 vimp_table <- paste("$opts{o}/",basename,"_vimp_table.xls",sep="")
    111 write.table(otu.rf$importance,vimp_table,sep="\t",quote=F)
    112 
    113 
    114 ## top importance species table
    115 topx_vimp <- paste("$opts{o}/",basename,"_topx_vimp.xls",sep="")
    116 imp <- importance(otu.rf)
    117 if($opts{type} == 1){
    118     top <- imp[order(imp[,"MeanDecreaseAccuracy"],decreasing=T),][1:min($opts{top},length(imp[,1])),]
    119     write.table(t(otu)[rownames(top),],topx_vimp,sep="\t",quote=F)
    120     
    121 }else if ($opts{type} == 2){
    122     top <- imp[order(imp[,"MeanDecreaseGini"],decreasing=T),][1:min($opts{top},length(imp[,1])),]
    123         write.table(t(otu)[rownames(top),],topx_vimp,sep="\t",quote=F)
    124 }
    125 
    126 ";
    127 
    128 `R --restore --no-save < $opts{o}/cmd.r`;
    View Code
  • 相关阅读:
    P3396 哈希冲突 分块
    大数据之路week01--自学之面向对象java(static,this指针(初稿))
    大数据之路week01--自学之集合_2(列表迭代器 ListIterator)
    大数据之路week01--自学之集合_2(List)
    大数据之路week01--自学之集合_2(Iterator迭代器)
    大数据之路week01--自学之集合_1(Collection)
    大数据之路day05_1--初识类、对象
    大数据之路day04_2--经典bug(equals与==比较不同,break的跳出不同)
    大数据之路day04_1--数组 and for循环进阶
    eclipse断点的使用---for循环举例
  • 原文地址:https://www.cnblogs.com/neverforget/p/5784965.html
Copyright © 2011-2022 走看看