1 # -*- coding: utf-8 -*- 2 # __author__ = 'JieYao' 3 from biocluster.agent import Agent 4 from biocluster.tool import Tool 5 import os 6 import types 7 import subprocess 8 from biocluster.core.exceptions import OptionError 9 10 11 class RandomforestAgent(Agent): 12 """ 13 需要RandomForest.pl 14 version v1.0 15 author: JieYao 16 last_modified:2016.07.18 17 """ 18 19 def __init__(self, parent): 20 super(RandomforestAgent, self).__init__(parent) 21 options = [ 22 {"name": "otutable", "type": "infile", "format": "meta.otu.otu_table, meta.otu.tax_summary_dir"}, 23 {"name": "level", "type": "string", "default": "otu"}, 24 {"name": "envtable", "type": "infile", "format": "meta.otu.group_table"}, 25 {"name": "envlabs", "type": "string", "default": ""}, 26 {"name": "ntree", "type": "int", "default": 500 }, 27 {"name": "problem_type", "type": "int", "default": 2 }, 28 {"name": "top_number", "type": "int", "default": 50} 29 ] 30 self.add_option(options) 31 self.step.add_steps('RandomforestAnalysis') 32 self.on('start', self.step_start) 33 self.on('end', self.step_end) 34 35 def step_start(self): 36 self.step.RandomforestAnalysis.start() 37 self.step.update() 38 39 def step_end(self): 40 self.step.RandomforestAnalysis.finish() 41 self.step.update() 42 43 def gettable(self): 44 """ 45 根据输入的otu表和分类水平计算新的otu表 46 :return: 47 """ 48 if self.option('otutable').format == "meta.otu.tax_summary_dir": 49 return self.option('otutable').get_table(self.option('level')) 50 else: 51 return self.option('otutable').prop['path'] 52 53 def check_options(self): 54 """ 55 重写参数检查 56 """ 57 if not self.option('otutable').is_set: 58 raise OptionError('必须提供otu表') 59 self.option('otutable').get_info() 60 if self.option('otutable').prop['sample_num'] < 2: 61 raise OptionError('otu表的样本数目少于2,不可进行随机森林特征分析') 62 if self.option('envtable').is_set: 63 self.option('envtable').get_info() 64 if self.option('envlabs'): 65 labs = self.option('envlabs').split(',') 66 for lab in labs: 67 if lab not in self.option('envtable').prop['group_scheme']: 68 raise OptionError('envlabs中有不在物种(环境因子)表中存在的因子:%s' % lab) 69 else: 70 pass 71 if len(self.option('envtable').prop['sample']) < 2: 72 raise OptionError('物种(环境因子)表的样本数目少于2,不可进行随机森林特征分析') 73 samplelist = open(self.gettable()).readline().strip().split(' ')[1:] 74 if self.option('envtable').is_set: 75 self.option('envtable').get_info() 76 if len(self.option('envtable').prop['sample']) > len(samplelist): 77 raise OptionError('OTU表中的样本数量:%s少于物种(环境因子)表中的样本数量:%s' % (len(samplelist), 78 len(self.option('envtable').prop['sample']))) 79 for sample in self.option('envtable').prop['sample']: 80 if sample not in samplelist: 81 raise OptionError('物种(环境因子)表的样本中存在OTU表中未知的样本%s' % sample) 82 table = open(self.gettable()) 83 if len(table.readlines()) < 4 : 84 raise OptionError('数据表信息少于3行') 85 table.close() 86 if self.option('top_number') > self.option('otutable').prop['otu_num']: 87 self.option('top_number', self.option('otutable').prop['otu_num']) 88 return True 89 90 def set_resource(self): 91 """ 92 设置所需资源 93 """ 94 self._cpu = 2 95 self._memory = '' 96 97 def end(self): 98 result_dir = self.add_upload_dir(self.output_dir) 99 result_dir.add_relpath_rules([ 100 [".", "", "RandomForest分析结果和ROC计算数据输出目录"], 101 ["./randomforest_confusion_table.xls", "xls", "RandomForest样本分组模拟结果"], 102 ["./randomforest_mds_sites.xls", "xls", "样本点坐标表"], 103 ["./randomforest_proximity_table.xls", "xls", "样本相似度临近矩阵"], 104 ["./randomforest_topx_vimp.xls", "xls", "Top-X物种(环境因子)丰度表"], 105 ["./randomforest_vimp_table.xls", "xls", "所有物种(环境因子)重要度表"], 106 ["./randomforest_predicted_answer.xls", "xls", "随机森林预测分组结果表"], 107 ["./randomforest_votes_probably.xls","xls", "随机森林各样本分组投票预测概率表"], 108 ["./roc_table.xls", "xls", "ROC数据标准化后数据表"], 109 ["./roc_point.xls", "xls", "ROC作图坐标点数据表"], 110 ["./auc.xls", "xls", "ROC折线下方面积值"], 111 ]) 112 print self.get_upload_files() 113 super(RandomforestAgent, self).end() 114 115 116 class RandomforestTool(Tool): 117 def __init__(self, config): 118 super(RandomforestTool, self).__init__(config) 119 self._version = '1.0.1' 120 self.cmd_path = self.config.SOFTWARE_DIR + '/bioinfo/meta/scripts/RandomForest_perl.pl' 121 self.env_table = self.get_new_env() 122 self.otu_table = self.get_otu_table() 123 124 def get_otu_table(self): 125 """ 126 根据调用的level参数重构otu表 127 :return: 128 """ 129 if self.option('otutable').format == "meta.otu.tax_summary_dir": 130 otu_path = self.option('otutable').get_table(self.option('level')) 131 else: 132 otu_path = self.option('otutable').prop['path'] 133 if self.option('envtable').is_set: 134 return self.filter_otu_sample(otu_path, self.option('envtable').prop['sample'], 135 os.path.join(self.work_dir, 'temp_filter.otutable')) 136 else: 137 return otu_path 138 139 def filter_otu_sample(self, otu_path, filter_samples, newfile): 140 if not isinstance(filter_samples, types.ListType): 141 raise Exception('过滤otu表样本的样本名称应为列表') 142 try: 143 with open(otu_path, 'rb') as f, open(newfile, 'wb') as w: 144 one_line = f.readline() 145 all_samples = one_line.rstrip().split(' ')[1:] 146 if not ((set(all_samples) & set(filter_samples)) == set(filter_samples)): 147 raise Exception('提供的过滤样本集合中存在otu表中不存在的样本all:%s,filter_samples:%s' % (all_samples, filter_samples)) 148 if len(all_samples) == len(filter_samples): 149 return otu_path 150 samples_index = [all_samples.index(i) + 1 for i in filter_samples] 151 w.write('OTU ' + ' '.join(filter_samples) + ' ') 152 for line in f: 153 all_values = line.rstrip().split(' ') 154 new_values = [all_values[0]] + [all_values[i] for i in samples_index] 155 w.write(' '.join(new_values) + ' ') 156 return newfile 157 except IOError: 158 raise Exception('无法打开OTU相关文件或者文件不存在') 159 160 def get_new_env(self): 161 """ 162 根据envlabs生成新的envtable 163 """ 164 if self.option('envlabs'): 165 new_path = self.work_dir + '/temp_env_table.xls' 166 self.option('envtable').sub_group(new_path, self.option('envlabs').split(',')) 167 return new_path 168 else: 169 return self.option('envtable').path 170 171 def run(self): 172 """ 173 运行 174 """ 175 super(RandomforestTool, self).run() 176 self.run_RandomForest_perl() 177 178 def formattable(self, tablepath): 179 with open(tablepath) as table: 180 if table.read(1) == '#': 181 newtable = os.path.join(self.work_dir, 'temp_format.table') 182 with open(newtable, 'w') as w: 183 w.write(table.read()) 184 return newtable 185 return tablepath 186 187 def run_RandomForest_perl(self): 188 """ 189 运行RandomForest.pl 190 """ 191 real_otu_path = self.formattable(self.otu_table) 192 cmd = self.config.SOFTWARE_DIR + '/program/perl/perls/perl-5.24.0/bin/perl ' + self.cmd_path 193 cmd += ' -i %s -o %s' % (real_otu_path, self.work_dir + '/RandomForest') 194 if self.option('envtable').is_set: 195 cmd += ' -g %s -m %s' % (self.env_table, self.env_table) 196 cmd += ' -ntree %s' % (str(self.option('ntree'))) 197 cmd += ' -type %s' % (str(self.option('problem_type'))) 198 cmd += ' -top %s' % (str(self.option('top_number'))) 199 self.logger.info('运行RandomForest_perl.pl程序进行RandomForest计算') 200 201 try: 202 subprocess.check_output(cmd, shell=True) 203 self.logger.info('生成 cmd.r 文件成功') 204 except subprocess.CalledProcessError: 205 self.logger.info('生成 cmd.r 文件失败') 206 self.set_error('无法生成 cmd.r 文件') 207 try: 208 subprocess.check_output(self.config.SOFTWARE_DIR + 209 '/program/R-3.3.1/bin/R --restore --no-save < %s/cmd.r' % (self.work_dir + '/RandomForest'), shell=True) 210 self.logger.info('RandomForest计算成功') 211 except subprocess.CalledProcessError: 212 self.logger.info('RandomForest计算失败') 213 self.set_error('R运行计算RandomForest失败') 214 self.logger.info('运行RandomForest_perl.pl程序进行RandomForest计算完成') 215 allfiles = self.get_filesname() 216 self.linkfile(self.work_dir + '/RandomForest/' + allfiles[1], 'randomforest_mds_sites.xls') 217 self.linkfile(self.work_dir + '/RandomForest/' + allfiles[2], 'randomforest_proximity_table.xls') 218 self.linkfile(self.work_dir + '/RandomForest/' + allfiles[3], 'randomforest_topx_vimp.xls') 219 self.linkfile(self.work_dir + '/RandomForest/' + allfiles[4], 'randomforest_vimp_table.xls') 220 if self.option('envtable').is_set: 221 if allfiles[0] and allfiles[5] and allfiles[6]: 222 self.linkfile(self.work_dir + '/RandomForest/' + allfiles[0], 'randomforest_confusion_table.xls') 223 self.linkfile(self.work_dir + '/RandomForest/' + allfiles[5], 'randomforest_predicted_answer.xls') 224 self.linkfile(self.work_dir + '/RandomForest/' + allfiles[6], 'randomforest_votes_probably.xls') 225 else: 226 self.set_error('按分组计算的文件生成出错') 227 if not self.option('envtable').is_set: 228 self.end() 229 if not (allfiles[5] and allfiles[6]): 230 self.end() 231 cmd = self.config.SOFTWARE_DIR + '/program/perl/perls/perl-5.24.0/bin/perl ' + self.config.SOFTWARE_DIR + '/bioinfo/meta/scripts/calc_roc.pl' 232 cmd += ' -i1 %s' %(self.work_dir + '/RandomForest/randomforest_votes_probably.xls') 233 cmd += ' -i2 %s' %(self.work_dir + '/RandomForest/randomforest_predicted_answer.xls') 234 cmd += ' -o %s' %(self.work_dir + '/ROC/') 235 self.logger.info('开始运行calc_roc.pl计算ROC相关数据') 236 237 try: 238 subprocess.check_output(cmd, shell=True) 239 self.logger.info('生成 roc_cmd.r 成功') 240 except subprocess.CalledProcessError: 241 self.logger.info('生成 roc_cmd.r 失败') 242 self.set_error('无法生成 roc_cmd.r 文件') 243 try: 244 subprocess.check_output(self.config.SOFTWARE_DIR + 245 '/program/R-3.3.1/bin/R --restore --no-save < %s/roc_cmd.r' % (self.work_dir + '/ROC'), shell=True) 246 self.logger.info('ROC计算成功') 247 except subprocess.CalledProcessError: 248 self.logger.info('ROC计算失败') 249 self.set_error('R运行计算ROC失败') 250 self.logger.info('运行calc_roc.pl程序进行ROC计算完成') 251 allfiles = self.get_roc_filesname() 252 self.linkfile(self.work_dir + '/ROC/' + allfiles[0], 'roc_table.xls') 253 self.linkfile(self.work_dir + '/ROC/' + allfiles[1], 'roc_point.xls') 254 self.linkfile(self.work_dir + '/ROC/' + allfiles[2], 'auc.xls') 255 self.end() 256 257 def linkfile(self, oldfile, newname): 258 """ 259 link文件到output文件夹 260 :param oldfile: 资源文件路径 261 :param newname: 新的文件名 262 :return: 263 """ 264 newpath = os.path.join(self.output_dir, newname) 265 if os.path.exists(newpath): 266 os.remove(newpath) 267 os.link(oldfile, newpath) 268 269 def get_roc_filesname(self): 270 filelist = os.listdir(self.work_dir + '/ROC') 271 roc_table_file = None 272 roc_point_file = None 273 auc_file = None 274 for name in filelist: 275 if 'roc_table.xls' in name: 276 roc_table_file = name 277 elif 'roc_point.xls' in name: 278 roc_point_file = name 279 elif 'auc.xls' in name: 280 auc_file = name 281 if (roc_table_file and roc_point_file and auc_file): 282 return [roc_table_file, roc_point_file, auc_file] 283 else: 284 self.set_error("未知原因,ROC计算结果丢失") 285 286 def get_filesname(self): 287 filelist = os.listdir(self.work_dir + '/RandomForest') 288 randomforest_confusion_table_file = None 289 randomforest_mds_sites_file = None 290 randomforest_proximity_table_file = None 291 randomforest_topx_vimp_file = None 292 randomforest_vimp_table_file = None 293 randomforest_predicted_answer_file = None 294 randomforest_votes_probably_file = None 295 for name in filelist: 296 if 'randomforest_confusion_table.xls' in name: 297 randomforest_confusion_table_file = name 298 elif 'randomforest_mds_sites.xls' in name: 299 randomforest_mds_sites_file = name 300 elif 'randomforest_proximity_table.xls' in name: 301 randomforest_proximity_table_file = name 302 elif 'randomforest_topx_vimp.xls' in name: 303 randomforest_topx_vimp_file = name 304 elif 'randomforest_vimp_table.xls' in name: 305 randomforest_vimp_table_file = name 306 elif 'randomforest_predicted_answer.xls' in name: 307 randomforest_predicted_answer_file = name 308 elif 'randomforest_votes_probably.xls' in name: 309 randomforest_votes_probably_file = name 310 if (randomforest_mds_sites_file and randomforest_proximity_table_file and 311 randomforest_topx_vimp_file and randomforest_vimp_table_file): 312 if self.option('envtable').is_set: 313 if not randomforest_confusion_table_file: 314 self.set_error('未知原因,样本分组模拟结果丢失或未生成') 315 if not randomforest_predicted_answer_file: 316 self.set_error('未知原因,样本分组预测结果文件丢失或未生成') 317 if not randomforest_votes_probably_file: 318 self.set_error('未知原因,样本分组预测概率表丢失或未生成') 319 return [randomforest_confusion_table_file, randomforest_mds_sites_file, 320 randomforest_proximity_table_file, randomforest_topx_vimp_file, 321 randomforest_vimp_table_file, randomforest_predicted_answer_file, 322 randomforest_votes_probably_file] 323 else: 324 self.set_error('未知原因,数据计算结果丢失或者未生成')
1 #!/mnt/ilustre/users/sanger-dev/app/program/perl/perls/perl-5.24.0/bin/perl 2 use strict; 3 use warnings; 4 use Getopt::Long; 5 my %opts; 6 my $VERSION = "V2.20160708"; 7 GetOptions( %opts,"i=s","m=s","o=s","g=s","ntree=i","top=i","type=s"); 8 my $usage = <<"USAGE"; 9 Program : $0 10 Discription: Program used to caculate randomforest,with mds plot and importance variables given . 11 Version : $VERSION 12 Contact : jie.yao@majorbio.com 13 Usage :perl $0 [options] 14 -i * input otu table file 15 -o * output dir 16 -m input mapping file if you want set points's color and pch by groups. If omitted, randomForest will run in unsupervised mode. 17 Default:none 18 -g group name in mapping file .Default:none 19 -ntree Number of trees to grow. This should not be set to too small a number, to ensure that every input row gets predicted at least a few times.Default:500 20 -top How many variables to show? 21 -type either 1,2 or 3, specifying the type of importance measure (1=mean decrease in accuracy, 2=mean decrease in node impurity). 22 23 Example:$0 -i otu_table.xls -o randomForest -m group -g group 24 25 USAGE 26 27 die $usage if(!($opts{i}&&$opts{o})); 28 die $usage if($opts{m}&& !$opts{g}); 29 die $usage if(!$opts{m}&& $opts{g}); 30 31 $opts{m}=defined $opts{m}?$opts{m}:"none"; 32 $opts{g}=defined $opts{g}?$opts{g}:"none"; 33 $opts{ntree}=defined $opts{ntree}?$opts{ntree}:"500"; 34 $opts{type}=defined $opts{type}?$opts{type}:"1"; 35 $opts{top}=defined $opts{top}?$opts{top}:"50"; 36 37 38 if(! -e $opts{o}){ 39 `mkdir $opts{o}`; 40 } 41 42 43 open CMD,">$opts{o}/cmd.r"; 44 print CMD " 45 library(sp,warn.conflicts = F) 46 library(randomForest,warn.conflicts = F) 47 library(maptools,warn.conflicts = F) 48 basename="randomforest" 49 50 51 # if read otu data 52 otu <-read.table("$opts{i}",sep="\t",head=T,check.names = F) 53 rownames(otu) <-as.factor(otu[,1]) 54 otu <-otu[,-1] 55 rownames(otu) <-sapply(rownames(otu),function(x) gsub("_*{.+}"," ",x,perl = TRUE)) 56 rownames(otu) <-sapply(rownames(otu),function(x) gsub("-","_",x,perl = TRUE)) 57 rownames(otu) <-sapply(rownames(otu),function(x) gsub("\\[","",x,perl = TRUE)) 58 rownames(otu) <-sapply(rownames(otu),function(x) gsub("\\]","",x,perl = TRUE)) 59 rownames(otu) <-sapply(rownames(otu),function(x) gsub("\\(","",x,perl = TRUE)) 60 rownames(otu) <-sapply(rownames(otu),function(x) gsub("\\)","",x,perl = TRUE)) 61 rownames(otu) <-sapply(rownames(otu),function(x) gsub("^[0-9]","X\\1",x,perl = TRUE)) 62 rownames(otu) <-sapply(rownames(otu),function(x) gsub("/","",x,perl = TRUE)) 63 otu <-as.data.frame(t(otu),stringsAsFactors=T) 64 65 map="$opts{m}" 66 if(map !="none"){ 67 sd <-read.table("$opts{m}",head=T,sep="\t",comment.char = "",check.names = FALSE) 68 rownames(sd) <- as.character(sd[,1]) 69 sd[,1] <-as.character(sd[,1]) 70 sd$group <-as.factor(sd$group ) 71 legend <- as.matrix(unique(sd$group)) 72 } 73 74 set.seed(1) 75 if(map != "none"){ 76 otu.rf <- randomForest(sd$group ~ .,otu,importance=T,proximity=T,ntree=$opts{ntree}) 77 78 79 80 class_count <-as.matrix(table(sd$group)) 81 class <-data.frame(count=class_count) 82 83 ##randomforest votes probably 84 votes_probably<- paste("$opts{o}/",basename,"_votes_probably.xls",sep="") 85 write.table(otu.rf$votes,votes_probably,sep="\t",quote=F) 86 87 ##randomforest predicted answer 88 predicted_answer <- paste("$opts{o}/",basename,"_predicted_answer.xls",sep="") 89 write.table(otu.rf$predicted,predicted_answer,sep="\t",quote=F) 90 91 ##randomforest classification table 92 rf_table <- paste("$opts{o}/",basename,"_confusion_table.xls",sep="") 93 write.table(otu.rf$confusion,rf_table,sep="\t",quote=F) 94 mds <- cmdscale(1-otu.rf$proximity) 95 }else{ 96 otu.rf <- randomForest(otu,importance=T,proximity=T,ntree=$opts{ntree}) 97 mds <- cmdscale(1-otu.rf$proximity) 98 } 99 100 101 ##mds points 102 mds_points <- paste("$opts{o}/",basename,"_mds_sites.xls",sep="") 103 write.table(mds,mds_points,sep="\t",quote=F) 104 105 ##proximity table 106 proximity <- paste("$opts{o}/",basename,"_proximity_table.xls",sep="") 107 write.table(otu.rf$proximity,proximity,sep="\t",quote=F) 108 109 ## importance table 110 vimp_table <- paste("$opts{o}/",basename,"_vimp_table.xls",sep="") 111 write.table(otu.rf$importance,vimp_table,sep="\t",quote=F) 112 113 114 ## top importance species table 115 topx_vimp <- paste("$opts{o}/",basename,"_topx_vimp.xls",sep="") 116 imp <- importance(otu.rf) 117 if($opts{type} == 1){ 118 top <- imp[order(imp[,"MeanDecreaseAccuracy"],decreasing=T),][1:min($opts{top},length(imp[,1])),] 119 write.table(t(otu)[rownames(top),],topx_vimp,sep="\t",quote=F) 120 121 }else if ($opts{type} == 2){ 122 top <- imp[order(imp[,"MeanDecreaseGini"],decreasing=T),][1:min($opts{top},length(imp[,1])),] 123 write.table(t(otu)[rownames(top),],topx_vimp,sep="\t",quote=F) 124 } 125 126 "; 127 128 `R --restore --no-save < $opts{o}/cmd.r`;