近期对序列文件处理的比較多,时常要看一些核酸序列的反向互补序列,长度。可能的翻译序列。
曾经我常用seqBuider 来查看。假设能在命令行直接查看。想必是极好的。
这是一个perl脚本。只是我把它的运行路径写入环境变量后。就能够当linux命令直接使用了,非常方便的。
这个脚本有四个參数。【-i -r -p -l 】
当中
-i 是必要的參数,用来接收标准输入;
-r 是获得一段序列的反向互补序列(50个字符一行的格式输出)。
-p 是提供一段序列的ORF框架序列,即三种可能的pep翻译(50个字符一行的格式输出)。
-l 获取一段序列的长度。
假设【-r-p-l】都是缺省状态的话。默认三种结果都输出。
在linux配置文件 ~/.bashrc 文件能够写入:
alias tfa=' perl /yourpath/transfa.pl'这样以后在linux命令行运行 tfa 命令,出现:
Usage: tfa <STDIN>[-i-r-p-l]
这种使用提示。
整个代码例如以下:
#! /usr/bin/perl -w use strict; use Getopt::Long; my ($i,$r,$p,$l); GetOptions( "i!"=>$i, "r!"=>$r, "p!"=>$p, "l!"=>$l, ); my $usage = " Usage: tfa <STDIN>[-i-r-p-l] "; die "$usage " unless $i; print "Please input the nucleotide sequence,and end by ctrl+D. "; unless($r || $p || $l){ ($r,$p,$l)=(1,1,1); } my $fa; do{local $/;chomp($fa=<STDIN>)}; $fa =~ s/s+//g; die "$usage " unless $fa; if($r){ my $faout = reverse_complement($fa); $faout = out_fasta($faout,50); print " ###rc### $faout "; } if($p){ my @fa_arr = cds2pep($fa); print " ###protein### "; $fa_arr[0] = out_fasta($fa_arr[0],50); print "ORF1: $fa_arr[0] "; $fa_arr[1] = out_fasta($fa_arr[1],50); print "ORF2: $fa_arr[1] "; $fa_arr[2] = out_fasta($fa_arr[2],50); print "ORF3: $fa_arr[2] "; } if($l){ my $len = length $fa; print " ###Length### $len "; } ##################### sub out_fasta{ my ($seq,$num) = @_; my $len = length $seq; $seq =~ s/([A-Za-z]{$num})/$1 /g; chop($seq) unless $len % $num; return $seq; } ##################### sub reverse_complement{ my ($seq)=shift; $seq=reverse$seq; $seq=~tr/AaGgCcTt/TtCcGgAa/; return $seq; } ##################### sub cds2pep{ my $seq=shift; ##phase0 my $str0 = $seq; $str0 = trans($str0); ##phase1 my $str1 = substr($seq,1); $str1 = trans($str1); ##phase0 my $str2 = substr($seq,2); $str2 = trans($str2); return ($str0,$str1,$str2); } ##################### sub trans{ my $seq = shift; my $p = code(); my $out; for(my $i=0;$i<length$seq;$i+=3){ my $codon=uc(substr($seq,$i,3)); last if (length$codon <3); $out.= exists $p->{"standard"}{$codon} ? $p->{"standard"}{$codon} : "X"; } return $out; } ##################### sub code{ my $p={ "standard" => { 'GCA' => 'A', 'GCC' => 'A', 'GCG' => 'A', 'GCT' => 'A', # Alanine 'TGC' => 'C', 'TGT' => 'C', # Cysteine 'GAC' => 'D', 'GAT' => 'D', # Aspartic Aci 'GAA' => 'E', 'GAG' => 'E', # Glutamic Aci 'TTC' => 'F', 'TTT' => 'F', # Phenylalanin 'GGA' => 'G', 'GGC' => 'G', 'GGG' => 'G', 'GGT' => 'G', # Glycine 'CAC' => 'H', 'CAT' => 'H', # Histidine 'ATA' => 'I', 'ATC' => 'I', 'ATT' => 'I', # Isoleucine 'AAA' => 'K', 'AAG' => 'K', # Lysine 'CTA' => 'L', 'CTC' => 'L', 'CTG' => 'L', 'CTT' => 'L', 'TTA' => 'L', 'TTG' => 'L', # Leucine 'ATG' => 'M', # Methionine 'AAC' => 'N', 'AAT' => 'N', # Asparagine 'CCA' => 'P', 'CCC' => 'P', 'CCG' => 'P', 'CCT' => 'P', # Proline 'CAA' => 'Q', 'CAG' => 'Q', # Glutamine 'CGA' => 'R', 'CGC' => 'R', 'CGG' => 'R', 'CGT' => 'R', 'AGA' => 'R', 'AGG' => 'R', # Arginine 'TCA' => 'S', 'TCC' => 'S', 'TCG' => 'S', 'TCT' => 'S', 'AGC' => 'S', 'AGT' => 'S', # Serine 'ACA' => 'T', 'ACC' => 'T', 'ACG' => 'T', 'ACT' => 'T', # Threonine 'GTA' => 'V', 'GTC' => 'V', 'GTG' => 'V', 'GTT' => 'V', # Valine 'TGG' => 'W', # Tryptophan 'TAC' => 'Y', 'TAT' => 'Y', # Tyrosine 'TAA' => 'U', 'TAG' => 'U', 'TGA' => 'U' # Stop } ## more translate table could be added here in future ## more translate table could be added here in future ## more translate table could be added here in future }; return $p; }
__END__