zoukankan      html  css  js  c++  java
  • 【代码速记】Genemapper软件数据转预处理

     use strict;
     use warnings;

     use Math::Round;
     use Scalar::Util qw(looks_like_number);

     # Convert a GeneMapper peak table into a sample x locus matrix of peak
     # heights.
     #
     # Input  : aflp1.txt in the current directory, one peak per line, columns
     #          Dye/Sample, FileName, Size, Height, Area, DataPoint
     # Output : result-<epoch seconds>/resultData.txt, tab-separated:
     #          a header line "sample", "locus_<size>"... followed by one line
     #          per sample holding the rounded peak height at each locus
     #          (empty cell when the sample has no peak at that locus).
     #
     # Every cell is followed by a tab and the file has no final newline; this
     # layout is kept so existing downstream readers see the same format.

     # Columns are separated by a tab; a run of four spaces is accepted as well
     # so files that were re-saved with expanded tabs still parse.
     my $FIELD_SEP = qr/\t| {4}/;

     open my $raw_fh, '<', 'aflp1.txt' or die("cannot open file:$!");

     my $output_name = "result-" . time();
     mkdir $output_name, 0777 or die "cannot mkdir";
     open my $result_fh, '>', "$output_name/resultData.txt"
         or die "cannot create $output_name/resultData.txt: $!";

     my %is_locus;    # rounded fragment size => 1
     my %data;        # sample name => { "locus_<size>" => rounded peak height }

     # Single pass over the peak table: each row contributes exactly one
     # (sample, locus) cell, so no loci x rows nested scan is needed.
     ROW:
     while ( my $line = <$raw_fh> ) {
         $line =~ s/\r?\n\z//;    # tolerate both LF and CRLF line endings
         my ( undef, $file_name, $size, $height ) = split $FIELD_SEP, $line;
         next ROW if !defined $height;    # blank or truncated line

         s/\A\s+|\s+\z//g for $size, $height;

         # Skip the column-header line and peaks without a numeric size or
         # height; rounding them would create a bogus "locus_0" entry.
         next ROW if !looks_like_number($size) || !looks_like_number($height);

         # The sample name is the 3 characters starting at offset 2 of FileName.
         next ROW if length($file_name) < 3;
         my $name = substr( $file_name, 2, 3 );

         my $locus = round($size);
         $is_locus{$locus} = 1;

         # A later row for the same sample and locus overwrites an earlier one.
         $data{$name}{"locus_$locus"} = round($height);
     }
     close $raw_fh;

     # Fragment sizes are numbers, so order the columns numerically.
     my @loci = sort { $a <=> $b } keys %is_locus;

     print {$result_fh} "sample\t";
     print {$result_fh} "locus_$_\t" for @loci;

     # Sorted sample names make the row order reproducible between runs.
     for my $sample ( sort keys %data ) {
         my $height_at = $data{$sample};
         print {$result_fh} "\n$sample\t";
         for my $locus (@loci) {
             my $cell = $height_at->{"locus_$locus"} // '';
             print {$result_fh} "$cell\t";
         }
     }

     # Buffered write errors only surface when the handle is closed.
     close $result_fh or die "cannot write $output_name/resultData.txt: $!";

     增加限制酶种类和重复后的脚本

    use strict;
    use warnings;

    use Math::Round;
    use Scalar::Util qw(looks_like_number);

    # Convert a GeneMapper peak table into a sample x locus matrix of peak
    # heights, renaming samples through two lookup tables.
    #
    # Inputs (current directory):
    #   msap2.txt  - one peak per line, columns
    #                Dye/Sample, FileName, Size, Height, Area, DataPoint
    #   order.txt  - two columns: the 3-character code found at offset 2 of
    #                FileName, and the label that replaces it
    #   repeat.txt - two columns: the 2-character code found at offset 6 of
    #                FileName, and the label that replaces it
    # Output:
    #   result-<epoch seconds>/data_msap2.txt, tab-separated: a header line
    #   "sample", "locus_<size>"... followed by one line per sample named
    #   "<order label>_<repeat label>" holding the rounded peak height at each
    #   locus (empty cell when the sample has no peak at that locus).
    #
    # Every cell is followed by a tab and the file has no final newline; this
    # layout is kept so existing downstream readers see the same format.

    # Columns are separated by a tab; a run of four spaces is accepted as well
    # so files that were re-saved with expanded tabs still parse.
    my $FIELD_SEP = qr/\t| {4}/;

    # Read a two-column lookup file into a hash (first column => second column).
    # Lines without a second column are ignored. Dies if the file cannot be
    # opened.
    sub read_lookup {
        my ($path) = @_;
        open my $fh, '<', $path or die("cannot open file:$!");
        my %label_for;
        while ( my $line = <$fh> ) {
            $line =~ s/\r?\n\z//;    # tolerate both LF and CRLF line endings
            my ( $code, $label ) = split $FIELD_SEP, $line;
            next if !defined $label;
            $label_for{$code} = $label;
        }
        close $fh;
        return %label_for;
    }

    my %warned;    # "<table>\0<code>" => 1 once a missing entry was reported

    # Return the label for $code from the lookup hash referenced by $label_for.
    # A code with no entry is returned unchanged (with one warning per code) so
    # that distinct unmapped samples are not merged under an empty name.
    sub translate {
        my ( $code, $label_for, $table ) = @_;
        return $label_for->{$code} if defined $label_for->{$code};
        warn "no entry for '$code' in $table; keeping the code unchanged\n"
            if !$warned{"$table\0$code"}++;
        return $code;
    }

    # All three inputs are opened before the output directory is created, so a
    # missing input does not leave an empty result directory behind.
    open my $raw_fh, '<', 'msap2.txt' or die("cannot open file:$!");
    my %order  = read_lookup('order.txt');
    my %repeat = read_lookup('repeat.txt');

    my $output_name = "result-" . time();
    mkdir $output_name, 0777 or die "cannot mkdir";
    open my $result_fh, '>', "$output_name/data_msap2.txt"
        or die "cannot create $output_name/data_msap2.txt: $!";

    my %is_locus;    # rounded fragment size => 1
    my %data;        # sample name => { "locus_<size>" => rounded peak height }

    # Single pass over the peak table: each row contributes exactly one
    # (sample, locus) cell, so no loci x rows nested scan is needed.
    ROW:
    while ( my $line = <$raw_fh> ) {
        $line =~ s/\r?\n\z//;
        my ( undef, $file_name, $size, $height ) = split $FIELD_SEP, $line;
        next ROW if !defined $height;    # blank or truncated line

        s/\A\s+|\s+\z//g for $size, $height;

        # Skip the column-header line and peaks without a numeric size or
        # height; rounding them would create a bogus "locus_0" entry.
        next ROW if !looks_like_number($size) || !looks_like_number($height);

        # Both codes are cut out of FileName by position; a name too short to
        # contain the second code cannot be attributed to a sample.
        next ROW if length($file_name) < 7;

        # Plain hash lookups: the codes are data, not regex patterns.
        my $name1 = translate( substr( $file_name, 2, 3 ), \%order,  'order.txt' );
        my $name2 = translate( substr( $file_name, 6, 2 ), \%repeat, 'repeat.txt' );
        my $name  = $name1 . "_" . $name2;

        my $locus = round($size);
        $is_locus{$locus} = 1;

        # A later row for the same sample and locus overwrites an earlier one.
        $data{$name}{"locus_$locus"} = round($height);
    }
    close $raw_fh;

    # Fragment sizes are numbers, so order the columns numerically.
    my @loci = sort { $a <=> $b } keys %is_locus;

    print {$result_fh} "sample\t";
    print {$result_fh} "locus_$_\t" for @loci;

    # Sorted sample names make the row order reproducible between runs.
    for my $sample ( sort keys %data ) {
        my $height_at = $data{$sample};
        print {$result_fh} "\n$sample\t";
        for my $locus (@loci) {
            my $cell = $height_at->{"locus_$locus"} // '';
            print {$result_fh} "$cell\t";
        }
    }

    # Buffered write errors only surface when the handle is closed.
    close $result_fh or die "cannot write $output_name/data_msap2.txt: $!";
  • 相关阅读:
    揭秘重度MMORPG手游后台性能优化方案
    算法:贪心、回溯(su)、分治、动态规划,思想简要
    表单提交 curl和浏览器方式
    mysql 聚集索引,非聚集索引,覆盖索引区别。
    虚拟机中的Linux不能上网
    第二篇 界面开发 (Android学习笔记)
    第一篇 入门必备 (Android学习笔记)
    C/C++知识补充 (1)
    向后/向前引用, 零宽断言, 转义, 命名分组
    C/C++知识补充(2) C/C++操作符/运算符的优先级 & 结合性
  • 原文地址:https://www.cnblogs.com/liulele/p/8032781.html
Copyright © 2011-2022 走看看