zoukankan      html  css  js  c++  java
  • linux系统shell实现统计 plink文件基因频率

    1、

    [root@centos79 test]# cat test.sh
    #!/bin/bash
    
    #step1 check consistence of columns
    temp1=`head -n 1 $1 | awk '{print NF}'`
    for i in $(seq `sed -n "$=" $1`)
    do
    temp2=$(sed -n "$i"p $1 | awk '{print NF}')
    if [ $temp2 -ne $temp1 ]
    then
    echo "$i row option exception!"
    echo -e "row1: $temp1 columns \nrow$i: $temp2 columns"
    exit
    fi
    done
    
    echo "step1 done!"
    
    #step2 check nucleotide A T G C or 0
    awk '{for(i = 7; i <= NF; i++) {print $i}}' $1 | sed 's/[\t ]/\n/' | sort -u  > tempped
    for i in `cat tempped`
    do
    if [ $i != "G" ] && [ $i != "C" ] && [ $i != "A" ] && [ $i != "T" ] && [ $i != "0" ]
    then
    echo "abnormal nucleotide: $i"
    exit
    fi
    done
    rm -f tempped
    
    echo "step2 done!"
    
    #step3 check match of map and ped
    
    map=$(sed -n "$=" $2)
    ped=$(head -n 1 $1 | awk '{print (NF - 6)/2}')
    if [ $map -ne $ped ]
    then
    echo "map and ped do not match!"
    exit
    fi
    
    echo "step3 done!"
    
    #step4 statistic
    for i in $(seq `head -n 1 $1 | awk '{print (NF - 6)/2}'`); do awk -v a=$i '{print $(a*2 + 6),$(a*2+5) }' $1 | sed 's/ /\n/' | sort | uniq -c | sed 's/^[\t ]\+//g' | sort -n | xargs | awk '{if(NF == 2 && $2 != 0) {print "NA", "0", $2, "1"} else if(NF == 2 && $2 == 0) {print "0", "0", "0", "1"} else if(NF == 4 && $2 != 0) {print $2, $1/($1 + $3), $4, $3/($1 + $3)} else if(NF == 4 && $2 == 0) {print "NA", "0", $4, "1"} else if(NF == 6) {print $4, $3/($3 + $5), $6, $5/($3 + $5)}}' | sed 's/ /\t/g' >> tempresult.txt; done
    
    echo "step 4 done!"
    
    #step5 add coordinate
    cut -f 1-2,4 $2 | paste -  tempresult.txt | sed "1i chr\tsnp\tpos\tmaf\tfreq\tmaj\tfreq" > freqresult.txt
    rm tempresult.txt
    
    echo "finished!"

    用法:

    [root@centos79 test]# ls   ## 测试数据
    outcome.map  outcome.ped  test.sh
    [root@centos79 test]# cat outcome.map
    1       snp1    0       55910
    1       snp2    0       85204
    1       snp3    0       122948
    1       snp4    0       203750
    1       snp5    0       312707
    1       snp6    0       356863
    1       snp7    0       400518
    1       snp8    0       487423
    [root@centos79 test]# cat outcome.ped
    DOR     1       0       0       0       -9      G G     0 0     0 0     0 0     A A     T T     G G     G C
    DOR     2       0       0       0       -9      G G     G G     0 0     0 0     A A     T T     A G     C C
    DOR     3       0       0       0       -9      G G     C C     0 0     G G     G G     T T     A G     G C
    DOR     4       0       0       0       -9      G G     C C     0 0     G G     G G     A A     G G     G G
    DOR     5       0       0       0       -9      G G     C C     0 0     G G     G G     A A     A G     G C
    DOR     6       0       0       0       -9      G G     C C     0 0     G G     G G     A A     A A     C C
    [root@centos79 test]# bash test.sh outcome.ped outcome.map
    step1 done!
    step2 done!
    step3 done!
    step 4 done!
    finished!
    [root@centos79 test]# ls   ##生成的结果
    freqresult.txt  outcome.map  outcome.ped  test.sh
    [root@centos79 test]# cat freqresult.txt
    chr     snp     pos     maf     freq    maj     freq
    1       snp1    55910   NA      0       G       1
    1       snp2    85204   G       0.2     C       0.8
    1       snp3    122948  0       0       0       1
    1       snp4    203750  NA      0       G       1
    1       snp5    312707  A       0.333333        G       0.666667
    1       snp6    356863  A       0.5     T       0.5
    1       snp7    400518  A       0.416667        G       0.583333
    1       snp8    487423  G       0.416667        C       0.583333

    2、plink软件验证, 结果一致

    [root@centos79 test]# plink --file outcome --freq --out verify > /dev/null; rm *.log *.nosex
    [root@centos79 test]# ls
    freqresult.txt  outcome.map  outcome.ped  test.sh  verify.frq
    [root@centos79 test]# cat freqresult.txt
    chr     snp     pos     maf     freq    maj     freq
    1       snp1    55910   NA      0       G       1
    1       snp2    85204   G       0.2     C       0.8
    1       snp3    122948  0       0       0       1
    1       snp4    203750  NA      0       G       1
    1       snp5    312707  A       0.333333        G       0.666667
    1       snp6    356863  A       0.5     T       0.5
    1       snp7    400518  A       0.416667        G       0.583333
    1       snp8    487423  G       0.416667        C       0.583333
    [root@centos79 test]# cat verify.frq
     CHR  SNP   A1   A2          MAF  NCHROBS
       1 snp1    0    G            0       12
       1 snp2    G    C          0.2       10
       1 snp3    0    0           NA        0
       1 snp4    0    G            0        8
       1 snp5    A    G       0.3333       12
       1 snp6    A    T          0.5       12
       1 snp7    A    G       0.4167       12
       1 snp8    G    C       0.4167       12
  • 相关阅读:
    常见的四种文本自动分词详解及IK Analyze的代码实现
    用java语言通过POI实现word文档的按标题提取
    spark的运行模式
    团队冲刺日志2
    简单之美-软件开发实践者的思考 03
    简单之美-软件开发实践者的思考 02
    简单之美-软件开发实践者的思考 01
    学习进度 15
    构建之法 06
    构建之法 05
  • 原文地址:https://www.cnblogs.com/liujiaxin2018/p/15485173.html
Copyright © 2011-2022 走看看