zoukankan      html  css  js  c++  java
  • plink格式数据依据染色体拆分数据、依据染色体合并数据

    1、准备测试数据

    [root@linuxprobe test]# ls
    test.map  test.ped
    [root@linuxprobe test]# wc -l *  ## 46827个位点,60个样本
       46827 test.map
          60 test.ped
       46887 total
    [root@linuxprobe test]# cut -f 1 test.map | uniq -c  ## 一共 26条染色体
       5244 1
       4970 2
       4465 3
       2436 4
       2109 5
       2318 6
       2005 7
       1851 8
       1901 9
       1649 10
       1077 11
       1528 12
       1527 13
       1045 14
       1506 15
       1395 16
       1291 17
       1272 18
       1124 19
        998 20
        768 21
        984 22
       1005 23
        662 24
        892 25
        805 26

    2、依据染色体拆分数据 

    for i in `seq 26`;do plink --file test --chr $i --sheep --recode tab --out chr$i;done  ## 利用for循环,结合plink软件提取每条染色体数据,(plink提前加入环境变量或者给绝对路径)
    ……
    ……
    [root@linuxprobe test]# ls
    chr10.log    chr12.ped    chr15.nosex  chr18.map    chr20.log    chr22.ped    chr25.nosex  chr3.map    chr6.log    chr8.ped
    chr10.map    chr13.log    chr15.ped    chr18.nosex  chr20.map    chr23.log    chr25.ped    chr3.nosex  chr6.map    chr9.log
    chr10.nosex  chr13.map    chr16.log    chr18.ped    chr20.nosex  chr23.map    chr26.log    chr3.ped    chr6.nosex  chr9.map
    chr10.ped    chr13.nosex  chr16.map    chr19.log    chr20.ped    chr23.nosex  chr26.map    chr4.log    chr6.ped    chr9.nosex
    chr11.log    chr13.ped    chr16.nosex  chr19.map    chr21.log    chr23.ped    chr26.nosex  chr4.map    chr7.log    chr9.ped
    chr11.map    chr14.log    chr16.ped    chr19.nosex  chr21.map    chr24.log    chr26.ped    chr4.nosex  chr7.map    test.map
    chr11.nosex  chr14.map    chr17.log    chr19.ped    chr21.nosex  chr24.map    chr2.log     chr4.ped    chr7.nosex  test.ped
    chr11.ped    chr14.nosex  chr17.map    chr1.log     chr21.ped    chr24.nosex  chr2.map     chr5.log    chr7.ped
    chr12.log    chr14.ped    chr17.nosex  chr1.map     chr22.log    chr24.ped    chr2.nosex   chr5.map    chr8.log
    chr12.map    chr15.log    chr17.ped    chr1.nosex   chr22.map    chr25.log    chr2.ped     chr5.nosex  chr8.map
    chr12.nosex  chr15.map    chr18.log    chr1.ped     chr22.nosex  chr25.map    chr3.log     chr5.ped    chr8.nosex

     [root@linuxprobe test]# wc -l *.map
     1649 chr10.map
     1077 chr11.map
     1528 chr12.map
     1527 chr13.map
     1045 chr14.map
     1506 chr15.map
     1395 chr16.map
     1291 chr17.map
     1272 chr18.map
     1124 chr19.map
     5244 chr1.map
     998 chr20.map
     768 chr21.map
     984 chr22.map
     1005 chr23.map
     662 chr24.map
     892 chr25.map
     805 chr26.map
     4970 chr2.map
     4465 chr3.map
     2436 chr4.map
     2109 chr5.map
     2318 chr6.map
     2005 chr7.map
     1851 chr8.map
     1901 chr9.map
     46827 test.map
     93654 total

    3、依据染色体合并数据  

    [root@linuxprobe test]# cp chr1.map merge.map  ##复制染色体1为起始的合并结果
    [root@linuxprobe test]# cp chr1.ped merge.ped  ##同上
    for i in `seq 2 26`;do plink --file merge --merge chr$i.ped chr$i.map --sheep --recode tab --out merge;done  ## 利用for循环,结合plink进行合并 (这种合并仅适合样本完全一致时合并)

      [root@linuxprobe test]# wc -l merge.ped merge.map  ##统计合并结果
      60 merge.ped
      46827 merge.map
      46887 total
      [root@linuxprobe test]# md5sum test.ped test.map merge.ped merge.map ## 检查合并后是否一致 
      04a12361169ff0fdc77349f280f11b6b test.ped
      8d4537314a4c7beaeb0da726a8c5fcba test.map
      89b9ee552b2ae3b5e9fcb5282cbbb5c9 merge.ped
      8d4537314a4c7beaeb0da726a8c5fcba merge.map

    ## ped文件不一致是由于在部分位点基因频率为0.5时,主次等位基因顺序颠倒引起的,不影响结果

    4、验证ped文件md5sum不一致的情况   

    [root@linuxprobe test]# mkdir test  ## 单独建立文件夹,将需要验证结果复制至该文件夹
    [root@linuxprobe test]# cp test.map test.ped merge.map merge.ped test
    [root@linuxprobe test]# cd test/
    [root@linuxprobe test]# ls
    merge.map  merge.ped  test.map  test.ped
     plink --file merge --freq --sheep --out merge;rm *.nosex   ## 利用plink统计基因频率
     plink --file test --freq --sheep --out test ;rm *.nosex   ## 利用plink统计基因频率
     [root@linuxprobe test]# ls
    merge.frq  merge.log  merge.map  merge.ped  test.frq  test.log  test.map  test.ped
    [root@linuxprobe test]# head merge.frq  ## 查看生成文件格式
     CHR                           SNP   A1   A2          MAF  NCHROBS
       1                      s64199.1    A    G       0.1667      120
       1              OAR19_64675012.1    G    C         0.25      120
       1              OAR19_64715327.1    A    G       0.1583      120
       1              OAR19_64803054.1    A    G         0.25      120
       1                DU281551_498.1    G    A          0.5      120
       1                      s18939.1    0    A            0      120
       1                  OAR1_88143.1    A    G       0.2083      120
       1                      s09912.1    G    C       0.3417      120
       1                      s36301.1    A    G        0.325      120
    [root@linuxprobe test]# sed 's/^[\t ]*//' merge.frq | sed 's/[\t ]\+/\t/g' | head  ## 去除行首空格,将多个空格或者制表符转换为一个制表符,查看前十行
    CHR     SNP     A1      A2      MAF     NCHROBS
    1       s64199.1        A       G       0.1667  120
    1       OAR19_64675012.1        G       C       0.25    120
    1       OAR19_64715327.1        A       G       0.1583  120
    1       OAR19_64803054.1        A       G       0.25    120
    1       DU281551_498.1  G       A       0.5     120
    1       s18939.1        0       A       0       120
    1       OAR1_88143.1    A       G       0.2083  120
    1       s09912.1        G       C       0.3417  120
    1       s36301.1        A       G       0.325   120
    [root@linuxprobe test]# sed 's/^[\t ]*//' merge.frq | sed 's/[\t ]\+/\t/g' | cut -f 3-4 --complement | md5sum   ## 剔除3、4列后查看md5sum
    643867ea0404ada96aa61702bc61f678  -
    [root@linuxprobe test]# sed 's/^[\t ]*//' test.frq | sed 's/[\t ]\+/\t/g' | cut -f 3-4 --complement | md5sum    ##剔除3、4列后查看买的sum
    643867ea0404ada96aa61702bc61f678  -
    ## 以上说明每个稳点的基因频率完全相同

    5、利用diff检查

    [root@linuxprobe test]# diff merge.frq test.frq | head   ## 差异的部分基因频率全部为0.5,主次等位基因颠倒,这种颠倒不影响结果
    6c6
    <    1                DU281551_498.1    G    A          0.5      120
    ---
    >    1                DU281551_498.1    A    G          0.5      120
    218c218
    <    1                      s64317.1    G    A          0.5      120
    ---
    >    1                      s64317.1    A    G          0.5      120
    251c251
    <    1                      s45057.1    G    A          0.5      120
    [root@linuxprobe test]# diff merge.frq test.frq | tail
    ---
    >   24              OAR24_21591643.1    A    G          0.5      120
    46100c46100
    <   26               OAR26_5333971.1    A    G          0.5      120
    ---
    >   26               OAR26_5333971.1    G    A          0.5      120
    46697c46697
    <   26                      s52004.1    G    A          0.5      120
    ---
    >   26                      s52004.1    A    G          0.5      120

       

  • 相关阅读:
    软件项目管理总体流程设计
    Delphi 编译时 提示 Internal error: URW3537 错误
    Oracle OLEDB 的手工分发
    Oracle10g Client的手工分发
    Delphi 处理在字符串截取中避免出现半个汉字
    Oracle ODP.Net 的手工分发
    项目中问题解决杂谈audio
    按钮 stylesheet 字符串;
    pb使用
    web测试和app测试的重点
  • 原文地址:https://www.cnblogs.com/liujiaxin2018/p/13770366.html
Copyright © 2011-2022 走看看