zoukankan      html  css  js  c++  java
  • linux 系统中shell实现将fasta文件的碱基转换为一行及还原

    1、测试数据

    root@PC1:/home/test# cat a.fna  ##  实现将碱基转换为1行, 其他信息不变
    >NC_019458.2 Ovis aries breed Texel chromosome 1, Oar_v4.0, [whole genome shotgun sequence]
    gataaaaaataaatagaaacaaaatcactgaagaaCCAGTGTGCCTGCTCAGGTCAGATGAAGCCAGAGGGCTGCCAGAG
    GGCAAGCGAGCTGCGTTGCCTGGAAAAAGTTAAACACACAGAGAGCATGGTGGCTCTGATACTTTCTAGAAGGATTAAAG
    TCACTTTCCCAGTCTTTATGAGAATTGGGCCGAAGCTTAGCTGGTGCAACGAATTTAGAAATGAATGCACTTGCATTTGA
    >NC_019458.2 Ovis aries breed Texel chromosome 1, Oar_v4.0, [whole genome shotgun sequence]
    AGATGATGTGTCTTTGCCTTGAgctaaaaattttagaataatctgaACGTCATCTGAGGAACCTGCTTCTGGCGTGGTTT
    TGGTGTCAGCATCTTCTCACCCTCTCTAGTAATTTTCAGTATGCATTTCTATTTTCGTGTAGTTATTTACAGGAGCATTT
    TATGGAAAACCGGCTCAAATCTTTTTGGGTGCAGGGGTAGTTCAAATGCACTGAGACCCTCAGTTTCACTTGCTAATCTC
    >NC_019458.2 Ovis aries breed Texel chromosome 1, Oar_v4.0, [whole genome shotgun sequence]
    CTCCAGAAACCCTGTTCTCCTCGAGTGACAAGGTCAGCAGGGCAGCACGTGTGTTCCTGTCACTGCCAACTCAAGAATAT
    GAAGTTTAAAGAGTTTCACCATCAAATGCAGTGTCGTGGACTGCCCCTGAACAGGTGTTTATAATCACGTGTGCAAGTGA
    AGCAAGCACAAATCCTCAGTGGAAAACGGGCAGAGGACACGAGCagacaattctttttaaaaactgcacaaATTAGCACA
    >NC_019458.2 Ovis aries breed Texel chromosome 1, Oar_v4.0, [whole genome shotgun sequence]
    CTAGGCACGGATGAGCGTGCCTACCGTGTTGCATGGAGGTAACAGATGCCAGAGCCCGGAGGAGGCGCAAAGCTCACAAA
    CAGATGCGGACCGCAGGAAGCCGGGACGGCCTTCCTCCCCTGAAGCAGGAGGACGCGCCCTACAGAAAGCCGCTCGATCC
    TCCAGGCATTTGTTGTGAGCACTTAATCATCATTCGATCATTTGACGTGTACTCACTAGTAAAAGGCAGGACTGTGTCCC

    2、 awk + sed实现

    root@PC1:/home/test# awk '{if($0 ~ /^[a-zA-Z]/) {printf("%s", $0)} else {print $0}}' a.fna | sed '$ s/$/\n/' | sed '2,$ s/>/\n>/'  
    >NC_019458.2 Ovis aries breed Texel chromosome 1, Oar_v4.0, [whole genome shotgun sequence]
    gataaaaaataaatagaaacaaaatcactgaagaaCCAGTGTGCCTGCTCAGGTCAGATGAAGCCAGAGGGCTGCCAGAGGGCAAGCGAGCTGCGTTGCCTGGAAAAAGTTAAACACACAGAGAGCATGGTGGCTCTGATACTTTCTAGAAGGATTAAAGTCACTTTCCCAGTCTTTATGAGAATTGGGCCGAAGCTTAGCTGGTGCAACGAATTTAGAAATGAATGCACTTGCATTTGA
    >NC_019458.2 Ovis aries breed Texel chromosome 1, Oar_v4.0, [whole genome shotgun sequence]
    AGATGATGTGTCTTTGCCTTGAgctaaaaattttagaataatctgaACGTCATCTGAGGAACCTGCTTCTGGCGTGGTTTTGGTGTCAGCATCTTCTCACCCTCTCTAGTAATTTTCAGTATGCATTTCTATTTTCGTGTAGTTATTTACAGGAGCATTTTATGGAAAACCGGCTCAAATCTTTTTGGGTGCAGGGGTAGTTCAAATGCACTGAGACCCTCAGTTTCACTTGCTAATCTC
    >NC_019458.2 Ovis aries breed Texel chromosome 1, Oar_v4.0, [whole genome shotgun sequence]
    CTCCAGAAACCCTGTTCTCCTCGAGTGACAAGGTCAGCAGGGCAGCACGTGTGTTCCTGTCACTGCCAACTCAAGAATATGAAGTTTAAAGAGTTTCACCATCAAATGCAGTGTCGTGGACTGCCCCTGAACAGGTGTTTATAATCACGTGTGCAAGTGAAGCAAGCACAAATCCTCAGTGGAAAACGGGCAGAGGACACGAGCagacaattctttttaaaaactgcacaaATTAGCACA
    >NC_019458.2 Ovis aries breed Texel chromosome 1, Oar_v4.0, [whole genome shotgun sequence]
    CTAGGCACGGATGAGCGTGCCTACCGTGTTGCATGGAGGTAACAGATGCCAGAGCCCGGAGGAGGCGCAAAGCTCACAAACAGATGCGGACCGCAGGAAGCCGGGACGGCCTTCCTCCCCTGAAGCAGGAGGACGCGCCCTACAGAAAGCCGCTCGATCCTCCAGGCATTTGTTGTGAGCACTTAATCATCATTCGATCATTTGACGTGTACTCACTAGTAAAAGGCAGGACTGTGTCCC

    3、利用正则表达式及sed预存储(分组引用)将单行碱基还原为每行固定碱基数

    root@PC1:/home/test# ls
    a.fna  b.fna
    root@PC1:/home/test# cp b.fna b.fna_bak   ## 要在源文件进行修改,先进行备份
    root@PC1:/home/test# ls
    a.fna  b.fna  b.fna_bak
    root@PC1:/home/test# cat b.fna
    >NC_019458.2 Ovis aries breed Texel chromosome 1, Oar_v4.0, [whole genome shotgun sequence]
    gataaaaaataaatagaaacaaaatcactgaagaaCCAGTGTGCCTGCTCAGGTCAGATGAAGCCAGAGGGCTGCCAGAGGGCAAGCGAGCTGCGTTGCCTGGAAAAAGTTAAACACACAGAGAGCATGGTGGCTCTGATACTTTCTAGAAGGATTAAAGTCACTTTCCCAGTCTTTATGAGAATTGGGCCGAAGCTTAGCTGGTGCAACGAATTTAGAAATGAATGCACTTGCATTTGA
    >NC_019458.2 Ovis aries breed Texel chromosome 1, Oar_v4.0, [whole genome shotgun sequence]
    AGATGATGTGTCTTTGCCTTGAgctaaaaattttagaataatctgaACGTCATCTGAGGAACCTGCTTCTGGCGTGGTTTTGGTGTCAGCATCTTCTCACCCTCTCTAGTAATTTTCAGTATGCATTTCTATTTTCGTGTAGTTATTTACAGGAGCATTTTATGGAAAACCGGCTCAAATCTTTTTGGGTGCAGGGGTAGTTCAAATGCACTGAGACCCTCAGTTTCACTTGCTAATCTC
    >NC_019458.2 Ovis aries breed Texel chromosome 1, Oar_v4.0, [whole genome shotgun sequence]
    CTCCAGAAACCCTGTTCTCCTCGAGTGACAAGGTCAGCAGGGCAGCACGTGTGTTCCTGTCACTGCCAACTCAAGAATATGAAGTTTAAAGAGTTTCACCATCAAATGCAGTGTCGTGGACTGCCCCTGAACAGGTGTTTATAATCACGTGTGCAAGTGAAGCAAGCACAAATCCTCAGTGGAAAACGGGCAGAGGACACGAGCagacaattctttttaaaaactgcacaaATTAGCACA
    >NC_019458.2 Ovis aries breed Texel chromosome 1, Oar_v4.0, [whole genome shotgun sequence]
    CTAGGCACGGATGAGCGTGCCTACCGTGTTGCATGGAGGTAACAGATGCCAGAGCCCGGAGGAGGCGCAAAGCTCACAAACAGATGCGGACCGCAGGAAGCCGGGACGGCCTTCCTCCCCTGAAGCAGGAGGACGCGCCCTACAGAAAGCCGCTCGATCCTCCAGGCATTTGTTGTGAGCACTTAATCATCATTCGATCATTTGACGTGTACTCACTAGTAAAAGGCAGGACTGTGTCCC
    root@PC1:/home/test# sed -i '/^>/! { s/\([a-zA-Z]\{50\}\)/\1\n/g; s/\n$//; }' b.fna   ## 50可修改为任意的碱基数; 仅插入换行, 不丢失任何碱基
    root@PC1:/home/test# cat b.fna
    >NC_019458.2 Ovis aries breed Texel chromosome 1, Oar_v4.0, [whole genome shotgun sequence]
    gataaaaaataaatagaaacaaaatcactgaagaaCCAGTGTGCCTGCTC
    AGGTCAGATGAAGCCAGAGGGCTGCCAGAGGGCAAGCGAGCTGCGTTGCC
    TGGAAAAAGTTAAACACACAGAGAGCATGGTGGCTCTGATACTTTCTAGA
    AGGATTAAAGTCACTTTCCCAGTCTTTATGAGAATTGGGCCGAAGCTTAG
    CTGGTGCAACGAATTTAGAAATGAATGCACTTGCATTTGA
    >NC_019458.2 Ovis aries breed Texel chromosome 1, Oar_v4.0, [whole genome shotgun sequence]
    AGATGATGTGTCTTTGCCTTGAgctaaaaattttagaataatctgaACGT
    CATCTGAGGAACCTGCTTCTGGCGTGGTTTTGGTGTCAGCATCTTCTCAC
    CCTCTCTAGTAATTTTCAGTATGCATTTCTATTTTCGTGTAGTTATTTAC
    AGGAGCATTTTATGGAAAACCGGCTCAAATCTTTTTGGGTGCAGGGGTAG
    TTCAAATGCACTGAGACCCTCAGTTTCACTTGCTAATCTC
    >NC_019458.2 Ovis aries breed Texel chromosome 1, Oar_v4.0, [whole genome shotgun sequence]
    CTCCAGAAACCCTGTTCTCCTCGAGTGACAAGGTCAGCAGGGCAGCACGT
    GTGTTCCTGTCACTGCCAACTCAAGAATATGAAGTTTAAAGAGTTTCACC
    ATCAAATGCAGTGTCGTGGACTGCCCCTGAACAGGTGTTTATAATCACGT
    GTGCAAGTGAAGCAAGCACAAATCCTCAGTGGAAAACGGGCAGAGGACAC
    GAGCagacaattctttttaaaaactgcacaaATTAGCACA
    >NC_019458.2 Ovis aries breed Texel chromosome 1, Oar_v4.0, [whole genome shotgun sequence]
    CTAGGCACGGATGAGCGTGCCTACCGTGTTGCATGGAGGTAACAGATGCC
    AGAGCCCGGAGGAGGCGCAAAGCTCACAAACAGATGCGGACCGCAGGAAG
    CCGGGACGGCCTTCCTCCCCTGAAGCAGGAGGACGCGCCCTACAGAAAGC
    CGCTCGATCCTCCAGGCATTTGTTGTGAGCACTTAATCATCATTCGATCA
    TTTGACGTGTACTCACTAGTAAAAGGCAGGACTGTGTCCC
  • 相关阅读:
    ZOJ 3603字符串操作
    ZOJ 3609 求逆元
    HDOJ 4007 Dave【最大覆盖集】
    HDOJ4006 The kth great number 【串的更改和维护】
    【集训笔记】博弈论相关知识【HDOJ 1850【HDOJ2147
    【集训笔记】母函数【母函数模板】【HDOJ1028【HDOJ1085
    【集训笔记】【大数模板】特殊的数 【Catalan数】【HDOJ1133【HDOJ1134【HDOJ1130
    【集训笔记】动态规划背包问题【HDOJ1421【HDOJ1058【HDOJ2546
    【集训笔记】动态规划【HDOJ1159【HDOJ1003
    【集训笔记】二分图及其应用【HDOJ1068【HDOJ1150【HDOJ1151
  • 原文地址:https://www.cnblogs.com/liujiaxin2018/p/15586906.html
Copyright © 2011-2022 走看看