zoukankan      html  css  js  c++  java
  • spark kmer计算

    • 输入文件:fa格式的文件
    • 输出结果:kmer的频数和对应的kmer类型和计数
      1.将fq.gz的文件转换成fa文件:
    #!/usr/bin/python env
    # -*- coding:utf-8 -*-
    import os
    import re
    import os.path
    import gzip
    import sys
    
    
    #在这里可以写一个函数用来将文件转换成id和序列对应的字典
    #需要用到哪个转化操作呢?考虑先尝试使用filter或者map
    
    
    ''' 
    @r261DFRAAXX100204:1:100:10494:3070/1
    ACTGCATCCTGGAAAGAATCAATGGTGGCCGGAAAGTGTTTTTCAAATACAAGAGTGACAATGTGCCCTGTTGTTT
    +
    ACCCCCCCCCCCCCCCCCCCCCCCCCCCCCBC?CCCCCCCCC@@CACCCCCACCCCCCCCCCCCCCCCCCCCCCCC
    '''
    
    
    #这里是利用python直接读取压缩的fastq文件
    def read_gz_file(path):
        if os.path.exists(path):
            with gzip.open(path,'rt') as pf:
                for line in pf:
                    yield line
        else:
            print 'the path [{}] is not exist!'.format(path)
    
    def ReadFastq(fastq):
        flag = 1
        dict_fq={}
        if fastq.endswith('gz'):
            con = read_gz_file(fastq)
            if getattr(con,'__iter__','None'):
                for line in con:
                    line=line.strip()
                    flag_index = flag%4
                    if flag_index == 1:
                        id = line
                    if flag%4 == 2:
                        seq = line
                    else:
                        flag +=1
                        continue
                    dict_fq[id] = seq
                    flag+=1
                return dict_fq
        else:
            with open (fastq,'r') as fqr:
                for line in fqr.readlines():
                    line = line.strip()          
                    flag_index = flag%4
                    if flag_index == 1:
                        id = line
                    if flag%4 == 2:
                        seq = line
                    else:
                        flag +=1
                        continue
                    dict_fq[id] = seq
                    flag+=1
                return dict_fq
    
    
    def convert_to_fa(dict_hash,output):
        with open (output,'w') as fr:
            for i in dict_hash.keys():
                fr.write(i+'
    ')
                fr.write(dict_hash[i]+'
    ')
    
    
    if __name__ == '__main__':
        input = sys.argv[1]
        output = sys.argv[2]
    
        dic_fa = ReadFastq(input)
        convert_to_fa(dic_fa,output)

    2.将reads打断成kmer并统计kmer的频数

    #!/usr/bin/env python
    # coding=utf-8
    import os
    import sys
    import re
    from pyspark import SparkConf, SparkContext
    
    input_fasta_file ='/home/yueyao/Spark/00.data/both.fa'
    
    conf = SparkConf().setMaster("local").setAppName("Yue Yao app")
    sc = SparkContext(conf = conf)
    fasta_file = sc.textFile(input_fasta_file)
    
    #这里是对fasta文件进行转化操作,过滤掉reads的名称
    reads_fa = fasta_file.filter(lambda line :">" not in line)
    
    #这个函数用来将reads打断成kmer,这里的kmer是25,返回一个列表
    def map_file(line):
        seq_lis=[]
        for i in range(len(line)-25+1):
            sub_seq = line[i:i+25]
            seq_lis.append(sub_seq)
        return seq_lis
    
    kmer_list = reads_fa.flatMap(map_file)
    #对打断的kmer进行计数
    kmer_count = kmer_list.map(lambda id:(id,1))
    kmer_total_count = kmer_count.reduceByKey(lambda a,b:(a+b))
    #这里过滤掉了含有N的kmer
    kmer_not_contain_N = kmer_total_count.filter(lambda line :"N" not in line[0])
    kmer_key=kmer_not_contain_N.keys()
    #统计kmer的种类,并计数
    kmer_vari_count = kmer_not_contain_N.map(lambda kmer_vari:(kmer_vari[1],1))
    kmer_histo = kmer_vari_count.reduceByKey(lambda a,b:(a+b))
    #输出kmer频数的结果
    kmer_histo.saveAsTextFile('Kmer25.histo')
    kmer_not_contain_N.saveAsTextFile('kmer25')
    kmer_key.saveAsTextFile('kmer25_key')
  • 相关阅读:
    C# Unity依赖注入
    Spring学习总结
    .Net 上传文件和下载文件
    JavaWeb学习篇--Filter过滤器
    Struts2入门教程
    Ceph 时钟偏移问题 clock skew detected 解决方案--- 部署内网NTP服务
    Erasure Coding(纠删码)深入分析 转
    s3cmd : Add a config parameter to enable path-style bucket access 当ceph rgw使用域名时,需要支持 path-style bucket特性
    ceph rgw java sdk 使用域名访问服务时需要设置s3client的配置项 PathStyleAccess 为true, 负责将报域名异常
    直播流怎么存储在Ceph对象存储上? Linux内存文件系统tmpfs(/dev/shm) 的应用
  • 原文地址:https://www.cnblogs.com/raisok/p/10917678.html
Copyright © 2011-2022 走看看