zoukankan      html  css  js  c++  java
  • 003生信人必练

    1. gtf 文件

      序列的编号 注释信息的来源 注释信息的类型 开始与结束的位置  得分  序列的方向  起始编码的位置,仅对CDS有效  注释信息描述    
      11 ensembl_havana gene 5422111 5423206   ”.”表示为空。  +表示正义链, -反义链 , ? 表示未知.  有效值为0、1、2  键+值    

      11      ensembl_havana  gene    5422111 5423206 .       +       .       gene_id "ENSG00000167360"; gene_version "4"; gene_name "OR51Q1"; gene_source "ensembl_havana"; gene_biotype "protein_coding";
      11      ensembl_havana  transcript      5422111 5423206 .       +       .       gene_id "ENSG00000167360"; gene_version "4"; transcript_id "ENST00000300778"; transcript_version "4"; gene_name "OR51Q1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR51Q1-001"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS31381";
      11      ensembl_havana  exon    5422111 5423206 .       +       .       gene_id "ENSG00000167360"; gene_version "4"; transcript_id "ENST00000300778"; transcript_version "4"; exon_number "1"; gene_name "OR51Q1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR51Q1-001"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS31381"; exon_id "ENSE00001276439"; exon_version "4";
      11      ensembl_havana  CDS     5422201 5423151 .       +       0       gene_id "ENSG00000167360"; gene_version "4"; transcript_id "ENST00000300778"; transcript_version "4"; exon_number "1"; gene_name "OR51Q1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR51Q1-001"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS31381"; protein_id "ENSP00000300778"; protein_version "4";

    2. 探究内容

    3. import sys
      import re
      
      args = sys.argv
      '''sys.argv 是命令行参数,是一个字符串列表,0代表其路径'''
      
      
      class Genome_info:      #这是一个基类,所有类型都通用的
          def __init__(self):
              self.chr = ""   #染色体号
              self.start = 0
              self.end = 0
      
      
      class Gene(Genome_info):   #这个函数继承了基类
          def __init__(self):
              Genome_info.__init__(self)
              self.orientation = ""
              self.id = ""
      
      
      class Transcript(Genome_info):
          def __init__(self):
              Genome_info.__init__(self)
              self.id = ""
              self.parent = ""
      
      
      class Exon(Genome_info):
          def __init__(self):
              Genome_info.__init__(self)
              self.parent = ""
      
      
      def main(args):
          """
          一个输入参数:
          第一个参数为物种gtf文件
      
          :return:
          """
      
          list_chr = []
          list_gene = {}   #因为有 id 号 所以用dir
          list_transcript = {}
          list_exon = []
          # l_n = 0
          with open(args[1]) as fp_gtf:        #打开文件遍历GTF
              for line in fp_gtf:
                  if line.startswith("#"):      #号开头的注释过滤掉,不统计这个
                      continue
                  # print ("in %s" % l_n)
                  # l_n += 1
                  lines = line.strip("
      ").split("	")
                  chr = lines[0]
                  type = lines[2]
                  start = int(lines[3])
                  end = int(lines[4])
                  orientation = lines[6]
                  attr = lines[8]
                  if not re.search(r'protein_coding', attr):   #取到的是蛋白的编码,如果没有的话就跳过,不做统计了
                      continue
      
                  if not chr in list_chr:    #把染色体的类型添加到列表里
                      list_chr.append(chr)
      
                  if type == "gene":
                      gene = Gene()          #初始化一个基因对象
                      id = re.search(r'gene_id "([^;]+)";?', attr).group(1)  # 0 返回所有列表,1取第一个
                      gene.chr = chr
                      gene.start = start
                      gene.end = end
                      gene.id = id
                      gene.orientation = orientation
                      list_gene[id] = gene
                      # print(id)
                  elif type == "transcript":
                      transcript = Transcript()
                      id = re.search(r'transcript_id "([^;]+)";?', attr).group(1)
                      parent = re.search(r'gene_id "([^;]+)";?', attr).group(1)
                      if not parent in list_gene:
                          continue
                      transcript.chr = chr
                      transcript.start = start
                      transcript.end = end
                      transcript.id = id
                      transcript.parent = parent
                      list_transcript[id] = transcript
      
                  elif type == "exon":
                      exon = Exon()
                      parent = re.search(r'transcript_id "([^;]+)";?', attr).group(1)
                      if not parent in list_transcript:
                          continue
                      exon.chr = chr
                      exon.start = start
                      exon.end = end
                      exon.parent = parent
                      list_exon.append(exon)
      
          chr_gene(list_gene)
          gene_len(list_gene)
          gene_transcript(list_transcript)
          transcript_exon(list_exon)
          exon_pos(list_exon)
      
      
      def chr_gene(list_gene):
          """
          染色体上基因数量分布
      
          :param list_gene:
          :return:
          """
      
          print("染色体上基因数量分布")
          count_gene = {}     #这是一个计数器
          for info in list_gene.values():
              chr = info.chr
              if chr in count_gene:
                  count_gene[info.chr] += 1
              else:
                  count_gene[info.chr] = 1
          with open("chr_gene.txt", 'w') as fp_out:
              for chr, num in count_gene.items():
                  print("	".join([chr, str(num)]) + "
      ")
                  fp_out.write("	".join([chr, str(num)]) + "
      ")
      
      
      def gene_len(list_gene):
          """
          基因长度分布情况
      
          :param list_gene:
          :return:
          """
      
          print("基因长度分布情况")
          with open("gene_len.txt", 'w') as fp_out:
              for gene_id, info in list_gene.items():
                  len = info.end - info.start + 1
                  fp_out.write("	".join([gene_id, str(len)]) + "
      ")
                  print("	".join([gene_id, str(len)]) + "
      ")
      
      
      def gene_transcript(list_transcript):
          """
          基因的转录本数量分布
      
          :param list_transcript:
          :return:
          """
      
          print("基因的转录本数量分布")
          count_transcript = {}
          for info in list_transcript.values():
              gene_id = info.parent
              if gene_id in count_transcript:
                  count_transcript[gene_id] += 1
              else:
                  count_transcript[gene_id] = 1
          with open("gene_transcript.txt", 'w') as fp_out:
              for gene_id, num in count_transcript.items():
                  print("	".join([gene_id, str(num)]) + "
      ")
                  fp_out.write("	".join([gene_id, str(num)]) + "
      ")
      
      
      def transcript_exon(list_exon):
          """
          转录本的外显子数量统计
      
          :param list_exon:
          :return:
          """
      
          print("转录本的外显子数量统计")
          count_exon = {}
          for exon in list_exon:
              transcript_id = exon.parent
              if transcript_id in count_exon:
                  count_exon[transcript_id] += 1
              else:
                  count_exon[transcript_id] = 1
          with open("transcript_exon.txt", 'w') as fp_out:
              for transcript_id, num in count_exon.items():
                  print("	".join([transcript_id, str(num)]) + "
      ")
                  fp_out.write("	".join([transcript_id, str(num)]) + "
      ")
      
      
      def exon_pos(list_exon):
          """
          外显子坐标统计
      
          :param list_exon:
          :return:
          """
      
          print("外显子坐标统计")
          count_exon = {}
          for exon in list_exon:
              transcript_id = exon.parent
              if transcript_id in count_exon:
                  count_exon[transcript_id] += ",%s-%s" % (str(exon.start), str(exon.end))
              else:
                  count_exon[transcript_id] = "%s-%s" % (str(exon.start), str(exon.end))
          with open("exon_pos.txt", 'w') as fp_out:
              for transcript_id, pos in count_exon.items():
                  print("	".join([transcript_id, pos]) + "
      ")
                  fp_out.write("	".join([transcript_id, pos]) + "
      ")
      
      
      def gene_exon_pos(list_gene, list_transcript, list_exon):
          """
          根据exon的parent将所有exon对应到transcript
          根据transcript的parent将所有transcript对应到gene
          根据gene按chr分组得到chromosome列表
      
          从chromosome中输出某个指定基因的所有外显子坐标信息并画图
          生信编程直播第五题
      
          :param list_gene:
          :param list_transcript:
          :param list_exon:
          :return:
          """
          pass
      
      
      if __name__ == "__main__":
          main(args)
      

        

  • 相关阅读:
    [JLOI2015]有意义的字符串
    二阶常系数线性齐次递推式的特征方程
    CH1812 生日礼物
    CH1809 匹配统计
    CH1808 Milking Grid
    BZOJ4025 二分图
    BZOJ3514 GERALD07加强版
    NOI2014 魔法森林
    WC2006 水管局长数据加强版
    LG3690 【模板】Link Cut Tree 和 SDOI2008 洞穴勘测
  • 原文地址:https://www.cnblogs.com/think-and-do/p/6388165.html
Copyright © 2011-2022 走看看