#!/usr/bin/python
"""Print the concatenated CDS sequence of every mRNA described in a GFF file.

Run as a script, this reads the annotation from a file named ``gff`` and the
sequences from a file named ``fa`` (both in the current directory) and prints
one ``<mRNA id>   <sequence>`` line per mRNA.

NOTE(review): CDS coordinates are converted to offsets relative to the mRNA
span, counted from the mRNA's 5' end (see ``deal_gff``), and those offsets are
then used to slice the FASTA record whose name equals the GFF seqid.  That only
yields the real CDS if each FASTA record is the mRNA span itself, already in
transcript orientation.  If the FASTA holds whole scaffolds, absolute
coordinates and reverse-complementing would be needed instead -- confirm
against the data this script is run on.
"""
import re


def readfa(l):
    """Read the FASTA file at path ``l``.

    Returns a dict mapping each record name (the first whitespace-delimited
    token after ``>``) to the list of its sequence lines with all whitespace
    removed.  Text before the first header is ignored; if a name occurs
    twice, the later record wins.
    """
    col = {}
    arr = None  # lines of the record being read; None until a header is seen
    with open(l) as li:
        for line in li:
            if line.startswith('>'):
                fields = line[1:].split()
                sca = fields[0] if fields else ''
                # Start the new list *before* registering it, so the lines
                # that follow are stored under this header rather than under
                # the next one.
                arr = []
                col[sca] = arr
            elif arr is not None:
                # Strip every whitespace character (including the newline):
                # anything left behind would shift later slice offsets.
                without = re.sub(r'\s', '', line)
                if without:
                    arr.append(without)
    return col


def _attribute(attributes, key):
    """Return the value of ``key`` in a GFF attribute string, or None."""
    # Anchor on start-of-string or ';' so that e.g. 'ID' never matches inside
    # another key, and do not require a trailing ';' after the value.
    match = re.search(r'(?:^|;)\s*' + re.escape(key) + r'=([^;]*)', attributes)
    if match is None:
        return None
    return match.group(1).strip()


def readgff(l):
    """Read mRNA and CDS features from the GFF file at path ``l``.

    Returns a dict mapping each mRNA ``ID`` to
    ``[start, end, cds_list, seqid, strand]`` where ``start``/``end`` are the
    mRNA coordinates as strings and ``cds_list`` holds one ``[start, end]``
    pair of strings per CDS whose ``Parent`` names that mRNA.

    Comment lines, blank lines and lines with fewer than nine columns are
    skipped, as are mRNA features without an ``ID`` and CDS features whose
    parent has not been seen as an mRNA.
    """
    col = {}
    with open(l) as li:
        for line in li:
            if line.startswith('#'):
                continue
            # Split at most 8 times so the attribute column stays in one
            # piece even when its values contain spaces.
            sp = line.strip().split(None, 8)
            if len(sp) < 9:
                continue
            if sp[2] == 'mRNA':
                gene = _attribute(sp[8], 'ID')
                if gene:
                    col[gene] = [sp[3], sp[4], [], sp[0], sp[6]]
            elif sp[2] == 'CDS':
                parents = _attribute(sp[8], 'Parent')
                if not parents:
                    continue
                # A CDS may list several parents separated by commas.
                for gene in parents.split(','):
                    if gene in col:
                        col[gene][2].append([sp[3], sp[4]])
    return col


def deal_gff(l):
    """Convert the CDS coordinates from ``readgff`` into slice offsets.

    ``l`` maps mRNA id -> ``[start, end, cds_list, seqid, strand]``.  The
    result is a new dict mapping mRNA id -> ``[segments, seqid]`` where each
    segment is a half-open ``[begin, stop]`` pair of ints measured from the
    mRNA's 5' end: from ``start`` on the forward strand, from ``end`` (walking
    backwards) when the strand is ``'-'``.  Any strand other than ``'-'`` is
    treated as forward.  Segments are sorted so they come out in transcript
    order regardless of the order of the CDS lines.  ``l`` is not modified.
    """
    result = {}
    for gene, value in l.items():
        start, end, cds_list, sca, strand = value[:5]
        start = int(start)
        end = int(end)
        segments = []
        for single in cds_list:
            cds_start = int(single[0])
            cds_end = int(single[1])
            if strand == '-':
                segments.append([end - cds_end, end - cds_start + 1])
            else:
                segments.append([cds_start - start, cds_end - start + 1])
        segments.sort()
        result[gene] = [segments, sca]
    return result


def extract_cds(models, fasta):
    """Join the CDS segments of every mRNA into one sequence string.

    ``models`` is the output of ``deal_gff`` and ``fasta`` the output of
    ``readfa``.  Returns a dict mapping mRNA id -> concatenated sequence.
    mRNAs whose seqid is missing from ``fasta`` or has no sequence are left
    out.
    """
    joined = {}  # seqid -> full sequence, joined once per record
    col = {}
    for gene, (segments, sca) in models.items():
        if sca not in joined:
            joined[sca] = ''.join(fasta.get(sca, ()))
        lon = joined[sca]
        if not lon:
            continue
        col[gene] = ''.join(lon[begin:stop] for begin, stop in segments)
    return col


def main(gff_path='gff', fa_path='fa'):
    """Read the two input files and print one line per mRNA."""
    models = deal_gff(readgff(gff_path))
    fasta = readfa(fa_path)
    for gene, sequence in extract_cds(models, fasta).items():
        # Three spaces: the same text the original
        # `print k1," ",v1` statement produced.
        print("%s   %s" % (gene, sequence))


if __name__ == '__main__':
    main()