1、测试数据下载:ftp://ftp.ensemblgenomes.org/pub/plants/release-44/gff3/arabidopsis_thaliana/Arabidopsis_thaliana.TAIR10.44.chromosome.1.gff3.gz
2、测试:
root@PC1:/home/test# wc -l *
221871 a.txt
13 test.py
221884 total
root@PC1:/home/test# tail -n 3 a.txt    ## gff文件
1 araport11 exon 30424758 30425192 . + . Parent=transcript:AT1G80990.1;Name=AT1G80990.1.exon2;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=AT1G80990.1.exon2;rank=2
1 araport11 CDS 30424758 30425192 . + 0 ID=CDS:AT1G80990.1;Parent=transcript:AT1G80990.1;protein_id=AT1G80990.1
###
root@PC1:/home/test# cat test.py
fr=open("a.txt", "r")
fw=open("output.txt", "w")
for line in fr:
    line = line.strip() ## 删除换行符
    if line.startswith("#"): ## 过滤掉#
        continue
    tmp = line.split("\t") ## 依据制表符对每一行进行拆分
    if int(tmp[0]) == 1 and tmp[2] == "gene" and int(tmp[3]) > 100000 and int(tmp[4]) < 500000: ## 过滤
        gene = tmp[8].split(";")[0].split("=")[1] ## gene列单独筛选
        final = tmp[0] + "\t" + tmp[3] + "\t" + tmp[4] + "\t" + gene ## 字符串拼接
        fw.write(final + "\n") ## 写入文件
fr.close()
fw.close()
root@PC1:/home/test# python3 test.py
root@PC1:/home/test# ls
a.txt output.txt test.py
root@PC1:/home/test# wc -l *
221871 a.txt
128 output.txt    ## 结果文件
13 test.py
222012 total
root@PC1:/home/test# head -n 2 output.txt    ## 查看结果
1 104440 105330 gene:AT1G01250
1 108946 111699 gene:AT1G01260