1.提取gff文件中的HLA基因的相关bed文件。
gff的格式(以制表符分隔,共9列:第1列为染色体,第3列为特征类型,第4、5列为起止坐标,第9列为属性),提取命令及其输出如下:
# Print one record (chrom, start, end, gene_name) for every HLA gene feature.
# GFF/GTF is tab-delimited and column 9 (attributes) contains spaces, so the
# field separator must be a tab for $9 to hold the whole attribute string.
# The regex skips the separator after "gene_name" (space in GTF, "=" in GFF3)
# so the captured name has no leading blank.
# NOTE(review): coordinates are copied unchanged from the GFF (1-based start);
# strict BED expects a 0-based start ($4-1) -- confirm what downstream tools need.
zcat *gz | gawk 'BEGIN{FS="\t";OFS="\t"} $3=="gene"{match($9,/gene_name[ =]*([^;]+)/,a);if(a[1]~/HLA-/){print $1,$4,$5,a[1]}}'

# Output:
chr6	29722775	29738528	"HLA-F"
chr6	29726601	29749049	"HLA-F-AS1"
chr6	29790954	29797811	"HLA-V"
chr6	29800415	29802425	"HLA-P"
chr6	29826967	29831125	"HLA-G"
chr6	29887752	29890482	"HLA-H"
chr6	29896654	29897786	"HLA-T"
chr6	29926459	29929232	"HLA-K"
chr6	29934101	29934286	"HLA-U"
chr6	29941260	29945884	"HLA-A"
chr6	29956596	29958570	"HLA-W"
chr6	30005971	30009956	"HLA-J"
chr6	30259584	30293014	"HLA-L"
chr6	30351416	30351550	"HLA-N"
chr6	30489467	30494205	"HLA-E"
chr6	31268749	31272130	"HLA-C"
chr6	31269491	31357188	"HLA-B"