zoukankan      html  css  js  c++  java
  • 不同的方法从gtf中提取相关基因组信息in R

    Method One:

    library(GenomicRanges)

    library(GenomicFeatures)
    library(annotatr)
    makeTxDbFromGFF
    txdb <- annotatr::makeTxDbFromGFF(gff_file, format="gtf")

    GRanges(txdb)

    ebg <- transcriptsBy(txdb, by="seqlevels")

    anno_info <-  transcriptsBy(txdb, by=c("gene", "exon", "cds",'three_prime_utr','five_prime_utr'), use.names=FALSE)
    ####abstracting information about mouse gene names

    library(org.Mm.eg.db)

    x = get(sprintf('org.Mm.egSYMBOL', 'Mm10'))
    mapped_genes = mappedkeys(x)
    eg2symbol = as.data.frame(x[mapped_genes])
    eg2symbol$ensemble <- mapIds(org.Mm.eg.db,keys = eg2symbol$gene_id,keytype = "ENTREZID",column = "ENSEMBL")

    print(eg2symbol)

    Method Two:

    source("https://bioconductor.org/biocLite.R")

    biocLite('GenomicFeatures')

    library('GenomicFeatures')

    biocLite("GenomicRanges")

    library('GenomicRanges')

    library(rtracklayer)

    library(GenomicAlignments)

    library("AnnotationDbi")

    library("GenomicFeatures")

    test <- import(gtfFile, format = "gtf")

    #Abstract the first rank exon

    exon_1=subset(mcols(test),mcols(test)$exon_number=="1")

    #Abstract reads from bam files 

    reads <- readGAlignments('./test.bam')

    reads <- as(reads, "GRanges") #change the class from dataframe to GRanges

    txdb = makeTxDbFromGFF('./Homo_sapiens.GRCh38.87.gtf',format = "gtf",circ_seqs = character())

    #Abstract sites information

    genes <- genes(txdb)

    TS=transcripts(txdb)

    EX=exons(txdb)

    EX_hits<- findOverlaps(reads, EX)

    hits <- findOverlaps(reads, genes)

     Method Three:

    library(plyr)

    pfgtf <- file.path("./Homo_sapiens.GRCh38.87.gtf")

    gtf <- read.table(pfgtf, head = F, sep = " ", stringsAsFactors = F)

    test <- data.frame(do.call(rbind, strsplit(gtf$V9, split = "[; ]")), stringsAsFactors=F)

    valueFind <- function(x, type = "gene_id"){

      for(i in 1:length(x)){

        if(sum(x == type) == 0){

          return(NA)

        }else{

          return(x[which(x == type)[1]+1])

        }

      }

    }

    gtf_infor <- data.frame(gene_id = as.vector(apply(test,1,valueFind)),            

                            transcript_id = as.vector(apply(test,1,function(x){valueFind(x,type = "transcript_id")})),

                            gene_name = as.vector(apply(test,1,function(x){valueFind(x,type = "gene_name")})), 

                            gene_biotype = as.vector(apply(test,1,function(x){valueFind(x,type = "gene_biotype")})),

                            exon_id = as.vector(apply(test,1,function(x){valueFind(x,type = "exon_id")})),

                            exon_number = as.vector(apply(test,1,function(x){valueFind(x,type = "exon_number")})),  stringsAsFactors=F)

    gtf_info <- cbind(gtf[,c(1,3,4,5,7)], gtf_infor)

    gtf_exon_info <- gtf_info[gtf_info$V3 == "exon", ]

    colnames(gtf_exon_info)[1:5] <- c("chr", "type", "start", "end", "strand")

    gtf_exon_info$exon_number <- as.numeric(gtf_exon_info$exon_number)

    gtf_exon_info$exon_loci <- paste(gtf_exon_info$chr,":", gtf_exon_info$start, "-", gtf_exon_info$end, "_", gtf_exon_info$strand, sep = "")

    data_split <- dlply(gtf_exon_info, .variables = "gene_id")

    first_exon <- lapply(data_split, function(x){

      x[x$exon_number == 1, ]

    })

    biocLit('IRanges')

    library('IRanges')

    ####Compile first exons into  IRanges object

    IRanges_first_exon = lapply(first_exon,function(x) IRanges(x[,3],x[,4]))

    first_exon_overlap = lapply(IRanges_first_exon, function(x) findOverlaps(IRanges_reads, x))  

    ####统计每个基因上第一个外显子的位置可以与我找出来reads,相互overlap到的数目                              

    overlap_summary = lapply(first_exon_overlap,function(x) summary(x)[1])

    ###找出有map上reads的外显子个数,以及上面的reads数

    exist_overlap_reads=unlist(overlap_summary)[unlist(overlap_summary)!=‘0’]

    length(exist_overlap_reads)

    every_gene_map_reads=as.numeric(exist_overlap_reads)

    Method Four:

    rtracklayer::readGFF(gtfFile)

    importGTF(file , skip = 5 , nrow = -1, use.data.table = TRUE , level = 'gene' , features  = NULL , print.feature = FALSE)

  • 相关阅读:
    Git for Android Studio 学习笔记
    ACM-线段树区间更新+离散化
    hdu 1394 逆序数(线段树)
    Android瀑布流照片
    Android照片墙-多图加载
    Android-加载图片避免OOM
    Android-自定义View实现ImageView播放gif
    maven---工程建立及目录添加--
    oracle--视图(2)---
    Hibernate---Hql查询2---
  • 原文地址:https://www.cnblogs.com/beckygogogo/p/9865911.html
Copyright © 2011-2022 走看看