- Method One:
- library(GenomicRanges)
- library(GenomicFeatures)
- library(annotatr)
- makeTxDbFromGFF
- txdb <- annotatr::makeTxDbFromGFF(gff_file, format="gtf")
- GRanges(txdb)
- ebg <- transcriptsBy(txdb, by="seqlevels")
- anno_info <- transcriptsBy(txdb, by=c("gene", "exon", "cds",'three_prime_utr','five_prime_utr'), use.names=FALSE)
- ####abstracting information about mouse gene names
- library(org.Mm.eg.db)
- x = get(sprintf('org.Mm.egSYMBOL', 'Mm10'))
- mapped_genes = mappedkeys(x)
- eg2symbol = as.data.frame(x[mapped_genes])
- eg2symbol$ensemble <- mapIds(org.Mm.eg.db,keys = eg2symbol$gene_id,keytype = "ENTREZID",column = "ENSEMBL")
- print(eg2symbol)
- Method Two:
- source("https://bioconductor.org/biocLite.R")
- biocLite('GenomicFeatures')
- library('GenomicFeatures')
- biocLite("GenomicRanges")
- library('GenomicRanges')
- library(rtracklayer)
- library(GenomicAlignments)
- library("AnnotationDbi")
- library("GenomicFeatures")
- test <- import(gtfFile, format = "gtf")
- #Abstract the first rank exon
- exon_1=subset(mcols(test),mcols(test)$exon_number=="1")
- #Abstract reads from bam files
- reads <- readGAlignments('./test.bam')
- reads <- as(reads, "GRanges") #change the class from dataframe to GRanges
- txdb = makeTxDbFromGFF('./Homo_sapiens.GRCh38.87.gtf',format = "gtf",circ_seqs = character())
- #Abstract sites information
- genes <- genes(txdb)
- TS=transcripts(txdb)
- EX=exons(txdb)
- EX_hits<- findOverlaps(reads, EX)
- hits <- findOverlaps(reads, genes)
- Method Three:
- library(plyr)
- pfgtf <- file.path("./Homo_sapiens.GRCh38.87.gtf")
- gtf <- read.table(pfgtf, head = F, sep = "\t", stringsAsFactors = F)
- test <- data.frame(do.call(rbind, strsplit(gtf$V9, split = "[; ]")), stringsAsFactors=F)
- valueFind <- function(x, type = "gene_id"){
- for(i in 1:length(x)){
- if(sum(x == type) == 0){
- return(NA)
- }else{
- return(x[which(x == type)[1]+1])
- }
- }
- }
- gtf_infor <- data.frame(gene_id = as.vector(apply(test,1,valueFind)),
- transcript_id = as.vector(apply(test,1,function(x){
- valueFind(x,type = "transcript_id")
- })),
- gene_name = as.vector(apply(test,1,function(x){
- valueFind(x,type = "gene_name")
- })),
- gene_biotype = as.vector(apply(test,1,function(x){
- valueFind(x,type = "gene_biotype")
- })),
- exon_id = as.vector(apply(test,1,function(x){
- valueFind(x,type = "exon_id")
- })),
- exon_number = as.vector(apply(test,1,function(x){
- valueFind(x,type = "exon_number")
- })), stringsAsFactors=F)
- gtf_info <- cbind(gtf[,c(1,3,4,5,7)], gtf_infor)
- gtf_exon_info <- gtf_info[gtf_info$V3 == "exon", ]
- colnames(gtf_exon_info)[1:5] <- c("chr", "type", "start", "end", "strand")
- gtf_exon_info$exon_number <- as.numeric(gtf_exon_info$exon_number)
- gtf_exon_info$exon_loci <- paste(gtf_exon_info$chr,":", gtf_exon_info$start, "-", gtf_exon_info$end, "_", gtf_exon_info$strand, sep = "")
- data_split <- dlply(gtf_exon_info, .variables = "gene_id")
- first_exon <- lapply(data_split, function(x){
- x[x$exon_number == 1, ]
- })
- biocLit('IRanges')
- library('IRanges')
- ####Compile first exons into IRanges object
- IRanges_first_exon = lapply(first_exon,function(x) IRanges(x[,3],x[,4]))
- first_exon_overlap = lapply(IRanges_first_exon, function(x) findOverlaps(IRanges_reads, x))
- #### 统计每个基因上第一个外显子的位置可以与我找出来 reads, 相互 overlap 到的数目
- overlap_summary = lapply(first_exon_overlap,function(x) summary(x)[1])
- ### 找出有 map 上 reads 的外显子个数, 以及上面的 reads 数
- exist_overlap_reads=unlist(overlap_summary)[unlist(overlap_summary)!='0']
- length(exist_overlap_reads)
- every_gene_map_reads=as.numeric(exist_overlap_reads)
- Method Four:
- rtracklayer::readGFF(gtfFile)
- importGTF(file , skip = 5 , nrow = -1, use.data.table = TRUE , level = 'gene' , features = NULL , print.feature = FALSE)
来源: http://www.bubuko.com/infodetail-2983910.html