不同的方法从 gtf 中提取相关基因组信息 in R

Method One:
library(GenomicRanges)
library(GenomicFeatures)
library(annotatr)
makeTxDbFromGFF
txdb <- annotatr::makeTxDbFromGFF(gff_file, format="gtf")
GRanges(txdb)
ebg <- transcriptsBy(txdb, by="seqlevels")
anno_info <-  transcriptsBy(txdb, by=c("gene", "exon", "cds",'three_prime_utr','five_prime_utr'), use.names=FALSE)
####abstracting information about mouse gene names
library(org.Mm.eg.db)
x = get(sprintf('org.Mm.egSYMBOL', 'Mm10'))
mapped_genes = mappedkeys(x)
eg2symbol = as.data.frame(x[mapped_genes])
eg2symbol$ensemble <- mapIds(org.Mm.eg.db,keys = eg2symbol$gene_id,keytype = "ENTREZID",column = "ENSEMBL")
print(eg2symbol)
Method Two:
source("https://bioconductor.org/biocLite.R")
biocLite('GenomicFeatures')
library('GenomicFeatures')
biocLite("GenomicRanges")
library('GenomicRanges')
library(rtracklayer)
library(GenomicAlignments)
library("AnnotationDbi")
library("GenomicFeatures")
test <- import(gtfFile, format = "gtf")
#Abstract the first rank exon
exon_1=subset(mcols(test),mcols(test)$exon_number=="1")
#Abstract reads from bam files
reads <- readGAlignments('./test.bam')
reads <- as(reads, "GRanges") #change the class from dataframe to GRanges
txdb = makeTxDbFromGFF('./Homo_sapiens.GRCh38.87.gtf',format = "gtf",circ_seqs = character())
#Abstract sites information
genes <- genes(txdb)
TS=transcripts(txdb)
EX=exons(txdb)
EX_hits<- findOverlaps(reads, EX)
hits <- findOverlaps(reads, genes)
Method Three:
library(plyr)
pfgtf <- file.path("./Homo_sapiens.GRCh38.87.gtf")
gtf <- read.table(pfgtf, head = F, sep = "\t", stringsAsFactors = F)
test <- data.frame(do.call(rbind, strsplit(gtf$V9, split = "[; ]")), stringsAsFactors=F)
valueFind <- function(x, type = "gene_id"){
	for(i in 1:length(x)){
	if(sum(x == type) == 0){
	return(NA)	
}else{
	return(x[which(x == type)[1]+1])	
}	
}	
}
gtf_infor <- data.frame(gene_id = as.vector(apply(test,1,valueFind)),
transcript_id = as.vector(apply(test,1,function(x){
	valueFind(x,type = "transcript_id")	
})),
gene_name = as.vector(apply(test,1,function(x){
	valueFind(x,type = "gene_name")	
})),
gene_biotype = as.vector(apply(test,1,function(x){
	valueFind(x,type = "gene_biotype")	
})),
exon_id = as.vector(apply(test,1,function(x){
	valueFind(x,type = "exon_id")	
})),
exon_number = as.vector(apply(test,1,function(x){
	valueFind(x,type = "exon_number")	
})),  stringsAsFactors=F)
gtf_info <- cbind(gtf[,c(1,3,4,5,7)], gtf_infor)
gtf_exon_info <- gtf_info[gtf_info$V3 == "exon", ]
colnames(gtf_exon_info)[1:5] <- c("chr", "type", "start", "end", "strand")
gtf_exon_info$exon_number <- as.numeric(gtf_exon_info$exon_number)
gtf_exon_info$exon_loci <- paste(gtf_exon_info$chr,":", gtf_exon_info$start, "-", gtf_exon_info$end, "_", gtf_exon_info$strand, sep = "")
data_split <- dlply(gtf_exon_info, .variables = "gene_id")
first_exon <- lapply(data_split, function(x){
	x[x$exon_number == 1, ]	
})
biocLit('IRanges')
library('IRanges')
####Compile first exons into  IRanges object
IRanges_first_exon = lapply(first_exon,function(x) IRanges(x[,3],x[,4]))
first_exon_overlap = lapply(IRanges_first_exon, function(x) findOverlaps(IRanges_reads, x))
#### 统计每个基因上第一个外显子的位置可以与我找出来 reads, 相互 overlap 到的数目
overlap_summary = lapply(first_exon_overlap,function(x) summary(x)[1])
### 找出有 map 上 reads 的外显子个数, 以及上面的 reads 数
exist_overlap_reads=unlist(overlap_summary)[unlist(overlap_summary)!='0']
length(exist_overlap_reads)
every_gene_map_reads=as.numeric(exist_overlap_reads)
Method Four:
rtracklayer::readGFF(gtfFile)
importGTF(file , skip = 5 , nrow = -1, use.data.table = TRUE , level = 'gene' , features  = NULL , print.feature = FALSE)
来源: http://www.bubuko.com/infodetail-2983910.html
与本文相关文章

暂无,快来抢沙发吧！