Hi, I'm trying to filter my analysis to only specific gene types ( protein coding, link, ribosomal, rRNA). I'm starting from RNA seq counts. The GTF file is downloaded from: ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_24/gencode.v24.annotation.gtf.gz Here is the code I have until now, and I'd like to add "gene_type" column to the output file:
getbiocGet("GenomicRanges")
biocGet("rtracklayer")
biocGet("genoset")
library(GenomicRanges)
library(rtracklayer)
library(genoset)
GTFfile = "./gencode.v24.annotation.gtf"
GTF <- import.gff(con=GTFfile , format="gtf", genome="GRCh38.p5", feature.type="exon")
#Load the annotation and reduce it to get gene length
#GTF <- import.gff(GTFfile, format="gtf", genome="GRCm38.71", asRangedData=F, feature.type="exon")
grl <- reduce(split(GTF, elementMetadata(GTF)$gene_id))
reducedGTF <- unlist(grl, use.names=T)
elementMetadata(reducedGTF)$gene_id <- rep(names(grl), elementNROWS(grl))
elementMetadata(reducedGTF)$widths <- width(reducedGTF)
calc_length <- function(x) {
sum(elementMetadata(x)$widths)
}
output <- sapply(split(reducedGTF, elementMetadata(reducedGTF)$gene_id), calc_length)
colnames(output) <- c("Length")
Thanks!