|
require(GenomicFeatures) |
|
require(biomaRt) |
|
#set this to where your tophat_out directory is. |
|
working_directory = "/some_directory/" |
|
setwd(working_directory) |
|
#make a database of transcripts from the ensembl assembly |
|
#first get the current release 69 from ensembl, change this to your species if not using mice |
|
txdb <- makeTranscriptDbFromBiomart(biomart="ensembl",dataset = 'mmusculus_gene_ensembl') |
|
#make exon and transcript annotation objects |
|
save(txdb, file="txdb.Robject") |
|
exons <- exons(txdb, columns=c('gene_id', 'tx_id', 'tx_name', 'tx_chrom', 'tx_strand', 'exon_id', 'exon_name', 'cds_id', 'cds_name', 'cds_chrom', 'cds_strand', 'exon_rank')) |
|
save(exons, file="exons.Robject") |
|
transcripts <- transcripts(txdb, columns=c('gene_id', 'tx_id', 'tx_name', 'exon_id', 'exon_name', 'exon_chrom', 'exon_strand', 'cds_id', 'cds_name', 'cds_chrom', 'cds_strand', 'exon_rank')) |
|
|
|
|
|
require(GenomicRanges) |
|
require(Rsamtools) |
|
|
|
#set list of sample ids as a vector |
|
sample_ids = seq(12849,12858) |
|
|
|
transcript.countsTable <- data.frame( |
|
row.names = as.vector(unlist(elementMetadata(transcripts)['tx_name']))) |
|
exon.countsTable <- data.frame( |
|
row.names = as.vector(unlist(elementMetadata(exons)['exon_name']))) |
|
|
|
#this forloop iterates over the sample_ids and generates exon and transcript counts for each sample_id |
|
for(sample_id in sample_ids) { |
|
#read alignment |
|
align <- readBamGappedAlignments(sprintf("tophat_out/Sample_%s/accepted_hits.bam", sample_id)) |
|
#count the overlapping reads for the transcripts |
|
transcript.counts <- countOverlaps(transcripts, align) |
|
#reassign to a specific transcript.counts object. |
|
assign(sprintf("transcript.counts.%s", sample_id), transcript.counts) |
|
#add this column to the countsTable |
|
transcript.countsTable <- cbind(transcript.countsTable,transcript.counts) |
|
remove(transcript.counts) |
|
#count the overlapping reads for the exons |
|
exon.counts <- countOverlaps(exons, align) |
|
#reassign to a specific transcript.counts object. |
|
assign(sprintf("exon.counts.%s", sample_id), exon.counts) |
|
#add this column to the countsTable |
|
exon.countsTable <- cbind(exon.countsTable, exon.counts) |
|
#remove the align, transcript.counts and exon.counts objects for the next loop |
|
remove(align) |
|
remove(transcript.counts) |
|
remove(exon.counts) |
|
} |
|
|
|
summary(transcriptCountsTable) |
|
summary(exonCountsTable) |
|
|
|
#write these two counts tables to csv files. |
|
write.csv(transcriptCountsTable, "transcript_counts_table.csv") |
|
write.csv(exonCountsTable, "exon_counts_table.csv") |
what are the identifiers in your counts table? For example, i make my counts table based on ensembl transcripts so i have an ENSMUST for each rowname.
This is an example of the identifer I'm using gi|222109225|ref|NC_011992.1|:c1488979-1488104 I'm a novice at this so I'm not quite sure what thats for