Hello,
I want to compare (with a PCA) gene expression against copy number variation at gene level in a TCGA project.
When I retrieve the gene expression every value is mapped by sample and gene. But for the copy number variation, I get only chromosomal locations.
To do the PCA, I want to have a table with gene, expression by sample, CNV by sample (same genes and same samples for both datasets). So I'm trying to tag all rows of my CNV dataset with a gene, but the final result in masked.dt is completely incoherent (ie the chromosomal location retrieved with TCGA biolinks package do not match those of the genes I retrieved with Biomart.
Could someone please help?
I'm posting a code sample to reproduce the issue but I'm happy to use any other method...
library(TCGAbiolinks)
library(biomaRt)
patients <- c("TCGA-02-0047", "TCGA-02-2483", "TCGA-06-0129", "TCGA-06-0138",
"TCGA-06-0139", "TCGA-06-0178", "TCGA-06-0221", "TCGA-06-0750",
"TCGA-06-0878", "TCGA-06-0882", "TCGA-06-1804", "TCGA-06-2557",
"TCGA-06-2558", "TCGA-06-2559", "TCGA-06-2562", "TCGA-06-2569",
"TCGA-06-2570", "TCGA-06-5412", "TCGA-06-5416", "TCGA-06-5418",
"TCGA-12-0616", "TCGA-12-3653", "TCGA-15-1444", "TCGA-16-0846",
"TCGA-19-2625", "TCGA-19-4065", "TCGA-26-1442", "TCGA-26-5136",
"TCGA-27-1837", "TCGA-27-2521", "TCGA-27-2526", "TCGA-28-1747",
"TCGA-28-2509", "TCGA-32-1982", "TCGA-32-2632", "TCGA-32-2634",
"TCGA-41-2571", "TCGA-76-4925", "TCGA-76-4928")
query.exp <- GDCquery(project = "TCGA-GBM",
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
workflow.type = "STAR - Counts",
barcode = patients)
query.masked <- GDCquery(project = "TCGA-GBM",
data.category = "Copy Number Variation",
data.type = "Masked Copy Number Segment",
barcode=patients)
GDCdownload(query.exp)
GDCdownload(query.masked)
TCGA.GBM.exp <- GDCprepare(query.exp)
TCGA.GBM.masked <- GDCprepare(query.masked)
setDT(TCGA.GBM.masked)
tpm_unstrand.exp <- head(assays(TCGA.GBM.exp)$"tpm_unstrand")
samples.exp <- colnames(tpm_unstrand.exp)
tpm_unstrand.exp <- as.data.table(tpm_unstrand.exp, keep.rownames = TRUE)
colnames(tpm_unstrand.exp) <- c("Gene", samples.exp)
tpm_unstrand.exp[, Gene := sub("\\.[^.]*$", "", Gene)]
mart <- useMart(biomart="ensembl", dataset="hsapiens_gene_ensembl")
regions <- GRanges(TCGA.GBM.masked$Chromosome, IRanges(TCGA.GBM.masked$Start, TCGA.GBM.masked$End))
results <- getBM(attributes = c("hgnc_symbol", "ensembl_gene_id" , "ensembl_gene_id_version", "chromosome_name",
"start_position","end_position"),
filters = c("chromosomal_region"),
values=regions,
mart=mart)
colnames(results) <- c("hgnc_symbol", "ensembl_gene_id" , "ensembl_gene_id_version", "Chromosome",
"Start","End")
setDT(results)
results.interest <- results[ensembl_gene_id %in% tpm_unstrand.exp$Gene, .(ensembl_gene_id, Chromosome, Start, End)]
results.gr <- GRanges(results.interest$Chromosome, IRanges(results.interest$Start, results.interest$End))
TCGA.GBM.masked.gr <- GRanges(TCGA.GBM.masked$Chromosome, IRanges(TCGA.GBM.masked$Start, TCGA.GBM.masked$End))
olaps <- findOverlaps(TCGA.GBM.masked.gr, results.gr)
query.dt <- TCGA.GBM.masked[queryHits(olaps),][, .(Chromosome, Start, End, Sample, Segment_Mean)]
colnames(query.dt) <- paste0("TRUE_", c("Chromosome", "Start", "End", "Sample", "Segment_Mean"))
subject.dt <- results.interest[subjectHits(olaps),]
masked.dt <- cbind(query.dt, subject.dt)
# pca.dt <- merge(tpm_unstrand.exp , masked.dt, on = "Gene")