This may help you.
I have data in a data-frame RecurrentCNA
in pretty much the same format as you, with the following columns (it's recurrent CN data):
- Chr
- Aberration
- Start
- End
- q-value
I first create an annotation reference template of ENSEMBL genes with the following code (hg19/GRCh37):
require(biomaRt)
require(GenomicRanges)
mart <- useMart(biomart="ENSEMBL_MART_ENSEMBL", host="grch37.ensembl.org", path="/biomart/martservice", dataset="hsapiens_gene_ensembl")
genes <- getBM(attributes=c("hgnc_symbol", "chromosome_name", "start_position", "end_position"), mart=mart)
genes <- genes[genes[,1]!="" & genes[,2] %in% c(1:22,"X","Y"),]
xindex <- which(genes[,2]=="X")
yindex <- which(genes[,2]=="Y")
genes[xindex, 2] <- 23
genes[yindex, 2] <- 24
genes[,2] <- sapply(genes[,2],as.integer)
genes <- genes[order(genes[,3]),]
genes <- genes[order(genes[,2]),]
colnames(genes) <- c("GeneSymbol", "Chr", "Start", "End")
genes.GenomicRanges <- makeGRangesFromDataFrame(genes, keep.extra.columns=TRUE)
To annotate my dataframe RecurrentCNA
, I then use the following code:
RecurrentCNA.GenomicRanges <- makeGRangesFromDataFrame(RecurrentCNA, keep.extra.columns=TRUE)
hits <- findOverlaps(genes.GenomicRanges, RecurrentCNA.GenomicRanges, type="within")
RecurrentCNA.Ann <- cbind(RecurrentCNA[subjectHits(hits),], genes[queryHits(hits),])
AberrantRegion <- paste0(RecurrentCNA.Ann[,1], ":", RecurrentCNA.Ann[,3], "-", RecurrentCNA.Ann[,4])
GeneRegion <- paste0(RecurrentCNA.Ann[,7], ":", RecurrentCNA.Ann[,8], "-", RecurrentCNA.Ann[,9])
AmpDelGenes <- cbind(RecurrentCNA.Ann[,c(6,2,5)], AberrantRegion, GeneRegion)
AmpDelGenes[AmpDelGenes[,2]==0, 2] <- "Del"
AmpDelGenes[AmpDelGenes[,2]==1, 2] <- "Amp"
rownames(AmpDelGenes) <- NULL
This produces data like this:
GeneSymbol Aberration q-value AberrantRegion GeneRegion
ARHGEF16 Del 0.00100611929685957 1:3218610-3421586 1:3370990-3397677
TPRG1L Del 0.00100611929685957 1:3480149-5526584 1:3541566-3546691
WRAP73 Del 0.00100611929685957 1:3480149-5526584 1:3547331-3569325
TP73 Del 0.00100611929685957 1:3480149-5526584 1:3569084-3652765
TP73-AS1 Del 0.00100611929685957 1:3480149-5526584 1:3652548-3663900
CCDC27 Del 0.00100611929685957 1:3480149-5526584 1:3668962-3688209
SMIM1 Del 0.00100611929685957 1:3480149-5526584 1:3689352-3692546
LRRC47 Del 0.00100611929685957 1:3480149-5526584 1:3696784-3713068
RN7SL574P Del 0.00100611929685957 1:3480149-5526584 1:3699379-3699673
CEP104 Del 0.00100611929685957 1:3480149-5526584 1:3728645-3773778
DFFB Del 0.00100611929685957 1:3480149-5526584 1:3773845-3801993
C1orf174 Del 0.00100611929685957 1:3480149-5526584 1:3805689-3816857
LINC01134 Del 0.00100611929685957 1:3480149-5526584 1:3816936-3833789
AJAP1 Del 0.00100611929685957 1:3480149-5526584 1:4714792-4852594
MIR4417 Del 0.00100611929685957 1:5531247-5625929 1:5624131-5624203
MIR4689 Del 0.00100611929685957 1:5631545-6427292 1:5922732-5922801
As you can see, it outputs a single row for each gene, and our aberrant regions are duplicated. To tidy this up, you can use:
UniqueRegions <- unique(as.character(AmpDelGenes$AberrantRegion))
dfNew <- data.frame(row.names=UniqueRegions)
for (i in 1:length(UniqueRegions))
{
GeneVector <- paste(as.character(df[df$AberrantRegion==UniqueRegions[i],1]), collapse=",")
dfNew <- rbind(
dfNew,
data.frame(
unique(AmpDelGenes[AmpDelGenes$AberrantRegion==UniqueRegions[i],"Aberration"]),
unique(AmpDelGenes[AmpDelGenes$AberrantRegion==UniqueRegions[i],"q.value"]),
unique(AmpDelGenes[AmpDelGenes$AberrantRegion==UniqueRegions[i],"AberrantRegion"]),
GeneVector))
}
colnames(dfNew) <- c("Aberration","q.value","AberrantRegion","Genes")
head(dfNew)
Aberration q.value AberrantRegion Genes
Del 0.001006119 1:3218610-3421586 ARHGEF16
Del 0.001006119 1:3480149-5526584 TPRG1L,WRAP73,TP73,TP73-AS1,CCDC27,...,AJAP1
Del 0.001006119 1:5531247-5625929 MIR4417
Del 0.001006119 1:5631545-6427292 MIR4689,NPHP4,KCNAB2,...,HES3,GPR153
Del 0.032633120 1:6429625-6709814 HES2,ESPN,MIR4252,...,KLHL21,PHF13,THAP3
Del 0.001006119 1:6710352-7352328 RNU1-8P
I think that you can re-use this code to get what you're looking for.
Thank you very much for this extensive explanation!
Absolutely no problem - I hope that it helps. I had all of the code except for the final part already.
Best of luck with it.