So I've been working on a script in RStudio to organize/analyze RNASeq data and export it in various formats (i.e., heatmap, PCA, Excel CSV).
In my code, however, my gene names aren't matching with my accession numbers -- I get the following error:
Error in as.vector(x) : no method for coercing this S4 class to a vector
when I try to run this line:
genebm <- merge.data.frame(dds, getBM(filters="external_gene_name", attributes="ensembl_gene_id"), by = "row.names")
Please help!
## RNA-seq analysis with DESeq2
## Adapted in part from Stephen Turner and Margaret Gruca
# Import & pre-process ----------------------------------------------------
library(DESeq2)
library(biomaRt)
library(pheatmap)
directory <- "C:\\Users\\Username\\Desktop\\Holster\\RNASeq\\H149ABulkCounts\\Edited\\Human\\HEK"
mart <- useDataset("hsapiens_gene_ensembl", useMart(host = "aug2017.archive.ensembl.org", biomart = "ENSEMBL_MART_ENSEMBL"))
fileNames <- list.files(directory, pattern="*.counts")
sampleNames <- sub("(e.counts)","",fileNames)
conditionNames <- c("FUS1","FUS1", "H149A", "H149A")
sampleTable <- data.frame(sampleName = sampleNames,
fileName = fileNames,
condition = conditionNames)
ddsHTSeq <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable,
directory = directory,
design = ~condition)
dds <- DESeq(ddsHTSeq)
res <- results(dds)
#QC -- Check sizeFactors, should be close to 1 (ideally) -- largely affected by depth (read counts)
sizeFactors(dds)
#dds <- dds[ rowSums(counts(dds)) > 50, ]
dim(dds)
FPvC_counts <- (counts(dds, normalized=TRUE))[,-c(7,8,9,10)]
FPvC_data <- merge(as.data.frame(res), as.data.frame(FPvC_counts), by='row.names', sort=FALSE)
genes <- FPvC_data$Row.names
new_names <- getBM(filters= "ensembl_gene_id", attributes=c("ensembl_gene_id","external_gene_name"), values=genes,mart=mart)
out <- merge(FPvC_data,new_names,by.x="Row.names",by.y="ensembl_gene_id")
out <- out[,c(c(ncol(out),1:(ncol(out)-1)))]
out <- out[,-2]
write.table(out, file= "NS2_FUS1.txt", sep='\t')
genebm <- merge.data.frame(dds, getBM(filters="external_gene_name", attributes="ensembl_gene_id"), by = "row.names")
a <- c(genebm[3])
counts_genebm <-counts[genebm$ensembl_gene_id,]
row.names(counts_genebm) <- genebm[,"V1"]
cal_z_score <- function(x){
(x - mean(x)) / sd(x)
}
dim(dds)
head(dds)
# Plot dispersions
png("qc-dispersions.png", 1000, 1000, pointsize=20)
plotDispEsts(dds, main="Dispersion plot")
dev.off()
# Regularized log transformation for clustering/heatmaps, etc
rld <- rlogTransformation(dds)
head(assay(rld))
hist(assay(rld))
# Colors for plots below
## Ugly:
## (mycols <- 1:length(unique(condition)))
## Use RColorBrewer, better
library(RColorBrewer)
(mycols <- brewer.pal(8, "Dark2")[1:length(unique(condition))])
# Sample distance heatmap
sampleDists <- as.matrix(dist(t(assay(rld))))
library(gplots)
png("qc-heatmap-samples.png", w=1000, h=1000, pointsize=20)
heatmap.2(as.matrix(sampleDists), key=F, trace="none",
col=colorpanel(100, "orange", "red"),
# ColSideColors=mycols[condition],
# RowSideColors=mycols[condition],
margin=c(10, 10), main="Sample Distance Matrix")
dev.off()
# Principal components analysis
## Could do with built-in DESeq2 function:
## DESeq2::plotPCA(rld, intgroup="condition")
rld_pca <- function (rld, intgroup = "condition", ntop = 500, colors=NULL, legendpos="bottomleft", main="PCA Biplot", textcx=1, ...) {
require(genefilter)
require(calibrate)
require(RColorBrewer)
rv = rowVars(assay(rld))
select = order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
pca = prcomp(t(assay(rld)[select, ]))
fac = factor(apply(as.data.frame(colData(rld)[, intgroup, drop = FALSE]), 1, paste, collapse = " : "))
if (is.null(colors)) {
if (nlevels(fac) >= 3) {
colors = brewer.pal(nlevels(fac), "Paired")
} else {
colors = c("black", "red")
}
}
pc1var <- round(summary(pca)$importance[2,1]*100, digits=1)
pc2var <- round(summary(pca)$importance[2,2]*100, digits=1)
pc1lab <- paste0("PC1 (",as.character(pc1var),"%)")
pc2lab <- paste0("PC1 (",as.character(pc2var),"%)")
plot(PC2~PC1, data=as.data.frame(pca$x), bg=colors[fac], pch=21, xlab=pc1lab, ylab=pc2lab, main=main, ...)
with(as.data.frame(pca$x), textxy(PC1, PC2, labs=rownames(as.data.frame(pca$x)), cex=textcx))
legend(legendpos, legend=levels(fac), col=colors, pch=20)
# rldyplot(PC2 ~ PC1, groups = fac, data = as.data.frame(pca$rld),
# pch = 16, cerld = 2, aspect = "iso", col = colours, main = draw.key(key = list(rect = list(col = colours),
# terldt = list(levels(fac)), rep = FALSE)))
}
png("qc-pca.png", 1400, 1000, pointsize=20)
rld_pca(rld, colors=mycols, intgroup="condition", xlim=c(-75, 35))
dev.off()
#condx <- "TNFaDex10"
#condy <- "Untreated10"
# Get differential expression results, specify which conditions to contrast (max=2)
res <- results(dds, contrast = c("condition", condx, condy))
table(res$padj<0.05)
## Order by adjusted p-value
res <- res[order(res$padj), ]
## Merge with normalized count data
resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE)
names(resdata)[1] <- "Gene"
head(resdata)
## Write results
resdata <- resdata[complete.cases(resdata), ]
write.csv(resdata, file="diffexprresultsHEK.csv")
dev.off()
Get your datasets to the easiest forms to merge. Ensure they're both
data.frame
objects and they have a column each with matching values and compatible type. That should take care of the error.Hmm, how do I do that?
You wish to compare
res
to thedata.frame
output ofgetBM()
. Save thegetBM()
call to an object, saygetbm_res
. Check to see ifclass(getbm_res)
gives you data.frame as one of the classes. Similarly, see ifclass(res)
also hasdata.frame
. You can even useis.data.frame(getbm_res)
andis.data.frame(res)
to check this. Once you're sure they're bothdata.frame
s, you can merge them.Hm, I don't think I'm doing my merge right. I tried this:
But the dimensions of my resulting matrix (resdata) are vastly different (before merge it's 20381 x 11, and after it's 0 x 12).
That tells me that while you have matching column names between the two data frames, the matching columns don't have any values that can be matched. Can you show me the output to:
Thank you
You have no matching columns.
I'm trying to have the HGNC symbols replace the "genes" Ensembl ID's, is merge not the correct function?
Hello equinox145111!
We believe that this post no longer fits on this site.
You jumped to creating a new post with more specifics: Problem with merge data while trying to convert gene names
For this reason we have closed this question. This allows us to keep the site focused on the topics that the community can help with.
If you disagree please tell us why in a reply below, we'll be happy to talk about it.
Cheers!
Hello equinox145111!
We believe that this post does not fit the main topic of this site.
See for explanation: C: Problem with merge data while trying to convert gene names
For this reason we have closed your question. This allows us to keep the site focused on the topics that the community can help with.
If you disagree please tell us why in a reply below, we'll be happy to talk about it.
Cheers!