I am trying to analyse a few Affymetrix microarrays deposited on NCBI GEO to determine if they have any DE genes in common. I am a newbie and am roughly following the End to end workflow by Klaus & Reisenauer 2018. I am using the GEOQuery package instead of the ArrayExpress package to download the raw data.
My problem: Klaus and Reisenauer recommend filtering out lowly expressed genes and do so manually by inspecting a histogram of median intensities post-background correction/normalisation (rma function). I struggle with deciding on a threshold value, since the oligo::rma function of the raw intensities results in a large peak of very low median intensities (~ 2-3). This issue occurred with GSE23738, but also several other different arrays (e.g. GSE120616, GSE102850, GSE11238).
Thank you very much for your help!!
Script used for all above mentioned arrays, however linear model was only applied to arrays with replication.
library(GEOquery) library(oligo) library(Biobase) library(arrayQualityMetrics) library(mouse4302.db) library(limma) library(dplyr) library(tidyr) library(ggplot2) # GSE23738 - Rice et al 2011 (https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE23738) getGEOSuppFiles("GSE23738") list.files("GSE23738") untar("GSE23738/GSE23738_RAW.tar", exdir = "GSE23738/CEL") list.files("GSE23738/CEL") rice_raw <- read.celfiles(list.files("GSE23738/CEL", full = TRUE, pattern = "CEL.gz")) pData(rice_raw)$filename <- sampleNames(rice_raw) replicate <- factor(c(1,2,3,1,2,3,1,2,3,1,2, 1,2,3,1,2,3), levels = 1:3, labels = c("1", "2", "3")) time <- factor(c(1,1,1,2,2,2,3,3,3,4,4,5,5,5,6,6,6), levels = 1:6, labels = c("0 dpi", "1 dpi", "2 dpi", "3 dpi", "4 dpi", "5 dpi")) pData(rice_raw)$replicate <- replicate pData(rice_raw)$time <- time arrayQualityMetrics(expressionset = rice_raw, force = TRUE, do.logtransform = TRUE, intgroup = "time") rice_eset <- rma(rice_raw) arrayQualityMetrics(expressionset = rice_eset, force = TRUE, do.logtransform = FALSE, intgroup = "time") # 1 outlier # Histogram of median intensities to decide on intensity threshold rice_medians <- rowMedians(exprs(rice_eset)) median_histogram <- hist(rice_medians, 100, col = "cornsilk", freq = FALSE, main = "Histogram of median intensities", border = "antiquewhite4", xlab = "Median intensities") # Continue without filtering rice_annotations <- AnnotationDbi::select(mouse4302.db, keys = (featureNames(rice_eset)), columns = c("SYMBOL", "GENENAME", "ENTREZID"), keytype = "PROBEID") rice_annotations <- subset(rice_annotations, !is.na(SYMBOL)) multiple_annotation_probes <- filter(summarise(group_by(rice_annotations, PROBEID), no_of_matches = n_distinct(SYMBOL)), no_of_matches >1) probes_to_exclude <- (featureNames(rice_eset) %in% multiple_annotation_probes$PROBEID) rice_final <- subset(rice_eset, !probes_to_exclude) fData(rice_final)$PROBEID <- rownames(fData(rice_final)) fData(rice_final) <- left_join(fData(rice_final), rice_annotations) rownames(fData(rice_final)) <- fData(rice_final)$PROBEID rice_design <- model.matrix(~0 + time + replicate) colnames(rice_design)[1:6] <- c("dpi0", "dpi1", "dpi2", "dpi3", "dpi4", "dpi5") rownames(rice_design) <- replicate # Comparison 1 dpi vs 0 dpi contrast_matrix <- makeContrasts(dpi1-dpi0, levels = rice_design) rice_analysis <- eBayes(contrasts.fit(lmFit(rice_final, design = rice_design), contrast_matrix), trend = TRUE) plotSA(rice_analysis)