Hello Biostars I have been using the following script for differential expression of affymetrix microarray data for several datasets, however I keep getting very high adjusted p values so I believe I am missing out a step. Apologies in advance for the very long post but I think it's important to show all my code.
library(GEOquery)
library(limma)
library(oligo)
library(dplyr)
library(matrixStats)
library(biomaRt)
########## file download and normalisation####################################
setwd("~/Documents/Gina")
getGEOSuppFiles("GSE112841")
list.files("GSE112841")
untar("GSE112841/GSE112841_RAW.tar", exdir = "GSE112841/CEL")
celfiles <- list.files("GSE112841/CEL", full = TRUE)
rawData <- read.celfiles(celfiles)
normData <- rma(rawData)
###### filtering for poorly expressed probes##################################
data_medians <- rowMedians(exprs(normData))
man_threshold <- 4
no_samples <- table(man_threshold)
samples_cutoff <- min(no_samples)
idx_man_threshold <- apply(Biobase::exprs(normData), 1,
function(x){
sum(x > man_threshold) >= samples_cutoff})
table(idx_man_threshold)
filtered_normData <- subset(normData, idx_man_threshold)
dim(filtered_normData) # 24154 genes
dim(normData) # 53617 genes
###### gene annotation#######################################################
mart <- useMart("ENSEMBL_MART_ENSEMBL")
mart <- useDataset("hsapiens_gene_ensembl", mart)
Glist <- getBM(attributes = c("affy_hugene_2_0_st_v1",
"external_gene_name",
"gene_biotype"),
filters = "affy_hugene_2_0_st_v1",
values =rownames(exprs(filtered_normData)),
mart = mart)
Biomart <- Glist[which(Glist$gene_biotype=="protein_coding"),]
DF <- as.data.frame(exprs(filtered_normData))
DF$Probe_Name <- rownames(DF)
names(DF) <- c("shNDRG1_1", "shNDRG1_2", "shNDRG1_3",
"Control_1", "Control_2", "Control_3",
"Probe_Name")
Biomart <- Biomart[! duplicated(Biomart$external_gene_name),]
Biomart <- Biomart[! duplicated(Biomart$affy_hugene_2_0_st_v1),]
DF <- DF[! duplicated(DF$Probe_Name),]
X <- merge(DF, Biomart, by.x = "Probe_Name", by.y = "affy_hugene_2_0_st_v1")
rownames(X) <- X$external_gene_name
X$Probe_Name <- X$Probe_Name <- X$gene_biotype <- X$external_gene_name <- NULL
########### differential expression##########################################
Z <- rep(c("shNDRG1", "Control"), each = 3)
f = factor(Z,levels=c("shNDRG1","Control"))
design = model.matrix(~ 0 + f)
colnames(design) = c("shNDRG1","Control")
data.fit = lmFit(X, design)
contrast.matrix = makeContrasts(shNDRG1-Control,levels=design)
data.fit.con = contrasts.fit(data.fit,contrast.matrix)
data.fit.eb = eBayes(data.fit.con)
tab = topTable(data.fit.eb, number=Inf,adjust.method="fdr")
head(tab)
logFC AveExpr t P.Value adj.P.Val B
VLDLR 1.1195324 7.216597 8.877460 4.023989e-05 0.4014946 -2.950292
CCL28 1.4088015 4.193678 8.015200 7.887233e-05 0.4014946 -2.995123
SERHL2 -1.0807675 7.651956 -7.503533 1.210243e-04 0.4014946 -3.027914
CASP14 -1.2332972 4.224030 -7.443782 1.274233e-04 0.4014946 -3.032104
EPHB6 -1.1213133 5.036059 -7.232237 1.533330e-04 0.4014946 -3.047611
SRARP -1.3676657 5.549409 -7.226247 1.541483e-04 0.4014946 -3.048066
NDRG1 -1.1750064 7.820946 -7.224252 1.544210e-04 0.4014946 -3.048217
Any help with sorting out the adjusted p values will be most appreciated. Thanks.