Hey everyone, I've been trying to do a permutation test on my RNAseq data using differential expression analysis, as conducted by DESeq2. My goal is to check whether the real groups present the biggest differences between them, by looking at the number of differentially expressed RNA transcripts and mean log fold change values in a permutation test. I'm using HTSeq stats files, so initially, I did the label mixing inside the sampleTable object and rerun the creation of the ddsHTSeq object and the test. This method worked well but took a very long time to run, so instead, I tried to create the ddsHTSeq once and mix the labels inside of it and then running the differential expression test multiple times by asking for a new results object after each mixing of the labels. I've found 2 places that can work for mix the labels in that way, and confirmed that both changed exactly the same labels in both places: (1) ddsHTSeq$Diagnosis (2) colData(ddsHTSeq)$Diagnosis
The problem is, that when I mix the labels that way (either 1 or 2) I get the exact same result for all the permutations I do in one (DPDvsNDPD) of my 3 contrasts (comparing my 3 groups to each other), and in the other 2 contrasts (DPDvsNDC, NDPDvsNDC) there are more overlaps in the results than expected as seen in the first method I used (of mixing the labels before constructing the ddsHTSeq object). Does anyone know what can be the underlying problem? the only difference between the codes is where the label mixing is taking place. Perhaps I'm not mixing the labels in the right place? I've added the two codes I used below.
Thank you for your help! Shani
Code 1 - mixing the labels before constructing the ddsHTSeq object
library("DESeq2")
library("ggplot2")
library("base")
colData <- read.csv(".../malePdataBlood250420_plusFactors2.csv",header = TRUE, sep = ',') #server
directory <- "..../HTSeq_ensembl100_Male31" #server
sampleFiles <- grep('.HTseq.stats',list.files(directory),value=TRUE)
sampleTable <- data.frame(sampleName = colData$sample.name, fileName = sampleFiles, Diagnosis = as.factor(colData$diagnosis),
Age = as.factor(colData$age), Braak = as.factor(colData$braak), BraakLB = as.factor(colData$braaklb),
Amyloid = as.factor(colData$amyloid), ApoE = as.factor(colData$apoe), PMD = as.factor(colData$pmd.blood),
RIN = as.factor(colData$RIN_factor), Duration = as.factor(colData$pd.duration..y.), Sepsis = as.factor(colData$sepsis),
Batch = as.factor(colData$batch),HeartFaliure = as.factor(colData$heart.failure), Euthanasia = as.factor(colData$Euthanasia))
# #create vector with all desired contrasts
contrastList <- c("DPDvsNDC","NDPDvsNDC","DPDvsNDPD")
###
#here add permutation start!
##create permutation table
permNumber <-c(1:51) #first will allways be the original, so add 1 to the number of permutations required
permDataset <- data.frame(permNumber = permNumber,
DPDvsNDC_DE = rep(0,times = length(permNumber)),DPDvsNDC_DE_down = rep(0,times = length(permNumber)),DPDvsNDC_ave_logFC_down = rep(0,times = length(permNumber)),
DPDvsNDC_DE_up= rep(0,times = length(permNumber)),DPDvsNDC_ave_logFC_up = rep(0,times = length(permNumber)),
NDPDvsNDC_DE = rep(0,times = length(permNumber)),NDPDvsNDC_DE_down = rep(0,times = length(permNumber)),NDPDvsNDC_ave_logFC_down = rep(0,times = length(permNumber)),
NDPDvsNDC_DE_up = rep(0,times = length(permNumber)),NDPDvsNDC_ave_logFC_up = rep(0,times = length(permNumber)),
DPDvsNDPD_DE = rep(0,times = length(permNumber)), DPDvsNDPD_DE_down = rep(0,times = length(permNumber)),DPDvsNDPD_ave_logFC_down = rep(0,times = length(permNumber)),
DPDvsNDPD_DE_up = rep(0,times = length(permNumber)),DPDvsNDPD_ave_logFC_up = rep(0,times = length(permNumber)))
Labels_matrix <- NULL
#setwd(".../281220_2")
for (i in permNumber){
if( i == 1) {
# Balance design for clot and batch effects
ddsHTSeq <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable, directory = directory,
design = ~ Sepsis + Batch + RIN + Diagnosis)
ddsHTSeq
ddsHTSeq$Diagnosis <- relevel(ddsHTSeq$Diagnosis, ref = "NDC")
#preforming DE analysis
ddsHTSeq <- DESeq(ddsHTSeq)
ddsHTSeq <- ddsHTSeq[which(mcols(ddsHTSeq)$betaConv),]
Labels_matrix <- cbind(Labels_matrix,sampleTable$Diagnosis)
colnames(Labels_matrix)[i] <- "origin"
print("original")
for (t in contrastList){
resName <- paste('res', t, sep='_')
##create res file
assign(resName, results(ddsHTSeq, contrast=c("Diagnosis",strsplit(t, "vs")[[1]][1],strsplit(t, "vs")[[1]][2]), alpha=0.05))
summary(get(resName))
permDataset[i,paste0(t,"_DE")] <- sum(get(resName)$padj < 0.05, na.rm=TRUE) #How many genes were less than 0.05
permDataset[i,paste0(t,"_DE_down")] <- sum(get(resName)$padj < 0.05 & get(resName)$log2FoldChange < 0, na.rm=TRUE) #How many genes were down-regulated
permDataset[i,paste0(t,"_DE_up")] <- permDataset[i,paste0(t,"_DE")] - permDataset[i,paste0(t,"_DE_down")] #How many genes were up-regulated
permDataset[i,paste0(t,"_ave_logFC_down")] <- mean(get(resName)$log2FoldChange[which(get(resName)$padj < 0.05 & get(resName)$log2FoldChange < 0)])
permDataset[i,paste0(t,"_ave_logFC_up")] <- mean(get(resName)$log2FoldChange[which(get(resName)$padj < 0.05 & get(resName)$log2FoldChange > 0)])
}
} else {
print(paste("permutation number:", i))
# mix labels
## standard approach: scramble response value
Labels <- sampleTable$Diagnosis
perm <- sample(length(Labels))
mixLabels_perm <- transform(Labels[perm])
#ddsHTSeq$Diagnosis <- mixLabels_perm$X_data
sampleTable$Diagnosis <- mixLabels_perm$X_data
print(mixLabels_perm$X_data)
#save labels
Labels_matrix <- cbind(Labels_matrix,sampleTable$Diagnosis)
colnames(Labels_matrix)[i] <- paste0("perm_", i)
# Balance design for clot and batch effects
ddsHTSeq <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable, directory = directory,
design = ~ Sepsis + Batch + RIN + Diagnosis)
ddsHTSeq
ddsHTSeq$Diagnosis <- relevel(ddsHTSeq$Diagnosis, ref = "NDC")
#preforming DE analysis
ddsHTSeq <- DESeq(ddsHTSeq)
ddsHTSeq <- ddsHTSeq[which(mcols(ddsHTSeq)$betaConv),]
#run test
for (s in contrastList){
resName <- paste('res', s, sep='_')
##create res file
assign(resName, results(ddsHTSeq, contrast=c("Diagnosis",strsplit(s, "vs")[[1]][1],strsplit(s, "vs")[[1]][2]), alpha=0.05))
summary(get(resName))
permDataset[i,paste0(s,"_DE")] <- sum(get(resName)$padj < 0.05, na.rm=TRUE) #How many genes were less than 0.05
permDataset[i,paste0(s,"_DE_down")] <- sum(get(resName)$padj < 0.05 & get(resName)$log2FoldChange < 0, na.rm=TRUE) #How many genes were down-regulated
permDataset[i,paste0(s,"_DE_up")] <- permDataset[i,paste0(s,"_DE")] - permDataset[i,paste0(s,"_DE_down")] #How many genes were up-regulated
permDataset[i,paste0(s,"_ave_logFC_down")] <- mean(get(resName)$log2FoldChange[which(get(resName)$padj < 0.05 & get(resName)$log2FoldChange < 0)])
permDataset[i,paste0(s,"_ave_logFC_up")] <- mean(get(resName)$log2FoldChange[which(get(resName)$padj < 0.05 & get(resName)$log2FoldChange > 0)])
}
}
}
Code 2 - mixing the labels inside the ddsHTSeq object
library("DESeq2")
library("ggplot2")
library("base")
colData <- read.csv(".../malePdataBlood250420_plusFactors.csv",header = T, sep = ',') #server
directory <- "..../HTSeq_ensembl100_Male31" #server
sampleFiles <- grep('.HTseq.stats',list.files(directory),value=TRUE)
sampleTable <- data.frame(sampleName = colData$sample.name, fileName = sampleFiles, Diagnosis = as.factor(colData$diagnosis),
Age = as.factor(colData$age), Braak = as.factor(colData$braak), BraakLB = as.factor(colData$braaklb),
Amyloid = as.factor(colData$amyloid), ApoE = as.factor(colData$apoe), PMD = as.factor(colData$pmd.blood),
RIN = as.factor(colData$RIN_factor), Duration = as.factor(colData$pd.duration..y.), Sepsis = as.factor(colData$sepsis),
Batch = as.factor(colData$batch),HeartFaliure = as.factor(colData$heart.failure), Euthanasia = as.factor(colData$Euthanasia))
# #create vector with all desired contrasts
contrastList <- c("DPDvsNDC","NDPDvsNDC","DPDvsNDPD")
# Balance design for clot and batch effects
ddsHTSeq <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable, directory = directory,
design = ~ Sepsis + Batch + RIN + Diagnosis)
ddsHTSeq
ddsHTSeq$Diagnosis <- relevel(ddsHTSeq$Diagnosis, ref = "NDC")
#preforming DE analysis
ddsHTSeq <- DESeq(ddsHTSeq)
ddsHTSeq <- ddsHTSeq[which(mcols(ddsHTSeq)$betaConv),]
###
#here add permutation start!
##create permutation table
permNumber <-c(1:101) #first will allways be the original, so add 1 to the number of permutations required
permDataset <- data.frame(permNumber = permNumber,
DPDvsNDC_DE = rep(0,times = length(permNumber)),DPDvsNDC_DE_down = rep(0,times = length(permNumber)),DPDvsNDC_ave_logFC_down = rep(0,times = length(permNumber)),
DPDvsNDC_DE_up= rep(0,times = length(permNumber)),DPDvsNDC_ave_logFC_up = rep(0,times = length(permNumber)),
NDPDvsNDC_DE = rep(0,times = length(permNumber)),NDPDvsNDC_DE_down = rep(0,times = length(permNumber)),NDPDvsNDC_ave_logFC_down = rep(0,times = length(permNumber)),
NDPDvsNDC_DE_up = rep(0,times = length(permNumber)),NDPDvsNDC_ave_logFC_up = rep(0,times = length(permNumber)),
DPDvsNDPD_DE = rep(0,times = length(permNumber)), DPDvsNDPD_DE_down = rep(0,times = length(permNumber)),DPDvsNDPD_ave_logFC_down = rep(0,times = length(permNumber)),
DPDvsNDPD_DE_up = rep(0,times = length(permNumber)),DPDvsNDPD_ave_logFC_up = rep(0,times = length(permNumber)))
Labels_matrix_ddsHTSeq <- NULL
Labels_matrix_colData <- NULL
setwd(".../281220_2")
for (i in permNumber){
if( i == 1) {
Labels_matrix_colData <- as.data.frame(colData(ddsHTSeq)$Diagnosis)
colnames(Labels_matrix_colData)[i] <- "origin"
Labels_matrix_ddsHTSeq <- as.data.frame(ddsHTSeq$Diagnosis)
colnames(Labels_matrix_ddsHTSeq)[i] <- "origin"
print("original")
for (t in contrastList){
resName <- paste('res', t, sep='_')
##create res file
assign(resName, results(ddsHTSeq, contrast=c("Diagnosis",strsplit(t, "vs")[[1]][1],strsplit(t, "vs")[[1]][2]), alpha=0.05))
summary(get(resName))
permDataset[i,paste0(t,"_DE")] <- sum(get(resName)$padj < 0.05, na.rm=TRUE) #How many genes were less than 0.05
permDataset[i,paste0(t,"_DE_down")] <- sum(get(resName)$padj < 0.05 & get(resName)$log2FoldChange < 0, na.rm=TRUE) #How many genes were down-regulated
permDataset[i,paste0(t,"_DE_up")] <- permDataset[i,paste0(t,"_DE")] - permDataset[i,paste0(t,"_DE_down")] #How many genes were up-regulated
permDataset[i,paste0(t,"_ave_logFC_down")] <- mean(get(resName)$log2FoldChange[which(get(resName)$padj < 0.05 & get(resName)$log2FoldChange < 0)])
permDataset[i,paste0(t,"_ave_logFC_up")] <- mean(get(resName)$log2FoldChange[which(get(resName)$padj < 0.05 & get(resName)$log2FoldChange > 0)])
}
} else {
print(paste("permutation number:", i))
# mix labels
## standard approach: scramble response value
Labels_colData <- colData(ddsHTSeq)$Diagnosis
Labels_ddsHTSeq <- ddsHTSeq$Diagnosis
perm <- sample(length(Labels_colData))
mixLabels_perm <- transform(Labels_colData[perm])
#ddsHTSeq$Diagnosis <- mixLabels_perm$X_data
colData(ddsHTSeq)$Diagnosis <- mixLabels_perm$X_data
print(mixLabels_perm$X_data)
#save labels
Labels_matrix_colData <- cbind(Labels_matrix_colData,as.data.frame(colData(ddsHTSeq)$Diagnosis))
colnames(Labels_matrix_colData)[i] <- paste0("perm_", i)
Labels_matrix_ddsHTSeq <- cbind(Labels_matrix_ddsHTSeq,as.data.frame(ddsHTSeq$Diagnosis))
colnames(Labels_matrix_ddsHTSeq)[i] <- paste0("perm_", i)
#run test
for (s in contrastList){
resName <- paste('res', s, sep='_')
##create res file
assign(resName, results(ddsHTSeq, contrast=c("Diagnosis",strsplit(s, "vs")[[1]][1],strsplit(s, "vs")[[1]][2]), alpha=0.05))
summary(get(resName))
permDataset[i,paste0(s,"_DE")] <- sum(get(resName)$padj < 0.05, na.rm=TRUE) #How many genes were less than 0.05
permDataset[i,paste0(s,"_DE_down")] <- sum(get(resName)$padj < 0.05 & get(resName)$log2FoldChange < 0, na.rm=TRUE) #How many genes were down-regulated
permDataset[i,paste0(s,"_DE_up")] <- permDataset[i,paste0(s,"_DE")] - permDataset[i,paste0(s,"_DE_down")] #How many genes were up-regulated
permDataset[i,paste0(s,"_ave_logFC_down")] <- mean(get(resName)$log2FoldChange[which(get(resName)$padj < 0.05 & get(resName)$log2FoldChange < 0)])
permDataset[i,paste0(s,"_ave_logFC_up")] <- mean(get(resName)$log2FoldChange[which(get(resName)$padj < 0.05 & get(resName)$log2FoldChange > 0)])
}
}
}
identical(Labels_matrix_colData,Labels_matrix_ddsHTSeq) #TRUE