Problem with mixing labels for permutation test inside dds object in DESeq2
0
0
Entering edit mode
3.3 years ago

Hey everyone, I've been trying to do a permutation test on my RNAseq data using differential expression analysis, as conducted by DESeq2. My goal is to check whether the real groups present the biggest differences between them, by looking at the number of differentially expressed RNA transcripts and mean log fold change values in a permutation test. I'm using HTSeq stats files, so initially, I did the label mixing inside the sampleTable object and rerun the creation of the ddsHTSeq object and the test. This method worked well but took a very long time to run, so instead, I tried to create the ddsHTSeq once and mix the labels inside of it and then running the differential expression test multiple times by asking for a new results object after each mixing of the labels. I've found 2 places that can work for mix the labels in that way, and confirmed that both changed exactly the same labels in both places: (1) ddsHTSeq$Diagnosis (2) colData(ddsHTSeq)$Diagnosis

The problem is, that when I mix the labels that way (either 1 or 2) I get the exact same result for all the permutations I do in one (DPDvsNDPD) of my 3 contrasts (comparing my 3 groups to each other), and in the other 2 contrasts (DPDvsNDC, NDPDvsNDC) there are more overlaps in the results than expected as seen in the first method I used (of mixing the labels before constructing the ddsHTSeq object). Does anyone know what can be the underlying problem? the only difference between the codes is where the label mixing is taking place. Perhaps I'm not mixing the labels in the right place? I've added the two codes I used below.

Thank you for your help! Shani

Code 1 - mixing the labels before constructing the ddsHTSeq object

library("DESeq2")
library("ggplot2")
library("base")

colData <- read.csv(".../malePdataBlood250420_plusFactors2.csv",header = TRUE, sep = ',') #server

directory <- "..../HTSeq_ensembl100_Male31" #server
sampleFiles <- grep('.HTseq.stats',list.files(directory),value=TRUE)
sampleTable <- data.frame(sampleName = colData$sample.name, fileName = sampleFiles, Diagnosis = as.factor(colData$diagnosis), 
                          Age = as.factor(colData$age), Braak = as.factor(colData$braak), BraakLB = as.factor(colData$braaklb),
                          Amyloid = as.factor(colData$amyloid), ApoE = as.factor(colData$apoe), PMD = as.factor(colData$pmd.blood),
                          RIN = as.factor(colData$RIN_factor), Duration = as.factor(colData$pd.duration..y.), Sepsis = as.factor(colData$sepsis),
                          Batch = as.factor(colData$batch),HeartFaliure = as.factor(colData$heart.failure), Euthanasia = as.factor(colData$Euthanasia))

# #create vector with all desired contrasts
contrastList <- c("DPDvsNDC","NDPDvsNDC","DPDvsNDPD")


###
#here add permutation start!
##create permutation table
permNumber <-c(1:51) #first will allways be the original, so add 1 to the number of permutations required
permDataset <- data.frame(permNumber = permNumber,
                              DPDvsNDC_DE = rep(0,times = length(permNumber)),DPDvsNDC_DE_down = rep(0,times = length(permNumber)),DPDvsNDC_ave_logFC_down = rep(0,times = length(permNumber)),
                              DPDvsNDC_DE_up= rep(0,times = length(permNumber)),DPDvsNDC_ave_logFC_up = rep(0,times = length(permNumber)),
                              NDPDvsNDC_DE = rep(0,times = length(permNumber)),NDPDvsNDC_DE_down = rep(0,times = length(permNumber)),NDPDvsNDC_ave_logFC_down = rep(0,times = length(permNumber)),
                              NDPDvsNDC_DE_up = rep(0,times = length(permNumber)),NDPDvsNDC_ave_logFC_up = rep(0,times = length(permNumber)),
                              DPDvsNDPD_DE = rep(0,times = length(permNumber)), DPDvsNDPD_DE_down = rep(0,times = length(permNumber)),DPDvsNDPD_ave_logFC_down = rep(0,times = length(permNumber)),
                              DPDvsNDPD_DE_up = rep(0,times = length(permNumber)),DPDvsNDPD_ave_logFC_up = rep(0,times = length(permNumber)))

Labels_matrix <- NULL

#setwd(".../281220_2")

for (i in permNumber){
  if( i == 1) {
    # Balance design for clot and batch effects
    ddsHTSeq <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable, directory = directory,
                                           design = ~ Sepsis + Batch + RIN + Diagnosis) 
    ddsHTSeq

    ddsHTSeq$Diagnosis <- relevel(ddsHTSeq$Diagnosis, ref = "NDC")

    #preforming DE analysis
    ddsHTSeq <- DESeq(ddsHTSeq)
    ddsHTSeq <- ddsHTSeq[which(mcols(ddsHTSeq)$betaConv),]

    Labels_matrix <- cbind(Labels_matrix,sampleTable$Diagnosis)
    colnames(Labels_matrix)[i] <- "origin"

    print("original")

    for (t in contrastList){
      resName <- paste('res', t, sep='_')
      ##create res file
      assign(resName, results(ddsHTSeq, contrast=c("Diagnosis",strsplit(t, "vs")[[1]][1],strsplit(t, "vs")[[1]][2]), alpha=0.05))
      summary(get(resName))

      permDataset[i,paste0(t,"_DE")] <- sum(get(resName)$padj < 0.05, na.rm=TRUE) #How many genes were less than 0.05 
      permDataset[i,paste0(t,"_DE_down")] <- sum(get(resName)$padj < 0.05 & get(resName)$log2FoldChange < 0, na.rm=TRUE) #How many genes were down-regulated 
      permDataset[i,paste0(t,"_DE_up")] <- permDataset[i,paste0(t,"_DE")] - permDataset[i,paste0(t,"_DE_down")] #How many genes were up-regulated

      permDataset[i,paste0(t,"_ave_logFC_down")] <- mean(get(resName)$log2FoldChange[which(get(resName)$padj < 0.05 & get(resName)$log2FoldChange < 0)])
      permDataset[i,paste0(t,"_ave_logFC_up")] <- mean(get(resName)$log2FoldChange[which(get(resName)$padj < 0.05 & get(resName)$log2FoldChange > 0)])
    }

  } else {
    print(paste("permutation number:", i))

    # mix labels
    ## standard approach: scramble response value
    Labels <- sampleTable$Diagnosis 

    perm <- sample(length(Labels))
    mixLabels_perm <- transform(Labels[perm])
    #ddsHTSeq$Diagnosis <- mixLabels_perm$X_data
    sampleTable$Diagnosis <- mixLabels_perm$X_data
    print(mixLabels_perm$X_data)

    #save labels
    Labels_matrix <- cbind(Labels_matrix,sampleTable$Diagnosis)
    colnames(Labels_matrix)[i] <- paste0("perm_", i)

    # Balance design for clot and batch effects
    ddsHTSeq <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable, directory = directory,
                                           design = ~ Sepsis + Batch + RIN + Diagnosis) 
    ddsHTSeq

    ddsHTSeq$Diagnosis <- relevel(ddsHTSeq$Diagnosis, ref = "NDC")

    #preforming DE analysis
    ddsHTSeq <- DESeq(ddsHTSeq)
    ddsHTSeq <- ddsHTSeq[which(mcols(ddsHTSeq)$betaConv),] 

    #run test
    for (s in contrastList){
      resName <- paste('res', s, sep='_')
      ##create res file
      assign(resName, results(ddsHTSeq, contrast=c("Diagnosis",strsplit(s, "vs")[[1]][1],strsplit(s, "vs")[[1]][2]), alpha=0.05))
      summary(get(resName))

      permDataset[i,paste0(s,"_DE")] <- sum(get(resName)$padj < 0.05, na.rm=TRUE) #How many genes were less than 0.05 
      permDataset[i,paste0(s,"_DE_down")] <- sum(get(resName)$padj < 0.05 & get(resName)$log2FoldChange < 0, na.rm=TRUE) #How many genes were down-regulated 
      permDataset[i,paste0(s,"_DE_up")] <- permDataset[i,paste0(s,"_DE")] - permDataset[i,paste0(s,"_DE_down")] #How many genes were up-regulated

      permDataset[i,paste0(s,"_ave_logFC_down")] <- mean(get(resName)$log2FoldChange[which(get(resName)$padj < 0.05 & get(resName)$log2FoldChange < 0)])
      permDataset[i,paste0(s,"_ave_logFC_up")] <- mean(get(resName)$log2FoldChange[which(get(resName)$padj < 0.05 & get(resName)$log2FoldChange > 0)])
    }

  }
}

Code 2 - mixing the labels inside the ddsHTSeq object

library("DESeq2")
library("ggplot2")
library("base")


colData <- read.csv(".../malePdataBlood250420_plusFactors.csv",header = T, sep = ',') #server

directory <- "..../HTSeq_ensembl100_Male31" #server
sampleFiles <- grep('.HTseq.stats',list.files(directory),value=TRUE)
sampleTable <- data.frame(sampleName = colData$sample.name, fileName = sampleFiles, Diagnosis = as.factor(colData$diagnosis), 
                          Age = as.factor(colData$age), Braak = as.factor(colData$braak), BraakLB = as.factor(colData$braaklb),
                          Amyloid = as.factor(colData$amyloid), ApoE = as.factor(colData$apoe), PMD = as.factor(colData$pmd.blood),
                          RIN = as.factor(colData$RIN_factor), Duration = as.factor(colData$pd.duration..y.), Sepsis = as.factor(colData$sepsis),
                          Batch = as.factor(colData$batch),HeartFaliure = as.factor(colData$heart.failure), Euthanasia = as.factor(colData$Euthanasia))

# #create vector with all desired contrasts
contrastList <- c("DPDvsNDC","NDPDvsNDC","DPDvsNDPD")

# Balance design for clot and batch effects
ddsHTSeq <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable, directory = directory,
                                       design = ~ Sepsis + Batch + RIN + Diagnosis) 
ddsHTSeq

ddsHTSeq$Diagnosis <- relevel(ddsHTSeq$Diagnosis, ref = "NDC")

#preforming DE analysis
ddsHTSeq <- DESeq(ddsHTSeq)
ddsHTSeq <- ddsHTSeq[which(mcols(ddsHTSeq)$betaConv),] 

###
#here add permutation start!
##create permutation table
permNumber <-c(1:101) #first will allways be the original, so add 1 to the number of permutations required
permDataset <- data.frame(permNumber = permNumber,
                              DPDvsNDC_DE = rep(0,times = length(permNumber)),DPDvsNDC_DE_down = rep(0,times = length(permNumber)),DPDvsNDC_ave_logFC_down = rep(0,times = length(permNumber)),
                              DPDvsNDC_DE_up= rep(0,times = length(permNumber)),DPDvsNDC_ave_logFC_up = rep(0,times = length(permNumber)),
                              NDPDvsNDC_DE = rep(0,times = length(permNumber)),NDPDvsNDC_DE_down = rep(0,times = length(permNumber)),NDPDvsNDC_ave_logFC_down = rep(0,times = length(permNumber)),
                              NDPDvsNDC_DE_up = rep(0,times = length(permNumber)),NDPDvsNDC_ave_logFC_up = rep(0,times = length(permNumber)),
                              DPDvsNDPD_DE = rep(0,times = length(permNumber)), DPDvsNDPD_DE_down = rep(0,times = length(permNumber)),DPDvsNDPD_ave_logFC_down = rep(0,times = length(permNumber)),
                              DPDvsNDPD_DE_up = rep(0,times = length(permNumber)),DPDvsNDPD_ave_logFC_up = rep(0,times = length(permNumber)))

Labels_matrix_ddsHTSeq <- NULL
Labels_matrix_colData <- NULL

setwd(".../281220_2")

for (i in permNumber){
  if( i == 1) {
    Labels_matrix_colData <- as.data.frame(colData(ddsHTSeq)$Diagnosis)
    colnames(Labels_matrix_colData)[i] <- "origin"
    Labels_matrix_ddsHTSeq <- as.data.frame(ddsHTSeq$Diagnosis)
    colnames(Labels_matrix_ddsHTSeq)[i] <- "origin"

    print("original")

    for (t in contrastList){
      resName <- paste('res', t, sep='_')
      ##create res file
      assign(resName, results(ddsHTSeq, contrast=c("Diagnosis",strsplit(t, "vs")[[1]][1],strsplit(t, "vs")[[1]][2]), alpha=0.05))
      summary(get(resName))

      permDataset[i,paste0(t,"_DE")] <- sum(get(resName)$padj < 0.05, na.rm=TRUE) #How many genes were less than 0.05 
      permDataset[i,paste0(t,"_DE_down")] <- sum(get(resName)$padj < 0.05 & get(resName)$log2FoldChange < 0, na.rm=TRUE) #How many genes were down-regulated 
      permDataset[i,paste0(t,"_DE_up")] <- permDataset[i,paste0(t,"_DE")] - permDataset[i,paste0(t,"_DE_down")] #How many genes were up-regulated

      permDataset[i,paste0(t,"_ave_logFC_down")] <- mean(get(resName)$log2FoldChange[which(get(resName)$padj < 0.05 & get(resName)$log2FoldChange < 0)])
      permDataset[i,paste0(t,"_ave_logFC_up")] <- mean(get(resName)$log2FoldChange[which(get(resName)$padj < 0.05 & get(resName)$log2FoldChange > 0)])
    }

  } else {
    print(paste("permutation number:", i))

    # mix labels
    ## standard approach: scramble response value
    Labels_colData <- colData(ddsHTSeq)$Diagnosis 
    Labels_ddsHTSeq <- ddsHTSeq$Diagnosis

    perm <- sample(length(Labels_colData))
    mixLabels_perm <- transform(Labels_colData[perm])
    #ddsHTSeq$Diagnosis <- mixLabels_perm$X_data
    colData(ddsHTSeq)$Diagnosis <- mixLabels_perm$X_data
    print(mixLabels_perm$X_data)

    #save labels
    Labels_matrix_colData <- cbind(Labels_matrix_colData,as.data.frame(colData(ddsHTSeq)$Diagnosis))
    colnames(Labels_matrix_colData)[i] <- paste0("perm_", i)
    Labels_matrix_ddsHTSeq <- cbind(Labels_matrix_ddsHTSeq,as.data.frame(ddsHTSeq$Diagnosis))
    colnames(Labels_matrix_ddsHTSeq)[i] <- paste0("perm_", i)

    #run test
    for (s in contrastList){
      resName <- paste('res', s, sep='_')
      ##create res file
      assign(resName, results(ddsHTSeq, contrast=c("Diagnosis",strsplit(s, "vs")[[1]][1],strsplit(s, "vs")[[1]][2]), alpha=0.05))
      summary(get(resName))

      permDataset[i,paste0(s,"_DE")] <- sum(get(resName)$padj < 0.05, na.rm=TRUE) #How many genes were less than 0.05 
      permDataset[i,paste0(s,"_DE_down")] <- sum(get(resName)$padj < 0.05 & get(resName)$log2FoldChange < 0, na.rm=TRUE) #How many genes were down-regulated 
      permDataset[i,paste0(s,"_DE_up")] <- permDataset[i,paste0(s,"_DE")] - permDataset[i,paste0(s,"_DE_down")] #How many genes were up-regulated

      permDataset[i,paste0(s,"_ave_logFC_down")] <- mean(get(resName)$log2FoldChange[which(get(resName)$padj < 0.05 & get(resName)$log2FoldChange < 0)])
      permDataset[i,paste0(s,"_ave_logFC_up")] <- mean(get(resName)$log2FoldChange[which(get(resName)$padj < 0.05 & get(resName)$log2FoldChange > 0)])
    }

  }
}

identical(Labels_matrix_colData,Labels_matrix_ddsHTSeq) #TRUE
RNA-Seq DESeq2 permutation • 720 views
ADD COMMENT

Login before adding your answer.

Traffic: 3219 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6