I have done with this analysis:
#Creation of object(folder) exdir exdir <- path.expand("~/GSE162562_RAW") dir.create(exdir, showWarnings = FALSE) URL <- "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE162nnn/GSE162562/suppl/GSE162562_RAW.tar" FILE <- file.path(tempdir(), basename(URL)) #Downlaod the Raw Data from GEO utils::download.file(URL, FILE, mode = "wb") #unzip the files and storing them in GSE162562_RAW folder which we created already utils::untar(FILE, exdir = exdir) #Check your GSE162562_RAW folder after this , it must contains 108 samples in compressed format unlink(FILE, recursive = TRUE, force = TRUE) #listing of samples listed_files <- list.files(exdir, pattern=".gz", full.names=TRUE) #/ load files into R: loaded <- lapply(listed_files, function(x) read.delim(x, header=FALSE, row.names = "V1")) #/ bind everything into a single count matrix and assign colnames: raw_counts <- do.call(cbind, loaded) colnames(raw_counts) <- gsub(".txt.*", "", basename(listed_files)) #/ there is some nonsense in these files, probably due to the tool that was used (HTSeq?), #/ such as rows with names "__no_feature", lets remove that: raw_counts <- raw_counts[!grepl("^__", rownames(raw_counts)),] # / View raw_counts after filtering raw_counts #There are total 26364 genes after filtering #Store Raw counts in CSV File #I am sacing them in my Desktop.You can save according to your choice write.csv(raw_counts,"C:\\Users\\USER\\Desktop\\countsdata.csv") #load library library(DESeq2) #/ make a DESeq2 object. We can parse the group membership (Mild etc) from the colnames, #/ which are based on the original filenames: dds <- DESeqDataSetFromMatrix( countData=raw_counts, colData=data.frame(group=factor(unlist(lapply(strsplit(colnames(raw_counts), split="_"), function(x) x)))), design=~group) #View Deseq2 object dds #/ some Quality Control via PCA using the in-built PCA function from DESeq2: vsd <- vst(dds, blind=TRUE) #/ plot the PCA: plotPCA(vsd, "group") #/ extract the data: pcadata <- plotPCA(vsd, "group", returnData=TRUE) #view pca data pcadata #/ there is a very obvious batch effect, that we can correct, simply everything that is greater or less than zero in the first principal component, very obvious just by looking at the plot: dds$batch <- factor(ifelse(pcadata$PC1 > 0, "batch1", "batch2")) #/ lets use limma::removeBatchEffect to see whether it can be removed: library(limma) vsd2 <- vsd assay(vsd2) <- removeBatchEffect(assay(vsd), batch=dds$batch) #/ plot PCA again, looks much better. this means we modify our design to include batch and group, plotPCA(vsd2, "group") #include batch and group both in design design(dds) <- ~batch + group #Running the differential expression pipeline dds <- DESeq(dds) #Building the results table res <- results(dds) res #Saving Results in CSV file write.csv(res , "dds.csv") mcols(res, use.names = TRUE) #We can also summarize the results with the following line of code, which reports some additional information: summary(res) #Total 1236 DEGs have been found when we se p_value less than 0.1 table(res$padj < 0.01) res.05 <- results(dds, alpha = 0.05) summary(res.05) table(res.05$padj < 0.05) sum(res$pvalue < 0.05, na.rm=TRUE) sum(!is.na(res$pvalue<0.05)) sum(res$padj < 0.1, na.rm=TRUE) sum(res$padj < 0.05, na.rm=TRUE) resSig <- subset(res, padj < 0.1) head(resSig[ order(resSig$log2FoldChange),]) head(resSig[ order(resSig$log2FoldChange, decreasing = TRUE), ]) #Save DEGS at padj < 0.1 write.csv(resSig,"DEGs@0.1.csv")
Now I want to do gene set enrichment analysis by using the topGO package and also want to make a heatmap and Venn diagram.
How can I do this?