Question

Is this edgeR code fine?

0

Entering edit mode

10.0 years ago

biotech ▴ 570

Hi guys,

I would you to revise my edgeR code since it's possible I missed something important because being quite new on this.

Thanks, Bernardo

############################################################
#htseq-count stats
############################################################
# rRNA and tRNA will be discarded in counts file because the arbitrary mapped reads to these regions
# NOTE: minimun alignment quality is set to 10
# '-t CDS -i ID' will exclude rRNA and tRNA. Also 'Parent' will give the correct locus tag name to each 'feature' in count table.
python -m HTSeq.scripts.count -m intersection-nonempty -a 10 -t CDS -i ID 14.sam HS372.gff > 14.counts
python -m HTSeq.scripts.count -m intersection-nonempty -a 10 -t CDS -i ID 15.sam HS372.gff > 15.counts
python -m HTSeq.scripts.count -m intersection-nonempty -a 10 -t CDS -i ID 16.sam HS372.gff > 16.counts

#Last lines from .counts files
#14.counts
__no_feature    271363
__ambiguous 6
__too_low_aQual 137308
__not_aligned   133836
__alignment_not_unique  0

#15.counts
__no_feature    346247
__ambiguous 3
__too_low_aQual 148014
__not_aligned   135920
__alignment_not_unique  0

#16.counts

__no_feature    1067474
__ambiguous 0
__too_low_aQual 136484
__not_aligned   110488
__alignment_not_unique  0


############################################################
#edgeR
############################################################

#R code
library(edgeR)
library(WriteXLS)
dir () # The tag counts for the two individual libraries are stored in two separate plain text files. In each file, the tag IDs and counts for each tag are provided in a table
targets <- read.delim("targets.txt", stringsAsFactors = FALSE)
targets

#      files      group   description
#1 14.counts    biofilm    biofilm F9
#2 15.counts planktonic planktonic F9
#3 16.counts stationary stationary F9

RG <- readDGE(targets)
colnames(RG) <- c("biofilm","planktonic","stationary")
RG
dim(RG)

#filter low expressed transcripts
keep <- rowSums(cpm(RG)>1) > 1 #we keep genes that achieve at least one count per million (cpm) in at least TWO samples
RG <- RG[keep,]
dim(RG)
RG$samples$lib.size <- colSums(RG$counts) # After filtering, it is a good idea to reset the library sizes:
RG$samples

#Normalization
RG <- calcNormFactors(RG) # Apply TMM normalization
RG$samples # se manual
RG

############################################################
#Bio_vs_Plank
############################################################

bcv <- 0.2 # Assume a low BCV value of 0.2. The BCV (square root of the common dispersion) here is 20%, stilgthly higher than a typical size for a laboratory experiment with a cell line or a model organism.
et <- exactTest(RG, pair=c("planktonic","biofilm"),dispersion=bcv^2) # exactTest. dispersion = 0.04
et
class(et)
top <- topTags(et) # Top ten differentially expressed tags with their p-values
top
class(top)
cpm(RG)[rownames(top), ] # Check the individual cpm values for the top ten genes
summary(de <- decideTestsDGE(et, p=0.05, adjust="BH")) # The total number of differentiallly expressed genes at FDR< 0.05

# Here the entries for -1, 0 and 1 are for down-regulated, non-differentially expressed and up-regulated tags respectively.
#-1   54
#0  2153
#1    52

detags <- rownames(RG)[as.logical(de)] # detags contains the DE genes at 5% FDR
head (detags)
plotSmear(et, de.tags=detags, ylim=c(-7,7), xlim=c(0,15), cex.lab=1.4, cex.axis=1) # plot all genes and highlight DE genes at 5% FDR
abline(h=c(-1, 1), col="blue") # The blue lines indicate 2-fold changes.
title("Biofilm vs planktonic")
dev.copy2pdf(file = "Figure_1.pdf") #Save as .pdf##

# NOTE -> adjust 'n' depending on the available number of genes
Bio_vs_Plank <- topTags(et, n=2259, adjust.method="BH", sort.by="logFC")

# export excel file
x <- Bio_vs_Plank$table
WriteXLS("x","Bio_vs_Plank.xls", row.names = TRUE, col.names = TRUE)

# export genes for TopGO. DEG_up_pvalue_0.05
x.df <- Bio_vs_Plank$table
x.sub <- subset(x.df, logFC > 0 & PValue < 0.05)
x.sub
DEG_up_Pvalue_005 <- rownames(x.sub)
write.table(DEG_up_Pvalue_005, "Bio_vs_Plank_DEG_up_pvalue_005.txt", quote = FALSE, row.names = FALSE, col.names = FALSE)

# export genes for TopGO. DEG_down_pvalue_0.05
x.df <- Bio_vs_Plank$table
x.sub <- subset(x.df, logFC < 0 & PValue < 0.05)
x.sub
DEG_down_Pvalue_005 <- rownames(x.sub)
write.table(DEG_down_Pvalue_005, "Bio_vs_Plank_DEG_down_pvalue_005.txt", quote = FALSE, row.names = FALSE, col.names = FALSE)

# export reference gene set
reference_set <- rownames(RG$counts)
write.table(reference_set, "reference_set.txt", quote = FALSE, row.names = FALSE, col.names = FALSE)

edgeR rna-seq • 5.6k views

ADD COMMENT • link updated 2.6 years ago by Ram 43k • written 10.0 years ago by biotech ▴ 570

0

Entering edit mode

please edit your post and format it better, it is too long and properly readable at this time

ADD REPLY • link 10.0 years ago by Istvan Albert 100k

0

Entering edit mode

Sorry, I was counter editing, went raw again, I made a gist for it, and was trying to embed

gist fails with \gist anonymous/49dd230bd6532e3aec2a

ADD REPLY • link updated 4.3 years ago by Ram 43k • written 10.0 years ago by Sukhi Singh 11k

0

Entering edit mode

the way it works now is that you just paste the actual url to the gist and will embed it (see the main post)

ADD REPLY • link 10.0 years ago by Istvan Albert 100k

0

Entering edit mode

Ah, great :)

ADD REPLY • link 10.0 years ago by Sukhi Singh 11k

0

Entering edit mode

The code has been updated.

ADD REPLY • link 10.0 years ago by biotech ▴ 570