Question: Is this edgeR code fine?
0
gravatar for biotech
5.0 years ago by
biotech520
United States
biotech520 wrote:

Hi guys,

I would you to revise my edgeR code since it's possible I missed something important because being quite new on this.

Thanks, Bernardo

############################################################
#htseq-count stats
############################################################
# rRNA and tRNA will be discarded in counts file because the arbitrary mapped reads to these regions
# NOTE: minimun alignment quality is set to 10
# '-t CDS -i ID' will exclude rRNA and tRNA. Also 'Parent' will give the correct locus tag name to each 'feature' in count table.
python -m HTSeq.scripts.count -m intersection-nonempty -a 10 -t CDS -i ID 14.sam HS372.gff > 14.counts
python -m HTSeq.scripts.count -m intersection-nonempty -a 10 -t CDS -i ID 15.sam HS372.gff > 15.counts
python -m HTSeq.scripts.count -m intersection-nonempty -a 10 -t CDS -i ID 16.sam HS372.gff > 16.counts

#Last lines from .counts files
#14.counts
__no_feature    271363
__ambiguous 6
__too_low_aQual 137308
__not_aligned   133836
__alignment_not_unique  0

#15.counts
__no_feature    346247
__ambiguous 3
__too_low_aQual 148014
__not_aligned   135920
__alignment_not_unique  0

#16.counts

__no_feature    1067474
__ambiguous 0
__too_low_aQual 136484
__not_aligned   110488
__alignment_not_unique  0


############################################################
#edgeR
############################################################

#R code
library(edgeR)
library(WriteXLS)
dir () # The tag counts for the two individual libraries are stored in two separate plain text files. In each file, the tag IDs and counts for each tag are provided in a table
targets <- read.delim("targets.txt", stringsAsFactors = FALSE)
targets

#      files      group   description
#1 14.counts    biofilm    biofilm F9
#2 15.counts planktonic planktonic F9
#3 16.counts stationary stationary F9

RG <- readDGE(targets)
colnames(RG) <- c("biofilm","planktonic","stationary")
RG
dim(RG)

#filter low expressed transcripts
keep <- rowSums(cpm(RG)>1) > 1 #we keep genes that achieve at least one count per million (cpm) in at least TWO samples
RG <- RG[keep,]
dim(RG)
RG$samples$lib.size <- colSums(RG$counts) # After filtering, it is a good idea to reset the library sizes:
RG$samples

#Normalization
RG <- calcNormFactors(RG) # Apply TMM normalization
RG$samples # se manual
RG

############################################################
#Bio_vs_Plank
############################################################

bcv <- 0.2 # Assume a low BCV value of 0.2. The BCV (square root of the common dispersion) here is 20%, stilgthly higher than a typical size for a laboratory experiment with a cell line or a model organism.
et <- exactTest(RG, pair=c("planktonic","biofilm"),dispersion=bcv^2) # exactTest. dispersion = 0.04
et
class(et)
top <- topTags(et) # Top ten differentially expressed tags with their p-values
top
class(top)
cpm(RG)[rownames(top), ] # Check the individual cpm values for the top ten genes
summary(de <- decideTestsDGE(et, p=0.05, adjust="BH")) # The total number of differentiallly expressed genes at FDR< 0.05

# Here the entries for -1, 0 and 1 are for down-regulated, non-differentially expressed and up-regulated tags respectively.
#-1   54
#0  2153
#1    52

detags <- rownames(RG)[as.logical(de)] # detags contains the DE genes at 5% FDR
head (detags)
plotSmear(et, de.tags=detags, ylim=c(-7,7), xlim=c(0,15), cex.lab=1.4, cex.axis=1) # plot all genes and highlight DE genes at 5% FDR
abline(h=c(-1, 1), col="blue") # The blue lines indicate 2-fold changes.
title("Biofilm vs planktonic")
dev.copy2pdf(file = "Figure_1.pdf") #Save as .pdf##

# NOTE -> adjust 'n' depending on the available number of genes
Bio_vs_Plank <- topTags(et, n=2259, adjust.method="BH", sort.by="logFC")

# export excel file
x <- Bio_vs_Plank$table
WriteXLS("x","Bio_vs_Plank.xls", row.names = TRUE, col.names = TRUE)

# export genes for TopGO. DEG_up_pvalue_0.05
x.df <- Bio_vs_Plank$table
x.sub <- subset(x.df, logFC > 0 & PValue < 0.05)
x.sub
DEG_up_Pvalue_005 <- rownames(x.sub)
write.table(DEG_up_Pvalue_005, "Bio_vs_Plank_DEG_up_pvalue_005.txt", quote = FALSE, row.names = FALSE, col.names = FALSE)

# export genes for TopGO. DEG_down_pvalue_0.05
x.df <- Bio_vs_Plank$table
x.sub <- subset(x.df, logFC < 0 & PValue < 0.05)
x.sub
DEG_down_Pvalue_005 <- rownames(x.sub)
write.table(DEG_down_Pvalue_005, "Bio_vs_Plank_DEG_down_pvalue_005.txt", quote = FALSE, row.names = FALSE, col.names = FALSE)

# export reference gene set
reference_set <- rownames(RG$counts)
write.table(reference_set, "reference_set.txt", quote = FALSE, row.names = FALSE, col.names = FALSE)

rna-seq edger • 3.6k views
ADD COMMENTlink modified 3.0 years ago • written 5.0 years ago by biotech520

please edit your post and format it better, it is too long and properly readable at this time

ADD REPLYlink written 5.0 years ago by Istvan Albert ♦♦ 80k

Sorry, I was counter editing, went raw again, I made a gist for it, and was trying to embed
<script src="&lt;a href=" anonymous="" 49dd230bd6532e3aec2a"="">anonymous/49dd230bd6532e3aec2a"></script>

 

gist fails with \gist anonymous/49dd230bd6532e3aec2a

ADD REPLYlink modified 5.0 years ago • written 5.0 years ago by Sukhdeep Singh9.7k

the way it works now is that you just paste the actual url to the gist and will embed it (see the main post)

ADD REPLYlink written 5.0 years ago by Istvan Albert ♦♦ 80k

Ah, great :)

ADD REPLYlink written 5.0 years ago by Sukhdeep Singh9.7k

The code has been updated.

ADD REPLYlink written 5.0 years ago by biotech520
Please log in to add an answer.

Help
Access

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.
Powered by Biostar version 2.3.0
Traffic: 1068 users visited in the last hour