Question: How to compare and categorize enrichment CO terms from multiple files?
1
gravatar for thomas.f.hahn2
4.1 years ago by
United States
thomas.f.hahn2100 wrote:

How to compare and categorize enrichment CO terms from multiple files?

I need to find a way to substitute more specific child GO terms with less specific parent GO terms since my remaining vision is not good enough for comparing them by visually looking at the enriched GO terms.  I have copy-pasted my latest version of the R code I am currently using at the end of this email.  Could anyone please assist me in accomplishing this or connect me to somebody, who might?  I only have 4 weeks left to finish my thesis.  If I cannot defend by the end of March my funding will be discontinued. 

Could you please respond to me directly via email (Thomas.F.Hahn2@gmail.com) or Skype (tfh002) or cell phone (318 243 3940) because my text to speech software, on which I am depending to access electronic information due to my very limited remaining vision, cannot read out all portions of this website aloud to me?  Fortunately, it is much more compatible with the G-mail and Skype interface thus allowing me to communicate much more efficiently and reliably when using those.   

 

I am very grateful for any kind of assistance in earning a PhD in bioinformatics despite being almost blind.

 

With very warm regards,

 

Thomas Hahn

Cell phone: 318 243 3940

Skype ID: tfh002

Email: Thomas.F.Hahn2@gmail.com

 

#################################################################################################
###
### This script combines the two subscripts that preparation and plotting of the log-fold
### yeast expression changes under CR and YEPD
###
### Additional (minor) changes:
###   - 
###
### Feb 20, 2015
###
#################################################################################################

message("Clean up workspace")
rm(list=ls(all=T))

#------------------------------------------------------------------------------------------------
### Generate scatter plot input file from:
###   1. Annotation file from Expression Lab (Affy)
###   2. Expression file from Expression Lab (Affy)
### Creates:
###   1. *.for.scatterplot.txt file in the <dir.scatter> folder in the current working directory
message("Create Scatterplot input files...")
setwd("D:/Analysis")

## INPUT FILES (relative or full path)
file="Our_data_from_expression_console.txt" # file with data from expression console
annotation = read.delim("Our_data_annotation.txt", header = F) # annotation for replicates

## OUTPUT directory
my.directory = dir.scatter="2015_02_26_Erg6"
chosen.folder = "chosen_genes"

#######################################################
replicates = tapply(annotation[,1], annotation[,2], list)
normdata = read.delim(file, check.names = F)

####Inserted by Patrick on Thursday, February 26th to set the minimum threshold to 7.6
adaptLowerBound <- TRUE # set to FALSE if no adaption so take place !!!

if (adaptLowerBound) {
  tmp.colIndices <- 2:7 # column indices of expression values !!!
  lowerBound <- 7.61 # hard-coded lower boundary of expression values !!!
  normdata.orig <- normdata
  normdata.sub <- normdata[,tmp.colIndices]
  normdata.sub[which(normdata.sub <= lowerBound, arr.ind=T)] <- lowerBound
  normdata[,tmp.colIndices] <- normdata.sub
}


# calculate the mean for replicates and add them to the table
for (i in 1:length(replicates)){
  new.data=apply(normdata[,as.character(unlist(replicates[i])),drop=FALSE] ,2, function(x)as.numeric(as.character(x)))
  normdata[, names(replicates)[i]] = rowMeans(new.data)  
}


# get the ratios that we want
ratios = t(combn(names(replicates),2))

write.ratios = vector()
for (i in 1:nrow(ratios)){
  ratio =  paste(ratios[i,], collapse="/")
  normdata[,ratio] = 2^(normdata[,ratios[i, 1]])/2^(normdata[,ratios[i,2]])
  write.ratios = c(write.ratios, ratio)
}

for (i in 1:nrow(ratios)){
  ratio =  paste(ratios[i,2], ratios[i,1], sep="/")
  normdata[,ratio] = 2^(normdata[,ratios[i, 2]])/2^(normdata[,ratios[i,1]])
  write.ratios = c(write.ratios, ratio)
}


normdata$Systematic = normdata[,"Representative Public ID"]
normdata$Common = normdata[, "Gene Symbol"]
normdata$Probesets = normdata[, "Probe Set ID"]
multiple=names(which(table(normdata$Systematic)>1))

# remove the ones with multiple probes and leave only the conditions and ratios
final.normdata=normdata[!(as.character(normdata$Systematic) %in% multiple), c("Probesets","Systematic","Common",names(replicates), write.ratios)]

pombe = grep("^SP",as.character(final.normdata$Systematic))
affy = grep("^AF", as.character(final.normdata$Systematic))
remove = c(pombe, affy)

if (length(remove)>0){
  final.normdata = final.normdata[-remove,]
}


normfile=gsub(".txt", ".for.scatterplot.txt", file)
dir.create(dir.scatter, showWarnings=F)
fn.scatterInput <- file.path(dir.scatter, normfile)

write.table(final.normdata, fn.scatterInput, sep="\t", row.names=FALSE, col.names=TRUE, quote=F)

message("  Done.")





#------------------------------------------------------------------------------------------------

### Generate scatter plot from:
###   1. *.for.scatterplot.txt file in the "Scatterplot_out" folder in the current working directory (not required to be specified)
###   2. *.txt list of genes
### Creates:
###   1. *.for.scatterplot.txt file in the "Scatterplot_out" folder in the current working directory
message("Create Scatterplot...")

#setwd("D:/Tang/Patrick/Friday_Feb20_2015")

## INPUT FILES
# read in the file with chosen genes from a txt file
# give the names of the chosen files, the colors and the labels
chosen.input.files = c(
  "Ergosterol SGD 150 genes.txt",
  "Dietary Restriction 72 genes.txt",
  "Autophagy SGD 61 genes.txt",
  "Sphingolipids 12 genes.txt",
  "ER lipid enzymes 9 genes.txt",
  "Flipases 7 genes.txt",
  "mito lipid enzymes 2 genes.txt",
  "Vacuolar Lipid Enzymes 2 genes.txt",
  "Phospholipid binding 16 genes.txt",
  "Glucose transport 9 genes.txt"
)


# what is the gene universe
#universe=read.delim(file=fn.scatterInput, header=T)
#universe$Systematic=gsub("[.]S1", "", universe$Systematic)# replaces .S1 with empty strings

universe=read.delim(file=fn.scatterInput, header=T)
universe$Systematic=gsub("[.]S1", "", universe$Systematic)# replaces .S1 with empty strings

# select the columns
yaxis="Erg6c.WTc"
#yaxis="Our.WTc.Our.WTy"
#yaxis="WTy"
#yaxis = colnames(universe)[4]#sets the Y axis to the name of the last coumn
#yaxis = colnames(universe)[ncol(universe)]#sets the Y axis to the name of the last coumn

#yaxis=ncol(universe)
#yaxis="Ag15c.Atg15yR1"
yratio=1.5
#xaxis=colnames(universe)[5]
xaxis="Erg6y.WTy"
#xaxis="WT.CR.2.WT.2"
#xaxis="X2.G.12.h"

xratio=1.5
logarithm = TRUE # for the ratio the logarithm should be set to TRUE
enrichment = TRUE
my.legend = TRUE
Brown = FALSE
no.black = FALSE # if set to TRUE it does not do enrichment for the chosen genes.  True means it will do everything, except for the chosen genes

black.only = FALSE # put it to FALSE if you want all genes
#Black = TRUE# it does enrichment only for the black genes (put it to FALSE to do it for everybody).  False means that it enriches the over and under expressed but not the chosen


if (enrichment == TRUE){
  library(GOstats)
  library(org.Sc.sgd.db)
}


exclusion.quadrant = FALSE
xaxis.limit = 7.61
yaxis.limit = 7.61
exclusion.color = "violet"

################################

number.of.lines = 5

chosen.colors = c("cornflowerblue", "gray53", "seagreen4", "blue", "deeppink3", "saddlebrown", "red", "black", "blueviolet", "darkblue")

chosen.labels = c("cornflowerblue", "gray53", "seagreen4", "blue", "deeppink3", "saddlebrown", "red", "black", "blueviolet", "darkblue")

my.directory = "2015_02_26_Erg6"

#####################################################################

### choose colors for the points

above.line.color = "yellow"
below.line.color = "cyan"
handpicked.color = "saddlebrown"
middle.genes.color = "moccasin"

above.line.label = "red"
below.line.label = "blue"
handpicked.label = "burlywood4"
middle.genes.label = "navajowhite4"

### choose colors for the lines
upper.diagonal.color = "yellow"
lower.diagonal.color = "yellow"
left.vertical.line = "yellow"
right.vertical.line = "yellow"
upper.horizontal.line = "yellow"
lower.horizontal.line = "yellow"

# define the shapes
shape.above.line = 17
shape.below.line = 15
legend.size = 0.4# font size
point.size = 0.4# size of points in scatter plot
show.above.names = "TRUE" #you can change it to FALSE
show.below.names = "TRUE"
show.middle.names = "FALSE"
show.chosen = "TRUE"


# define parameters for gene ontology enrichment
hgCutoff <- 0.05
FDR.thresh=0.1

text.cex=0.25 # This number tells the font size of the selected genes.
#######################################################################
#######################################################################
ADD COMMENTlink modified 4.1 years ago by Giovanni M Dall'Olio26k • written 4.1 years ago by thomas.f.hahn2100
Please log in to add an answer.

Help
Access

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.
Powered by Biostar version 2.3.0
Traffic: 895 users visited in the last hour