Question: Arabidopsis Gene ID Conversion (BioMart, CLC Genomics Workbench Output)
1
gravatar for jamesfischer047
12 months ago by
jamesfischer04710 wrote:

Hi all,

I have an output of RNA-seq reads from CLC genomics workbench, for Arabidopsis thaliana. The list of genes contains a mix of gene names (i.e. "TRY", "TMM", "SVP", "FLC"), and IDs (e.g. "AT1G01390", "AT1G01310", "AT1G01240"). I would like to convert them all to gene names, so I can run it through a GO terms R package (the package seemingly does not read IDs like AT1G01390).

When I use biomaRt's getBM() function, it returns a lot less genes than the list of genes I'm reading into it. The original list from CLC has all Arabidopsis genes (27,655) and the outputs from getBM() generally have 12,085 gene names or less.

Anybody done this type of conversion before with success?

Thanks in advance!

I've tried various types of attributes, but none of them have worked.

reads <- as.matrix(read.csv("genereads_ONLY4.txt", sep = '\t', row.names = 1, header = TRUE))
  meta <- read.table("metatest4.txt", header = TRUE, fileEncoding= "UTF-16LE")


mart = useMart(biomart="plants_mart",host="plants.ensembl.org")
  listDatasets(useMart(biomart="plants_mart",host="plants.ensembl.org"))  
  ensembl = useDataset("athaliana_eg_gene",mart= mart)

  genes <- row.names(reads)

  test1 <- getBM(attributes='external_gene_name', 
        values = genes, 
        mart = ensembl)
rna-seq arabidopsis biomart R • 602 views
ADD COMMENTlink modified 5 months ago by Biostar ♦♦ 20 • written 12 months ago by jamesfischer04710
1
gravatar for jamesfischer047
12 months ago by
jamesfischer04710 wrote:

Okay, I found a round about way to solve this, at least for my scenario.

The gmt and fgsea information I'm using can only read gene symbols (e.g. "TRY") or entrez IDs. So I wrote a function to convert all of the information I had to either symbols or entrez IDs. The code is:

reads <- as.matrix(read.csv("genereads_ONLY4.txt", sep = '\t', row.names = 1, header = TRUE))

genes <- row.names(reads)

sum(lengths(regmatches(genes, gregexpr("\\AT[0-9]", genes, ignore.case = TRUE))))

#genes <- c("TRY", "AT2G46410", "AT5G41315", "AT2G42200", "AT1G10280")

IDconvert <- function(genes) {

  for (i in genes){

    if (grepl("AT[0-9]", i) == TRUE) {

      if is.na(getSYMBOL(i, data='org.At.tair.db')) == TRUE) {

        if is.na(getEG(i, data='org.At.tair')) == TRUE) {

          i <- i

           } else{

             name <- getEG(i, data='org.At.tair')

             name.l <- as.list(name)
             newname <- as.character(name.l[[1]])
             genes <- sub(i, newname, genes)

            }

      } else{
      name <- getSYMBOL(i, data='org.At.tair')

      name.l <- as.list(name)
      newname <- as.character(name.l[[1]])
      genes <- sub(i, newname, genes)

      }

    } else{
      NULL
    } 

  }
  return(genes)

}   


genes2 <- IDconvert(genes)

sum(lengths(regmatches(genes2, gregexpr("\\AT[0-9]", genes2, ignore.case = TRUE))))

row.names(reads) <- genes2  


gmt <- read.gmt("GSEA_BIO.gmt")
gmt.ids <- read.gmt("IB_BIO_GMT.gmt")                  
gmt.combo <- c(gmt, gmt.ids)

#Stage 3 GO terms

names3 <- row.names(sub.break3)
sub.break3$names=names3
ranks <- sub.break3$stat
names(ranks) <- sub.break3$names
sub.break3.rank <- sort(ranks, decreasing = T)

fgseaRes3 <- fgsea(pathways = gmt.combo, 
                  stats = sub.break3.rank,
                  minSize=5,
                  maxSize=500,
                  nperm=100000)
fgsea3.sig <- fgseaRes3[pval < 0.05]
pathways.stg3 <- fgsea3.sig$pathway



#Stage 1 GO terms

names1 <- row.names(sub.break1)
sub.break1$names=names1
ranks <- sub.break1$stat
names(ranks) <- sub.break1$names
sub.break1.rank <- sort(ranks, decreasing = T)

fgseaRes1 <- fgsea(pathways = gmt.combo, 
                  stats = sub.break1.rank,
                  minSize=5,
                  maxSize=500,
                  nperm=100000)
fgsea1.sig <- fgseaRes1[pval < 0.05]
pathways.stg1 <- fgsea1.sig$pathway


#Stage 2 GO terms

names2 <- row.names(sub.break2)
sub.break2$names=names2
ranks <- sub.break2$stat
names(ranks) <- sub.break2$names
sub.break2.rank <- sort(ranks, decreasing = T)

fgseaRes2 <- fgsea(pathways = gmt.combo, 
                   stats = sub.break2.rank,
                   minSize=5,
                   maxSize=500,
                   nperm=100000)
fgsea2.sig <- fgseaRes2[pval < 0.05]
pathways.stg2 <- fgsea2.sig$pathway



#Stage 4 GO terms

names4 <- row.names(sub.break4)
sub.break4$names=names4
ranks <- sub.break4$stat
names(ranks) <- sub.break4$names
sub.break4.rank <- sort(ranks, decreasing = T)

fgseaRes4 <- fgsea(pathways = gmt.combo, 
                   stats = sub.break4.rank,
                   minSize=5,
                   maxSize=500,
                   nperm=100000)
fgsea4.sig <- fgseaRes4[pval < 0.05]
pathways.stg4 <- fgsea4.sig$pathway
#openxlsx::write.xlsx(fgsea4.sig, "fgsea_stg4_t1.xlsx")


#GO Venn-----------------------------------

group.venn(list(One = pathways.stg1, 
                Two = pathways.stg2, 
                Three = pathways.stg3, 
                Four = pathways.stg4), 
           fill = c("orange", "green", "red", "blue"))
ADD COMMENTlink written 12 months ago by jamesfischer04710
0
gravatar for zhao03
12 months ago by
zhao0330
zhao0330 wrote:

Hi This is a website tool for ID Conversion https://biotools.fr/human/refseq_symbol_converter

ADD COMMENTlink written 12 months ago by zhao0330
Please log in to add an answer.

Help
Access

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.
Powered by Biostar version 2.3.0
Traffic: 1214 users visited in the last hour