Arabidopsis Gene ID Conversion (BioMart, CLC Genomics Workbench Output)

Question

I have an output of RNA-seq reads from CLC genomics workbench, for Arabidopsis thaliana. The list of genes contains a mix of gene names (i.e. "TRY", "TMM", "SVP", "FLC"), and IDs (e.g. "AT1G01390", "AT1G01310", "AT1G01240"). I would like to convert them all to gene names, so I can run it through a GO terms R package (the package seemingly does not read IDs like AT1G01390).

When I use biomaRt's getBM() function, it returns a lot less genes than the list of genes I'm reading into it. The original list from CLC has all Arabidopsis genes (27,655) and the outputs from getBM() generally have 12,085 gene names or less.

Anybody done this type of conversion before with success?

Thanks in advance!

I've tried various types of attributes, but none of them have worked.

#data load in and conversions, meta matrix/design creation:
    #reads file was created in CLC Genomics Workbench, then the reads column copied and pasted for
      #each sample

  reads <- as.matrix(read.csv("genereads_ONLY4.txt", sep = '\t', row.names = 1, header = TRUE))
  meta <- read.table("metatest4.txt", header = TRUE, fileEncoding= "UTF-16LE")


mart = useMart(biomart="plants_mart",host="plants.ensembl.org")
  listDatasets(useMart(biomart="plants_mart",host="plants.ensembl.org"))  
  ensembl = useDataset("athaliana_eg_gene",mart= mart)

  genes <- row.names(reads)

  test1 <- getBM(attributes='external_gene_name', 
        values = genes, 
        mart = ensembl)

score 0 · Answer 1 · answered May 13 '19 at 16:25

Okay, I found a round about way to solve this, at least for my scenario.

The gmt and fgsea information I'm using can only read gene symbols (e.g. "TRY") or entrez IDs. So I wrote a function to convert all of the information I had to either symbols or entrez IDs. The code is:

  reads <- as.matrix(read.csv("genereads_ONLY4.txt", sep = '\t', row.names = 1, header = TRUE))

genes <- row.names(reads)

sum(lengths(regmatches(genes, gregexpr("\\AT[0-9]", genes, ignore.case = TRUE))))

#genes <- c("TRY", "AT2G46410", "AT5G41315", "AT2G42200", "AT1G10280")

IDconvert <- function(genes) {

  for (i in genes){

    if (grepl("AT[0-9]", i) == TRUE) {

      if (is.na(getSYMBOL(i, data='org.At.tair.db')) == TRUE) {

        if (is.na(getEG(i, data='org.At.tair')) == TRUE) {

          i <- i

           } else{

             name <- getEG(i, data='org.At.tair')

             name.l <- as.list(name)
             newname <- as.character(name.l[[1]])
             genes <- sub(i, newname, genes)

            }

      } else{
      name <- getSYMBOL(i, data='org.At.tair')

      name.l <- as.list(name)
      newname <- as.character(name.l[[1]])
      genes <- sub(i, newname, genes)

      }

    } else{
      NULL
    } 

  }
  return(genes)

}   


genes2 <- IDconvert(genes)

sum(lengths(regmatches(genes2, gregexpr("\\AT[0-9]", genes2, ignore.case = TRUE))))

row.names(reads) <- genes2  


gmt <- read.gmt("GSEA_BIO.gmt")
gmt.ids <- read.gmt("IB_BIO_GMT.gmt")                  
gmt.combo <- c(gmt, gmt.ids)

#Stage 3 GO terms

names3 <- row.names(sub.break3)
sub.break3$names=names3
ranks <- sub.break3$stat
names(ranks) <- sub.break3$names
sub.break3.rank <- sort(ranks, decreasing = T)

fgseaRes3 <- fgsea(pathways = gmt.combo, 
                  stats = sub.break3.rank,
                  minSize=5,
                  maxSize=500,
                  nperm=100000)
fgsea3.sig <- fgseaRes3[pval < 0.05]
pathways.stg3 <- fgsea3.sig$pathway



#Stage 1 GO terms

names1 <- row.names(sub.break1)
sub.break1$names=names1
ranks <- sub.break1$stat
names(ranks) <- sub.break1$names
sub.break1.rank <- sort(ranks, decreasing = T)

fgseaRes1 <- fgsea(pathways = gmt.combo, 
                  stats = sub.break1.rank,
                  minSize=5,
                  maxSize=500,
                  nperm=100000)
fgsea1.sig <- fgseaRes1[pval < 0.05]
pathways.stg1 <- fgsea1.sig$pathway


#Stage 2 GO terms

names2 <- row.names(sub.break2)
sub.break2$names=names2
ranks <- sub.break2$stat
names(ranks) <- sub.break2$names
sub.break2.rank <- sort(ranks, decreasing = T)

fgseaRes2 <- fgsea(pathways = gmt.combo, 
                   stats = sub.break2.rank,
                   minSize=5,
                   maxSize=500,
                   nperm=100000)
fgsea2.sig <- fgseaRes2[pval < 0.05]
pathways.stg2 <- fgsea2.sig$pathway



#Stage 4 GO terms

names4 <- row.names(sub.break4)
sub.break4$names=names4
ranks <- sub.break4$stat
names(ranks) <- sub.break4$names
sub.break4.rank <- sort(ranks, decreasing = T)

fgseaRes4 <- fgsea(pathways = gmt.combo, 
                   stats = sub.break4.rank,
                   minSize=5,
                   maxSize=500,
                   nperm=100000)
fgsea4.sig <- fgseaRes4[pval < 0.05]
pathways.stg4 <- fgsea4.sig$pathway
#openxlsx::write.xlsx(fgsea4.sig, "fgsea_stg4_t1.xlsx")


#GO Venn-----------------------------------

group.venn(list(One = pathways.stg1, 
                Two = pathways.stg2, 
                Three = pathways.stg3, 
                Four = pathways.stg4), 
           fill = c("orange", "green", "red", "blue"))

Arabidopsis Gene ID Conversion (BioMart, CLC Genomics Workbench Output)

1 Answers1