Hello everybody. Is there no better way than using this construct with R::seqinr to get the sequence based on the name rather ID of a sequence than the numeric position in the list?
seq<-eval(parse(text=(eval(paste("Fasta_from_seqinr$", "ID_I_Want", sep=""))))
The use of the eval() just seems really awkward - but does work for my purposes in a script I inherited - as does so quite well compared to the alternatives I'm aware of.
I tried using Biostrings - as some have suggested (I forget where, but seemed like a sensible approach for that need) - and might be a good choice due to speed of loading the Fasta file and the memory footprint (both vastly superior to the seqinr alternative!) but as I have ~45 000 Fasta reads to process the 'grep for ID into the names() list' is too slow to be practical:
seq<-as.vector(Fasta_from_Biostrings[[grep(myVecSample[x],names(Fasta_from_Biostrings))]])
even caching the IDs in advance didn't help much:
seq<-as.vector(Fasta_from_Biostrings[[grep(myVecSample[x], IDs)]])
Example code - sadly I can't easily supply the 33MB Fasta file for various reasons, size being one - is below as well as example output.
Other ideas I've had but excluded by a sanity check: 'you are making this way harder than it should be / might work - but still there must be a better way' :
- Build a lookup table that converts / maps sequence name to numeric location.
So: any ideas?
Tnx, MM
Example code:
#Demo / timing investigation of extracting data from a FASTA file in R by ID not by numeric index.
# Should work with most Fasta formated files with >40 sequences; IDs between the assumed to be common
#
library(seqinr)
library(Biostrings)
set.seed(001) #Meaningless without the same Fasta file, but still - it helps if so
#Demo.fasta is ~ 33MB; 48466 sequences
#Loading: method 1) using seqinr:
ptm <- proc.time()
Fasta_from_seqinr <-read.fasta("Demo.fasta")
print ("seqinr load took: "); print (proc.time()-ptm)
#Loading: method 2) using Biostrings:
ptm <- proc.time()
Fasta_from_Biostrings <-readDNAStringSet("Demo.fasta")
print ("Biostrings load took: "); print (proc.time()-ptm)
#
#Method 2 is ~ 10 faster to ~9-10 times larger in memory: 35Mb cf 271Mb
print (paste0("Size of 'seqinr' is : ", format(object.size(Fasta_from_seqinr), units="auto")))
print (paste0("Size of 'Biostrings' is: ", format(object.size(Fasta_from_Biostrings), units="auto")))
#Some contents:
print ("Example names from seqinr:"); print (head(names(Fasta_from_seqinr), n=2))
print ("Example names from Biostrings:"); print (head(names(Fasta_from_Biostrings), n=2))
IDs <-sample(names(Fasta_from_seqinr))
print (paste0("Number of IDs / Names for each (seqinr & Biostrings):", length(IDs), " & ", length(names(Fasta_from_Biostrings))))
#Pick 40 IDs 'at random' - given set.seed(001)
myVecSample <- IDs[1:40]
#Test seqinr:
print ("1) Test seqinr recall speed:")
ptm <- proc.time()
for (x in 1:length(myVecSample))
{ seq<-eval(parse(text=(eval(paste("Fasta_from_seqinr$", myVecSample[x], sep="")))))
# print (seq)
}
print (proc.time()-ptm); print ("-")
##
print ("2) Test Biostrings recall speed:")
ptm <- proc.time()
for (x in 1:length(myVecSample))
{ seq<-as.vector(Fasta_from_Biostrings[[grep(myVecSample[x], names(Fasta_from_Biostrings))]])
# print (seq)
}
print (proc.time()-ptm); print ("-")
##
print ("2a) Test Biostrings recall speed (using pre-canned ID names list):")
ptm <- proc.time()
for (x in 1:length(myVecSample))
{ seq<-as.vector(Fasta_from_Biostrings[[grep(myVecSample[x], IDs)]])
# print (seq)
}
print (proc.time()-ptm); print ("-")
` Example output:
> source ("FASTA_Extract.r")
[1] "seqinr load took: "
user system elapsed
6.072 0.004 6.078
[1] "Biostrings load took: "
user system elapsed
0.244 0.004 0.246
[1] "Size of 'seqinr' is : 271.9 Mb"
[1] "Size of 'Biostrings' is: 35.7 Mb"
[1] "Example names from seqinr:"
[1] "TRINITY_DN12769_c0_g1_i1" "TRINITY_DN12780_c0_g1_i1"
[1] "Example names from Biostrings:"
[1] "TRINITY_DN12769_c0_g1_i1 len=1769 path=[3493:0-1768] [-1, 3493, -2]"
[2] "TRINITY_DN12780_c0_g1_i1 len=844 path=[822:0-843] [-1, 822, -2]"
[1] "Number of IDs / Names for each (seqinr & Biostrings):48466 & 48466"
[1] "1) Test seqinr recall speed:"
user system elapsed
0.040 0.000 0.041
[1] "-"
[1] "2) Test Biostrings recall speed:"
user system elapsed
3.888 0.000 3.889
[1] "-"
[1] "2a) Test Biostrings recall speed (using pre-canned ID names list):"
user system elapsed
1.448 0.000 1.447
[1] "-"