Get cDNA sequences from NCBI for given protein IDs
        2 
    
    
    
        
        
        
        
            
                
                
                    
                        
                    
                
                    
                        Hi,
I have following NCBI protein ids, which I got from blast result. I want cDNA sequences for all of them. How can I get them. I tried batch entrez but it didn't work 
WP_114529907
WP_086161776
WP_023545800
WP_055635421
WP_046726840
WP_019985024
WP_030816474
WP_015656893
WP_066997879
WP_076090756
WP_046591128
WP_037737828
WP_019061909
WP_031182504
WP_029184528
WP_107430231
WP_075738394
WP_018547093
WP_050371941
WP_058922273
WP_014675804
WP_010039119
WP_059042595
WP_031045624
WP_003993952
WP_067124593
WP_069770111
WP_085209034
 
                 
                
                
                    
                    
    
        
        
            ncbi
         
        
    
        
        
            proteinid
         
        
    
        
        
            cdna
         
        
    
    
        • 2.8k views
    
 
                
                 
            
            
         
     
 
     
    
        
            
                
    
    
    
    
        
        
        
        
            
                
                
                    
                        
                    
                
                    
                        You could get all CDS associated with the nucleotide record and filter the fasta file for sequences of interest, based on the presence of the protein ID in the header.
elink -target nuccore -db protein -name protein_nuccore -id WP_114529907|efetch -format fasta_cds_na > all_cds.fasta
 
                 
                
                
                 
            
            
         
     
 
         
        
            
                
    
    
    
    
        
        
        
        
            
                
                
                    
                        
                    
                
                    
                        Once all_cds.fasta downloaded as mentioned above by @SejModha, below is the R script to get the query sequences. 
library(tidyverse)
## ncbi query protein ids  for which  we need cDNA fasta file 
query_ids <- c("WP_030938735.1", "WP_073767452.1", "WP_075030623.1", "WP_073737703.1", "WP_084907526.1", "WP_058922273.1", "WP_069743312.1", "WP_063483499.1", "WP_086727825.1", "WP_100828975.1") %>% tibble(V1= .)
## get sequences in R
fasta_cdna <- Biostrings::readDNAStringSet("./all_cds.fasta")
fasta_cdna_ids <- base::names(fasta_cdna) 
fasta_cdna_ids %>% head()
#> [1] "lcl|NZ_QLLY01000003.1_cds_WP_146614694.1_1 [locus_tag=K377_RS06515] [protein=DUF4331 domain-containing protein] [protein_id=WP_146614694.1] [location=<1..501] [gbkey=CDS]"                            
#> [2] "lcl|NZ_QLLY01000003.1_cds_WP_111662221.1_2 [locus_tag=K377_RS06520] [protein=hypothetical protein] [protein_id=WP_111662221.1] [location=583..1914] [gbkey=CDS]"                                       
#> [3] "lcl|NZ_QLLY01000003.1_cds_WP_111662644.1_3 [locus_tag=K377_RS06525] [protein=nickel transporter] [protein_id=WP_111662644.1] [location=2067..3539] [gbkey=CDS]"                                        
#> [4] "lcl|NZ_QLLY01000003.1_cds_WP_146614695.1_4 [locus_tag=K377_RS06530] [protein=hypothetical protein] [protein_id=WP_146614695.1] [location=complement(3622..3861)] [gbkey=CDS]"                          
#> [5] "lcl|NZ_QLLY01000003.1_cds_WP_111662645.1_5 [locus_tag=K377_RS06535] [protein=ABC transporter permease] [protein_id=WP_111662645.1] [location=complement(3995..4783)] [gbkey=CDS]"                      
#> [6] "lcl|NZ_QLLY01000003.1_cds_WP_111662223.1_6 [locus_tag=K377_RS06540] [protein=ATP-binding cassette domain-containing protein] [protein_id=WP_111662223.1] [location=complement(4822..5922)] [gbkey=CDS]"
## from original header get protein id for each seq and assign it as sequence id 
get_prot_id <- as_mapper(~ gsub(pattern = ".*\\[protein_id=(.*?)\\].*" , replacement = "\\1" ,x = .))
pp_id <-map_chr(fasta_cdna_ids ,get_prot_id)
length(pp_id)
#> [1] 1131303
pp_id %>% sample(10)
#>  [1] "WP_062925461.1"                                                                                                                                                     
#>  [2] "WP_143613249.1"                                                                                                                                                     
#>  [3] "WP_100109264.1"                                                                                                                                                     
#>  [4] "WP_095873712.1"                                                                                                                                                     
#>  [5] "WP_078631405.1"                                                                                                                                                     
#>  [6] "WP_099926778.1"                                                                                                                                                     
#>  [7] "lcl|NZ_CP041609.1_cds_8438 [locus_tag=FNV67_RS44310] [protein=hypothetical protein] [pseudo=true] [partial=3'] [location=complement(<9384513..9384704)] [gbkey=CDS]"
#>  [8] "WP_143607681.1"                                                                                                                                                     
#>  [9] "WP_067427611.1"                                                                                                                                                     
#> [10] "WP_097242740.1"
names(fasta_cdna) <- pp_id
## keep only unique sequences  
fasta_cdna_uniq <- fasta_cdna[ ! (names(fasta_cdna) %>% duplicated())] 
## get your sequences for your query id 
query_fasta_cdna <- fasta_cdna_uniq [names(fasta_cdna_uniq) %in% query_ids$V1] 
length(query_fasta_cdna)
#> [1] 9
## ids for which sequences not found 
not_found <- query_ids$V1[! query_ids$V1 %in% names(query_fasta_cdna)]
not_found %>% length()
#> [1] 1
## write fasta 
Biostrings::writeXStringSet(query_fasta_cdna , filepath = "./query_id_cdna.fasta")
## write not found ids
write_delim(path =  "./query_id_not_found.txt" , x = not_found %>% as.data.frame() , delim = "\t")
 
                 
                
                
                 
            
            
         
     
 
         
        
 
    
    
        
            
                  before adding your answer.
         
    
    
         
        
            
        
     
    
    Traffic: 3786 users visited in the last hour
         
    
    
        
    
    
 
Since these are
WP_*records they are special non-redundant protein ID's. They refer to more than one genome so that is a reason you are not able to get cDNA sequence directly.Thanks!
Somehow I got the nucleotide (cDNA) co-ordinates for my query protein id (
Wp_*). Is there a way to get sequences from NCBI using nucletide co-ordinates ?