Question

assembly id from protein accession via entrez?

0

Entering edit mode

4.2 years ago

anabaena ▴ 10

Hey all, I am trying to retrieve assembly IDs to append to a pandas dataframe. I have a BLAST output .tsv generated from an alignment against a local database I made, and what I have are the protein accession IDs. What I need are the GenBank assembly accession IDs. When I search, I can find the contig or WGS IDs, but from there I hit an HTTP error when trying to efetch the assembly GenBank (gb) file so that I can parse it and extract the assembly ID. Below is my code; any help would be appreciated, thank you!

# Set the contact e-mail on Biopython's Entrez module before making requests.
Entrez.email = 'xxxx@xxx.xxx'

# Fetch the record 'PBRQ01000000' from the 'nuccore' database as GenBank
# flat-file text, parse it as a single GenBank record, and print it.
# NOTE(review): the traceback that follows shows this same call issued with
# db='assembly' (not 'nuccore'), and that is the variant that ends in
# "HTTP Error 400: Bad Request". Presumably the assembly database does not
# serve rettype='gb' through efetch -- confirm against the E-utilities docs.
handle2 = Entrez.efetch(id='PBRQ01000000', db='nuccore', rettype='gb', retmode='text')
record2 = SeqIO.read(handle2, 'gb')
print(record2)

---------------------------------------------------------------------------
HTTPError                                 Traceback (most recent call last)
 in 
      3 Entrez.email =xxxxxx
      4 
----> 5 handle2 = Entrez.efetch(id='PBRQ01000000', db='assembly', rettype='gb', retmode='text')
      6 record2 = SeqIO.read(handle2, 'gb')
      7 print(record2)

~/miniconda3/lib/python3.7/site-packages/Bio/Entrez/__init__.py in efetch(db, **keywords)
    197             # more than about 200 IDs
    198             post = True
--> 199     return _open(cgi, variables, post=post)
    200 
    201 

~/miniconda3/lib/python3.7/site-packages/Bio/Entrez/__init__.py in _open(cgi, params, post, ecitmatch)
    567                 handle = _urlopen(cgi, data=_as_bytes(options))
    568             else:
--> 569                 handle = _urlopen(cgi)
    570         except _URLError as exception:
    571             # Reraise if the final try fails

~/miniconda3/lib/python3.7/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    220     else:
    221         opener = _opener
--> 222     return opener.open(url, data, timeout)
    223 
    224 def install_opener(opener):

~/miniconda3/lib/python3.7/urllib/request.py in open(self, fullurl, data, timeout)
    529         for processor in self.process_response.get(protocol, []):
    530             meth = getattr(processor, meth_name)
--> 531             response = meth(req, response)
    532 
    533         return response

~/miniconda3/lib/python3.7/urllib/request.py in http_response(self, request, response)
    639         if not (200 <= code < 300):
    640             response = self.parent.error(
--> 641                 'http', request, response, code, msg, hdrs)
    642 
    643         return response

~/miniconda3/lib/python3.7/urllib/request.py in error(self, proto, *args)
    567         if http_err:
    568             args = (dict, 'default', 'http_error_default') + orig_args
--> 569             return self._call_chain(*args)
    570 
    571 # XXX probably also want an abstract factory that knows when it makes

~/miniconda3/lib/python3.7/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
    501         for handler in handlers:
    502             func = getattr(handler, meth_name)
--> 503             result = func(*args)
    504             if result is not None:
    505                 return result

~/miniconda3/lib/python3.7/urllib/request.py in http_error_default(self, req, fp, code, msg, hdrs)
    647 class HTTPDefaultErrorHandler(BaseHandler):
    648     def http_error_default(self, req, fp, code, msg, hdrs):
--> 649         raise HTTPError(req.full_url, code, msg, hdrs, fp)
    650 
    651 class HTTPRedirectHandler(BaseHandler):

HTTPError: HTTP Error 400: Bad Request

biopython entrez Assembly • 997 views

ADD COMMENT • link 4.2 years ago by anabaena ▴ 10

GenoMax · Accepted Answer · 2020-01-30

If anyone is interested, here is the solution to what I was trying to do:

def find_contig_id(protein):
    """Resolve a protein accession to a versioned nucleotide accession.

    The lookup is a three-step chain of Entrez requests:

    1. Fetch the protein's GenBank record and read its ``db_source``
       annotation, which names the nucleotide record the protein came from.
    2. Fetch that nucleotide record and take the *second* entry of its
       ``accessions`` annotation (presumably the WGS master / contig-set
       accession for WGS records -- verify for non-WGS inputs).
    3. Ask esummary for that accession and return its ``AccessionVersion``.

    Args:
        protein: Protein accession (e.g. a BLAST subject ID) to look up.

    Returns:
        The ``AccessionVersion`` string reported by esummary.

    Raises:
        KeyError: If the protein record has no ``db_source`` annotation.
        IndexError: If the nucleotide record lists fewer than two accessions.
        urllib.error.HTTPError: If any Entrez request is rejected.
    """
    from contextlib import closing

    from Bio import Entrez, SeqIO
    Entrez.email = 'xxxxxx'
    print(protein)

    # Close every Entrez handle as soon as it has been parsed so that a long
    # run over many proteins does not accumulate open HTTP connections.
    with closing(Entrez.efetch(id=protein, db='protein', rettype='gb',
                               retmode='text')) as eser_handle:
        eser_record = SeqIO.read(eser_handle, 'gb')

    interim = eser_record.annotations['db_source']
    # str.lstrip() removes any leading characters from the given *set*, not a
    # literal prefix, so it silently does nothing for values such as
    # "REFSEQ: accession NZ_..." and could eat leading lowercase characters.
    # Take whatever follows the "accession " marker instead, falling back to
    # the raw value when the marker is absent.
    _, marker_found, after_marker = interim.partition('accession ')
    research = (after_marker if marker_found else interim).strip()

    with closing(Entrez.efetch(id=research, db='nucleotide', rettype='gb',
                               retmode='text')) as r_handle:
        r_record = SeqIO.read(r_handle, 'gb')

    # Index 1 is the secondary accession listed on the ACCESSION line.
    result = r_record.annotations['accessions'][1]

    # rettype/retmode are passed through unchanged from the original call.
    with closing(Entrez.esummary(db='nucleotide', id=result, rettype='gb',
                                 retmode='text')) as s_handle:
        s_record = Entrez.read(s_handle)

    accession_version = s_record[0]['AccessionVersion']
    print(accession_version)
    return accession_version

def Retrieve_Assembly_Accession(contig):
    """Return the assembly accession of the first Assembly hit for *contig*.

    Searches the Entrez ``assembly`` database using *contig* as the query
    term, then reads the ``AssemblyAccession`` field from the document
    summary of the first returned ID.

    Args:
        contig: Search term for the assembly database, typically the
            accession returned by ``find_contig_id``.

    Returns:
        The ``AssemblyAccession`` string of the first hit, or ``None`` when
        the search returns no IDs.

    Raises:
        urllib.error.HTTPError: If an Entrez request is rejected.
    """
    from contextlib import closing

    from Bio import Entrez
    Entrez.email = 'xxxxxx'

    # Close each handle once parsed to avoid leaking HTTP connections when
    # this is called in a loop.
    with closing(Entrez.esearch(db='assembly', term=contig)) as esearch_file:
        esearch_record = Entrez.read(esearch_file)

    assembly_ids = esearch_record['IdList']
    if not assembly_ids:
        # No assembly matched the term; callers receive None, as before.
        return None

    # Only the first hit is ever used: the original loop returned on its
    # first iteration, so this states that intent directly (and no longer
    # shadows the builtin ``id``).
    first_id = assembly_ids[0]
    with closing(Entrez.esummary(db='assembly', id=first_id,
                                 report='full')) as es_handle:
        es_record = Entrez.read(es_handle)

    result_final = es_record['DocumentSummarySet']['DocumentSummary'][0]['AssemblyAccession']
    print(result_final)
    return result_final


# Parallel result lists: entry i of each list belongs to the i-th value of
# the 'Subject ID' column.
contig_id, accession_id = [], []

for x in aioA_database['Subject ID']:
    # Step 1: protein accession -> contig / nucleotide accession.
    query = find_contig_id(x)
    contig_id.append(query)

    # Step 2: contig accession -> assembly accession. The contig is recorded
    # before this lookup runs, so a failure here leaves it in contig_id.
    accession = Retrieve_Assembly_Accession(query)
    accession_id.append(accession)