Question: assembly id from protein accession via entrez?
Hey all, I am trying to retrieve assembly IDs to append to a pandas dataframe. I have a BLAST output .tsv generated from an alignment against a local database I made, and what I have are the protein accession IDs. What I need are the GenBank assembly accession IDs. When I search I can find the contig or WGS IDs, but from there I get an HTTP error when trying to efetch the assembly gb file so that I can parse it and extract the assembly ID. Below is my code; any help would be appreciated, thank you!

Entrez.email = ''

# Request the GenBank-format text for the accession, then parse the response
# as a single GenBank record.
handle2 = Entrez.efetch(id='PBRQ01000000', db='nuccore', rettype='gb', retmode='text')
record2 = SeqIO.read(handle2, 'gb')

HTTPError                                 Traceback (most recent call last)
      3 Entrez.email = 'xxxxxx'
----> 5 handle2 = Entrez.efetch(id='PBRQ01000000', db='assembly', rettype='gb', retmode='text')
      6 record2 = SeqIO.read(handle2, 'gb')
      7 print(record2)

~/miniconda3/lib/python3.7/site-packages/Bio/Entrez/__init__.py in efetch(db, **keywords)
    197             # more than about 200 IDs
    198             post = True
--> 199     return _open(cgi, variables, post=post)

~/miniconda3/lib/python3.7/site-packages/Bio/Entrez/__init__.py in _open(cgi, params, post, ecitmatch)
    567                 handle = _urlopen(cgi, data=_as_bytes(options))
    568             else:
--> 569                 handle = _urlopen(cgi)
    570         except _URLError as exception:
    571             # Reraise if the final try fails

~/miniconda3/lib/python3.7/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    220     else:
    221         opener = _opener
--> 222     return opener.open(url, data, timeout)
    224 def install_opener(opener):

~/miniconda3/lib/python3.7/urllib/request.py in open(self, fullurl, data, timeout)
    529         for processor in self.process_response.get(protocol, []):
    530             meth = getattr(processor, meth_name)
--> 531             response = meth(req, response)
    533         return response

~/miniconda3/lib/python3.7/urllib/request.py in http_response(self, request, response)
    639         if not (200 <= code < 300):
    640             response = self.parent.error(
--> 641                 'http', request, response, code, msg, hdrs)
    643         return response

~/miniconda3/lib/python3.7/urllib/request.py in error(self, proto, *args)
    567         if http_err:
    568             args = (dict, 'default', 'http_error_default') + orig_args
--> 569             return self._call_chain(*args)
    571 # XXX probably also want an abstract factory that knows when it makes

~/miniconda3/lib/python3.7/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
    501         for handler in handlers:
    502             func = getattr(handler, meth_name)
--> 503             result = func(*args)
    504             if result is not None:
    505                 return result

~/miniconda3/lib/python3.7/urllib/request.py in http_error_default(self, req, fp, code, msg, hdrs)
    647 class HTTPDefaultErrorHandler(BaseHandler):
    648     def http_error_default(self, req, fp, code, msg, hdrs):
--> 649         raise HTTPError(req.full_url, code, msg, hdrs, fp)
    651 class HTTPRedirectHandler(BaseHandler):

HTTPError: HTTP Error 400: Bad Request
If anyone is interested here is the solution to what I was trying to do:

def find_contig_id(protein):
    """Return the versioned nucleotide accession linked to a protein accession.

    The lookup takes three Entrez round trips:

    1. Fetch the protein's GenBank record and read its ``db_source``
       annotation, which names the nucleotide record the protein came from.
    2. Fetch that nucleotide record and take the second entry of its
       ``accessions`` annotation (presumably the WGS master accession for a
       WGS contig record -- verify for other record types).
    3. Ask ``esummary`` for that accession and return its
       ``AccessionVersion`` field.

    Args:
        protein: Protein accession (or other identifier accepted by the
            Entrez ``protein`` database).

    Returns:
        The ``AccessionVersion`` string of the first summary returned for
        the second accession of the source nucleotide record.

    Raises:
        KeyError: If the protein record has no ``db_source`` annotation.
        IndexError: If the nucleotide record lists fewer than two accessions.
        urllib.error.HTTPError: If an Entrez request is rejected.
    """
    from Bio import Entrez, SeqIO

    # Contact address sent with every Entrez request (redacted placeholder).
    Entrez.email = 'xxxxxx'

    protein_handle = Entrez.efetch(id=protein, db='protein', rettype='gb', retmode='text')
    try:
        protein_record = SeqIO.read(protein_handle, 'gb')
    finally:
        protein_handle.close()

    # Drop the literal "accession " prefix. str.lstrip() would instead strip
    # any leading run of the characters "acesion " and could eat into the
    # identifier itself.
    source = protein_record.annotations['db_source']
    prefix = 'accession '
    if source.startswith(prefix):
        source = source[len(prefix):]
    nucleotide_id = source.lstrip()

    nucleotide_handle = Entrez.efetch(id=nucleotide_id, db='nucleotide', rettype='gb', retmode='text')
    try:
        nucleotide_record = SeqIO.read(nucleotide_handle, 'gb')
    finally:
        nucleotide_handle.close()

    # Index 1 is the second accession listed on the nucleotide record.
    second_accession = nucleotide_record.annotations['accessions'][1]

    summary_handle = Entrez.esummary(db='nucleotide', id=second_accession, rettype='gb', retmode='text')
    try:
        summary = Entrez.read(summary_handle)
    finally:
        summary_handle.close()

    return summary[0]['AccessionVersion']

def Retrieve_Assembly_Accession(contig):
    """Return the assembly accession of the first Assembly hit for ``contig``.

    Searches the Entrez ``assembly`` database with ``contig`` as the search
    term, then reads the ``AssemblyAccession`` field from the document
    summary of the first UID returned.

    Args:
        contig: Search term for the ``assembly`` database, e.g. a contig or
            WGS accession.

    Returns:
        The ``AssemblyAccession`` string of the first hit, or ``None`` when
        the search returns no UIDs.

    Raises:
        urllib.error.HTTPError: If an Entrez request is rejected.
    """
    from Bio import Entrez

    # Contact address sent with every Entrez request (redacted placeholder).
    Entrez.email = 'xxxxxx'

    search_handle = Entrez.esearch(db='assembly', term=contig)
    try:
        search_record = Entrez.read(search_handle)
    finally:
        search_handle.close()

    uids = search_record['IdList']
    if not uids:
        # No assembly matched the search term.
        return None

    # Only the first hit is used; any further UIDs are ignored.
    summary_handle = Entrez.esummary(db='assembly', id=uids[0], report='full')
    try:
        summary = Entrez.read(summary_handle)
    finally:
        summary_handle.close()

    return summary['DocumentSummarySet']['DocumentSummary'][0]['AssemblyAccession']

# Parallel result lists: one entry per row of the BLAST table, in row order.
contig_id = []
accession_id = []

# For each subject (protein) accession, resolve its nucleotide accession and
# then the matching assembly accession. Each iteration makes several Entrez
# requests.
for x in aioA_database['Subject ID']:
    query = find_contig_id(x)
    accession = Retrieve_Assembly_Accession(query)
    contig_id.append(query)
    accession_id.append(accession)

Email address was inadvertently left in the answer. Redacted.

ADD REPLYlink written 9 weeks ago by genomax80k
