assembly id from protein accession via entrez?
1
0
Entering edit mode
4.2 years ago
anabaena ▴ 10

Hey all, I am trying to retrieve assembly IDs to append to a pandas DataFrame. I have a BLAST output .tsv generated from an alignment against a local database I made, so what I have are the protein accession IDs. What I need are the GenBank assembly accession IDs. When I search, I can find the contig or WGS IDs, but from there I hit an HTTP error when trying to efetch the assembly GenBank (gb) file so that I can parse it and extract the assembly ID. Below is my code; any help would be appreciated. Thank you!

# Contact e-mail configured on the Entrez module (redacted in this post).
Entrez.email = 'xxxx@xxx.xxx'

# Request the GenBank flat file ('gb', plain text) for PBRQ01000000 from nuccore.
# NOTE(review): the traceback quoted below shows this same call with
# db='assembly' instead of db='nuccore'; the HTTP 400 belongs to that variant,
# so confirm which database was actually queried when the error occurred.
handle2 = Entrez.efetch(id='PBRQ01000000', db='nuccore', rettype='gb', retmode='text')
# SeqIO.read expects the handle to contain exactly one GenBank record.
record2 = SeqIO.read(handle2, 'gb')
print(record2)

---------------------------------------------------------------------------
HTTPError                                 Traceback (most recent call last)
 in 
      3 Entrez.email =xxxxxx
      4 
----> 5 handle2 = Entrez.efetch(id='PBRQ01000000', db='assembly', rettype='gb', retmode='text')
      6 record2 = SeqIO.read(handle2, 'gb')
      7 print(record2)

~/miniconda3/lib/python3.7/site-packages/Bio/Entrez/__init__.py in efetch(db, **keywords)
    197             # more than about 200 IDs
    198             post = True
--> 199     return _open(cgi, variables, post=post)
    200 
    201 

~/miniconda3/lib/python3.7/site-packages/Bio/Entrez/__init__.py in _open(cgi, params, post, ecitmatch)
    567                 handle = _urlopen(cgi, data=_as_bytes(options))
    568             else:
--> 569                 handle = _urlopen(cgi)
    570         except _URLError as exception:
    571             # Reraise if the final try fails

~/miniconda3/lib/python3.7/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    220     else:
    221         opener = _opener
--> 222     return opener.open(url, data, timeout)
    223 
    224 def install_opener(opener):

~/miniconda3/lib/python3.7/urllib/request.py in open(self, fullurl, data, timeout)
    529         for processor in self.process_response.get(protocol, []):
    530             meth = getattr(processor, meth_name)
--> 531             response = meth(req, response)
    532 
    533         return response

~/miniconda3/lib/python3.7/urllib/request.py in http_response(self, request, response)
    639         if not (200 <= code < 300):
    640             response = self.parent.error(
--> 641                 'http', request, response, code, msg, hdrs)
    642 
    643         return response

~/miniconda3/lib/python3.7/urllib/request.py in error(self, proto, *args)
    567         if http_err:
    568             args = (dict, 'default', 'http_error_default') + orig_args
--> 569             return self._call_chain(*args)
    570 
    571 # XXX probably also want an abstract factory that knows when it makes

~/miniconda3/lib/python3.7/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
    501         for handler in handlers:
    502             func = getattr(handler, meth_name)
--> 503             result = func(*args)
    504             if result is not None:
    505                 return result

~/miniconda3/lib/python3.7/urllib/request.py in http_error_default(self, req, fp, code, msg, hdrs)
    647 class HTTPDefaultErrorHandler(BaseHandler):
    648     def http_error_default(self, req, fp, code, msg, hdrs):
--> 649         raise HTTPError(req.full_url, code, msg, hdrs, fp)
    650 
    651 class HTTPRedirectHandler(BaseHandler):

HTTPError: HTTP Error 400: Bad Request
biopython entrez Assembly • 997 views
ADD COMMENT
0
Entering edit mode
4.2 years ago
anabaena ▴ 10

If anyone is interested, here is the solution to what I was trying to do:

def find_contig_id(protein):
    """Resolve a protein accession to a versioned nucleotide accession.

    Steps:
      1. Fetch the GenPept record for *protein* and read its ``db_source``
         annotation (e.g. ``'accession PBRQ01000001.1'``).
      2. Fetch the nucleotide record named after the ``accession`` keyword.
      3. Take that record's second listed accession when it has one,
         otherwise its only accession.
      4. Look the chosen accession up with ESummary and return its
         ``AccessionVersion``.

    Args:
        protein: Protein accession (a BLAST subject ID, for example).

    Returns:
        The ``AccessionVersion`` string reported by ESummary.

    Raises:
        ValueError: If ``db_source`` contains no ``accession <id>`` pair.
    """
    from Bio import Entrez, SeqIO
    Entrez.email = 'xxxxxx'
    print(protein)

    protein_handle = Entrez.efetch(id=protein, db='protein', rettype='gb', retmode='text')
    try:
        protein_record = SeqIO.read(protein_handle, 'gb')
    finally:
        # Always release the HTTP connection, even if parsing fails.
        protein_handle.close()

    db_source = protein_record.annotations['db_source']
    # str.lstrip('accession ') removes a *set of characters*, not a prefix, so
    # it leaves prefixed values such as 'REFSEQ: accession X' untouched and
    # could eat leading characters of the accession itself. Pick the token
    # that follows the 'accession' keyword instead.
    tokens = db_source.split()
    try:
        source_accession = tokens[tokens.index('accession') + 1]
    except (ValueError, IndexError):
        raise ValueError(
            'No source accession found in db_source %r for protein %r'
            % (db_source, protein)
        ) from None

    nucleotide_handle = Entrez.efetch(id=source_accession, db='nucleotide', rettype='gb', retmode='text')
    try:
        nucleotide_record = SeqIO.read(nucleotide_handle, 'gb')
    finally:
        nucleotide_handle.close()

    accessions = nucleotide_record.annotations['accessions']
    # The original always took index 1; records that list a single accession
    # now fall back to it rather than raising IndexError.
    result = accessions[1] if len(accessions) > 1 else accessions[0]

    # rettype/retmode='text' are dropped: ESummary output is parsed as XML by
    # Entrez.read below, and XML is what ESummary returns by default.
    summary_handle = Entrez.esummary(db='nucleotide', id=result)
    try:
        summary_record = Entrez.read(summary_handle)
    finally:
        summary_handle.close()

    accession_version = summary_record[0]['AccessionVersion']
    print(accession_version)
    return accession_version

def Retrieve_Assembly_Accession(contig):
    """Return the assembly accession of the first Assembly hit for *contig*.

    Searches the Entrez ``assembly`` database with *contig* as the query term,
    then reads the ``AssemblyAccession`` field from the ESummary of the first
    returned UID.

    Args:
        contig: Search term, e.g. the accession returned by ``find_contig_id``.

    Returns:
        The ``AssemblyAccession`` string, or ``None`` when the search returns
        no UIDs.
    """
    from Bio import Entrez
    Entrez.email = 'xxxxxx'

    search_handle = Entrez.esearch(db='assembly', term=contig)
    try:
        search_record = Entrez.read(search_handle)
    finally:
        # Always release the HTTP connection, even if parsing fails.
        search_handle.close()

    uid_list = search_record['IdList']
    if not uid_list:
        # Previously an implicit None from falling out of the loop.
        return None

    # The original loop returned during its first iteration, so only the first
    # UID was ever used; say so directly (and stop shadowing the builtin `id`).
    summary_handle = Entrez.esummary(db='assembly', id=uid_list[0], report='full')
    try:
        summary_record = Entrez.read(summary_handle)
    finally:
        summary_handle.close()

    result_final = summary_record['DocumentSummarySet']['DocumentSummary'][0]['AssemblyAccession']
    print(result_final)
    return result_final


# Parallel result lists, filled in the row order of the BLAST table:
# contig_id[i] and accession_id[i] both belong to the i-th 'Subject ID'.
contig_id, accession_id = [], []

for subject_accession in aioA_database['Subject ID']:
    # Protein accession -> nucleotide accession -> assembly accession.
    contig = find_contig_id(subject_accession)
    contig_id.append(contig)
    accession_id.append(Retrieve_Assembly_Accession(contig))
ADD COMMENT
1
Entering edit mode

Email address was inadvertently left in the answer. Redacted.

ADD REPLY

Login before adding your answer.

Traffic: 2664 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6