Closed:How to get fasta by gene id
2
0
Entering edit mode
6.7 years ago
Xylanaser ▴ 80

Hey, I'm tired with this... I'm trying to get fasta sequences(~1000) by gene id. And i wrote several scrips ( with selenium ) and all is ok but sometimes all scripts breaks because of error ( database sometimes not found record, but this record exist :/ after 5 minuts i check or page infinite loading) even scripts as efetch from eutiles are stuck.... Anyone have solution for me? please

i have problem with exception handling :/

def name_add_missedIDs(name):
    split_name = name.split('.') 
    new_name = split_name[0] + '_missedIDs' + '.txt'
    return new_name

def name_add_result(name):
    split_name = name.split('.') 
    new_name = split_name[0] + '_sequences' + '.txt'
    return new_name


name = 'names.txt'

with open(name,'r')as file:
    from selenium import webdriver
    import re
    import time
    from selenium.common.exceptions import TimeoutException

    sentences_list = []
    for sentence in file:
        sentences_list.append(sentence)


    browser = webdriver.Chrome()
    counter = 0
    for phrase in sentences_list:
        counter += 1
        print(counter, phrase)
        browser.implicitly_wait(60)
        browser.get('https://www.ncbi.nlm.nih.gov/protein/?term=' + phrase )
        try:
            try:
                print('    try-1')
                time.sleep(3)
                browser.implicitly_wait(60)
                browser.find_element_by_css_selector('div.rprt:nth-child(1) > div:nth-child(2) > p:nth-child(1) > a:nth-child(1)').click()
                browser.implicitly_wait(60)
                time.sleep(3)
                browser.find_element_by_css_selector('#EntrezSystem2\.PEntrez\.Protein\.Sequence_ResultsPanel\.Sequence_DisplayBar\.Display').click()
                time.sleep(3)
                browser.implicitly_wait(60)
                browser.find_element_by_css_selector('#display_settings_menu_report > fieldset > ul > li:nth-child(5) > label').click()
            except:
                try:
                    print('    except-a')
                    time.sleep(3)
                    browser.implicitly_wait(60)
                    browser.find_element_by_css_selector('#EntrezSystem2\.PEntrez\.Protein\.Sequence_ResultsPanel\.Sequence_DisplayBar\.Display').click()
                    time.sleep(3)
                    browser.implicitly_wait(60)
                    browser.find_element_by_css_selector('#display_settings_menu_report > fieldset > ul > li:nth-child(5) > label').click()
                except:
                    if browser.find_element_by_css_selector('#msgportlet > li:nth-child(1) > span').text == 'The following id was not found in Protein:' \
                    or browser.find_element_by_css_selector('#msgportlet > li:nth-child(1) > span').text == 'Search failed!' or \
                    browser.find_element_by_css_selector('#msgportlet > li > span').text == 'The requested page does not exist.':
                        print("reload")
                        time.sleep(60)
                        browser.refresh()
                        browser.implicitly_wait(60)
                        browser.get('https://www.ncbi.nlm.nih.gov/protein/')
                        browser.implicitly_wait(60)
                        browser.get('https://www.ncbi.nlm.nih.gov/protein/?term=' + phrase )
                        try:
                            print('    try-1b')
                            time.sleep(3)
                            browser.implicitly_wait(60)
                            browser.find_element_by_css_selector('div.rprt:nth-child(1) > div:nth-child(2) > p:nth-child(1) > a:nth-child(1)').click()
                            browser.implicitly_wait(60)
                            time.sleep(3)
                            browser.find_element_by_css_selector('#EntrezSystem2\.PEntrez\.Protein\.Sequence_ResultsPanel\.Sequence_DisplayBar\.Display').click()
                            time.sleep(3)
                            browser.implicitly_wait(60)
                            browser.find_element_by_css_selector('#display_settings_menu_report > fieldset > ul > li:nth-child(5) > label').click()
                        except:
                            print('    except-b')
                            time.sleep(3)
                            browser.implicitly_wait(60)
                            browser.find_element_by_css_selector('#EntrezSystem2\.PEntrez\.Protein\.Sequence_ResultsPanel\.Sequence_DisplayBar\.Display').click()
                            time.sleep(3)
                            browser.implicitly_wait(60)
                            browser.find_element_by_css_selector('#display_settings_menu_report > fieldset > ul > li:nth-child(5) > label').click()
                            browser.implicitly_wait(60)
    time.sleep(5)
    sequence = browser.find_element_by_css_selector('#viewercontent1').text
    adding_gene_name = '>' + phrase.strip() + ' '
    sequence_with_gene_id = re.sub('>', adding_gene_name, sequence )
    with open(name_add_result(name), "a") as f:
        f.write(sequence_with_gene_id + '\n')
with open(name_add_missedIDs(name),'a') as missedIDs:
    seqs = open(name_add_result(name),'r') 
    for row in sentences_list:
        if row not in seqs:
            missedIDs.write(row)
    seqs.close()
gene sequence protein biopython entrez • 200 views
ADD COMMENT
This thread is not open. No new answers may be added
Traffic: 2480 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6