I've written code for Batch CD-Search that currently works on macOS and might work on Windows as well with some tweaks, for which I've made some notes. Here is a summary of what the code does (bear with me, because this might be a long one):
- Takes an input DIRECTORY, parses all the fasta files into a
dictionary
- Splits each fasta file into chunks of 950 sequences to comply with the
1000-sequence limit of the batch mode
- Uses Selenium to open new browser windows, paste the sequences, and run the search
- Implements explicit WebDriverWaits to make sure everything runs smoothly
- Downloads the concise hit data
- Deposits the download into an output directory in the same location
as the input called "CDsearch_batch_res"
Here is what the code cannot do (yet, I will work on these soon):
- Alter the CD-search search parameters (evalue, database to search,
max aligns, etc)
- Download specific files (full, superfamily, alignments, etc.)
- Make sure that it's fully compatible with all operating systems
I've used it myself to do a Monte Carlo substitution experiment to see if sequences would still be identifiable by CD-Search after n substitutions, which was run on a database of well over 50,000 sequences. Hope it helps!
ALSO NOTE THAT YOU NEED TO SET UP CHROME WEB DRIVER YOURSELF!
Hope it helps!
"""
Takes an input fasta file, splits it, and iteratively searches the NCBI conserved
domain BATCH search website for hits & returns sequences with specified hits.
- Requires selenium to be installed
- Requires a specified webdriver (Chrome by default)
- Waits for your browser for a maximum of 10 minutes if browser freezes
- Requires you to modify the Keys.COMMAND if you use windows
"""
import os
import sys
from itertools import islice

import pyperclip
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Ask for the folder that holds the FASTA files. Surrounding whitespace is
# removed because a path pasted or dragged into a terminal often carries a
# trailing space, which would make the later os.chdir() call fail.
inputdir = input("\nEnter the input file directory with all fasta files: ").strip()

# Ask for the chunk size. The value is kept as text because the rest of the
# script converts it with int(); it is only validated here.
maxseqinput = input(
    "\nEnter the maximum number of sequences to load in each entry (max 950): "
).strip()

# Fail fast on an unusable chunk size instead of erroring later, after the
# output directory has already been created.
try:
    _maxseq_is_valid = int(maxseqinput) >= 1
except ValueError:
    _maxseq_is_valid = False
if not _maxseq_is_valid:
    raise SystemExit(
        "The maximum number of sequences must be a positive whole number."
    )
def fasta2dict(fastafile):
    """Read a FASTA file and return its records as a dictionary.

    The file is parsed line by line: every line that starts with ">" opens a
    new record, and the lines that follow it (up to the next header) are
    joined into that record's sequence. Header text is kept verbatim, so
    apostrophes, tabs, backslashes and ">" characters inside a description
    survive unchanged.

    Args:
        fastafile: Path of the FASTA file to read.

    Returns:
        A dict mapping each full header line (including the leading ">") to
        its sequence with line breaks removed, in file order. If a header
        occurs more than once, the last record wins. Lines that appear before
        the first header are ignored. An empty dict is returned (and a
        message printed) when the file does not exist.
    """
    records = {}
    try:
        with open(fastafile, "r", errors="ignore", encoding="UTF-8") as inputfile:
            current_parts = None
            for raw_line in inputfile:
                line = raw_line.rstrip("\n")
                if line.startswith(">"):
                    # Start a new record; re-using a header replaces the
                    # earlier record's sequence.
                    current_parts = []
                    records[line] = current_parts
                elif current_parts is not None:
                    # Sequence line: drop surrounding whitespace so stray
                    # spaces or blank lines do not end up in the sequence.
                    current_parts.append(line.strip())
    except FileNotFoundError:
        print("Could not open/read file: ", fastafile)
        return {}
    return {header: "".join(parts) for header, parts in records.items()}
def chunks(inputdict, maxsize=950):
    """Yield successive sub-dictionaries of at most ``maxsize`` entries.

    The entries are handed out in the dictionary's own iteration order, so
    concatenating the yielded pieces reproduces ``inputdict``. The default of
    950 matches the batch size this script submits to the CDD batch search.
    """
    key_stream = iter(inputdict)
    # One pass of the outer loop per chunk; the shared iterator remembers
    # where the previous chunk stopped.
    for _start in range(0, len(inputdict), maxsize):
        piece = {}
        for key in islice(key_stream, maxsize):
            piece[key] = inputdict[key]
        yield piece
# ---------------------------------------------------------------------------
# Script body: create the output folder, then submit every FASTA file found
# in the input folder to the Batch CD-Search page, one chunk per browser
# session, and download the hit data for each chunk.
# ---------------------------------------------------------------------------

# Work with an absolute path: after os.chdir() a relative path would no
# longer resolve, and the browser download preference below is given an
# absolute directory as well.
inputdir = os.path.abspath(os.path.expanduser(inputdir))
os.chdir(inputdir)

# Create a fresh output directory next to the input files. When the name is
# taken, try "<base>_1", "<base>_2", ... (each candidate is built from the
# base name so the suffixes do not pile up).
OUTPUT_BASENAME = "CDsearch_batch_res"
outputdir = os.path.join(inputdir, OUTPUT_BASENAME)
FILECNTNAME = 1
while True:
    try:
        os.mkdir(outputdir)
        break
    except FileExistsError:
        outputdir = os.path.join(
            inputdir, OUTPUT_BASENAME + "_" + str(FILECNTNAME)
        )
        FILECNTNAME += 1

# Maximum time (seconds) to wait for any single page element or download.
DELAY = 10 * 60
SEARCH_URL = "https://www.ncbi.nlm.nih.gov/Structure/bwrpsb/bwrpsb.cgi"
TXTBOX = '//*[@id="frm_New_Search"]/div/table/tbody/tr/td/table/tbody/tr[1]/td[1]/div[2]/textarea'
SRCHBTT = '//*[@id="frm_New_Search"]/div/table/tbody/tr/td/table/tbody/tr[2]/td/table/tbody/tr/td[3]/input'
DWNLDHBTT = '//*[@id="tbl_DLPanel"]/tbody/tr[3]/td[3]/input'
FASTA_EXTENSIONS = (".fa", ".fasta", ".fna")
# Paste shortcut: Command+V on macOS, Control+V everywhere else.
PASTE_MODIFIER = Keys.COMMAND if sys.platform == "darwin" else Keys.CONTROL


def _download_finished(directory, already_present):
    """Return True once ``directory`` holds a new, fully written file.

    ``already_present`` is the set of file names that existed before the
    download was started.
    """
    new_names = set(os.listdir(directory)) - already_present
    # NOTE(review): assumes Chrome marks in-progress downloads with a
    # ".crdownload"/".tmp" suffix or a hidden dot-file name -- confirm for
    # the Chrome version in use.
    unfinished = [
        name
        for name in new_names
        if name.startswith(".") or name.endswith((".crdownload", ".tmp"))
    ]
    return bool(new_names) and not unfinished


def _search_chunk(sequence_text, download_dir):
    """Run one batch search in its own Chrome session and download the hits.

    Args:
        sequence_text: FASTA-formatted text to paste into the query box.
        download_dir: Absolute directory Chrome should download into.

    Returns:
        True when the download completed, False when any step timed out
        (the timeout is reported on stdout).
    """
    chromeopt = webdriver.ChromeOptions()
    chromeopt.add_experimental_option(
        "prefs", {"download.default_directory": download_dir}
    )
    # "options=" replaces the deprecated "chrome_options=" keyword.
    driver = webdriver.Chrome(options=chromeopt)
    try:
        driver.get(SEARCH_URL)
        wait = WebDriverWait(driver, DELAY)

        # Paste through the clipboard rather than typing the text.
        textarea = wait.until(
            EC.visibility_of_element_located((By.XPATH, TXTBOX))
        )
        pyperclip.copy(sequence_text)
        textarea.send_keys(PASTE_MODIFIER, "v")

        wait.until(EC.visibility_of_element_located((By.XPATH, SRCHBTT))).click()

        downbutton = wait.until(
            EC.visibility_of_element_located((By.XPATH, DWNLDHBTT))
        )
        present_before = set(os.listdir(download_dir))
        downbutton.click()
        # Keep the browser alive until the file has arrived; quitting right
        # after the click could cut the download short.
        wait.until(
            lambda _driver: _download_finished(download_dir, present_before)
        )
        return True
    except TimeoutException as ex:
        print("Exception has been thrown. " + str(ex))
        return False
    finally:
        # Always end the session so no browser or driver process is left
        # behind, whether the chunk succeeded or not.
        driver.quit()


# Convert once; the value does not change between files or chunks.
CHUNK_SIZE = int(maxseqinput)

for filename in os.listdir(inputdir):
    filepath = os.path.join(inputdir, filename)
    root, ext = os.path.splitext(filename)
    # Only regular files with a FASTA extension are processed; this also
    # skips the output directory and any other sub-folder.
    if ext.lower() not in FASTA_EXTENSIONS or not os.path.isfile(filepath):
        continue
    # fasta2dict may return nothing for an unreadable file; treat as empty.
    sequencedict = fasta2dict(filepath) or {}
    for sliceddict in chunks(sequencedict, CHUNK_SIZE):
        SEQUENCE = "\n".join(
            "\n".join((key, val)) for (key, val) in sliceddict.items()
        )
        if not _search_chunk(SEQUENCE, outputdir):
            # Flag kept from the original script; nothing in this file
            # reads it. The loop simply moves on to the next chunk.
            ISRUNNING = 0
You can use a module called Selenium in Python to access CD Search. From there, it is just a way to find the elements that you want.
For example, you can run a quick search for a specific sequence with the following script:
What changes should I make in the code if I want to use Batch CD-Search :
https://www.ncbi.nlm.nih.gov/Structure/bwrpsb/bwrpsb.cgi