Trimming local GB file with Biopython
1
0
Entering edit mode
7.8 years ago
13dsc • 0

I was wondering if I could use Biopython in order to trim a local gb file.

Ex. I have a 100kb gb file but I am interested in a continuous 25kb portion. I know the location of the start and stop of the 25kb region. How do I create a gb file which contains only that 25kb portion from the larger file.

python biopython genbank parsing • 2.3k views
ADD COMMENT
1
Entering edit mode

You can. Look into into SeqIO, and string slicing.

ADD REPLY
0
Entering edit mode

will that keep the gene annotations associated with the gb file for that slice?

ADD REPLY
0
Entering edit mode

Yes, just edit the seq record, and output the Seq object to a new file.

ADD REPLY
2
Entering edit mode
7.8 years ago
Joe 22k

I wrote a script that does exactly this a while back.

https://github.com/jrjhealey/bioinfo-tools/blob/master/Genbank_slicer.py

#!/usr/bin/python
# This script is designed to take a genbank file and 'slice out'/'subset'
# regions (genes/operons etc.) and produce a separate file. This can be
# done explicitly by telling the script which base sites to use, or can
# 'decide' for itself by blasting a fasta of the sequence you're inter-
# ed in against the Genbank you want to slice a record out of.
# Note, the script (obviously) does not preseve the index number of the
# bases from the original
# Based upon the tutorial at:
# http://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc44
# This script depends on BLAST and having the makeblastdb functionality
# available if BLAST_MODE is active. It also depends on Biopython.
# Set up and handle arguments:
import warnings
import sys
import subprocess
import os
import time
from Bio import SeqIO
__author__ = "Joe R. J. Healey"
__version__ = "1.1.0"
__title__ = "Genbank_slicer"
__license__ = "GPLv3"
__author_email__ = "J.R.J.Healey@warwick.ac.uk"
# Import SearchIO and suppress experimental warning
from Bio import BiopythonExperimentalWarning
with warnings.catch_warnings():
warnings.simplefilter('ignore', BiopythonExperimentalWarning)
from Bio import SearchIO
def convert(basename, genbank):
'''Convert the provided genbank to a fasta to BLAST.'''
refFasta = "{}.fasta.tmp".format(basename)
SeqIO.convert(genbank, 'genbank', refFasta, 'fasta')
return refFasta
def runBlast(basename, refFasta, fasta, verbose):
'''Synthesise BLAST commands and make system calls'''
resultHandle = "{}.blastout.tmp".format(basename)
blastdb_cmd = 'makeblastdb -in {0} -dbtype nucl -title temp_blastdb'.format(refFasta)
blastn_cmd = 'blastn -query {0} -strand both -task blastn -db {1} -perc_identity 100 -outfmt 6 -out {2} -max_target_seqs 1'.format(fasta, refFasta, resultHandle)
print("Constructing BLAST Database: " + '\n' + blastdb_cmd)
print("BLASTing: " + '\n' + blastn_cmd)
DB_process = subprocess.Popen(blastdb_cmd,
shell=True,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
DB_process.wait()
BLAST_process = subprocess.Popen(blastn_cmd,
shell=True,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
BLAST_process.wait()
return resultHandle
def getIndices(resultHandle):
'''If not provided directly by the user, this function retrieves the best BLAST hit's indices.'''
blast_result = SearchIO.read(resultHandle, 'blast-tab')
print(blast_result[0][0])
start = blast_result[0][0].hit_start
end = blast_result[0][0].hit_end
return start, end
def slice(start, end, genbank, FPoffset, TPoffset):
'''Subset the provided genbank to return the sub record.'''
seqObj = SeqIO.read(genbank, 'genbank')
subRecord = seqObj[start-FPoffset:end+TPoffset]
return subRecord
def main():
###################################################################################################
# Parse command line arguments
import argparse
try:
parser = argparse.ArgumentParser(
description='This script slices entries such as genes or operons out of a genbank, subsetting them as their own file.')
parser.add_argument(
'-g',
'--genbank',
action='store',
required=True,
help='The genbank file you wish to subset.')
parser.add_argument(
'-o',
'--outfile',
action='store',
help='If specifed, the script will write a file, otherwise redirect STDOUT for pipelines.')
parser.add_argument(
'-s',
'--start',
type=int,
help='Integer base index to slice from.')
parser.add_argument(
'-e',
'--end',
type=int,
default=0,
help='Integer base index to slice to.')
parser.add_argument(
'-F',
'--FPoffset',
action='store',
type=int,
default=0,
help='If you want to capture regions around the target site, specify the 5\' offset.')
parser.add_argument(
'-T',
'--TPoffset',
action='store',
type=int,
default=0,
help='If you want to capture regions around the target site, specify the 3\' offset.')
parser.add_argument(
'-b',
'--blastmode',
action='store_true',
help='If flag is set, provide an input fasta (-f | --fasta), and BLAST will be called to find your sequence indices in the original genbank.')
parser.add_argument(
'-f',
'--fasta',
action='store',
help='The operon fasta to pull annotations from the provided genbank.')
parser.add_argument(
'-t',
'--tidy',
action='store_true',
help='Tell the script whether or not to remove the temporary files it generated during processing. Off by default. WARNING: removes files based on the "tmp" string.')
parser.add_argument(
'-m',
'--meta',
action='store',
default=None,
help='Specify a string to use as the Genbank meta-data fields if the parent one doesn\'t contain anything. Else inherits from parent sequence.')
parser.add_argument(
'-v',
'--verbose',
action='store_true',
help='Verbose behaviour. Shows progress of BLAST etc. if appropriate.')
args = parser.parse_args()
except NameError:
print "An exception occured with argument parsing. Check your provided options."
genbank = args.genbank
fasta = args.fasta
split = os.path.splitext(args.genbank)
basename = os.path.basename(split[0])
start = args.start
end = args.end
FPoffset = args.FPoffset
TPoffset = args.TPoffset
blastMode = args.blastmode
outfile = args.outfile
fasta = args.fasta
tidy = args.tidy
meta = args.meta
verbose = args.verbose
# Main code:
if blastMode is not False and fasta is not None:
refFasta = convert(basename,genbank)
resultHandle = runBlast(basename, refFasta, fasta, verbose)
start, end = getIndices(resultHandle)
else:
if fasta is None:
print("No fasta was provided so BLAST mode cannot be used.")
if start is None or end is None:
print('No slice indices have been specified or retrieved from blastout')
sys.exit(1)
subRecord = slice(start, end, genbank, FPoffset, TPoffset)
# Populate the metadata of the genbank
if meta is not None:
subRecord.id = meta
subRecord.locus = meta + 'subregion'
subRecord.accession = meta
subRecord.name = meta
subRecord.annotations["date"] = time.strftime("%d-%b-%Y").upper()
# Other field options include:
# subRecord.annotations["source"]
# ["taxonomy"]
# ["keywords"]
# ["references"]
# ["accessions"]
# ["data_file_division"] e.g. BCT, PLN, UNK
# ["organism"]
# ["topology"]
if outfile is not None:
SeqIO.write(subRecord, outfile, "genbank")
else:
print(subRecord.format('genbank'))
if tidy is True:
if verbose is True:
rm="rm -v ./*tmp*"
else:
rm="rm ./*tmp*"
subprocess.Popen(rm,shell=True)
if __name__ == "__main__":
main()

ADD COMMENT

Login before adding your answer.

Traffic: 2896 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6