I created a Python script for doing this, although it appears to be quite slow. The main bottleneck appears to be the `show-aligns` call, but I could be wrong about that.
The main function to use is `delta_to_fasta`. It requires Pandas and Biopython as dependencies. If it is running really slowly, you can try turning on `verbose=True` to monitor progress.
Update/edit: For really large query sequences (e.g., a 25 GB file, but not a 500 MB one), writing to the output file appears to happen only in chunks: the output file remains empty until after verbose mode reports that roughly 20-25 pairs of ref+query sequences should already have been written. I don't know the cause of this. In any case, because the script streams via generators, you can watch partial output as writing progresses and still keep some output if the run is canceled partway through — which is helpful given how slow this can be.
import subprocess
import re
import pandas as pd
from io import StringIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqIO import write
## NOTE: `mummer` command returns `.mum` format files, for which this script is powerless
## NOTE: Only `nucmer` or `promer` commands return files in required `.delta` format.
## NOTE: I learned this the hard way -- you don't have to.
def delta_file_to_dataframe(delta_filename, skiprows=4):
    """
    Run ``show-coords`` on a .delta file and parse its tab-separated output
    into a pandas DataFrame.

    '1' and 'R' refer to reference sequences, and '2' and 'Q' to query
    sequences; reference and query correspond to the order in which the
    files were given to the ``nucmer`` or ``promer`` command.

    Parameters
    ----------
    delta_filename : str
        Path to a .delta file produced by ``nucmer`` or ``promer``.
    skiprows : int, optional
        Number of header lines in the ``show-coords`` output to skip
        (default 4).

    Returns
    -------
    pandas.DataFrame
        One row per match, with coordinate, length, identity, coverage,
        and id columns.
    """
    # -T: tab-delimited output; -c/-l: add coverage and length columns;
    # -r: sort by reference. check=True surfaces a missing or failing
    # MUMmer installation instead of silently parsing empty output.
    completed = subprocess.run(['show-coords', '-c', '-l', '-r', '-T', delta_filename],
                               stdout=subprocess.PIPE, check=True)
    coords_text = completed.stdout.decode('utf-8')
    column_names = ['S1', 'E1', 'S2', 'E2', 'LEN 1', 'LEN 2', '% IDY',
                    'LEN R', 'LEN Q', 'COV R', 'COV Q', 'Ref Id', 'Query Id']
    return pd.read_csv(StringIO(coords_text), sep='\t', skiprows=skiprows,
                       index_col=False, header=None, names=column_names)
def get_alignments_from_ids(delta_filename, ref_id, query_id):
    """
    Run ``show-aligns`` for one (reference id, query id) pair and return a
    DataFrame with one row per alignment.

    Parameters
    ----------
    delta_filename : str
        Path to the .delta file.
    ref_id, query_id : str
        Sequence ids as they appear in the delta file.

    Returns
    -------
    pandas.DataFrame
        Columns: ref_direction, ref_start, ref_end, query_direction,
        query_start, query_end (all strings captured from the header),
        plus ref_sequence and query_sequence (upper-cased alignment text,
        gaps kept as '.').
    """
    # check=True surfaces a failing `show-aligns` call instead of silently
    # parsing empty output.
    completed = subprocess.run(['show-aligns', delta_filename, ref_id, query_id],
                               stdout=subprocess.PIPE, check=True)
    alignments = completed.stdout.decode('utf-8')
    # `show-aligns` does no sorting by default, so we _may_ assume the
    # alignments appear in the same order as in the delta file.
    # "Beginning delimiter" of every alignment in the `show-aligns` output.
    # Raw strings avoid invalid-escape-sequence warnings for \[, \-, etc.
    begin_alignment_regex = (r'-- BEGIN alignment \[ (?P<ref_direction>[+\-])1 '
                             r'(?P<ref_start>[0-9]+) - (?P<ref_end>[0-9]+) \|'
                             r' (?P<query_direction>[+\-])1 (?P<query_start>[0-9]+)'
                             r' - (?P<query_end>[0-9]+) \]\n\n')
    # "End delimiter" of every alignment in the `show-aligns` output.
    end_alignment_regex = (r'\n\n--\s+END alignment '
                           r'\[ [+\-]1 [0-9]+ - [0-9]+ \| [+\-]1 [0-9]+ - [0-9]+ \]')
    # (?s) makes '.' match newlines, so everything between a BEGIN and its
    # matching END block is captured (non-greedily) as `alignment_string`.
    parse_regex = '(?s)' + begin_alignment_regex + '(?P<alignment_string>.*?)' + end_alignment_regex
    parsed_alignments = pd.DataFrame([match.groupdict()
                                      for match in re.finditer(parse_regex, alignments)])
    # Split off the raw alignment text and convert each chunk into its
    # reference/query sequence pair.
    alignment_strings = list(parsed_alignments['alignment_string'])
    parsed_alignments = parsed_alignments.drop(columns=['alignment_string'])
    ref_sequences, query_sequences = zip(*(clean_alignment_string(alignment_string)
                                           for alignment_string in alignment_strings))
    parsed_alignments['ref_sequence'] = ref_sequences
    parsed_alignments['query_sequence'] = query_sequences
    return parsed_alignments
def extract_ref_and_query_seq(cleaned_lines):
    """
    Pull the (reference, query) sequence pair out of one two-line
    ``show-aligns`` chunk, where each line is a leading position number
    followed by whitespace and the sequence text.

    Returns
    -------
    tuple of (str, str)
        (reference sequence text, query sequence text).

    Raises
    ------
    IndexError
        If *cleaned_lines* does not contain the expected two-line pattern
        (same behavior as the original implementation).
    """
    # Raw string avoids invalid-escape warnings for \s on modern Python.
    return re.findall(r'[0-9]+\s+(.*)\n[0-9]+\s+(.*)', cleaned_lines)[0]
def clean_alignment_string(alignment_string):
    """
    Convert the body of one ``show-aligns`` alignment block into a
    (ref_sequence, query_sequence) pair of upper-cased, concatenated
    strings. Gap characters ('.') are preserved.
    """
    # The line of spaces, |'s, and numbers printed ABOVE each
    # reference/query sequence pair in the show-aligns output.
    # Raw strings avoid invalid-escape warnings for \s, \|, \^.
    seqs_overline_regex = r'[\n]?[\s0-9\|]+\n'
    # The line of ^'s printed BENEATH each pair, pointing at mismatches.
    seqs_underline_regex = r'\n[\s\^]+[\n]?'
    # The non-greedy '.*?' is essential: greedy matching would swallow the
    # first and last sequence lines. (?s) lets '.' cross newlines.
    cleaned_seqs = re.findall('(?s)' + seqs_overline_regex + '(.*?)' + seqs_underline_regex,
                              alignment_string)
    extracted_sequences = [extract_ref_and_query_seq(chunk) for chunk in cleaned_seqs]
    # Transpose [(ref, qry), ...] into ([refs...], [qrys...]) and join each
    # side into one continuous upper-cased sequence.
    ref_parts, query_parts = zip(*extracted_sequences)
    ref_sequence = ''.join(ref_parts).upper()
    query_sequence = ''.join(query_parts).upper()
    return ref_sequence, query_sequence
def get_seq_record(group, delta_filename, ref_sequences=True, query_sequences=True, ungap=False, verbose=False):
    """
    Generator yielding Bio.SeqRecord objects for every alignment between
    one (Ref Id, Query Id) pair.

    Parameters
    ----------
    group : tuple
        One item from ``dataframe.groupby(by=['Ref Id', 'Query Id'])``:
        ((ref_id, query_id), sub-DataFrame of matches for that pair).
    delta_filename : str
        Path to the .delta file, passed through to ``show-aligns``.
    ref_sequences, query_sequences : bool, optional
        Which side(s) of each alignment to yield. At least one must be True.
    ungap : bool, optional
        If True, strip the '.' gap characters that ``show-aligns`` inserts.
    verbose : bool, optional
        Print a progress line for each match written.

    Yields
    ------
    Bio.SeqRecord.SeqRecord
        Record ids look like 'ID:start-end(+)'; descriptions carry the
        percent identity of the match.
    """
    assert ref_sequences or query_sequences, "The generator has nothing to do if neither reference nor query sequences are yielded"

    def _make_record(sequence, seq_id, pct_identity, other_side):
        # Build one SeqRecord from raw alignment text.
        if ungap:
            # Seq.ungap() was removed from recent Biopython releases, so
            # strip the '.' gap characters with plain str.replace instead.
            sequence = sequence.replace('.', '')
        description = '{}% identity match to {}'.format(pct_identity, other_side)
        return SeqRecord(Seq(sequence), id=seq_id, description=description)

    ref_id, query_id = group[0]
    match_information = group[1]
    percent_identities = list(match_information['% IDY'])
    # `show-aligns` does no sorting by default, so we _may_ assume the
    # alignments appear in the same order as the matches in the delta file,
    # letting us pair them positionally with `percent_identities`.
    alignments = get_alignments_from_ids(delta_filename, ref_id, query_id)
    for row_number in range(len(alignments.index)):
        if verbose:
            print("\tWriting match #{}".format(row_number + 1))
        pct_identity = percent_identities[row_number]
        if ref_sequences:
            ref_seq_id = '{}:{}-{}({})'.format(ref_id,
                                               alignments['ref_start'][row_number],
                                               alignments['ref_end'][row_number],
                                               alignments['ref_direction'][row_number])
            yield _make_record(alignments['ref_sequence'][row_number],
                               ref_seq_id, pct_identity, 'query')
        if query_sequences:
            query_seq_id = '{}:{}-{}({})'.format(query_id,
                                                 alignments['query_start'][row_number],
                                                 alignments['query_end'][row_number],
                                                 alignments['query_direction'][row_number])
            yield _make_record(alignments['query_sequence'][row_number],
                               query_seq_id, pct_identity, 'reference')
def delta_to_fasta(delta_filename, output_filename, ref_sequences=True, query_sequences=True, ungap=True, output_format="fasta", verbose=False, skiprows=4):
    """
    Stream every alignment in a .delta file to *output_filename*.

    The created SeqRecord objects only have id, description, and sequence
    fields, so the output formats BioPython supports for this besides
    "fasta" include e.g. "tab" and do not include "nexus" or "fastq".
    That said, you could modify `get_seq_record` to add the extra
    attributes to each SeqRecord that your preferred format requires.

    Parameters
    ----------
    delta_filename : str
        Path to a .delta file produced by ``nucmer`` or ``promer``.
    output_filename : str
        Destination file for the records.
    ref_sequences, query_sequences : bool, optional
        Which side(s) of each alignment to write.
    ungap : bool, optional
        Strip '.' gap characters from the sequences (default True).
    output_format : str, optional
        Any Bio.SeqIO format compatible with bare SeqRecords.
    verbose : bool, optional
        Print progress per group and per match.
    skiprows : int, optional
        Header lines to skip in the ``show-coords`` output.
    """
    dataframe = delta_file_to_dataframe(delta_filename, skiprows=skiprows)

    def _all_records():
        # Iterate the groupby lazily (the original materialized every
        # per-group sub-DataFrame up front in a list) so records stream
        # to disk as they are produced.
        for counter, group in enumerate(dataframe.groupby(by=['Ref Id', 'Query Id']), 1):
            if verbose:
                print('Starting group #{}'.format(counter))
            yield from get_seq_record(group, delta_filename,
                                      ref_sequences=ref_sequences,
                                      query_sequences=query_sequences,
                                      ungap=ungap, verbose=verbose)

    write(_all_records(), output_filename, output_format)
Would the 'show-aligns' command in the MUMmer package help? I used that for relatively small regions. By the way, from the MUMmer output (such as 'show-snps') you can get SNP/indel positions with flanking regions; I used that for grabbing sequences to pipe into primer3 to design primers.
Hi Vitis & thanks for the suggestion. Yes, for small inserts it works ... but I'm interested in multiple-kb insertions. Those are split into different "aligns" by mummer...