I was wondering whether anyone would have any insights into how to make the following python script faster. I need to access a set of paired end FASTQ files, and sort the reads into three distinct files depending on which adaptor sequence is present (or whether none of them is present). However, my current implementation takes over 5h+ for around 20 million reads... Would anyone have any hints into how to make it more efficient? I'm already using the low level FastqGeneralIterator which is supposed to be faster...
Thanks in advance!!
import gzip import sys from Bio import SeqIO from Bio import Align from Bio.SeqIO.QualityIO import FastqGeneralIterator # Arguments file1 = sys.argv file2 = sys.argv adaptor_root = sys.argv adaptor_five = sys.argv adaptor_three = sys.argv id = sys.argv cwd_path = sys.argv temp_path = sys.argv output_five_file_1 = temp_path + id + "_five_1.fastq.gz" output_five_file_2 = temp_path + id + "_five_2.fastq.gz" output_three_file_1 = temp_path + id + "_three_1.fastq.gz" output_three_file_2 = temp_path + id + "_three_2.fastq.gz" output_else_file_1 = temp_path + id + "_else_1.fastq.gz" output_else_file_2 = temp_path + id + "_else_2.fastq.gz" output_five_1 = gzip.open(output_five_file_1, 'wt', 1) output_five_2 = gzip.open(output_five_file_2, 'wt', 1) output_three_1 = gzip.open(output_three_file_1, 'wt', 1) output_three_2 = gzip.open(output_three_file_2, 'wt', 1) output_else_1 = gzip.open(output_else_file_1, 'wt', 1) output_else_2 = gzip.open(output_else_file_2, 'wt', 1) #Setup aligner with match 1, mismatch 0, local aligner = Align.PairwiseAligner() aligner.mode = 'local' aligner.open_gap_score = 0 with gzip.open(file1, "rt") as handle1: with gzip.open(file2, "rt") as handle2: for (title1, seq1, qual1), (title2, seq2, qual2) in zip(FastqGeneralIterator(handle1), FastqGeneralIterator(handle2)): root_alignment = aligner.align(seq2[0:30], adaptor_root) if root_alignment.score > 20: five_alignment = aligner.align(seq2[0:30], adaptor_five).score three_alignment = aligner.align(seq2[0:30], adaptor_three).score # if a 5 prime alignment is likelier, then output it there, else output to 3 prime file if five_alignment > three_alignment: output_five_1.write("@%s\n%s\n+\n%s\n" % (title1, seq1, qual1)) j = 30 + len(seq2[30:151]) - len(seq2[30:151].lstrip('G')) output_five_2.write("@%s\n%s\n+\n%s\n" % (title2, seq2[j:151], qual2[j:151])) elif three_alignment > five_alignment: output_three_1.write("@%s\n%s\n+\n%s\n" % (title1, seq1, qual1)) output_three_2.write("@%s\n%s\n+\n%s\n" % (title2, seq2[30:151], qual2[30:151])) else: output_else_1.write("@%s\n%s\n+\n%s\n" % (title1, seq1, qual1)) output_else_2.write("@%s\n%s\n+\n%s\n" % (title2, seq2, qual2)) output_five_1.close() output_five_2.close() output_three_1.close() output_three_2.close() output_else_1.close() output_else_2.close()