Question

Off topic:How to print stats from Python filtering script

0

Entering edit mode

10.0 years ago

arabhorsegal18 • 0

Hello all,

I have the following python script which will filter paired-end fastq read data based on nucleotide complexity with a user provided percentage (as a way of removing reads with lots of monomeric or dimeric repeats). The script works, but I would to generate some simple statistics, the number of read pairs input and the number of read pairs retained, and print them. I was trying to use .count but kept receiving error messages.

Any ideas or suggestions would be welcomed. As you can probably tell my Python skills are less than stellar, I'm still a beginner and feeling kind of stuck.

Thanks for any help,

~Ana

#!/usr/bin/env python

from Bio import SeqIO
from Bio import SeqRecord
import sys
import argparse

def open_either(file, mode='r'):
    '''open either compressed or not compressed files'''

    if file.endswith('.gz'):
        import gzip
        return gzip.open(file, mode)
    else:
        return open(file, mode)


def calc_percentA(List):
    BaseList = ['A']
    for Base in BaseList:
        Percent = 100 * List.count(Base) / float(len(List))
        PercentA = float(Percent)
    return PercentA  

def calc_percentC(List):
    BaseList = ['C']
    for Base in BaseList:
        Percent = 100 * List.count(Base) / float(len(List))
        PercentC = float(Percent)
    return PercentC        

def calc_percentG(List):
    BaseList = ['G']    
    for Base in BaseList:
        Percent = 100 * List.count(Base) / float(len(List))
        PercentG = float(Percent)
    return PercentG

def calc_percentT(List):
    BaseList = ['T']    
    for Base in BaseList:
        Percent = 100 * List.count(Base) / float(len(List))
        PercentT = float(Percent)
    return PercentT

if __name__ == '__main__' :
    parser = argparse.ArgumentParser(description='Filter paired-end read quality based on percent base composition.',)    
    parser.add_argument('-1', '--fastq1',
                                            help='The input read1 fastq file.',
                                            required=True)
    parser.add_argument('-2', '--fastq2',
                                            help='The input read2 fastq file.',
                                            required=True)
    parser.add_argument('-p', '--minPercent',
                                            help='Any base must account for this percent of read. Or read will be discarded.',
                                            type=float,
                                            default=5.0)
    parser.add_argument('-o1', '--outFile1',
                                            help='fastq1 output file.',
                                            required=True)
    parser.add_argument('-o2', '--outFile2',
                                            help='fastq2 output file.',
                                            required=True)
    args = parser.parse_args()

    fastqFormat = 'fastq'

    fastq1handle = open_either(args.fastq1, 'rU')
    fastq2handle = open_either(args.fastq2, 'rU')

    fastq1out = open_either(args.outFile1, 'wb')
    fastq2out = open_either(args.outFile2, 'wb')

    fastq1SeqIO = SeqIO.parse(fastq1handle, fastqFormat)
    fastq2SeqIO = SeqIO.parse(fastq2handle, fastqFormat)

    while True:        
        try:
            fastq1record = fastq1SeqIO.next()
            fastq2record = fastq2SeqIO.next()
        except:
            break

        Seq_ListR1 = fastq1record.seq
        Seq_ListR2 = fastq2record.seq

        if (calc_percentA(Seq_ListR1) < args.minPercent) or (calc_percentA(Seq_ListR2) < args.minPercent):
            continue

        if (calc_percentC(Seq_ListR1) < args.minPercent) or (calc_percentC(Seq_ListR2) < args.minPercent):
            continue

        if (calc_percentG(Seq_ListR1) < args.minPercent) or (calc_percentG(Seq_ListR2) < args.minPercent):
            continue            

        if (calc_percentT(Seq_ListR1) < args.minPercent) or (calc_percentT(Seq_ListR2) < args.minPercent):
            continue

        fastq1out.write(fastq1record.format(fastqFormat))
        fastq2out.write(fastq2record.format(fastqFormat))        

    fastq1out.close()
    fastq2out.close()

    print "Filtering complete!"

Biopython fastq Python • 1.7k views

ADD COMMENT • link updated 2.6 years ago by Ram 45k • written 10.0 years ago by arabhorsegal18 • 0