Question: Python multiprocessing FASTQ file
mplace wrote:

I have a large fastq file (~40GB) of short 50bp Illumina reads.  The first 10bp are an experiment identifier tag, the next 18bp are the same for all reads, and the remainder of the sequence is the first 22bp of a gene.  I have a script which identifies the experiment and the gene given a "decode" file.  The problem is that this script is very, very slow.

Any suggestions on how to speed things up would be greatly appreciated.  

from collections import defaultdict

import argparse
import itertools
import os
import sys

class BarSeq( object ):
        
    def __init__( self, fastq, decode, seqID, cwd  ):
        """
        parameters:
        fastq  = BarSeq fastq file, expected to be for a single user
        decode = Gene UP_tag file 
        cwd    = current working directory
        seqID  = Experiment ID_tag file
        """
        self.fastq     = fastq  
        self.dir       = cwd
        self.seqid     = seqID
        self.decode    = decode
        self.geneList  = []                    # Gene names 

        self.geneUpTag = self.getGeneUpTag()   # key = gene names, values = list of sequence tags
        self.seqIdTag  = self.getSeqIdTag()    # key = sampleID, values = list [ sequence, user ]
        self.exact_Results = defaultdict(dict) #  dict(key=experiment) value = Dict(key = Gene, value = Counts )
        self.fuzzy_Results = defaultdict(dict) # for results with 1 mismatch  
        
    def matchSeqId( self, read ):
        """
        Match a sequence read to an experiment with a seqID tag
        seqID tag must match start of string and be an EXACT match.
        """
        for key, value in self.seqIdTag.items():
            if read.startswith(value[0]):
                return key
        return None
                    
    def matchGene( self, read ):
        """
        Match GENE UPTAG to sequence read.
        Tag must be an exact match.
        Assumes the Gene part of the sequence begins at the 29th base of the read.
        If not, change the slice in geneSeq = read[28:] to an appropriate value.
        """
        geneSeq = read[28:]                     # slice once, not once per dict entry
        for key, value in self.geneUpTag.items():
            if geneSeq.startswith(value[0]):
                return key
        return None
            
    def processFastq( self ):
        """
        Open the fastq file and process it read by read, calling matchSeqId
        and matchGene.  If a match is found, count and store the result.
        """
        with open( self.fastq, "r" ) as data:
            for line1, line2, line3, line4 in itertools.zip_longest(*[data]*4):
                read = line2.rstrip()           # only the sequence line is needed

                # EXACT MATCHING STARTS HERE
                SeqID = self.matchSeqId(read)   # returns sample/experiment identifier
                if SeqID:
                    geneName = self.matchGene(read)
                    if geneName:
                        counts = self.exact_Results[SeqID]           # defaultdict(dict)
                        counts[geneName] = counts.get(geneName, 0) + 1

    def writeTable( self, data, outName ):
        """
        Write gene counts to file as a tab delimited table.
        """ 
        sampleList = sorted(data.keys())        # fixed order for the table columns

        with open( outName, 'w') as out:
            out.write('gene')
            for name in sampleList:
                newName = name + "-" + self.seqIdTag[name][0]
                out.write("\t%s" %(newName))
            out.write("\n")
            
            for gene in self.geneList:
                out.write("%s\t" % gene)
                for sample in sampleList:
                    out.write('%s\t' % data[sample].get(gene, 0))
                out.write("\n")

    def getSeqIdTag( self ):
        """
        Get SeqID UP_tag Decode information, put in dictionary
        Key = SampleID, value = list[ sequence, user_name]
        """
        seqUpTag = defaultdict(list)
        
        with open( self.seqid , 'r') as f:
            for line in f:
                if line.startswith('Sample'):
                    continue
                line = line.rstrip()
                items = line.split()
                sampleID = "".join(items[0:3])
                if sampleID in seqUpTag:
                    print("Duplicate sampleID %s" % sampleID)
                else:
                    seqUpTag[sampleID].append(items[4])
                    seqUpTag[sampleID].append(items[5])
                
        return seqUpTag
    
    def getGeneUpTag( self ):
        """
        Get GENE UP_tag Decode information, put in dictionary, 
        key = GENE,  value = list of sequence tags
        """
        geneUpTag = defaultdict(list)
        
        with open( self.decode, 'r') as f:
            for line in f:
                line = line.rstrip()
                if line.startswith('Y'):
                    items = line.split()
                    geneUpTag[items[0]].append(items[1])
                    if items[0] not in self.geneList:
                        self.geneList.append(items[0])
        
        return geneUpTag            
                    
def main():

    cmdparser = argparse.ArgumentParser(description="Produce a table of gene counts from DNA Bar-Seq data.",
                                        usage='%(prog)s -f fastq -d UP_tag_Decode file -s Seq_ID file', prog='BarSeq.py')
    cmdparser.add_argument('-f', '--Fastq' , action='store'     , dest='FASTQ' , help='BarSeq fastq file'         , metavar='')
    cmdparser.add_argument('-d', '--Decode', action='store'     , dest='DECODE', help='GENE UP_tag_Decode file'        , metavar='')    
    cmdparser.add_argument('-s', '--Seqid' , action='store'     , dest='SEQID' , help='Seq_ID file (experiment ID)', metavar='')
    cmdparser.add_argument('-i', '--info'  , action='store_true', dest='INFO'  , help='Print a more detailed description of program.')
    cmdResults = vars(cmdparser.parse_args())
    
    # if no args print help
    if len(sys.argv) == 1:
        print("")
        cmdparser.print_help()
        sys.exit(1)
  
    cwd        = os.getcwd()                  # current working dir
    fastq      = cmdResults['FASTQ']
    geneDecode = cmdResults['DECODE']
    seqID      = cmdResults['SEQID']

    # Start processing the data
    data = BarSeq(fastq, geneDecode, seqID, cwd)
    data.processFastq()
    data.writeTable(data.exact_Results, "Exact-Match.table")

if __name__ == "__main__":
    main()           

Tags: fastq, python, multiprocessing
Devon Ryan wrote:

There are a few issues. Firstly, while Python is a great language, it's very slow and it has absolutely terrible multithreading functionality. If you want to code something quickly, you use Python. If you want it to run quickly, you use C or C++ or even Java.

You have some functions that aren't taking advantage of Python. For example, matchSeqId() should not be doing any iteration at all: an `if key in some_dict:` lookup will be much faster than a linear search. The same is the case for matchGene(); just make a dict keyed by the tag sequence, though you should really exclude duplicate tag sequences (note that you're currently just returning the first match among possibly many). A sketch of the idea is below.
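A minimal sketch of that idea against the dictionaries the script already builds (self.seqIdTag and self.geneUpTag), assuming the experiment tag is the first 10 bases and the gene tag is the 22 bases starting at base 29, as stated in the question; adjust the slices if your tags have other lengths:

# Build inverted lookup tables once, before scanning the fastq:
tag_to_sample = {seq: sample for sample, (seq, user) in self.seqIdTag.items()}
tag_to_gene   = {tag: gene for gene, tags in self.geneUpTag.items() for tag in tags}

# Then each read costs two hash lookups instead of two linear scans:
sample = tag_to_sample.get(read[:10])
if sample is not None:
    gene = tag_to_gene.get(read[28:50])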

John wrote:

What Devon said (although I disagree that Python is *very* slow :P I'll settle for "reasonably slow" ;) )

You should start by profiling your script. Python comes with a fairly good profiler called cProfile. Run your script like:

python -m cProfile -o my_profile my_program.py

This will run your code as usual, except it will take a bit longer, and it will write out a binary profile file (my_profile) which you can give to other programs to analyse. That binary file contains info on how many times each function was called, how long the calls took, and so on.
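If you just want a quick look without installing anything, the standard-library pstats module can read that binary profile directly:

import pstats

stats = pstats.Stats('my_profile')
stats.sort_stats('cumulative').print_stats(20)   # 20 most expensive call paths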

Many people like to visualize this binary with SnakeViz, but personally I find the target chart it makes tedious. Target charts, particularly for HDD space calculations, are always tedious. My favourite tool is cProfileV, which runs a little webserver where you can sort/etc. your timings.

Once you've spent a bit of time tuning your code, try running it in pypy. I see no modules here that would cause a problem, so you should get a pretty nice speed up right off the bat.

Finally, try not to do things like line = line.rstrip(); line.startswith('Y'). By rebinding line to a brand-new string object, you give more work to Python's (pretty bad) garbage collector, and it slows things down every time you copy something. It won't be a huge saving, but when you move to pypy with its very lean garbage collector, it can become an issue. One way to apply this is sketched below.
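For instance, in the getGeneUpTag() loop above, the rstrip() isn't needed at all, since both of the operations that follow it already cope with the trailing newline:

for line in f:
    if line.startswith('Y'):    # unaffected by the trailing '\n'
        items = line.split()    # split() with no arguments drops it anyway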

Devon Ryan replied:

What's reasonably slow depends on how impatient you are :)

Rob wrote:

I suggest you take a look at Cython (http://cython.org/). You can also recode your script in C as Devon said, but normally you can avoid that with Cython.
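For reference, a hypothetical sketch of what that looks like (the file and function names are made up, not taken from the script above); most of Cython's gain comes from static typing in the hot loop:

# match.pyx -- build in place with: cythonize -i match.pyx
def count_exact(dict tag_to_gene, reads):
    cdef long n = 0                  # statically typed counter
    for read in reads:
        if tag_to_gene.get(read[28:50]) is not None:
            n += 1
    return n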

Devon Ryan replied:

Indeed, the normal order of speed (from slowest to fastest) is:

  • Python with slow algorithms
  • Python with fast algorithms
  • Python with Cython
  • Another language, like C

Sometimes Cython screws things up, but it's generally pretty decent.

mplace wrote:

Makes sense. I will make the changes to use `if key in dict:`; if that is too slow I will switch to C++.

Thank you for taking the time to comment.

mplace wrote:

The dict change resulted in a 10-fold increase in speed. Very nice, thanks again.

romain.lannes wrote:

Maybe, if you have access to computational resources, you could use threading. It is a Python standard-library module which allows thread-based parallelism within one process; for more information, here is the official documentation:

https://docs.python.org/3.4/library/threading.html
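A minimal sketch of that suggestion (process_chunk and the chunk names are hypothetical stand-ins for the script's own logic). One caveat, echoing Devon's point above: for CPU-bound parsing like this, CPython's global interpreter lock usually limits what threads can gain, which is why the process-based approach in the next answer is more common:

import threading

def process_chunk(path):
    ...                              # parse one piece of the fastq here

threads = [threading.Thread(target=process_chunk, args=(p,))
           for p in ['chunk_aa', 'chunk_ab']]
for t in threads:
    t.start()
for t in threads:
    t.join()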

Fidel wrote:

A simple way to parallelize is to split the fastq file into smaller files. You can do this reasonably fast with the unix command `split`, which can take the number of lines per chunk as input (use a multiple of 4 so each 4-line fastq record stays intact).

Then you run your program on each of the smaller fastq files. Even better would be for your program to do the split of the fastq itself, run a worker on each chunk, then collect all the data and write the results; a sketch of that follows.
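A minimal sketch of that two-step approach, assuming chunks produced by something like `split -l 4000000 reads.fastq chunk_` (4 million lines = 1 million reads); count_chunk is a hypothetical worker standing in for the exact-match logic, and the standard multiprocessing module gives each chunk its own process:

from collections import Counter
from multiprocessing import Pool

def count_chunk(path):
    counts = Counter()
    with open(path) as f:
        for i, line in enumerate(f):
            if i % 4 == 1:               # the read is line 2 of each 4-line record
                read = line.rstrip()
                # ... exact-match lookups would go here, e.g. counts[key] += 1
    return counts

if __name__ == '__main__':
    chunks = ['chunk_aa', 'chunk_ab', 'chunk_ac', 'chunk_ad']   # split's default names
    with Pool(processes=4) as pool:
        total = sum(pool.map(count_chunk, chunks), Counter())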

Devon Ryan replied:

Ideally one would just have bgzf-compressed fastq files, which are now supported by bcl2fastq, though I haven't heard of many people actually using that (we're not)...

mplace wrote:

Thanks for all the information, very helpful.
