Error remove over-representative sequences : TypeError: coercing to Unicode: need string or buffer, NoneType found
0
0
Entering edit mode
4.0 years ago

Hi I am running this python script to remove over-representative sequences from my fastq files, but I keep getting the error. I am new to bioinfomatics and have been following a fixed set of pipeline for sequence assembly. I wanted to remove over-representative sequences with this script.

Here is the command :

python /home/TranscriptomeAssemblyTools/RemoveFastqcOverrepSequenceReads.py -1 R1_1.fq -2 R1_2.fq

Here is the error :

File "TranscriptomeAssemblyTools/RemoveFastqcOverrepSequenceReads.py", line 46, in leftseqs=ParseFastqcLog(opts.l_fastqc) File "TranscriptomeAssemblyTools/RemoveFastqcOverrepSequenceReads.py", line 33, in ParseFastqcLog with open(fastqclog) as fp: TypeError: coercing to Unicode: need string or buffer, NoneType found

**Here is the python script :**
            import sys
            import gzip
            from os.path import basename
            import argparse
            import re
            from itertools import izip,izip_longest

            def seqsmatch(overreplist,read):
                flag=False
                if overreplist!=[]:
                    for seq in overreplist:
                        if seq in read:
                            flag=True
                            break
                return flag

            def get_input_streams(r1file,r2file):
                if  r1file[-2:]=='gz':
                    r1handle=gzip.open(r1file,'rb')
                    r2handle=gzip.open(r2file,'rb')
                else:
                    r1handle=open(r1file,'r')
                    r2handle=open(r2file,'r')

                return r1handle,r2handle

            def FastqIterate(iterable,fillvalue=None):
                "Grab one 4-line fastq read at a time"
                args = [iter(iterable)] * 4
                return izip_longest(fillvalue=fillvalue, *args) 

            def ParseFastqcLog(fastqclog):    
                with open(fastqclog) as fp:
                    for result in re.findall('Overrepresented sequences(.*?)END_MODULE', fp.read(), re.S):
                        seqs=([i.split('\t')[0] for i in result.split('\n')[2:-1]])
                return seqs     

            if __name__=="__main__": 
                parser = argparse.ArgumentParser(description="options for removing reads with over-represented sequences")
                parser.add_argument('-1','--left_reads',dest='leftreads',type=str,help='R1 fastq file')
                parser.add_argument('-2','--right_reads',dest='rightreads',type=str,help='R2 fastq file')
                parser.add_argument('-fql','--fastqc_left',dest='l_fastqc',type=str,help='fastqc text file for R1')
                parser.add_argument('-fqr','--fastqc_right',dest='r_fastqc',type=str,help='fastqc text file for R2')
                opts = parser.parse_args()

                leftseqs=ParseFastqcLog(opts.l_fastqc)
                rightseqs=ParseFastqcLog(opts.r_fastqc)

                r1_out=open('rmoverrep_'+basename(opts.leftreads).replace('.gz',''),'w')
                r2_out=open('rmoverrep_'+basename(opts.rightreads).replace('.gz',''),'w')

                r1_stream,r2_stream=get_input_streams(opts.leftreads,opts.rightreads)

                counter=0
                failcounter=0

                with r1_stream as f1, r2_stream as f2:
                    R1=FastqIterate(f1)
                    R2=FastqIterate(f2)
                    for entry in R1:
                        counter+=1
                        if counter%100000==0:
                            print "%s reads processed" % counter

                        head1,seq1,placeholder1,qual1=[i.strip() for i in entry]
                        head2,seq2,placeholder2,qual2=[j.strip() for j in R2.next()]

                        flagleft,flagright=seqsmatch(leftseqs,seq1),seqsmatch(rightseqs,seq2)

                        if True not in (flagleft,flagright):
                            r1_out.write('%s\n' % '\n'.join([head1,seq1,'+',qual1]))
                            r2_out.write('%s\n' % '\n'.join([head2,seq2,'+',qual2]))
                        else:
                            failcounter+=1


                    print 'total # of reads evaluated = %s' % counter
                    print 'number of reads retained = %s' % (counter-failcounter)
                    print 'number of PE reads filtered = %s' % failcounter


            r1_out.close()
            r2_out.close()
assembly RNA-Seq sequence • 666 views
ADD COMMENT

Login before adding your answer.

Traffic: 2153 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6