Traffic: 324 ip/hr
Question: merge paired-end reads
 
2
 
 

Hi,

How can I merge two paired end fastq (R and L) to give a single fastq file ? For information, the sequencing run is 72 bp long and it contains a majority of small RNA (miRNA,...) so a lot of paired end reads will overlap.

For example here's two paired reads :

@HWUSI-EAS529:41:FC62YHFAAXX:8:1:7969:1330 1:N:0:GCCAAT
CTACGAAAGGGCACTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATCTCGTATGCCGTCTTCTGCT
+
IIIIIIIHIIHIIIIIIHHIIIHGIIIIEIIIIIIEIIHIIIIIIIIIIIHIIIIIBHIHIIHGIGIEGHHEGEEH


@HWUSI-EAS529:41:FC62YHFAAXX:8:1:7969:1330 2:N:0:GCCAAT
AGTGCCCTTTCGTAGGATCGTCGGACTGTAGAACTCTGAACGTGTAGATCTCGGTGGTCGCCGTATCATTAAAAAA
+
IIIIIIIIIIIIIIIIIIIIIIIDHIGIIIHIIIGHGIIIIIIIHHIHIIIIIIIIIHIIIIIIIIHIIGIIIIHI

I find the adapter in the first one :

Code:

EMBOSS_001         1 CTACGAAAGGGCACTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC     50
                                    |||||||||||||||||||||              
EMBOSS_001         1 ---------------TGGAATTCTCGGGTGCCAAGG--------------     21

EMBOSS_001        51 CAATATCTCGTATGCCGTCTTCTGCT     76

EMBOSS_001        22 --------------------------     21

but not in the second one ...

But I effectively found the overlap between the right read and the left read (using the reverse complement of it)

EMBOSS_001         1 --------------------------------------------------      0

EMBOSS_001         1 TTTTTTAATGATACGGCGACCACCGAGATCTACACGTTCAGAGTTCTACA     50

EMBOSS_001         1 -----------CTACGAAAGGGCACTTGGAATTCTCGGGTGCCAAGGAAC     39
                                |||||||||||||||                        
EMBOSS_001        51 GTCCGACGATCCTACGAAAGGGCACT------------------------     76

EMBOSS_001        40 TCCAGTCACGCCAATATCTCGTATGCCGTCTTCTGCT     76

EMBOSS_001        77 -------------------------------------     76

So my question is, how can I merge the two fastq files to produce a single fastq file ?

Thanks,

N.

4 answers

 
1
 
 

There is a decent program called stitch:[?] https://github.com/audy/stitch

I wrote a script called mergePairs that is very sensitive and incredibly slow:[?] http://code.google.com/p/standardized-velvet-assembly-report/source/browse/trunk/mergePairs.py

 
 
0
 
 

Either use Galaxy or use the single scripts

http://hg.notalon.org/galaxy/galaxy-central/src/7d9bb95caaa7/tools/fastq

HTH!

 

uhhh which script?

log in to reply • written 19 months ago by Jeremy Leipzig  10,85011031
 
 
0
 
 

Here is the skeleton for a C++ program merging the fastq reads (I've only tested it with your data):

#include <cstdlib>
#include <cstdio>
#include <string>
#include <algorithm>
#include <iostream>
#include <zlib.h>

using namespace std;

class AlignReads
{
private:
    gzFile open(const char* f)
        {
        gzFile in=gzopen(f,"rb");
        if(in==NULL)
            {
            cerr << "Cannot open " << f << endl;
            exit(EXIT_FAILURE);
            }
        return in;
        }
    bool read(gzFile in,string& line)
        {
        line.clear();
        if(gzeof(in)) return false;
        int c;
        while((c=gzgetc(in))!=EOF && c!='\n')
            {
            line+=(char)c;
            }
        return true;
        }
    char complement(char c)
        {
        switch(c)
            {
            case 'A':c='T'; break;
            case 'T':c='A'; break;
            case 'G':c='C'; break;
            case 'C':c='G'; break;
            default:c='N'; break;
            }
        return c;
        }
public:
    void run(const char* f1,const char *f2)
        {
        gzFile in1=open(f1);
        gzFile in2=open(f2);
        string line1;
        string line2;
        string title;
        long nLine=0;
        while(read(in1,line1) && read(in2,line2))
            {
            nLine++;
            if(nLine%4==1)
                {
                title=line1;
                }
            if(nLine%4!=2) continue;
            int best_dist=0;

            for(int L=1;L<= (int)line1.size() && L<= (int)line2.size();++L)
                {

                int j=0;
                for(j=0;j< L;++j)
                    {
                    char c1= line1.at(j);
                    char c2= complement(line2.at(L-1-j));

                    if(c1!=c2) break;
                    }
                if(j==L)
                    {
                    best_dist=L;
                    }
                }

            if(best_dist<10) continue;

            cout << "Title " << title << endl;
            cout << "Read1 : " << line1.substr(0,best_dist) << "/" <<
                    line1.substr(best_dist)
                    << endl;
            cout << "Read2 : " << line2.substr(0, best_dist) << "/" <<
                    line2.substr(best_dist)
                    << endl;
            cout << "Merge : ";
            for(int i=line2.size()-1;i>=best_dist;--i)
                            {
                            cout << complement(line2.at(i));
                            }
            cout << "/" << line1 ;

            cout << endl;
            cout << endl;
            }
        gzclose(in1);
        gzclose(in2);
        }
};

int main(int argc,char** argv)
    {
    AlignReads app;
    if(argc!=3) return EXIT_FAILURE;
    app.run(argv[1],argv[2]);
    return 0;
    }

Compilation:

 g++ file.cpp -lz

Test:

./a.out file1.fastq file2.fastq

Title @HWUSI-EAS529:41:FC62YHFAAXX:8:1:7969:1330 1:N:0:GCCAAT
Read1 : CTACGAAAGGGCACT/TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATCTCGTATGCCGTCTTCTGCT
Read2 : AGTGCCCTTTCGTAG/GATCGTCGGACTGTAGAACTCTGAACGTGTAGATCTCGGTGGTCGCCGTATCATTAAAAAA
Merge : TTTTTTAATGATACGGCGACCACCGAGATCTACACGTTCAGAGTTCTACAGTCCGACGATC/CTACGAAAGGGCACTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATCTCGTATGCCGTCTTCTGCT
 
 
0
 
 

Maybe this program is suited for you: http://www.cbcb.umd.edu/software/flash/

 

It's no available, the web page cannot be open. http://genomics.jhu.edu/software/FLASH/index.shtml

log in to reply • written 6 months ago by litiancheng.gansu  103
 
Log in to add a post