uhhh which script?
Hi,
How can I merge two paired end fastq (R and L) to give a single fastq file ? For information, the sequencing run is 72 bp long and it contains a majority of small RNA (miRNA,...) so a lot of paired end reads will overlap.
For example here's two paired reads :
@HWUSI-EAS529:41:FC62YHFAAXX:8:1:7969:1330 1:N:0:GCCAAT
CTACGAAAGGGCACTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATCTCGTATGCCGTCTTCTGCT
+
IIIIIIIHIIHIIIIIIHHIIIHGIIIIEIIIIIIEIIHIIIIIIIIIIIHIIIIIBHIHIIHGIGIEGHHEGEEH
@HWUSI-EAS529:41:FC62YHFAAXX:8:1:7969:1330 2:N:0:GCCAAT
AGTGCCCTTTCGTAGGATCGTCGGACTGTAGAACTCTGAACGTGTAGATCTCGGTGGTCGCCGTATCATTAAAAAA
+
IIIIIIIIIIIIIIIIIIIIIIIDHIGIIIHIIIGHGIIIIIIIHHIHIIIIIIIIIHIIIIIIIIHIIGIIIIHI
I find the adapter in the first one :
Code:
EMBOSS_001 1 CTACGAAAGGGCACTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGC 50
|||||||||||||||||||||
EMBOSS_001 1 ---------------TGGAATTCTCGGGTGCCAAGG-------------- 21
EMBOSS_001 51 CAATATCTCGTATGCCGTCTTCTGCT 76
EMBOSS_001 22 -------------------------- 21
but not in the second one ...
But I effectively found the overlap between the right read and the left read (using the reverse complement of it)
EMBOSS_001 1 -------------------------------------------------- 0
EMBOSS_001 1 TTTTTTAATGATACGGCGACCACCGAGATCTACACGTTCAGAGTTCTACA 50
EMBOSS_001 1 -----------CTACGAAAGGGCACTTGGAATTCTCGGGTGCCAAGGAAC 39
|||||||||||||||
EMBOSS_001 51 GTCCGACGATCCTACGAAAGGGCACT------------------------ 76
EMBOSS_001 40 TCCAGTCACGCCAATATCTCGTATGCCGTCTTCTGCT 76
EMBOSS_001 77 ------------------------------------- 76
So my question is, how can I merge the two fastq files to produce a single fastq file ?
Thanks,
N.
There is a decent program called stitch:[?] https://github.com/audy/stitch
I wrote a script called mergePairs that is very sensitive and incredibly slow:[?] http://code.google.com/p/standardized-velvet-assembly-report/source/browse/trunk/mergePairs.py
Either use Galaxy or use the single scripts
http://hg.notalon.org/galaxy/galaxy-central/src/7d9bb95caaa7/tools/fastq
HTH!
Here is the skeleton for a C++ program merging the fastq reads (I've only tested it with your data):
#include <cstdlib>
#include <cstdio>
#include <string>
#include <algorithm>
#include <iostream>
#include <zlib.h>
using namespace std;
class AlignReads
{
private:
gzFile open(const char* f)
{
gzFile in=gzopen(f,"rb");
if(in==NULL)
{
cerr << "Cannot open " << f << endl;
exit(EXIT_FAILURE);
}
return in;
}
bool read(gzFile in,string& line)
{
line.clear();
if(gzeof(in)) return false;
int c;
while((c=gzgetc(in))!=EOF && c!='\n')
{
line+=(char)c;
}
return true;
}
char complement(char c)
{
switch(c)
{
case 'A':c='T'; break;
case 'T':c='A'; break;
case 'G':c='C'; break;
case 'C':c='G'; break;
default:c='N'; break;
}
return c;
}
public:
void run(const char* f1,const char *f2)
{
gzFile in1=open(f1);
gzFile in2=open(f2);
string line1;
string line2;
string title;
long nLine=0;
while(read(in1,line1) && read(in2,line2))
{
nLine++;
if(nLine%4==1)
{
title=line1;
}
if(nLine%4!=2) continue;
int best_dist=0;
for(int L=1;L<= (int)line1.size() && L<= (int)line2.size();++L)
{
int j=0;
for(j=0;j< L;++j)
{
char c1= line1.at(j);
char c2= complement(line2.at(L-1-j));
if(c1!=c2) break;
}
if(j==L)
{
best_dist=L;
}
}
if(best_dist<10) continue;
cout << "Title " << title << endl;
cout << "Read1 : " << line1.substr(0,best_dist) << "/" <<
line1.substr(best_dist)
<< endl;
cout << "Read2 : " << line2.substr(0, best_dist) << "/" <<
line2.substr(best_dist)
<< endl;
cout << "Merge : ";
for(int i=line2.size()-1;i>=best_dist;--i)
{
cout << complement(line2.at(i));
}
cout << "/" << line1 ;
cout << endl;
cout << endl;
}
gzclose(in1);
gzclose(in2);
}
};
int main(int argc,char** argv)
{
AlignReads app;
if(argc!=3) return EXIT_FAILURE;
app.run(argv[1],argv[2]);
return 0;
}
Compilation:
g++ file.cpp -lz
Test:
./a.out file1.fastq file2.fastq
Title @HWUSI-EAS529:41:FC62YHFAAXX:8:1:7969:1330 1:N:0:GCCAAT
Read1 : CTACGAAAGGGCACT/TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATCTCGTATGCCGTCTTCTGCT
Read2 : AGTGCCCTTTCGTAG/GATCGTCGGACTGTAGAACTCTGAACGTGTAGATCTCGGTGGTCGCCGTATCATTAAAAAA
Merge : TTTTTTAATGATACGGCGACCACCGAGATCTACACGTTCAGAGTTCTACAGTCCGACGATC/CTACGAAAGGGCACTTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATCTCGTATGCCGTCTTCTGCT
It's no available, the web page cannot be open. http://genomics.jhu.edu/software/FLASH/index.shtml