I am trying to find fasta sequences of 5 ids/name as provided by user from a big fasta file (containing 80000 fasta sequences) using an external header file kseq.h
as in: http://lh3lh3.users.sourceforge.net/kseq.shtml. When I run the program in a for loop, I have to open/close the big fasta file again and again (commented in the code) which makes the computation time slow. On the contrary, if I open/close only once outside the loop, the program stops if it encounters an entry which is not present in the big fasta file i.e. it reaches end of the file. Can anyone suggest how to get all the sequences without losing computational time. The code is:
#include <zlib.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "ext_libraries/kseq.h"
KSEQ_INIT(gzFile, gzread)
int main(int argc,char *argv[])
{
char gwidd_ids[100];
kseq_t *seq;
int i=0,nFields=0,row=0,col=0;
int size=1000,flag1=0,l=0,index0=0;
printf("Opening file %s\n",argv[1]);
char **gi_ids=(char **)malloc(sizeof(char *)*size);
for(i=0;i<size;i++)
{
gi_ids[i]=(char *)malloc(sizeof(char)*50);
}
FILE *fp_inp = fopen(argv[1],"r");
while(fscanf(fp_inp,"%s",gwidd_ids) == 1)
{
printf("%s\n",gwidd_ids);
strcpy(gi_ids[index0],gwidd_ids);
index0++;
}
fclose(fp_inp);
FILE *f0 = fopen("xxx.txt", "w");
FILE *f1 = fopen("yyy.txt", "w");
FILE *f2 = fopen("zzz", "w");
FILE *instream = NULL;
instream = fopen("fasta_seq_nr_uniprot.txt", "r");
gzFile fpf = gzdopen(fileno(instream), "r");
for(col=0;col<index0;col++)
{
flag1=0;
// FILE *instream = NULL;
// instream = fopen("fasta_seq_nr_uniprot.txt", "r");
// gzFile fpf = gzdopen(fileno(instream), "r");
kseq_t *seq = kseq_init(fpf);
while((kseq_read(seq)) >= 0 && flag1 == 0)
{
if(strcasecmp(gi_ids[col],seq->name.s) == 0)
{
fprintf(f1,">%s\n",gi_ids[col]);
fprintf(f2,">%s\n%s\n",seq->name.s,seq->seq.s);
flag1 = 1;
}
}
if(flag1 == 0)
{
fprintf(f0,"%s\n",gi_ids[col]);
}
kseq_destroy(seq);
// gzclose(fpf);
}
gzclose(fpf);
fclose(f0);
fclose(f1);
fclose(f2);
for(i=0;i<size;i++)
{
free(gi_ids[i]);
}
free(gi_ids);
return 0;
}
I think I already have put in a flag condition inside the while loop to break when it reaches flag=1. I hope this is what you meant..
Yeah flag1 will work. Just embed the if-block in a for-loop.