Question

read multiple fasta sequence using kseq.h

1

Entering edit mode

9.8 years ago

madhurima.das ▴ 10

I am trying to find fasta sequences of 5 ids/name as provided by user from a big fasta file (containing 80000 fasta sequences) using an external header file kseq.h as in: http://lh3lh3.users.sourceforge.net/kseq.shtml. When I run the program in a for loop, I have to open/close the big fasta file again and again (commented in the code) which makes the computation time slow. On the contrary, if I open/close only once outside the loop, the program stops if it encounters an entry which is not present in the big fasta file i.e. it reaches end of the file. Can anyone suggest how to get all the sequences without losing computational time. The code is:

    #include <zlib.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <time.h>
    #include "ext_libraries/kseq.h"

    KSEQ_INIT(gzFile, gzread)

    int main(int argc,char *argv[])
    {
        char gwidd_ids[100];
        kseq_t *seq;
        int i=0,nFields=0,row=0,col=0;
        int size=1000,flag1=0,l=0,index0=0;

        printf("Opening file %s\n",argv[1]);

        char **gi_ids=(char **)malloc(sizeof(char *)*size);
        for(i=0;i<size;i++)
        {
            gi_ids[i]=(char *)malloc(sizeof(char)*50);
        }
        FILE *fp_inp = fopen(argv[1],"r");
        while(fscanf(fp_inp,"%s",gwidd_ids) == 1)
        {
            printf("%s\n",gwidd_ids);
            strcpy(gi_ids[index0],gwidd_ids);
            index0++;
        }    
        fclose(fp_inp);

        FILE *f0 = fopen("xxx.txt", "w");
        FILE *f1 = fopen("yyy.txt", "w");
        FILE *f2 = fopen("zzz", "w");

        FILE *instream = NULL;
        instream = fopen("fasta_seq_nr_uniprot.txt", "r"); 
        gzFile fpf = gzdopen(fileno(instream), "r");

        for(col=0;col<index0;col++)
        {
            flag1=0;
    //        FILE *instream = NULL;
    //        instream = fopen("fasta_seq_nr_uniprot.txt", "r"); 
    //        gzFile fpf = gzdopen(fileno(instream), "r");
            kseq_t *seq = kseq_init(fpf); 
            while((kseq_read(seq)) >= 0 && flag1 == 0)
            {
                if(strcasecmp(gi_ids[col],seq->name.s) == 0)
                {
                    fprintf(f1,">%s\n",gi_ids[col]);
                    fprintf(f2,">%s\n%s\n",seq->name.s,seq->seq.s);
                    flag1 = 1;
                }
            }
            if(flag1 == 0)
            {
                fprintf(f0,"%s\n",gi_ids[col]);
            }
            kseq_destroy(seq);
    //        gzclose(fpf); 
        }
        gzclose(fpf); 

        fclose(f0);
        fclose(f1);
        fclose(f2);

        for(i=0;i<size;i++)
        {
            free(gi_ids[i]);
        }
        free(gi_ids);

        return 0;
    }

C sequence kseq.h • 2.9k views

ADD COMMENT • link updated 2.4 years ago by Ram 43k • written 9.8 years ago by madhurima.das ▴ 10

score 0 · Answer 1 · 2014-07-11

0

Entering edit mode

9.8 years ago

Devon Ryan 104k

This is borderline off-topic.

You should just iterate checking gi_ids from within the while((kseq_read(seq))... while loop, setting a found variable and breaking if there's a match.

ADD COMMENT • link 9.8 years ago by Devon Ryan 104k

0

Entering edit mode

I think I already have put in a flag condition inside the while loop to break when it reaches flag=1. I hope this is what you meant..

ADD REPLY • link 9.8 years ago by madhurima.das ▴ 10

0

Entering edit mode

Yeah flag1 will work. Just embed the if-block in a for-loop.

ADD REPLY • link 9.8 years ago by Devon Ryan 104k

Ram · Answer 2 · 2014-07-11

As an alternate approach, consider storing your sequence header IDs of interest into a hash table or associative array. Then stream through the large FASTA file and lookup the file's current sequence header ID against your hash table. If there's a match, print out the header and its sequence.

Lookups of hash tables only take O(1) time and you'll only have to stream through the file once. With five entries in a hash table, and only reading one line into memory at a time for evaluation, the file I/O time and memory usage will be minimal.

One approach in awk, for example, where you define your five patterns of interest in the variable ht:

$ awk ' \
BEGIN { \
    ht[">foo"] = 1; \
    ht[">bar"] = 1; \
    ht[">baz"] = 1; \
    flag = 0; \
} \
{ \
    if (($0 ~ /^>/) && (ht[$0] == 1)) { \
        print $0; \
        flag = 1; \
    } \
    else if (($0 !~ /^>/) && (flag == 1)) { \
        print $0; \
    } \
    else { \
        flag = 0; \
    } \
}' fasta_seq_nr_uniprot.txt

If you really need to write this in C, you could look into khash.