#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Standard genetic code as a 64-entry lookup table of one-letter amino-acid
 * codes ('*' marks a stop codon).
 *
 * The entry for a codon is at index 16*b1 + 4*b2 + b3, where b1..b3 are the
 * indices of its three bases in the order T=0, C=1, A=2, G=3.
 */
static const char *const STANDARD_GENETIC_CODE =
    "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG";
|
|
|
/*
 * Map a nucleotide letter to its index in T, C, A, G order.
 *
 * The comparison is case-insensitive.
 *
 * Returns 0 for 't', 1 for 'c', 2 for 'a', 3 for 'g', and -1 for any other
 * character.
 */
static int base2index(char c)
{
    /*
     * The <ctype.h> functions require a value representable as unsigned char
     * (or EOF); passing a negative plain char is undefined behaviour.
     */
    switch (tolower((unsigned char)c)) {
    case 't':
        return 0;
    case 'c':
        return 1;
    case 'a':
        return 2;
    case 'g':
        return 3;
    default:
        return -1;
    }
}
|
|
|
static char translate(char a,char b,char c) |
|
{ |
|
int base1= base2index(a); |
|
int base2= base2index(b); |
|
int base3= base2index(c); |
|
if(base1==-1 || base2==-1 || base3==-1) |
|
{ |
|
return '?'; |
|
} |
|
else |
|
{ |
|
return STANDARD_GENETIC_CODE[base1*16+base2*4+base3]; |
|
} |
|
|
|
} |
|
/* Capacity, in bytes, of the shared translation buffer declared below. */
#define BIG_SIZE 5000

/*
 * Shared output buffer. recursive() appends translated residues to it and,
 * once every chunk has been processed, terminates it with '\n' and '\0' and
 * prints it to stdout.
 */
static char protein[BIG_SIZE];
|
|
|
/* Number of DNA fragments in chunks[]. */
#define NUM_SEQ 13

/*
 * DNA fragments to translate, in order. They are string literals and must
 * never be written to, hence the const qualification.
 *
 * The array is sized with NUM_SEQ so that adding a fragment without updating
 * NUM_SEQ is rejected by the compiler instead of being silently ignored.
 */
static const char *const chunks[NUM_SEQ] =
{
    "tacacgcacagccgcagcgcttctgcagaatgaccccaatcctgacgtactcagcaagactggattcacacccct",
    "cgtgATCATGGTCCGACTCCTGCTGGACAGAGGAGCTCAGATTGATGCCAAGACTAAGGATGAGCTAACTCCTCTGCATTGTGCAgccagaaatggtcatgtcag",
    "ctggatcatggagcccccatccaggcaaagaccaagaatggcctgtctCCAATCCACATGGCAGCACAGGGGGACCACATGGACTGCGTCAAGCAGCTTCTGCAGTACAACGCAGAGATAGATgacatcacac",
    "cacactgcggccaccaccgcatggccaaagtactgctggataaaggggccaaacccaactctcgggcattgaatg",
    "ttatgggtcatctcaACATTGTGAAGATCCTGCTTCAGAAAGGTGCTTCTCCGAGCGCCtccaacgtgaaagtggagacccctctccata",
    "gagtttttactgcagaattcagcaccagtggatgccaaggccaaggatgatcaaactcctctccattgtgccgctc",
    "atcctactggacatggaggctcagcagACCAAGATGACCAAGAAAGGCTTCACTCCGCTTCATGTGGCCTCAAAGtacggcaaggtggatgttgcagagctg",
    "agcAgaaCCAGGTGGAGGTGGCTAACAGCCTGTTGCAGTACGGCGCTTCGGCCAATGCTGAGTCACTGCAGGGAGttacacctctccacct",
    "gcaggaaggaaggcccgacatggtctccctgctcatctccaaacaggccaatgtcaaccttggaaacaagagtggat",
    "gcgtgtcactatggcaacatcaagatggtgaagttcctcctgcagcaacaggccaacgtcaacagcaaaacaagg",
    "tgccctcCTCTGCACCAGGCGGCCCAGCAGGGACACACAGACATTGTGACACTGTTGCTGAAGCATGGAGCTCAGCCCAACGAGACAACAACAaatggt",
    "tttgacgtcctaaagctcgtcactgaGGAGACAGTTTCCATgacGaccACAGAGAAACATCGCATGAGTTTCCCAGAAACAGTGGATGAGATACTAGACGTCTCTGAGGACGAAGGAGAGGAGCTCTTGGGGACAGAAGGGGCcagGTacatgaagatggatgacatgaaagaccatgatgacgatttcctctcccccaagaaatcactggagaattactc",
    "agttttatggtagatgctcgaggcggctcaatgcgaggcagcaggcataacggtctgcgtgtcatcatacctccgcgaacc"
};
|
|
|
static void recursive(int seqindex,int protsize) |
|
{ |
|
int frame; |
|
int length; |
|
|
|
if(seqindex==NUM_SEQ) |
|
{ |
|
protein[protsize++]='\n'; |
|
protein[protsize]='\0'; |
|
fputs(protein,stdout); |
|
return; |
|
} |
|
else if(seqindex!=0) |
|
{ |
|
protein[protsize++]='X'; |
|
protein[protsize++]='X'; |
|
protein[protsize++]='X'; |
|
} |
|
|
|
length=strlen(chunks[seqindex]); |
|
|
|
for(frame=0;frame<3;++frame) |
|
{ |
|
int i; |
|
int newsize=protsize; |
|
for(i=frame;i+2< length;i+=3) |
|
{ |
|
protein[newsize++]=translate(chunks[seqindex][i+0],chunks[seqindex][i+1],chunks[seqindex][i+2]); |
|
} |
|
|
|
recursive(seqindex+1,newsize); |
|
} |
|
} |
|
|
|
/*
 * Program entry point: starts the enumeration at the first chunk with an
 * empty protein buffer. Command-line arguments are not used.
 */
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    recursive(0, 0);
    return 0;
}
So if I understand correctly, the length of the "NNN" segments is approximate? Do you have upper/lower bounds for those segment lengths?
With so many segments composed of 'N's, you would probably end up with many possible alternatives, all with the same maximum translation length. In fact, the exact length of each run of Ns will be impossible to determine, since only 6 ORF cases (reading frames) are possible (tell me if I'm wrong). Hence, if, say, 22 is a possible length for a given N segment, then so is any number of the form (1 + 3*x), which represents a +2 ORF. The problem then becomes one of maximizing ORF length while allowing each run of N to take only 3 values (N, NN, NNN). What do you think?
I think this is quite a complex problem! The aim is to alter the length of "NNN" segments in order to maximise ORF length. Clearly we could "stretch" each set of N for as long as we wish, provided that we get a frame across the entire sequence. So there has to be an upper limit on N lengths.
Looking at the FASTA file, I'm still unclear as to whether the 'N's are padding (that is, the exact length of the N segments is unknown) or just mean "base unknown" with an exact segment length. If the latter, see my answer below.