/* bas2prot.c Converts a file of bases to a flat file database of proteins. Copyright 04/09/97 George Ruban, Joachim Reidl This software is in the public domain, and is freely available for any use, with no guarantee, on an "as is" basis. The authors would be interested in the ways in which this software is used and any problems that may be encountered. Users are requested, though not required, to send mail to the below addresses. This program and a paper describing it, "A THEORETICAL APPROACH TO IDENTIFY PUTATIVE SIGNAL SEQUENCES: PROVIDING EVIDENCE FOR SECRETED GENE PRODUCTS BY PURE COMPUTATIONAL ANALYSIS," are available on the World Wide Web. paper: http://www.geocities.com/SiliconValley/Vista/2013/joachim.html program: http://www.geocities.com/SiliconValley/Vista/2013/bas2prot.txt Written 02/07/1995 by George Ruban 67 Thurston Str., Somerville, MA 02145, USA. http://www.geocities.com/SiliconValley/Vista/2013 email: gmn@csa.bu.edu For Joachim Reidl Research Center for Infectious Disease, University of Wuerzburg, Roentgenring 11, 97070 Wuerzburg, Germany. Tel.: 0049 931 59509, e-mail: joachim.reidl@rzroe.uni-wuerzburg.de 04/09/1997 - changed from C++ style comments, published on World Wide Web: 03/31/1998 - replaced vonHeijne 1988 matrices with 1997 matrices, gave choice of 3 matrices to use. */ #include /* for fprintf(), scanf(), fopen()... */ #include /* for strncpy() */ #include /* for toupper(); */ #include /* for atoi() */ /* #define SWITCH_16_BITS /* uncomment define for old DOS compilers */ #ifndef SWITCH_16_BITS /* if the compiler doesn't use 16 bits only for switch() */ /* easy encoding of 3 chars into an unsigned long, to switch on */ #define e(a,b,c) (((((unsigned long)a<<8)+b)<<8)+c) #else /* if that's impossible - individual comparison of 3 chars to a string */ #define match(base,a,b,c) ((base[0] == (a)) && (base[1] == (b)) && (base[2] == (c))) #endif /* SWITCH_16_BITS */ #define MAXLINE 60 /* maximum length of a line of output for writeProtein() */ /* There are 4^3 = 64 different possible base triplets. 3 of them are stop codons. A length of 8000 amino acids should be plenty for a single protein, I hope... */ #define MAXPROTEIN 8000 #define MAXEUKARYOTIC 4087 #define MAXGRAMNEGATIVE 1196 #define MAXGRAMPOSITIVE 627 /* allow for different compilers */ char buffer[6][MAXPROTEIN]; /* 6 arrays to hold amino acids... */ char *buffptr[6] = { buffer[0], buffer[1], buffer[2], buffer[3] + MAXPROTEIN, buffer[4] + MAXPROTEIN, buffer[5] + MAXPROTEIN }; /* pointer to current location in each buffer */ unsigned int minWeight = 0; /* minimum weight to write a protein */ unsigned int maxSlide = 0; /* maximum number of times to slide */ unsigned int minLength = 0; /* minimum length of a protein in order to write it. */ unsigned int maxPositiveResidues = 0; /* maximum distance of K || R from starting M for valid protein */ unsigned long bases = 0, proteins = 0; /* number of bases read, proteins written */ /* number of amino acids excluded */ unsigned char nExcluded = 0; unsigned char exclude[26] = ""; /* initialized to 0s */ /* 1 if the presence of corresponding amino acid means entire amino acid is to be excluded */ /* input and output files */ FILE *in, *out; /* Amino acid counts for eukaryotic signal sequences from Gunnar von Heijne's paper in "Nucleic Acids Research" Volume 14, Number 11, 1986, p.4683, IRL Press, Oxford. 1 2 3 ... across the top, A-Y across the side */ const int cleavage [3] [26] [15] = { /* Eukaryotic cleavage sites */ { /* -13 -12 -11 -10 -9 -8 -7 -6 -5 -4 -3 -2 -1 1 2 */ /* A */{101,112,106,100,158,128,107,149,146,107,258, 80,458,141, 55}, /* no B's */{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, /* C */{ 36, 32, 34, 31, 42, 66, 57, 39, 30, 34, 68, 21, 50, 23, 27}, /* D */{ 2, 0, 3, 0, 3, 2, 4, 5, 25, 14, 4, 44, 6, 68, 66}, /* E */{ 1, 2, 4, 5, 4, 6, 13, 9, 28, 27, 7, 66, 6, 92, 88}, /* F */{ 75, 67, 80, 81, 65, 58, 92, 67, 25, 34, 7, 46, 6, 34, 28}, /* G */{ 39, 24, 26, 34, 36, 50, 40, 29,108,125, 74, 38,184, 52, 57}, /* H */{ 5, 3, 4, 2, 4, 10, 14, 7, 23, 12, 5, 53, 2, 23, 22}, /* I */{ 73, 67, 82, 59, 52, 52, 47, 69, 22, 43, 41, 18, 4, 37, 34}, /* no J's */{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, /* K */{ 4, 1, 1, 2, 0, 3, 3, 3, 20, 19, 7, 24, 4, 52, 40}, /* L */{376,432,397,449,386,329,333,280, 98,147, 65,165, 21, 60, 51}, /* M */{ 26, 15, 21, 19, 11, 30, 20, 11, 5, 12, 9, 14, 3, 14, 12}, /* N */{ 2, 4, 4, 6, 7, 7, 8, 7, 25, 14, 9, 37, 9, 26, 53}, /* no O's */{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, /* P */{ 23, 13, 17, 9, 12, 13, 24, 60, 98, 74, 7, 17, 27, 11,166}, /* Q */{ 3, 6, 4, 9, 11, 19, 18, 13, 55, 40, 5, 72, 15,135, 42}, /* R */{ 7, 4, 4, 2, 3, 7, 9, 6, 34, 38, 5, 56, 20, 33, 40}, /* S */{ 57, 47, 40, 44, 65, 76, 67, 66,117, 97,142,112,141, 91, 88}, /* T */{ 32, 39, 30, 38, 38, 49, 42, 35, 70, 70,103, 44, 43, 39, 57}, /* no U's */{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, /* V */{112,124,127,100, 99, 86, 79,129, 60, 72,191, 45, 8, 50, 56}, /* W */{ 16, 13, 20, 11, 9, 13, 24, 21, 13, 13, 2, 33, 2, 9, 6}, /* no X's */{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, /* Y */{ 17, 3, 6, 9, 6, 7, 10, 6, 8, 19, 2, 26, 2, 21, 23}, /* no Z's */{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, }, /* Gram-negative cleavage sites */ { /* -13 -12 -11 -10 -9 -8 -7 -6 -5 -4 -3 -2 -1 1 2 */ /* A */{ 63, 58, 56, 64, 63, 39, 41, 78, 34, 41,156, 20,229,102, 19}, /* no B's */{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, /* C */{ 10, 6, 5, 5, 6, 5, 4, 7, 3, 0, 3, 3, 0, 4, 0}, /* D */{ 0, 1, 1, 2, 1, 2, 2, 0, 0, 1, 2, 2, 0, 17, 43}, /* E */{ 0, 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 4, 1, 21, 40}, /* F */{ 15, 12, 18, 12, 17, 32, 31, 7, 17, 13, 1, 28, 0, 2, 1}, /* G */{ 20, 22, 17, 13, 11, 29, 19, 13, 41, 15, 5, 5, 11, 12, 15}, /* H */{ 1, 0, 0, 0, 1, 0, 2, 1, 7, 1, 1, 32, 0, 2, 0}, /* I */{ 17, 22, 16, 15, 16, 5, 25, 11, 4, 9, 3, 5, 0, 3, 9}, /* no J's */{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, /* K */{ 0, 1, 2, 0, 0, 0, 1, 0, 1, 4, 1, 3, 0, 12, 4}, /* L */{ 60, 61, 71, 63, 67, 82, 44, 7, 31, 12, 15, 42, 4, 8, 11}, /* M */{ 11, 9, 9, 8, 17, 12, 17, 9, 4, 2, 0, 19, 0, 3, 2}, /* N */{ 2, 1, 2, 2, 3, 0, 2, 5, 5, 13, 2, 15, 3, 10, 13}, /* no O's */{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, /* P */{ 2, 4, 6, 6, 5, 3, 4, 19, 16, 40, 1, 0, 1, 1, 23}, /* Q */{ 0, 0, 0, 2, 2, 1, 0, 1, 13, 12, 3, 25, 0, 30, 15}, /* R */{ 3, 2, 1, 1, 2, 3, 2, 0, 6, 5, 3, 4, 1, 3, 2}, /* S */{ 16, 21, 19, 30, 20, 17, 29, 72, 39, 53, 25, 18, 11, 12, 16}, /* T */{ 15, 11, 20, 17, 13, 15, 18, 23, 26, 19, 10, 13, 3, 6, 32}, /* no U's */{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, /* V */{ 28, 32, 21, 21, 16, 18, 18, 11, 16, 14, 33, 14, 2, 12, 17}, /* W */{ 0, 0, 1, 1, 1, 1, 2, 1, 1, 0, 0, 5, 0, 1, 3}, /* no X's */{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, /* Y */{ 1, 1, 0, 2, 3, 2, 4, 1, 0, 10, 1, 9, 0, 5, 1}, /* no Z's */{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, }, /* Gram-positive cleavage sites */ { /* -13 -12 -11 -10 -9 -8 -7 -6 -5 -4 -3 -2 -1 1 2 */ /* A */{ 29, 27, 17, 32, 25, 17, 19, 27, 14, 13, 77, 18,114, 58, 15}, /* no B's */{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, /* C */{ 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0}, /* D */{ 0, 1, 1, 0, 1, 0, 0, 4, 3, 7, 1, 3, 0, 15, 13}, /* E */{ 2, 0, 1, 1, 1, 1, 1, 3, 4, 3, 5, 11, 2, 13, 15}, /* F */{ 8, 8, 6, 11, 14, 5, 12, 1, 1, 3, 2, 10, 0, 2, 0}, /* G */{ 6, 13, 14, 14, 12, 17, 12, 9, 16, 7, 4, 11, 5, 1, 10}, /* H */{ 0, 0, 0, 0, 1, 0, 2, 3, 1, 2, 1, 8, 0, 1, 4}, /* I */{ 10, 10, 14, 10, 7, 8, 6, 8, 2, 5, 4, 2, 0, 3, 4}, /* no J's */{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, /* K */{ 1, 0, 1, 0, 0, 0, 0, 2, 6, 7, 0, 12, 2, 8, 4}, /* L */{ 26, 31, 26, 27, 31, 32, 18, 10, 5, 5, 3, 5, 0, 1, 1}, /* M */{ 3, 5, 4, 3, 2, 7, 3, 3, 0, 5, 0, 1, 0, 0, 0}, /* N */{ 1, 1, 0, 1, 5, 2, 7, 3, 13, 9, 1, 6, 1, 6, 5}, /* no O's */{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, /* P */{ 3, 3, 5, 3, 9, 6, 16, 22, 20, 15, 1, 2, 0, 2, 16}, /* Q */{ 0, 3, 0, 0, 0, 4, 7, 5, 8, 5, 0, 17, 1, 8, 6}, /* R */{ 1, 0, 0, 2, 0, 1, 0, 1, 1, 3, 2, 1, 3, 1, 1}, /* S */{ 14, 11, 17, 13, 13, 14, 12, 11, 18, 13, 12, 19, 8, 14, 17}, /* T */{ 17, 13, 14, 7, 7, 12, 9, 16, 19, 28, 1, 4, 1, 4, 21}, /* no U's */{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, /* V */{ 18, 12, 18, 15, 11, 10, 14, 12, 10, 9, 27, 3, 3, 2, 8}, /* W */{ 0, 0, 0, 1, 1, 2, 1, 0, 0, 0, 0, 2, 0, 1, 0}, /* no X's */{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, /* Y */{ 0, 1, 1, 0, 0, 2, 1, 1, 0, 2, 0, 5, 1, 1, 1}, /* no Z's */{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} } /* end Gram-positive */ }; /* end cleavage */ int table = -1; /* 0, 1, or 2, eu- or pro-karyotic */ char lookup(const char base[3]); char pair(char base); char add(char amin, char frame); void writeProtein(char frame, unsigned long position, const char *start, unsigned int len); int weigh(const char *prot, unsigned int len); /* Returns 'A' on input 'T', 'C' on input 'G', and vice versa, the "pair" base on the other part of the double helix */ char pair(char base) { switch(base){ case 'A': return 'T'; case 'T': return 'A'; case 'C': return 'G'; case 'G': return 'C'; default: return '?'; /* error condition */ } } /* returns amino acid character for input base triplet */ char lookup(const char base[3]) { char c; #ifndef SWITCH_16_BITS unsigned long i; i = e(base[0], base[1], base[2]); /* printf("looking up '%s' = %lu\n", base, i); */ switch(i) { case e('T','T','T'):case e('T','T','C'): c = 'F'; break; case e('T','T','A'):case e('T','T','G'): case e('C','T','T'):case e('C','T','C'): case e('C','T','A'):case e('C','T','G'): c = 'L'; break; case e('A','T','T'):case e('A','T','C'): case e('A','T','A'): c = 'I'; break; case e('A','T','G'): c = 'M'; break; case e('G','T','T'):case e('G','T','C'): case e('G','T','A'):case e('G','T','G'): c = 'V'; break; case e('T','C','T'):case e('T','C','C'): case e('T','C','A'):case e('T','C','G'): case e('A','G','T'):case e('A','G','C'): c = 'S'; break; case e('C','C','T'):case e('C','C','C'): case e('C','C','A'):case e('C','C','G'): c = 'P'; break; case e('A','C','T'):case e('A','C','C'): case e('A','C','A'):case e('A','C','G'): c = 'T'; break; case e('G','C','T'):case e('G','C','C'): case e('G','C','A'):case e('G','C','G'): c = 'A'; break; case e('T','A','T'):case e('T','A','C'): c = 'Y'; break; case e('T','A','A'):case e('T','A','G'): case e('T','G','A'): c = '*'; break; /* special "stop codon" */ case e('C','A','T'):case e('C','A','C'): c = 'H'; break; case e('C','A','A'):case e('C','A','G'): c = 'Q'; break; case e('A','A','T'):case e('A','A','C'): c = 'N'; break; case e('A','A','A'):case e('A','A','G'): c = 'K'; break; case e('G','A','T'):case e('G','A','C'): c = 'D'; break; case e('G','A','A'):case e('G','A','G'): c = 'E'; break; case e('T','G','T'):case e('T','G','C'): c = 'C'; break; case e('T','G','G'): c = 'W'; break; case e('C','G','T'):case e('C','G','C'): case e('C','G','A'):case e('C','G','G'): case e('A','G','A'):case e('A','G','G'): c = 'R'; break; case e('G','G','T'):case e('G','G','C'): case e('G','G','A'):case e('G','G','G'): c = 'G'; break; default: /* error condition, return '?', for unknown */ c = '?'; break; } #else /* SWITCH_16_BITS */ if( match(base,'T','T','T') || match(base,'T','T','C') ) c = 'F'; else if( match(base,'T','T','A') || match(base,'T','T','G') || match(base,'C','T','T') || match(base,'C','T','C') || match(base,'C','T','A') || match(base,'C','T','G') ) c = 'L'; else if( match(base,'A','T','T') || match(base,'A','T','C') || match(base,'A','T','A') ) c = 'I'; else if( match(base,'A','T','G') ) c = 'M'; else if( match(base,'G','T','T') || match(base,'G','T','C') || match(base,'G','T','A') || match(base,'G','T','G') ) c = 'V'; else if( match(base,'T','C','T') || match(base,'T','C','C') || match(base,'T','C','A') || match(base,'T','C','G') || match(base,'A','G','T') || match(base,'A','G','C') ) c = 'S'; else if( match(base,'C','C','T') || match(base,'C','C','C') || match(base,'C','C','A') || match(base,'C','C','G') ) c = 'P'; else if( match(base,'A','C','T') || match(base,'A','C','C') || match(base,'A','C','A') || match(base,'A','C','G') ) c = 'T'; else if( match(base,'G','C','T') || match(base,'G','C','C') || match(base,'G','C','A') || match(base,'G','C','G') ) c = 'A'; else if( match(base,'T','A','T') || match(base,'T','A','C') ) c = 'Y'; else if( match(base,'T','A','A') || match(base,'T','A','G') || match(base,'T','G','A') ) c = '*'; /* special "stop codon" */ else if( match(base,'C','A','T') || match(base,'C','A','C') ) c = 'H'; else if( match(base,'C','A','A') || match(base,'C','A','G') ) c = 'Q'; else if( match(base,'A','A','T') || match(base,'A','A','C') ) c = 'N'; else if( match(base,'A','A','A') || match(base,'A','A','G') ) c = 'K'; else if( match(base,'G','A','T') || match(base,'G','A','C') ) c = 'D'; else if( match(base,'G','A','A') || match(base,'G','A','G') ) c = 'E'; else if( match(base,'T','G','T') || match(base,'T','G','C') ) c = 'C'; else if( match(base,'T','G','G') ) c = 'W'; else if( match(base,'C','G','T') || match(base,'C','G','C') || match(base,'C','G','A') || match(base,'C','G','G') || match(base,'A','G','A') || match(base,'A','G','G') ) c = 'R'; else if( match(base,'G','G','T') || match(base,'G','G','C') || match(base,'G','G','A') || match(base,'G','G','G') ) c = 'G'; else /* error condition, return '?', for unknown */ c = '?'; #endif /* SWITCH_16_BITS */ return c; } /* returns the next legal base character, ignoring others */ int getBase(FILE* in) { int c; do{ c = getc(in); if(c==EOF) break; c = toupper(c); } while(c!='A' && c !='C' && c!= 'T' && c!= 'G'); return c; } /* adds an amino acid to a frame, writes the protein if a stop codon. */ char add(char amin, char frame) { /* printf("bases %ld: adding char %c to frame %u\n", bases, amin, frame); */ if(amin == '*' || amin == 0) { /* special stop codon */ unsigned int len; /* length of protein string */ if(frame < 3) { /* writing forward */ if(amin != 0) *buffptr[frame]++ = amin; len = buffptr[frame] - buffer[frame]; writeProtein(frame, bases - len*3, buffer[frame], len); buffptr[frame] = buffer[frame]; /* reset buffptr */ } else { /* frame >= 3, writing backward */ len = buffer[frame] + MAXPROTEIN - buffptr[frame]; writeProtein(frame, bases - len*3 - 1, buffptr[frame], len); /* -1 to position because the '*' isn't counted in yet */ buffptr[frame] = buffer[frame] + MAXPROTEIN; /* reset buffptr */ if(amin != 0) *--buffptr[frame] = amin; } } else { /* write normal, non-stop codon */ if(frame < 3){ *buffptr[frame]++ = amin; if(buffptr[frame] >= buffer[frame] + MAXPROTEIN) return 0; /* overflow of buffer */ } else { /* write backward */ *--buffptr[frame] = amin; if(buffptr[frame] <= buffer[frame]) return 0; /* overflow of buffer */ } } return 1; }/* end add() */ /* Writes the protein to the file. A typical signal sequence needs to have the positively charged amino acids first (K or R or both) which are right next (5 to 10 residues) to the start amino acid (M). These positive residues (K,R) are not evaluated by the Hejne Matrix. Only the 15 residues which follows next to K,or R are evaluated, noted as -13, to 2, and also known as cleavage site. Now, if we could let the program first look for the presents of K and R right next (window 5 to 10 or variable) to the M and then allow the algorithem to start to evaluate the next coming residues according to the matrix (15 positions), we definitly would have a much higher yield in putative secreted proteins. First task: i) find an open reading frame defined with the start M and check for residues K or R right next (window 1 to 10, optional), then ii) apply to such candidates, and only to such, a sliding window (1- 15, optional) which compares the following residues to the matrix and stores them away if minimal weight factor level is reached. The first considered (to be compared with the matrix) amino acid is position -13 then the secound is -12, down to -1 (here the cut is produced) +1 means the mature protein starts. example: -13 -112 MFGKKRLALVALVALLLLAXA (this is a good looking siganl sequence!!!!) MFGGDFLALVALVALLLLAXA (this is not a signal sequence at all, because it lacks the positive residues K and/or R!!!!) */ void writeProtein(char frame, unsigned long position, const char *prot, unsigned int len /* length of protein in aminos */ ) { unsigned int wt, maxWt = 0; unsigned int i, maxI; /* counter and its endpoint */ unsigned int maxPos=0; /* position of max weight of protein */ const char *start = prot; /* start of final protein */ const char *residues; /* start of weighed residues in final protein */ unsigned int endKR=0; /* distance until after last contiguous K/R amino of K/R sequence starting within maxPositiveResidues of start */ /* test for excluded amino acids */ if(nExcluded > 0) { for(i=0; i= minLength && *start !='M') { start ++; len--; } if(len < minLength) return; /* too short, or no 'M' at all... */ /* search for positive residues, K or R */ maxI = (maxPositiveResidues > len) ? len : maxPositiveResidues; for(i=0; i maxWt){ maxWt = wt; maxPos = i; } } if(maxWt >= minWeight) break; /* found one */ } /* otherwise reject this whole protein, try next starting one further down */ len--; start++; }/* end for(;;) - found a good enough weight, now write it */ if(frame < 3) position += 3 * (start - prot); /* For the reversed frames the position is the location of the stop codon anyway, so where a protein starts doesn't affect that. */ proteins++; /* wrote another protein */ fprintf(out, "> %d / %lu (%u amino acids long,", frame+1, position + 1, len); fprintf(out, " weight %d, from position %d)\n", maxWt, maxPos + endKR + 1); while(len > MAXLINE){ fwrite(start, sizeof(char), MAXLINE, out); len -= MAXLINE; start += MAXLINE; putc('\n', out); } fwrite(start, sizeof(char), len, out); putc('\n', out); /* blank line between proteins */ putc('\n', out); } /* weighs the likelihood of the protein being valid, the higher the better */ int weigh(const char *prot, unsigned int len) { register unsigned int i; unsigned int stop; int total = 0; if(len>15) stop = 15; else{ stop = len; if(prot[stop -1] == '*') /* don't weigh the stop codon */ stop --; } for(i=0; i maxWeight){ fprintf(stderr, "Illegal weight %u. Must be between 0 and %u.\n", minWeight, maxWeight); return(1); } printf("Maximum number of times to slide weighing window? "); scanf("%u", &maxSlide); if(maxSlide < 1) maxSlide = 1; /* '0' shorthand for '1' (?) */ printf("Maximum distance of positive residue (K or R)\n" "from protein start codon (M) (1 to 10, or more)? "); scanf("%u", &maxPositiveResidues); if(maxPositiveResidues < 1) maxPositiveResidues = 10; /* '0' shorthand for '10' (?) */ printf("Minimum length (in amino acids) to write a protein? "); scanf("%u", &minLength); if(minLength < 0 || minLength > MAXPROTEIN){ fprintf(stderr, "Illegal minimum length %u, must be between 0 and %ld.", minLength, MAXPROTEIN); return(1); } if(minLength <1) minLength = 1; /* '0' shorthand for '1' (?) */ printf("Any amino acids to be excluded from database\n " "(especially W, P, N, R, K, D, or C - 0 for none)? "); scanf("%s", aminos); nExcluded = 0; for(c=0; aminos[c] != '\0'; ++c){ if(isalpha(aminos[c]) && aminos[c] >= 'A' && aminos[c] <= 'Z'){ exclude[ aminos[c] - 'A'] = 1; ++nExcluded; } } if(nExcluded >= 1){ printf("Excluding %u amino acids:\n", nExcluded); for(c = 0; c<26; ++c) if(exclude[c]) printf(" %c", c+'A'); printf("\n"); } printf("Beginning program.\n"); /* read 2 first bases, to "prime" buffers */ c = getBase(in); if(c==EOF){ fprintf(stderr, "Input file ended on first character!\n"); return(2); } fbuf[1] = (char)c; rbuf[1] = pair((char)c); c = getBase(in); if(c==EOF){ fprintf(stderr, "Input file ended on second character!\n"); return(2); } fbuf[2] = (char)c; rbuf[0] = pair((char)c); bases = 3; /* number of bases read */ /* main loop, reading bases and writing amino acids to buffers */ for(frame=0;(c=getBase(in))!=EOF; frame=(char)((frame+1)%3)){ /* shift back buffers */ fbuf[0] = fbuf[1]; fbuf[1] = fbuf[2]; fbuf[2] = (char)c; rbuf[2] = rbuf[1]; rbuf[1] = rbuf[0]; rbuf[0] = pair((char)c); if(!add(lookup(fbuf), frame) || !add(lookup(rbuf), (char)(frame+3))){ fprintf(stderr, "buffer overflow: a protein of over %ld amino acids\n" "has been found, and this program can't deal with it.\n", MAXPROTEIN); return(3); } if(++bases%1000 == 0) printf("Read %ld bases - wrote %ld proteins.\n", bases, proteins); } for(frame=0; frame<6; ++frame) add('\0', frame); /* phony stop */ fclose(in); fclose(out); printf("Done! %ld bases read, %ld proteins written.\n", bases, proteins); return 0; }/* end main() */ /* This function is not used in the program any more. It was once used to reverse the output of the last 3 (backwards) frames, now that is done in the first pass. Function left in code "just in case". */ void reverseFile(const char *inName, const char *outName) { FILE *ifp, *ofp; long begin, current; printf("Reversing file '%s' to '%s'.\n", inName, outName); ifp = fopen(inName, "r"); if(ifp == NULL){ fprintf(stderr, "could not open \"%s\"", inName); return; } ofp = fopen(outName, "w"); if(ofp == NULL){ fprintf(stderr, "could not open \"%s\"", outName); return; } begin = ftell(ifp); fseek(ifp, 0, SEEK_END); current = ftell(ifp); while(current>begin){ fseek(ifp, current-1, SEEK_SET); current = ftell(ifp); putc(getc(ifp), ofp); if((current-begin)%1000 == 1000-1) printf("%ld characters reversed.\n", current-begin); } fclose(ifp); fclose(ofp); }