fasta.c

/*
 * fasta.c
 *
 * $Id: fasta.c,v 1.4 2007/11/27 18:33:59 sheneman Exp $
 *
 *****************************************************************************
 *
 * Copyright (c) 2004,  Luke Sheneman
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions 
 * are met:
 * 
 *  + Redistributions of source code must retain the above copyright 
 *    notice, this list of conditions and the following disclaimer. 
 *  + Redistributions in binary form must reproduce the above copyright 
 *    notice, this list of conditions and the following disclaimer in 
 *    the documentation and/or other materials provided with the 
 *    distribution. 
 *  + The names of its contributors may not be used to endorse or promote 
 *    products derived  from this software without specific prior 
 *    written permission. 
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
 * POSSIBILITY OF SUCH DAMAGE.  
 *
 *****************************************************************************
 *
 * Functions for parsing FASTA formatted alignment files
 *
 *****************************************************************************
 *
 * AUTHOR:
 * 
 *   Luke Sheneman
 *   sheneman@cs.uidaho.edu
 *
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#include "clearcut.h"
#include "common.h"
#include "fasta.h"


#define NJ_NUM_DNA_AMBIGUITY_SYMS 14
static const char NJ_dna_ambiguity_syms[NJ_NUM_DNA_AMBIGUITY_SYMS] = 
{
  'M', 'R', 'W', 'S', 'Y', 'K',
  'V', 'H', 'D', 'B', 'X', 'N',
  '-', '.'
};


#define NJ_NUM_PROTEIN_AMBIGUITY_SYMS 6
static const char NJ_protein_ambiguity_syms[NJ_NUM_PROTEIN_AMBIGUITY_SYMS] =
{
  'X', 'B', 'Z', '*', '-', '.'
};

#define NJ_NUM_DNA_SYMS 5
static const char NJ_dna_syms[NJ_NUM_DNA_SYMS] = 
{
  'A', 'G', 'C', 'T', 'U'
};


#define NJ_NUM_PROTEIN_SYMS 20
static const char NJ_protein_syms[NJ_NUM_PROTEIN_SYMS] = 
{
  'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 
  'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'
};


/*
 * NJ_is_whitespace() - Check to see if character is whitespace
 *
 * INPUTS:
 * -------
 *     c -- character to check
 *
 * RETURNS:
 * --------
 *    int -- 0 if not whitespace
 *           1 if whitespace
 */
static inline
int
NJ_is_whitespace(char c) {
  if( c == ' '  ||   /* space           */
      c == '\n' ||   /* newline         */
      c == '\r' ||   /* carriage-return */
      c == '\v' ||   /* vertical tab    */
      c == '\f' ||   /* form feed       */
      c == '\t' ) {  /* horizontal tab  */
    return(1);
  } else {
    return(0);
  }
}


/*
 * NJ_is_dna() - 
 *
 * Determines if the given symbol is DNA
 *
 * RETURNS: 1 if DNA
 *          0 if not DNA
 *
 */
static inline
int
NJ_is_dna(char c) {

  int i;
  char up_c;
  
  up_c = toupper(c);
  
  for(i=0;i<NJ_NUM_DNA_SYMS;i++) {
    if(up_c == NJ_dna_syms[i]) {
      return(1);
    }
  }

  return(0);
}


/*
 * NJ_is_dna_ambiguity() - 
 *
 * Determines if the given symbol is a 
 * DNA ambiguity code
 *
 * RETURNS: 1 if DNA Ambiguity Code
 *          0 if not DNA Ambiguity Code
 *
 */
static inline
int
NJ_is_dna_ambiguity(char c) {
  
  int i;
  char up_c;
  
  up_c = toupper(c);
  
  for(i=0;i<NJ_NUM_DNA_AMBIGUITY_SYMS;i++) {
    if(up_c == NJ_dna_ambiguity_syms[i]) {
      return(1);
    }
  }
  
  return(0);
}


/*
 * NJ_is_protein() - 
 *
 * Determines if supplied symbol is amino acid symbol
 *
 * RETURNS: 1 if amino acid
 *          0 if not amino acid
 *
 */
static inline
int
NJ_is_protein(char c) {

  int i;
  char up_c;
  
  up_c = toupper(c);
  
  for(i=0;i<NJ_NUM_PROTEIN_SYMS;i++) {
    if(up_c == NJ_protein_syms[i]) {
      return(1);
    }
  }
  
  return(0);
}


/*
 * NJ_is_protein_ambiguity() - 
 *
 * Determines if supplied symbol is amino acid ambiguity symbol
 *
 * RETURNS: 1 if amino acid ambiguity symbol
 *          0 if not amino acid ambiguity symbol
 *
 */
static inline
int 
NJ_is_protein_ambiguity(char c) {

  int i;
  char up_c;
  
  up_c = toupper(c);

  for(i=0;i<NJ_NUM_PROTEIN_AMBIGUITY_SYMS;i++) {
    if(up_c == NJ_protein_ambiguity_syms[i]) {
      return(1);
    }
  }
  
  return(0);
}


/*
 * NJ_read_fasta() - A function for inputing FASTA sequences into an alignment
 *                   data structure
 *
 *
 * INPUTS:
 * -------
 *   nj_args -- A pointer to a data structure containing command-line args
 *
 * RETURNS:
 * --------
 *   NJ_alignment * -- A pointer to a multiple sequence alignment
 *
 * DESCRIPTION:
 * ------------
 *
 * 
 * This function implements a state machine parser for parsing FASTA-formatted
 * multiple sequence alignment files.  
 * 
 * Example Input:
 *
 *   > sequence1
 *   ATAGATATAGATTAGAATAT----TATAGATAT----ATATAT-TTT-
 *   > sequence2 
 *   --ATAGATA---ATATATATATTTT--GTCTCATAGT---ATATGCTT
 *   > sequence3
 *   TTATAGATA---ATATATATATTTTAAGTCTCATAGT-A-ATATGC--
 * 
 * This function will parse alignments for DNA or protein, and will do
 * so mindful of ambiguity codes for these kinds of sequences.  All 
 * ambiguity codes are ignored by this program for the purposes of 
 * computing a distance matrix from a multiple alignment.  By design, 
 * this program does not auto-detect DNA vs. Protein, and requires that 
 * the user explictly specify that on the command-line.
 *
 * Gaps can be represented either with the '-' or '.' characters.
 * 
 * Newlines and other whitespace are allowed to be interspersed 
 * throughout the sequences.
 *
 * Taxon labels are required to be unique, and they must start with 
 * an alphabetic character (not a number, etc.).  The parser will read
 * the first token after the > character in the description line up until the
 * first whitespace and use that for the taxon label.
 *
 * For example, in the line "> taxon1 is homo sapien", the taxon label will be 
 * "taxon1"
 *
 */
NJ_alignment *
NJ_read_fasta(NJ_ARGS *nj_args) {

  FILE *fp  = NULL;
  char *buf = NULL;
  char *ptr = NULL;
  NJ_alignment *alignment = NULL;

  char c;
  int state;
  long int index, x, seq;
  long int i;
  long int bufsize, nseqs = NJ_INITIAL_NSEQS;
  int first_sequence_flag;


  /* 
   * In this function, we implement a FASTA alignment parser which
   * reads in an alignment character-by-character, maintaining state
   * information which guides the parser.
   *
   * The program reads either DNA or Protein alignments.  All title lines
   * and sequences can be arbitrarily long.  Gaps can be represented by 
   * "-" or "." characters.  
   *
   * Ambiguity codes are also handled.
   * 
   */

  /* 
   * We can't handle reading fasta input unless the user explicity 
   * specifies the input type...just to be sure.
   */
  if( (!nj_args->dna_flag && !nj_args->protein_flag) ||
      (nj_args->dna_flag  &&  nj_args->protein_flag) ) {
    fprintf(stderr, "Clearcut: Explicitly specify protein or DNA\n");
    goto XIT_BAD;
  }

  /* open specified fasta file here */
  if(nj_args->stdin_flag) {
    fp = stdin;
  } else {
    fp = fopen(nj_args->infilename, "r");
    if(!fp) {
      fprintf(stderr, "Clearcut: Failed to open input FASTA file: %s\n", nj_args->infilename);
      perror("Clearcut");
      goto XIT_BAD;
    }
  }

  /* allocate the initial buffer */
  bufsize = NJ_INITIAL_BUFFER_SIZE;
  buf = (char *)calloc(bufsize, sizeof(char));
  
  /* allocate the alignment container here */
  alignment = (NJ_alignment *)calloc(1, sizeof(NJ_alignment));
  
  /* allocate initial title array */
  //  printf("allocating initial title array\n");
  alignment->titles = (char **)calloc(NJ_INITIAL_NSEQS, sizeof(char *));

  /* make sure that we successfully allocated memory */
  if(!buf || !alignment || !alignment->titles) {
    fprintf(stderr, "Clearcut: Memory allocation error in NJ_read_fasta()\n");
    goto XIT_BAD;
  }

  /* a flag */
  first_sequence_flag = 1;  
  
  index  = 0;  /* tracks the position in buffer     */
  x      = 0;  /* tracks the position on sequence   */
  seq    = 0;  /* tracks the active sequence        */

  /* intitial state of state machine */
  state  = NJ_FASTA_MODE_UNKNOWN;

  while(1) {
    
    /* get the next character */
    c = fgetc(fp);
    if(feof(fp)) {

      if(state == NJ_FASTA_MODE_SEQUENCE) {
	buf[index+1] = '\0';

	/* copy buf to alignment */
	for(i=1;i<=alignment->length;i++) {
	  alignment->data[seq*alignment->length+i-1] = buf[i];
	}
      }

      break;
    }

    /* make sure our dynamic buffer is big enough */
    if(index >= bufsize) {
      bufsize *= 2;
      buf = (char *)realloc(buf, bufsize);
      if(!buf) {
	fprintf(stderr, "Clearcut: Memory allocation error in NJ_read_fasta()\n");
	goto XIT_BAD;
      }
    }

    switch(state) {
      
    case NJ_FASTA_MODE_UNKNOWN:
      
      if(!NJ_is_whitespace(c)) {
	if(c == '>') {
	  state = NJ_FASTA_MODE_TITLE;
	  buf[0] = '>';
	} else {
	  goto XIT_BAD;
	}
      }

      break;

    case NJ_FASTA_MODE_TITLE:

      if( c == '\n' ||
	  c == '\r' ) {

	buf[index] = '\0';
	state = NJ_FASTA_MODE_SEQUENCE;
	index = 0;
	x     = -1;

	/* make sure we've allocated enough space for titles and sequences */
	if(seq == nseqs) {

	  //	  printf("realloc().  seq = %d, nseqs = %d\n", seq, nseqs);

	  nseqs *= 2;

	  alignment->titles = (char **)realloc(alignment->titles, nseqs*sizeof(char *));
	  if(!alignment->titles) {
	    fprintf(stderr, "Clearcut:  Memory allocation error in NJ_read_fasta()\n");
	    goto XIT_BAD;
	  }

	  alignment->data = (char *)realloc(alignment->data, alignment->length*nseqs*sizeof(char));
	  if(!alignment->data) {
	    fprintf(stderr, "Clearcut: Allocation error in NJ_read_fasta()\n");
	    goto XIT_BAD;
	  }
	}

	// printf("Allocating %d bytes for title %d: %s\n", (int)strlen(buf), (int)seq, buf);

	alignment->titles[seq] = (char *)calloc(strlen(buf), sizeof(char));
	if(!alignment->titles[seq]) {
	  fprintf(stderr, "Clearcut:  Memory allocation error in NJ_read_fasta()\n");
	  goto XIT_BAD;
	}

	/* lets forward to the first non-space (space/tab) character after the '>' */

	if(first_sequence_flag) {
	  ptr = buf;
	} else {
	  ptr = &buf[1];
	}
	while(*ptr == '\t' || *ptr == ' ') {
	  ptr++;
	}
	sscanf(ptr, "%s", alignment->titles[seq]);  /* get the first word and use as the title */

	alignment->nseq++;
      }
	
      buf[index++] = c;

      break;


    case NJ_FASTA_MODE_SEQUENCE:

      if(c == '>') {

	if(first_sequence_flag) {
	  first_sequence_flag = 0;

	  /* allocate our alignment data section here */
	  alignment->length = index-1;

	  nseqs = NJ_INITIAL_NSEQS;
	  alignment->data = (char *)calloc(alignment->length*nseqs, sizeof(char));
	  if(!alignment->data) {
	    fprintf(stderr, "Clearcut: Allocation error in NJ_read_fasta()\n");
	    goto XIT_BAD;
	  }
	} 
	
	if(!first_sequence_flag) {
	  if(index-1 < alignment->length) {
	    fprintf(stderr, "Clearcut: Sequences must be of uniform length in alignment at sequence %ld\n", seq);
	    goto XIT_BAD;
	  }
	}

	/* re-allocate if necessary */
	/*
	if(seq >= nseqs) {
	  nseqs *= 2;
	  alignment->data = (char *)realloc(alignment->data, alignment->length*nseqs*sizeof(char));
	  if(!alignment->data) {
	    fprintf(stderr, "Clearcut: Allocation error in NJ_read_fasta()\n");
	    goto XIT_BAD;
	  }
	}
	*/

	/* copy buf to alignment */
	for(i=1;i<=alignment->length;i++) {
	  alignment->data[seq*alignment->length+i-1] = buf[i];
	}
	  
	state = NJ_FASTA_MODE_TITLE;
	index = 1;
	x     = 1;
	  
	buf[0] = c;
	
	seq++;

      } else {
	  
	if(NJ_is_whitespace(c)) {
	  break;
	}
		
	if(!first_sequence_flag) {
	  if(index-1 >= alignment->length) {
	    fprintf(stderr, "Clearcut: Sequences must be of uniform length in alignment at sequence %ld\n", seq);
	    goto XIT_BAD;
	  }
	}


	/* 
	 * Here we check to make sure that the symbol read is appropriate
	 * for the type of data the user specified.  (dna or protein).
	 * We also handle ambiguity codes by converting them to a specific
	 * assigned ambiguity code character.  Ambiguity codes are ignored
	 * when computing distances
	 */

	if(nj_args->dna_flag) {
	  if(NJ_is_dna(c)) {
	    buf[index++] = toupper(c);
	  } else {
	    if(NJ_is_dna_ambiguity(c)) {
	      buf[index++] = NJ_AMBIGUITY_CHAR;
	    } else {
	      fprintf(stderr, "Clearcut: Unknown symbol '%c' in nucleotide sequence %ld.\n", c, seq);
	      goto XIT_BAD;
	    }
	  }
	} else if(nj_args->protein_flag) {
	  if(NJ_is_protein(c)) {
	    buf[index++] = toupper(c);
	  } else {
	    if(NJ_is_protein_ambiguity(c)) {
	      buf[index++] = NJ_AMBIGUITY_CHAR;
	    } else {
	      fprintf(stderr, "Clearcut: Unknown symbol '%c' in protein sequence %ld.\n", c, seq);
	      goto XIT_BAD;
	    }
	  }
	}

      }

      break;
	
    default:
      goto XIT_BAD;
	
      break;

    }
  }
  
  if(index-1 != alignment->length) {
    fprintf(stderr, "Clearcut: Sequences must be of uniform length in alignment at sequence %ld\n", seq);
    goto XIT_BAD;
  }
  
  /* check for duplicate taxon labels */
  if(!NJ_taxaname_unique(alignment)) {
    goto XIT_BAD;
  }
  
  return(alignment);

  
 XIT_BAD:

  if(fp) {
    fprintf(stderr, "Clearcut: Fatal error parsing FASTA file at file offset %ld.\n", ftell(fp));
  }
  
  if(buf) {
    free(buf);
  }
  
  NJ_free_alignment(alignment);

  return(NULL);
}


/*
 * NJ_print_alignment() - Print multiple sequence alignment (for debugging)
 *
 * INPUTS:
 * -------
 *    alignment -- A pointer to the alignment
 *
 * RETURNS:
 * --------
 *    NONE
 *
 */
void
NJ_print_alignment(NJ_alignment *alignment) {
  
  long int i, j;
  
  printf("nseq = %ld, length = %ld\n", alignment->nseq, alignment->length);
  
  for(i=0;i<alignment->nseq;i++) {
    
    printf("> %s\n", alignment->titles[i]);

    for(j=0;j<alignment->length;j++) {
      printf("%c", alignment->data[i*alignment->length+j]);
    }

    printf("\n");
  }

  return;
}


/*
 *
 * NJ_free_alignment() - Free all of the memory allocated for the
 *                       multiple sequence alignment 
 *
 * INPUTS:
 * -------
 *   alignment -- A pointer to the multiple sequence alignment
 *
 * RETURNS:
 * --------
 *    NONE
 *
 */
void
NJ_free_alignment(NJ_alignment *alignment) {
  
  long int i;
  
  if(alignment) {

    /* free the allocated titles */
    if(alignment->titles) {
      for(i=0;i<alignment->nseq;i++) {

	if(alignment->titles[i]) {
	  free(alignment->titles[i]);
	}
      }
      
      free(alignment->titles);
    }

    /* free the alignment data */
    if(alignment->data) {
      free(alignment->data);
    }

    /* free the alignment itself */
    free(alignment);
  }

  return;
}


/*
 * NJ_taxaname_unique() - Check to see if taxanames are unique in alignment
 *
 * INPUTS:
 * -------
 *  alignment -- a pointer to a multiple sequence alignment
 *
 * OUTPUTS:
 * --------
 *  int -- 0 if all taxanames in alignment are unique
 *         1 if all taxanames in alignment are NOT unique
 *
 *
 * DESCRIPTION:
 * ------------
 *
 * Check to see if the taxanames in the alignment are unique.  It
 * will be impossible to make sense of the final tree if the taxon
 * labels are not unqiue.
 *
 */
int
NJ_taxaname_unique(NJ_alignment *alignment) {
  
  long int i, j;
  
  for(i=0;i<alignment->nseq;i++) {
    for(j=i+1;j<alignment->nseq;j++) {
        if(!strcmp(alignment->titles[i], 
		 alignment->titles[j])) {
	fprintf(stderr, "Clearcut: Taxa %ld and %ld (%s) do not have unique taxon labels.\n", 
		i, j, alignment->titles[i]);
	return(0);
      }
    }
  }
  
  return(1);
}


void
NJ_print_titles(NJ_alignment *alignment) {

  int i;

  for(i=0;i<alignment->nseq;i++) {
    printf("%d: %s\n", i, alignment->titles[i]);
  }

  return;
}