#include "fluctuate.h"
#include "getdata.h"

#ifdef DMALLOC_FUNC_CHECK
#include "/usr/local/include/dmalloc.h"
#endif


void setupdata(dnadata **dna, long sites, long numseq)
/* allocate and initialize the dna data structure
 *
 *   dna     receives the newly allocated structure
 *   sites   number of sites (columns) per sequence
 *   numseq  number of sequences (rows)
 *
 * The sequence characters live in one contiguous zero-filled buffer;
 * seqs[i] points at the start of row i inside that buffer, so the whole
 * matrix is released by freeing seqs[0], seqs and the structure itself
 * (see freedata).  The program is terminated with an error message when
 * the dimensions are negative or memory cannot be obtained. */
{
  long i;
  size_t nrows, ncols;
  dnadata *newdna;

  if (sites < 0 || numseq < 0) {
    printf("ERROR: INVALID DATA DIMENSIONS:%5ld SEQUENCES OF%5ld SITES\n",
           numseq, sites);
    exit(-1);
  }

  /* always request at least one row and one cell: a zero-sized calloc
     may return NULL, and seqs[0] must exist for the code below */
  nrows = (numseq > 0) ? (size_t)numseq : 1;
  ncols = (sites > 0) ? (size_t)sites : 1;

  newdna = (dnadata *)calloc(1,sizeof(dnadata));
  if (newdna != NULL)
    newdna->seqs = (char **)calloc(nrows,sizeof(char *));
  /* calloc(nrows,ncols) checks the product for overflow itself */
  if (newdna != NULL && newdna->seqs != NULL)
    newdna->seqs[0] = (char *)calloc(nrows,ncols);

  if (newdna == NULL || newdna->seqs == NULL || newdna->seqs[0] == NULL) {
    printf("ERROR: OUT OF MEMORY WHILE ALLOCATING SEQUENCE DATA\n");
    exit(-1);
  }

  newdna->sites = sites;
  newdna->numseq = numseq;
  for (i=1;i<numseq;i++)
     newdna->seqs[i] = newdna->seqs[0] + i*sites;

  (*dna) = newdna;
} /* setupdata */


void freedata(dnadata *dna)
/* free the dna data structure
 *
 * Releases the contiguous character buffer (seqs[0]), the table of row
 * pointers (seqs) and finally the structure itself, mirroring the three
 * allocations made in setupdata.  Like free(), it accepts a NULL pointer
 * (and a structure whose seqs table was never allocated) and does
 * nothing for the missing parts. */
{
  if (dna == NULL)
    return;
  if (dna->seqs != NULL) {
    free(dna->seqs[0]);
    free(dna->seqs);
  }
  free(dna);
}

void getdata(tree *curtree, dnadata *dna, option_struct *op, FILE *infile,
  FILE *outfile)
{
  /* read sequences
   *
   * Reads dna->numseq species names (NMLNGTH characters each, stored in
   * curtree->nodep[]->nayme) and their sequences of dna->sites bases from
   * infile into dna->seqs.  When op->interleaved is set the file is read
   * in several passes: names appear only in the first pass and every
   * species must contribute the same number of bases per pass.  Otherwise
   * each species is read completely before the next one starts.
   *
   * Blanks and digits inside sequences are skipped, bases are upper-cased,
   * and '.' is replaced by the base of the first sequence at that site.
   * Any malformed input prints a message and terminates the program.
   *
   * When op->printdata is set the data are echoed to outfile in blocks
   * of 60 sites, with '.' marking agreement with the first sequence.
   *
   * NOTE(review): eoln() and eof() are helpers defined outside this
   * file; presumably they test the stream for end-of-line / end-of-file
   * without consuming input -- confirm against their definitions. */
  long i, j, k, l, basesread, basesnew;
  char ch;
  boolean allread, done;
  sequence y;

  basesnew = 0; /* just to be careful */
  y = dna->seqs;

  putc('\n', outfile);
  /* number of blanks between the "Name" and "Sequences" headings,
     clamped to the range NMLNGTH-1 .. 37 */
  j = NMLNGTH + (dna->sites + (dna->sites - 1) / 10) / 2 - 5;
  if (j < NMLNGTH - 1)
    j = NMLNGTH - 1;
  if (j > 37)
    j = 37;
  if (op->printdata) {
    fprintf(outfile, "Name");
    for (i = 1; i <= j; i++)
      putc(' ', outfile);
    fprintf(outfile, "Sequences\n");
    fprintf(outfile, "----");
    for (i = 1; i <= j; i++)
      putc(' ', outfile);
    fprintf(outfile, "---------\n\n");
  }
  basesread = 0;
  allread = false;
  while (!(allread)) {
    allread = true;
    /* discard the remainder of the current line, if we are at its end */
    if (eoln(infile)) {
      fscanf(infile, "%*[^\n]");
      getc(infile);
    }
    i = 1;
    while (i <= dna->numseq) {
      /* names are present on every species line in sequential format,
         but only in the first pass of interleaved format */
      if ((op->interleaved && basesread == 0) || !op->interleaved) {
        for (j = 0; j < NMLNGTH; j++) {
          curtree->nodep[i - 1]->nayme[j] = getc(infile);
          if (curtree->nodep[i - 1]->nayme[j] == '\n')
            curtree->nodep[i - 1]->nayme[j] = ' ';
          if (eof(infile) || eoln(infile)){
            printf("ERROR: END-OF-LINE OR END-OF-FILE IN THE MIDDLE OF A SPECIES NAME\n");
            exit(-1);
          }
        }
      }
      /* j counts the bases stored so far for species i */
      if (op->interleaved)
        j = basesread;
      else
        j = 0;
      done = false;
      while (((!done) && (!(eoln(infile) || eof(infile))))) {
        /* interleaved data: exactly one line per species per pass */
        if (op->interleaved)
          done = true;
        while (((j < dna->sites) && (!(eoln(infile) || eof(infile))))) {
          ch = getc(infile);
          if (ch == '\n')
            ch = ' ';
          if (ch == ' ' || (ch >= '0' && ch <= '9'))
            continue;
          /* <ctype.h> functions require an unsigned char value */
          ch = toupper((unsigned char)ch);
          /* compare the pointer itself (no truncating cast), and reject
             NUL explicitly because strchr would match the terminator */
          if (ch == '\0' || strchr("ABCDGHKMNRSTUVWXY?O-.",ch) == NULL){
            printf("ERROR: BAD BASE:%c AT POSITION%5ld OF SPECIES %3ld\n",
                   ch, j, i);
            exit(-1);
          }
          j++;
          /* '.' means "same as the first sequence at this site" */
          if (ch == '.')
            ch = y[0][j - 1];
          y[i - 1][j - 1] = ch;
        }
        if (op->interleaved)
          continue;
        /* sequential data: keep reading lines until all sites are in */
        if (j < dna->sites) {
          fscanf(infile, "%*[^\n]");
          getc(infile);
        } else if (j == dna->sites)
          done = true;
      }
      /* the first species fixes how many bases this pass must supply */
      if (op->interleaved && i == 1)
        basesnew = j;
      fscanf(infile, "%*[^\n]");
      getc(infile);
      if ((op->interleaved && j != basesnew) ||
         (!op->interleaved && j != dna->sites)){
        printf("ERROR: SEQUENCES OUT OF ALIGNMENT\n");
        exit(-1);}
      i++;
    }
    if (op->interleaved) {
      basesread = basesnew;
      allread = (basesread == dna->sites);
    } else
      allread = (i > dna->numseq);
  }
  if (!op->printdata)
    return;
  /* echo the data, 60 sites per block with a blank after every 10th */
  for (i = 1; i <= ((dna->sites - 1) / 60 + 1); i++) {
    for (j = 1; j <= dna->numseq; j++) {
      for (k = 0; k < NMLNGTH; k++)
        putc(curtree->nodep[j - 1]->nayme[k], outfile);
      fprintf(outfile, "   ");
      l = i * 60;
      if (l > dna->sites)
        l = dna->sites;
      for (k = (i - 1) * 60 + 1; k <= l; k++) {
        if (j > 1 && y[j - 1][k - 1] == y[0][k - 1])
          ch = '.';
        else
          ch = y[j - 1][k - 1];
        putc(ch, outfile);
        if (k % 10 == 0 && k % 60 != 0)
          putc(' ', outfile);
      }
      putc('\n', outfile);
    }
    putc('\n', outfile);
  }
}  /* getdata */

void makevalues(dnadata *dna, long categs, tree *curtree)
{
  /* set up fractional likelihoods at tips */
  long i, k, l;
  long b;
  sequence y;

  y = dna->seqs;

  for (k = 0; k < dna->sites; k++) {
    for (i = 0; i < dna->numseq; i++) {
      for (l = 0; l < categs; l++) {
	for (b = baseA; b <= baseT; b = b + 1)
	  curtree->nodep[i]->x[k][l][b] = 0.0;
	switch (y[i][k]) {

	case 'A':
	  curtree->nodep[i]->x[k][l][baseA] = 1.0;
	  break;

	case 'C':
	  curtree->nodep[i]->x[k][l][baseC] = 1.0;
	  break;

	case 'G':
	  curtree->nodep[i]->x[k][l][baseG] = 1.0;
	  break;

	case 'T':
	  curtree->nodep[i]->x[k][l][baseT] = 1.0;
	  break;

	case 'U':
	  curtree->nodep[i]->x[k][l][baseT] = 1.0;
	  break;

	case 'M':
	  curtree->nodep[i]->x[k][l][baseA] = 1.0;
	  curtree->nodep[i]->x[k][l][baseC] = 1.0;
	  break;

	case 'R':
	  curtree->nodep[i]->x[k][l][baseA] = 1.0;
	  curtree->nodep[i]->x[k][l][baseG] = 1.0;
	  break;

	case 'W':
	  curtree->nodep[i]->x[k][l][baseA] = 1.0;
	  curtree->nodep[i]->x[k][l][baseT] = 1.0;
	  break;

	case 'S':
	  curtree->nodep[i]->x[k][l][baseC] = 1.0;
	  curtree->nodep[i]->x[k][l][baseG] = 1.0;
	  break;

	case 'Y':
	  curtree->nodep[i]->x[k][l][baseC] = 1.0;
	  curtree->nodep[i]->x[k][l][baseT] = 1.0;
	  break;

	case 'K':
	  curtree->nodep[i]->x[k][l][baseG] = 1.0;
	  curtree->nodep[i]->x[k][l][baseT] = 1.0;
	  break;

	case 'B':
	  curtree->nodep[i]->x[k][l][baseC] = 1.0;
	  curtree->nodep[i]->x[k][l][baseG] = 1.0;
	  curtree->nodep[i]->x[k][l][baseT] = 1.0;
	  break;

	case 'D':
	  curtree->nodep[i]->x[k][l][baseA] = 1.0;
	  curtree->nodep[i]->x[k][l][baseG] = 1.0;
	  curtree->nodep[i]->x[k][l][baseT] = 1.0;
	  break;

	case 'H':
	  curtree->nodep[i]->x[k][l][baseA] = 1.0;
	  curtree->nodep[i]->x[k][l][baseC] = 1.0;
	  curtree->nodep[i]->x[k][l][baseT] = 1.0;
	  break;

	case 'V':
	  curtree->nodep[i]->x[k][l][baseA] = 1.0;
	  curtree->nodep[i]->x[k][l][baseC] = 1.0;
	  curtree->nodep[i]->x[k][l][baseG] = 1.0;
	  break;

	case 'N':
	  for (b = baseA; b <= baseT; b = b + 1)
	    curtree->nodep[i]->x[k][l][b] = 1.0;
	  break;

	case 'X':
	  for (b = baseA; b <= baseT; b = b + 1)
	    curtree->nodep[i]->x[k][l][b] = 1.0;
	  break;

	case '?':
	  for (b = baseA; b <= baseT; b = b + 1)
	    curtree->nodep[i]->x[k][l][b] = 1.0;
	  break;

	case 'O':
	  for (b = baseA; b <= baseT; b = b + 1)
	    curtree->nodep[i]->x[k][l][b] = 1.0;
	  break;

	case '-':
	  for (b = baseA; b <= baseT; b = b + 1)
	    curtree->nodep[i]->x[k][l][b] = 1.0;
	  break;
	}
      }
    }
  }
}  /* makevalues */

void empiricalfreqs(tree *curtree, dnadata *dna, long *weight)
{
  /* Get empirical base frequencies from the data.
   *
   * Starting from equal frequencies of 0.25, a fixed number of passes
   * is made over every tip and site.  In each pass a site contributes,
   * for each base, its weight times the share that base has in the
   * frequency-weighted tip likelihood (category 0 only); the four
   * totals are then normalized to sum to one and stored back in
   * dna->freqa, freqc, freqg and freqt for the next pass. */
  const long npasses = 8;
  long pass, tip, site;
  double fa, fc, fg, ft;          /* frequencies used during this pass */
  double tot_a, tot_c, tot_g, tot_t;
  double denom, wt;

  dna->freqa = 0.25;
  dna->freqc = 0.25;
  dna->freqg = 0.25;
  dna->freqt = 0.25;

  for (pass = 0; pass < npasses; pass++) {
    fa = dna->freqa;
    fc = dna->freqc;
    fg = dna->freqg;
    ft = dna->freqt;
    tot_a = tot_c = tot_g = tot_t = 0.0;

    for (tip = 0; tip < dna->numseq; tip++) {
      for (site = 0; site < dna->sites; site++) {
        wt = weight[site];

        /* frequency-weighted likelihood of this tip at this site */
        denom = fa * curtree->nodep[tip]->x[site][0][baseA];
        denom += fc * curtree->nodep[tip]->x[site][0][baseC];
        denom += fg * curtree->nodep[tip]->x[site][0][baseG];
        denom += ft * curtree->nodep[tip]->x[site][0][baseT];

        tot_a += wt * fa * curtree->nodep[tip]->x[site][0][baseA] / denom;
        tot_c += wt * fc * curtree->nodep[tip]->x[site][0][baseC] / denom;
        tot_g += wt * fg * curtree->nodep[tip]->x[site][0][baseG] / denom;
        tot_t += wt * ft * curtree->nodep[tip]->x[site][0][baseT] / denom;
      }
    }

    /* renormalize so the four frequencies sum to one */
    denom = tot_a + tot_c + tot_g + tot_t;
    dna->freqa = tot_a / denom;
    dna->freqc = tot_c / denom;
    dna->freqg = tot_g / denom;
    dna->freqt = tot_t / denom;
  }
}  /* empiricalfreqs */

void getbasefreqs(dnadata *dna, option_struct *op, double locus_ttratio,
  FILE *outfile)
{
  /* Report the base frequencies held in dna on outfile and derive the
   * quantities that depend on them: the purine/pyrimidine totals
   * (freqr, freqy), the within-class fractions (freqar, freqgr, freqcy,
   * freqty), the transition and transversion parameters (xi, xv), the
   * ratio ttratio = xi/xv, and fracchange.  When locus_ttratio yields a
   * non-positive xi, a warning is printed and xi, xv are reset to
   * 3/5 and 2/5. */
  double fa, fc, fg, ft;
  double aa, bb;

  fa = dna->freqa;
  fc = dna->freqc;
  fg = dna->freqg;
  ft = dna->freqt;

  fputc('\n', outfile);
  if (op->freqsfrom)
    fputs("Empirical ", outfile);
  fputs("Base Frequencies:\n\n", outfile);
  fprintf(outfile, "   A    %10.5f\n", fa);
  fprintf(outfile, "   C    %10.5f\n", fc);
  fprintf(outfile, "   G    %10.5f\n", fg);
  fprintf(outfile, "  T(U)  %10.5f\n", ft);

  /* purine (A+G) and pyrimidine (C+T) totals and the share of each
     base within its class */
  dna->freqr = fa + fg;
  dna->freqy = fc + ft;
  dna->freqar = fa / dna->freqr;
  dna->freqgr = fg / dna->freqr;
  dna->freqcy = fc / dna->freqy;
  dna->freqty = ft / dna->freqy;

  fprintf(outfile, "Transition/transversion ratio = %10.6f\n", locus_ttratio);

  aa = locus_ttratio * dna->freqr * dna->freqy -
       fa * fg - fc * ft;
  bb = fa * dna->freqgr + fc * dna->freqty;
  dna->xi = aa / (aa + bb);
  dna->xv = 1.0 - dna->xi;
  dna->ttratio = dna->xi / dna->xv;

  if (dna->xi <= 0.0) {
    /* NOTE(review): dna->ttratio keeps the value computed above and is
       not refreshed after this reset -- confirm that is intended */
    printf("WARNING: This transition/transversion ratio\n");
    printf("is impossible with these base frequencies!\n");
    dna->xi = 3.0 / 5;
    dna->xv = 2.0 / 5;
    fprintf(outfile, " Transition/transversion parameter reset\n\n");
  }
  fprintf(outfile, "(Transition/transversion parameter = %10.6f)\n",
          dna->xi / dna->xv);

  dna->fracchange = dna->xi * (2 * fa * dna->freqgr +
      2 * fc * dna->freqty) +
      dna->xv * (1.0 - fa * fa -
      fc * fc - fg * fg -
      ft * ft);
}  /* getbasefreqs */
