///////////////////////////////////////////////////////////////////////////// 
/* 
  Copyright 2001 Ronald S. Burkey 
 
  This file is part of GutenMark. 
 
  GutenMark is free software; you can redistribute it and/or modify 
  it under the terms of the GNU General Public License as published by 
  the Free Software Foundation; either version 2 of the License, or 
  (at your option) any later version. 
 
  GutenMark is distributed in the hope that it will be useful, 
  but WITHOUT ANY WARRANTY; without even the implied warranty of 
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
  GNU General Public License for more details. 
 
  You should have received a copy of the GNU General Public License 
  along with GutenMark; if not, write to the Free Software 
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 
  Filename:	MarkBody.c 
  Purpose:	This analyzes the body of the text. 
  Mods:		08/31/01 RSB	Began. 
  		09/03/01 RSB	N, S, E, and W added to honorifics (really 
				street names, but works the same). 
		09/04/01 RSB	Now replace '<' & '>' & '&', since they  
				can be mistaken for markup.  Add closing  
				markup on reaching end of file. 
		09/06/01 RSB	Process paragraphs to try and deduce  
				whether to make them justified or ragged.	 
		09/08/01 RSB	Lotsa cleanup and tweaking.	 
		09/15/01 RSB	Now account for the ~~ italics delimiters. 
				Also, the /italics/ format. 
				Added more heuristics for detecting verse. 
		11/02/01 RSB	Added GPL disclaimer and reformatted  
				somewhat for first web release.	 
		11/09/01 RSB	Now convert all strings of dashes of 
				length 2 or more to mdashes and ndashes. 
				In other words, "--" converts to &mdash; 
				"---" converts to &mdash;&ndash; 
				"----" converts to &mdash;&mdash; and so on. 
				These don't look good in html, but look 
				great in postscript or pdf.  Also, the 
				dash in " - " is converted to an mdash.	 
		11/12/01 RSB	Added detection of the PG file-ender. 
		11/13/01 RSB	Added smart quotes. 
		11/23/01 RSB	Added ALL-CAPs and foreign italicizing. 
		11/24/01 RSB	Centralized all of the markup-file output 
				operations into the AddMarkup function. 
				8-bit ASCII restoration. 
		11/25/01 RSB	Resolved conflicts between 8-bit restoration 
				and ALL-CAPS/foreign markup.	
		12/09/01 RSB	Fixed bug in which an ALL-CAPS word
				at the beginning of a section (or other
				odd circumstance) might be treated as being
				in the middle of a sentence.  Began adding
				support for --first-italics and 
				--first-capital.
		12/10/01 RSB	Modified Nbsp markup after colon or between
				sentences with SingleSpace.  Added some
				HTML beautification:  The stuff involving
				Status.LastFirstSpace is all used to put closing
				tags for things like paragraphs, headings,
				etc. BEFORE newlines rather than after them.
		12/11/01 RSB	Fixed that hanging <p> that was always being
				stuck at the file end due to the unused
				PG "end" line.  When backtracking through
				foreign words, it was possible to double-
				italicize words that had already been 
				italicized due to being ALL-CAPS.  The 
				ALL-CAPS condition now terminates the 
				backtrack list.	
		12/13/01 RSB	Added WeirdSequences, LikelyName, and
				WantFirstCharUpper.
		12/15/01 RSB	Split off MarkByChar from MarkBody to make
				the code look a little simpler.	
		12/16/01 RSB	Added additional code to the centering
				recognizer to distinguish block quotes 
				whose first and/or last lines are simply
				indented differently.  Also, added code
				to the blockquote processor for this case.
		12/27/01 RSB	Fixed a bug in which the markup for the
				PG ender would be at offset 0 if there was
				no header.
		12/30/01 RSB	Split MarkByLineHeuristic from the
				MarkBody function, to make the heuristic
				approach more easily replaceable by a
				neural-net approach.	
		01/01/02 RSB	Moved AddMarkup function to its own
				source file.  Split IsHeader into 
				IsHeaderNeural and IsHeaderHeuristic, each in
				its own source file.  Similarly, split 
				PrefatoryAnalysisPass function.	
		07/25/02 RSB	Increased the MarkBody line-buffer from 256 
				characters to 16384 characters.	Added a log
				record for long lines.								
   
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include "AutoMark.h"

//----------------------------------------------------------------------- 
// Normalizes a potential heading string in-place.  Returns the length of 
// the normalized string. 
//
// The normalization consists of:
//   1. Removing leading whitespace.
//   2. Removing trailing whitespace and punctuation.
//   3. Collapsing every run of whitespace into a single ' '.
//   4. Converting the result to upper case.
//
// The string never grows, so the caller's buffer is always big enough.
int
NormalizePotentialHeader (char *s)
{
  char *Src, *Dst;
  size_t Length;

  // Remove leading whitespace.  The source and destination overlap, so
  // memmove (rather than strcpy) is required.  The casts to unsigned
  // char keep the <ctype.h> calls well-defined for 8-bit characters.
  for (Src = s; isspace ((unsigned char) *Src); Src++);
  if (Src != s)
    memmove (s, Src, strlen (Src) + 1);

  // Also, remove any trailing spaces or punctuation.  Working with an
  // index avoids ever forming a pointer before the start of the buffer.
  Length = strlen (s);
  while (Length > 0
	 && (isspace ((unsigned char) s[Length - 1])
	     || ispunct ((unsigned char) s[Length - 1])))
    s[--Length] = '\0';

  // Get rid of multiple spaces and make everything upper-case, in one
  // compaction pass.  Dst never gets ahead of Src, so this is safe
  // in-place.  Only the last character of a whitespace run is emitted
  // (as a plain ' ').
  for (Src = Dst = s; *Src; Src++)
    {
      unsigned char c = (unsigned char) *Src;

      if (isspace (c))
	{
	  if (isspace ((unsigned char) Src[1]))
	    continue;
	  *Dst++ = ' ';
	}
      else
	*Dst++ = (char) toupper (c);
    }
  *Dst = '\0';

  return ((int) (Dst - s));
}

//--------------------------------------------------------------------------- 
// Checks a line to see if it's a match against the buffered prefatory lines. 
//
// The line is copied, normalized with NormalizePotentialHeader, and then
// compared against each of the Dataset->NumNonTrivialPrefatoryLines
// entries of Dataset->PrefatoryLines.  An entry matches when the
// normalized line begins with it (over Dataset->PrefatoryLineSizes[i]
// characters) and the normalized line is less than twice that size.
//
// Returns 1 on a match, 0 otherwise.  Also returns 0 if Line is NULL or
// if the working copy cannot be allocated.
int
MatchesPrefatoryLine (AnalysisDataset * Dataset, char *Line)
{
  char *Normalized;
  size_t Length;
  int i, Matched = 0;

  if (NULL == Line)
    return (0);

  // The working copy is sized to the line itself.  A fixed 256-byte
  // buffer is not safe here, since input lines can be far longer.
  Length = strlen (Line);
  Normalized = malloc (Length + 1);
  if (NULL == Normalized)
    return (0);
  memcpy (Normalized, Line, Length + 1);

  NormalizePotentialHeader (Normalized);
  Length = strlen (Normalized);	// Loop-invariant, so compute it once.

  for (i = 0; i < Dataset->NumNonTrivialPrefatoryLines; i++)
    if (0 ==
	strncmp (Normalized, Dataset->PrefatoryLines[i],
		 Dataset->PrefatoryLineSizes[i]) && Length <
	2 * Dataset->PrefatoryLineSizes[i])
      {
	Matched = 1;
	break;
      }

  free (Normalized);
  return (Matched);
}

//------------------------------------------------------------------------ 
// Heading recognizer.  A return of 0 means the line is not believed to be
// a header.  This is only a dispatcher:  when Dataset->ForceNeural is set
// the decision is delegated to IsHeaderNeural, and otherwise to
// IsHeaderHeuristic.  All arguments are passed through untouched.
//
// The BufferedLines array (BUFFERED_LINES entries) should have been
// pre-loaded.  Entry PRE_OR_POST_LINES is the current line, while the
// other entries are earlier and later lines from the file. 
int
IsHeader (LineRecord * BufferedLines, int *LastWasHeader1, int LineNum,
	  AnalysisDataset * Dataset, char *Line)
{
  if (!Dataset->ForceNeural)
    return (IsHeaderHeuristic (BufferedLines, LastWasHeader1, LineNum,
			       Dataset, Line));

  return (IsHeaderNeural (BufferedLines, LastWasHeader1, LineNum, Dataset,
			  Line));
}

//--------------------------------------------------------------------- 
// Reports whether a character can serve as end-of-phrase punctuation.
// Returns 1 for any of  . ? ! : ; ) , } ]  and 0 for everything else.
int
IsEndPunct (char c)
{
  switch (c)
    {
    case '.':
    case '?':
    case '!':
    case ':':
    case ';':
    case ')':
    case ',':
    case '}':
    case ']':
      return (1);
    default:
      return (0);
    }
}

// Tests if a character ends a word:  either the string terminator or any
// whitespace character.  Returns 1 if so, 0 otherwise.
int
IsEndSpace (char c)
{
  // The cast keeps isspace well-defined for 8-bit characters, which are
  // negative when plain char is signed.
  return (!c || isspace ((unsigned char) c));
}

//--------------------------------------------------------------------- 
// The method of analysis uses several passes.  The first pass collects  
// data about all the lines of the file.  The last pass advances through  
// the file a line at a time, keeping in memory a window of the  
// pre-collected line data in which several lines prior to the current line  
// and several lines after the current line are available for quick access.   
// This might allow, for example, detecting a line preceded and followed  
// by a couple of blank lines as being a header. 
//
// Return values:
//    0   Success.
//    2   The first-pass temporary file could not be created.
//    3   LineAnalysisPass failed.
//    4   Premature end of the input file, or StanfordPass failed.
//    5   Disk-write error (reported by AddMarkup, MarkByChar, or the
//        per-line marker).
//   -1   The per-line marker returned an unrecognized code.
//
// On every path the temporary line file (if it was opened) is closed and
// the markup buffer is flushed before returning.

// Line buffer for the final pass.  It is static rather than automatic
// because of its size.
static char s[16384];
int
MarkBody (AnalysisDataset * Dataset)
{
  MarkStatus Status = { 0 };
  int ReturnValue = 1, AbbyNormalCount = 0;
  LineRecord DefaultLine = {
    0
  };
  int i, j;

  Status.BlockIndentation = 0;
  Status.TripSquote = -1;
  Status.SentenceStart = 1;
  Status.ParagraphType = MarkBeginJustifiedParagraph;

  Dataset->LineFile = tmpfile ();
  if (Dataset->LineFile == NULL)
    {
      fprintf (stderr, "Cannot create first-pass temporary file.\n");
      ReturnValue = 2;
      goto Done;
    }

  //--------------------------------------------------------------------- 
  // Run the first analysis pass. 
  if (0 != (Status.ErrorCode = LineAnalysisPass (Dataset)))
    {
      fprintf (stderr, "Line-analysis failed (code = 0x%X).\n",
	       Status.ErrorCode);
      ReturnValue = 3;
      goto Done;
    }

  //---------------------------------------------------------------------- 
  // Another analysis pass that attempts to determine where the title-page & 
  // contents are.  In other words, the area between the PG header and the 
  // actual text.  
  if (Dataset->ForceNeural)
    PrefatoryAnalysisPassNeural (Dataset);
  else
    PrefatoryAnalysisPassHeuristic (Dataset);
  Dataset->NumPrefatoryMatched = 0;

  //---------------------------------------------------------------------- 
  // Run the "final" analysis pass. 
  // Set up the data structures. 
  s[sizeof (s) - 1] = '\0';
  fseek (Dataset->LineFile, 0, SEEK_SET);
  fseek (Dataset->InputFile, Dataset->TextStart, SEEK_SET);

  // The window starts with PRE_OR_POST_LINES default (out-of-range, empty)
  // records standing in for the non-existent lines before the first line.
  // The rest is filled from the line file, with default records wherever
  // the file runs out.
  DefaultLine.OutOfRange = 1;
  DefaultLine.Empty = 1;
  for (i = 0; i < PRE_OR_POST_LINES; i++)
    Status.BufferedLines[i] = DefaultLine;
  for (; i < BUFFERED_LINES; i++)
    {
      if (1 !=
	  fread (Status.BufferedLines + i, sizeof (LineRecord), 1,
		 Dataset->LineFile))
	Status.BufferedLines[i] = DefaultLine;
    }
  Status.InParagraph = Status.InBlockquote = Status.InHeader1 =
    Status.LastWasHeader1 = Status.InSubtitle = Status.InTable =
    Status.Versifying = Status.LastVersifying = 0;
  Status.LineInfo = &Status.BufferedLines[PRE_OR_POST_LINES];

  // Okay, here's where we actually loop through the lines of the input file. 
  for (i = 0; i < Dataset->NumLines; i++)
    {
      extern int rfgets_size;

      // Fetch the current line of the input file.  The window of buffered 
      // LineRecord data is already set up properly. 
      if (NULL == rfgets (s, sizeof (s) - 1, Dataset->InputFile))
	{
	  fprintf (stderr, "Premature end of input file.\n");
	  ReturnValue = 4;
	  goto Done;
	}

      // Count (and optionally log) lines longer than 90 characters.  The
      // count drives the warning printed at the end.
      if (rfgets_size > 90)
	{
	  AbbyNormalCount++;
	  if (Dataset->LogFile != NULL)
	    fprintf (Dataset->LogFile,
		     "Warning: Input line #%d is abnormally long (%d chars).\n",
		     i + 1, rfgets_size);
	}

      // Now do stuff related to the entire line:
      if (Dataset->ForceNeural)
	j = MarkByLineNeural (Dataset, &Status, i, s);
      else
	j = MarkByLineHeuristic (Dataset, &Status, i, s);
      switch (j)
	{
	case 5:
	  goto DiskError;
	case -1:
	  goto AtEnd;
	case 0:
	  break;
	default:
	  // Unknown error type.  Leave through the common exit so that the
	  // temporary file is closed and the markup buffer is flushed.
	  ReturnValue = -1;
	  goto Done;
	}

      // Now do character-by-character and word-by-word stuff.
      if (MarkByChar (Dataset, &Status, s))
	goto DiskError;

      // *** All done with this line! *** 
      // Advance the LineRecord window.  Note, if it isn't obvious, that  
      // it's okay for the window to move past the end of file, since  
      // it is filled with default blank lines in this case. 
      for (j = 0; j < BUFFERED_LINES - 1; j++)
	Status.BufferedLines[j] = Status.BufferedLines[j + 1];
      if (1 !=
	  fread (Status.BufferedLines + j, sizeof (LineRecord), 1,
		 Dataset->LineFile))
	Status.BufferedLines[j] = DefaultLine;
    }

AtEnd:

  // We're at the end, but may still be in a heading, paragraph, etc. 
  // Close whatever is still open, at the current input-file position.
  if (Status.Italicizing)
    {
      if (AddMarkup (Dataset, ftell (Dataset->InputFile), MarkEndItalics, 0))
	goto DiskError;
    }
  if (Status.InSubtitle)
    {
      if (AddMarkup (Dataset, ftell (Dataset->InputFile), MarkEndSubtitle, 0))
	goto DiskError;
    }
  if (Status.InTable)
    {
      if (AddMarkup (Dataset, ftell (Dataset->InputFile), MarkEndTable, 0))
	goto DiskError;
    }
  if (Status.InParagraph)
    {
      if (AddMarkup
	  (Dataset, ftell (Dataset->InputFile), MarkEndParagraph, 0))
	goto DiskError;
    }
  if (Status.InBlockquote)
    {
      if (AddMarkup
	  (Dataset, ftell (Dataset->InputFile), MarkEndBlockquote, 0))
	goto DiskError;
    }
  if (Status.InHeader1)
    {
      if (AddMarkup (Dataset, ftell (Dataset->InputFile), MarkEndHeader1, 0))
	goto DiskError;
    }

  // Mark the PG ender at the current line's offset, but not if that
  // offset is 0.
  if (Status.LineInfo[0].Offset != 0)
    if (AddMarkup (Dataset, Status.LineInfo[0].Offset, MarkGutenbergEnder, 0))
      goto DiskError;

  //--------------------------------------------------------------------- 
  // Run Stanford natural-language parser pass. 
  if (Dataset->UseStanfordParser)
    {
      if (0 != (Status.ErrorCode = StanfordPass (Dataset)))
	{
	  fprintf (stderr, "Stanford NLP pass failed (code = 0x%X).\n",
		   Status.ErrorCode);
	  ReturnValue = 4;
	  goto Done;
	}
    }

  ReturnValue = 0;

Done:
  // Common exit.  LineFile is NULL here if tmpfile failed, and fclose
  // must not be called on a null pointer.
  if (Dataset->LineFile != NULL)
    {
      fclose (Dataset->LineFile);
      Dataset->LineFile = NULL;
    }
  AddMarkup (Dataset, -1, MarkNoMoreMarks, 0);	// Flush the buffer. 
  if (AbbyNormalCount)
    {
      fprintf (stderr,
	       "***********************************************************\n"
	       "                        WARNING\n"
	       "%d abnormally long line(s) were found in the input file.\n"
	       "These are usually editing errors in the Project Gutenberg text\n"
	       "file.  GutenMark may not format these lines correctly.  Consider\n"
	       "submitting a bug report -- about the text file, and NOT\n"
	       "about GutenMark -- directly to Project Gutenberg.  To get\n"
	       "more detail, run GutenMark using the --debug option\n"
	       "and search the GutenMark.log file for \"abnormally long\".\n"
	       "There is a temporary workaround, if GutenMark's formatting is\n"
	       "unacceptable.  You can manually edit the input text file, and\n"
	       "add hard carriage-returns to shorten the abnormal line(s).\n"
	       "Then run GutenMark again.\n", AbbyNormalCount);
      if (Dataset->LogFile != NULL)
	fprintf (Dataset->LogFile,
		 "%d abnormally long line(s) found in the input file.\n",
		 AbbyNormalCount);
    }
  return (ReturnValue);

DiskError:
  fprintf (stderr, "Disk-write error.\n");
  ReturnValue = 5;
  goto Done;
}
