/////////////////////////////////////////////////////////////////////////////
/*
  Copyright 2001,2002,2003,2008 Ronald S. Burkey.
  Latex support Copyright 2001 Joe Cherry.

  This file is part of GutenMark.

  GutenMark is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  GutenMark is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with GutenMark; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

  Filename:	AutoMark.c 
  Purpose:	This is the main program for AutoMark, a vanilla-ANSI C 
  		program that marks up a vanilla-ASCII Project Gutenberg 
		(or similar) text file to HTML, with the intention of 
		making it printable on demand, or very beautifully readable.  
		In other words, Project Gutenberg has provided the content, 
		but now I want to make that content aesthetically pleasing.
  Mods:		08/30/01 RSB	Began.
  		09/03/01 RSB	Added MarkInsertChar.
		09/04/01 RSB	Some things that produced invalid HTML were 
				fixed:  '>', '<', '&' within body of the 
				text.  Empty <head></head> (no <title>).
				Empty anchor after Gutenberg header.  Open 
				paragraphs, headings, block quotes at end of 
				file.	
		09/06/01 RSB	Now allow output file as command-line 
				argument.  Also, allow some command-line 
				options (--no-justify, --help).  Added a lot 
				more processing on the "title".		
		09/08/01 RSB	Area between the PG header and the actual text, 
				which I call the "prefatory" area, is now
				detected and used essentially without markup.  
				Phrases found in this area (often a table of 
				contents) are used preferentially to detect 
				headings in the text and (conversely) to 
				reject identifications as headings.  MANY 
				additional tweaks performed.  Tables are now 
				handled, but only as "preformatted text" and 
				not as HTML tables.
		09/09/01 RSB	Again, numerous changes.
		09/15/01 RSB	Now handles PG so-called "8-bit ASCII".  A 
				lot more cleanup, such as block quotes, new 
				types of italic styles, and so on.
		09/16/01 RSB	Here is just a trivial remark added for 
				checking CVS.
		11/02/01 RSB	Added GPL disclaimer, and cleaned up the 
				formatting, prior to first web release.	
				Added startup messages.	
		11/09/01 RSB	Convert strings of dashes (>1) to mdashes
				and ndashes.	
		11/12/01 RSB	Added the --yes-header option,
				and PG header/ender stuff.
		11/13/01 RSB	Added smart quotes.
		11/19/01 RSB	Added first cut at ALL-CAPS italicizing.
				Added a GutenMark declaration at the 
				beginning of the output.
		11/24/01 RSB	Added --no-foreign.
		11/25/01 RSB	Fixed bug in parsing the --profile switch.
				Added --no-diacritical switch.
				Began adding Joe Cherry's --latex patch.
				(Still need to modify the patch to handle
				the GutenMark disclaimer.)
		11/26/01 RSB	Completed Latex patch.  Joe indicates he's
				working on some of the problems I've pointed
				out.	
		11/29/01 RSB	Added wordlist output (GutenMark.native &
				GutenMark.foreign) to --debug output.
		12/05/01 RSB	Removed compilation date from prefatory-area
				markup, to allow the regression test during
				compilation to work properly.  Began adding
				the --first-capital and --first-italics
				options.
		12/10/01 RSB	Added SingleSpace.  Added some HTML 
				beautification:  Now there are quotes around
				attributes in HTML tags, and all HTML tags
				and entities are consistently lower case.
				Also, HTML entities now default to symbolic
				(like &rdquo;) but can be forced to numeric
				(like &#8221;) with the '--force-numeric'
				switch.	Fixed an unnoticed bug in the HTML
				in which if the PG header was discarded there
				would be a </pre> without a <pre>.
		12/11/01 RSB	HTML 4.0 file-header replaced with HTML 3.2
				file-header to make the "tidy" utility
				(an HTML-validator and tidier-upper) happy.
		12/15/01 RSB	Added paragraph line-length normalization and
				removal of multiple newlines in the raw HTML
				to make it look prettier; won't affect 
				appearance in browser.	
		12/16/01 RSB	--force-symbolic switch added, and entity
				support default changed to numeric.
		12/30/01 RSB	Added --force-neural, --force-rule, and
				--neural-learn.	
		01/13/02 RSB	Split all actual writing to the output file
				into separate functions OutputHtml, 
				OutputLatex, OutputXml, ... (in separate
				source files) to allow easier addition
				and/or maintenance of output formats.
				Also, added a command-line switch
				(--xml) to activate XML output.  (There
				currently isn't any working XML output,
				though.)
		01/18/02 RSB	Added --no-prefatory and --page-breaks.	
		06/15/02 RSB	Added --no-parskip. 
		06/30/02 RSB	Added a few new log messages.
		07/13/02 RSB	Added --config.	
		07/20/02 RSB	Added argv[] to logfile.
		07/21/02 RSB	Now works properly with PATH, due to the 
				addition of RelativeWhich.	
		08/05/02 RSB	Added --latex-sections.
		08/08/02 RSB	Added --no-toc, --author, and --title.
		08/09/02 RSB	Added --caps-ok.
		11/20/02 RSB	Changed my email address from birdsproject
				to sandroid.
		11/22/02 RSB	Added --mdash-size
		12/23/02 RSB	Fixed alignment of --latex-sections in the
				--help output.	Also, added the "--ron" switch.	
		01/05/03 RSB	Changed --mdash-size default to 3.
		03/01/08 RSB	Added the "Stanford Parser" pass.
		04/20/08 RSB	--ron switch corrected to have 3 hyphens per 
				mdash.
		05/27/08 RSB	I notice that the --help screen still shows the
				name of the program as "AutoMark".  Well, it's
				fixed now.											
								
  The program has to handle many special cases, such as differing paragraph 
  styles (indentation vs. no indentation, extra blank lines between paragraphs
  vs. no extra blank lines, etc.), differing styles for indicating italics 
  (or bold, or underlining), hyphenation vs. no hyphenation, Gutenberg vs. 
  non-Gutenberg, etc.  However, I want this program to be AUTOMATIC, so I'm 
  holding off on adding command-line switches for ANYTHING.  The program will 
  analyze the text to determine the styles, and if extra passes are necessary 
  to accomplish this, then so be it!
  
  The program works as follows:
  	AutoMark [options] [inputfile [outputfile]]
  If outputfile and/or inputfile are missing, then the stdout and/or stdin 
  are used.	
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <zlib.h>
#include "AutoMark.h"

//--------------------------------------------------------------------------

// Program entry point.  Parses the command-line switches and the optional
// input/output filenames into an AnalysisDataset, builds the in-memory
// wordlist for the etext, flags the most frequently used words, matches
// the words against the global wordlists, runs the markup analysis into
// a temporary file, and finally writes the result as XML, LaTeX, or HTML.
// Returns 0 after printing --help, 1 on any error detected here, and
// otherwise whatever the selected Output... function returns.

int
main (int argc, char *argv[])
{
#define MAX_FREQUENT 100
  int ReturnValue;
  struct
  {
    int Index;
    int Count;
  }
  Frequents[MAX_FREQUENT];
  AnalysisDataset Dataset = { NULL };
  int ErrorCode, i, j, k;
  char Profile[64] = "english";
  char *AltCfg = NULL;
  int FilesFound = 0;
  FILE *OutputFile;
  char *InputFilename = NULL, *ExecutableFilename;

  // Establish the defaults, which the command-line switches may override.
  Dataset.MdashSize = 3;
  Dataset.ForceNumeric = 1;
  Dataset.ForceNeural = 0;
  Dataset.InputFile = stdin;
  Dataset.LogFile = NULL;

  Dataset.UseStanfordParser = 0;
  Dataset.PathToJava = "java";
  Dataset.PathToParser = "";

  OutputFile = stdout;

  // The banner goes to stderr so that it cannot pollute output written
  // to stdout.
  fprintf (stderr, "GutenMark PG-to-HTML markup tool, build " __DATE__ "\n");
  fprintf (stderr,
           "Copyright (C) 2001-2003,2008 Ronald Burkey (info@sandroid.org)\n");
  fprintf (stderr,
           "LaTeX support Copyright (C) 2001 Joe Cherry (joe@vimaster.org)\n");
  fprintf (stderr, "GutenMark comes with ABSOLUTELY NO WARRANTY.\n");
  fprintf (stderr, "This is free software, licensed under the GNU GPL.\n");

  // Process the command line.  Anything beginning with '-' is treated as
  // a switch; the first other argument is the input file and the second
  // is the output file.
  for (i = 1; i < argc; i++)
    {
      if (*argv[i] == '-')
        {
          if (0 == strcmp (argv[i], "--help"))
            {
              printf
                ("----------------------------------------------------\n");
              printf ("USAGE:");
              printf
                ("\tGutenMark [Options] [InputTxtFile [OutputHtmlFile]]\n");
              printf
                ("If the output file and/or input file are missing, "
                 " then the standard\n");
              printf ("output and/or input are used.  "
                      "The available options are:\n");
              // 08/08/02 RSB
              printf
                ("\t\"--author=name\"  Overrides the name of the author, as\n"
                 "\t                 deduced from the input file, with a\n"
                 "\t                 string of your own choosing.  The\n"
                 "\t                 quotes are usually necessary to\n"
                 "\t                 compensate for blanks within the name.\n");
              // 08/09/02 RSB
              printf
                ("\t--caps-ok        Normally, ALL-CAPS phrases are converted\n"
                 "\t                 automatically to italicized phrases.  The\n"
                 "\t                 --caps-ok switch can be used to override \n"
                 "\t                 this behavior if, for example, you happen\n"
                 "\t                 to know that emphasis is provided by\n"
                 "\t                 _underscoring_ or by some other supported\n"
                 "\t                 technique in the input text.\n");
              // 07/13/02 RSB
              printf ("\t--config=path    Path to non-default CFG file.\n");
              printf ("\t--debug          Create optional logfiles.\n");
              // 12/09/01 RSB
              printf
                ("\t--first-capital  1st word of chapter can be ALL-CAPs\n");
              // 12/09/01 RSB
              printf
                ("\t--first-italics  1st word of chapter can be italics\n");
              // 12/30/01 RSB
              printf
                ("\t--force-neural   Uses the neural-net method for identifying\n"
                 "\t                 headings, verse, paragraphs, etc.\n"
                 "\t                 Cannot be used with --force-rule.\n");
              // 12/10/01 RSB
              printf
                ("\t--force-numeric  (Default.) Makes HTML entities numeric.\n");
              // 12/30/01 RSB
              printf
                ("\t--force-rule     (Default.) Uses rule-based method\n"
                 "\t                 for identifying headings, verse, etc.\n"
                 "\t                 Cannot be used with --force-neural.\n");
              printf ("\t--force-symbolic Makes HTML entities symbolic.\n");
              printf ("\t--help           Displays this information.\n");
              printf ("\t--latex          Output LaTeX instead of html.\n");
              printf ("\t--latex-sections In LaTeX, use section* rather\n"
                      "\t                 than chapter*.\n");
              printf ("\t--mdash-size=N   In LaTeX, allows the width of an\n");
              printf ("\t                 mdash to be adjusted.  The default\n");
              printf ("\t                 is 3.\n");
              printf
                ("\t--neural-learn   Put neural net into learning mode.\n"
                 "\t                 Implies --force-neural.\n");
              printf
                ("\t--no-diacritical Turn off restoration of diacritical\n"
                 "\t                 marks.  (On by default.)\n");
              printf ("\t--no-foreign     Turn off automatic italicizing of\n"
                      "\t                 foreign text.  (On by default.)\n");
              printf ("\t--no-justify     Allows ragged right margin.\n"
                      "\t                 Default is justified.\n");
              printf ("\t--no-mdash       Keeps strings of dashes intact.\n"
                      "\t                 Default converts to mdashes/ndashes.\n");
              // 06/15/02 RSB
              printf
                ("\t--no-parskip     Used with --latex only.  Allows LaTeX\n"
                 "\t                 to indent paragraphs.  (The default\n"
                 "\t                 behavior is unindented paragraphs \n"
                 "\t                 separated by blank lines.\n");
              // 01/18/02 RSB
              printf
                ("\t--no-prefatory   Adds invisible comments to the HTML\n"
                 "\t                 that allow post-processing by the html2ps\n"
                 "\t                 program to remove the prefatory area.\n"
                 "\t                 (Still appears in the HTML, though.)\n");
              // 08/08/02 RSB
              printf
                ("\t--no-toc         By default, a table of contents is\n"
                 "\t                 added to LaTeX output.  This switch\n"
                 "\t                 removes the table of contents.\n");
              // 01/18/02 RSB
              printf
                ("\t--page-breaks    Adds invisible comments to the HTML\n"
                 "\t                 that allow post-processing by the html2ps\n"
                 "\t                 to add page breaks before headings.\n"
                 "\t                 (HTML browses the same, though.)\n");
              printf
                ("\t--path-to-java=P Useful only if the --stanford switch (see\n"
                 "\t                 below) is also used.  P is the pathname of\n"
                 "\t                 the command that runs the Java interpreter.\n"
                 "\t                 It defaults simply to \"java\", which is\n"
                 "\t                 probably correct in most cases.  It will\n"
                 "\t                 *not* be correct in a normal installation\n"
                 "\t                 of Fedora Linux (and probably a number of\n"
                 "\t                 other Linux distributions) because in those\n"
                 "\t                 cases the simple command \"java\" runs the\n"
                 "\t                 GNU gij program instead of normal Sun Java,\n"
                 "\t                 even when the Sun JDK is installed.  For \n"
                 "\t                 example, on my Fedora Core 5 workstation, Sun\n"
                 "\t                 JDK 1.5.0-11 is installed, but I have to use\n"
                 "\t                 --path-to-java=/usr/java/jre1.5.0_11/bin/java.\n"
                 "\t                 You can tell what Java you have by using the\n"
                 "\t                 command \"java -version\"\n");
              printf
                ("\t--path-to-parser=D Useful only if the --stanford switch (see\n"
                 "\t                 below) is also used.  D is the name of the \n"
                 "\t                 directory that contains the database files\n"
                 "\t                 used by the Stanford Parser.  In particular, it\n"
                 "\t                 determines where the files stanford-parser.jar\n"
                 "\t                 and englishPCFG.ser.gz are located.  By default,\n"
                 "\t                 D is empty, so those files are expected to be in\n"
                 "\t                 the directory from which you are running the\n"
                 "\t                 GutenMark program itself.  If you use some other\n"
                 "\t                 directory, be sure include a trailing slash in\n"
                 "\t                 D.  (\'\\\' in Windows or \'/\' in Linux.)\n");
              printf
                ("\t--profile=xxx    Select language xxx.  Actually, these\n"
                 "\t                 names refer not to languages, but rather\n"
                 "\t                 to names of profiles in GutenMark.cfg.\n"
                 "\t                 But the profiles are usually named for\n"
                 "\t                 a particular language.\n");
              // 12/10/01 RSB
              printf
                ("\t--single-space   Allow single space between sentences.\n");
              // 03/01/08 RSB
              printf
                ("\t--stanford       Activate the \"Stanford Parser\" pass.\n"
                 "\t                 This pass is disabled by default because\n"
                 "\t                 it requires so very much time to run, but\n"
                 "\t                 I *hope* can improve the output quality.\n");
              printf
                ("\t\"--title=name\"   Overrides the book\'s title, as\n"
                 "\t                 deduced from the input file, with a\n"
                 "\t                 string of your own choosing.  The\n"
                 "\t                 quotes are usually necessary to\n"
                 "\t                 compensate for blanks within the name.\n"
                 "\t                 If the title itself contains quotes,\n"
                 "\t                 use single-quotes instead.\n");
              // 01/13/02 RSB
              printf
                ("\t--xml            Allow XML output (experimental only).\n");
              printf
                ("\t--yes-header     Causes PG file-header to be retained.\n");
              // 12/25/02 RSB
              printf
                ("\t--ron            Activates various customizations that I (RSB)\n"
                 "\t                 like.  Includes --latex --no-parskip --mdash-size=3.\n"
                 "\t                 (Also, sets the page size to 5.5x8.5 inches,\n"
                 "\t                 font to 12pt. New Century Schoolbook, and other\n"
                 "\t                 stuff that GutenMark otherwise cannot affect.)\n");
              return (0);
            }
          else if (!strcmp (argv[i], "--stanford"))
            {
              Dataset.UseStanfordParser = 1;
              continue;
            }
          else if (!strncmp (argv[i], "--path-to-java=", 15))
            {
              // Points into argv[], which outlives the Dataset.
              Dataset.PathToJava = &argv[i][15];
              continue;
            }
          else if (!strncmp (argv[i], "--path-to-parser=", 17))
            {
              Dataset.PathToParser = &argv[i][17];
              continue;
            }
          else if (0 == strcmp (argv[i], "--no-justify"))
            {
              Dataset.NoJustify = 1;
              continue;
            }
          // 08/09/02 RSB.
          else if (0 == strcmp (argv[i], "--caps-ok"))
            {
              Dataset.CapsOk = 1;
              continue;
            }
          else if (0 == strcmp (argv[i], "--no-mdash"))
            {
              Dataset.NoMdash = 1;
              continue;
            }
          else if (0 == strcmp (argv[i], "--yes-header"))
            {
              Dataset.YesHeader = 1;
              continue;
            }
          else if (0 == strcmp (argv[i], "--no-foreign"))
            {
              Dataset.NoForeign = 1;
              continue;
            }
          else if (0 == strcmp (argv[i], "--no-diacritical"))
            {
              Dataset.NoDiacritical = 1;
              continue;
            }
          // 12/09/01 RSB
          else if (0 == strcmp (argv[i], "--first-capital"))
            {
              // --first-capital and --first-italics are mutually
              // exclusive; the last one given wins.
              Dataset.FirstCapital = 1;
              Dataset.FirstItalics = 0;
              continue;
            }
          // 12/09/01 RSB
          else if (0 == strcmp (argv[i], "--first-italics"))
            {
              Dataset.FirstCapital = 0;
              Dataset.FirstItalics = 1;
              continue;
            }
          // 12/10/01 RSB
          else if (0 == strcmp (argv[i], "--single-space"))
            {
              Dataset.SingleSpace = 1;
              continue;
            }
          // 12/10/01 RSB
          else if (0 == strcmp (argv[i], "--force-numeric"))
            {
              Dataset.ForceNumeric = 1;
              continue;
            }
          // 12/16/01 RSB
          else if (0 == strcmp (argv[i], "--force-symbolic"))
            {
              Dataset.ForceNumeric = 0;
              continue;
            }
          // 12/30/01 RSB
          else if (0 == strcmp (argv[i], "--force-neural"))
            {
              Dataset.ForceNeural = 1;
              continue;
            }
          // 12/30/01 RSB
          else if (0 == strcmp (argv[i], "--force-rule"))
            {
              Dataset.ForceNeural = 0;
              continue;
            }
          // 12/25/02 RSB
          else if (0 == strcmp (argv[i], "--ron"))
            {
              Dataset.ron = 1;
              Dataset.NoParskip = 1;
              Dataset.MdashSize = 3;
              Dataset.Latex = 1;
              continue;
            }
          // 12/30/01 RSB
          else if (0 == strcmp (argv[i], "--neural-learn"))
            {
              Dataset.NeuralLearn = 1;
              continue;
            }
          // 01/13/02 RSB
          else if (0 == strcmp (argv[i], "--xml"))
            {
              Dataset.Xml = 1;
              continue;
            }
          // 01/18/02 RSB
          else if (0 == strcmp (argv[i], "--no-prefatory"))
            {
              Dataset.NoPrefatory = 1;
              continue;
            }
          // 01/18/02 RSB
          else if (0 == strcmp (argv[i], "--page-breaks"))
            {
              Dataset.PageBreaks = 1;
              continue;
            }
          else if (0 == strcmp (argv[i], "--debug"))
            {
              // If the logfile cannot be created, LogFile stays NULL and
              // logging is simply skipped everywhere below.
              Dataset.LogFile = fopen ("GutenMark.log", "w");
              continue;
            }
          else if (0 == strncmp (argv[i], "--profile=", 10))
            {
              // Over-long profile names are silently ignored, leaving
              // the previous (default) profile in place.
              if (strlen (&argv[i][10]) < sizeof (Profile))
                strcpy (Profile, &argv[i][10]);
              continue;
            }
          // 07/13/02 RSB
          else if (0 == strncmp (argv[i], "--config=", 9))
            {
              AltCfg = 9 + argv[i];
              //fprintf (stderr, "Using alternate configuration \"%s\"\n", AltCfg);
              continue;
            }
          else if (0 == strcmp (argv[i], "--latex"))
            {
              Dataset.Latex = 1;
              continue;
            }
          // 08/05/02 RSB
          else if (0 == strcmp (argv[i], "--latex-sections"))
            {
              Dataset.LatexSections = 1;
              continue;
            }
          // 08/08/02 RSB
          else if (0 == strcmp (argv[i], "--no-toc"))
            {
              Dataset.NoToc = 1;
              continue;
            }
          // 08/08/02 RSB
          else if (0 == strncmp (argv[i], "--author=", 9))
            {
              // Names that do not fit are silently ignored.
              if (strlen (&argv[i][9]) < MAX_TITLE - 1)
                strcpy (Dataset.AuthorBuffer, &argv[i][9]);
              continue;
            }
          // 08/08/02 RSB
          else if (0 == strncmp (argv[i], "--title=", 8))
            {
              if (strlen (&argv[i][8]) < MAX_TITLE - 1)
                strcpy (Dataset.TitleBuffer, &argv[i][8]);
              continue;
            }
          // 11/22/02 RSB
          else if (0 == strncmp (argv[i], "--mdash-size=", 13))
            {
              Dataset.MdashSize = atoi (&argv[i][13]);
              continue;
            }
          // 06/15/02 RSB
          else if (0 == strcmp (argv[i], "--no-parskip"))
            {
              Dataset.NoParskip = 1;
              continue;
            }
          else
            {
              // Unknown switches are reported but are not fatal.
              if (Dataset.LogFile != NULL)
                fprintf (Dataset.LogFile,
                         "Unknown command-line parameter: %s\n", argv[i]);
              fprintf (stderr,
                       "Unknown command-line parameter: %s\n"
                       "Use \"GutenMark --help\" for more info.\n", argv[i]);
              continue;
            }
        }
      else if (FilesFound == 0)
        {
          Dataset.InputFile = fopen (argv[i], "rb");
          if (Dataset.InputFile == NULL)
            {
              if (Dataset.LogFile != NULL)
                fprintf (Dataset.LogFile, "Input file \"%s\" not found.\n",
                         argv[i]);
              fprintf (stderr, "Input file does not exist.\n");
              return (1);
            }
          InputFilename = argv[i];
          FilesFound++;
        }
      else if (FilesFound == 1)
        {
          OutputFile = fopen (argv[i], "w+b");
          if (OutputFile == NULL)
            {
              if (Dataset.LogFile != NULL)
                fprintf (Dataset.LogFile,
                         "Output file \"%s\" cannot be created.\n", argv[i]);
              fprintf (stderr, "Output file cannot be created.\n");
              return (1);
            }
          FilesFound++;
        }
      else
        {
          if (Dataset.LogFile != NULL)
            fprintf (Dataset.LogFile,
                     "Too many filenames on command line.\n");
          fprintf (stderr, "Too many filenames listed on command line.\n");
          fclose (Dataset.InputFile);
          fclose (OutputFile);
          return (1);
        }
    }

  // Record the command line in the logfile, if there is one.
  if (Dataset.LogFile != NULL)
    {
      fprintf (Dataset.LogFile, "argc=%d\n", argc);
      for (i = 0; i < argc; i++)
        fprintf (Dataset.LogFile, "argv[%d]=\"%s\"\n", i, argv[i]);
    }

  // Locate the executable; its name is handed to MatchWordlists below.
  // If it cannot be located, fall back on argv[0] as given.
  ExecutableFilename = RelativeWhich (argv[0]);
  if (ExecutableFilename == NULL)
    {
      fprintf (stderr, "Error locating executable file.\n");
      if (Dataset.LogFile != NULL)
        fprintf (Dataset.LogFile, "Error locating executable file.\n");
      ExecutableFilename = argv[0];
    }
  else
    {
      if (Dataset.LogFile != NULL)
        fprintf (Dataset.LogFile, "Executable: \"%s\"\n", ExecutableFilename);
    }

  // Create in-memory wordlist from the etext.
  Dataset.Words = CreateWordlist (Dataset.InputFile);
  // NOTE(review): this assumes CreateWordlist reports failure by returning
  // NULL -- confirm against its definition.  Without the check, a NULL
  // result would be dereferenced immediately below.
  if (Dataset.Words == NULL)
    {
      fprintf (stderr, "Cannot create wordlist from input file.\n");
      if (Dataset.LogFile != NULL)
        fprintf (Dataset.LogFile, "Aborting due to CreateWordlist error.\n");
      return (1);
    }

  // Determine the most frequently used words in the etext,
  // from the wordlist just created.
  for (i = 0; i < MAX_FREQUENT; i++)
    Frequents[i].Count = -1;
  k = Frequents[0].Count;       // Min Count currently in Frequents[].
  for (j = 0; j < Dataset.Words->NumWords; j++)
    if (Dataset.Words->Words[j].Count > k)
      {

        // The word has been used frequently enough to displace one
        // of the words already in Frequents[].  The words in
        // Frequents[] are in the order of least frequent (at 0)
        // to most frequent (at MAX_FREQUENT-1).  Entries with a smaller
        // count slide down one slot (dropping the old slot 0), and the
        // new word goes into the gap.
        for (i = 0; i < MAX_FREQUENT - 1 &&
             Frequents[i + 1].Count < Dataset.Words->Words[j].Count; i++)
          Frequents[i] = Frequents[i + 1];
        Frequents[i].Count = Dataset.Words->Words[j].Count;
        Frequents[i].Index = j;

        // The table is sorted ascending, so the minimum is always in
        // slot 0.  (Taking the threshold from slot 1 would wrongly
        // reject words whose count lies between slots 0 and 1.)
        k = Frequents[0].Count;
      }
  for (j = MAX_FREQUENT - 1; j >= 0; j--)
    if (Frequents[j].Count != -1)
      {
        Dataset.Words->Words[Frequents[j].Index].Frequent = 1;
        if (Dataset.LogFile != NULL)
          fprintf (Dataset.LogFile, "Index=%d Count=%d Word=\"%s\"\n",
                   Frequents[j].Index, Frequents[j].Count,
                   Dataset.Words->Words[Frequents[j].Index].Full);
      }

  // Read the global wordlists and namelists.
  if (0 !=
      MatchWordlists (Dataset.LogFile, Dataset.Words, Profile,
                      ExecutableFilename, AltCfg))
    {
      fprintf (stderr, "Error matching wordlists/namelists.\n");
      if (Dataset.LogFile != NULL)
        fprintf (Dataset.LogFile, "Aborting due to MatchWordlists error.\n");
      return (1);
    }

  // In --debug mode, dump the native and non-native words to compressed
  // files, and dump every wordlist entry to the logfile.
  if (Dataset.LogFile != NULL)
    {
      gzFile Native, Foreign;
      Native = gzopen ("GutenMark.native.gz", "w");
      Foreign = gzopen ("GutenMark.foreign.gz", "w");
      fprintf (Dataset.LogFile, "NumWords = %d\n", Dataset.Words->NumWords);
      for (i = 0; i < Dataset.Words->NumWords; i++)
        {
          // The matched form of the word is preferred, when there is one.
          if (0 != (Dataset.Words->Words[i].WordlistStatus & SPELL_NATIVE)
              && Native != NULL)
            {
              if (Dataset.Words->Words[i].Match != NULL)
                gzprintf (Native, "%s\n", Dataset.Words->Words[i].Match);
              else
                gzprintf (Native, "%s\n", Dataset.Words->Words[i].Full);
            }
          if (0 != (Dataset.Words->Words[i].WordlistStatus & SPELL_NONNATIVE)
              && Foreign != NULL)
            {
              if (Dataset.Words->Words[i].Match != NULL)
                gzprintf (Foreign, "%s\n", Dataset.Words->Words[i].Match);
              else
                gzprintf (Foreign, "%s\n", Dataset.Words->Words[i].Full);
            }
          fprintf (Dataset.LogFile, "Normalized=");
          if (Dataset.Words->Words[i].Normalized == NULL)
            fprintf (Dataset.LogFile, "NULL");
          else
            fprintf (Dataset.LogFile, "%s",
                     Dataset.Words->Words[i].Normalized);
          fprintf (Dataset.LogFile, ", Full=");
          if (Dataset.Words->Words[i].Full == NULL)
            fprintf (Dataset.LogFile, "NULL");
          else
            fprintf (Dataset.LogFile, "%s", Dataset.Words->Words[i].Full);
          // A missing Match is deliberately not logged at all.
          if (Dataset.Words->Words[i].Match != NULL)
            fprintf (Dataset.LogFile, ", Match=%s",
                     Dataset.Words->Words[i].Match);
          fprintf (Dataset.LogFile, ", NotAtBeginning=%d, Status=%02X",
                   Dataset.Words->Words[i].NotAtBeginning,
                   Dataset.Words->Words[i].WordlistStatus);
          fprintf (Dataset.LogFile, ", Languages=0x%04lX\n",
                   Dataset.Words->Words[i].Languages);
        }
      if (Native != NULL)
        gzclose (Native);
      if (Foreign != NULL)
        gzclose (Foreign);
    }

  // Note the offset 500 bytes before the end of the input file, then
  // rewind the input for the markup pass.
  fseek (Dataset.InputFile, 0, SEEK_END);
  Dataset.FileEnderRegion = ftell (Dataset.InputFile) - 500;
  fseek (Dataset.InputFile, 0, SEEK_SET);
  Dataset.MarkupFile = tmpfile ();
  if (Dataset.MarkupFile == NULL)
    {
      fprintf (stderr, "Cannot create temporary markup file.\n");
      if (Dataset.LogFile != NULL)
        fprintf (Dataset.LogFile,
                 "Could not create temporary markup file.\n");
      return (1);
    }

  // Style analysis.  The analysis actually performs all
  // of the markup, but doesn't modify the input or output files.
  // Instead the  markups are saved to another file.
  ErrorCode = Markup (&Dataset);
  if (ErrorCode != 0)
    {
      fprintf (stderr, "An error code of %d occurred during markup.\n",
               ErrorCode);
      if (Dataset.LogFile != NULL)
        fprintf (Dataset.LogFile, "Aborting due to Markup error.\n");
      return (1);
    }

  // Finally, handle the actual output.  XML takes precedence over LaTeX,
  // and HTML is the default.
  if (Dataset.Xml)
    ReturnValue = OutputXml (OutputFile, &Dataset);
  else if (Dataset.Latex)
    ReturnValue = OutputLatex (OutputFile, &Dataset);
  else
    ReturnValue = OutputHtml (OutputFile, &Dataset);

  fclose (Dataset.MarkupFile);
  fclose (Dataset.InputFile);
  fclose (OutputFile);
  return (ReturnValue);
}

//---------------------------------------------------------------------
// Helper for NormalizeTitle.  If the string s begins with Phrase
// (compared without regard to case), the phrase is removed from the
// front of s and 1 is returned.  Otherwise s is untouched and 0 is
// returned.

static int
StripLeadingPhrase (char *s, const char *Phrase)
{
  size_t Length = strlen (Phrase);

  if (0 != strncasecmp (s, Phrase, Length))
    return (0);
  // Source and destination overlap, so memmove (not strcpy) is required.
  memmove (s, s + Length, strlen (s + Length) + 1);
  return (1);
}

//---------------------------------------------------------------------
// The "title" has been read from the first line of the file.  
// However, it may have all kinds of Gutenbergy stuff, like asterisks, 
// the phrase "Project Gutenberg etext of", and so on.  This function
// cleans the title up, in place.
//
//   s     The buffer holding the title; it is modified in place.
//   size  The total size of the buffer, in bytes.  The result
//         (including the default title substituted for an empty
//         result) never occupies more than size bytes, terminator
//         included.
//
// Nothing is done if s is NULL or size is not positive.

void
NormalizeTitle (char *s, int size)
{
  static const char DefaultTitle[] = "A Project Gutenberg Book";
  // At most one phrase from each of these two lists is removed, the
  // first one that matches, so the order of the entries matters.
  static const char *const OwnerPhrases[] = {
    "PG\'s ", "Project Gutenberg\'s ", "Project Gutenberg ",
    "The Project Gutenberg "
  };
  static const char *const EtextPhrases[] = {
    "etext of ", "etext: ", "etext, ", "etext; ", "etext. ", "etext "
  };
  char *ss;
  size_t Length, n;

  if (s == NULL || size <= 0)
    return;

  // Make sure the string is null-terminated.
  s[size - 1] = '\0';

  // Get rid of leading asterisks and white space.  (The casts to
  // unsigned char are needed because the text may contain 8-bit
  // characters, for which isspace is otherwise undefined.)
  for (ss = s; *ss == '*' || isspace ((unsigned char) *ss); ss++);
  if (ss > s)
    memmove (s, ss, strlen (ss) + 1);

  // Let's get rid of trailing asterisks and white space also.
  Length = strlen (s);
  while (Length > 0
         && (s[Length - 1] == '*'
             || isspace ((unsigned char) s[Length - 1])))
    Length--;
  s[Length] = '\0';

  // Let's also get rid of multiple spaces.  A run of white space is
  // deleted one character at a time until a single character remains,
  // and a remaining tab is turned into a blank.
  for (ss = s; *ss;)
    {
      if (isspace ((unsigned char) *ss) && isspace ((unsigned char) ss[1]))
        memmove (ss, ss + 1, strlen (ss + 1) + 1);
      else if (*ss == '\t')
        *ss = ' ';
      else
        ss++;
    }

  // Look for some stuff that PG sometimes adds AFTER the title.  The
  // title is cut at the first such character, and any punctuation or
  // blanks immediately preceding the cut are removed as well.
  for (ss = s; *ss; ss++)
    if (*ss == '*' || *ss == '#' || *ss == '(' || *ss == '[' || *ss == '{')
      {
        while (ss > s
               && (ss[-1] == ' ' || ss[-1] == ',' || ss[-1] == '.'
                   || ss[-1] == ';' || ss[-1] == ':' || ss[-1] == '-'))
          ss--;
        *ss = '\0';
        break;
      }

  // Now get rid of several common things that PG likes to add:
  // "PG " may be followed by one of the owner phrases, which in turn
  // may be followed by one of the "etext" phrases.
  StripLeadingPhrase (s, "PG ");
  for (n = 0; n < sizeof (OwnerPhrases) / sizeof (OwnerPhrases[0]); n++)
    if (StripLeadingPhrase (s, OwnerPhrases[n]))
      break;
  for (n = 0; n < sizeof (EtextPhrases) / sizeof (EtextPhrases[0]); n++)
    if (StripLeadingPhrase (s, EtextPhrases[n]))
      break;

  // For some reason, it can still happen that there's a newline
  // in the string -- cut the title off there.
  ss = strchr (s, '\n');
  if (ss != NULL)
    *ss = '\0';

  // Finally, if the string is empty, fill in a default value,
  // truncated if necessary to fit the caller's buffer.
  if (*s == '\0')
    {
      Length = sizeof (DefaultTitle) - 1;
      if (Length > (size_t) size - 1)
        Length = (size_t) size - 1;
      memcpy (s, DefaultTitle, Length);
      s[Length] = '\0';
    }
}
