/**
 * $Id: estutil.c,v 1.26 2006/08/17 14:57:15 shinh Exp $
 *
 * Copyright (C) shinichiro.h <hamaji@nii.ac.jp>
 *  http://shinh.skr.jp/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 */

/**
 * The present copyright holders of this program have given permission,
 * as a special exception, to link this program with Apache Portable
 * Runtime (APR) and to include header files for APR components when
 * those header files are covered by the Apache licenses, as long as
 * the GNU GPL is followed for this program in all other ways. 
 */

/**
 * Helper functions for the Estraier API.  These functions come from estcmd.c.
 */

#include "estutil.h"
#include "config.h"

#include <assert.h>
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* size of a buffer for a number */
#define NUMBUFSIZ      32

/* boolean true (any earlier definition is replaced) */
#undef TRUE
#define TRUE           1

/* boolean false (any earlier definition is replaced) */
#undef FALSE
#define FALSE          0

/* printf-style variadic function; only declared here, no definition appears in this file */
void printfinfo(const char *format, ...);

/* check whether a buffer is binary
   `buf' specifies the pointer to the region to inspect.
   `size' specifies the size of the region in bytes.
   The return value is TRUE if the region looks like binary data, else FALSE.
   Regions shorter than 32 bytes are never judged to be binary.  PDF and PostScript are
   recognized by their header alone.  Any other format is recognized only if a NUL byte occurs
   in the scanned prefix (at most 256 bytes), and a run of five NUL bytes is enough by itself. */
static int est_check_binary(const char *buf, int size){
  const unsigned char *ub;
  int i, bin;
  assert(buf && size >= 0);
  if(size < 32) return FALSE;
  /* PDF */
  if(!memcmp(buf, "%PDF-", 5)) return TRUE;
  /* PostScript */
  if(!memcmp(buf, "%!PS-Adobe", 10)) return TRUE;
  /* generic binary: keep five bytes of slack so that buf[i+4] stays inside the region */
  size -= 5;
  if(size >= 256) size = 256;
  bin = FALSE;
  for(i = 0; i < size; i++){
    if(buf[i] == 0x0){
      if(buf[i+1] == 0x0 && buf[i+2] == 0x0 && buf[i+3] == 0x0 && buf[i+4] == 0x0) return TRUE;
      bin = TRUE;
    }
  }
  /* without any NUL byte the data is treated as text whatever its header says */
  if(!bin) return FALSE;
  /* PNG */
  if(!memcmp(buf, "\x89PNG", 4)) return TRUE;
  /* GIF(87a) */
  if(!memcmp(buf, "GIF87a", 6)) return TRUE;
  /* GIF(89a) */
  if(!memcmp(buf, "GIF89a", 6)) return TRUE;
  /* JFIF */
  if(!memcmp(buf, "\xff\xd8JFIF", 6)) return TRUE;
  /* TIFF(Motorola, big endian) */
  if(!memcmp(buf, "MM\x00\x2a", 4)) return TRUE;
  /* TIFF(Intel, little endian) */
  if(!memcmp(buf, "II\x2a\x00", 4)) return TRUE;
  /* BMP */
  if(!memcmp(buf, "BM", 2)) return TRUE;
  /* GZIP */
  if(!memcmp(buf, "\x1f\x8b\x08", 3)) return TRUE;
  /* BZIP2 */
  if(!memcmp(buf, "BZh", 3)) return TRUE;
  /* ZIP */
  if(!memcmp(buf, "PK\x03\x04", 4)) return TRUE;
  /* MP3(with ID3) */
  if(!memcmp(buf, "ID3", 3)) return TRUE;
  /* MP3: the sync word must be assembled from unsigned bytes, because with a signed `char'
     the bytes 0xff and 0xfb are negative and the comparison could never succeed */
  ub = (const unsigned char *)buf;
  if(((ub[0] * 0x100 + ub[1]) & 0xfffe) == 0xfffa) return TRUE;
  /* MIDI */
  if(!memcmp(buf, "MThd", 4)) return TRUE;
  /* RPM package: the lead magic is the byte sequence ed ab ee db, not the text "0xed0xab" */
  if(!memcmp(buf, "\xed\xab\xee\xdb", 4)) return TRUE;
  /* Debian package */
  if(!memcmp(buf, "!<arch>\ndebian", 14)) return TRUE;
  /* ELF */
  if(!memcmp(buf, "\x7f\x45\x4c\x46", 4)) return TRUE;
  /* MS-DOS executable */
  if(!memcmp(buf, "MZ", 2)) return TRUE;
  /* MS-Office */
  if(!memcmp(buf, "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1", 8)) return TRUE;
  if(!memcmp(buf, "\xfe\x37\x00\x23", 4)) return TRUE;
  if(!memcmp(buf, "\xdb\xa5-\x00\x00\x00", 6)) return TRUE;
  return FALSE;
}

/* get the encoding of an HTML string
   `str' specifies the NUL-terminated HTML text.
   The return value is the charset named by the first `<meta>' element whose `http-equiv'
   attribute is "Content-Type" and whose `content' attribute holds a non-empty `charset=...'
   part, or NULL if there is no such element.  The region of the return value is allocated
   with `cbmemdup' and is released with `free' by the caller in this file. */
static char *est_html_enc(const char *str){
  CBLIST *elems;
  CBMAP *attrs;
  const char *elem, *equiv, *content, *pv;
  char *enc;
  int i;
  assert(str);
  elems = cbxmlbreak(str, TRUE);
  for(i = 0; i < CB_LISTNUM(elems); i++){
    elem = cblistval(elems, i, NULL);
    if(elem[0] != '<' || !cbstrfwimatch(elem, "<meta")) continue;
    enc = NULL;
    attrs = cbxmlattrs(elem);
    /* the attribute map is looked up with a few common spellings of each name */
    equiv = cbmapget(attrs, "http-equiv", -1, NULL);
    if(!equiv) equiv = cbmapget(attrs, "HTTP-EQUIV", -1, NULL);
    if(!equiv) equiv = cbmapget(attrs, "Http-Equiv", -1, NULL);
    if(!equiv) equiv = cbmapget(attrs, "Http-equiv", -1, NULL);
    if(equiv && !cbstricmp(equiv, "Content-Type")){
      content = cbmapget(attrs, "content", -1, NULL);
      if(!content) content = cbmapget(attrs, "Content", -1, NULL);
      if(!content) content = cbmapget(attrs, "CONTENT", -1, NULL);
      if(content && ((pv = strstr(content, "charset")) != NULL ||
                     (pv = strstr(content, "Charset")) != NULL ||
                     (pv = strstr(content, "CHARSET")) != NULL)){
        /* step over the keyword and insist on '=', so that a value ending in a bare
           "charset" never makes us read beyond the terminator */
        pv += 7;
        while(*pv == ' ' || *pv == '\t'){
          pv++;
        }
        if(*pv == '='){
          pv++;
          while(*pv == ' ' || *pv == '\t' || *pv == '"' || *pv == '\''){
            pv++;
          }
          enc = cbmemdup(pv, -1);
          /* cut at the earliest delimiter, whichever kind it is */
          enc[strcspn(enc, "; \t\r\n\"'")] = '\0';
          if(enc[0] == '\0'){
            /* an empty name is as good as no declaration */
            free(enc);
            enc = NULL;
          }
        }
      }
    }
    cbmapclose(attrs);
    if(enc){
      cblistclose(elems);
      return enc;
    }
  }
  cblistclose(elems);
  return NULL;
}

/* unescape entity references of HTML
   `html' specifies the NUL-terminated text that may contain entity references.
   The return value is a newly allocated string in which every named entity listed in the
   table below is replaced by the byte sequence paired with it, and every numeric reference
   (`&#NNN;' or `&#xHHH;') is replaced by UTF-8 bytes.  Code points 1 to 0xFFFF are converted by
   `est_uconv_out' from a two byte big endian buffer, code points 0x10000 to 0x10FFFF are encoded
   here, and any other number (zero, negative, too large) produces no output.  An `&' that
   starts no known reference is copied as it is.  The region of the return value is allocated
   with `CB_MALLOC'; the callers in this file release it with `free'. */
static char *est_html_raw_text(const char *html){
  static const char *pairs[] = {
    /* basic symbols */
    "&amp;", "&", "&lt;", "<", "&gt;", ">", "&quot;", "\"", "&apos;", "'",
    /* ISO-8859-1 */
    "&nbsp;", "\xc2\xa0", "&iexcl;", "\xc2\xa1", "&cent;", "\xc2\xa2",
    "&pound;", "\xc2\xa3", "&curren;", "\xc2\xa4", "&yen;", "\xc2\xa5",
    "&brvbar;", "\xc2\xa6", "&sect;", "\xc2\xa7", "&uml;", "\xc2\xa8",
    "&copy;", "\xc2\xa9", "&ordf;", "\xc2\xaa", "&laquo;", "\xc2\xab",
    "&not;", "\xc2\xac", "&shy;", "\xc2\xad", "&reg;", "\xc2\xae",
    "&macr;", "\xc2\xaf", "&deg;", "\xc2\xb0", "&plusmn;", "\xc2\xb1",
    "&sup2;", "\xc2\xb2", "&sup3;", "\xc2\xb3", "&acute;", "\xc2\xb4",
    "&micro;", "\xc2\xb5", "&para;", "\xc2\xb6", "&middot;", "\xc2\xb7",
    "&cedil;", "\xc2\xb8", "&sup1;", "\xc2\xb9", "&ordm;", "\xc2\xba",
    "&raquo;", "\xc2\xbb", "&frac14;", "\xc2\xbc", "&frac12;", "\xc2\xbd",
    "&frac34;", "\xc2\xbe", "&iquest;", "\xc2\xbf", "&Agrave;", "\xc3\x80",
    "&Aacute;", "\xc3\x81", "&Acirc;", "\xc3\x82", "&Atilde;", "\xc3\x83",
    "&Auml;", "\xc3\x84", "&Aring;", "\xc3\x85", "&AElig;", "\xc3\x86",
    "&Ccedil;", "\xc3\x87", "&Egrave;", "\xc3\x88", "&Eacute;", "\xc3\x89",
    "&Ecirc;", "\xc3\x8a", "&Euml;", "\xc3\x8b", "&Igrave;", "\xc3\x8c",
    "&Iacute;", "\xc3\x8d", "&Icirc;", "\xc3\x8e", "&Iuml;", "\xc3\x8f",
    "&ETH;", "\xc3\x90", "&Ntilde;", "\xc3\x91", "&Ograve;", "\xc3\x92",
    "&Oacute;", "\xc3\x93", "&Ocirc;", "\xc3\x94", "&Otilde;", "\xc3\x95",
    "&Ouml;", "\xc3\x96", "&times;", "\xc3\x97", "&Oslash;", "\xc3\x98",
    "&Ugrave;", "\xc3\x99", "&Uacute;", "\xc3\x9a", "&Ucirc;", "\xc3\x9b",
    "&Uuml;", "\xc3\x9c", "&Yacute;", "\xc3\x9d", "&THORN;", "\xc3\x9e",
    "&szlig;", "\xc3\x9f", "&agrave;", "\xc3\xa0", "&aacute;", "\xc3\xa1",
    "&acirc;", "\xc3\xa2", "&atilde;", "\xc3\xa3", "&auml;", "\xc3\xa4",
    "&aring;", "\xc3\xa5", "&aelig;", "\xc3\xa6", "&ccedil;", "\xc3\xa7",
    "&egrave;", "\xc3\xa8", "&eacute;", "\xc3\xa9", "&ecirc;", "\xc3\xaa",
    "&euml;", "\xc3\xab", "&igrave;", "\xc3\xac", "&iacute;", "\xc3\xad",
    "&icirc;", "\xc3\xae", "&iuml;", "\xc3\xaf", "&eth;", "\xc3\xb0",
    "&ntilde;", "\xc3\xb1", "&ograve;", "\xc3\xb2", "&oacute;", "\xc3\xb3",
    "&ocirc;", "\xc3\xb4", "&otilde;", "\xc3\xb5", "&ouml;", "\xc3\xb6",
    "&divide;", "\xc3\xb7", "&oslash;", "\xc3\xb8", "&ugrave;", "\xc3\xb9",
    "&uacute;", "\xc3\xba", "&ucirc;", "\xc3\xbb", "&uuml;", "\xc3\xbc",
    "&yacute;", "\xc3\xbd", "&thorn;", "\xc3\xbe", "&yuml;", "\xc3\xbf",
    /* ISO-10646 */
    "&fnof;", "\xc6\x92", "&Alpha;", "\xce\x91", "&Beta;", "\xce\x92",
    "&Gamma;", "\xce\x93", "&Delta;", "\xce\x94", "&Epsilon;", "\xce\x95",
    "&Zeta;", "\xce\x96", "&Eta;", "\xce\x97", "&Theta;", "\xce\x98",
    "&Iota;", "\xce\x99", "&Kappa;", "\xce\x9a", "&Lambda;", "\xce\x9b",
    "&Mu;", "\xce\x9c", "&Nu;", "\xce\x9d", "&Xi;", "\xce\x9e",
    "&Omicron;", "\xce\x9f", "&Pi;", "\xce\xa0", "&Rho;", "\xce\xa1",
    "&Sigma;", "\xce\xa3", "&Tau;", "\xce\xa4", "&Upsilon;", "\xce\xa5",
    "&Phi;", "\xce\xa6", "&Chi;", "\xce\xa7", "&Psi;", "\xce\xa8",
    "&Omega;", "\xce\xa9", "&alpha;", "\xce\xb1", "&beta;", "\xce\xb2",
    "&gamma;", "\xce\xb3", "&delta;", "\xce\xb4", "&epsilon;", "\xce\xb5",
    "&zeta;", "\xce\xb6", "&eta;", "\xce\xb7", "&theta;", "\xce\xb8",
    "&iota;", "\xce\xb9", "&kappa;", "\xce\xba", "&lambda;", "\xce\xbb",
    "&mu;", "\xce\xbc", "&nu;", "\xce\xbd", "&xi;", "\xce\xbe",
    "&omicron;", "\xce\xbf", "&pi;", "\xcf\x80", "&rho;", "\xcf\x81",
    "&sigmaf;", "\xcf\x82", "&sigma;", "\xcf\x83", "&tau;", "\xcf\x84",
    "&upsilon;", "\xcf\x85", "&phi;", "\xcf\x86", "&chi;", "\xcf\x87",
    "&psi;", "\xcf\x88", "&omega;", "\xcf\x89", "&thetasym;", "\xcf\x91",
    "&upsih;", "\xcf\x92", "&piv;", "\xcf\x96", "&bull;", "\xe2\x80\xa2",
    "&hellip;", "\xe2\x80\xa6", "&prime;", "\xe2\x80\xb2", "&Prime;", "\xe2\x80\xb3",
    "&oline;", "\xe2\x80\xbe", "&frasl;", "\xe2\x81\x84", "&weierp;", "\xe2\x84\x98",
    "&image;", "\xe2\x84\x91", "&real;", "\xe2\x84\x9c", "&trade;", "\xe2\x84\xa2",
    "&alefsym;", "\xe2\x84\xb5", "&larr;", "\xe2\x86\x90", "&uarr;", "\xe2\x86\x91",
    "&rarr;", "\xe2\x86\x92", "&darr;", "\xe2\x86\x93", "&harr;", "\xe2\x86\x94",
    "&crarr;", "\xe2\x86\xb5", "&lArr;", "\xe2\x87\x90", "&uArr;", "\xe2\x87\x91",
    "&rArr;", "\xe2\x87\x92", "&dArr;", "\xe2\x87\x93", "&hArr;", "\xe2\x87\x94",
    "&forall;", "\xe2\x88\x80", "&part;", "\xe2\x88\x82", "&exist;", "\xe2\x88\x83",
    "&empty;", "\xe2\x88\x85", "&nabla;", "\xe2\x88\x87", "&isin;", "\xe2\x88\x88",
    "&notin;", "\xe2\x88\x89", "&ni;", "\xe2\x88\x8b", "&prod;", "\xe2\x88\x8f",
    "&sum;", "\xe2\x88\x91", "&minus;", "\xe2\x88\x92", "&lowast;", "\xe2\x88\x97",
    "&radic;", "\xe2\x88\x9a", "&prop;", "\xe2\x88\x9d", "&infin;", "\xe2\x88\x9e",
    "&ang;", "\xe2\x88\xa0", "&and;", "\xe2\x88\xa7", "&or;", "\xe2\x88\xa8",
    "&cap;", "\xe2\x88\xa9", "&cup;", "\xe2\x88\xaa", "&int;", "\xe2\x88\xab",
    "&there4;", "\xe2\x88\xb4", "&sim;", "\xe2\x88\xbc", "&cong;", "\xe2\x89\x85",
    "&asymp;", "\xe2\x89\x88", "&ne;", "\xe2\x89\xa0", "&equiv;", "\xe2\x89\xa1",
    "&le;", "\xe2\x89\xa4", "&ge;", "\xe2\x89\xa5", "&sub;", "\xe2\x8a\x82",
    "&sup;", "\xe2\x8a\x83", "&nsub;", "\xe2\x8a\x84", "&sube;", "\xe2\x8a\x86",
    "&supe;", "\xe2\x8a\x87", "&oplus;", "\xe2\x8a\x95", "&otimes;", "\xe2\x8a\x97",
    "&perp;", "\xe2\x8a\xa5", "&sdot;", "\xe2\x8b\x85", "&lceil;", "\xe2\x8c\x88",
    "&rceil;", "\xe2\x8c\x89", "&lfloor;", "\xe2\x8c\x8a", "&rfloor;", "\xe2\x8c\x8b",
    "&lang;", "\xe2\x8c\xa9", "&rang;", "\xe2\x8c\xaa", "&loz;", "\xe2\x97\x8a",
    "&spades;", "\xe2\x99\xa0", "&clubs;", "\xe2\x99\xa3", "&hearts;", "\xe2\x99\xa5",
    "&diams;", "\xe2\x99\xa6", "&OElig;", "\xc5\x92", "&oelig;", "\xc5\x93",
    "&Scaron;", "\xc5\xa0", "&scaron;", "\xc5\xa1", "&Yuml;", "\xc5\xb8",
    "&circ;", "\xcb\x86", "&tilde;", "\xcb\x9c", "&ensp;", "\xe2\x80\x82",
    "&emsp;", "\xe2\x80\x83", "&thinsp;", "\xe2\x80\x89", "&zwnj;", "\xe2\x80\x8c",
    "&zwj;", "\xe2\x80\x8d", "&lrm;", "\xe2\x80\x8e", "&rlm;", "\xe2\x80\x8f",
    "&ndash;", "\xe2\x80\x93", "&mdash;", "\xe2\x80\x94", "&lsquo;", "\xe2\x80\x98",
    "&rsquo;", "\xe2\x80\x99", "&sbquo;", "\xe2\x80\x9a", "&ldquo;", "\xe2\x80\x9c",
    "&rdquo;", "\xe2\x80\x9d", "&bdquo;", "\xe2\x80\x9e", "&dagger;", "\xe2\x80\xa0",
    "&Dagger;", "\xe2\x80\xa1", "&permil;", "\xe2\x80\xb0", "&lsaquo;", "\xe2\x80\xb9",
    "&rsaquo;", "\xe2\x80\xba", "&euro;", "\xe2\x82\xac",
    NULL
  };
  char *raw, *wp, buf[2], *tmp;
  long num;
  size_t rlen;
  int i, j, hit, tsiz;
  assert(html);
  /* three output bytes are reserved for every input byte */
  CB_MALLOC(raw, strlen(html) * 3 + 1);
  wp = raw;
  while(*html != '\0'){
    if(*html == '&'){
      if(*(html + 1) == '#'){
        /* numeric character reference, hexadecimal or decimal */
        if(*(html + 2) == 'x' || *(html + 2) == 'X'){
          num = strtol(html + 3, NULL, 16);
        } else {
          num = strtol(html + 2, NULL, 10);
        }
        if(num > 0 && num <= 0xffff){
          /* one 16-bit unit, high byte first, handed to the library converter */
          buf[0] = (char)(num / 256);
          buf[1] = (char)(num % 256);
          if((tmp = est_uconv_out(buf, 2, &tsiz)) != NULL){
            for(j = 0; j < tsiz; j++){
              *wp = ((unsigned char *)tmp)[j];
              wp++;
            }
            free(tmp);
          }
        } else if(num >= 0x10000 && num <= 0x10ffff){
          /* does not fit into the two byte buffer: write the four byte UTF-8 form directly */
          *(wp++) = (char)(0xf0 | (num >> 18));
          *(wp++) = (char)(0x80 | ((num >> 12) & 0x3f));
          *(wp++) = (char)(0x80 | ((num >> 6) & 0x3f));
          *(wp++) = (char)(0x80 | (num & 0x3f));
        }
        /* skip the reference itself, tolerating a missing semicolon */
        while(*html != ';' && *html != ' ' && *html != '\n' && *html != '\0'){
          html++;
        }
        if(*html == ';') html++;
      } else {
        /* named entity: linear search in the name/replacement table */
        hit = FALSE;
        for(i = 0; pairs[i] != NULL; i += 2){
          if(cbstrfwmatch(html, pairs[i])){
            rlen = strlen(pairs[i+1]);
            memcpy(wp, pairs[i+1], rlen);
            wp += rlen;
            html += strlen(pairs[i]);
            hit = TRUE;
            break;
          }
        }
        if(!hit){
          *wp = *html;
          wp++;
          html++;
        }
      }
    } else {
      *wp = *html;
      wp++;
      html++;
    }
  }
  *wp = '\0';
  return raw;
}

/* convert an HTML buffer into UTF-8
   `buf' specifies the HTML data and `size' its size.
   `plang' is passed to `est_enc_name' when `charset' is NULL.
   `charset' specifies the encoding name to trust, or NULL to detect it.
   The return value is the converted region, or NULL when no converted copy was made (the
   encoding is US-ASCII, the declared encoding is UTF-8, or every conversion failed).  A
   non-NULL result is released with `free' by the callers in this file. */
static char *est_html_iconv(const char *buf, int size,
                            int plang, const char *charset)
{
  const char *guess;
  char *declared, *conv;

  assert(buf);

  guess = charset ? charset : est_enc_name(buf, size, plang);

  /* UTF-16 family: convert straight away */
  if (strcmp(guess, "UTF-16") == 0 || strcmp(guess, "UTF-16BE") == 0 ||
      strcmp(guess, "UTF-16LE") == 0) {
    return est_iconv(buf, size, guess, "UTF-8", NULL, NULL);
  }

  /* plain ASCII needs no conversion */
  if (strcmp(guess, "US-ASCII") == 0) return NULL;

  /* otherwise prefer the explicit charset, or the one declared in a <meta> element */
  declared = charset ? cbmemdup(charset, -1) : est_html_enc(buf);
  if (declared == NULL) {
    return est_iconv(buf, size, guess, "UTF-8", NULL, NULL);
  }

  conv = NULL;
  if (cbstricmp(declared, "UTF-8") != 0) {
    conv = est_iconv(buf, size, declared, "UTF-8", NULL, NULL);
    /* fall back to the first guess when the declared name does not work */
    if (conv == NULL) conv = est_iconv(buf, size, guess, "UTF-8", NULL, NULL);
  }
  free(declared);

  return conv;
}

/* create a document object from plain text
   `buf' specifies the text data and `size' its size.
   `penc' specifies the encoding name, or NULL to let `est_enc_name' choose it using `plang'.
   `bcheck' specifies whether the data is first tested with `est_check_binary'.
   The return value is the new document object, or NULL if the binary check rejects the data.
   Lines are joined into paragraphs with a blank line as the separator; every paragraph is
   added as one text of the document. */
ESTDOC *est_doc_new_from_text(const char *buf, int size,
                              const char *penc, int plang, int bcheck)
{
  ESTDOC *doc;
  CBLIST *rows;
  CBDATUM *para;
  const char *srcenc, *body, *cur;
  char *utf8 = NULL, sizestr[NUMBUFSIZ];
  int idx;
  assert(buf && size >= 0);
  if(bcheck && est_check_binary(buf, size)) return NULL;
  doc = est_doc_new();
  srcenc = penc ? penc : est_enc_name(buf, size, plang);
  /* anything that is not already UTF-8 gets converted; on failure the raw data is used */
  if(strcmp(srcenc, "UTF-8") != 0){
    utf8 = est_iconv(buf, size, srcenc, "UTF-8", NULL, NULL);
  }
  body = utf8 ? utf8 : buf;
  rows = cbsplit(body, -1, "\n");
  para = cbdatumopen("", 0);
  for(idx = 0; idx < CB_LISTNUM(rows); idx++){
    cur = cblistval(rows, idx, NULL);
    /* drop leading blanks and carriage returns */
    cur += strspn(cur, " \t\r");
    if(*cur != '\0'){
      cbdatumcat(para, " ", 1);
      cbdatumcat(para, cur, -1);
      continue;
    }
    /* blank line: the paragraph collected so far is complete */
    est_doc_add_text(doc, CB_DATUMPTR(para));
    cbdatumsetsize(para, 0);
  }
  est_doc_add_text(doc, CB_DATUMPTR(para));
  cbdatumclose(para);
  cblistclose(rows);
  est_doc_add_attr(doc, ESTDATTRTYPE, "text/plain");
  sprintf(sizestr, "%d", size);
  est_doc_add_attr(doc, ESTDATTRSIZE, sizestr);
  free(utf8);
  return doc;
}

#ifdef HAVE_LIBTIDY
# include <tidy.h>
# include <buffio.h>

/* get the node that follows `node' when its own subtree is skipped
   The next sibling is returned if there is one.  Otherwise the ancestors are climbed until
   one of them has a next sibling.  The return value is NULL when the climb reaches the BODY
   element, or runs out of parents, which ends the traversal. */
static TidyNode est_node_go_next(TidyNode node) {
  TidyNode next;
  next = tidyGetNext(node);
  while (!next) {
    node = tidyGetParent(node);
    /* stop at the body, and never hand a missing parent to tidyGetNext */
    if (!node || tidyNodeIsBODY(node)) return NULL;
    next = tidyGetNext(node);
  }
  return next;
}

/* get the node that follows `node' in document order
   The first child is returned if there is one; otherwise the result of
   `est_node_go_next' is returned. */
static TidyNode est_node_go_deep(TidyNode node) {
  TidyNode child = tidyGetChild(node);
  return child ? child : est_node_go_next(node);
}

/* create a document object from HTML (implementation based on HTML Tidy)
   `buf' specifies the NUL-terminated HTML data and `size' its size.
   `charset' specifies the encoding name, or NULL to detect it (see `est_html_iconv').
   `plang' is passed on to the encoding detection.
   `bcheck' specifies whether the data is first tested with `est_check_binary'.
   The return value is the new document object, or NULL if the binary check rejects the data.
   The language of the HTML element, META name/content pairs and the TITLE become attributes.
   The body text is split at block level elements and added paragraph by paragraph.  Anchors
   whose first child is a text of more than two bytes are recorded in the attributes "link"
   (href values), "linkstr" (anchor texts) and "linkpos" ("paragraph,length" pairs), each a
   list separated by single spaces. */
ESTDOC *est_doc_new_from_html(const char *buf, int size,
                              const char *charset, int plang, int bcheck)
{
  ESTDOC *doc;
  TidyDoc tdoc;
  TidyNode node, child;
  TidyAttr attr;
  TidyBuffer errbuf = {0};

  const char *html, *name, *content, *src, *cp;
  char *nbuf, *rbuf, numbuf[NUMBUFSIZ];

  CBDATUM *txt;
  CBDATUM *link, *linkstr, *linkpos;
  const char *href;
  int txt_num = 0;

  if(bcheck && est_check_binary(buf, size)) return NULL;

  doc = est_doc_new();

  nbuf = est_html_iconv(buf, size, plang, charset);
  html = (nbuf) ? nbuf : buf;

  tdoc = tidyCreate();
  tidySetErrorBuffer(tdoc, &errbuf);
  tidySetCharEncoding(tdoc, "utf8");
  tidyParseString(tdoc, html);
  tidyCleanAndRepair(tdoc);

  /* language of the document */
  node = tidyGetHtml(tdoc);
  if (node) {
    attr = tidyAttrGetLANG(node);
    if (attr) {
      est_doc_add_attr(doc, ESTDATTRLANG, tidyAttrValue(attr));
    }
  }

  /* META and TITLE elements of the head */
  node = tidyGetHead(tdoc);
  for (node = tidyGetChild(node); node; node = tidyGetNext(node)) {
    if (tidyNodeIsMETA(node)) {
      attr = tidyAttrGetNAME(node);
      if (!attr) continue;
      name = tidyAttrValue(attr);
      if (!name) continue;

      attr = tidyAttrGetCONTENT(node);
      if (!attr) continue;
      content = tidyAttrValue(attr);
      if (!content) continue;

      if (cbstricmp(name, "author") == 0) {
        if (strchr(content, '&')) {
          rbuf = est_html_raw_text(content);
          est_doc_add_attr(doc, ESTDATTRAUTHOR, rbuf);
          free(rbuf);
        }
        else {
          est_doc_add_attr(doc, ESTDATTRAUTHOR, content);
        }
      }

      /* names starting with '@' are not taken over from the page */
      if(name[0] != '@'){
        if(strchr(content, '&')){
          rbuf = est_html_raw_text(content);
          est_doc_add_attr(doc, name, rbuf);
          free(rbuf);
        }
        else {
          est_doc_add_attr(doc, name, content);
        }
      }
    }

    if (tidyNodeIsTITLE(node)) {
      child = tidyGetChild(node);
      if (tidyNodeGetName(child)) {
        /* error: title has child */
        continue;
      }
      else {
        TidyBuffer tbuf;
        char *text;
        tidyBufInit(&tbuf);
        tidyNodeGetText(tdoc, child, &tbuf);
        text = (char *)tbuf.bp;
        if (text) {
          if (strchr(text, '&')) {
            rbuf = est_html_raw_text(text);
            est_doc_add_attr(doc, ESTDATTRTITLE, rbuf);
            est_doc_add_hidden_text(doc, rbuf);
            free(rbuf);
          }
          else {
            est_doc_add_attr(doc, ESTDATTRTITLE, text);
            est_doc_add_hidden_text(doc, text);
          }
        }
        tidyBufFree(&tbuf);
      }
    }
  }

  txt = cbdatumopen("", 0);
  link = cbdatumopen("", 0);
  linkstr = cbdatumopen("", 0);
  linkpos = cbdatumopen("", 0);

  node = tidyGetBody(tdoc);
  if (node) node = tidyGetChild(node);

  while (node) {
    if (tidyNodeIsH1(node) || tidyNodeIsH2(node) ||
        tidyNodeIsH3(node) || tidyNodeIsH4(node) ||
        tidyNodeIsH5(node) || tidyNodeIsH6(node) ||
        tidyNodeIsP(node) || tidyNodeIsDIV(node) ||
        tidyNodeIsPRE(node) || tidyNodeIsHR(node) ||
        tidyNodeIsUL(node) || tidyNodeIsOL(node) ||
        tidyNodeIsDL(node) || tidyNodeIsLI(node) ||
        tidyNodeIsDT(node) || tidyNodeIsDD(node) ||
        tidyNodeIsTH(node) || tidyNodeIsTD(node) ||
        tidyNodeIsFORM(node) || tidyNodeIsTABLE(node))
    {
      /* a block level element closes the paragraph collected so far */
      if (cbdatumsize(txt)) {
        src = CB_DATUMPTR(txt);
        rbuf = strchr(src, '&') ? est_html_raw_text(src) : NULL;
        if (rbuf) src = rbuf;
        /* keep the paragraph only if it has a visible byte; the byte is compared as
           unsigned so that text made of multi-byte UTF-8 characters is not discarded */
        for (cp = src; *cp; cp++) {
          if ((unsigned char)*cp > ' ') {
            est_doc_add_text(doc, src);
            txt_num++;
            break;
          }
        }
        free(rbuf);
        cbdatumsetsize(txt, 0);
      }

      node = est_node_go_deep(node);
    }
    else if (tidyNodeIsSTYLE(node) || tidyNodeIsSCRIPT(node)) {
      /* style sheets and scripts are skipped together with their content */
      node = est_node_go_next(node);
    }
    else if (tidyNodeIsText(node)) {
      TidyBuffer tbuf;
      tidyBufInit(&tbuf);
      tidyNodeGetText(tdoc, node, &tbuf);
      if (tbuf.bp) cbdatumcat(txt, (const char *)tbuf.bp, -1);
      tidyBufFree(&tbuf);

      node = est_node_go_next(node);
    }
    else {
      if (tidyNodeIsA(node)) {
        attr = tidyAttrGetHREF(node);
        if (attr) {
          href = tidyAttrValue(attr);
          if (href)
          {
            TidyNode ch = tidyGetChild(node);
            if (ch && tidyNodeIsText(ch)) {
              TidyBuffer tbuf;
              char *bp;
              int bl;
              /* the buffer must be initialized before tidy writes into it */
              tidyBufInit(&tbuf);
              tidyNodeGetText(tdoc, ch, &tbuf);
              bp = (char *)tbuf.bp;
              if (bp) {
                rbuf = (strchr(bp, '&')) ? est_html_raw_text(bp) : bp;
                bl = (int)strlen(rbuf);
                /* strip trailing white space without stepping in front of the string */
                while (bl > 0 && isspace((unsigned char)rbuf[bl-1])) {
                  rbuf[--bl] = '\0';
                }

                if (bl > 2) {
                  char b[32];
                  if (cbdatumsize(linkstr)) cbdatumcat(linkstr, " ", -1);
                  cbdatumcat(linkstr, rbuf, -1);
                  if (cbdatumsize(link)) cbdatumcat(link, " ", -1);
                  cbdatumcat(link, href, -1);
                  snprintf(b, sizeof(b), "%d,%d", txt_num, bl);
                  /* the separator depends on linkpos itself, so the list has no leading
                     space and stays parallel to "link" and "linkstr" */
                  if (cbdatumsize(linkpos)) cbdatumcat(linkpos, " ", -1);
                  cbdatumcat(linkpos, b, -1);
                }
                if (rbuf != bp) free(rbuf);
              }
              tidyBufFree(&tbuf);
            }
          }
        }
      }

      node = est_node_go_deep(node);
    }
  }

  /* whatever is left after the last block level element */
  if (cbdatumsize(txt)) {
    if (strchr(CB_DATUMPTR(txt), '&')) {
      rbuf = est_html_raw_text(CB_DATUMPTR(txt));
      est_doc_add_text(doc, rbuf);
      free(rbuf);
    }
    else {
      est_doc_add_text(doc, CB_DATUMPTR(txt));
    }
  }
  cbdatumclose(txt);

  est_doc_add_attr(doc, ESTDATTRTYPE, "text/html");
  sprintf(numbuf, "%d", size);
  est_doc_add_attr(doc, ESTDATTRSIZE, numbuf);

  if (cbdatumsize(link)) {
    est_doc_add_attr(doc, "link", CB_DATUMPTR(link));
    est_doc_add_attr(doc, "linkpos", CB_DATUMPTR(linkpos));
    est_doc_add_attr(doc, "linkstr", CB_DATUMPTR(linkstr));
  }
  cbdatumclose(link);
  cbdatumclose(linkpos);
  cbdatumclose(linkstr);

  /* release the parser state; the error buffer owns memory only if tidy wrote into it */
  if (errbuf.bp) tidyBufFree(&errbuf);
  tidyRelease(tdoc);
  free(nbuf);

  return doc;
}

#else

/* create a document object from HTML */
ESTDOC *est_doc_new_from_html(const char *buf, int size,
                              const char *charset, int plang, int bcheck)
{
  ESTDOC *doc;
  CBLIST *elems;
  CBMAP *attrs;
  CBDATUM *datum;
  const char *html, *elem, *next, *value, *name, *content;
  char *nbuf,*rbuf, *lbuf, numbuf[NUMBUFSIZ];
  int i, esiz;

  if(bcheck && est_check_binary(buf, size)) return NULL;

  doc = est_doc_new();

  nbuf = est_html_iconv(buf, size, plang, charset);
  html = (nbuf) ? nbuf : buf;

  datum = cbdatumopen("", 0);
  elems = cbxmlbreak(html, TRUE);
  for(i = 0; i < CB_LISTNUM(elems); i++){
    elem = CB_LISTVAL2(elems, i, &esiz);
    if(!(next = cblistval(elems, i + 1, NULL))) next = "";
    if(elem[0] == '<'){
      if(cbstrfwimatch(elem, "<html")){
        attrs = cbxmlattrs(elem);
        value = cbmapget(attrs, "lang", -1, NULL);
        if(!value) value = cbmapget(attrs, "Lang", -1, NULL);
        if(!value) value = cbmapget(attrs, "LANG", -1, NULL);
        if(!value) value = cbmapget(attrs, "xml:lang", -1, NULL);
        if(value && value[0] != '\0') est_doc_add_attr(doc, ESTDATTRLANG, value);
        cbmapclose(attrs);
      } else if(cbstrfwimatch(elem, "<meta")){
        attrs = cbxmlattrs(elem);
        name = cbmapget(attrs, "name", -1, NULL);
        if(!name) name = cbmapget(attrs, "Name", -1, NULL);
        if(!name) name = cbmapget(attrs, "NAME", -1, NULL);
        if(!name) name = cbmapget(attrs, "http-equiv", -1, NULL);
        if(!name) name = cbmapget(attrs, "Http-equiv", -1, NULL);
        if(!name) name = cbmapget(attrs, "Http-Equiv", -1, NULL);
        if(!name) name = cbmapget(attrs, "HTTP-EQUIV", -1, NULL);
        content = cbmapget(attrs, "content", -1, NULL);
        if(!content) content = cbmapget(attrs, "Content", -1, NULL);
        if(!content) content = cbmapget(attrs, "CONTENT", -1, NULL);
        if(name && content){
          lbuf = cbmemdup(name, -1);
          cbstrtolower(lbuf);
          cbstrsqzspc(lbuf);
          if(!strcmp(lbuf, "author")){
            if(strchr(content, '&')){
              rbuf = est_html_raw_text(content);
              est_doc_add_attr(doc, ESTDATTRAUTHOR, rbuf);
              free(rbuf);
            } else {
              est_doc_add_attr(doc, ESTDATTRAUTHOR, content);
            }
          }
          if(name[0] != '@'){
            if(strchr(content, '&')){
              rbuf = est_html_raw_text(content);
              est_doc_add_attr(doc, lbuf, rbuf);
              free(rbuf);
            } else {
              est_doc_add_attr(doc, lbuf, content);
            }
          }
          free(lbuf);
        }
        cbmapclose(attrs);
      } else if(cbstrfwimatch(elem, "<title") && next[0] != '\0' && next[0] != '<'){
        if(strchr(next, '&')){
          rbuf = est_html_raw_text(next);
          est_doc_add_attr(doc, ESTDATTRTITLE, rbuf);
          est_doc_add_hidden_text(doc, rbuf);
          free(rbuf);
        } else {
          est_doc_add_attr(doc, ESTDATTRTITLE, next);
          est_doc_add_hidden_text(doc, next);
        }
        i++;
      } else if(cbstrfwimatch(elem, "<style") || cbstrfwimatch(elem, "<script")){
        i++;
      } else if(cbstrfwimatch(elem, "<h1") || cbstrfwimatch(elem, "<h2") ||
                cbstrfwimatch(elem, "<h3") || cbstrfwimatch(elem, "<h4") ||
                cbstrfwimatch(elem, "<h5") || cbstrfwimatch(elem, "<h6") ||
                cbstrfwimatch(elem, "<p>") || cbstrfwimatch(elem, "<p ") ||
                cbstrfwimatch(elem, "<div") || cbstrfwimatch(elem, "<hr") ||
                cbstrfwimatch(elem, "<ul") || cbstrfwimatch(elem, "<ol") ||
                cbstrfwimatch(elem, "<dl") || cbstrfwimatch(elem, "<li") ||
                cbstrfwimatch(elem, "<dt") || cbstrfwimatch(elem, "<dd") ||
                cbstrfwimatch(elem, "<th") || cbstrfwimatch(elem, "<td") ||
                cbstrfwimatch(elem, "<pre")){
        if(strchr(CB_DATUMPTR(datum), '&')){
          rbuf = est_html_raw_text(CB_DATUMPTR(datum));
          est_doc_add_text(doc, rbuf);
          free(rbuf);
        } else {
          est_doc_add_text(doc, CB_DATUMPTR(datum));
        }
        cbdatumsetsize(datum, 0);
      }
    } else {
      cbdatumcat(datum, " ", -1);
      cbdatumcat(datum, elem, esiz);
    }
  }
  cblistclose(elems);
  if(strchr(CB_DATUMPTR(datum), '&')){
    rbuf = est_html_raw_text(CB_DATUMPTR(datum));
    est_doc_add_text(doc, rbuf);
    free(rbuf);
  } else {
    est_doc_add_text(doc, CB_DATUMPTR(datum));
  }
  cbdatumclose(datum);
  if(nbuf) free(nbuf);
  est_doc_add_attr(doc, ESTDATTRTYPE, "text/html");
  sprintf(numbuf, "%d", size);
  est_doc_add_attr(doc, ESTDATTRSIZE, numbuf);
  return doc;
}

#endif /* HAVE_LIBTIDY */

/* enumeration for file formats */
enum {
  FF_AUTO  = 0,                          /* automatic detection */
  FF_DRAFT = 1,                          /* draft */
  FF_TEXT  = 2,                          /* plain text */
  FF_HTML  = 3,                          /* HTML */
  FF_MIME  = 4,                          /* MIME */
  FF_NONE  = 5                           /* ignored */
};

/* size of a buffer for a number (same value as the definition near the top of the file) */
#define NUMBUFSIZ      32
/* size of a buffer for an URI */
#define URIBUFSIZ      8192

/* path separator character: backslash for MSVC and MinGW builds, slash otherwise */
#if !defined(_SYS_MSVC_) && !defined(_SYS_MINGW_)
#define ESTPATHCHR '/'
#else
#define ESTPATHCHR '\\'
#endif

/* create a document object from draft data in another encoding
   `buf' specifies the NUL-terminated draft data and `size' its size.
   `enc' specifies the encoding name of the data, or NULL to use the data as it is.
   The return value is the result of `est_doc_new_from_draft', applied to the UTF-8 conversion
   of the data if `enc' is given and the conversion succeeds, else to the data itself. */
static ESTDOC *est_doc_new_from_draft_enc(const char *buf, int size,
                                          const char *enc)
{
  ESTDOC *doc;
  char *utf8;
  assert(buf);
  utf8 = enc ? est_iconv(buf, size, enc, "UTF-8", NULL, NULL) : NULL;
  if(utf8 == NULL) return est_doc_new_from_draft(buf);
  doc = est_doc_new_from_draft(utf8);
  free(utf8);
  return doc;
}

/* create a document object with an outer command
   `buf' specifies the source data and `size' its size.
   `uri' specifies the URI of the source; it is exported to the command in the environment
   variable ESTORIGFILE and its suffix is used for the name of the input file.
   `xcmd' specifies the command line prefix.  A leading "T@" means that the command writes
   plain text, a leading "H@" means HTML, otherwise the output is taken as a document draft.
   `tmpdir' specifies the directory for the temporary input and output files.
   `penc' and `plang' are passed on to the parser of the command output.
   The command is run by `system' as `xcmd "input" "output"'.  The return value is the document
   object built from the output file (an empty document if the file cannot be read), or NULL
   if a path or the command line does not fit its buffer, the input file cannot be written,
   or the command exits with a non-zero status.  Both temporary files are removed on all
   paths after they have been created.
   The environment string lives in a static buffer because `putenv' keeps the pointer it is
   given; the function is therefore not reentrant. */
ESTDOC *est_doc_new_with_xcmd(const char *buf, int size, const char *uri,
                              const char *xcmd, const char *tmpdir,
                              const char *penc, int plang)
{
  static char ebuf[URIBUFSIZ];
  ESTDOC *doc;
  const char *pv, *ext;
  char iname[URIBUFSIZ], oname[URIBUFSIZ], cmd[URIBUFSIZ * 3];
  char sext[NUMBUFSIZ], *rbuf, numbuf[NUMBUFSIZ];
  int fmt, rsiz, len, i;
  assert(buf && size >= 0 && uri && xcmd && tmpdir);
  len = snprintf(ebuf, sizeof(ebuf), "ESTORIGFILE=%s", uri);
  if(len < 0 || len >= (int)sizeof(ebuf)) return NULL;
  ext = NULL;
  if((pv = strrchr(uri, '/')) != NULL) uri = pv;
  if((pv = strrchr(uri, '.')) != NULL) ext = pv;
  if(!ext) ext = "";
  /* the suffix ends up inside a shell command line, so only the dot and the alphanumeric
     characters following it are copied into the file name */
  i = 0;
  if(ext[0] == '.'){
    sext[i++] = '.';
    while(i < (int)sizeof(sext) - 1 && isalnum((unsigned char)ext[i])){
      sext[i] = ext[i];
      i++;
    }
  }
  sext[i] = '\0';
  len = snprintf(iname, sizeof(iname), "%s%cxcmd-in-%08d%s",
                 tmpdir, ESTPATHCHR, (int)getpid(), sext);
  if(len < 0 || len >= (int)sizeof(iname)) return NULL;
  len = snprintf(oname, sizeof(oname), "%s%cxcmd-out-%08d%cest",
                 tmpdir, ESTPATHCHR, (int)getpid(), '.');
  if(len < 0 || len >= (int)sizeof(oname)) return NULL;
  fmt = FF_DRAFT;
  if(cbstrfwmatch(xcmd, "T@")){
    fmt = FF_TEXT;
    xcmd += 2;
  } else if(cbstrfwmatch(xcmd, "H@")){
    fmt = FF_HTML;
    xcmd += 2;
  }
  len = snprintf(cmd, sizeof(cmd), "%s \"%s\" \"%s\"", xcmd, iname, oname);
  if(len < 0 || len >= (int)sizeof(cmd)) return NULL;
  if(!cbwritefile(iname, buf, size)){
    /* nothing for the command to read */
    unlink(iname);
    return NULL;
  }
  putenv(ebuf);
  if(system(cmd) != 0){
    /* failure: do not leave the temporary files behind */
    unlink(oname);
    unlink(iname);
    return NULL;
  }
  if((rbuf = cbreadfile(oname, &rsiz)) != NULL){
    switch(fmt){
    case FF_TEXT:
      doc = est_doc_new_from_text(rbuf, rsiz, penc, plang, FALSE);
      break;
    case FF_HTML:
      doc = est_doc_new_from_html(rbuf, rsiz, penc, plang, FALSE);
      break;
    default:
      doc = est_doc_new_from_draft_enc(rbuf, rsiz, penc);
      break;
    }
    free(rbuf);
  } else {
    doc = est_doc_new();
  }

  /* for text and HTML output, size and type describe the original source, not the output */
  if(doc && fmt != FF_DRAFT){
    sprintf(numbuf, "%d", size);
    est_doc_add_attr(doc, ESTDATTRSIZE, numbuf);
    est_doc_add_attr(doc, ESTDATTRTYPE, est_ext_type(ext));
  }
  unlink(oname);
  unlink(iname);
  return doc;
}
