/*************************************************************************************************
 * Implementation of chasen features
 *                                                      Copyright (C) 2003-2004 Mikio Hirabayashi
 * This file is part of Estraier, a personal full-text search system.
 * Estraier is free software; you can redistribute it and/or modify it under the terms of the GNU
 * General Public License as published by the Free Software Foundation; either version 2 of the
 * License, or any later version.
 * Estraier is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License along with Estraier;
 * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 * MA 02111-1307 USA.
 *************************************************************************************************/

/* Note that the encoding of this file should be EUC-JP. */


#include "estcommon.h"



#if defined(MYCHASEN)
/*************************************************************************************************
 * implementations
 *************************************************************************************************/


#include <chasen.h>


/* Private functions of this file; both are defined below. */
static void estdocaddtextchasen_impl(ODDOC *doc, const char *text, int mode);
static int estisstopwordja(const char *word);


/* Flag for whether Chasen support is built in.  It is true in this branch, which is compiled
   only when `MYCHASEN' is defined. */
int estischasen = TRUE;
/* Pointer to the function that breaks a Japanese text into words and registers them to a
   document handle.  In this branch it is bound to the Chasen based implementation. */
void (*estdocaddtextchasen)(ODDOC *, const char *, int) = estdocaddtextchasen_impl;


/* Break a Japanese text into words and register them to a document handle, using Chasen.
   `doc' specifies a document handle.
   `text' specifies the text to be analyzed.  It is handed to Chasen as it is.
   `mode' specifies which forms are registered: `ESTDOCBOTH' for both of the normalized form
   and the as-is form, `ESTDOCNONLY' for the normalized form only, `ESTDOCAONLY' for the as-is
   form only.  Any other value registers nothing.
   Chasen is configured on the first call; if the configuration fails, nothing is registered
   and the configuration is tried again on the next call.  A word whose character encoding
   conversion fails is skipped. */
static void estdocaddtextchasen_impl(ODDOC *doc, const char *text, int mode){
  static int first = TRUE;
  /* one line per morpheme, made of three tab separated fields which are read below as the
     normalized form, the as-is form, and the part of speech */
  char *chasenargs[] = { "chasen", "-F", "%M\\t%m\\t%H\\n", NULL };
  /* parts of speech whose words are kept when stop word filtering is on; the literals are
     compared bytewise with the output of Chasen, so they must stay in the encoding of this
     file */
  char *signparts[] = { "̤θ", "̾", "ư", "ƻ", "ư", "",
                        "Ϣλ", "ư", "Ƭ", NULL };
  char *res, *pivot, *end, *normal, *asis, *part, *nsel, *asel;
  int i, sign;
  assert(doc && text);
  if(first){
    if(chasen_getopt_argv(chasenargs, stderr) != 0) return;
    first = FALSE;
  }
  /* NOTE(review): the result is never released here; presumably the buffer is owned by
     Chasen -- confirm against the Chasen API */
  if(!(res = chasen_sparse_tostr((char *)text))) return;
  /* walk the result line by line, cutting it in place; the step expression tests `end'
     before advancing so that no arithmetic is done on a null pointer after the last line */
  for(pivot = res; pivot; pivot = end ? end + 1 : NULL){
    if((end = strchr(pivot, '\n')) != NULL) *end = '\0';
    normal = pivot;
    /* lines without all of the three fields are ignored */
    if(!(asis = strchr(normal, '\t'))) continue;
    *asis++ = '\0';
    if(!(part = strchr(asis, '\t'))) continue;
    *part++ = '\0';
    /* fall back to the as-is form when the normalized form is empty */
    if(normal[0] == '\0') normal = asis;
    if(!ESTISNOSTOPW){
      sign = FALSE;
      for(i = 0; signparts[i]; i++){
        if(!strcmp(part, signparts[i])){
          sign = TRUE;
          break;
        }
      }
      /* a filtered word keeps its as-is form but gets an empty normalized form */
      if(!sign || estisstopwordja(normal)) normal = "";
    }
    switch(mode){
    case ESTDOCBOTH:
      nsel = cbiconv(normal, -1, "EUC-JP", "UTF-8", NULL, NULL);
      asel = cbiconv(asis, -1, "EUC-JP", "UTF-8", NULL, NULL);
      if(nsel && asel) oddocaddword(doc, nsel, asel);
      free(asel);
      free(nsel);
      break;
    case ESTDOCNONLY:
      nsel = cbiconv(normal, -1, "EUC-JP", "UTF-8", NULL, NULL);
      if(nsel) oddocaddword(doc, nsel, "");
      free(nsel);
      break;
    case ESTDOCAONLY:
      asel = cbiconv(asis, -1, "EUC-JP", "UTF-8", NULL, NULL);
      /* the conversion can fail; do not hand a null pointer to the document handle */
      if(asel) oddocaddword(doc, "", asel);
      free(asel);
      break;
    default:
      break;
    }
  }
}


/* Check whether a word is a Japanese stop word or not.
   `word' specifies a word in normalized form.
   The return value is true if the word is a stop word, else, it is false.
   A word longer than `ESTWMAXLEN' bytes is always treated as a stop word.  The lookup map is
   built on the first call and registered with `cbglobalgc' so that it is closed by the
   global garbage collector. */
static int estisstopwordja(const char *word){
  static CBMAP *map = NULL;
  /* The table is static so that it is not rebuilt on the stack on every call; it is read
     only once, when the map is filled.  The literals are in the encoding of this file and
     are compared bytewise with the words, so they must be kept as they are. */
  static const char *const stopwords[] = {
    "", "", "", "", "ʤ", "", "ʤ", "Ĥ", "", "",
    "", "Ĥ", "դ", "", "", "", "", "Ǥ", "ޤ", "",
    "Ǥ", "", "", "ߤ", "ߤ", "Ҥ", "", "", "", "Ƥ",
    "", "", "ʤ", "ޤ", "ޤ", "Ҥ",
    "ʤ", "", "褤", "褯", "褷", "", "", "", "",
    "", "", "Ȥ", "ۤ", "", "椨", "", "ɤ", "Ӥ", "",
    "", "", "", "", "", "", "ɤ", "ɤ",
    "", "", "", "ɤ", "", "", "", "ɤ",
    "", "", "ɤ", "", "", "", "",
    "", "", "", "" , "", "褦", "", "ɤ", "դ", "ޤ", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "",
    "", "", "", "", "", "",
    "", "", "", "ˤ", "ˤ", "ˤ",
    "Ҥ", "Ҥ", "Ҥ", "ߤ", "ߤ", "ߤ",
    "", "", "", "", "", "",
    "", "", "", "Ӥ", "Ӥ", "Ӥ",
    "Ԥ", "Ԥ", "Ԥ",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "",
    "", "", "", "", "", "",
    "", "", "", "˥", "˥", "˥",
    "ҥ", "ҥ", "ҥ", "ߥ", "ߥ", "ߥ",
    "", "", "", "", "", "",
    "", "", "", "ӥ", "ӥ", "ӥ",
    "ԥ", "ԥ", "ԥ",
    "", "", "", "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "", "",
    NULL
  };
  int i, len;
  assert(word);
  if(!map){
    /* about two buckets per entry of the table */
    map = cbmapopenex((sizeof(stopwords) / sizeof(stopwords[0])) * 2);
    cbglobalgc(map, (void (*)(void *))cbmapclose);
    /* only the keys matter; every value is empty */
    for(i = 0; stopwords[i]; i++){
      cbmapput(map, stopwords[i], -1, "", 0, TRUE);
    }
  }
  len = strlen(word);
  if(len > ESTWMAXLEN) return TRUE;
  return cbmapget(map, word, len, NULL) != NULL;
}



#else
/*************************************************************************************************
 * dummy interfaces
 *************************************************************************************************/


/* Flag for whether Chasen support is built in.  It is false in this branch, which is compiled
   when `MYCHASEN' is not defined. */
int estischasen = FALSE;
/* Pointer to the function that registers a Japanese text using Chasen.  No implementation
   exists in this build, so it is a null pointer.
   NOTE(review): presumably callers test `estischasen' or this pointer before calling it --
   confirm at the call sites. */
void (*estdocaddtextchasen)(ODDOC *, const char *, int) = NULL;



#endif



/* END OF FILE */
