/*************************************************************************************************
 * Implementation of kakasi features
 *                                                      Copyright (C) 2003-2004 Mikio Hirabayashi
 * This file is part of Estraier, a personal full-text search system.
 * Estraier is free software; you can redistribute it and/or modify it under the terms of the GNU
 * General Public License as published by the Free Software Foundation; either version 2 of the
 * License, or any later version.
 * Estraier is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License along with Estraier;
 * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 * MA 02111-1307 USA.
 *************************************************************************************************/

/* Note that the encoding of this file should be EUC-JP. */


#include "estcommon.h"



#if defined(MYKAKASI)
/*************************************************************************************************
 * implementations
 *************************************************************************************************/


#include <libkakasi.h>


/* Forward declarations of the file-local helpers defined later in this file. */
static void estdocaddtextkakasi_impl(ODDOC *doc, const char *text, int mode);
static int estisstopwordja(const char *word);


/* Flag telling whether Kakasi support is compiled in; it is true in this `MYKAKASI' branch. */
int estiskakasi = TRUE;
/* Function pointer through which the Kakasi word registration is called; in this branch it is
   bound to the file-local implementation `estdocaddtextkakasi_impl'. */
void (*estdocaddtextkakasi)(ODDOC *, const char *, int) = estdocaddtextkakasi_impl;


/* Break a Japanese text into words and register them to a document handle, using Kakasi.
   `doc' specifies the document handle which the words are added to.
   `text' specifies the text to be analyzed.  Kakasi is configured with the `-ieuc' option and
   each resulting token is converted from EUC-JP to UTF-8, so the text is expected to be EUC-JP.
   `mode' specifies which forms are registered: `ESTDOCBOTH' for both of the normalized form and
   the as-is form, `ESTDOCNONLY' for the normalized form only, `ESTDOCAONLY' for the as-is form
   only.  Any other value registers nothing.
   Nothing is registered if Kakasi can not be configured or yields no result.  A token which
   can not be converted into UTF-8 is skipped. */
static void estdocaddtextkakasi_impl(ODDOC *doc, const char *text, int mode){
  static int first = TRUE;
  char *kakasiargs[] = { "kakasi", "-ieuc", "-w", NULL };
  CBLIST *words;
  const char *word, *asis, *normal;
  char *res, *sel;
  int i, wsiz;
  /* Kakasi is configured on the first successful call only; a failed setup leaves `first'
     untouched, so the configuration is attempted again on the next call */
  if(first){
    if(kakasi_getopt_argv(3, kakasiargs) != 0) return;
    first = FALSE;
  }
  res = kakasi_do((char *)text);
  /* guard against a missing result before looking at its first byte.
     NOTE(review): an empty result is left unreleased exactly as before; confirm against
     libkakasi whether `kakasi_do' hands out a heap buffer in that case before adding a
     `free' here, because releasing a non-heap string would crash */
  if(!res || res[0] == '\0') return;
  /* the result is handled as a list of tokens separated by spaces */
  words = cbsplit(res, -1, " ");
  for(i = 0; i < cblistnum(words); i++){
    word = cblistval(words, i, &wsiz);
    sel = cbiconv(word, wsiz, "EUC-JP", "UTF-8", NULL, NULL);
    /* `cbiconv' gives NULL if the conversion fails; skip the token instead of handing NULL
       pointers to `oddocaddword' */
    if(!sel) continue;
    normal = sel;
    asis = sel;
    /* the stop word check uses the token before conversion; a stop word keeps its as-is form
       but gets an empty normalized form */
    if(!ESTISNOSTOPW && estisstopwordja(word)) normal = "";
    switch(mode){
    case ESTDOCBOTH:
      oddocaddword(doc, normal, asis);
      break;
    case ESTDOCNONLY:
      oddocaddword(doc, normal, "");
      break;
    case ESTDOCAONLY:
      oddocaddword(doc, "", asis);
      break;
    default:
      /* unknown modes register nothing */
      break;
    }
    free(sel);
  }
  cblistclose(words);
  free(res);
  return;
}


/* Check whether a word is a Japanese stop word or not.
   `word' specifies the word to be checked.  It must not be NULL.  It is looked up byte for
   byte in the table below, so it has to be in the same encoding as the literals of this file
   (the note at the top of this file says EUC-JP; the caller in this file passes the token
   produced by Kakasi before it is converted into UTF-8).
   The return value is true if the word is a stop word or is longer than `ESTWMAXLEN' bytes,
   else, it is false. */
static int estisstopwordja(const char *word){
  /* lookup map built from the table on the first call and reused afterwards */
  static CBMAP *map = NULL;
  /* table of stop words terminated by NULL.
     NOTE(review): most literals below are empty or look like residue of a character encoding
     conversion; the table was presumably written in EUC-JP and should be restored from a copy
     of this file in that encoding -- confirm before relying on it. */
  char *stopwords[] = {
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "",
    "", "", "", "", "", "",
    "", "", "", "ˤ", "ˤ", "ˤ",
    "Ҥ", "Ҥ", "Ҥ", "ߤ", "ߤ", "ߤ",
    "", "", "", "", "", "",
    "", "", "", "Ӥ", "Ӥ", "Ӥ",
    "Ԥ", "Ԥ", "Ԥ",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "",
    "", "", "", "", "", "",
    "", "", "", "˥", "˥", "˥",
    "ҥ", "ҥ", "ҥ", "ߥ", "ߥ", "ߥ",
    "", "", "", "", "", "",
    "", "", "", "ӥ", "ӥ", "ӥ",
    "ԥ", "ԥ", "ԥ",
    "", "", "", "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "", "",
    NULL
  };
  int i, len;
  assert(word);
  if(!map){
    /* the map is opened with twice the number of table slots (the NULL terminator included) */
    map = cbmapopenex((sizeof(stopwords) / sizeof(char *)) * 2);
    /* hand the map over to Cabin together with `cbmapclose' as its destructor; presumably it
       is released at process exit -- confirm against the Cabin API */
    cbglobalgc(map, (void (*)(void *))cbmapclose);
    /* every stop word becomes a key with an empty value; only the existence of the key
       matters for the lookup below */
    for(i = 0; stopwords[i]; i++){
      cbmapput(map, stopwords[i], -1, "", 0, TRUE);
    }
  }
  len = strlen(word);
  /* words longer than the limit are treated as stop words without consulting the map */
  if(len > ESTWMAXLEN) return TRUE;
  return cbmapget(map, word, len, NULL) != NULL;
}



#else
/*************************************************************************************************
 * dummy interfaces
 *************************************************************************************************/


/* Kakasi support is not compiled in (`MYKAKASI' is not defined), so the flag is false. */
int estiskakasi = FALSE;
/* No implementation exists in this branch; the function pointer is NULL and must not be
   called. */
void (*estdocaddtextkakasi)(ODDOC *, const char *, int) = NULL;



#endif



/* END OF FILE */
