# Copyright (C) 2005  Network Applied Communication Laboratory Co., Ltd.
#
# This file is part of Rast.
# See the file COPYING for redistribution information.
#

# -*- mode: Ruby; coding: euc-japan; -*-

require "test/unit"
require "fileutils"

require "rast_test"
require "bdb"
require "test-utility"
require "rast/database-generatable"

module Rast
  class TextIndexTest < Test::Unit::TestCase
    include DatabaseGeneratable

    # An index opened with the default flags must accept a registration,
    # while the same index opened with Rast::DB::RDONLY must raise RastError
    # when asked to register a document.
    def test_initialize
      name = generate_text_index_name

      idx = TextIndex.new(name)
      assert_nothing_raised { idx.register(1, "foo") }

      idx = TextIndex.new(name, Rast::DB::RDONLY)
      assert_raise(RastError) { idx.register(1, "foo") }
    end

    # Runs the registration scenarios one after another, in this order:
    # basic layout checks, free list checks, custom block size checks.
    def test_register
      [
        :register_test,
        :register_test_free_list,
        :register_test_set_block_size
      ].each { |scenario| __send__(scenario) }
    end

    # Registers a series of documents into a single text index and, after
    # each close/sync, re-opens the underlying files through open_db to check
    # what was written: the rare n-gram DB, the n-gram DB plus position file,
    # and the free list file.
    #
    # Note that check_free_list bumps @expected_free_list_version on every
    # call, so the number and order of the checks below matter.
    def register_test
      doc_id_1 = 0
      text_data = "abcdefdef"

      index_name = generate_text_index_name
      index = TextIndex.new(index_name)
      index.register(doc_id_1, text_data)
      index.close

      # "def" occurs twice in the text and is expected in the n-gram DB;
      # "abc", "bcd", "ef" and "f" occur once and are expected in the rare
      # n-gram DB.  No free list entries are expected.
      open_db do |rare_ngram_db, ngram_db, pos_file, free_list_file|
        check_rare_ngram_db(rare_ngram_db, "abc", doc_id_1, 0)
        check_rare_ngram_db(rare_ngram_db, "bcd", doc_id_1, 1)
        check_ngram_db(ngram_db, pos_file, "def",
                       PositionInfo.new(doc_id_1, [3, 6]))
        check_rare_ngram_db(rare_ngram_db, "ef", doc_id_1, 7)
        check_rare_ngram_db(rare_ngram_db, "f", doc_id_1, 8)
        check_free_list(free_list_file)
      end

      doc_id_2 = 1
      text_data = "1234123"

      # Re-open the closed index; this time the data is flushed with sync
      # and the index object is kept open for the next registration.
      index = TextIndex.new(index_name)
      index.register(doc_id_2, text_data)
      index.sync

      # "123" occurs twice (positions 0 and 4); the other n-grams checked
      # here occur once and are expected in the rare n-gram DB.
      open_db do |rare_ngram_db, ngram_db, pos_file, free_list_file|
        check_ngram_db(ngram_db, pos_file, "123",
                       PositionInfo.new(doc_id_2, [0, 4]))
        check_rare_ngram_db(rare_ngram_db, "234", doc_id_2, 1)
        check_rare_ngram_db(rare_ngram_db, "341", doc_id_2, 2)
        check_rare_ngram_db(rare_ngram_db, "23", doc_id_2, 5)
        check_rare_ngram_db(rare_ngram_db, "3", doc_id_2, 6)
        check_free_list(free_list_file)
      end

      doc_id_3 = 2
      text_data = "01234"

      # Registered through the same, still open, index object.
      index.register(doc_id_3, text_data)
      index.sync

      # "123" and "234" now appear in two documents.  Both are expected in
      # the n-gram DB with one entry per document, and "234" is expected to
      # be gone from the rare n-gram DB.
      open_db do |rare_ngram_db, ngram_db, pos_file, free_list_file|
        check_rare_ngram_db(rare_ngram_db, "012", doc_id_3, 0)
        check_ngram_db(ngram_db, pos_file, "123",
                       PositionInfo.new(doc_id_2, [0, 4]),
                       PositionInfo.new(doc_id_3, [1]))
        check_ngram_db(ngram_db, pos_file, "234",
                       PositionInfo.new(doc_id_2, [1]),
                       PositionInfo.new(doc_id_3, [2]))
        assert_equal(false, rare_ngram_db.key?("234"))
        check_free_list(free_list_file)
      end

      doc_id_4 = 3
      text_data = "x" * (2 + 2)

      # A new index object is opened here; the previous one was only synced,
      # not closed.
      index = TextIndex.new(index_name)
      index.register(doc_id_4, text_data)
      index.close

      # "xxxx": "xxx" occurs at positions 0 and 1; "xx" and "x" occur once
      # each and are expected in the rare n-gram DB.
      open_db do |rare_ngram_db, ngram_db, pos_file, free_list_file|
        check_ngram_db(ngram_db, pos_file, "xxx",
                       PositionInfo.new(doc_id_4, (0..1).to_a))
        check_rare_ngram_db(rare_ngram_db, "xx", doc_id_4, 2)
        check_rare_ngram_db(rare_ngram_db, "x", doc_id_4, 3)
        check_free_list(free_list_file)
      end

      doc_id_5 = 4
      text_data = "x" * (512 + 2)

      index = TextIndex.new(index_name)
      index.register(doc_id_5, text_data)
      index.close

      # 514 "x" characters give 512 positions for "xxx".  "xx" and "x" are
      # now present in two documents, so they are expected in the n-gram DB
      # and no longer in the rare n-gram DB.  The free list is expected to
      # hold a single entry (block_no 3, block_count 1).
      open_db do |rare_ngram_db, ngram_db, pos_file, free_list_file|
        check_ngram_db(ngram_db, pos_file, "xxx",
                       PositionInfo.new(doc_id_4, (0..1).to_a),
                       PositionInfo.new(doc_id_5, (0..511).to_a))
        check_ngram_db(ngram_db, pos_file, "xx",
                       PositionInfo.new(doc_id_4, [2]),
                       PositionInfo.new(doc_id_5, [512]))
        check_ngram_db(ngram_db, pos_file, "x",
                       PositionInfo.new(doc_id_4, [3]),
                       PositionInfo.new(doc_id_5, [513]))
        assert_equal(false, rare_ngram_db.key?("xx"))
        assert_equal(false, rare_ngram_db.key?("x"))
        check_free_list(free_list_file, FreeListEntry.new(3, 1))
      end

      # Open and close without registering anything: the free list entry is
      # expected to be unchanged, while check_free_list still expects the
      # stored version to have advanced by one.
      index = TextIndex.new(index_name)
      index.close
      open_db do |rare_ngram_db, ngram_db, pos_file, free_list_file|
        check_free_list(free_list_file, FreeListEntry.new(3, 1))
      end

      # Document id 5 is skipped; the variable number and the id differ.
      doc_id_6 = 6
      text_data = "yyyy"

      index = TextIndex.new(index_name)
      index.register(doc_id_6, text_data)
      index.close

      # The "x" n-grams are expected to be unchanged.  "yyy" is expected in
      # the n-gram DB, "yy" and "y" in the rare n-gram DB, and the free list
      # is expected to be empty again.
      open_db do |rare_ngram_db, ngram_db, pos_file, free_list_file|
        check_ngram_db(ngram_db, pos_file, "xxx",
                       PositionInfo.new(doc_id_4, (0..1).to_a),
                       PositionInfo.new(doc_id_5, (0..511).to_a))
        check_ngram_db(ngram_db, pos_file, "xx",
                       PositionInfo.new(doc_id_4, [2]),
                       PositionInfo.new(doc_id_5, [512]))
        check_ngram_db(ngram_db, pos_file, "x",
                       PositionInfo.new(doc_id_4, [3]),
                       PositionInfo.new(doc_id_5, [513]))

        check_ngram_db(ngram_db, pos_file, "yyy",
                       PositionInfo.new(doc_id_6, [0, 1]))
        check_rare_ngram_db(rare_ngram_db, "yy", doc_id_6, 2)
        check_rare_ngram_db(rare_ngram_db, "y", doc_id_6, 3)
        check_free_list(free_list_file)
      end
    end

    # Follows the free list file while three documents are registered through
    # one index object: no entries after the first long document, the entry
    # (0, 2) after the second, and (1, 1) after a short third document.
    def register_test_free_list
      index = TextIndex.new(generate_text_index_name)

      # Original author's note on the text size: (512: 896bytes < 2blocks)
      index.register(5, "x" * (512 + 2))
      index.sync
      open_db do |_rare_db, _ngrams, _positions, free_list|
        check_free_list(free_list)
      end

      # Same size again, same note: (512: 896bytes < 2blocks)
      index.register(6, "x" * (512 + 2))
      index.sync
      open_db do |_rare_db, _ngrams, _positions, free_list|
        check_free_list(free_list, FreeListEntry.new(0, 2))
      end

      # The last document is flushed by closing the index instead of syncing.
      index.register(7, "yyyy")
      index.close
      open_db do |_rare_db, _ngrams, _positions, free_list|
        check_free_list(free_list, FreeListEntry.new(1, 1))
      end
    end

    # Creates an index with 256 as the third constructor argument and checks
    # the n-gram records using 256 as the position file block size, then
    # registers a long document and expects the free list entry (0, 1).
    def register_test_set_block_size
      block_size = 256
      name = generate_text_index_name

      first_id = 1
      index = TextIndex.new(name, Rast::DB::RDWR, block_size)
      index.register(first_id, "aaaabbbb")
      index.close

      open_db do |_rare_db, ngrams, positions, free_list|
        check_ngram_db_with_block_size(ngrams, positions, "aaa", block_size,
                                       PositionInfo.new(first_id, [0, 1]))
        check_ngram_db_with_block_size(ngrams, positions, "bbb", block_size,
                                       PositionInfo.new(first_id, [4, 5]))
        check_free_list(free_list)
      end

      second_id = 2
      index = TextIndex.new(name, Rast::DB::RDWR, block_size)
      index.register(second_id, "a" * (256 + 2))
      index.close

      open_db do |_rare_db, ngrams, positions, free_list|
        check_ngram_db_with_block_size(ngrams, positions, "aaa", block_size,
                                       PositionInfo.new(first_id, [0, 1]),
                                       PositionInfo.new(second_id,
                                                        (0..255).to_a))
        check_free_list(free_list, FreeListEntry.new(0, 1))
      end
    end

    # Runs the search scenarios one after another, in this order: simple
    # queries, repeated n-grams, miscellaneous cases, searching before sync.
    def test_search
      [
        :search_test_simple,
        :search_test_same_ngram,
        :search_test_other,
        :search_test_without_sync
      ].each { |scenario| __send__(scenario) }
    end

    # Registers eight documents (ids 0..7), re-opens the index and checks the
    # terms and candidates returned for a range of queries.
    def search_test_simple
      name = generate_text_index_name
      index = TextIndex.new(name)
      # The position in this list is the document id.
      [
        "abcdef",
        "あいうえお",
        "defgh",
        "abc bcd",
        "abcd ef",
        "abcd bcdef cde",
        "abcabc",
        "foo tcl/tk8 bar"
      ].each_with_index do |text, doc_id|
        index.register(doc_id, text)
      end
      index.close

      index = TextIndex.new(name)

      # Checks the first term of a result (text and document count) and that
      # it is the only term.
      check_term = lambda do |found, text, doc_count|
        assert_equal(text, found.terms[0].term)
        assert_equal(doc_count, found.terms[0].doc_count)
        assert_equal(1, found.terms.length)
      end
      # Checks the candidate at +at+: its document id, and the count and pos
      # of its first term.
      check_hit = lambda do |found, at, doc_id, count, pos|
        assert_equal(doc_id, found.candidates[at].doc_id)
        assert_equal(count, found.candidates[at].terms[0].count)
        assert_equal(pos, found.candidates[at].terms[0].pos)
      end
      # Checks the document ids of the candidates in order, then their number.
      check_doc_ids = lambda do |found, *doc_ids|
        doc_ids.each_with_index do |doc_id, at|
          assert_equal(doc_id, found.candidates[at].doc_id)
        end
        assert_equal(doc_ids.length, found.candidates.length)
      end

      found = index.search("")
      check_term.call(found, "", 0)
      assert_equal(0, found.candidates.length)

      found = index.search("cde")
      check_term.call(found, "cde", 2)
      check_hit.call(found, 0, 0, 1, 2)
      check_hit.call(found, 1, 5, 2, 6)
      assert_equal(2, found.candidates.length)

      found = index.search("def")
      check_term.call(found, "def", 3)
      check_hit.call(found, 0, 0, 1, 3)
      check_hit.call(found, 1, 2, 1, 0)
      check_hit.call(found, 2, 5, 1, 7)
      assert_equal(3, found.candidates.length)

      found = index.search("de")
      check_term.call(found, "de", 3)
      check_hit.call(found, 0, 0, 1, 3)
      check_hit.call(found, 1, 2, 1, 0)
      check_hit.call(found, 2, 5, 2, 7)
      assert_equal(3, found.candidates.length)

      check_doc_ids.call(index.search("いうえ"), 1)
      check_doc_ids.call(index.search("cdef"), 0, 5)
      check_doc_ids.call(index.search("abcd"), 0, 4, 5)
      check_doc_ids.call(index.search("abcdef"), 0)
      check_doc_ids.call(index.search("xyz"))
      check_doc_ids.call(index.search("abc"), 0, 3, 4, 5, 6)

      found = index.search("い")
      check_term.call(found, "い", 1)
      check_hit.call(found, 0, 1, 1, 1)
      assert_equal(1, found.candidates.length)

      found = index.search("tcl/tk")
      check_term.call(found, "tcl/tk", 1)
      check_hit.call(found, 0, 7, 1, 4)
      assert_equal(1, found.candidates.length)

      found = index.search("not-found-string")
      check_term.call(found, "not-found-string", 0)
      assert_equal(0, found.candidates.length)

      index.close
    end

    # Registers two documents whose texts contain runs of one repeated
    # character and checks that searching for each full text returns exactly
    # its own document, once, at position 0.
    def search_test_same_ngram
      documents = [[1, "DE    F g"], [2, "100000"]]

      name = generate_text_index_name
      index = TextIndex.new(name)
      documents.each { |doc_id, text| index.register(doc_id, text) }
      index.close

      index = TextIndex.new(name)
      documents.each do |doc_id, text|
        found = index.search(text)
        assert_equal(text, found.terms[0].term)
        assert_equal(1, found.terms[0].doc_count)
        assert_equal(1, found.terms.length)
        assert_equal(doc_id, found.candidates[0].doc_id)
        assert_equal(1, found.candidates[0].terms[0].count)
        assert_equal(0, found.candidates[0].terms[0].pos)
        assert_equal(1, found.candidates.length)
      end
      index.close
    end

    # For each case, builds a fresh single-document index (document id 0),
    # re-opens it and checks the number of candidates found for the query.
    def search_test_other
      [
        # registered text,    query,    expected number of candidates
        [" bcd abc",          "abcd",   0],
        ["cde abcde abcdef",  "abcdef", 1],
        ["abc bcd",           "abcd",   0]
      ].each do |text, query, expected_count|
        name = generate_text_index_name
        index = TextIndex.new(name)
        index.register(0, text)
        index.close

        index = TextIndex.new(name)
        found = index.search(query)
        assert_equal(expected_count, found.candidates.length)
        index.close
      end
    end

    # Checks that documents are found by a search issued on the same index
    # object right after register, both before and after a sync.
    def search_test_without_sync
      index = TextIndex.new(generate_text_index_name)
      hit_count = lambda { |query| index.search(query).candidates.length }

      # Nothing registered yet.
      assert_equal(0, hit_count.call("abc"))

      index.register(0, "abcdef")
      assert_equal(1, hit_count.call("abc"))
      assert_equal(1, hit_count.call("a"))

      index.sync

      index.register(1, "abc")
      assert_equal(2, hit_count.call("abc"))
      assert_equal(2, hit_count.call("a"))

      index.close
    end

    # Builds an index with documents 1, 2 and 3, runs TextIndex.optimize into
    # a second index with the mapping {1 => 1, 3 => 2}, and checks that the
    # new index finds the text of old document 1 under id 1, the text of old
    # document 3 under id 2, and no longer finds the text of old document 2.
    def test_optimize
      source_name = generate_text_index_name
      target_name = generate_text_index_name

      index = TextIndex.new(source_name)
      [[1, "abcd"], [2, "efgh"], [3, "ijkl"]].each do |doc_id, unit|
        index.register(doc_id, unit * 2)
      end
      index.close

      index = TextIndex.new(source_name)
      [
        ["abc", 1], ["efg", 2], ["ijk", 3],
        ["cd", 1], ["gh", 2], ["kl", 3]
      ].each do |query, doc_id|
        found = index.search(query)
        assert_equal(doc_id, found.candidates[0].doc_id)
        assert_equal(1, found.candidates.length)
      end
      index.close

      TextIndex.optimize(source_name, target_name, {1 => 1, 3 => 2})

      index = TextIndex.new(target_name)
      # A nil document id means the query is expected to find nothing.
      [
        ["abc", 1], ["efg", nil], ["ijk", 2],
        ["cd", 1], ["gh", nil], ["kl", 2]
      ].each do |query, doc_id|
        found = index.search(query)
        if doc_id.nil?
          assert_equal(0, found.candidates.length)
        else
          assert_equal(doc_id, found.candidates[0].doc_id)
          assert_equal(1, found.candidates.length)
        end
      end
      index.close
    end

    # Everything below is a helper for the tests above.
    private

    # Expected postings of one document for an n-gram: the document id and
    # the list of positions that the check_* helpers compare with the data
    # read back from the index files.
    PositionInfo = Struct.new(:doc_id, :positions)
    # Expected free list record: the two integers (block_no, block_count)
    # that check_free_list compares with the free list file.
    FreeListEntry = Struct.new(:block_no, :block_count)

    # Obtains a new database name from generate_db_name, remembers the names
    # of the four files derived from it (.rng, .ngm, .pos, .pfl) for open_db,
    # and resets the free list version that check_free_list expects to 1.
    #
    # Returns the base name, which is what TextIndex.new is given.
    def generate_text_index_name
      base = generate_db_name
      @rare_ngram_db_name = "#{base}.rng"
      @ngram_db_name = "#{base}.ngm"
      @pos_file_name = "#{base}.pos"
      @free_list_file_name = "#{base}.pfl"
      @expected_free_list_version = 1
      base
    end

    # Asserts that the record stored under +ngram+ in +db+ decodes to exactly
    # the given PositionInfo entries, in order.
    #
    # The record is read as one variable-length number giving a byte count,
    # followed by that many bytes which Rast::VNUM.unpack turns into a flat
    # list of (doc_id, number of positions, positions...) groups.
    #
    # db::             object answering [] with the raw record string
    # ngram::          key to look up
    # position_infos:: expected PositionInfo entries
    def check_ngram(db, ngram, *position_infos)
      s = db[ngram]
      assert_not_nil(s, "no record for n-gram #{ngram.inspect}")
      # The record is binary data, so the regexp is byte-wise (/n) and
      # anchored at the start: a number is any run of bytes >= 0x80 ended by
      # one byte < 0x80.
      nbytes_str = s.slice(/\A[\x80-\xFF]*[\x00-\x7F]/n)
      nbytes = Rast::VNUM.unpack(nbytes_str)[0]
      ary = Rast::VNUM.unpack(s[nbytes_str.length, nbytes])

      until ary.empty?
        doc_id = ary.shift
        num_positions = ary.shift
        positions = ary.slice!(0, num_positions)
        expected = position_infos.shift
        assert_equal(expected.doc_id, doc_id)
        assert_equal(expected.positions, positions)
      end
      # Every expected entry must have been consumed.
      assert_equal(true, position_infos.empty?)
    end

    # Asserts that the postings of +ngram+ match the given PositionInfo
    # entries, in order.
    #
    # The record in +db+ is unpacked as native unsigned integers
    # (block_no, block_count, data_nbytes, num_docs).  The postings are then
    # read from +file+ at offset block_no * block_size; each posting is two
    # variable-length numbers (doc_id and the byte length of its positions)
    # followed by the positions themselves.
    #
    # db::             object answering [] with the raw record string
    # file::           position file, must support seek and read
    # ngram::          key to look up
    # block_size::     size of one position file block, in bytes
    # position_infos:: expected PositionInfo entries
    def check_ngram_db_with_block_size(db, file, ngram, block_size,
                                       *position_infos)
      record = db[ngram]
      assert_not_nil(record, "no record for n-gram #{ngram.inspect}")
      block_no, block_count, data_nbytes, num_docs = record.unpack("I*")
      assert_equal(position_infos.length, num_docs)
      file.seek(block_no * block_size)
      s = file.read(data_nbytes)
      # read(n) answers nil when the file ends before the requested offset.
      assert_not_nil(s, "position file has no data for #{ngram.inspect}")
      until s.empty?
        # Binary data: match bytes (/n), two numbers at the start of s.
        doc_id, bytes =
          *Rast::VNUM.unpack(s.slice!(/\A(?:[\x80-\xFF]*[\x00-\x7F]){2}/n))
        positions = Rast::VNUM.unpack(s.slice!(0, bytes))
        expected = position_infos.shift
        assert_equal(expected.doc_id, doc_id)
        assert_equal(expected.positions, positions)
      end
      # Every expected entry must have been consumed.
      assert_equal(true, position_infos.empty?)
    end

    # Same check as check_ngram_db_with_block_size, using a block size of
    # 512 for the position file.
    def check_ngram_db(db, file, ngram, *position_infos)
      block_size = 512
      check_ngram_db_with_block_size(db, file, ngram, block_size,
                                     *position_infos)
    end

    # Asserts that the rare n-gram record stored under +ngram+ holds exactly
    # one document with exactly one position.
    #
    # The record is read as two variable-length numbers (doc_id and the byte
    # length of the positions) followed by the positions, which must make up
    # the whole rest of the record.
    #
    # db::                object answering [] with the raw record string
    # ngram::             key to look up
    # expected_doc_id::   document id the record must carry
    # expected_position:: the single position the record must carry
    def check_rare_ngram_db(db, ngram, expected_doc_id, expected_position)
      s = db[ngram]
      assert_not_nil(s, "no rare n-gram record for #{ngram.inspect}")
      # Binary data: match bytes (/n), two numbers at the start of s.
      doc_id, bytes =
        *Rast::VNUM.unpack(s.slice!(/\A(?:[\x80-\xFF]*[\x00-\x7F]){2}/n))
      # What is left after the two header numbers must be the positions.
      assert_equal(s.length, bytes)
      positions = Rast::VNUM.unpack(s.slice!(0, bytes))
      assert_equal(0, s.length)
      assert_equal(expected_doc_id, doc_id)
      assert_equal([expected_position], positions)
    end

    # Asserts the contents of the free list file.
    #
    # The whole file is unpacked as native unsigned integers: a version
    # number followed by (block_no, block_count) pairs, which must match
    # +expected_entries+ (FreeListEntry objects) exactly and in order.
    #
    # The version must equal @expected_free_list_version, which is then
    # incremented, so each call expects the file to carry the next version.
    # An empty file is accepted only when no entries are expected, and does
    # not advance the expected version.
    def check_free_list(file, *expected_entries)
      ary = file.read.unpack("I*")
      # String#unpack always answers an Array, so a file without any data
      # shows up as an empty array rather than nil.
      if ary.empty?
        assert_equal(true, expected_entries.empty?)
        return
      end
      assert_equal(@expected_free_list_version, ary.shift)
      @expected_free_list_version += 1
      expected_entries.each do |expected_entry|
        assert_equal(expected_entry.block_no, ary.shift)
        assert_equal(expected_entry.block_count, ary.shift)
      end
      # No unexpected entries may follow.
      assert_equal(true, ary.empty?)
    end

    # Opens the four files recorded by generate_text_index_name and yields
    # them to the block as (rare n-gram DB, n-gram DB, position file,
    # free list file).
    #
    # The two databases are opened with BDB::Btree.open.  The free list file
    # is opened read-only but with File::CREAT, so a missing file is created
    # empty instead of raising.  The File.open blocks close both plain files
    # when the block returns; BDB::Btree.open is presumably doing the same
    # for the databases (not visible here).
    def open_db
      free_list_flags = File::RDONLY | File::CREAT
      BDB::Btree.open(@rare_ngram_db_name, nil, 0) do |rare_db|
        BDB::Btree.open(@ngram_db_name, nil, 0) do |ngrams|
          File.open(@pos_file_name) do |positions|
            File.open(@free_list_file_name, free_list_flags) do |free_list|
              yield(rare_db, ngrams, positions, free_list)
            end
          end
        end
      end
    end
  end
end
