/*
 * btnode.c - block/page management for NILFS B-Tree node
 *
 * Copyright (C) 2005 Nippon Telegraph and Telephone Corporation.
 *
 * This file is part of NILFS.
 *
 * NILFS is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * NILFS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with NILFS; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 *
 * btnode.c,v 1.33 2006/07/14 06:31:53 kihara Exp
 *
 * Written by Seiji Kihara <kihara@osrg.net>
 */

/* just for debugging use */
#undef KEEP_REFERED_NODE_BLK

/* blocksize < pagesize support, under construction */
#undef BTNODE_SMALLBLK_SUPPORT

#include "nilfs.h"
#define	NILFS_PAGE_GANG_LOOKUP_SIZE	16

/*
 * common node block functions (lower level)
 */

/* page 64 tree */

/*
 * Insert @page into the 64-bit keyed radix tree @rtree at @index.
 *
 * The radix-tree preload is performed before @rtreelock is taken so that
 * no allocation is needed while interrupts are disabled.  On a successful
 * insertion a page reference is taken on behalf of the tree,
 * page->mapping is set to @mapping, and page->index is set to @index
 * (or to 0 when NILFS_BTREE_PTR_IS_BH(index) is true).
 *
 * Returns 0 on success, otherwise the error code returned by
 * radix_tree_64_preload() or radix_tree_64_insert(); the page fields are
 * not modified on failure.
 */
static int
nilfs_add_to_page_64_tree(struct page *page, struct radix_tree_64_root *rtree,
			  spinlock_t *rtreelock, __u64 index,
			  struct address_space *mapping, gfp_t gfp_mask)
{
	unsigned long irqflags;
	int err;

	err = radix_tree_64_preload(gfp_mask & ~__GFP_HIGHMEM);
	if (err)
		return err;

	spin_lock_irqsave(rtreelock, irqflags);
	err = radix_tree_64_insert(rtree, index, page);
	if (!err) {
		page_cache_get(page);
		page->mapping = mapping;
		/* entries keyed by a buffer head carry a zero page index */
		page->index = NILFS_BTREE_PTR_IS_BH(index) ?
			0 : (pgoff_t) index;
		/*
		 * pagecache_acct(1) is intentionally not called: it should
		 * not be called from kernel modules if CONFIG_SMP is
		 * defined.
		 */
	}
	spin_unlock_irqrestore(rtreelock, irqflags);
	radix_tree_64_preload_end();

	return err;
}

/*
 * nilfs_remove_from_page_64_tree - unregister a page from a node radix tree
 * @page: page to remove
 * @rtree: radix tree the page is registered in
 * @rtreelock: lock protecting @rtree; if NULL, no lock is taken here and
 *	the lock should have been acquired in advance by the caller
 *
 * The tree key is page->index.  When page->index is zero and no buffer
 * on the page has a block number, the key is instead
 * NILFS_BTREE_BH_TO_PTR() of the page's first buffer head.
 *
 * The entry is deleted, page->mapping is cleared, and the page reference
 * held for the radix tree is dropped.  Caller has to make sure the page
 * is locked and that nobody else uses it - or that usage is safe.
 */
static void
nilfs_remove_from_page_64_tree(struct page *page,
			       struct radix_tree_64_root *rtree,
			       spinlock_t *rtreelock)
{
	__u64 index;

	index = page->index;
	if (!index) {
		struct buffer_head *bh, *head;

		head = bh = page_buffers(page);
		do {
			/* a buffer with a block number: key really is 0 */
			if (bh->b_blocknr)
				goto out;
			bh = bh->b_this_page;
		} while (bh != head);
		index = NILFS_BTREE_BH_TO_PTR(page_buffers(page));
	}
 out:
	if (rtreelock)
		spin_lock(rtreelock);
	radix_tree_64_delete(rtree, index);
	/*
	 * Detach the page before dropping the tree's reference, so the
	 * page is never written to after that reference is released.
	 */
	page->mapping = NULL;
	page_cache_release(page); /* for radix_tree_64 */

	/*
	 * Note: pagecache_acct should not be called from kernel modules,
	 * if CONFIG_SMP is defined.
	 */
	/* pagecache_acct(-1); */

	if (rtreelock)
		spin_unlock(rtreelock);
}

/*
 * Look up @index in @rtree with @rtreelock held for the duration of the
 * lookup.  Returns whatever radix_tree_64_lookup() returns; no page
 * reference is taken by this function.
 */
static struct page *
nilfs_get_from_page_64_tree(struct radix_tree_64_root *rtree,
			    spinlock_t *rtreelock, __u64 index)
{
	struct page *found;

	spin_lock(rtreelock);
	found = radix_tree_64_lookup(rtree, index);
	spin_unlock(rtreelock);

	return found;
}

/*
 * nilfs_mark_node_blk_dirty_bit - mark a node buffer dirty and tag its page
 * @bh: buffer head of the node block
 * @rtree: radix tree in which the buffer's page is registered
 * @rtreelock: lock protecting @rtree
 * @tag: PAGECACHE64_TAG_DIRTY or PAGECACHE64_TAG_PREPARE_DIRTY
 *
 * Sets the buffer dirty bit (plus the prepare-dirty bit when @tag is
 * PAGECACHE64_TAG_PREPARE_DIRTY), marks the page dirty and sets @tag on
 * the page's entry in @rtree.  Any other @tag value is rejected with a
 * warning and nothing is changed.
 *
 * NOTE(review): a buffer that is already dirty returns early, so a
 * PREPARE_DIRTY request on such a buffer sets neither the prepare-dirty
 * bit nor the tag -- confirm that this is intended.
 */
static void
nilfs_mark_node_blk_dirty_bit(struct buffer_head *bh,
			      struct radix_tree_64_root *rtree,
			      spinlock_t *rtreelock, int tag)
{
	struct page *page;
	__u64 index;
	unsigned long flags;
	
	/* only the two dirty tags are accepted */
	if ((tag != PAGECACHE64_TAG_DIRTY) &&
	    (tag != PAGECACHE64_TAG_PREPARE_DIRTY)) {
		page_warn("incorrect tag %d.\n", tag);
		return;
	}
	if (buffer_dirty(bh)){
		page_debug(2, "bh %p already dirty\n", bh);
		goto out;
	}
	/* lock_buffer(bh); */
	set_buffer_dirty(bh);
	/* unlock_buffer(bh); */
	if (tag == PAGECACHE64_TAG_PREPARE_DIRTY) {
		if (buffer_prepare_dirty(bh)) {
			page_debug(2, "bh %p already pdirty\n", bh);
			goto out;
		}
		/* lock_buffer(bh); */
		set_buffer_prepare_dirty(bh);
		/* unlock_buffer(bh); */
	}
	page = bh->b_page;
	/* lock_page(page); */
	/* account the page as dirty only on the clean -> dirty transition */
	if (!TestSetPageDirty(page))
		inc_page_state(nr_dirty);
	/* unlock_page(page); */
	spin_lock_irqsave(rtreelock, flags);
	index = (__u64) page->index;
	if (!index) {
		/* check that page do not have block on disk */
		if (!nilfs_node_page_new(page)) {
			page_warn("*** CAUTION *** "
				  "blocknr non-zero for new node, bh=%p.\n",
				  bh);
			goto out_unlock_rtree;
		}
		/* zero page index: the tree key is derived from the bh */
		index = NILFS_BTREE_BH_TO_PTR(bh);
	}
	radix_tree_64_tag_set(rtree, index, tag);
	
	page_debug(2, "tag %d set\n", tag);
	
 out_unlock_rtree:
	spin_unlock_irqrestore(rtreelock, flags);
 out:
	return;
}

/*
 * Mark @bh dirty and set PAGECACHE64_TAG_DIRTY on its page's entry in
 * @rtree; thin wrapper around nilfs_mark_node_blk_dirty_bit().
 */
static inline void
nilfs_mark_node_blk_dirty(struct buffer_head *bh,
			  struct radix_tree_64_root *rtree,
			  spinlock_t *rtreelock)
{
	const int tag = PAGECACHE64_TAG_DIRTY;

	nilfs_mark_node_blk_dirty_bit(bh, rtree, rtreelock, tag);
}

/*
 * Mark @bh dirty with the PAGECACHE64_TAG_PREPARE_DIRTY tag; thin
 * wrapper around nilfs_mark_node_blk_dirty_bit().
 */
static inline void
nilfs_mark_node_blk_prepare_dirty(struct buffer_head *bh,
				  struct radix_tree_64_root *rtree,
				  spinlock_t *rtreelock)
{
	const int tag = PAGECACHE64_TAG_PREPARE_DIRTY;

	nilfs_mark_node_blk_dirty_bit(bh, rtree, rtreelock, tag);
}

/**
 * nilfs_clear_node_blk_dirty - clear dirty status of nilfs_node_block
 *   @bh: buffer head of the block
 *
 *   Clears the dirty bit of @bh.  The remaining buffers on the page are
 *   then inspected: the page dirty flag is cleared when none of them is
 *   dirty or prepare-dirty, and each radix-tree tag (DIRTY,
 *   PREPARE_DIRTY) is cleared when no remaining buffer of that kind
 *   exists on the page.
 *
 *   Nothing is done if @bh is not dirty.  A prepare-dirty @bh is left
 *   untouched with a warning: this function do not handle prepare_dirty
 *   well, so this cannot be used in segment construction status.
 *
 *   The radix tree lock is not taken here (see the commented-out
 *   rtreelock lines).
 */
static void
nilfs_clear_node_blk_dirty(struct buffer_head *bh)
{
	struct page *page = bh->b_page;
	struct buffer_head *b;
	int dirty = 0, pdirty = 0;
	struct radix_tree_64_root *rtree;
	/* spinlock_t *rtreelock; */

	if (!buffer_dirty(bh)) {
		page_debug(2, "return because clear bh %p\n", bh);
		return;
	}
	if (buffer_prepare_dirty(bh)) {
		page_warn("*** PREPARE DIRTY BH %p ***\n", bh);
		return;
	}
	page_debug(2, "clear buffer_dirty %p\n", bh);
	/* lock_buffer(bh); */
	clear_buffer_dirty(bh);
	/* unlock_buffer(bh); */

	/*
	 * Classify the other buffers on the page: pdirty if any is
	 * prepare-dirty, dirty if any is dirty without prepare-dirty.
	 */
	for (b = bh->b_this_page; b != bh; b = b->b_this_page) {
		if (buffer_prepare_dirty(b))
			pdirty = 1;
		else if (buffer_dirty(b))
			dirty = 1;
		if (dirty && pdirty)
			break;
	}

	/* both kinds remain: neither the page flag nor a tag may change */
	if (dirty && pdirty) {
		page_debug(2, "dirty or pdirty buffer remains\n");
		return;
	}
	if (!PageDirty(page)) {
		page_warn("non dirty page %p\n", page);
		return;
	}
	if (!dirty && !pdirty) {
		/* lock_page(page); */
		ClearPageDirty(page);
		/* unlock_page(page); */
		dec_page_state(nr_dirty);
		page_debug(2, "dirty on page %p cleared.\n", page);
	}

	BUG_ON(!page->mapping);	  /* node block should have mapping */
	if (buffer_nilfs_bbt_node(bh)) {
		struct nilfs_inode_info *ii = NILFS_I(page->mapping->host);
		rtree = &ii->i_block_ntree;
		/* rtreelock = &ii->i_block_ntree_lock; */
	} else if (buffer_nilfs_ibt_node(bh)) {
		struct nilfs_sb_info *sbi = NILFS_AS_SB(page->mapping);
		rtree = &sbi->s_inode_ntree;
		/* rtreelock = &sbi->s_inode_ntree_lock; */
	} else {
		page_warn("invalid page %p bh %p\n", page, bh);
		return;
	}

	/*
	 * A tag may only be dropped when no buffer of that kind remains on
	 * the page; a tag whose buffers still exist must be kept.
	 */
	if (!dirty) {
		page_debug(2, "clear dirty tag (page %p)\n", page);
		radix_tree_64_tag_clear(rtree, nilfs_node_page_index(page),
					PAGECACHE64_TAG_DIRTY);
	}
	if (!pdirty) {
		page_debug(2, "clear pdirty tag (page %p)\n", page);
		radix_tree_64_tag_clear(rtree, nilfs_node_page_index(page),
					PAGECACHE64_TAG_PREPARE_DIRTY);
	}
}

/*
 * Return the partial-node flag of the first buffer head on @page.
 */
static inline int
nilfs_partial_node_page(struct page *page)
{
	struct buffer_head *head = page_buffers(page);

	return buffer_nilfs_partial_node(head);
}

/*
 * Unlink @page from the list it is on via page->lru and clear the
 * partial-node flag on its first buffer head.
 * The partial-list lock should be acquired beforehand by the caller.
 */
static void
nilfs_clear_partial_node_page(struct page *page)
{
	struct buffer_head *head;

	list_del_init(&page->lru);

	head = page_buffers(page);
	clear_buffer_nilfs_partial_node(head);
}

/* acquire partial... lock internally */
static void
nilfs_set_partial_node_page(struct page *page)
{
	struct list_head *list;
	struct semaphore *sem;
	struct buffer_head *bh = page_buffers(page);
	
	if (buffer_nilfs_partial_node(bh)) {
		page_debug(2, "already partial page %p bh %p\n", page, bh);
		return;
	}
	/* get the partial list and sem */
	if (buffer_nilfs_bbt_node(bh)) {
		list = &NILFS_I(page->mapping->host)->i_partial_node_pages;
		sem = &NILFS_I(page->mapping->host)->i_node_pages_sem;
	} else if (buffer_nilfs_ibt_node(bh)) {
		list = &NILFS_AS_SB(page->mapping)->s_partial_node_pages;
		sem = &NILFS_AS_SB(page->mapping)->s_node_pages_sem;
	} else {
		page_warn("bad page %p bh %p\n", page, bh);
		return;
	}
	/* add page->lru to list */
	down(sem);
	set_buffer_nilfs_partial_node(bh);
	list_add_tail(&page->lru, list);
	up(sem);
}

/*
 * nilfs_clear_inactive_node_page - take a page off the inactive list
 * @page: node page; must have buffers attached (PagePrivate)
 *
 * Unlinks @page via page->lru, decrements the super block's inactive
 * node page counter and clears the inactive flag on the page's first
 * buffer head.  Warns and does nothing if the page is neither a bbt nor
 * an ibt node page.
 *
 * The inactive list lock should be acquired beforehand by the caller.
 */
void
nilfs_clear_inactive_node_page(struct page *page)
{
	struct buffer_head *bh;
	int *listcnt;

	BUG_ON(!PagePrivate(page));
	bh = page_buffers(page);
	if (buffer_nilfs_bbt_node(bh)) {
		listcnt = &NILFS_SB(page->mapping->host->i_sb)->
			s_inactive_node_pages_cnt;
	} else if (buffer_nilfs_ibt_node(bh)) {
		listcnt = &NILFS_AS_SB(page->mapping)->
			s_inactive_node_pages_cnt;
	} else {
		/* message terminated with a real newline (was "=n") */
		page_warn("bad page %p bh %p\n", page, bh);
		return;
	}
	list_del_init(&page->lru);
	(*listcnt)--;
	clear_buffer_nilfs_inactive_node(bh);
}

/* list lock should be acquired beforehand */
static void
nilfs_set_inactive_node_page(struct page *page)
{
	struct list_head *list;
	int *listcnt;
	struct buffer_head *bh = page_buffers(page);
	
	/* get the inactive list and sem on sbinfo */
	if (buffer_nilfs_bbt_node(bh)) {
		list = &NILFS_SB(page->mapping->host->i_sb)->
			s_inactive_node_pages;
		listcnt = &NILFS_SB(page->mapping->host->i_sb)->
			s_inactive_node_pages_cnt;
	} else if (buffer_nilfs_ibt_node(bh)) {
		list = &NILFS_AS_SB(page->mapping)->s_inactive_node_pages;
		listcnt = &NILFS_AS_SB(page->mapping)->
			s_inactive_node_pages_cnt;
	} else {
		page_warn("bad page %p bh %p\n", page, bh);
		return;
	}
	/* add page->lru to list */
	if (buffer_nilfs_inactive_node(bh)) {
		page_debug(2, "already inactive page %p bh %p\n", page, bh);
		page_debug(2, "page = %p, bh = %p, bh->b_count = %d\n",
			   page, bh, atomic_read(&bh->b_count));
		nilfs_dump_stack(NILFS_VERBOSE_PAGE, 2);
		goto out;
	}
	set_buffer_nilfs_inactive_node(bh);
	list_add_tail(&page->lru, list);
	(*listcnt)++;
 out:
	return;
}

/*
 * Return 1 if no buffer on the page other than @bh is active, else 0.
 *
 * "Active" means a non-zero reference count when @bh is flagged as a new
 * node, and a non-zero block number otherwise.
 */
static int
nilfs_empty_node_page(struct buffer_head *bh)
{
	const int is_new = buffer_nilfs_new_node(bh);
	struct buffer_head *cur;

	for (cur = bh->b_this_page; cur != bh; cur = cur->b_this_page) {
		if (is_new) {
			if (atomic_read(&cur->b_count))
				return 0;
		} else {
			if (cur->b_blocknr)
				return 0;
		}
	}
	return 1;
}

/*
 * Return 1 if every buffer on the page other than @bh has a zero
 * reference count, else 0.
 */
static int
nilfs_unused_node_page(struct buffer_head *bh)
{
	struct buffer_head *cur = bh->b_this_page;

	while (cur != bh) {
		if (atomic_read(&cur->b_count))
			return 0;
		cur = cur->b_this_page;
	}
	return 1;
}

/*
 * Return 1 if no buffer on the page other than @bh is inactive, else 0.
 *
 * "Inactive" means a zero reference count when @bh is flagged as a new
 * node, and a zero block number otherwise.
 */
static int
nilfs_full_node_page(struct buffer_head *bh)
{
	struct buffer_head *cur;
	int is_new;

	page_debug(2, "bh=%p page=%p\n", bh, bh->b_page);

	is_new = buffer_nilfs_new_node(bh);
	for (cur = bh->b_this_page; cur != bh; cur = cur->b_this_page) {
		if (is_new) {
			if (!atomic_read(&cur->b_count))
				return 0;
		} else {
			if (!cur->b_blocknr)
				return 0;
		}
	}
	return 1;
}

/*
 * Search the buffers that share a page with *@bh (excluding *@bh itself)
 * for one whose reference count is zero.  On success *@bh is replaced by
 * that buffer and 1 is returned; otherwise *@bh is unchanged and 0 is
 * returned.  No reference is taken here.
 */
static inline int
nilfs_get_unused_bh(struct buffer_head **bh)
{
	struct buffer_head *const start = *bh;
	struct buffer_head *cand = start->b_this_page;

	while (cand != start) {
		if (atomic_read(&cand->b_count) == 0) {
			page_debug(2, "found bh %p\n", cand);
			*bh = cand;
			return 1;
		}
		cand = cand->b_this_page;
	}
	return 0;
}

/*
 * nilfs_get_node_blk() and nilfs_get_new_node_blk()
 * Note:
 *   If the appropriate buffer/page already exists, references to the
 *   page and the buffer head are taken; for a newly created buffer/page
 *   they are set up by nilfs_getblkbh().
 */
/*
 * nilfs_get_node_blk - get the buffer head of an on-disk node block
 * @rtree: radix tree of node pages to search / insert into
 * @rtreelock: lock protecting @rtree
 * @mapping: address space stored in page->mapping for a new page
 * @bdev: block device passed to nilfs_getblkbh()
 * @blksize: block size passed to nilfs_getblkbh()
 * @blkbits: log2 of the block size, used to derive the page index
 * @block: disk block number of the node
 * @listlock: lock of the inactive node page list
 *
 * Looks the page up in @rtree (removing it from the inactive list if it
 * is there); if absent, a page is obtained from nilfs_getblkbh(), its
 * buffers are flagged as node buffers and the page is inserted into the
 * tree.  A buffer that is not uptodate is read with nilfs_bread_slow().
 *
 * Returns the buffer head with an extra b_count reference, or NULL on
 * allocation, insertion, lookup or read failure.
 */
static struct buffer_head *
nilfs_get_node_blk(struct radix_tree_64_root *rtree, spinlock_t *rtreelock,
		   struct address_space *mapping, struct block_device *bdev,
		   unsigned long blksize, unsigned int blkbits, dbn_t block,
		   spinlock_t *listlock)
{
	struct buffer_head *bh, *head, *ret = NULL;
	struct page *page;
	__u64 index;
	unsigned long flags;
	
	page_debug(2, "0 block=%lu\n", block);
	/* tree key: block number scaled down to page granularity */
	index = (__u64) (block >> (PAGE_CACHE_SHIFT - blkbits));
 start:
	spin_lock_irqsave(listlock, flags);
	page = nilfs_get_from_page_64_tree(rtree, rtreelock, index);
	if ((page) && (nilfs_inactive_node_page(page)))
		nilfs_clear_inactive_node_page(page);
	/* XXX should get any refcount before unlock */
	spin_unlock_irqrestore(listlock, flags);

	if (!page) {
		int error;
		
		/* get new page and buffer with the block number */
		bh = nilfs_getblkbh(bdev,
				    (dbn_t)index <<
				    (PAGE_CACHE_SHIFT - blkbits),
				    blksize);
		if (bh == NULL) {
			page_warn("nilfs_getblkbh returns NULL.\n");
			goto next;
		}
		/*
		 * set buffers' state in the page
		 * to show used for node blocks
		 */
		head = bh;
		do {
			set_buffer_nilfs_node(bh);
			bh = bh->b_this_page;
		} while (bh != head);
		
		/* the loop above leaves bh == head */
		page = bh->b_page;
		/* register the page to the radix tree */
		error = nilfs_add_to_page_64_tree(page, rtree, rtreelock,
						  index, mapping, GFP_NOFS);
		if (error == -EEXIST) {
			/* another process inserted */
			nilfs_putblkbh(bh);
			goto start;
		}
		if (error) {
			page_warn("nilfs_add_to_page_64_tree failed.\n");
			nilfs_putblkbh(bh);
			goto next;
		}
		/*
		 * NOTE(review): the buffer returned on this path is the one
		 * nilfs_getblkbh() returned, without comparing b_blocknr to
		 * @block -- confirm for blocksize < pagesize.
		 */
		get_bh(bh);
		ret = bh;
		page_debug(2, "set bh returned from nilfs_getblkbh.\n");
		goto next;
	}

	/* page found */
	/*
	 * only pages which are associated with buffer head can be added to
	 * the tree, and nilfs_getblkbh() prepare page and buffer head, so the
	 * page should have buffer head here.
	 */
	if (!page_has_buffers(page)) {
		page_warn("no buffers on page.\n");
		goto out_unlock;
	}
	/* scan the page's buffers for the requested block number */
	bh = head = page_buffers(page);
	do {
		if (bh->b_blocknr == block) {
			/* found */
			get_bh(bh);
			ret = bh;
			goto out_unlock;
		}
		page_debug(2, "block %llu.\n", bh->b_blocknr);
		bh = bh->b_this_page;
	} while (bh != head);
	page_debug(2, "block not found.\n");
	
	/* both labels are plain join points; no lock is held here */
 out_unlock:
 next:
	if (ret == NULL) {
		page_debug(2, "RET null, block=%lu\n", block);
		return ret;
	}
	/* read if the buffer not uptodate */
	if (!buffer_uptodate(ret))
		ret = nilfs_bread_slow(ret);
	if (ret == NULL) {
		page_debug(2, "nilfs_bh_bread failed for block %lu\n", block);
		/* We don't have to release buffer_head (ret) here.
		 * For details, read comments on nilfs_bread_slow() */
		return NULL;
	}
	page_debug(2, "page->index=%lu page_count=%d b_count %d\n",
		   page->index, page_count(page), atomic_read(&ret->b_count));
	if (nilfs_node_page_new(ret->b_page))
		page_warn("*** WARNING *** new node blk.\n");
	return ret;
}


/*
 * nilfs_get_new_node_blk_partial - reuse a free buffer on a partial page
 * @partial_pages: list of partially used node pages (linked via page->lru)
 * @sem: semaphore protecting @partial_pages
 *
 * Walks the list under @sem looking for a page that has a buffer with a
 * zero reference count.  When one is found, the page is removed from the
 * partial list if nilfs_full_node_page() reports it full, and a
 * reference is taken on the buffer.
 *
 * Returns the referenced buffer head, or NULL if the list is empty or no
 * unused buffer was found.
 */
static struct buffer_head *
nilfs_get_new_node_blk_partial(struct list_head *partial_pages,
			       struct semaphore *sem)
{
	struct page *page, *next;
	struct buffer_head *bh = NULL;

	page_debug(2, "list_head %p, sem %p\n", partial_pages, sem);
	down(sem);
	if (list_empty(partial_pages))
		goto out_unlock;
	/* _safe variant: the current page may be unlinked inside the loop */
	list_for_each_entry_safe(page, next, partial_pages, lru) {
		bh = page_buffers(page);
		if (nilfs_get_unused_bh(&bh)) {
			/* arguments ordered to match "bh ... on page ..." */
			page_debug(2, "found bh %p on page %p.\n", bh, page);
			if (nilfs_full_node_page(bh)) {
				nilfs_clear_partial_node_page(page);
			}
			get_bh(bh);
			break;
		} else
			bh = NULL;
	}
 out_unlock:
	up(sem);
	return bh;
}

/*
 * Allocate a page of buffers for new (not yet on disk) node blocks.
 *
 * A page is obtained from nilfs_getblkbh(); every buffer on it gets
 * block number 0 and the new-node and node flags.  The page is then
 * registered in @rtree under the key NILFS_BTREE_BH_TO_PTR() of the
 * returned buffer head.
 *
 * Returns the buffer head from nilfs_getblkbh() with an extra reference,
 * or NULL if allocation or tree insertion failed (the buffer is released
 * with nilfs_putblkbh() in the latter case).  @blkbits is not used.
 */
static struct buffer_head *
nilfs_get_new_node_blk(struct radix_tree_64_root *rtree, spinlock_t *rtreelock,
		       struct address_space *mapping,
		       struct block_device *bdev, unsigned long blksize,
		       unsigned int blkbits)
{
	struct buffer_head *first, *cur;
	struct page *page;
	__u64 index;
	int error;

	first = nilfs_getblkbh(bdev, 0, blksize);
	if (unlikely(first == NULL))
		return NULL;

	/* flag every buffer on the page as a new node block */
	cur = first;
	do {
		cur->b_blocknr = 0;
		set_buffer_nilfs_new_node(cur);
		set_buffer_nilfs_node(cur);
		cur = cur->b_this_page;
	} while (cur != first);

	/* the page is keyed by its buffer head in the radix tree */
	page = first->b_page;
	index = NILFS_BTREE_BH_TO_PTR(first);
	error = nilfs_add_to_page_64_tree(page, rtree, rtreelock, index,
					  mapping, GFP_NOFS);
	if (unlikely(error)) {
		nilfs_putblkbh(first);
		return NULL;
	}

	page_debug(2, "page->index=%lu\n", page->index);

	/* the buffer information should be set up in nilfs_getblkbh */
	if (unlikely(!nilfs_node_page_new(first->b_page)))
		page_warn("*** WARNING *** not new node blk.\n");
	get_bh(first);

	return first;
}

static void
nilfs_put_node_blk(struct buffer_head *bh)
{
	spinlock_t *listlock;
	unsigned long flags;

	if (buffer_nilfs_new_node(bh)) {
		/*
		 * this case is caused only when disk number is determined.
		 * the B-Tree module free reference,
		 * but the segment construction routine already have the
		 * reference.
		 */
		page_debug(2, "for new node.\n");
	}

	if (buffer_nilfs_bbt_node(bh)) {
		listlock = &NILFS_SB(bh->b_page->mapping->host->i_sb)->
			s_inactive_node_pages_lock;
	} else if (buffer_nilfs_ibt_node(bh)) {
		listlock = &NILFS_AS_SB(bh->b_page->mapping)->
			s_inactive_node_pages_lock;
	} else {
		page_warn("bad page %p bh %p\n", bh->b_page, bh);
		return;
	}
	spin_lock_irqsave(listlock, flags);
	if (atomic_read(&bh->b_count) > 1) {
		page_debug(2, "b_count > 1, call put_bh.\n");
		put_bh(bh);
		goto out_unlock;
	}
	if (nilfs_unused_node_page(bh))
		nilfs_set_inactive_node_page(bh->b_page);
	put_bh(bh);
 out_unlock:
	spin_unlock_irqrestore(listlock, flags);
}

/*
 * nilfs_remove_node_blk_entry - detach a node page from all bookkeeping
 * @bh: a node buffer on the page to remove
 * @nolock: if non-zero, neither the inactive list lock nor the radix
 *	tree lock is taken here (presumably the caller already holds
 *	them -- TODO confirm against callers)
 *
 * With the page locked and writeback finished, clears the dirty state of
 * @bh, removes the page from the partial list (under the owner's
 * semaphore) and the inactive list if it is on them, and finally removes
 * it from the owner's radix tree.  The page itself is not freed here.
 */
static void
nilfs_remove_node_blk_entry(struct buffer_head *bh, int nolock)
{
	struct page *page = bh->b_page;
	struct radix_tree_64_root *rtree;
	spinlock_t *rtreelock = NULL;
	struct semaphore *psem = NULL;
	spinlock_t *listlock = NULL;
	unsigned long flags = 0;

	/* check if another holds the page */
	lock_page(page);
	wait_on_page_writeback(page);

	nilfs_clear_node_blk_dirty(bh);
	BUG_ON(!page->mapping);	  /* node block should have mapping */
	/* select tree, semaphore and locks of the page's owner */
	if (buffer_nilfs_bbt_node(bh)) {
		struct nilfs_inode_info *ii = NILFS_I(page->mapping->host);
		rtree = &ii->i_block_ntree;
		psem = &ii->i_node_pages_sem;
		if (!nolock) {
			listlock = &NILFS_SB(page->mapping->host->i_sb)->
				s_inactive_node_pages_lock;
			rtreelock = &ii->i_block_ntree_lock;
		}
	} else if (buffer_nilfs_ibt_node(bh)) {
		struct nilfs_sb_info *sbi = NILFS_AS_SB(page->mapping);
		rtree = &sbi->s_inode_ntree;
		psem = &sbi->s_node_pages_sem;
		if (!nolock) {
			listlock = &sbi->s_inactive_node_pages_lock;
			rtreelock = &sbi->s_inode_ntree_lock;
		}
	} else {
		page_warn("invalid page %p bh %p\n", page, bh);
		goto unlock;
	}
	page_debug(2, "removing page %p\n", page);
	/* the semaphore is taken before the spinlock below */
	if (nilfs_partial_node_page(page)) {
		down(psem);
		nilfs_clear_partial_node_page(page);
		up(psem);
	}
	if (!nolock) {
		spin_lock_irqsave(listlock, flags);
	}
	/*
	 * nilfs_clear_inactive_node_page should be called before
	 * nilfs_remove_from_page_64_tree, because the latter clears
	 * page->mapping.
	 */
	if (nilfs_inactive_node_page(page))
		nilfs_clear_inactive_node_page(page);
	/* rtreelock is NULL when nolock is set: no tree lock taken inside */
	nilfs_remove_from_page_64_tree(page, rtree, rtreelock);
	if (!nolock) {
		spin_unlock_irqrestore(listlock, flags);
	}
 unlock:
	unlock_page(page);
}

/*
 * Delete node buffer @bh together with its page: remove the page from
 * the lists and radix tree (taking the locks, nolock == 0), drop the
 * caller's buffer reference, then hand the buffer to nilfs_putblkbh().
 */
static inline void
nilfs_delete_node_blk_full(struct buffer_head *bh)
{
	const int nolock = 0;	/* let the callee take the locks */

	nilfs_remove_node_blk_entry(bh, nolock);
	put_bh(bh);
	nilfs_putblkbh(bh);
}

/*
 * Delete node buffer @bh while keeping its page, for use when other
 * buffers on the page are still active.
 *
 * The dirty state of @bh is cleared first.  For a new node the page is
 * put on the partial list unless it is already there; for an on-disk
 * node the block number is reset to 0 with the page locked and after
 * writeback has finished.  Finally the caller's reference is dropped.
 */
static void
nilfs_delete_node_blk_partial(struct buffer_head *bh)
{
	struct page *page = bh->b_page;

	page_debug(2, "bh=%p, page=%p\n", bh, page);
	nilfs_clear_node_blk_dirty(bh);

	if (!buffer_nilfs_new_node(bh)) {
		/* check if another (disk-io) hold the page */
		page_debug(2, "lock page %p for bh %p.\n", page, bh);
		lock_page(page);
		wait_on_page_writeback(page);
		bh->b_blocknr = 0;	/* for on-disk node block */
		unlock_page(page);
		page_debug(2, "unlock page %p for bh %p.\n", page, bh);
	} else if (!nilfs_partial_node_page(page)) {
		nilfs_set_partial_node_page(page);
	}

	put_bh(bh);
}

/*
 * Delete node buffer @bh.
 *
 * If no other buffer on the page is active the whole page is deleted,
 * otherwise only this buffer is.  A buffer whose reference count exceeds
 * one is reported; with KEEP_REFERED_NODE_BLK defined the reference is
 * merely dropped and the buffer kept, otherwise deletion continues.
 */
static void
nilfs_delete_node_blk(struct buffer_head *bh)
{
	page_debug(2, "bh=%p, page=%p\n", bh, bh->b_page);
	page_debug(2, "page_count=%d, b_count=%d\n",
		   page_count(bh->b_page), atomic_read(&bh->b_count));

	if (atomic_read(&bh->b_count) > 1) {
		page_warn("bh %p refered by other module.\n", bh);
		page_warn("bh is for %s %s node block.\n",
			  buffer_nilfs_new_node(bh) ? "new" : "on-disk",
			  buffer_nilfs_bbt_node(bh) ? "file" :
			  (buffer_nilfs_ibt_node(bh) ? "inode" : "UNKNOWN"));
#ifdef KEEP_REFERED_NODE_BLK
		put_bh(bh);
		return;
#else
		page_warn("continue anyway.\n");
#endif
	}

	if (!nilfs_empty_node_page(bh)) {
		nilfs_delete_node_blk_partial(bh);
		return;
	}
	nilfs_delete_node_blk_full(bh);
}

/*
 * nilfs_delete_node_page - delete every active node buffer on a page
 * @page: node page with buffers attached
 *
 * Visits each buffer on the page once, starting with the one after the
 * first buffer head and finishing with the first buffer head itself.
 * Inactive buffers are skipped; active ones are passed to
 * nilfs_delete_node_blk().
 *
 * NOTE(review): nilfs_delete_node_blk() can reach nilfs_putblkbh() via
 * nilfs_delete_node_blk_full(); confirm that the loop never touches
 * buffers of the page after that call.
 */
static void
nilfs_delete_node_page(struct page *page)
{
	struct buffer_head *b, *bh;
	
	bh = page_buffers(page);
	b = bh;
	do {
		b = b->b_this_page;
		/* b-tree do not keep ref for on-disk node, so get it now. */
		/* the new-node flag is tested on the first buffer (bh) */
		if (!buffer_nilfs_new_node(bh)) {
			/* on-disk page: block number 0 marks an unused slot */
			if (b->b_blocknr == 0)
				continue;
			get_bh(b);
		} else {
			/* new-node page: unreferenced buffers are unused */
			if (atomic_read(&b->b_count) < 1)
				continue;
		}
#if 0	/* for debug */
		/*
		 * force ref to normal code, for debugging use only.
		 */
		if (atomic_read(&b->b_count) > 1) {
			page_warn("*** BUFFER COUNT WRONG *** "
				  "(force to right value) "
				  "bh %p, page %p b_count %d\n",
				  b, page, atomic_read(&b->b_count));
			do {
				put_bh(b);
			} while (atomic_read(&b->b_count) > 1);
		}
#endif
		if (buffer_nilfs_bbt_node(b) || buffer_nilfs_ibt_node(b))
			nilfs_delete_node_blk(b);
		else {
			page_warn("*** BUFFER WRONG *** "
				  "bh %p page %p\n", b, page);
			put_bh(b);
		}
	} while (b != bh);
}

/*
 * Delete every node page registered in @rtree.
 *
 * The tree is walked in batches of NILFS_PAGE_GANG_LOOKUP_SIZE pages.
 * For each batch, with @listlock (irqs off) and @rtreelock held for the
 * lookup, pages found on the inactive list are taken off it.  After the
 * locks are dropped each page is locked, waited on for writeback,
 * unlocked and then deleted with nilfs_delete_node_page().  The walk
 * resumes at the index following the last page of the batch and stops
 * when a lookup returns nothing.
 */
static void
nilfs_delete_all_node_blk(struct radix_tree_64_root *rtree,
		       spinlock_t *rtreelock, spinlock_t *listlock)
{
	struct page *batch[NILFS_PAGE_GANG_LOOKUP_SIZE];
	unsigned long irqflags;
	__u64 next_index = 0;
	int found = 0, k = 0;

	for (;;) {
		spin_lock_irqsave(listlock, irqflags);
		spin_lock(rtreelock);
		found = radix_tree_64_gang_lookup(rtree, (void **)batch,
						  next_index,
						  NILFS_PAGE_GANG_LOOKUP_SIZE);
		spin_unlock(rtreelock);
		for (k = 0; k < found; k++) {
			BUG_ON(!PagePrivate(batch[k]));
			if (nilfs_inactive_node_page(batch[k])) {
				nilfs_clear_inactive_node_page(batch[k]);
			}
		}
		spin_unlock_irqrestore(listlock, irqflags);

		if (found <= 0)
			break;
		next_index = nilfs_node_page_index(batch[found - 1]) + 1;

		for (k = 0; k < found; k++) {
			struct page *page = batch[k];

			/* block until any pending I/O on the page is done */
			lock_page(page);
			wait_on_page_writeback(page);
			unlock_page(page);
			nilfs_delete_node_page(page);
		}
	}
}

/*
 * nilfs_clean_node_page - forcibly delete all active buffers on a page
 * @page: node page; a page without buffers is reported and ignored
 *
 * For every active buffer the reference count is forced to exactly one
 * and the buffer is then deleted through the file or inode node delete
 * routine, with a warning for each deletion.
 *
 * NOTE(review): as the delete routines may release the page's buffers,
 * confirm that the loop does not touch them afterwards.
 */
static void
nilfs_clean_node_page(struct page *page)
{
	struct buffer_head *b, *bh;
	
	if (!PagePrivate(page)) {
		page_warn("no private %p\n", page);
		return;
	}
	bh = page_buffers(page);
	b = bh;
	do {
		b = b->b_this_page;
		/* b-tree do not keep ref for on-disk node, so get it now. */
		/* the new-node flag is tested on the first buffer (bh) */
		if (!buffer_nilfs_new_node(bh)) {
			if (b->b_blocknr == 0)
				continue;
		} else {
			if (atomic_read(&b->b_count) == 0)
				continue;
		}
		/* normalize the reference count to exactly one */
		while (atomic_read(&b->b_count) < 1)
			get_bh(b);
		while (atomic_read(&b->b_count) > 1)
			put_bh(b);
		if (buffer_nilfs_bbt_node(b)) {
			page_warn("deleting nilfs file node block. "
				  "bh %p page %p\n", b, page);
			nilfs_delete_file_node_blk(b);
		} else if (buffer_nilfs_ibt_node(b)) {
			page_warn("deleting nilfs inode node block. "
				  "bh %p page %p\n", b, page);
			nilfs_delete_inode_node_blk(b);
		} else {
			page_warn("*** BUFFER WRONG *** "
				  "bh %p page %p\n", b, page);
			put_bh(b);
		}
	} while (b != bh);
}


/*
 * nilfs_clean_all_node_blk - forcibly clean every node page in a tree
 * @rtree: radix tree of node pages
 * @rtreelock: lock protecting @rtree
 * @listlock: lock of the inactive node page list
 *
 * Walks @rtree in batches of NILFS_PAGE_GANG_LOOKUP_SIZE pages.  Under
 * @listlock and @rtreelock the batch is looked up and its pages are
 * taken off the inactive list; after the locks are released each page
 * is passed to nilfs_clean_node_page().  The walk continues from the
 * index following the last page found and ends when nothing is found.
 */
static void
nilfs_clean_all_node_blk(struct radix_tree_64_root *rtree,
		      spinlock_t *rtreelock, spinlock_t *listlock)
{
	struct page *pages[NILFS_PAGE_GANG_LOOKUP_SIZE];
	int i = 0, n = 0;
	/*
	 * The tree is keyed by 64-bit indices, so the cursor must be
	 * 64 bits wide as well; a narrower type would truncate the
	 * restart position.
	 */
	__u64 index = 0;
	unsigned long flags;

	for(;;) {
		spin_lock_irqsave(listlock, flags);
		spin_lock(rtreelock);
		n = radix_tree_64_gang_lookup(rtree, (void **)pages, index,
					      NILFS_PAGE_GANG_LOOKUP_SIZE);
		spin_unlock(rtreelock);
		for (i = 0; i < n; i++) {
			if (nilfs_inactive_node_page(pages[i])) {
				nilfs_clear_inactive_node_page(pages[i]);
			}
		}
		spin_unlock_irqrestore(listlock, flags);
		if (n > 0) {
			index = nilfs_node_page_index(pages[n - 1]) + 1;
		} else {
			break;
		}
		for (i = 0; i < n; i++) {
			nilfs_clean_node_page(pages[i]);
		}
	}
}

/*
 * nilfs_release_node_page()
 *  Called from nilfs_releasepage().
 *  This function should never be called, because PageLRU(page) should
 *  be 0 for nilfs_node_page pages; it therefore logs its arguments and
 *  hits BUG().  It must fail, so the (unreachable) result is 0.
 */
int nilfs_release_node_page(struct page *page, gfp_t gfp_mask)
{
	page_warn("page %p, mask %d\n", page, gfp_mask);
	BUG();

	return 0;	/* must fail */
}

/*
 * nilfs_invalidate_node_page()
 *  Like nilfs_release_node_page(), this is not expected to be called
 *  for node pages: it logs its arguments, hits BUG() and has 0 as its
 *  (unreachable) result.
 */
int nilfs_invalidate_node_page(struct page *page, unsigned long offset)
{
	page_warn("page %p, offset %lu\n", page, offset);
	BUG();

	return 0;
}

/**
 * nilfs_put_allocated_page - a nonblocking page deallocator for ISRs
 * @page: page with buffers attached
 * @sb: super block whose inactive node page list receives the page
 *
 * nilfs_put_allocated_page() delays actual deallocation by giving
 * over to foreground processes: it links the page to the
 * inactive_node_pages list, from which a shrink function is expected to
 * return it eventually.
 *
 * Nothing is done if any buffer on the page is referenced, dirty or
 * locked.
 */
void
nilfs_put_allocated_page(struct page *page, struct super_block *sb)
{
	struct list_head *inactive_list = &NILFS_SB(sb)->s_inactive_node_pages;
	spinlock_t *listlock = &NILFS_SB(sb)->s_inactive_node_pages_lock;
	int *inactive_cnt = &NILFS_SB(sb)->s_inactive_node_pages_cnt;
	const unsigned long busy_bits = (1 << BH_Dirty) | (1 << BH_Lock);
	struct buffer_head *head, *cur;
	unsigned long irqflags;

	/*
	 * We can safely look into other buffers because the page
	 * is locked or used in private by the caller.
	 */
	head = page_buffers(page);
	cur = head;
	do {
		if (atomic_read(&cur->b_count) != 0)
			return;
		if ((cur->b_state & busy_bits) != 0)
			return;
		cur = cur->b_this_page;
	} while (cur != head);

	spin_lock_irqsave(listlock, irqflags);
	set_buffer_nilfs_inactive_node(head);
	list_add_tail(&page->lru, inactive_list);
	(*inactive_cnt)++;
	spin_unlock_irqrestore(listlock, irqflags);
}

/*
 * nilfs_remove_node_page_from_tree()
 * static function of nilfs_{shrink,clean}_inactive_node_pages()
 * caller should hold inactive_node_pages_lock,
 * and examine if the page is for nilfs_node page.
 */
static void
nilfs_remove_node_page_from_tree(struct page *page)
{
	spinlock_t *rtreelock = NULL;
	struct radix_tree_64_root *rtree;
	struct buffer_head *bh;

	bh = page_buffers(page);
	if (buffer_nilfs_bbt_node(bh)) {
		rtree = &NILFS_I(page->mapping->host)->i_block_ntree;
		rtreelock = &NILFS_I(page->mapping->host)->i_block_ntree_lock;
	} else if (buffer_nilfs_ibt_node(bh)) {
		rtree = &NILFS_AS_SB(page->mapping)->s_inode_ntree;
		rtreelock = &NILFS_AS_SB(page->mapping)->s_inode_ntree_lock;
	} else {
		page_warn("invalid page %p bh %p\n", page, bh);
		return;
	}
	nilfs_remove_from_page_64_tree(page, rtree, rtreelock);
}

/*
 * nilfs_shrink_inactive_node_pages() - free pages from the inactive list
 * @sb: super block whose inactive node page list is scanned
 * @nr: maximum number of list entries to scan; 0 only queries the count
 *
 * Scans up to @nr pages of the inactive list under its lock.  Pages that
 * are locked, dirty, under writeback, without buffers, not nilfs pages,
 * or that have a referenced, dirty or locked buffer are left on the
 * list.  The others are moved to a private list (and removed from their
 * radix tree if they are node pages) and released with nilfs_putblkbh()
 * after the lock has been dropped.
 *
 * Returns the number of pages remaining on the inactive list, or 0 when
 * built with NILFS_PAGE_DISABLE_SHRINK.
 */
int
nilfs_shrink_inactive_node_pages(struct super_block *sb, int nr)
{
#ifdef NILFS_PAGE_DISABLE_SHRINK
	return 0;
#else
	struct page *page, *next;
	struct buffer_head *head, *bh;
	struct list_head *inactive_pages;
	LIST_HEAD(shrink_pages);
	spinlock_t *listlock;
	int *listcnt;
	int num = 0;
	unsigned long flags;
	
	shrink_debug(3, "called for %s(%p)\n", sb->s_id, sb);
	listcnt = &NILFS_SB(sb)->s_inactive_node_pages_cnt;
	/* if nr=0, report the number of unused objects */
	if (nr == 0) {
		shrink_debug(2, "report %d for %s(%p)\n",
			     *listcnt, sb->s_id, sb);
		return *listcnt;
	}
	inactive_pages = &NILFS_SB(sb)->s_inactive_node_pages;
	listlock = &NILFS_SB(sb)->s_inactive_node_pages_lock;

	spin_lock_irqsave(listlock, flags);
	if (*listcnt == 0) {
		spin_unlock_irqrestore(listlock, flags);
		shrink_debug(2, "empty list\n");
		goto out;
	}
	if (list_empty(inactive_pages)) {
		page_warn("list empty while cnt=%d\n", *listcnt);
		spin_unlock_irqrestore(listlock, flags);
		goto out;
	}
	list_for_each_entry_safe(page, next, inactive_pages, lru) {
		if (nr == 0) {
			/* nr pages scaned */
			break;
		}
		/* every visited entry counts as scanned, even if skipped */
		nr--;
		num++;
		/* trylock: a page locked by someone else stays on the list */
		if (TestSetPageLocked(page)) {	/* lock */
			shrink_debug(3, "keep locked page %p\n", page);
			continue;
		}
		if (PageDirty(page) || PageWriteback(page)) {
			shrink_debug(3, "keep page %p (%d%d)\n",
				     page, PageDirty(page),
				     PageWriteback(page));
			goto skip_this_page;
		}
		if (!page_has_buffers(page)) {
			page_warn("*** page %p has no buffers ***\n", page);
			goto skip_this_page;
		}
		head = bh = page_buffers(page);
		if (!buffer_nilfs_allocated(head) &&
		    !nilfs_page_for_node(page)) {
			page_warn("*** not nilfs page %p ***\n", page);
			goto skip_this_page;
		}
		/* any referenced or busy buffer keeps the whole page */
		do {
			if (atomic_read(&bh->b_count)) {
				page_warn("ref page %p in inactive list\n",
					  page);
				/* do not return ref page to inactive list */
				goto skip_this_page;
			}
			if (buffer_dirty(bh) || buffer_locked(bh)) {
				page_warn("busy page %p in inactive list\n",
					  page);
				goto skip_this_page;
			}
			bh = bh->b_this_page;
		} while (bh != head);
		/* move the page to the private list of pages to free */
		list_del_init(&page->lru);
		(*listcnt)--;
		clear_buffer_nilfs_inactive_node(page_buffers(page));
		list_add_tail(&page->lru, &shrink_pages);
		if (nilfs_page_for_node(page))
			nilfs_remove_node_page_from_tree(page);
	skip_this_page:
		unlock_page(page);
		continue;
	}
	spin_unlock_irqrestore(listlock, flags);
	if (list_empty(&shrink_pages))
		goto out;
	/* release the collected pages outside the list lock */
	list_for_each_entry_safe(page, next, &shrink_pages, lru) {
		list_del_init(&page->lru);
		if (!page_has_buffers(page)) {
			page_warn("non-private page %p\n", page);
			/* someone may be freeing this */
			continue;
		}
		nilfs_putblkbh(page_buffers(page));
	}
 out:

	shrink_debug(2, "%d objects scanned on %s(%p), return %d for %d\n",
		     num, sb->s_id, sb, *listcnt, num + nr);

	return *listcnt;
#endif
}

/*
 * nilfs_clean_inactive_node_pages() - forcibly empty the inactive list
 * @sb: super block whose inactive node page list is emptied
 *
 * Moves every page of the inactive list to a private list under the
 * list lock.  For pages with buffers the inactive flag, the page dirty
 * and writeback flags and all buffer reference counts are cleared, and
 * node pages are removed from their radix tree.  After the lock is
 * dropped, pages that still have buffers are released with
 * nilfs_putblkbh().
 *
 * Always returns 0.
 */
int
nilfs_clean_inactive_node_pages(struct super_block *sb)
{
	struct page *page, *next;
	struct buffer_head *head, *bh;
	struct list_head *inactive_pages;
	LIST_HEAD(shrink_pages);
	spinlock_t *listlock;
	int *listcnt;
	int num = 0;
	unsigned long flags;
	
	page_debug(2, "sb=%p\n", sb);
	inactive_pages = &NILFS_SB(sb)->s_inactive_node_pages;
	listlock = &NILFS_SB(sb)->s_inactive_node_pages_lock;
	listcnt = &NILFS_SB(sb)->s_inactive_node_pages_cnt;
	
	spin_lock_irqsave(listlock, flags);
	if (*listcnt == 0) {
		spin_unlock_irqrestore(listlock, flags);
		page_debug(2, "empty list\n");
		goto out;
	}
	if (list_empty(inactive_pages)) {
		spin_unlock_irqrestore(listlock, flags);
		page_warn("list empty while cnt=%d\n", *listcnt);
		goto out;
	}
	list_for_each_entry_safe(page, next, inactive_pages, lru) {
		/* unconditionally move the page to the private list */
		list_del_init(&page->lru);
		(*listcnt)--;
		list_add_tail(&page->lru, &shrink_pages);
		if (!page_has_buffers(page)) {
			page_warn("*** page %p has no buffers ***\n", page);
			continue;
		}
		head = bh = page_buffers(page);
		clear_buffer_nilfs_inactive_node(head);
		/* clear page flags */
		if (PageDirty(page)) {
			page_debug(2, "clear dirty page %p\n", page);
			ClearPageDirty(page);
		}
		if (PageWriteback(page)) {
			page_debug(2, "clear writeback page %p\n",
				   page);
			ClearPageWriteback(page);
		}
		/* clear refcounts */
		do {
			while (atomic_read(&bh->b_count) > 0)
				put_bh(bh);
			bh = bh->b_this_page;
		} while (bh != head);
		if (nilfs_page_for_node(page))
			nilfs_remove_node_page_from_tree(page);
	}
	spin_unlock_irqrestore(listlock, flags);
	/* release the collected pages outside the list lock */
	list_for_each_entry_safe(page, next, &shrink_pages, lru) {
		list_del_init(&page->lru);
		if (!page_has_buffers(page)) {
			page_warn("non-private page %p\n", page);
			/* XXX the page should be freed (which fuction?) */
			num++;
			continue;
		}
		head = page_buffers(page);
		if (!buffer_nilfs_allocated(head) && !buffer_nilfs_node(head))
		{
			page_warn("not nilfs node page %p\n", page);
			nilfs_putblkbh(head); /* XXX should be changed */
			num++;
			continue;
		}
		nilfs_putblkbh(head);
		num++;
	}
 out:
	page_debug(2, "%d object cleaned.\n", num);
	return 0;
}


/*
 * file block B-Tree node
 */

/*
 * nilfs_get_file_node_blk - get a node block of a file's block B-Tree
 * @inode: inode owning the node tree (i_block_ntree) to look in
 * @block: block number handed to nilfs_get_node_blk()
 *
 * Calls nilfs_get_node_blk() with the inode's node tree, tree lock, mapping,
 * block device and block size, and marks the returned buffer as a bbt node
 * if it is not marked yet.
 *
 * Returns the buffer head from nilfs_get_node_blk(), or NULL if none was
 * obtained.
 */
struct buffer_head *
nilfs_get_file_node_blk(struct inode *inode, dbn_t block)
{
	struct buffer_head *bh;
	struct nilfs_inode_info *ii = NILFS_I(inode);

	bh = nilfs_get_node_blk(&ii->i_block_ntree,
				&ii->i_block_ntree_lock, inode->i_mapping,
				inode->i_sb->s_bdev, 1UL << inode->i_blkbits,
				inode->i_blkbits, block,
				&NILFS_SB(inode->i_sb)->
				s_inactive_node_pages_lock);
	/*
	 * The debug print below dereferences bh, so it must never be
	 * reached without a buffer.
	 */
	if (!bh)
		return NULL;

	/* the flag is set without taking the buffer lock */
	if (!buffer_nilfs_bbt_node(bh))
		set_buffer_nilfs_bbt_node(bh);

	page_debug(2, "page %p bh %p page_count %d b_count %d\n",
		   bh->b_page, bh, page_count(bh->b_page),
		   atomic_read(&bh->b_count));
	return bh;
}

struct buffer_head *
nilfs_get_new_file_node_blk(struct inode *inode)
{
	struct nilfs_inode_info *ii = NILFS_I(inode);
	struct list_head *partial_pages = &ii->i_partial_node_pages;
	struct semaphore *sem = &ii->i_node_pages_sem;
	struct buffer_head *bh;
	
	bh = nilfs_get_new_node_blk_partial(partial_pages, sem);
	if (bh) {
		page_debug(2, "got partial page %p bh %p\n",
			   bh->b_page, bh);
		goto out;
	}
	bh = nilfs_get_new_node_blk(&ii->i_block_ntree,
				    &ii->i_block_ntree_lock,
				    inode->i_mapping, inode->i_sb->s_bdev,
				    1UL << inode->i_blkbits, inode->i_blkbits);
	if (bh) {
		page_debug(2, "get new page %p bh %p.\n",
			   bh->b_page, bh);
	}
 out:
	if (bh) {
		set_buffer_nilfs_bbt_node(bh);
	}
	page_debug(2, "page %p bh %p page_count %d b_count %d\n",
		   bh->b_page, bh, page_count(bh->b_page),
		   atomic_read(&bh->b_count));
	return bh;
}

/*
 * nilfs_put_file_node_blk - file-node front end of nilfs_put_node_blk()
 * @bh: buffer head of the node block (dereferenced for the debug print)
 *
 * Emits a debug trace of the buffer and its page, then passes @bh on to
 * nilfs_put_node_blk().
 */
void nilfs_put_file_node_blk(struct buffer_head *bh)
{
	page_debug(2, "bh=%p, page=%p\n", bh, bh->b_page);

	nilfs_put_node_blk(bh);
}

/*
 * nilfs_delete_file_node_blk - file-node front end of nilfs_delete_node_blk()
 * @bh: buffer head of the node block (dereferenced for the debug print)
 *
 * Emits a debug trace of the buffer and its page, then passes @bh on to
 * nilfs_delete_node_blk().
 */
void nilfs_delete_file_node_blk(struct buffer_head *bh)
{
	page_debug(2, "bh=%p, page=%p\n", bh, bh->b_page);

	nilfs_delete_node_blk(bh);
}

void
nilfs_delete_all_file_node_blk(struct inode *inode)
{
	struct nilfs_inode_info *ii = NILFS_I(inode);
	nilfs_delete_all_node_blk(&ii->i_block_ntree, &ii->i_block_ntree_lock,
				  &NILFS_SB(inode->i_sb)->
				  s_inactive_node_pages_lock);
}

void
nilfs_clean_all_file_node_blk(struct inode *inode)
{
	struct nilfs_inode_info *ii = NILFS_I(inode);
	nilfs_clean_all_node_blk(&ii->i_block_ntree, &ii->i_block_ntree_lock,
				 &NILFS_SB(inode->i_sb)->
				 s_inactive_node_pages_lock);
}

/*
 * nilfs_mark_file_node_blk_dirty - mark a file B-Tree node block dirty
 * @bh: buffer head of the node block
 *
 * Does nothing but log when the buffer is already dirty.  Otherwise finds
 * the owning inode through the page's mapping and calls
 * nilfs_mark_node_blk_dirty() with that inode's node tree and tree lock.
 * A page without a mapping is treated as a bug.
 */
void nilfs_mark_file_node_blk_dirty(struct buffer_head *bh)
{
	struct address_space *as;
	struct nilfs_inode_info *info;

	if (!buffer_dirty(bh)) {
		as = bh->b_page->mapping;
		BUG_ON(!as);	/* a file node block always has a mapping */
		info = NILFS_I(as->host);
		nilfs_mark_node_blk_dirty(bh, &info->i_block_ntree,
					  &info->i_block_ntree_lock);
		return;
	}
	page_debug(2, "already dirty %p\n", bh);
}

/*
 * nilfs_mark_file_node_blk_prepare_dirty - mark a file B-Tree node block
 * "prepare dirty"
 * @bh: buffer head of the node block
 *
 * Does nothing but log when the buffer is already dirty.  Otherwise finds
 * the owning inode through the page's mapping and calls
 * nilfs_mark_node_blk_prepare_dirty() with that inode's node tree and tree
 * lock.  A page without a mapping is treated as a bug.
 */
void nilfs_mark_file_node_blk_prepare_dirty(struct buffer_head *bh)
{
	struct address_space *as;
	struct nilfs_inode_info *info;

	if (!buffer_dirty(bh)) {
		as = bh->b_page->mapping;
		BUG_ON(!as);	/* a file node block always has a mapping */
		info = NILFS_I(as->host);
		nilfs_mark_node_blk_prepare_dirty(bh, &info->i_block_ntree,
						  &info->i_block_ntree_lock);
		return;
	}
	page_debug(2, "already dirty %p\n", bh);
}

/*
 * inode block B-Tree node
 */

/*
 * nilfs_get_inode_node_blk - get a node block of the inode B-Tree
 * @sb: super block whose inode node tree (s_inode_ntree) is looked in
 * @block: block number handed to nilfs_get_node_blk()
 *
 * Calls nilfs_get_node_blk() with the super block's inode node tree, tree
 * lock, mapping, block device and block size, and marks the returned buffer
 * as an ibt node if it is not marked yet.
 *
 * Returns the buffer head from nilfs_get_node_blk(), or NULL if none was
 * obtained.
 */
struct buffer_head *
nilfs_get_inode_node_blk(struct super_block *sb, dbn_t block)
{
	struct buffer_head *bh;
	struct nilfs_sb_info *sbi = NILFS_SB(sb);

	bh = nilfs_get_node_blk(&sbi->s_inode_ntree,
				&sbi->s_inode_ntree_lock,
				sbi->s_mapping, sb->s_bdev, sb->s_blocksize,
				sb->s_blocksize_bits, block,
				&sbi->s_inactive_node_pages_lock);
	/*
	 * The debug print below dereferences bh, so it must never be
	 * reached without a buffer.
	 */
	if (!bh)
		return NULL;

	/* the flag is set without taking the buffer lock */
	if (!buffer_nilfs_ibt_node(bh))
		set_buffer_nilfs_ibt_node(bh);

	page_debug(2, "page %p bh %p page_count %d b_count %d\n",
		   bh->b_page, bh, page_count(bh->b_page),
		   atomic_read(&bh->b_count));
	return bh;
}

struct buffer_head *
nilfs_get_new_inode_node_blk(struct super_block *sb)
{
	struct nilfs_sb_info *sbi = NILFS_SB(sb);
	struct list_head *partial_pages = &sbi->s_partial_node_pages;
	struct semaphore *sem = &sbi->s_node_pages_sem;
	struct buffer_head *bh;
	
	bh = nilfs_get_new_node_blk_partial(partial_pages, sem);
	if (bh) {
		page_debug(2, "got partial page %p bh %p\n", bh->b_page, bh);
		goto out;
	}
	bh = nilfs_get_new_node_blk(&sbi->s_inode_ntree,
				    &sbi->s_inode_ntree_lock,
				    sbi->s_mapping, sb->s_bdev,
				    sb->s_blocksize, sb->s_blocksize_bits);
	if (bh) {
		page_debug(2, "get new page %p bh %p.\n", bh->b_page, bh);
	}
 out:
	if (bh) {
		set_buffer_nilfs_ibt_node(bh);
	}
	page_debug(2, "page %p bh %p page_count %d b_count %d\n",
		   bh->b_page, bh, page_count(bh->b_page),
		   atomic_read(&bh->b_count));
	return bh;
}

/*
 * nilfs_put_inode_node_blk - inode-node front end of nilfs_put_node_blk()
 * @bh: buffer head of the node block (dereferenced for the debug print)
 *
 * Emits a debug trace of the buffer and its page, then passes @bh on to
 * nilfs_put_node_blk().
 */
void nilfs_put_inode_node_blk(struct buffer_head *bh)
{
	page_debug(2, "bh=%p, page=%p\n", bh, bh->b_page);

	nilfs_put_node_blk(bh);
}

/*
 * nilfs_delete_inode_node_blk - inode-node front end of
 * nilfs_delete_node_blk()
 * @bh: buffer head of the node block (dereferenced for the debug print)
 *
 * Emits a debug trace of the buffer and its page, then passes @bh on to
 * nilfs_delete_node_blk().
 */
void nilfs_delete_inode_node_blk(struct buffer_head *bh)
{
	page_debug(2, "bh=%p, page=%p\n", bh, bh->b_page);

	nilfs_delete_node_blk(bh);
}

void
nilfs_delete_all_inode_node_blk(struct super_block *sb)
{
	struct nilfs_sb_info *sbi = NILFS_SB(sb);
	nilfs_delete_all_node_blk(&sbi->s_inode_ntree,
				  &sbi->s_inode_ntree_lock,
				  &sbi->s_inactive_node_pages_lock);
}

void
nilfs_clean_all_inode_node_blk(struct super_block *sb)
{
	struct nilfs_sb_info *sbi = NILFS_SB(sb);
	nilfs_clean_all_node_blk(&sbi->s_inode_ntree, &sbi->s_inode_ntree_lock,
				 &sbi->s_inactive_node_pages_lock);
}

/*
 * nilfs_mark_inode_node_blk_dirty - mark an inode B-Tree node block dirty
 * @bh: buffer head of the node block
 *
 * Does nothing but log when the buffer is already dirty.  Otherwise maps
 * the page's address space to its super block info with NILFS_AS_SB() and
 * calls nilfs_mark_node_blk_dirty() with the inode node tree and its lock.
 * A page without a mapping is treated as a bug.
 */
void nilfs_mark_inode_node_blk_dirty(struct buffer_head *bh)
{
	struct address_space *as;
	struct nilfs_sb_info *info;

	if (!buffer_dirty(bh)) {
		as = bh->b_page->mapping;
		BUG_ON(!as);	/* an inode node block always has a mapping */
		info = NILFS_AS_SB(as);
		nilfs_mark_node_blk_dirty(bh, &info->s_inode_ntree,
					  &info->s_inode_ntree_lock);
		return;
	}
	page_debug(2, "already dirty %p\n", bh);
}

/*
 * nilfs_mark_inode_node_blk_prepare_dirty - mark an inode B-Tree node block
 * "prepare dirty"
 * @bh: buffer head of the node block
 *
 * Does nothing but log when the buffer is already dirty.  Otherwise maps
 * the page's address space to its super block info with NILFS_AS_SB() and
 * calls nilfs_mark_node_blk_prepare_dirty() with the inode node tree and
 * its lock.  A page without a mapping is treated as a bug.
 */
void nilfs_mark_inode_node_blk_prepare_dirty(struct buffer_head *bh)
{
	struct address_space *as;
	struct nilfs_sb_info *info;

	if (!buffer_dirty(bh)) {
		as = bh->b_page->mapping;
		BUG_ON(!as);	/* an inode node block always has a mapping */
		info = NILFS_AS_SB(as);
		nilfs_mark_node_blk_prepare_dirty(bh, &info->s_inode_ntree,
						  &info->s_inode_ntree_lock);
		return;
	}
	page_debug(2, "already dirty %p\n", bh);
}

void nilfs_clear_node_page_dirty(struct page *page)
{
	struct radix_tree_64_root *rtree;
	spinlock_t *rtreelock;
	unsigned long flags;
		
	if (nilfs_page_for_bbt_node(page)) {
		struct nilfs_inode_info *ii =
			NILFS_I(page->mapping->host);
			
		rtree = &ii->i_block_ntree;
		rtreelock = &ii->i_block_ntree_lock;
	} else { /* nilfs_page_for_ibt_node(page) */
		struct nilfs_sb_info *sbi = NILFS_AS_SB(page->mapping);
			
		rtree = &sbi->s_inode_ntree;
		rtreelock = &sbi->s_inode_ntree_lock;
	}
		
	if (TestClearPageDirty(page)) {
		spin_lock_irqsave(rtreelock, flags);
		radix_tree_64_tag_clear(rtree,
					nilfs_node_page_index(page),
					PAGECACHE64_TAG_DIRTY);
		radix_tree_64_tag_clear(rtree,
					nilfs_node_page_index(page),
					PAGECACHE64_TAG_PREPARE_DIRTY);
		spin_unlock_irqrestore(rtreelock, flags);
		dec_page_state(nr_dirty);
	}
}

/*
 * nilfs_move_dirty_node_blk - move a dirty node block to a new block number
 * @pbh: in/out pointer to the buffer head of the node block.  The buffer
 *       head must already be obtained and referenced by the caller.  It is
 *       replaced by another buffer head only in the
 *       BTNODE_SMALLBLK_SUPPORT build.
 * @dbn: new block number for the node block
 * @blkbits: log2 of the block size
 *
 * The page index in the owning node tree is derived from @dbn.  When a page
 * holds exactly one block, the page is re-inserted into the radix tree
 * under the new index, tagged dirty, and the buffer's b_blocknr is updated.
 *
 * When a page holds more than one block, or the new index is already
 * occupied, the block has to be copied into the page at the new index.
 * That path exists only under BTNODE_SMALLBLK_SUPPORT (which is #undef'd at
 * the top of this file); without it the function calls BUG().
 *
 * Returns 0 on success (also when the page index does not change), -EINVAL
 * if the buffer is neither a bbt nor an ibt node, or the error returned by
 * radix_tree_64_insert().  With BTNODE_SMALLBLK_SUPPORT it may also return
 * -ENOMEM or the error from nilfs_add_to_page_64_tree().
 */
int nilfs_move_dirty_node_blk(struct buffer_head **pbh, dbn_t dbn,
			      unsigned int blkbits)
{
	struct page *page = (*pbh)->b_page;
	u64 new_index, old_index;
	int err = 0;
	/* log2 of blocks per page; 0 when the block size equals the page size */
	int block_per_page_bits = PAGE_SHIFT - blkbits;
	unsigned long flags;
	spinlock_t *listlock, *rtreelock;
	struct radix_tree_64_root *rtree;
	struct buffer_head *bh;
#ifdef BTNODE_SMALLBLK_SUPPORT
	struct buffer_head *head;
	struct page *new_page;
	int n;
#endif

	/*
	 * Select the node tree and its lock from the kind of node.
	 * NOTE(review): listlock is assigned here but not used anywhere
	 * else in this function.
	 */
	bh = *pbh;
	if (buffer_nilfs_bbt_node(bh)) {
		struct nilfs_inode_info *ii = NILFS_I(page->mapping->host);
		rtree = &ii->i_block_ntree;
		rtreelock = &ii->i_block_ntree_lock;
		listlock = &NILFS_SB(bh->b_page->mapping->host->i_sb)->
			s_inactive_node_pages_lock;
	} else if (buffer_nilfs_ibt_node(bh)) {
		struct nilfs_sb_info *sbi = NILFS_AS_SB(page->mapping);
		rtree = &sbi->s_inode_ntree;
		rtreelock = &sbi->s_inode_ntree_lock;
		listlock = &sbi->s_inactive_node_pages_lock;
	} else {
		page_warn("bad page %p bh %p\n", bh->b_page, bh);
		err = -EINVAL;
		goto failed;
	}
	new_index = dbn >> block_per_page_bits;
	old_index = nilfs_node_page_index(page);
	/* same page index: nothing to move */
	if (unlikely(old_index == new_index))
		return 0;

	spin_lock_irqsave(rtreelock, flags);
	if (block_per_page_bits == 0) {
		/* one block per page: the page itself can be re-keyed */
		page_debug(2, "trying to move page %p "
			   "bh %p (index: %llu -> %llu)\n",
			   page, page_buffers(page), old_index, new_index);
		err = radix_tree_64_insert(rtree, new_index, page);
		/* new index already occupied: fall back to copying (lock held) */
		if (unlikely(err == -EEXIST))
			goto try_to_copy;
		if (unlikely(err)) {
			spin_unlock_irqrestore(rtreelock, flags);
			goto failed;
		}
		/* inserted under the new index; drop the old entry */
		radix_tree_64_delete(rtree, old_index);
		radix_tree_64_tag_set(rtree, new_index, PAGECACHE64_TAG_DIRTY);
		clear_buffer_prepare_dirty(*pbh);
		page->index = (pgoff_t) new_index;
		if (nilfs_node_page_new(page)) {
			clear_buffer_nilfs_new_node(page_buffers(page));
		}
		(*pbh)->b_blocknr = (sector_t)dbn;
		spin_unlock_irqrestore(rtreelock, flags);
		return 0;
	}
	/* reached with rtreelock held */
 try_to_copy:
#ifndef BTNODE_SMALLBLK_SUPPORT
	spin_unlock_irqrestore(rtreelock, flags);
	page_warn("*** NOT IMPLEMENTED *** *** *** *** *** *** *** ***.\n");
	BUG();
#else /* BTNODE_SMALLBLK_SUPPORT */
	page_debug(2, "trying to copy a buffer in page "
		   "(index: %llu -> %llu)\n", old_index, new_index);
	new_page = radix_tree_64_lookup(rtree, new_index);
	if (likely(!new_page)) {
		/*
		 * No page at the new index yet: drop the lock, get a new
		 * block/page, add it to the tree, then retry the lookup.
		 */
		spin_unlock_irqrestore(rtreelock, flags);
		bh = head = nilfs_getblkbh((*pbh)->b_bdev,
					   new_index << block_per_page_bits,
					   1 << blkbits);
		if (unlikely(!head)) {
			err = -ENOMEM;
			goto failed;
		}
		/* flag every buffer of the new page as a node buffer */
		do {
			set_buffer_nilfs_node(bh);
			bh = bh->b_this_page;
		} while (bh != head);

		new_page = bh->b_page;
		err = nilfs_add_to_page_64_tree(new_page, rtree, rtreelock, 
						new_index, page->mapping,
						GFP_NOFS);
		if (unlikely(err)) {
			nilfs_putblkbh(head);
			page_debug(2, "radix_tree_64_insert failed (err=%d)\n",
				   err);
			goto failed;
		}
		/* try_to_copy expects the tree lock to be held */
		spin_lock_irqsave(rtreelock, flags);
#if 0
		radix_tree_64_tag_set(rtree, new_index, PAGECACHE64_TAG_DIRTY);
#endif
		goto try_to_copy;
		/*
		 * There is a possibility that the new_page might be deleted by
		 * the shrinker before re-locking the radix-tree.
		 * This problem should be solved when implementing the
		 * shrinker.
		 */
	}
	/* hold a page reference while the tree lock is released */
	page_cache_get(new_page);
	spin_unlock_irqrestore(rtreelock, flags);

	/* n: index of the destination block within the new page */
	n = dbn & ((1 << block_per_page_bits) - 1);
	lock_page(new_page);
	bh = nilfs_page_get_nth_block(new_page, n);
	unlock_page(new_page);
	/* The reference count of bh has been incremented. */
	BUG_ON(!bh);
	BUG_ON(bh == *pbh);
	page_cache_release(new_page);

	/* carry the node type flags over to the destination buffer */
	set_buffer_nilfs_node(bh);
	bh->b_blocknr = (sector_t)dbn;
	if (buffer_nilfs_bbt_node(*pbh))
		set_buffer_nilfs_bbt_node(bh);
	else /* if (buffer_nilfs_ibt_node(*pbh)) */
		set_buffer_nilfs_ibt_node(bh);
	/* set_buffer_nilfs_allocated(bh); */
	/* NOTE:
	 *  We have to comment out the above line because 'nilfs_allocated'
	 *  flag is not set for current node blocks.
	 */
	set_buffer_dirty(bh);

	/* lock_buffer(bh); */
	memcpy(bh->b_data, (*pbh)->b_data, 1 << blkbits);
	/* unlock_buffer(bh); */
	/*
	 * This function is called as a continuation of the segment
	 * construction. So we don't lock buffer when copying data.
	 */
	nilfs_mark_node_blk_dirty(bh, rtree, rtreelock);
	nilfs_set_page_writeback(bh->b_page);

	/*
	 * Replace the caller's buffer head pointer with the new buffer.
	 */
	page_debug(2, "replacing buffer (bh: %p -> %p)\n", *pbh, bh);

	/* Cleaning up old page. Is this sufficient? */
	clear_buffer_prepare_dirty(*pbh);
	nilfs_clear_node_blk_dirty(*pbh);
	if (!PageDirty((*pbh)->b_page))
		nilfs_end_page_writeback((*pbh)->b_page);
	/*
	 * NOTE(review): *pbh is still dereferenced after this put_bh();
	 * confirm that another reference keeps the old buffer alive.
	 */
	put_bh(*pbh);
	if (buffer_nilfs_bbt_node(*pbh))
		nilfs_delete_file_node_blk(*pbh);
	else /* if (buffer_nilfs_ibt_node(*pbh)) */
		nilfs_delete_inode_node_blk(*pbh);

	*pbh = bh;
#endif /* BTNODE_SMALLBLK_SUPPORT */
 failed:
	return err;
}

/* Local Variables:	*/
/* eval: (c-set-style "linux")	*/
/* End:			*/
