/*
 * segment.c - NILFS segment constructor.
 *
 * Copyright (C) 2005, 2006 Nippon Telegraph and Telephone Corporation.
 *
 * This file is part of NILFS.
 *
 * NILFS is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * NILFS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with NILFS; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 *
 * segment.c,v 1.198 2006/07/14 04:38:02 ryusuke Exp
 *
 * Written by Ryusuke Konishi <ryusuke@osrg.net>
 *
 */

#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/crc32c.h>
#include <linux/bio.h>
#include <linux/completion.h>
#include <linux/blkdev.h>
#include <linux/suspend.h>
#include "nilfs.h"

/* Forward declaration; called by nilfs_transaction_begin() below. */
static void start_segctor_timer(struct nilfs_sb_info *sbi);

/*
 * Ordering tests for 32-bit counters.
 *
 * Both operands are passed through typecheck(__u32, ...), and the
 * comparison is made on the difference of the operands after casting
 * them to __s32 rather than on the raw unsigned values.
 * nilfs_cnt32_lt() and nilfs_cnt32_le() are the same tests with the
 * operands swapped.
 */
#define nilfs_cnt32_gt(a,b)   \
        (typecheck(__u32, a) && typecheck(__u32, b) && \
         ((__s32)(b) - (__s32)(a) < 0))
#define nilfs_cnt32_ge(a,b)   \
        (typecheck(__u32, a) && typecheck(__u32, b) && \
         ((__s32)(a) - (__s32)(b) >= 0))
#define nilfs_cnt32_lt(a,b)   nilfs_cnt32_gt(b,a)
#define nilfs_cnt32_le(a,b)   nilfs_cnt32_ge(b,a)

/*
 * Macros to associate a freezing buffer with its copy
 *
 * The original buffer and its copy are linked to each other through
 * their b_assoc_buffers list heads:
 *
 * __buffer_frozen_copy(bh)   - true if @bh is linked and the first buffer
 *                              of its page carries the nilfs_allocated flag.
 * __buffer_frozen_orig(bh)   - true if @bh is linked at all.
 * __uncouple_frozen_buffer() - unlinks @bh and reinitializes its list head.
 * __couple_frozen_buffer()   - links @copy to @orig; both must be unlinked
 *                              beforehand, otherwise BUG_ON() fires.
 */
#define __buffer_frozen_copy(bh)  \
   (!list_empty(&(bh)->b_assoc_buffers) && buffer_nilfs_allocated(page_buffers((bh)->b_page)))
#define __buffer_frozen_orig(bh)  (!list_empty(&(bh)->b_assoc_buffers))
#define __uncouple_frozen_buffer(bh)  do { list_del_init(&(bh)->b_assoc_buffers); } while (0)
#define __couple_frozen_buffer(orig, copy)  \
   do { \
	BUG_ON(!list_empty(&(orig)->b_assoc_buffers) ||  \
	       !list_empty(&(copy)->b_assoc_buffers));  \
	list_add(&(copy)->b_assoc_buffers, &(orig)->b_assoc_buffers);  \
   } while (0)

/*
 * Transaction
 *
 * No exclusion control is needed among operations of the same task,
 * because all file operations, including those issued by the same task,
 * are serialized through inode->i_mutex (i_sem).
 */

/*
 * Slab cache for struct nilfs_transaction_info objects that
 * nilfs_prepare_segment_lock() allocates when the caller passes no
 * storage of its own.
 */
static kmem_cache_t *nilfs_transaction_cachep;

/**
 * nilfs_init_transaction_cache - create a cache for nilfs_transaction_info
 *
 * nilfs_init_transaction_cache() creates a slab cache for the struct
 * nilfs_transaction_info. 
 *
 * Return Value: On success, it returns 0. On error, a following negative
 * error code is returned.
 * 
 * %-ENOMEM - Insufficient memory available.
 */
int nilfs_init_transaction_cache(void)
{
	nilfs_transaction_cachep =
		kmem_cache_create("nilfs_transaction_cache",
				  sizeof(struct nilfs_transaction_info),
				  0, SLAB_RECLAIM_ACCOUNT,
				  NULL, NULL);
  
	return ((nilfs_transaction_cachep == NULL) ? -ENOMEM : 0);
}

/**
 * nilfs_destroy_transaction_cache - tear down the nilfs_transaction_info cache
 *
 * Destroys the slab cache held in nilfs_transaction_cachep.  If
 * kmem_cache_destroy() reports a non-zero result, an informational
 * message is logged.
 */
void nilfs_destroy_transaction_cache(void)
{
	if (!kmem_cache_destroy(nilfs_transaction_cachep))
		return;

	printk(KERN_INFO
	       "NILFS: some transaction on memory were not freed\n");
}

/**
 * nilfs_prepare_segment_lock - prepare to lock the segment semaphore.
 * @sb: super block
 * @ti: nilfs_transaction_info, or NULL to have one allocated
 *
 * If the current task already carries a nilfs_transaction_info for @sb
 * in its journal_info field, only the nest counter (ti_count) is
 * incremented.  Otherwise a transaction info is initialized and hooked
 * onto journal_info: the one supplied through @ti, or, when @ti is NULL,
 * one taken from the slab cache.  In the latter case the
 * %NILFS_TI_DYNAMIC_ALLOC flag is set in ti_flags.  A journal_info value
 * that belongs to a different super block is stored in ti_save after a
 * warning has been issued.
 *
 * Return Value: On success, the value of ti_count (nest level count) is
 * returned; it is 0 for the outermost call.  On error, the following
 * negative error code is returned.
 *
 * %-ENOMEM - Insufficient memory available.
 */
static int
nilfs_prepare_segment_lock(struct super_block *sb, struct nilfs_transaction_info *ti)
{
	struct nilfs_transaction_info *held = current->journal_info;
	void *foreign = NULL;

	/* Nested call on the same filesystem: just count the nesting. */
	if (held && held->ti_super == sb)
		return ++held->ti_count;

	if (held) {
		/*
		 * journal_info is occupied by another filesystem.  Keep the
		 * value so that it can be put back later; this is not
		 * expected to happen.
		 */
		nilfs_warning(sb, __FUNCTION__,
			      "journal info from a different FS");
		foreign = held;
	}

	if (ti) {
		ti->ti_flags = 0;
	} else {
		ti = kmem_cache_alloc(nilfs_transaction_cachep, SLAB_NOFS);
		if (!ti)
			return -ENOMEM;
		ti->ti_flags = NILFS_TI_DYNAMIC_ALLOC;
	}

	ti->ti_super = sb;
	ti->ti_save = foreign;
	ti->ti_count = 0;
	current->journal_info = ti;
	return 0;
}

/**
 * nilfs_transaction_begin - start indivisible file operations.
 * @sb: super block
 * @ti: nilfs_transaction_info on the caller's memory, or NULL
 *
 * nilfs_transaction_begin() is paired with nilfs_transaction_end().
 * The outermost call takes the segment constructor semaphore for
 * reading and starts the segment constructor timer; nested calls only
 * bump the nest counter, so the same task never read-locks twice.
 * When @ti is NULL the transaction info is allocated dynamically.
 *
 * Return Value: On success, 0 is returned. On error, the following
 * negative error code is returned.
 *
 * %-ENOMEM - Insufficient memory available.
 */
int nilfs_transaction_begin(struct super_block *sb, struct nilfs_transaction_info *ti)
{
	int nest = nilfs_prepare_segment_lock(sb, ti);

	if (unlikely(nest < 0))
		return nest;

	if (nest != 0) {
		/* Nested call: the semaphore is already held by this task. */
		seg_debug(3, "incremented transaction refcnt (ti=%p, cnt=%d)\n",
			  current->journal_info,
			  ((struct nilfs_transaction_info *)current->journal_info)->ti_count);
		return 0;
	}

	might_sleep();
	seg_debug(3, "task %p locking segment semaphore\n", current);
	down_read(&NILFS_SB(sb)->s_segctor.sem);
	seg_debug(3, "locked\n");
	start_segctor_timer(NILFS_SB(sb));
	return 0;
}

/**
 * nilfs_transaction_end - end indivisible file operations.
 * @sb: super block
 *
 * nilfs_transaction_end() releases the read semaphore which is 
 * acquired by nilfs_transaction_begin(). Its releasing is only done
 * in outermost call of this function. If the nilfs_transaction_info
 * was allocated dynamically, it is given back to a slab cache.
 */
int nilfs_transaction_end(struct super_block *sb)
{
	struct nilfs_transaction_info *ti = current->journal_info;
	struct nilfs_sb_info *sbi;
	int err = 0;

	if (unlikely(ti == NULL || ti->ti_super != sb)) {
		seg_debug(1, "missing nilfs_transaction_begin()\n");
		BUG();
	}
	if (ti->ti_count > 0) {
		ti->ti_count--;
		seg_debug(3, "decremented transaction refcnt (ti=%p, cnt=%d)\n",
			  ti, ti->ti_count);
		return 0;
	}
	sbi = NILFS_SB(sb);
	up_read(&sbi->s_segctor.sem);
	seg_debug(3, "task %p unlocked segment semaphore\n", current);
	current->journal_info = ti->ti_save;
	if (ti->ti_flags & NILFS_TI_SYNC) {
		err = nilfs_construct_segment(sb);
		if (!NILFS_SEG_ERR(err))
			err = 0;
	}
	if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
		kmem_cache_free(nilfs_transaction_cachep, ti);
	return err;
}

/* 
 * Segment constructor
 */
#define SC_N_PAGEVEC      16    /* size of locally allocated page vector */
#define SC_N_BHVEC        16    /* size of locally allocated buffer head
				   vector */

/*
 * Number of (1 << NILFS_SECTOR_BITS)-byte sectors needed to hold @b bytes,
 * rounded up.  @b is expected to be at least 1.
 */
#define SC_BYTES_SECTOR(b)      ((((b) - 1) >> NILFS_SECTOR_BITS) + 1)

/*
 * Mode of construction
 *
 * A mode word combines checkpoint bits (SC_MODE_CP_MASK) with a flush
 * level stored in the bits covered by SC_FLUSH_MASK, i.e. a NILFS_SC_FLUSH_*
 * value shifted left by SC_FLUSH_BASE.
 */
#define SC_MODE_CP_MASK   (SS_FJCP | SS_FNCP)

#define SC_FLUSH_BASE     12
#define SC_FLUSH_MASK     0x3000
#define SC_FLUSH_DATA     (NILFS_SC_FLUSH_DATA << SC_FLUSH_BASE)
#define SC_FLUSH_IBT      (NILFS_SC_FLUSH_IBT << SC_FLUSH_BASE)
#define SC_FLUSH_CP       (NILFS_SC_FLUSH_CP << SC_FLUSH_BASE)
#define SC_FLUSH_FDATA    (NILFS_SC_FLUSH_FDATA << SC_FLUSH_BASE)

#define SC_LSEG_MJCP      (SC_FLUSH_CP | SS_FJCP)  /* Make a logical segment having a major CP */
#define SC_LSEG_MNCP      (SC_FLUSH_CP | SS_FNCP)  /* Make a logical segment having a minor CP */
#define SC_LSEG_DSYNC     SC_FLUSH_FDATA

/* The checkpoint bits and the flush-level bits must not overlap. */
#if (SC_MODE_CP_MASK & SC_FLUSH_MASK)
#error "Conflicted flags"
#endif


/*
 * Internal return codes
 *
 * NILFS_SEG_FULL is returned by the ba_add_*() helpers when the block
 * does not fit into the remaining space of the segment.
 */
#define NILFS_SEG_FULL    1

/* Collection stages */
/*
 * Stage identifiers of block collection.
 * NOTE(review): SC_COLLECT_FDATA is numbered after SC_COLLECT_DONE;
 * presumably it is a separate entry stage rather than part of the
 * INIT..DONE sequence -- confirm against the code that consumes them.
 */
enum {
	SC_COLLECT_INIT = 0,
	SC_COLLECT_DATA,
	SC_COLLECT_FBT,
	SC_COLLECT_IBLK,
	SC_COLLECT_IBT,
	SC_COLLECT_CP,
	SC_COLLECT_DONE,
	SC_COLLECT_FDATA,
};


/*
 * SC_BH(sci, idx)     - slot @idx of the buffer head array of @sci.
 * SC_SUM_BH(sci, idx) - slot of the @idx-th segment summary block; summary
 *                       blocks are stored from the tail of the array
 *                       (index sc_ba_max - 1) downwards.
 *
 * @idx is parenthesized in SC_SUM_BH() so that an expression argument
 * such as "a + b" is subtracted as a whole.
 */
#define SC_BH(sci,idx)   ((sci)->sc_bh_arr[idx])
#define SC_SUM_BH(sci,idx)  SC_BH(sci, (sci)->sc_ba_max - (idx) - 1)


/*
 * Callback types handed to the lookup functions declared below.
 * data_proc_t receives a buffer head, an opaque pointer and a file block
 * number (fbn_t); node_proc_t receives a buffer head and an opaque pointer.
 */
typedef int (*data_proc_t)(struct buffer_head *, void *, fbn_t);
typedef int (*node_proc_t)(struct buffer_head *, void *);

/* Forward declarations of functions used before their definitions. */
static int lookup_dirty_buffers(struct super_block *, struct address_space *,
				void *, data_proc_t);
static int lookup_buffers_64(struct radix_tree_64_root *, spinlock_t *,
			     void *, node_proc_t, int);
static void abort_segment_io(struct nilfs_sc_info *, int);
static void complete_segment_io(struct nilfs_sc_info *);

/**
 * wait_on_segment - wait for completion of requested BIOs
 * @sbi: nilfs_sb_info
 * @sci: nilfs_sc_info
 *
 * wait_on_segment() waits on sc_bio_event once for every outstanding
 * BIO counted in sc_nbio and then collects the result.  If the
 * %NILFS_SC_EIO flag was raised, it is cleared and an error is reported;
 * otherwise %NILFS_SC_RETRY is cleared.  When %NILFS_SC_LAST_CP is set,
 * it is cleared and nilfs_update_last_segment() is called.
 *
 * Return Value: On Success, 0 is returned. On Error, the following
 * negative error code is returned.
 *
 * %-EIO - I/O error
 */
static int
wait_on_segment(struct nilfs_sb_info *sbi, struct nilfs_sc_info *sci)
{
	int ret;

	seg_debug(2, "called nbio=%d\n", sci->sc_nbio);
	if (sci->sc_nbio == 0)
		return 0;

	/* One completion is signalled per submitted BIO. */
	for (;;) {
		wait_for_completion(&sci->sc_bio_event);
		if (--sci->sc_nbio <= 0)
			break;
	}
	seg_debug(2, "wait completed\n");

	if (unlikely(test_and_clear_bit(NILFS_SC_EIO, &sci->sc_flags))) {
		printk(KERN_ERR "NILFS: IO error writing segment\n");
		ret = -EIO;
	} else {
		clear_bit(NILFS_SC_RETRY, &sci->sc_flags);
		ret = 0;
	}

	if (test_and_clear_bit(NILFS_SC_LAST_CP, &sci->sc_flags))
		nilfs_update_last_segment(sbi);
	return ret;
}

/**
 * nilfs_set_file_dirty - put an inode on the dirty file list
 * @sbi: nilfs_sb_info
 * @inode: inode to be registered
 *
 * Under dirty_files_lock, the inode block of @inode is looked up first.
 * An inode flagged %NILFS_STATE_UPDATED is moved to the tail of the
 * dirty file list and the flag is cleared.  An inode that is on no list
 * is grabbed with igrab() and appended.  An inode already on a list
 * without that flag is left where it is.  In all successful cases
 * %NILFS_STATE_DIRTY is set and, after the lock is dropped, the inode
 * buffer is marked dirty and released.
 *
 * Return Value: 0 on success.  If the inode block cannot be obtained,
 * the error encoded in the returned pointer is passed back.  If igrab()
 * fails, %-EINVAL is returned.
 */
int nilfs_set_file_dirty(struct nilfs_sb_info *sbi, struct inode *inode)
{
	struct nilfs_inode_info *ii = NILFS_I(inode);
	struct nilfs_inode *raw_inode;
	struct buffer_head *ibh;
	int err;

	/* Readers need spinlock for accessing s_segctor.dirty_files */
	spin_lock_irq(&sbi->s_segctor.dirty_files_lock);
	raw_inode = __nilfs_load_inode_block_nolock(sbi, inode, &ibh);
	if (IS_ERR(raw_inode)) {
		/* We first try to get a buffer of the specified inode to
		   ensure that every inode in the dirty_files list has a
		   valid BH pointer in i_bh. */
		nilfs_warning(inode->i_sb, __FUNCTION__,
			      "failed to reget inode block.\n");
		err = PTR_ERR(raw_inode);
		spin_unlock_irq(&sbi->s_segctor.dirty_files_lock);
		return err;
	}

	if (ii->i_state & NILFS_STATE_UPDATED) { /* redirty file */
		/* No igrab() on this path; only the list position changes. */
		list_del_init(&ii->i_dirty);
		ii->i_state &= ~NILFS_STATE_UPDATED;
		list_add_tail(&ii->i_dirty, &sbi->s_segctor.dirty_files);
		seg_debug(2, "redirtied file (ino=%lu)\n", inode->i_ino);
	} else if (list_empty(&ii->i_dirty)) {
		/* First registration: take an inode reference for the list. */
		if (igrab(inode) == NULL) {
			/* This will happen when somebody is freeing 
			   this inode. */
			nilfs_warning(inode->i_sb, __FUNCTION__,
				      "cannot get inode (ino=%lu)\n",
				      inode->i_ino);
			spin_unlock_irq(&sbi->s_segctor.dirty_files_lock);
			brelse(ibh);
			return -EINVAL;
		}
		list_add_tail(&ii->i_dirty, &sbi->s_segctor.dirty_files);
		seg_debug(2, "registered dirty file (ino=%lu)\n", inode->i_ino);
	}
	ii->i_state |= NILFS_STATE_DIRTY;
	spin_unlock_irq(&sbi->s_segctor.dirty_files_lock);
	/* The buffer is marked dirty only after the spinlock is released. */
	nilfs_mark_inode_buffer_dirty(sbi, ibh);
	brelse(ibh);
	return 0;
}

/**
 * nilfs_commit_dirty_file - register a file to the dirty file list
 * @inode: inode of the file to be registered.
 *
 * nilfs_commit_dirty_file() registers the file given by @inode to the
 * dirty file list through nilfs_set_file_dirty() and then closes the
 * current transaction with nilfs_transaction_end().  The transaction is
 * ended in every case, including when @inode is a bad inode and nothing
 * is registered.
 *
 * Return Value: On success, 0 is returned.  An error from the
 * registration takes precedence over the result of
 * nilfs_transaction_end().
 *
 * %-EINVAL - cannot grab the inode (This may happen when somebody is
 * freeing the inode).
 *
 * %-EIO - @inode is a bad inode, or an I/O error occurred.
 */
int nilfs_commit_dirty_file(struct inode *inode)
{
	int err, err2;

	if (is_bad_inode(inode)) {
		inode_debug(1, "tried to commit bad_inode. ignored.\n");
		nilfs_dump_stack(NILFS_VERBOSE_INODE, 2);
		err = -EIO;
	} else {
#ifdef CONFIG_NILFS_DEBUG
		if (unlikely(inode->i_state & I_FREEING))
			nilfs_warning(inode->i_sb, __FUNCTION__,
				      "trying to mark deleting file dirty.\n");
#endif
		err = nilfs_set_file_dirty(NILFS_SB(inode->i_sb), inode);
	}

	/* The transaction is always closed, whatever happened above. */
	err2 = nilfs_transaction_end(inode->i_sb);
	return err ? err : err2;
}

/**
 * nilfs_dirty_inode - reflect changes on given inode to an inode block.
 * @inode: inode of the file to be registered.
 *
 * nilfs_dirty_inode() loads the inode block that contains @inode,
 * updates the on-block inode entry under the buffer lock and marks the
 * buffer dirty.  The whole operation is enclosed in a transaction that
 * uses a nilfs_transaction_info on the stack, so it can run on its own
 * or nested inside a larger indivisible operation.  Bad inodes are
 * ignored.
 */
void nilfs_dirty_inode(struct inode *inode)
{
	struct nilfs_transaction_info ti;
	struct nilfs_inode *raw_inode;
	struct buffer_head *ibh;
	struct super_block *sb;
	struct nilfs_sb_info *sbi;

	if (is_bad_inode(inode)) {
		inode_debug(1, "tried to make bad_inode dirty. ignored.\n");
		nilfs_dump_stack(NILFS_VERBOSE_INODE, 2);
		return;
	}

	sb = inode->i_sb;
	sbi = NILFS_SB(sb);
	nilfs_transaction_begin(sb, &ti);

	raw_inode = nilfs_load_inode_block(sbi, inode, &ibh);
	if (!IS_ERR(raw_inode)) {
		lock_buffer(ibh);
		nilfs_update_inode(inode, raw_inode, ibh);
		unlock_buffer(ibh);
		nilfs_mark_inode_buffer_dirty(sbi, ibh);
		brelse(ibh);
	} else {
		nilfs_warning(sb, __FUNCTION__,
			      "failed to reget inode block.\n");
	}

	nilfs_transaction_end(sb); /* never fails */
}

/*
 * BIO operations
 */

/**
 * clear_seg_bio - release the buffer heads covered by a bio
 * @sci: nilfs_sc_info
 * @bio: BIO whose buffers are to be released
 *
 * clear_seg_bio() walks the page vector of @bio from the last entry to
 * the first.  For every page it goes round the ring of buffer heads
 * attached to the page and calls nilfs_brelse_isr() on each buffer whose
 * byte offset within the page lies inside the range described by the
 * vector entry (bv_offset .. bv_offset + bv_len).
 */
static void
clear_seg_bio(struct nilfs_sc_info *sci, struct bio *bio)
{
	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;

	do {
		struct bio_vec *bv2 = bvec; /* entry handled in this round */
		struct buffer_head *bh, *head;
		unsigned int offset = 0; /* offset of bh within the page */

		/* Step to the next entry early and prefetch its page flags. */
		if (--bvec >= bio->bi_io_vec)
			prefetchw(&bvec->bv_page->flags);

		bh = head = page_buffers(bv2->bv_page);
		do {
			if (offset >= bv2->bv_offset) {
				/* Past the end of the range: done with this page. */
				if (offset >= bv2->bv_offset + bv2->bv_len)
					break;
				/* clear_buffer_uptodate(bh); */
				nilfs_brelse_isr(sci->sc_sb, bh);
			}
			offset += bh->b_size;
			bh = bh->b_this_page;
		} while (bh != head);
	} while (bvec >= bio->bi_io_vec);
}

/**
 * end_seg_bio_write - complete a BIO for the segment write
 * @bio: BIO being completed
 * @bytes_done: byte count of completed data
 * @err: error code notified by the BIO layer
 *
 * end_seg_bio_write() is the completion routine of the BIOs submitted by
 * submit_seg_bio().  It does nothing until the whole BIO has finished
 * (bi_size has dropped to zero).  It then releases the buffers of the
 * BIO and subtracts the number of completed blocks from sc_bio_blk_cnt.
 * When that counter reaches zero, it calls complete_segment_io() to
 * finish the partial segment, or abort_segment_io() if the
 * %NILFS_SC_EIO flag is set.  Finally the BIO reference is dropped and
 * sc_bio_event is completed.
 *
 * If @err is -EOPNOTSUPP, the BIO is only flagged with BIO_EOPNOTSUPP,
 * its pages are unlocked and its reference is dropped; neither the block
 * counter nor sc_bio_event is touched, and submit_seg_bio() detects the
 * flag.
 *
 * Return Value: If given BIO has not finished yet, 1 is returned.
 * Otherwise, 0 is returned.
 */
static int
end_seg_bio_write(struct bio *bio, unsigned int bytes_done, int err)
{
	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct nilfs_sc_info *sci;
	unsigned int blk_done;

	/* Partial completion: wait for the rest of the BIO. */
	if (bio->bi_size)
		return 1;

	sci = (struct nilfs_sc_info *)bio->bi_private;
	clear_seg_bio(sci, bio);

	if (err == -EOPNOTSUPP) {
		struct bio_vec *bv, *ep = bio->bi_io_vec + bio->bi_vcnt - 1;

		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
		for (bv = bio->bi_io_vec; bv <= ep; bv++)
			unlock_page(bv->bv_page);

		bio_put(bio);
		return 0; /* to be detected by submit_seg_bio() */
	}

	blk_done = bytes_done >> sci->sc_sb->s_blocksize_bits;

	/* Record the failure; it is examined once all blocks are accounted. */
	if (!uptodate)
		set_bit(NILFS_SC_EIO, &sci->sc_flags);

	seg_debug(2, "blocks written by this bio = %u, "
		  "residual blocks of this segment = %u\n",
		  blk_done, atomic_read(&sci->sc_bio_blk_cnt));
	/* True only for the BIO that completes the last outstanding block. */
	if (atomic_sub_and_test(blk_done, &sci->sc_bio_blk_cnt)) {
		if (test_bit(NILFS_SC_EIO, &sci->sc_flags)) {
			abort_segment_io(sci, sci->sc_sum.nblocks);
			seg_debug(1, "failed to writeout a segment\n");
		} else {
			complete_segment_io(sci);
			seg_debug(1, "completed a segment\n");
		}
	}

	bio_put(bio);
	/* Wake up one waiter in submit_seg_bio() or wait_on_segment(). */
	complete(&sci->sc_bio_event);
	return 0;
}

/**
 * submit_seg_bio - submit a given bio of the segment.
 * @bio: bio
 * @sci: nilfs_sc_info
 * @mode: write mode passed to submit_bio()
 *
 * submit_seg_bio() sends a write request to the BIO layer.
 * The request is specified by @bio.
 * Every page referenced by the page vector of @bio is locked first.
 * If the number of BIOs in flight (sc_nbio) has reached sc_max_bio, the
 * function waits for earlier BIOs to complete before submitting; if an
 * I/O error is found on one of them, @bio is released without being
 * submitted.
 *
 * NOTE(review): on the -EIO return the pages locked on entry are not
 * unlocked here, whereas on -EOPNOTSUPP end_seg_bio_write() has already
 * unlocked them -- confirm what the callers expect.
 *
 * Return Value: On success, 0 is returned. on error, a following negative
 * error code is returned.
 *
 * %-EIO - I/O error.
 * %-EOPNOTSUPP - Operation is not supported.
 */
static inline int
submit_seg_bio(struct bio *bio, struct nilfs_sc_info *sci, int mode)
{
	struct bio_vec *bv, *ep = bio->bi_io_vec + bio->bi_vcnt - 1;
	int max_bio = atomic_read(&sci->sc_max_bio);

	for (bv = bio->bi_io_vec; bv <= ep; bv++)
		lock_page(bv->bv_page);

	/* Throttle: keep fewer than max_bio BIOs in flight. */
	while (sci->sc_nbio >= max_bio) {
		seg_debug(2, "waiting for a segment\n");
		wait_for_completion(&sci->sc_bio_event);
		sci->sc_nbio--;
		if (unlikely(test_bit(NILFS_SC_EIO, &sci->sc_flags))) {
			seg_debug(1, "detected error of a submitted segment\n");
			clear_seg_bio(sci, bio);
			bio_put(bio);
			return -EIO;
		}
	}
#if 0  /* issue i/o-request immediately */
	else if (sci->sc_nbio + 1 == max_bio)
		mode |= (1 << BIO_RW_SYNC);
#endif

	bio->bi_end_io = end_seg_bio_write;
	bio->bi_private = sci;
	seg_debug(2, "submitting bio (start_dbn=%llu, size=%u, vcnt=%hu, "
		  "barrier=%d)\n",
		  bio->bi_sector, bio->bi_size, bio->bi_vcnt,
		  (mode & (1 << BIO_RW_BARRIER)) != 0);

	/*
	 * Hold an extra reference so that the flags of the bio can still be
	 * inspected after the completion handler has dropped its own.
	 */
	bio_get(bio);
	submit_bio(mode, bio);
	if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
		seg_debug(1, "aborted bio submission\n");
		bio_put(bio);
		return -EOPNOTSUPP;
	}
	sci->sc_nbio++;
	bio_put(bio);
	return 0;
}

/**
 * alloc_seg_bio - allocate a bio for writing segment.
 * @sb: super block
 * @start: beginning disk block number of this BIO.
 * @nr_vecs: request size of page vector.
 *
 * alloc_seg_bio() allocates a new BIO structure with room for @nr_vecs
 * page vectors.  If that fails and the current task has PF_MEMALLOC
 * set, the allocation is retried with the vector size halved each time
 * until it succeeds or the size reaches zero.  On success the target
 * device and the start sector (converted from @start) are filled in.
 *
 * Return Value: On success, pointer to the struct bio is returned.
 * On error, NULL is returned.
 */
static struct bio *
alloc_seg_bio(struct super_block *sb, dbn_t start, int nr_vecs)
{
	struct bio *bio = bio_alloc(GFP_NOIO, nr_vecs);

	if (!bio && (current->flags & PF_MEMALLOC)) {
		/* Retry with progressively smaller page vectors. */
		for (nr_vecs >>= 1; nr_vecs; nr_vecs >>= 1) {
			bio = bio_alloc(GFP_NOIO, nr_vecs);
			if (bio)
				break;
		}
	}
	if (unlikely(!bio))
		return NULL;

	bio->bi_bdev = sb->s_bdev;
	bio->bi_sector = (sector_t)start << (sb->s_blocksize_bits - NILFS_SECTOR_BITS);
	seg_debug(2, "allocated bio (max_vecs=%d)\n", bio->bi_max_vecs);
	return bio;
}

/**
 * seg_bio_add_bh - appends a buffer head to specified bio.
 * @bio: bio
 * @bh: buffer head to be appended to the bio.
 *
 * seg_bio_add_bh() adds the block of @bh to the page vector of @bio.
 * If the block directly follows the data of the last vector entry on the
 * same page, that entry is extended; otherwise a new entry is added with
 * bio_add_page().  On success a reference to @bh is taken.
 * When the bio is full, nothing is done to the buffer.
 *
 * Return Value: On success, 0 is returned. If the page vector is full
 * or the request would exceed the queue's max_sectors, 1 is returned.
 */
static int
seg_bio_add_bh(struct bio *bio, struct buffer_head *bh)
{
	unsigned int offset = bh_offset(bh);
	unsigned int len;
	request_queue_t *q = bdev_get_queue(bio->bi_bdev);

	BUG_ON(bio_flagged(bio, BIO_CLONED));
	/* Cloned bio must not modify io-vectors.
	   This is a check against future changes */

	if (SC_BYTES_SECTOR(bio->bi_size + bh->b_size) > q->max_sectors)
		return 1; /* FULL */

	if (bio->bi_vcnt > 0) {
		struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];

		if (bvec->bv_page == bh->b_page &&
		    bvec->bv_offset + bvec->bv_len == offset) {
			/*
			 * Contiguous with the last entry: grow it in place.
			 * The total size of the bio must grow with it;
			 * otherwise bi_size under-reports the request and
			 * the max_sectors check above works on a stale
			 * value.
			 */
			bvec->bv_len += bh->b_size;
			bio->bi_size += bh->b_size;
			goto out;
		}
	}
	len = bio_add_page(bio, bh->b_page, bh->b_size, offset);
	if (len == 0)
		return 1;
 out:
	get_bh(bh);
	return 0;
}

/**
 * freeze_blocks - make a copy of data blocks and swap its original
 * @bhs: vector of buffer heads to be frozen
 * @nblock: number of blocks to be frozen
 *
 * freeze_blocks() obtains copies of @nblock buffers through
 * nilfs_copy_buffers(), couples each copy with its original and
 * replaces the entries of @bhs by the copies.  The page holding the
 * copies is marked nilfs_allocated and put under writeback, and is
 * unlocked before returning.
 *
 * Return Value: 0 on success, %-ENOMEM if the copies cannot be obtained.
 */
static int freeze_blocks(struct buffer_head **bhs, int nblock)
{
	struct buffer_head *bh, *bufs;

	seg_debug(3, "freezing blocks (bhs[0]=%p, nblock=%d)\n", *bhs, nblock);
	bufs = nilfs_copy_buffers(*bhs, nblock);
	if (unlikely(!bufs))
		return -ENOMEM;

	set_buffer_nilfs_allocated(bufs); /* mark this page nilfs_allocated */
	SetPageWriteback(bufs->b_page);
	bh = bufs;
	do {
		/* The copy takes the place of the original in @bhs. */
		get_bh(bh);
		__couple_frozen_buffer(*bhs, bh);
		/* We remain the reference count of original blocks for this association */
		/*
		  get_bh(*bhs);
		  __brelse(*bhs);
		*/
		*bhs++ = bh;
		bh = bh->b_this_page;
	} while(--nblock > 0);

	unlock_page(bufs->b_page);
	return 0;
}

/**
 * peel_off_original_blocks - clear original blocks linked to given frozen buffers
 * @bh_copy: head of copied blocks
 * @err: whether bio was successfully completed or not (0 means success)
 *
 * Starting from the original buffer coupled with @bh_copy, this function
 * follows b_this_page for as long as the buffers are still coupled.
 * Each of them is uncoupled and released; on success it is also marked
 * uptodate and clean.  Afterwards the state of the original page is
 * updated and its writeback is ended.
 */
static void peel_off_original_blocks(struct buffer_head *bh_copy, int err)
{
	struct buffer_head *bh, *prev;
	struct page *page;

	/* The original is the neighbour of the copy on the assoc list. */
	bh = list_entry(bh_copy->b_assoc_buffers.next,
			struct buffer_head, b_assoc_buffers);
	page = bh->b_page;
	seg_debug(3, "clearing blocks (bhs[0]=%p, page=%p, err=%d)\n",
		  bh, page, err);

	do {
		if (!err) {
			set_buffer_uptodate(bh);
			clear_buffer_dirty(bh);
		}
		__uncouple_frozen_buffer(bh);
		prev = bh;
		/* Advance before the reference on the current buffer is dropped. */
		bh = bh->b_this_page;
		__brelse(prev);
	} while (__buffer_frozen_orig(bh));

	if (!err) {
		/* The page becomes clean only if no buffer on it is dirty. */
		if (!nilfs_page_buffers_dirty(page)) {
			nilfs_clear_page_to_be_frozen(page);
			clear_page_dirty(page);
		}
		ClearPageError(page);
	} else
		SetPageError(page);

	end_page_writeback(page);
}

/*
 * Buffer array operations
 */
static inline int
ba_get_new_segsum_block(struct nilfs_sc_info *sci)
{
	struct buffer_head *bh;

	bh = nilfs_getblk(sci->sc_sb,
			  sci->sc_pseg_start + (sci->sc_ba_max - sci->sc_ba_sum_idx));
	if(unlikely(!bh))
		return -ENOMEM;

	SetPageWriteback(bh->b_page);
	seg_debug(3, "adding bh(%p) to bh_arr[%d]\n",
		  bh, sci->sc_ba_sum_idx - 1);
	SC_BH(sci, --sci->sc_ba_sum_idx) = bh;
	return 0;
}

static inline void
ba_register_block(struct nilfs_sc_info *sci, struct buffer_head *bh)
{
	struct buffer_head **pbh = &SC_BH(sci, sci->sc_ba_idx);
	struct page *page = bh->b_page;

	BUG_ON(*pbh);

	lock_page(page);
	nilfs_set_page_writeback(page);
	seg_debug(3, "adding bh(%p) to bh_arr[%td]\n",
		  bh, pbh - &SC_BH(sci, 0));
	get_bh(bh);
	*pbh = bh;
	sci->sc_ba_idx++;
	sci->sc_sum.nblocks++;
	unlock_page(page);
}

/**
 * ba_init - reset the buffer array for a new partial segment
 * @sci: nilfs_sc_info
 * @sbi: nilfs_sb_info
 *
 * Empties both ends of the buffer array, allocates the first segment
 * summary block and resets the summary counters.  The summary initially
 * accounts for one block and for the bytes of struct nilfs_seg_summary.
 *
 * Return Value: 0 on success, or the error returned by
 * ba_get_new_segsum_block().
 */
static int ba_init(struct nilfs_sc_info *sci, struct nilfs_sb_info *sbi)
{
	int ret;

	/* Payload grows from the front, summary blocks from the back. */
	sci->sc_ba_idx = 0;
	sci->sc_ba_sum_idx = sci->sc_ba_max;

	ret = ba_get_new_segsum_block(sci);
	if (unlikely(ret))
		return ret;

	sci->sc_blk_cnt = 0;
	sci->sc_sum_bytes = sizeof(struct nilfs_seg_summary);

	sci->sc_sum.nblk_sum = 1;
	sci->sc_sum.nblocks = 1;
	sci->sc_sum.nfbinfo = 0;
	sci->sc_sum.nfinfo = 0;
	sci->sc_sum.nblk_fbt = 0;
	sci->sc_sum.nblk_file = 0;
	sci->sc_sum.nblk_ibt = 0;
	sci->sc_sum.nblk_inode = 0;
	sci->sc_sum.seg_seq = sbi->s_nilfs->ns_seg_seq;
	sci->sc_sum.prev_pseg = sbi->s_nilfs->ns_prev_pseg;

	return 0;
}

/**
 * ba_clear - release every buffer held in the buffer array
 * @sci: nilfs_sc_info
 * @err: result of the segment construction
 *
 * The summary blocks at the tail of the array have their page writeback
 * flag cleared and are released.  For the payload blocks at the front,
 * the page error flag is set or cleared according to NILFS_SEG_ERR(@err)
 * and writeback is ended if the page is under writeback.  A frozen copy
 * additionally has its original blocks peeled off, always with an error
 * indication.  Every slot visited is reset to NULL.
 */
static void ba_clear(struct nilfs_sc_info *sci, int err)
{
	struct buffer_head **pbh = &SC_SUM_BH(sci, 0);
	struct buffer_head **ep = &SC_SUM_BH(sci, sci->sc_sum.nblk_sum);

	/* Summary blocks: from the last slot of the array downwards. */
	while (pbh > ep) {
		ClearPageWriteback((*pbh)->b_page);
		nilfs_brelse(*pbh);
		*pbh-- = NULL;
	}

	/* Payload blocks: from slot 0 up to (not including) sc_ba_idx. */
	pbh = &SC_BH(sci, 0);
	ep = &SC_BH(sci, sci->sc_ba_idx);
	while (pbh < ep) {
		struct page *page = (*pbh)->b_page;

		if (PageWriteback(page)) {
			if (NILFS_SEG_ERR(err))
				SetPageError(page);
			else
				ClearPageError(page);
			nilfs_end_page_writeback(page);
		}
#ifdef CONFIG_NILFS_DEBUG
		else {
			/*
			 * Code for analysing unexpected BUG() at 
			 * end_page_writeback()
			 *
			 * NOTE: If the buffer size does not equal to
			 *   the page size, this happens as normal.
			 *   So, this code should be removed in the future.
			 */
			nilfs_warning(sci->sc_sb, __FUNCTION__,
				      "found page without writeback flag."
				      "(nblocks=%u, nblk_sum=%u, flag=0x%x, "
				      "ba_idx=%u, i=%tu)\n",
				      sci->sc_sum.nblocks,
				      sci->sc_sum.nblk_sum,
				      sci->sc_sum.flags,
				      sci->sc_ba_idx,
				      pbh - sci->sc_bh_arr);
		}
#endif
		/* The second argument 1 reports an error to the originals. */
		if (__buffer_frozen_copy(*pbh))
			peel_off_original_blocks(*pbh, 1);
		nilfs_brelse(*pbh);
		*pbh++ = NULL;
	}
}

/**
 * ba_required_segsum_block - get the number of additional summary blocks
 * @sci: nilfs_sc_info
 * @bytes: byte count of the segment information that will be appended.
 *
 * ba_required_segsum_block() returns how many additional blocks are 
 * required for appending specified bytes to the segment summary.
 * The segment header MUST be written in advance because an incorrect
 * result will be returned when sci->sc_sum_bytes == 0.
 */
static inline unsigned int
ba_required_segsum_block(struct nilfs_sc_info *sci, unsigned int bytes)
{
	return (((sci->sc_sum_bytes - 1) & (sci->sc_sb->s_blocksize - 1)) + bytes)
		>> sci->sc_sb->s_blocksize_bits;
}

/**
 * ba_append_to_segsum - append given data to the segment summary.
 * @sci: nilfs_sc_info
 * @vp: pointer to the data, or NULL to only reserve space
 * @bytes: byte count of the data
 *
 * ba_append_to_segsum() appends @bytes bytes to the segment summary held
 * by @sci.  The data is first written to the free tail of the current
 * summary block; if it does not fit there, a new summary block is added
 * and the remainder is written to the beginning of that new block.
 * The data may therefore straddle two blocks, but @bytes must not exceed
 * the block size.
 * If @vp is NULL, only the space is reserved and nothing is copied.
 *
 * Return Value: On success, 0 is returned. On error, the following
 * negative error code is returned.
 *
 * %-ENOMEM - Insufficient memory available.
 */
static int
ba_append_to_segsum(struct nilfs_sc_info *sci, void *vp, unsigned int bytes)
{
	unsigned long offset = sci->sc_sum_bytes & (sci->sc_sb->s_blocksize - 1);
	struct buffer_head *bh = SC_BH(sci, sci->sc_ba_sum_idx);
	unsigned int bcnt = 0;	/* bytes stored in the current block */
	char *bp = vp;
	int err;

	BUG_ON(bytes > sci->sc_sb->s_blocksize);
	if (offset) {
		/* The current block still has room: fill it first. */
		bcnt = min_t(unsigned int,
			     bytes, sci->sc_sb->s_blocksize - offset);
		if (bp)
			memcpy(bh->b_data + offset, bp, bcnt);

		if (bcnt == bytes)
			goto out;
	}
	/* Extending segment summary block */
	seg_debug(2, "extending summary block (bcnt=%u)\n", bcnt);
	err = ba_get_new_segsum_block(sci);
	if (unlikely(err))
		return err;
	sci->sc_sum.nblocks++;
	sci->sc_sum.nblk_sum++;

	if (bp) {
		/*
		 * ba_get_new_segsum_block() has moved sc_ba_sum_idx to the
		 * new block.  Reload the buffer head so that the remainder
		 * goes to the new block rather than over the beginning of
		 * the previous one.
		 */
		bh = SC_BH(sci, sci->sc_ba_sum_idx);
		memcpy(bh->b_data, bp + bcnt, bytes - bcnt);
	}
 out:
	sci->sc_sum_bytes += bytes;
	return 0;
}

/**
 * ba_write_to_segsum - overwrite a region of the segment summary
 * @sci: nilfs_sc_info
 * @offset: byte offset from the beginning of the segment summary
 * @p: data to be written
 * @bytes: byte count of the data
 *
 * Copies @bytes bytes from @p into the already allocated summary area,
 * starting at @offset.  The region must lie within the bytes used so far
 * and must not be longer than one block; it may cross into the following
 * summary block, which is stored one slot lower in the buffer array.
 */
static void
ba_write_to_segsum(struct nilfs_sc_info *sci, unsigned long offset,
		   void *p, unsigned int bytes)
{
	struct buffer_head *bh;
	unsigned int slot, in_blk, first;

	BUG_ON(offset + bytes > sci->sc_sum_bytes ||
	       bytes > sci->sc_sb->s_blocksize);

	slot = sci->sc_ba_max - (offset >> sci->sc_sb->s_blocksize_bits) - 1;
	in_blk = offset & (sci->sc_sb->s_blocksize - 1);

	bh = SC_BH(sci, slot);
	BUG_ON(bh == NULL);

	/* Part that fits into the block containing @offset. */
	first = min_t(unsigned int, bytes, sci->sc_sb->s_blocksize - in_blk);
	memcpy(bh->b_data + in_blk, p, first);

	if (unlikely(first != bytes)) {
		/* The rest starts at the top of the next summary block. */
		bh = SC_BH(sci, slot - 1);
		BUG_ON(bh == NULL);
		memcpy(bh->b_data, (char *)p + first, bytes - first);
	}
}

static void
ba_end_fdata(struct nilfs_sc_info *sci, struct inode *inode)
{
	struct nilfs_finfo finfo;
	struct nilfs_inode *raw_inode = nilfs_raw_inode(NILFS_I(inode));

	if (sci->sc_blk_cnt == 0)
		return;

	finfo.fi_nblocks = cpu_to_le32(sci->sc_blk_cnt);
	finfo.fi_version = raw_inode->i_version;  /* inode version for GC */
	finfo.fi_ino = cpu_to_le64(inode->i_ino);

	ba_write_to_segsum(sci, sci->sc_info_offset, &finfo, sizeof(finfo));

	sci->sc_sum.nblk_file += sci->sc_blk_cnt;
	sci->sc_sum.nfinfo++;
	sci->sc_blk_cnt = 0;
	sci->sc_dirty_file_ptr = NILFS_I(inode);
}

/**
 * ba_add_fdata - add a data block of a file to the segment
 * @sci: nilfs_sc_info
 * @inode: inode of the file
 * @bh: buffer head of the data block
 * @fbn: file block number of the block
 *
 * Appends the file block number of @bh to the segment summary and
 * registers @bh in the buffer array.  For the first block of a run,
 * space for a nilfs_finfo is reserved in front of the block numbers;
 * it is filled in later by ba_end_fdata().  If the block and its summary
 * information do not fit into the residual blocks, the current run is
 * closed and %NILFS_SEG_FULL is returned.
 *
 * Return Value: 0 on success, %NILFS_SEG_FULL if the segment is full,
 * or a negative error code from ba_append_to_segsum().
 */
static inline int
ba_add_fdata(struct nilfs_sc_info *sci, struct inode *inode,
	     struct buffer_head *bh, fbn_t fbn)
{
	const int first = (sci->sc_blk_cnt == 0);
	unsigned long bytes = sizeof(__le64);
	unsigned int need;
	__le64 raw_fbn;
	int err;

	if (first)
		bytes += sizeof(struct nilfs_finfo);

	need = ba_required_segsum_block(sci, bytes);
	if (sci->sc_sum.nblocks + need + 1 > sci->sc_residual_blocks) {
		ba_end_fdata(sci, inode);
		return NILFS_SEG_FULL;
	}

	if (first) {
		/* Remember where the finfo goes and reserve room for it. */
		sci->sc_info_offset = sci->sc_sum_bytes;
		err = ba_append_to_segsum(sci, NULL, sizeof(struct nilfs_finfo));
		if (unlikely(err))
			return err;
	}

	/* Append the file block number of this block to the summary. */
	raw_fbn = cpu_to_le64(fbn);
	err = ba_append_to_segsum(sci, &raw_fbn, sizeof(raw_fbn));
	if (unlikely(err))
		return err;

	ba_register_block(sci, bh);
	sci->sc_blk_cnt++;
	return 0;
}

static void
ba_end_fbt_node(struct nilfs_sc_info *sci, struct inode *inode)
{
	struct nilfs_fbinfo fbinfo;

	if (sci->sc_blk_cnt == 0)
		return;

	fbinfo.fbi_nblocks = cpu_to_le32(sci->sc_blk_cnt);
	fbinfo.fbi_ino = cpu_to_le64(inode->i_ino);

	ba_write_to_segsum(sci, sci->sc_info_offset, &fbinfo, sizeof(fbinfo));

	sci->sc_sum.nblk_fbt += sci->sc_blk_cnt;
	sci->sc_sum.nfbinfo++;
	sci->sc_blk_cnt = 0;
}

/**
 * ba_add_fbt_node - add a file B-tree node block to the segment
 * @sci: nilfs_sc_info
 * @inode: inode of the file
 * @bh: buffer head of the node block
 *
 * Registers @bh in the buffer array.  For the first node block of a
 * run, space for a nilfs_fbinfo is reserved in the segment summary; it
 * is filled in later by ba_end_fbt_node().  If the segment has no room
 * left, %NILFS_SEG_FULL is returned; a run that is already open is
 * closed first.
 *
 * Return Value: 0 on success, %NILFS_SEG_FULL if the segment is full,
 * or a negative error code from ba_append_to_segsum().
 */
static int
ba_add_fbt_node(struct nilfs_sc_info *sci, struct inode *inode,
		struct buffer_head *bh)
{
	int err;

	if (sci->sc_blk_cnt != 0) {
		/* Run already open: only the block itself needs room. */
		if (sci->sc_sum.nblocks + 1 > sci->sc_residual_blocks) {
			ba_end_fbt_node(sci, inode);
			return NILFS_SEG_FULL;
		}
	} else {
		/* additional summary blocks required for the fbinfo */
		unsigned int need =
			ba_required_segsum_block(sci, sizeof(struct nilfs_fbinfo));

		if (sci->sc_sum.nblocks + need + 1 > sci->sc_residual_blocks)
			return NILFS_SEG_FULL;

		sci->sc_info_offset = sci->sc_sum_bytes;
		err = ba_append_to_segsum(sci, NULL, sizeof(struct nilfs_fbinfo));
		if (unlikely(err))
			return err;
	}

	ba_register_block(sci, bh);
	sci->sc_blk_cnt++;
	return 0;
}

/*
 * ba_add_iblock - register one inode block in the segment
 * @sci: segment constructor state
 * @bh: buffer head of the inode block
 * @fbn: block number of @bh (not used by this function)
 *
 * Appends a nilfs_iinfo record carrying the ih_ino field of the block's
 * header to the segment summary and registers the block.
 *
 * Returns 0 on success, NILFS_SEG_FULL if the block and its summary record
 * do not fit, or the error returned by ba_append_to_segsum().
 */
static inline int
ba_add_iblock(struct nilfs_sc_info *sci, struct buffer_head *bh, fbn_t fbn)
{
	struct nilfs_iinfo iinfo;
	struct nilfs_inode_hdr *hdr;
	unsigned int extra;
	int ret;

	extra = ba_required_segsum_block(sci, sizeof(iinfo));
	if (sci->sc_sum.nblocks + extra + 1 > sci->sc_residual_blocks)
		return NILFS_SEG_FULL;

	hdr = (struct nilfs_inode_hdr *)bh->b_data;
	iinfo.ii_ino = hdr->ih_ino;
	ret = ba_append_to_segsum(sci, &iinfo, sizeof(iinfo));
	if (likely(!ret)) {
		ba_register_block(sci, bh);
		sci->sc_sum.nblk_inode++;
	}
	return ret;
}

/*
 * ba_add_ibt_node - register one inode B-tree node block in the segment
 * @sci: segment constructor state
 * @bh: buffer head of the node block
 *
 * Returns 0 on success or NILFS_SEG_FULL when no residual block is left.
 */
static inline int
ba_add_ibt_node(struct nilfs_sc_info *sci, struct buffer_head *bh)
{
	if (sci->sc_sum.nblocks + 1 <= sci->sc_residual_blocks) {
		ba_register_block(sci, bh);
		sci->sc_sum.nblk_ibt++;
		return 0;
	}
	return NILFS_SEG_FULL;
}

/*
 * ba_add_cp - append the checkpoint block to the segment
 * @sci: segment constructor state
 * @flags: checkpoint flags, stored in sci->sc_cp.flags
 *
 * The flags are recorded even when the block does not fit.
 *
 * Returns 0 on success or NILFS_SEG_FULL when no residual block is left.
 */
static inline int
ba_add_cp(struct nilfs_sc_info *sci, unsigned int flags)
{
	struct buffer_head *cp_bh = sci->sc_cp_bh;
	dbn_t cp_dbn;

	sci->sc_cp.flags = flags;
	if (sci->sc_sum.nblocks + 1 > sci->sc_residual_blocks)
		return NILFS_SEG_FULL;

	/* Must be set up before ba_write_to_cp() is called */
	sci->sc_ba_cp_idx = sci->sc_ba_idx;

	/* The checkpoint block follows the summary and the payload so far */
	cp_dbn = sci->sc_pseg_start + sci->sc_ba_cp_idx + sci->sc_sum.nblk_sum;
	cp_bh->b_blocknr = cp_dbn;

	get_bh(cp_bh);
	SetPageWriteback(cp_bh->b_page);

	SC_BH(sci, sci->sc_ba_idx) = sci->sc_cp_bh;
	sci->sc_ba_idx++;
	sci->sc_sum.nblocks++;
	return 0;
}
/*
 * Callback functions that enumerate, mark, and collect dirty blocks
 */
/*
 * mark_fbt_from_data - data_proc_t callback for dirty file data buffers
 * @bh: data buffer
 * @inode: file owning @bh
 * @fbn: file block number of @bh
 *
 * Clean buffers are ignored.  For a dirty buffer the file B-tree is marked
 * via nilfs_btree_mark_from_data() and the block is added to the segment.
 *
 * Returns 0, the result of ba_add_fdata(), or the value produced by
 * nilfs_handle_fbt_error() when marking fails.
 */
static int 
mark_fbt_from_data(struct buffer_head *bh, struct inode *inode, fbn_t fbn)
{
	int ret;

	if (!buffer_dirty(bh))
		return 0;

	ret = nilfs_btree_mark_from_data(&NILFS_I(inode)->i_block_root, bh);
	if (unlikely(ret < 0)) {
		/* This maps -NILFS_BTREE_ENOKEY to -EINVAL.  In that case,
		   the btree is deemed broken. */
		return nilfs_handle_fbt_error(ret, __FUNCTION__, inode);
	}

	return ba_add_fdata(NILFS_SC(NILFS_SB(inode->i_sb)), inode, bh, fbn);
}

static int 
mark_fbt_from_node(struct buffer_head *bh, struct inode *inode)
{
	struct nilfs_inode_info *ii = NILFS_I(inode);
	int err;

	if (!buffer_dirty(bh) || buffer_prepare_dirty(bh))
		return 0;

	err = nilfs_btree_mark_from_node(&ii->i_block_root, bh);
	if (unlikely(err < 0))
		return nilfs_handle_fbt_error(err, __FUNCTION__, inode);
		/* This maps -NILFS_BTREE_ENOKEY to -EINVAL.  In that case,
		   the btree is deemed broken. */
	return 0;
}

/*
 * count_fbt_dirty - node_proc_t callback adding a dirty file B-tree node
 * @bh: node buffer; must be dirty
 * @inode: file owning @bh
 *
 * Returns the result of ba_add_fbt_node().
 */
static int 
count_fbt_dirty(struct buffer_head *bh, struct inode *inode)
{
	BUG_ON(!buffer_dirty(bh));

	return ba_add_fbt_node(NILFS_SC(NILFS_SB(inode->i_sb)), inode, bh);
}

/*
 * mark_ibt_from_data - data_proc_t callback for dirty inode blocks
 * @bh: inode block buffer
 * @sbi: nilfs_sb_info
 * @fbn: block number of @bh
 *
 * Clean buffers are ignored.  For a dirty buffer the inode B-tree is
 * marked via nilfs_btree_mark_from_data() and the block is added to the
 * segment.
 *
 * Returns 0, the result of ba_add_iblock(), or the value produced by
 * nilfs_handle_ibt_error() when marking fails.
 */
static int 
mark_ibt_from_data(struct buffer_head *bh, struct nilfs_sb_info *sbi, fbn_t fbn)
{
	int ret;

	if (!buffer_dirty(bh))
		return 0;

	seg_debug(2, "detected dirty inode block (fbn=%lu, 1st-ino=%llu)\n",
		  fbn,
		  le64_to_cpu(((struct nilfs_inode_hdr *)bh->b_data)->ih_ino));

	ret = nilfs_btree_mark_from_data(&sbi->s_inode_root, bh);
	if (unlikely(ret < 0)) {
		/* This maps -NILFS_BTREE_ENOKEY to -EINVAL.  In that case,
		   the btree is deemed broken. */
		return nilfs_handle_ibt_error(ret, __FUNCTION__, sbi->s_super);
	}

	return ba_add_iblock(NILFS_SC(sbi), bh, fbn);
}

/*
 * mark_ibt_from_node - node_proc_t callback for inode B-tree node buffers
 * @bh: node buffer
 * @sbi: nilfs_sb_info
 *
 * Only buffers that are dirty and not flagged "prepare dirty" are passed
 * to nilfs_btree_mark_from_node().
 *
 * Returns 0, or the value produced by nilfs_handle_ibt_error() when
 * marking fails.
 */
static int 
mark_ibt_from_node(struct buffer_head *bh, struct nilfs_sb_info *sbi)
{
	int ret;

	if (buffer_dirty(bh) && !buffer_prepare_dirty(bh)) {
		ret = nilfs_btree_mark_from_node(&sbi->s_inode_root, bh);
		if (unlikely(ret < 0)) {
			/* This maps -NILFS_BTREE_ENOKEY to -EINVAL.  In that
			   case, the btree is deemed broken. */
			return nilfs_handle_ibt_error(ret, __FUNCTION__,
						      sbi->s_super);
		}
	}
	return 0;
}

/*
 * count_ibt_dirty - node_proc_t callback adding a dirty inode B-tree node
 * @bh: node buffer; must be dirty
 * @sbi: nilfs_sb_info
 *
 * Returns the result of ba_add_ibt_node().
 */
static int 
count_ibt_dirty(struct buffer_head *bh, struct nilfs_sb_info *sbi)
{
	BUG_ON(!buffer_dirty(bh));

	return ba_add_ibt_node(NILFS_SC(sbi), bh);
}

/*
 * lookup_dirty_buffers - run a callback on every dirty buffer of a mapping
 * @sb: super block; supplies the block size used to derive block numbers
 * @mapping: page cache scanned for pages tagged PAGECACHE_TAG_DIRTY
 * @private: opaque argument handed to @proc
 * @proc: callback invoked as (*proc)(bh, private, fbn) per dirty buffer
 *
 * Pages are looked up SC_N_PAGEVEC at a time under the mapping's tree lock
 * and pinned with page_cache_get() before the lock is dropped.  Each dirty
 * buffer of a page is pinned around the callback.
 *
 * Returns 0 after the whole mapping has been scanned, or the first
 * non-zero value returned by @proc.  Once @proc has returned non-zero the
 * remaining pages of the current batch are only released.
 */
static int
lookup_dirty_buffers(struct super_block *sb, struct address_space *mapping,
		     void *private, data_proc_t proc)
{
	struct page *pages[SC_N_PAGEVEC];
	unsigned int nr_page;
	pgoff_t index = 0;
	int i;
	int err = 0;
	
 repeat:
#if NEED_RWLOCK_FOR_PAGECACHE_LOCK
	read_lock_irq(&mapping->tree_lock);
#else
	spin_lock_irq(&mapping->tree_lock);
#endif
	nr_page = radix_tree_gang_lookup_tag(&mapping->page_tree,
					     (void **)pages,
					     index,
					     SC_N_PAGEVEC,
					     PAGECACHE_TAG_DIRTY);
	/* Pin the pages while the tree lock still protects them */
	for (i = 0; i < nr_page; i++)
		page_cache_get(pages[i]);
#if NEED_RWLOCK_FOR_PAGECACHE_LOCK
	read_unlock_irq(&mapping->tree_lock);
#else
	spin_unlock_irq(&mapping->tree_lock);
#endif

	if (nr_page == 0)
		return 0;
	/* The next lookup starts right after the last page of this batch */
	index = pages[nr_page - 1]->index + 1;

	for(i = 0; i < nr_page; i++) {
		struct buffer_head *bh, *head;
		fbn_t fbn;

		if (err)
			goto skip_page;

		bh = head = page_buffers(pages[i]);
		/*
		 * A page holds 2^(PAGE_SHIFT - blocksize_bits) blocks, so the
		 * number of its first block is the page index shifted LEFT
		 * by that amount.  (A right shift yields colliding block
		 * numbers whenever the block size is below the page size.)
		 */
		fbn = (fbn_t)pages[i]->index <<
			(PAGE_SHIFT - sb->s_blocksize_bits);
		do {
			if (buffer_dirty(bh)) {
				get_bh(bh);
				err = (*proc)(bh, private, fbn);
				put_bh(bh);
				if (err) break;
			}
			bh = bh->b_this_page;
			fbn++;
		} while (bh != head);
	skip_page:
		page_cache_release(pages[i]);
	}
	if (!err)
		goto repeat;

	return err;
}
	    
/*
 * lookup_buffers_64 - run a callback on the buffers of tagged node pages
 * @tree: 64-bit radix tree holding the node pages
 * @tree_lock: spinlock protecting @tree
 * @private: opaque argument handed to @proc
 * @proc: callback invoked as (*proc)(bh, private) for every buffer
 * @tag: radix tree tag selecting the pages to visit
 *
 * Pages are fetched SC_N_PAGEVEC at a time and pinned under @tree_lock.
 * Every buffer of a selected page is handed to @proc, dirty or not.
 *
 * Returns 0 after all tagged pages have been visited, or the first
 * non-zero value returned by @proc.  Once @proc has returned non-zero the
 * remaining pages of the current batch are only released.
 */
static int
lookup_buffers_64(struct radix_tree_64_root *tree, spinlock_t *tree_lock,
		  void *private, node_proc_t proc, int tag)
{
	struct page *pages[SC_N_PAGEVEC];
	struct buffer_head *bh, *head;
	unsigned int found;
	unsigned int i;
	u64 next = 0;
	int err = 0;

	for (;;) {
		spin_lock_irq(tree_lock);
		found = radix_tree_64_gang_lookup_tag(tree, (void **)pages,
						      next, SC_N_PAGEVEC, tag);
		for (i = 0; i < found; i++)
			page_cache_get(pages[i]);
		spin_unlock_irq(tree_lock);

		if (found == 0)
			return 0;
		next = nilfs_node_page_index(pages[found - 1]) + 1;

		for (i = 0; i < found; i++) {
			if (!err) {
				BUG_ON(!page_has_buffers(pages[i]));
				bh = head = page_buffers(pages[i]);
				do {
					get_bh(bh);
					err = (*proc)(bh, private);
					put_bh(bh);
					if (err)
						break;
					bh = bh->b_this_page;
				} while (bh != head);
			}
			page_cache_release(pages[i]);
		}

		if (err)
			return err;
	}
}

/*
 * btree_lookup_dirty_buffers - run a callback on a B-tree's dirty buffers
 * @btree: B-tree to scan
 * @private: opaque argument handed to @proc
 * @proc: callback invoked as (*proc)(bh, private)
 *
 * Buffers are fetched in batches of up to SC_N_BHVEC between the
 * lookup_dirty_buffers begin/end calls.  Every fetched buffer is released
 * with nilfs_brelse(), including those skipped after a callback failure.
 *
 * Returns 0 when all buffers were processed, or the first non-zero value
 * returned by @proc.
 */
static inline int
btree_lookup_dirty_buffers(struct nilfs_btree *btree, void *private,
			   node_proc_t proc)
{
	struct buffer_head *bhs[SC_N_BHVEC];
	int err = 0;
	int n, i;

	nilfs_btree_lookup_dirty_buffers_begin(btree);
	do {
		n = nilfs_btree_lookup_dirty_buffers(btree, bhs, SC_N_BHVEC);
		for (i = 0; i < n; i++) {
			if (likely(!err))
				err = (*proc)(bhs[i], private);
			nilfs_brelse(bhs[i]);
		}
	} while (n != 0 && !err);
	nilfs_btree_lookup_dirty_buffers_end(btree);

	return err;
}

/*
 * dispose_garbage_list - release every inode queued on the garbage list
 * @sbi: nilfs_sb_info
 *
 * Each entry is unlinked, its state flags are cleared and its inode block
 * buffer is dropped while dirty_files_lock is held.  The lock is released
 * around iput() and taken again for the next entry.
 */
static void dispose_garbage_list(struct nilfs_sb_info *sbi)
{
	struct nilfs_inode_info *ii;
	struct list_head *ent;

	spin_lock_irq(&sbi->s_segctor.dirty_files_lock);
	while (!list_empty(&sbi->s_segctor.garbage_list)) {
		ent = sbi->s_segctor.garbage_list.next;
		ii = list_entry(ent, struct nilfs_inode_info, i_dirty);

		list_del_init(ent);
		seg_debug(3, "deleting file (ino=%lu) from garbage_list\n",
			  ii->vfs_inode.i_ino);
		ii->i_state &= ~(NILFS_STATE_DIRTY | NILFS_STATE_COLLECTED |
				 NILFS_STATE_UPDATED | NILFS_STATE_INODE_DIRTY);
		brelse(ii->i_bh);
		ii->i_bh = NULL;

		spin_unlock_irq(&sbi->s_segctor.dirty_files_lock);
		iput(&ii->vfs_inode);
		spin_lock_irq(&sbi->s_segctor.dirty_files_lock);
	}
	spin_unlock_irq(&sbi->s_segctor.dirty_files_lock);
}

/*
 * reconfirm_construction - re-check whether a segment must be constructed
 * @sbi: nilfs_sb_info
 * @sci: nilfs_sc_info
 *
 * The global dirty flag of the inode B-tree and the NILFS_SB_IBLK_DIRTY
 * flag are folded into NILFS_SC_DIRTY (the B-tree flag is always cleared;
 * NILFS_SB_IBLK_DIRTY is cleared when either one was set).
 *
 * This function must be called after synching with BIO completion routine.
 *
 * Returns 0 if there is something to write, or NILFS_SEG_EMPTY when the
 * dirty file list is empty and NILFS_SC_DIRTY is not set.
 */
static inline int
reconfirm_construction(struct nilfs_sb_info *sbi, struct nilfs_sc_info *sci)
{
	int ibt_dirty = nilfs_btree_test_and_clear_dirty(&sbi->s_inode_root);

	if (ibt_dirty || test_bit(NILFS_SB_IBLK_DIRTY, &sbi->s_flags)) {
		clear_bit(NILFS_SB_IBLK_DIRTY, &sbi->s_flags);
		set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
	}

	if (!list_empty(&sbi->s_segctor.dirty_files) ||
	    test_bit(NILFS_SC_DIRTY, &sci->sc_flags))
		return 0;

	seg_debug(1, "Aborted construction (no changes found in "
		  "reconfirmation)\n");
	return NILFS_SEG_EMPTY;
}

static inline void fill_in_file_block_roots(struct nilfs_sb_info *sbi)
{
	struct nilfs_inode_info *ii;
	struct nilfs_inode *raw_inode;

	/* spin_lock_irq(&sbi->s_segctor.dirty_files_lock); */
	list_for_each_entry(ii, &sbi->s_segctor.dirty_files, i_dirty) {
		raw_inode = nilfs_raw_inode(ii);
		raw_inode->i_block_root = cpu_to_le64(NILFS_BTREE_ROOT_DBN(&ii->i_block_root));
		ii->i_state = (ii->i_state & ~NILFS_STATE_DIRTY) | NILFS_STATE_COLLECTED;
	}
	/* spin_unlock_irq(&sbi->s_segctor.dirty_files_lock); */
}

/*
 * CRC calculation routines
 */
/*
 * fill_in_segsum_crc - compute and store the checksum of the summary
 * @sci: segment constructor state
 * @seed: CRC seed
 *
 * The CRC covers the sc_sum_bytes bytes of summary information spread
 * over the summary buffers, except for the leading ss_datasum and
 * ss_sumsum fields of the first buffer.  The result goes to ss_sumsum.
 */
static inline void fill_in_segsum_crc(struct nilfs_sc_info *sci, u32 seed)
{
	struct buffer_head **cur = &SC_SUM_BH(sci, 0);
	struct buffer_head **end = &SC_SUM_BH(sci, sci->sc_sum.nblk_sum);
	struct nilfs_seg_summary *raw_sum =
		(struct nilfs_seg_summary *)((*cur)->b_data);
	const unsigned long skip =
		sizeof(raw_sum->ss_datasum) + sizeof(raw_sum->ss_sumsum);
	unsigned long rest = sci->sc_sum_bytes;
	unsigned long len;
	u32 crc;

	/* First buffer: skip the two checksum fields */
	len = min_t(unsigned long, rest, (*cur)->b_size);
	crc = crc32c_le(seed, (unsigned char *)raw_sum + skip, len - skip);

	/* Summary buffers are stored at descending addresses */
	for (cur--; cur > end; cur--) {
		rest -= len;
		len = min_t(unsigned long, rest, (*cur)->b_size);
		crc = crc32c_le(crc, (*cur)->b_data, len);
	}

	raw_sum->ss_sumsum = cpu_to_le32(crc);
}

/*
 * fill_in_cp_crc - compute and store the checksum of the checkpoint
 * @sci: segment constructor state
 * @seed: CRC seed
 *
 * The CRC covers the first NILFS_CP_BYTES bytes of the checkpoint block
 * except for the leading cp_sum field, which receives the result.
 */
static inline void fill_in_cp_crc(struct nilfs_sc_info *sci, u32 seed)
{
	struct buffer_head *cp_bh = SC_BH(sci, sci->sc_ba_cp_idx);
	struct nilfs_checkpoint *raw_cp =
		(struct nilfs_checkpoint *)(cp_bh->b_data);
	u32 crc;

	BUG_ON(NILFS_CP_BYTES > cp_bh->b_size);

	crc = crc32c_le(seed,
			(unsigned char *)raw_cp + sizeof(raw_cp->cp_sum),
			NILFS_CP_BYTES - sizeof(raw_cp->cp_sum));
	raw_cp->cp_sum = cpu_to_le32(crc);
}

/*
 * fill_in_data_crc - compute and store the checksum of the whole segment
 * @sci: segment constructor state
 * @seed: CRC seed
 *
 * The CRC covers, in this order: the first summary buffer past its
 * ss_datasum field, the other summary buffers in full, the first
 * nblk_file payload buffers (accessed through kmap_atomic()), and the
 * remaining payload buffers up to sc_ba_idx (accessed through b_data).
 * The result is stored in ss_datasum of the first summary buffer.
 */
static inline void fill_in_data_crc(struct nilfs_sc_info *sci, u32 seed)
{
	struct buffer_head **cur = &SC_SUM_BH(sci, 0);
	struct buffer_head **end = &SC_SUM_BH(sci, sci->sc_sum.nblk_sum);
	struct nilfs_seg_summary *raw_sum =
		(struct nilfs_seg_summary *)((*cur)->b_data);
	void *kaddr;
	u32 crc;

	/* Summary buffers, stored at descending addresses */
	crc = crc32c_le(seed,
			(unsigned char *)raw_sum + sizeof(raw_sum->ss_datasum),
			(*cur)->b_size - sizeof(raw_sum->ss_datasum));
	for (cur--; cur > end; cur--)
		crc = crc32c_le(crc, (*cur)->b_data, (*cur)->b_size);

	/* File data buffers: map the page temporarily */
	cur = &SC_BH(sci, 0);
	for (end = &SC_BH(sci, sci->sc_sum.nblk_file); cur < end; cur++) {
		kaddr = kmap_atomic((*cur)->b_page, KM_USER0);
		crc = crc32c_le(crc, kaddr + bh_offset(*cur), (*cur)->b_size);
		kunmap_atomic(kaddr, KM_USER0);
	}

	/* All remaining payload buffers */
	for (end = &SC_BH(sci, sci->sc_ba_idx); cur < end; cur++)
		crc = crc32c_le(crc, (*cur)->b_data, (*cur)->b_size);

	raw_sum->ss_datasum = cpu_to_le32(crc);
}

/*
 * fill_in_cp - fill in the on-disk checkpoint block
 * @sci: segment constructor state
 * @sbi: nilfs_sb_info
 *
 * Records the current root of the inode B-tree in sci->sc_cp and writes
 * the checkpoint fields in little-endian form into the checkpoint buffer.
 * cp_sum is left untouched here.
 */
static inline void 
fill_in_cp(struct nilfs_sc_info *sci, struct nilfs_sb_info *sbi)
{
	struct nilfs_checkpoint *cp;

	sci->sc_cp.inode_root = NILFS_BTREE_ROOT_DBN(&sbi->s_inode_root);

	/* Writing checkpoint blocks */
	cp = (struct nilfs_checkpoint *)(SC_BH(sci, sci->sc_ba_cp_idx)->b_data);
	cp->cp_flags = cpu_to_le16(sci->sc_cp.flags);
	cp->cp_bytes = cpu_to_le16(NILFS_CP_BYTES);
	cp->cp_inode_root = cpu_to_le64(sci->sc_cp.inode_root);

	/* blocks of the segments written so far plus the current one */
	cp->cp_nblk_lseg = cpu_to_le64(sci->sc_nblk_lseg + sci->sc_sum.nblocks);

	cp->cp_inodes_count = cpu_to_le64(atomic_read(&sbi->s_inodes_count));
	cp->cp_blocks_count = cpu_to_le64(atomic_read(&sbi->s_blocks_count));

	if (sci->sc_sketch_inode)
		cp->cp_sketch_size = cpu_to_le32(sci->sc_sketch_inode->i_size);
	else
		cp->cp_sketch_size = 0;
}

/*
 * fill_in_segsum - fill in the header of the segment summary
 * @sci: segment constructor state
 *
 * Copies the counters of sci->sc_sum into the first summary buffer in
 * little-endian form.  The creation time is refreshed from get_seconds()
 * unless NILFS_SC_PASSIVE is set.
 */
static inline void fill_in_segsum(struct nilfs_sc_info *sci)
{
	struct nilfs_seg_summary *hdr;

	if (!test_bit(NILFS_SC_PASSIVE, &sci->sc_flags))
		sci->sc_sum.ctime = get_seconds();

	hdr = (struct nilfs_seg_summary *)SC_SUM_BH(sci, 0)->b_data;

	/* Identification and linkage */
	hdr->ss_magic   = cpu_to_le16(NILFS_SEGSUM_MAGIC);
	hdr->ss_seq     = cpu_to_le64(sci->sc_sum.seg_seq);
	hdr->ss_create  = cpu_to_le64(sci->sc_sum.ctime);
	hdr->ss_prev    = cpu_to_le16(sci->sc_sum.prev_pseg);
	hdr->ss_flags   = cpu_to_le16(sci->sc_sum.flags);

	/* Record counts */
	hdr->ss_nfinfo  = cpu_to_le16(sci->sc_sum.nfinfo);
	hdr->ss_nfbinfo = cpu_to_le16(sci->sc_sum.nfbinfo);

	/* Block counts */
	hdr->ss_nblocks = cpu_to_le16(sci->sc_sum.nblocks);
	hdr->ss_nfblk   = cpu_to_le16(sci->sc_sum.nblk_file);
	hdr->ss_nfbblk  = cpu_to_le16(sci->sc_sum.nblk_fbt);
	hdr->ss_niblk   = cpu_to_le16(sci->sc_sum.nblk_inode);
	hdr->ss_nibblk  = cpu_to_le16(sci->sc_sum.nblk_ibt);
}

static void redirty_files(struct nilfs_sb_info *sbi)
{
	struct nilfs_inode_info *ii;
	unsigned long flags;

	spin_lock_irqsave(&sbi->s_segctor.dirty_files_lock, flags);
	list_for_each_entry(ii, &sbi->s_segctor.dirty_files, i_dirty) {
		if (ii->i_state & NILFS_STATE_COLLECTED) {
			ii->i_state = (ii->i_state & ~NILFS_STATE_COLLECTED) |
				NILFS_STATE_DIRTY;
			seg_debug(2, "redirtied file (ino=%lu)\n",
				  ii->vfs_inode.i_ino);
		}
	}
	spin_unlock_irqrestore(&sbi->s_segctor.dirty_files_lock, flags);
}

static void drop_collected_files(struct nilfs_sb_info *sbi)
{
	struct list_head *head;
	struct nilfs_inode_info *ii, *n;
	unsigned long flags;

	head = &sbi->s_segctor.dirty_files;
	spin_lock_irqsave(&sbi->s_segctor.dirty_files_lock, flags);
	list_for_each_entry_safe(ii, n, head, i_dirty) {
		if (!(ii->i_state & NILFS_STATE_COLLECTED))
			continue;
		ii->i_state &= ~(NILFS_STATE_COLLECTED | NILFS_STATE_INODE_DIRTY);
		if (ii->i_state & NILFS_STATE_DIRTY)
			continue;

		list_del_init(&ii->i_dirty);
		seg_debug(3, "dropping collected file (ino=%lu)\n",
			  ii->vfs_inode.i_ino);
		ii->i_state |= NILFS_STATE_UPDATED;
		list_add_tail(&ii->i_dirty, &sbi->s_segctor.garbage_list);
	}
	spin_unlock_irqrestore(&sbi->s_segctor.dirty_files_lock, flags);
}

/*
 * collect_segment_blocks - collect dirty blocks for one partial segment
 * @sci: segment constructor state; sc_stage selects where to (re)start
 * @sbi: nilfs_sb_info
 * @mode: construction mode (SC_LSEG_DSYNC, SC_FLUSH_DATA, SC_FLUSH_IBT or
 *        a mode carrying SC_MODE_CP_MASK bits)
 *
 * This is a resumable state machine: the switch dispatches on
 * sci->sc_stage and every stage that completes increments sc_stage and
 * falls through into the next case.  When a stage returns non-zero the
 * function leaves with sc_stage unchanged, so that a later call continues
 * with the same stage.  For the per-file stages the list position is
 * additionally saved in sci->sc_dirty_file_ptr.
 *
 * Returns 0 when collection finished for this call (sc_stage is then
 * SC_COLLECT_DONE), or the non-zero value returned by the failing stage.
 * Values for which NILFS_SEG_ERR() is false are logged as "SEG FEED".
 */
static inline int 
collect_segment_blocks(struct nilfs_sc_info *sci, struct nilfs_sb_info *sbi, int mode)
{
	struct list_head *head;
	struct nilfs_inode_info *ii;
	struct inode *inode;
	int err = 0;

	sci->sc_sum.flags = 0;

 start:
	switch (sci->sc_stage) {
	case SC_COLLECT_INIT:
		/*
		 * Pre-processes before first segment construction are
		 * inserted here.
		 */
		if (!test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags)) {
			sci->sc_nblk_lseg = 0;
			sci->sc_sum.flags = SS_LOGBGN;
			if (mode == SC_LSEG_DSYNC) {
				/* Data sync: jump directly to the
				   single-file stage */
				sci->sc_stage = SC_COLLECT_FDATA;
				seg_debug(2, "** FDATA BEGIN\n");
				goto start;
			}
			seg_debug(2, "** LSEG BEGIN\n");
		} else
			seg_debug(2, "** LSEG RESUME\n");

		sci->sc_dirty_file_ptr = NULL;
		sci->sc_stage++;
		/* fall through */
	case SC_COLLECT_DATA:
		seg_debug(2, "** DATA STAGE\n");
		/* spin_lock_irq(&sbi->s_segctor.dirty_files_lock); */
		head = &sbi->s_segctor.dirty_files;
		/* Continue after the saved entry, or from the list head */
		ii = list_prepare_entry(sci->sc_dirty_file_ptr, head, i_dirty);
		list_for_each_entry_continue(ii, head, i_dirty) {
			if (!(ii->i_state & NILFS_STATE_DIRTY))
				continue;
			inode = &ii->vfs_inode;

			err = lookup_dirty_buffers(sci->sc_sb, inode->i_mapping,
						   inode,
						   (data_proc_t)mark_fbt_from_data);
			if (err) {
				/* Save the predecessor so that the next call
				   resumes with this inode */
				sci->sc_dirty_file_ptr = 
					list_entry(ii->i_dirty.prev,
						   struct nilfs_inode_info,
						   i_dirty);
				/* spin_unlock_irq(&sbi->s_segctor.dirty_files_lock); */
				goto break_or_fail;
			}
			ba_end_fdata(sci, inode);
		}
		sci->sc_dirty_file_ptr = NULL;
		/* spin_unlock_irq(&sbi->s_segctor.dirty_files_lock); */
		sci->sc_stage++;
		/* fall through */
	case SC_COLLECT_FBT:
		seg_debug(2, "** FBT STAGE\n");
		/* spin_lock_irq(&sbi->s_segctor.dirty_files_lock); */
		head = &sbi->s_segctor.dirty_files;
		ii = list_prepare_entry(sci->sc_dirty_file_ptr, head, i_dirty);
		list_for_each_entry_continue(ii, head, i_dirty) {
			if (!(ii->i_state & NILFS_STATE_DIRTY))
				continue;
			inode = &ii->vfs_inode;

			/* First pass: mark the B-tree from dirty node pages */
			err = lookup_buffers_64(&ii->i_block_ntree,
						&ii->i_block_ntree_lock,
						inode,
						(node_proc_t)mark_fbt_from_node, 
						PAGECACHE64_TAG_DIRTY);
			if (err) {
				sci->sc_dirty_file_ptr = 
					list_entry(ii->i_dirty.prev,
						   struct nilfs_inode_info,
						   i_dirty);
				/* spin_unlock_irq(&sbi->s_segctor.dirty_files_lock); */
				goto break_or_fail;
			}
			
			/* Second pass: add the dirty node blocks to the
			   segment */
			err = btree_lookup_dirty_buffers(&ii->i_block_root,
							 inode,
							 (node_proc_t)count_fbt_dirty);
			if (err) {
				sci->sc_dirty_file_ptr = 
					list_entry(ii->i_dirty.prev,
						   struct nilfs_inode_info,
						   i_dirty);
				/* spin_unlock_irq(&sbi->s_segctor.dirty_files_lock); */
				goto break_or_fail;
			}
			ba_end_fbt_node(sci, inode);
		}
		sci->sc_dirty_file_ptr = NULL;
		/* spin_unlock_irq(&sbi->s_segctor.dirty_files_lock); */
		if (mode == SC_FLUSH_DATA) {
			/* Data flush stops here; SS_LOGEND is not set */
			sci->sc_stage = SC_COLLECT_DONE;
			seg_debug(2, "** LSEG CONTINUED\n");
			return 0;
		}
		sci->sc_stage++;
		/* fall through */
	case SC_COLLECT_IBLK:
		seg_debug(2, "** INODE STAGE\n");
		err = lookup_dirty_buffers(sci->sc_sb, sbi->s_mapping, sbi,
					   (data_proc_t)mark_ibt_from_data);
		if (err) break;
		sci->sc_stage++;
		/* fall through */
	case SC_COLLECT_IBT:
		seg_debug(2, "** IBT STAGE\n");
		err = lookup_buffers_64(&sbi->s_inode_ntree,
					&sbi->s_inode_ntree_lock,
					sbi,
					(node_proc_t)mark_ibt_from_node, 
					PAGECACHE64_TAG_DIRTY);
		if (err) break;

		err = btree_lookup_dirty_buffers(&sbi->s_inode_root,
						 sbi,
						 (node_proc_t)count_ibt_dirty);
		if (err) break;
		if (mode == SC_FLUSH_IBT) {
			/* IBT flush stops here; SS_LOGEND is not set */
			sci->sc_stage = SC_COLLECT_DONE;
			seg_debug(2, "** LSEG CONTINUED\n");
			return 0;
		}
		sci->sc_stage++;
		/* fall through */
	case SC_COLLECT_CP:
		seg_debug(2, "** CP STAGE\n");
		if (mode & SC_MODE_CP_MASK) {
			/* Appending a checkpoint */
			err = ba_add_cp(sci, (mode & SS_FJCP) ? NILFS_CP_MAJOR : 0);
			if (err) break;
		}
		sci->sc_stage++;
		/* fall through */
	case SC_COLLECT_DONE:
		/*
		 * Post processes after final segment construction
		 * can be inserted here.
		 */
		sci->sc_sum.flags |= (mode & SC_MODE_CP_MASK) | SS_LOGEND;
		seg_debug(2, "** LSEG END\n");
		return 0;
	case SC_COLLECT_FDATA:
		/* Single-file data sync; the file is taken from
		   sc_dirty_file_ptr */
		sci->sc_sum.flags |= SS_SYNDT;
		ii = sci->sc_dirty_file_ptr;
		/* NOTE(review): this break leaves with err == 0 while
		   sc_stage is still SC_COLLECT_FDATA - confirm that the
		   caller cannot loop on that state. */
		if (!(ii->i_state & NILFS_STATE_DIRTY))
			break;
		inode = &ii->vfs_inode;
		err = lookup_dirty_buffers(sci->sc_sb, inode->i_mapping, inode,
					   (data_proc_t)mark_fbt_from_data);
		if (err) break;
		ba_end_fdata(sci, inode);
		sci->sc_dirty_file_ptr = NULL;
		sci->sc_sum.flags |= SS_LOGEND;
		sci->sc_stage = SC_COLLECT_DONE;
		seg_debug(2, "** FDATA END\n");
		return 0;
	default:
		BUG();
	}
 break_or_fail:
	/* ba_cancel_blocks(sbi, sci, ...); */
	if (NILFS_SEG_ERR(err))
		seg_debug(2, "** ERROR(%d)\n", err);
	else
		seg_debug(2, "** SEG FEED\n");
	return err;
}

/**
 * follow_up_check - Check whether the segment is empty or not.
 * @sci: nilfs_sc_info
 * @has_cp: whether the current segment includes a checkpoint or not.
 * @mode: construction mode; NILFS_SC_DIRTY is left alone for SC_LSEG_DSYNC
 *
 * A simplex segment whose payload consists of nothing, or of the
 * checkpoint only, is rejected.
 *
 * Returns NILFS_SEG_EMPTY if the segment is rejected, 0 otherwise.
 */
static inline int follow_up_check(struct nilfs_sc_info *sci, int has_cp, int mode)
{
	if (!NILFS_SEG_SIMPLEX(&sci->sc_sum))
		return 0;

	/* payload blocks = all blocks minus summary blocks */
	if (sci->sc_sum.nblocks - sci->sc_sum.nblk_sum > !!has_cp)
		return 0;

	if (mode != SC_LSEG_DSYNC)
		clear_bit(NILFS_SC_DIRTY, &sci->sc_flags);
	seg_debug(1, "Aborted construction (no blocks were collected)\n");
	return NILFS_SEG_EMPTY;
}

/*
 * update_dbn - assign disk block numbers to the collected payload blocks
 * @sci: segment constructor state
 * @sbi: nilfs_sb_info
 *
 * Walks the payload buffer array from SC_BH(sci, 0) in four consecutive
 * runs - file data (nblk_file), file B-tree nodes (nblk_fbt), inode blocks
 * (nblk_inode) and inode B-tree nodes (nblk_ibt) - handing out consecutive
 * disk block numbers that start right behind the summary blocks, and
 * informing the owning B-tree of each new number.
 *
 * Returns 0 on success or the error reported by the failing helper
 * (B-tree errors are passed through nilfs_handle_fbt_error() or
 * nilfs_handle_ibt_error() first).
 */
static int
update_dbn(struct nilfs_sc_info *sci, struct nilfs_sb_info *sbi)
{
	struct nilfs_btree *inode_root = &sbi->s_inode_root;
	struct buffer_head **pbh, *bh, **spbh;
	struct inode *inode;
	int err = 0;
	dbn_t dbn;
	unsigned int cnt;
	struct page *prev_page = NULL;

	seg_debug(2, "called\n");
	spbh = pbh = &SC_BH(sci, 0);
	/* The payload starts right after the summary blocks */
	dbn = sci->sc_pseg_start + sci->sc_sum.nblk_sum;

	/* Run 1: file data blocks */
	cnt = sci->sc_sum.nblk_file;
	while (cnt-- > 0) {
		BUG_ON(buffer_nilfs_node(*pbh));

		bh = *pbh;
		if (prev_page != bh->b_page) {
			/* A new page begins: freeze the buffers [spbh, pbh)
			   of the previous page if it asks for it */
			if (prev_page && nilfs_page_to_be_frozen(prev_page)) {
				err = freeze_blocks(spbh, pbh - spbh);
				if (unlikely(err))
					goto out;
			}
			spbh = pbh;
		}
		inode = bh->b_page->mapping->host;
		err = nilfs_btree_update_data_dbn(&NILFS_I(inode)->i_block_root,
						  bh, dbn);
		if (unlikely(err)) {
			err = nilfs_handle_fbt_error(err, __FUNCTION__, inode);
			goto out;
		}
		bh->b_blocknr = (sector_t)dbn++;  pbh++;
		prev_page = bh->b_page;
	}
	/* Handle the buffers of the last page of the run */
	if (prev_page && nilfs_page_to_be_frozen(prev_page)) {
		err = freeze_blocks(spbh, pbh - spbh);
		if (unlikely(err))
			goto out;
	}

	/* Run 2: file B-tree node blocks */
	cnt = sci->sc_sum.nblk_fbt;
	while (cnt-- > 0) {
		BUG_ON(!buffer_nilfs_bbt_node(*pbh));

		inode = (*pbh)->b_page->mapping->host;
		err = nilfs_btree_update_node_dbn(&NILFS_I(inode)->i_block_root,
						 *pbh, dbn);
		if (unlikely(err)) {
			err = nilfs_handle_fbt_error(err, __FUNCTION__, inode);
			goto out;
		}
		/* Takes the array slot itself, not just the buffer */
		err = nilfs_move_dirty_file_node_blk(inode, pbh, dbn);
		if (unlikely(err))
			goto out;
		pbh++;  dbn++;
	}
	/* Run 3: inode blocks */
	cnt = sci->sc_sum.nblk_inode;
	while (cnt-- > 0) {
		BUG_ON(buffer_nilfs_node(*pbh));

		bh = *pbh++;
		err = nilfs_btree_update_data_dbn(inode_root, bh, dbn);
		if (unlikely(err)) {
			err = nilfs_handle_ibt_error(err, __FUNCTION__, sbi->s_super);
			goto out;
		}
		bh->b_blocknr = (sector_t)dbn++;
	}

	/* Run 4: inode B-tree node blocks */
	cnt = sci->sc_sum.nblk_ibt;
	while (cnt-- > 0) {
		BUG_ON(!buffer_nilfs_ibt_node(*pbh));

		err = nilfs_btree_update_node_dbn(inode_root, *pbh, dbn);
		if (unlikely(err)) {
			err = nilfs_handle_ibt_error(err, __FUNCTION__, sbi->s_super);
			goto out;
		}
		err = nilfs_move_dirty_inode_node_blk(sci->sc_sb, pbh, dbn);
		if (unlikely(err))
			goto out;
		pbh++;  dbn++;
	}
 out:
	seg_debug(2, "done\n");
	return err;
}

/*
 * __end_page_io_by_error - finish the I/O of a page whose write failed
 * @page: page to finish; NULL is accepted and ignored
 *
 * Flags the page with an error, unlocks it and ends its writeback.
 */
static void __end_page_io_by_error(struct page *page)
{
	if (page) {
		SetPageError(page);
		unlock_page(page);
		nilfs_end_page_writeback(page);
	}
}

/*
 * abort_segment_io - undo the preparation of a segment whose write failed
 * @sci: segment constructor state
 * @nblk_submit: number of leading blocks of the segment (summary blocks
 *               first, then payload) that had been handed to a bio
 *
 * All buffers of the segment are released and their array slots cleared,
 * walking backwards from the last payload block to the first summary
 * block.  Buffers with an index below @nblk_submit are additionally
 * unlocked, detached from their original if they are frozen copies, and
 * their pages are finished with an error.  If this construction pass
 * crossed into SC_COLLECT_IBLK, the collected files are made dirty again.
 */
static void abort_segment_io(struct nilfs_sc_info *sci, int nblk_submit)
{
	struct buffer_head **pbh;
	struct page *page, *prev_page = NULL;
	int idx = sci->sc_sum.nblocks;

	seg_debug(2, "aborting segment (nblk_submit=%d)\n", nblk_submit);
	/* Payload blocks, from the last one down to the first */
	pbh = &SC_BH(sci, sci->sc_ba_idx);
	while (idx > sci->sc_sum.nblk_sum) {
		--idx;  --pbh;
		page = (*pbh)->b_page;
		if(idx < nblk_submit) {
			/* clear_buffer_uptodate(*pbh); */
			unlock_buffer(*pbh);
			if (__buffer_frozen_copy(*pbh))
				peel_off_original_blocks(*pbh, 1);
			/* A page is finished once all of its buffers, which
			   are adjacent in the array, have been handled */
			if (page != prev_page) {
				__end_page_io_by_error(prev_page);
				prev_page = page;
			}
		}
		nilfs_brelse_isr(sci->sc_sb, *pbh);
		*pbh = NULL;
	}
	__end_page_io_by_error(prev_page);

	/* Summary blocks, from the last one down to the first; their array
	   is laid out at descending addresses, hence ++pbh */
	prev_page = NULL;
	pbh = &SC_SUM_BH(sci, sci->sc_sum.nblk_sum);
	while (idx > 0) {
		--idx;  ++pbh;
		page = (*pbh)->b_page;
		if(idx < nblk_submit) {
			/* clear_buffer_uptodate(*pbh); */
			unlock_buffer(*pbh);
			if (__buffer_frozen_copy(*pbh))
				peel_off_original_blocks(*pbh, 1);
			if (page != prev_page) {
				__end_page_io_by_error(prev_page);
				prev_page = page;
			}
		}
		nilfs_brelse_isr(sci->sc_sb, *pbh);
		*pbh = NULL;
	}
	__end_page_io_by_error(prev_page);

	/* Same condition under which __construct_segment() calls
	   fill_in_file_block_roots(), which marks the files COLLECTED */
	if (sci->sc_prev_stage < SC_COLLECT_IBLK &&
	    sci->sc_stage >= SC_COLLECT_IBLK)
		redirty_files(NILFS_SB(sci->sc_sb));
}

/*
 * __end_page_io - finish the I/O of a successfully written page
 * @page: page to finish; NULL is accepted and ignored
 *
 * The page's dirty state is cleared unless some of its buffers are still
 * dirty; then the error flag is cleared, the page is unlocked and its
 * writeback is ended.
 */
static void __end_page_io(struct page *page)
{
	if (page) {
		if (!nilfs_page_buffers_dirty(page))
			nilfs_clear_page_dirty(page);

		ClearPageError(page);
		unlock_page(page);
		nilfs_end_page_writeback(page);
	}
}

/*
 * __end_bh_io - finish the I/O of one successfully written buffer
 * @sb: super block, passed on to nilfs_brelse_isr()
 * @bh: buffer that has been written
 * @prev_page: in/out; page of the buffer handled by the previous call,
 *             or NULL on the first call
 *
 * Marks the buffer up to date and clean, unlocks it and releases it.
 * When @bh belongs to a different page than *@prev_page, the I/O of the
 * previous page is finished and *@prev_page is advanced to the page of
 * @bh; the caller has to finish the last page itself.
 */
static void
__end_bh_io(struct super_block *sb, struct buffer_head *bh,
	    struct page **prev_page)
{
	set_buffer_uptodate(bh);
	clear_buffer_dirty(bh);
	unlock_buffer(bh);

	/* A frozen copy is detached from its original first */
	if (__buffer_frozen_copy(bh))
		peel_off_original_blocks(bh, 0);
	if (bh->b_page != *prev_page) {
		__end_page_io(*prev_page);
		*prev_page = bh->b_page;
	}
	nilfs_brelse_isr(sb, bh);
}

/*
 * complete_segment_io - post-process a completely written segment
 * @sci: segment constructor state
 *
 * Finishes the I/O of every payload and summary buffer, retires the
 * collected files, maintains the NILFS_SC_UNCLOSED flag for segments that
 * are not simplex, and records the new checkpoint when the segment
 * carries one.
 */
static void complete_segment_io(struct nilfs_sc_info *sci)
{
	struct nilfs_sb_info *sbi = NILFS_SB(sci->sc_sb);
	struct buffer_head **slot;
	struct page *last_page;
	int idx = sci->sc_sum.nblocks;

	sci->sc_nblk_lseg += sci->sc_sum.nblocks;

	/*
	 * Buffers of one page are assumed to sit next to each other in the
	 * buffer head array, so a change of bh->b_page tells that the
	 * previous page has been handled completely.
	 */
	seg_debug(2, "completing segment (flags=0x%x)\n", sci->sc_sum.flags);

	/* Payload buffers, last to first */
	last_page = NULL;
	slot = &SC_BH(sci, sci->sc_ba_idx);
	while (idx > sci->sc_sum.nblk_sum) {
		idx--;
		slot--;
		__end_bh_io(sci->sc_sb, *slot, &last_page);
		*slot = NULL;
	}
	__end_page_io(last_page);

	/* Summary buffers, last to first (array grows downwards) */
	last_page = NULL;
	slot = &SC_SUM_BH(sci, sci->sc_sum.nblk_sum);
	while (idx > 0) {
		idx--;
		slot++;
		__end_bh_io(sci->sc_sb, *slot, &last_page);
		*slot = NULL;
	}
	__end_page_io(last_page);

	drop_collected_files(sbi);

	if (!NILFS_SEG_SIMPLEX(&sci->sc_sum)) {
		if (NILFS_SEG_LOGBGN(&sci->sc_sum)) {
			set_bit(NILFS_SC_UNCLOSED, &sci->sc_flags);
			sci->sc_lseg_stime = jiffies;
			seg_debug(2, "set UNCLOSED flag\n");
		}
		if (NILFS_SEG_LOGEND(&sci->sc_sum)) {
			clear_bit(NILFS_SC_UNCLOSED, &sci->sc_flags);
			seg_debug(2, "cleared UNCLOSED flag\n");
		}
	}

	if (!NILFS_SEG_HAS_CP(&sci->sc_sum))
		return;

	/* All BIOs of this segment are done: publish the checkpoint */
	set_nilfs_cp(sbi->s_nilfs, sci->sc_cp.inode_root,
		     sci->sc_sum.seg_seq,
		     sci->sc_pseg_start,
		     sci->sc_pseg_start + sci->sc_ba_cp_idx + sci->sc_sum.nblk_sum,
		     sci->sc_sum.ctime,
		     atomic_read(&sbi->s_inodes_count),
		     atomic_read(&sbi->s_blocks_count));

	clear_bit(NILFS_SC_DIRTY, &sci->sc_flags);
	set_bit(NILFS_SC_LAST_CP, &sci->sc_flags);
	seg_debug(2, "completed a segment having a checkpoint\n");
}

/*
 * Bookkeeping used while one segment is submitted as a series of bios.
 * Block indices count from the start of the segment.
 */
struct seg_write_info {
	struct bio *bio;	/* bio being filled, NULL if none is open */
	int start;		/* first block of the region to be submitted */
	int end;		/* one past the last block added so far */
	int rest_blocks;	/* blocks not yet submitted */
	int max_pages;		/* bio_get_nr_vecs() of the block device */
	int nr_vecs;		/* vector count for the next bio allocation */
};

/*
 * submit_segment - submit the bio under construction
 * @sci: segment constructor state
 * @wi: write bookkeeping; wi->bio is handed over and reset to NULL
 * @mode: passed on to submit_seg_bio()
 *
 * On success the submitted region [start, end) is accounted for and the
 * vector count for the next bio is recomputed.
 *
 * Returns 0 on success or the error returned by submit_seg_bio().
 */
static int submit_segment(struct nilfs_sc_info *sci, 
			  struct seg_write_info *wi, int mode)
{
	int err;

	err = submit_seg_bio(wi->bio, sci, mode);
	wi->bio = NULL;
	if (unlikely(err))
		return err;

	wi->rest_blocks -= wi->end - wi->start;
	wi->nr_vecs = min(wi->max_pages, wi->rest_blocks);
	wi->start = wi->end;
	return 0;
}

/*
 * init_seg_write_info - set up the bookkeeping for writing a segment
 * @wi: structure to initialize
 * @nr_blocks: total number of blocks to be written
 * @bdev: target block device, queried for its bio vector limit
 */
static inline void
init_seg_write_info(struct seg_write_info *wi, int nr_blocks,
		    struct block_device *bdev)
{
	wi->bio = NULL;
	wi->start = 0;
	wi->end = 0;
	wi->rest_blocks = nr_blocks;
	wi->max_pages = bio_get_nr_vecs(bdev);
	wi->nr_vecs = min(wi->max_pages, wi->rest_blocks);
}

static int
seg_bh_add_and_submit(struct nilfs_sc_info *sci, struct seg_write_info *wi,
		      struct buffer_head *bh, int mode)
{
	int res;

	if (!wi->bio) {
		wi->bio = alloc_seg_bio(sci->sc_sb, sci->sc_pseg_start + wi->end,
					wi->nr_vecs);
		if (unlikely(!wi->bio))
			return -ENOMEM;
	}
	res = seg_bio_add_bh(wi->bio, bh);
	if (unlikely(res < 0)) {
		clear_seg_bio(sci, wi->bio);
		bio_put(wi->bio);
	} else if (res > 0) { /* bio is FULL (bh was not added to bio) */
		int err = submit_segment(sci, wi, mode);
		if (unlikely(err))
			return err;
	} else /* res == 0 */ {
		wi->end++;
	}
	return res;
}

/*
 * write_segment - submit all blocks of the current segment for writing
 * @sci: segment constructor state
 * @sbi: nilfs_sb_info
 * @has_cp: non-zero if the segment ends with a checkpoint block (only
 *          evaluated when NILFS_CP_BARRIER is defined)
 *
 * Summary blocks are submitted first, followed by the payload blocks.
 * Each buffer is locked after it has been added to a bio.  With
 * NILFS_CP_BARRIER the checkpoint block is sent in a bio of its own,
 * optionally as a barrier; if the device rejects the barrier with
 * -EOPNOTSUPP the BARRIER option is cleared and the block is resubmitted
 * without it.
 *
 * Returns 0 when all bios were submitted.  On failure NILFS_SC_EIO is set,
 * the bios in flight are waited for, the segment is unwound with
 * abort_segment_io() and the error is returned.
 */
static int write_segment(struct nilfs_sc_info *sci, struct nilfs_sb_info *sbi, int has_cp)
{
	struct seg_write_info wi;
	struct buffer_head **pbh, **ep;
	int rw = nilfs_test_opt(sbi, SYNC_BIO) ? WRITE_SYNC : WRITE;
	int res;

	BUG_ON(sci->sc_nbio > 0);

	atomic_set(&sci->sc_bio_blk_cnt, sci->sc_sum.nblocks);
	init_seg_write_info(&wi, sci->sc_sum.nblocks, sbi->s_super->s_bdev);

	seg_debug(2, "submitting summary blocks (nblk_sum=%d)\n",
		  sci->sc_sum.nblk_sum);
	/* The summary buffer array is laid out at descending addresses */
	pbh = &SC_SUM_BH(sci, 0);
	ep = &SC_SUM_BH(sci, sci->sc_sum.nblk_sum);
	while (pbh > ep) {
		res = seg_bh_add_and_submit(sci, &wi, *pbh, rw);
		if (!res) {
			lock_buffer(*pbh);  pbh--;
			continue;
		}
		/* The current bio was submitted */
		if (unlikely(res < 0))
			goto failed_bio;
		/* res > 0: the same buffer is retried with a new bio */
	}

	seg_debug(2, "submitting normal blocks (ba_idx=%d, idx=%d)\n",
		  sci->sc_ba_idx, wi.end);
	pbh = &SC_BH(sci, 0);
	ep = &SC_BH(sci, sci->sc_ba_idx);
#ifdef NILFS_CP_BARRIER
	/* Keep the trailing checkpoint block back; it is sent separately */
	if (has_cp)
		ep--;
#endif
	while (pbh < ep) {
		res = seg_bh_add_and_submit(sci, &wi, *pbh, rw);
		if (!res) {
			lock_buffer(*pbh);  pbh++;
			continue;
		}
		/* The current bio was submitted */
		if (unlikely(res < 0))
			goto failed_bio;
	}
	if (wi.bio) {
		/*
		 * Last BIO is always sent through the following
		 * submission.
		 */
#ifdef NILFS_CP_BARRIER
		if (!has_cp && nilfs_test_opt(sbi, SYNC_SEGMENT))
#else
		if (nilfs_test_opt(sbi, SYNC_SEGMENT))
#endif
			rw |= (1 << BIO_RW_SYNC);
		res = submit_segment(sci, &wi, rw);
		if (unlikely(res))
			goto failed_bio;
	}

#ifdef NILFS_CP_BARRIER
	if (has_cp) {
		seg_debug(2, "submitting a checkpoint block (ba_idx=%d, idx=%d)\n",
			  sci->sc_ba_idx, wi.end);

		if (nilfs_test_opt(sbi, SYNC_SEGMENT))
			rw |= (1 << BIO_RW_SYNC);
		if (nilfs_test_opt(sbi, BARRIER))
			rw |= (1 << BIO_RW_BARRIER);
	retry_cp:
		BUG_ON(wi.bio != NULL);
		/* The checkpoint goes into a single-vector bio of its own;
		   pbh still points at the checkpoint slot here */
		wi.nr_vecs = 1;
		res = seg_bh_add_and_submit(sci, &wi, *pbh, 0);
		if (unlikely(res < 0))
			goto failed_bio;
		else
			BUG_ON(res > 0);

		lock_buffer(*pbh);  /* pbh++; */

		res = submit_segment(sci, &wi, rw);
		if (res == -EOPNOTSUPP && (rw & (1 << BIO_RW_BARRIER))) {
			nilfs_warning(sbi->s_super, __FUNCTION__,
				      "barrier-based sync failed. "
				      "disabling barriers\n");
			nilfs_clear_opt(sbi, BARRIER);
			/* Take the block back and retry without barrier */
			wi.end--;  /* pbh--; */
			unlock_buffer(*pbh);
			rw &= ~(1 << BIO_RW_BARRIER);
			goto retry_cp;
		}
		if (unlikely(res))
			goto failed_bio;
	}
#endif
	seg_debug(1, "submitted a segment\n");
	return 0;

 failed_bio:
	seg_debug(1, "Failed to write. getting back the state of segment\n");
	set_bit(NILFS_SC_EIO, &sci->sc_flags);
	/* Let the bios already in flight finish before unwinding */
	wait_on_segment(sbi, sci);
	abort_segment_io(sci, wi.end);
	return res;
}

/*
 * alloc_partial_segment - determine where the next partial segment goes
 * @sbi: nilfs_sb_info
 * @sci: segment constructor state; sc_pseg_start and sc_residual_blocks
 *       are set up
 * @nilfs: the_nilfs object holding the current write position
 * @seg_start: in/out; first block of the full segment in use
 * @seg_end: in/out; last block of the full segment in use
 *
 * The partial segment is placed at the current offset within the full
 * segment.  If fewer than NILFS_PSEG_MIN_BLOCKS blocks remain there, a new
 * full segment is allocated, the write position in @nilfs is moved to its
 * beginning and @seg_start/@seg_end are updated to its range.
 *
 * Returns 0 on success or the error returned by nilfs_alloc_segment().
 */
static inline int
alloc_partial_segment(struct nilfs_sb_info *sbi, struct nilfs_sc_info *sci,
		      struct the_nilfs *nilfs, dbn_t *seg_start, dbn_t *seg_end)
{
	segnum_t segnum;
	int err;

	sci->sc_pseg_start = *seg_start + nilfs->ns_pseg_offset;
	sci->sc_residual_blocks = (*seg_end + 1) - sci->sc_pseg_start;
	if (sci->sc_residual_blocks >= NILFS_PSEG_MIN_BLOCKS)
		return 0;

	/* Allocating a new full segment when required */
	err = nilfs_alloc_segment(sbi, &segnum, 1);
	if (unlikely(err))
		return err;

	nilfs_get_segment_range(sbi, segnum, seg_start, seg_end);

	nilfs->ns_segnum = segnum;
	nilfs->ns_seg_seq++;
	nilfs->ns_pseg_offset = 0;
	nilfs->ns_prev_pseg = 0;

	sci->sc_pseg_start = *seg_start;
	sci->sc_residual_blocks = *seg_end - *seg_start + 1;
	return 0;
}

/*
 * __construct_segment - build and write partial segments until done
 * @sbi: nilfs_sb_info
 * @sci: segment constructor state
 * @mode: construction mode, passed on to collect_segment_blocks()
 *
 * Each loop iteration waits for the bios of the previous partial segment,
 * picks the location of the next one, collects blocks into it, assigns
 * disk block numbers, fills in the headers and checksums and submits it.
 * The loop ends when the collection stage reaches SC_COLLECT_DONE; the
 * final write is then waited for.
 *
 * Returns 0 on success.  Non-zero results include NILFS_SEG_EMPTY from
 * reconfirm_construction() or follow_up_check() as well as the errors of
 * the helpers called here.
 */
static int 
__construct_segment(struct nilfs_sb_info *sbi, struct nilfs_sc_info *sci, int mode)
{
	struct the_nilfs *nilfs = sbi->s_nilfs;
	dbn_t seg_start, seg_end;
	int err, has_cp;

	/* NOTE(review): 0 is presumably SC_COLLECT_INIT - confirm */
	sci->sc_prev_stage = sci->sc_stage = 0;

	do {
		nilfs_get_segment_range(sbi, nilfs->ns_segnum,
					&seg_start, &seg_end);

		/* Synchronizing previous BIOs */
		err = wait_on_segment(sbi, sci);
		if (unlikely(err)) {
			/* Invalidate only once per retry cycle */
			if (!test_and_set_bit(NILFS_SC_RETRY, &sci->sc_flags))
				nilfs_invalidate_segment(nilfs, seg_start, seg_end);
			goto out;
		}
		
		/* Re-check needs of construction */
		if (sci->sc_stage == SC_COLLECT_INIT) {
			err = reconfirm_construction(sbi, sci);
			if (err)
				goto out;
		}

		/* Determining next (partial) segment */
		err = alloc_partial_segment(sbi, sci, nilfs, &seg_start, &seg_end);
		if (unlikely(err))
			goto out;

		/* Remember the stage this partial segment starts with */
		sci->sc_prev_stage = sci->sc_stage;
		err = ba_init(sci, sbi);
		if (unlikely(err))
			goto out;

		/* A non-zero result that is not an error means that the
		   segment has to be written and collection continued */
		err = collect_segment_blocks(sci, sbi, mode);
		if (NILFS_SEG_ERR(err))
			goto failed_ba;

		has_cp = NILFS_SEG_HAS_CP(&sci->sc_sum);
		if (sci->sc_stage == SC_COLLECT_DONE) {
			err = follow_up_check(sci, has_cp, mode);
			if (err)
				goto failed_ba;
		}

		nilfs_print_seginfo(sci, seg_start, seg_end);

		err = update_dbn(sci, sbi);
		if (unlikely(err))
			goto failed_ba;

		if (mode != SC_LSEG_DSYNC) {
			/* Only in the pass that crosses into the inode
			   block stage */
			if (sci->sc_prev_stage < SC_COLLECT_IBLK &&
			    sci->sc_stage >= SC_COLLECT_IBLK)
				fill_in_file_block_roots(sbi);

			if (has_cp)
				fill_in_cp(sci, sbi);
		}
		fill_in_segsum(sci);

		/*
		 * Checksum calculations
		 */
		if (has_cp)
			fill_in_cp_crc(sci, sbi->s_crc_seed);
		fill_in_segsum_crc(sci, sbi->s_crc_seed);
		fill_in_data_crc(sci, sbi->s_crc_seed);

		/* Write a partial segment */
		err = write_segment(sci, sbi, has_cp);
		if (unlikely(err))
			goto out; /* The buffer array has been cleared */

		/* Forward pointers */
		nilfs->ns_pseg_offset += sci->sc_sum.nblocks;
		nilfs->ns_prev_pseg = sci->sc_sum.nblocks;

	} while (sci->sc_stage != SC_COLLECT_DONE);

	/* Wait for completion of segment write */
	err = wait_on_segment(sbi, sci);
	if (unlikely(err) && !test_and_set_bit(NILFS_SC_RETRY, &sci->sc_flags))
		nilfs_invalidate_segment(nilfs, seg_start, seg_end);
	
	seg_debug(1, "submitted all segments\n");

	/* Clearing sketch data */
	if (!err && has_cp && sci->sc_sketch_inode) {
		/* The sketch data sits behind the first NILFS_CP_BYTES
		   bytes of the checkpoint buffer */
		memset(sci->sc_cp_bh->b_data + NILFS_CP_BYTES, 0,
		       sci->sc_sketch_inode->i_size);
		i_size_write(sci->sc_sketch_inode, 0);
	}
 out:
	return err;

 failed_ba:
	/* Release the blocks collected for the unfinished segment */
	ba_clear(sci, err);
	goto out;
}

/**
 * start_segctor_timer - set timer of background write
 * @sbi: nilfs_sb_info
 *
 * If the timer has already been set, it ignores the new request.
 * This function MUST be called within a section locking the segment semaphore.
 */
static void start_segctor_timer(struct nilfs_sb_info *sbi)
{
	struct nilfs_sc_info *sci = NILFS_SC(sbi);

	if (!sci || nilfs_test_opt(sbi, PASSIVE))
		return;

	spin_lock(&sci->sc_state_lock);
	if (sci->sc_timer && !(sci->sc_state & NILFS_SEGCTOR_COMMIT)) {
		sci->sc_timer->expires = jiffies + sci->sc_interval;
		add_timer(sci->sc_timer);
		sci->sc_state |= NILFS_SEGCTOR_COMMIT;
	}
	spin_unlock(&sci->sc_state_lock);
}

/**
 * __nilfs_segctor_flush - trigger a segment construction for resource control
 * @sci: nilfs_sc_info
 * @flag: request bit(s) to raise in sc_state
 *
 * The request is dropped while NILFS_SEGCTOR_INIT is set, and also when any
 * bit of @flag is already present in sc_state.  Otherwise @flag is recorded
 * and the waiters on sc_wait_request are woken up.
 *
 * Caller of this function must lock sc_state_lock
 */
static void __nilfs_segctor_flush(struct nilfs_sc_info *sci, unsigned long flag)
{
	if ((sci->sc_state & NILFS_SEGCTOR_INIT) || (sci->sc_state & flag))
		return;

	sci->sc_state |= flag;
	wake_up(&sci->sc_wait_request);
}

/**
 * nilfs_flush_segment - post a flush request to the segment constructor
 * @sbi: nilfs_sb_info
 * @flush_mode: NILFS_SC_FLUSH_DATA requests NILFS_SEGCTOR_FLUSH_DATA;
 *              every other value requests NILFS_SEGCTOR_FLUSH_IBT
 *
 * Emits a warning and returns when no segment constructor is attached.
 */
void nilfs_flush_segment(struct nilfs_sb_info *sbi, int flush_mode)
{
	struct nilfs_sc_info *sci = NILFS_SC(sbi);
	unsigned long request;

	if (!sci) {
		nilfs_warning(sbi->s_super, __FUNCTION__,
			      "Tried to flush destructed FS.\n");
		nilfs_dump_stack(NILFS_VERBOSE_SEGMENT, 1);
		return;
	}

	if (flush_mode == NILFS_SC_FLUSH_DATA)
		request = NILFS_SEGCTOR_FLUSH_DATA;
	else
		request = NILFS_SEGCTOR_FLUSH_IBT;

	spin_lock(&sci->sc_state_lock);
	seg_debug(1, "kick segment constructor (flush_mode=%d)\n", flush_mode);
	__nilfs_segctor_flush(sci, request);
	spin_unlock(&sci->sc_state_lock);
}

void nilfs_segctor_add_dirty(struct nilfs_sc_info *sci, unsigned delta)
{
	BUG_ON(!sci);
	spin_lock(&sci->sc_state_lock);
	sci->sc_nr_dirty += delta;
	if (sci->sc_nr_dirty > sci->sc_block_max)
		__nilfs_segctor_flush(sci, NILFS_SEGCTOR_FLUSH_DATA);
	spin_unlock(&sci->sc_state_lock);
}

/*
 * __nilfs_construct_segment - post a construction request and wait for it
 * @sci: nilfs_sc_info
 *
 * Takes the next request sequence number, wakes the waiters on
 * sc_wait_request and sleeps on sc_wait_done until sc_seq_done has caught up
 * with that number.  Afterwards sc_seq_ack is advanced and sc_errno is
 * returned.  Returns -EROFS at once while NILFS_SEGCTOR_INIT is set.
 *
 * The caller holds sc_state_lock; it is released only around schedule().
 */
static int __nilfs_construct_segment(struct nilfs_sc_info *sci)
{
	__u32 my_seq;
	int result;

	if (sci->sc_state & NILFS_SEGCTOR_INIT)
		return -EROFS;

	my_seq = ++sci->sc_seq_request;
	while (nilfs_cnt32_lt(sci->sc_seq_done, my_seq)) {
		DEFINE_WAIT(wait);

		wake_up(&sci->sc_wait_request);
		prepare_to_wait(&sci->sc_wait_done, &wait,
				TASK_INTERRUPTIBLE);

		/*
		 * Test again now that we are queued, so a completion that
		 * slipped in before prepare_to_wait() is not slept through.
		 */
		if (nilfs_cnt32_lt(sci->sc_seq_done, my_seq)) {
			spin_unlock(&sci->sc_state_lock);
			schedule();
			spin_lock(&sci->sc_state_lock);
		}
		finish_wait(&sci->sc_wait_done, &wait);
	}

	result = sci->sc_errno;
	sci->sc_seq_ack++;
	wake_up(&sci->sc_wait_request);
	return result;
}

/**
 * nilfs_construct_segment - construct a logical segment
 * @sb: super block
 *
 * Hands a construction request to the segment constructor daemon
 * (segctord) through __nilfs_construct_segment() and waits until the
 * daemon reports completion.  Must not be called from inside a
 * transaction on @sb; doing so would deadlock and is caught by BUG_ON().
 *
 * Return Value: On success, one of following non-negative values is
 * returned.
 *
 * 0 - a logical segment was constructed and was written out.
 *
 * %NILFS_SEG_EMPTY - the construction was skipped because no dirty block
 * was found.
 *
 * On errors, one of following negative error code is returned.
 *
 * %-EROFS - Read only filesystem.
 *
 * %-EFAULT - Internal error (FS-shutdown is going?)
 *
 * %-EIO - I/O error
 *
 * %-ENOSPC - Disk full.
 *
 * %-ENOMEM - Insufficient memory available.
 */
int nilfs_construct_segment(struct super_block *sb)
{
	struct nilfs_sb_info *sbi = NILFS_SB(sb);
	struct nilfs_sc_info *sci = NILFS_SC(sbi);
	struct nilfs_transaction_info *ti;
	int ret;

	if (!sci || (sb->s_flags & MS_RDONLY)) {
		seg_debug(1, "Skipped construction (read only)\n");
		return -EROFS;
	}

	/* A call inside transactions causes a deadlock. */
	ti = current->journal_info;
	BUG_ON(ti && ti->ti_super == sb);

	spin_lock(&sci->sc_state_lock);
	ret = __nilfs_construct_segment(sci);
	spin_unlock(&sci->sc_state_lock);

	return ret;
}

/**
 * nilfs_construct_fdata_segment - construct a data-only logical segment
 * @sb: super block
 * @inode: the inode whose data blocks should be written out
 */
int nilfs_construct_fdata_segment(struct super_block *sb,
				  struct inode *inode)
{
	struct nilfs_sb_info *sbi = NILFS_SB(sb);
	struct nilfs_sc_info *sci = NILFS_SC(sbi);
	struct nilfs_transaction_info *ti;
	struct nilfs_inode_info *ii;
	int err = 0;

	if ((sb->s_flags & MS_RDONLY) || !sci) {
		seg_debug(1, "Skipped construction (read only)\n");
		return -EROFS;
	}
	BUG_ON((ti = current->journal_info) && ti->ti_super == sb);
	might_sleep();
	down_write(&sbi->s_segctor.sem);

	ii = NILFS_I(inode);
	if (test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) ||
	    nilfs_test_opt(sbi, STRICT_ORDER) ||
	    (ii->i_state & NILFS_STATE_INODE_DIRTY)) {
		up_write(&sbi->s_segctor.sem);
		spin_lock(&sci->sc_state_lock);
		err = __nilfs_construct_segment(sci);
		spin_unlock(&sci->sc_state_lock);
		return err;
	}

	spin_lock(&sci->sc_state_lock);
	sci->sc_sum.ctime = sci->sc_ctime;
	spin_unlock(&sci->sc_state_lock);

	if (!(ii->i_state & NILFS_STATE_DIRTY)) {
		err = NILFS_SEG_EMPTY;
		goto out_lock;
	}
	sci->sc_dirty_file_ptr = ii;
	seg_debug(1, "begin (mode=0x%x)\n", SC_LSEG_DSYNC);
	err = __construct_segment(sbi, sci, SC_LSEG_DSYNC);
	seg_debug(1, "end (stage=%d)\n", sci->sc_stage);

	if (!err) {
		spin_lock(&sci->sc_state_lock);
		sci->sc_ctime = sci->sc_sum.ctime;
		spin_unlock(&sci->sc_state_lock);
	}
 out_lock:
	up_write(&sbi->s_segctor.sem);
	return err;
}

/*
 * nilfs_segctor_construct - run one segment construction and publish the result
 * @sci: nilfs_sc_info; must be non-NULL and own an allocated sc_bh_arr
 * @mode: construction mode handed to __construct_segment()
 *
 * Takes the segment semaphore for writing, cancels the pending timer (if
 * one is registered), and calls __construct_segment() unless nothing is
 * dirty, in which case the result is NILFS_SEG_EMPTY.  Afterwards, under
 * sc_state_lock, the request bits are cleared, sc_seq_done is advanced for
 * SC_FLUSH_CP modes, the timer is re-armed when the logical segment is still
 * unclosed, and the result is stored in sc_errno.
 */
static void nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
{
	struct nilfs_sb_info *sbi;
	__u32 seq_accepted;
	int err, sync_req, cp_written = 0;
	struct inode *iprev = NULL;

	/*
	 * Validate @sci before anything dereferences it; checking after the
	 * sc_sb lookup would make the NULL test unreachable.
	 */
	BUG_ON(!sci || !sci->sc_bh_arr);
	sbi = NILFS_SB(sci->sc_sb);

	/*
	 * Writing lock the segment semaphore.
	 * A writer can monopolize s_segctor.dirty_files without spinlock
	 */
	might_sleep();
	down_write(&sbi->s_segctor.sem);

	if (sci->sc_timer)
		del_timer_sync(sci->sc_timer);

	/* Snapshot the request state this run is going to satisfy */
	spin_lock(&sci->sc_state_lock);
	seq_accepted = sci->sc_seq_request;
	sync_req = sci->sc_sync;
	sci->sc_sum.ctime = sci->sc_ctime;
	spin_unlock(&sci->sc_state_lock);

	if (list_empty(&sbi->s_segctor.dirty_files) &&
	    !test_bit(NILFS_SC_DIRTY, &sci->sc_flags)) {
		/*
		 * Segment is constructed synchronously.
		 * So, all bios are completed here.
		 *
		 * We can safely move the dirty flag from B-tree.
		 */
		if (!nilfs_btree_test_and_clear_dirty(&sbi->s_inode_root) &&
		    !test_bit(NILFS_SB_IBLK_DIRTY, &sbi->s_flags)) {
			err = NILFS_SEG_EMPTY;
			seg_debug(1, "Skipped construction (no changes)\n");
			goto escape;
		}
		clear_bit(NILFS_SB_IBLK_DIRTY, &sbi->s_flags);
		set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
	}

	seg_debug(1, "begin (mode=0x%x)\n", mode);
	err = __construct_segment(sbi, sci, mode);
	seg_debug(1, "end (stage=%d)\n", sci->sc_stage);

 escape:
	spin_lock(&sci->sc_state_lock);
	if (!err) {
		cp_written = NILFS_SEG_HAS_CP(&sci->sc_sum);
		sci->sc_nr_dirty = 0;
		sci->sc_ctime = sci->sc_sum.ctime;
	}
	/* Clear requests (even when the construction failed) */
	if (mode == SC_FLUSH_DATA)
		sci->sc_state &= ~(NILFS_SEGCTOR_COMMIT | NILFS_SEGCTOR_FLUSH_DATA);
	else
		sci->sc_state &= ~(NILFS_SEGCTOR_COMMIT | NILFS_SEGCTOR_FLUSH);

	if ((mode & SC_FLUSH_MASK) == SC_FLUSH_CP) {
		/* Waiters in __nilfs_construct_segment() compare against this */
		sci->sc_seq_done = seq_accepted;
		/*
		 * NOTE(review): only a positive err is recorded here (negated);
		 * a negative error code resets sc_sync to 0.  Confirm that this
		 * is the intended reporting for failed constructions.
		 */
		if (sync_req > 0)
			sci->sc_sync = (err > 0) ? -err : 0;
	}

	/*
	 * Unclosed segment should be retried.  We do this using sc_timer.
	 * Timeout of sc_timer will invoke complete construction which lead to
	 * close the current logical segment.
	 *
	 * A retry count should be introduced to avoid endless retries.
	 */
	if (!nilfs_test_opt(sbi, PASSIVE) &&
	    test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) && sci->sc_timer) {
		/*
		 * Call of nilfs_segctor_commit() is excluded by the segment semaphore.
		 * So we can safely add timer here.
		 */
		sci->sc_timer->expires = jiffies + sci->sc_interval;
		add_timer(sci->sc_timer);
		sci->sc_state |= NILFS_SEGCTOR_COMMIT;
	}
	sci->sc_errno = err;
	spin_unlock(&sci->sc_state_lock);

	/* Detach the sketch inode once a checkpoint has been written */
	if (cp_written && (iprev = sci->sc_sketch_inode))
		sci->sc_sketch_inode = NULL;

	up_write(&sbi->s_segctor.sem);

	/* Work that may sleep is done after the semaphore is released */
	if (!NILFS_SEG_ERR(err))
		dispose_garbage_list(sbi);
	if (iprev)
		iput(iprev);
}

/*
 * construction_timeout - timer callback of the segment constructor
 * @data: task_struct pointer of the thread to wake, cast to unsigned long
 */
static void construction_timeout(unsigned long data)
{
	wake_up_process((struct task_struct *)data);
}

/**
 * segctor_thread - main loop of the segment constructor thread.
 * @arg: pointer to a struct nilfs_sc_info.
 *
 * segctor_thread() initializes a timer and serves as a daemon
 * to execute segment constructions.
 *
 * The timer lives on this thread's stack and is published through
 * sci->sc_timer.  The thread announces itself by setting sci->sc_task and
 * waking sc_wait_done, then loops until NILFS_SEGCTOR_QUIT is set.  Each
 * pass picks a construction mode from the sequence counters, the timeout
 * indication and the flush bits, runs nilfs_segctor_construct(), and sleeps
 * on sc_wait_request when there is nothing to do.
 *
 * Always returns 0.
 */
static int segctor_thread(void *arg)
{
	struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg;
	struct timer_list timer;
	int timeout = 0;

	daemonize("segctord");

	init_timer(&timer);
	timer.data = (unsigned long)current;
	timer.function = construction_timeout;
	sci->sc_timer = &timer;

	/* start sync. */
	sci->sc_task = current;
	wake_up(&sci->sc_wait_done);
	if (test_bit(NILFS_SC_PASSIVE, &sci->sc_flags))
		printk(KERN_INFO "segctord starting in passive mode.\n");
	else
		printk(KERN_INFO
		       "segctord starting. Construction interval = %lu seconds, "
		       "CP frequency < %lu seconds\n",
		       sci->sc_interval / HZ, sci->sc_mjcp_freq / HZ);

	spin_lock(&sci->sc_state_lock);
 loop:
	for(;;) {
		int mode;

		if (sci->sc_state & NILFS_SEGCTOR_QUIT)
			goto end_thread;

		seg_debug(2, "sequence: req=%u, done=%u, ack=%u, state=%lx\n",
			  sci->sc_seq_request, sci->sc_seq_done,
			  sci->sc_seq_ack, sci->sc_state);

		/* The previous result has not been acknowledged yet */
		if (sci->sc_seq_ack != sci->sc_seq_done)
			break;

		if (timeout || sci->sc_seq_request != sci->sc_seq_done) {
			mode = SC_LSEG_MJCP;
		} else if (sci->sc_state & NILFS_SEGCTOR_FLUSH) {
			if (test_bit(NILFS_SC_PASSIVE, &sci->sc_flags) ||
			    !test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) ||
			    time_before(jiffies, sci->sc_lseg_stime + sci->sc_mjcp_freq))
				mode = (sci->sc_state & NILFS_SEGCTOR_FLUSH_IBT) ?
					SC_FLUSH_IBT : SC_FLUSH_DATA;
			else
				mode = SC_LSEG_MJCP;
		} else
			break;

		/* The construction sleeps, so the spinlock must be dropped */
		spin_unlock(&sci->sc_state_lock);
		nilfs_segctor_construct(sci, mode);
		spin_lock(&sci->sc_state_lock);
		timeout = 0;
	}

	wake_up(&sci->sc_wait_done);
#if NEED_REFRIGERATOR_ARGS
	if (current->flags & PF_FREEZE) {
#else
	if (freezing(current)) {
#endif
		seg_debug(2, "suspending segctord\n");
		spin_unlock(&sci->sc_state_lock);
#if NEED_REFRIGERATOR_ARGS
		refrigerator(PF_FREEZE);
#else
		refrigerator();
#endif
		spin_lock(&sci->sc_state_lock);
	} else {
		DEFINE_WAIT(wait);
		int should_sleep = 1;

		prepare_to_wait(&sci->sc_wait_request, &wait,
				TASK_INTERRUPTIBLE);

		/* Re-check for work after queueing to avoid a lost wake-up */
		if (sci->sc_seq_ack == sci->sc_seq_done) {
			if (sci->sc_seq_request != sci->sc_seq_done)
				should_sleep = 0;
			else if (sci->sc_state & NILFS_SEGCTOR_FLUSH)
				should_sleep = 0;
			else if (sci->sc_state & NILFS_SEGCTOR_COMMIT)
				should_sleep =
					time_before(jiffies,
						    sci->sc_timer->expires);
		}
		if (should_sleep) {
			spin_unlock(&sci->sc_state_lock);
			schedule();
			spin_lock(&sci->sc_state_lock);
		}
		finish_wait(&sci->sc_wait_request, &wait);
		timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
			   time_after_eq(jiffies, sci->sc_timer->expires));
	}
	seg_debug(2, "woke %s\n", timeout ? "(timeout)" : "");
	goto loop;

 end_thread:
	/*
	 * Unpublish the on-stack timer while sc_state_lock is still held.
	 * start_segctor_timer() tests and arms sc_timer under this lock, so
	 * once the pointer is cleared nobody can re-arm the timer after
	 * del_timer_sync() and leave it pending on a stack that is going away.
	 */
	sci->sc_timer = NULL;
	spin_unlock(&sci->sc_state_lock);
	del_timer_sync(&timer);

	/* end sync. */
	sci->sc_task = NULL;
	wake_up(&sci->sc_wait_done);
	seg_debug(1, "segctord exiting.\n");
	return 0;
}

/*
 * nilfs_start_segctor_thread - spawn segctord and wait until it has started
 * @sci: nilfs_sc_info
 *
 * Returns 0 once the new thread has published itself in sci->sc_task, or the
 * negative error code of kernel_thread() when the thread could not be
 * created.
 */
static int nilfs_start_segctor_thread(struct nilfs_sc_info *sci)
{
	int pid;

	pid = kernel_thread(segctor_thread, sci,
			    CLONE_VM | CLONE_FS | CLONE_FILES);
	if (pid < 0)
		/* Nobody will ever set sc_task; waiting would hang forever */
		return pid;

	wait_event(sci->sc_wait_done, sci->sc_task != NULL);
	return 0;
}

/*
 * nilfs_kill_segctor_thread - ask segctord to quit and wait for its exit
 * @sci: nilfs_sc_info
 *
 * Sets NILFS_SEGCTOR_QUIT and waits until the thread has cleared
 * sci->sc_task.  The caller holds sc_state_lock; it is dropped while
 * waiting and re-taken before returning.
 */
static void nilfs_kill_segctor_thread(struct nilfs_sc_info *sci)
{
	sci->sc_state |= NILFS_SEGCTOR_QUIT;

	while (sci->sc_task) {
		wake_up(&sci->sc_wait_request);
		spin_unlock(&sci->sc_state_lock);
		wait_event(sci->sc_wait_done, sci->sc_task == NULL);
		spin_lock(&sci->sc_state_lock);
	}
}

/**
 * nilfs_segctor_reset - reset the segment constructor thread.
 * @sci: nilfs_sc_info
 *
 * nilfs_segctor_reset() resets sequence counters used for handshaking
 * with a segment constructor thread (segctord), and starts segctord.
 *
 * Return Value: On success, 0 is returned.  If segctord could not be
 * created, the negative error code of kernel_thread() is returned and
 * NILFS_SEGCTOR_INIT is set again, so the constructor is treated as not
 * running.
 */
int nilfs_segctor_reset(struct nilfs_sc_info *sci)
{
	int err;

	sci->sc_state &= ~NILFS_SEGCTOR_INIT;
	sci->sc_seq_done = sci->sc_seq_request;
	sci->sc_seq_ack = sci->sc_seq_done;
	sci->sc_nr_dirty = 0;

	err = nilfs_start_segctor_thread(sci);
	if (unlikely(err))
		/*
		 * Without segctord no request would ever complete.  Restore
		 * the INIT state so that requests are refused instead of
		 * waiting forever and the destroy path skips the thread
		 * shutdown.
		 */
		sci->sc_state |= NILFS_SEGCTOR_INIT;
	return err;
}

/*
 * Sysfs interface
 */
static ssize_t nilfs_show_sc_interval(struct the_nilfs *nilfs, char *page)
{
	struct nilfs_sb_info *sbi = nilfs_get_writer(nilfs);
	struct nilfs_sc_info *sci;
	unsigned long val;
	ssize_t ret = 0;

	if (sbi && (sci = NILFS_SC(sbi))) {
		spin_lock(&sci->sc_state_lock);
		val = sci->sc_interval / HZ;
		spin_unlock(&sci->sc_state_lock);
		ret = sprintf(page, "%lu\n", val);
	}
	nilfs_put_writer(nilfs);
	return ret;
}

/*
 * Store the construction interval, given in seconds.
 *
 * A value of 0 selects NILFS_SC_DEFAULT_TIMEOUT.  Returns @length on
 * success and -EINVAL when no segment constructor is available, when the
 * input does not parse, or when the value would not fit into a timer
 * offset after the conversion to jiffies.
 */
static ssize_t
nilfs_store_sc_interval(struct the_nilfs *nilfs, const char *page, size_t length)
{
	struct nilfs_sb_info *sbi = nilfs_get_writer(nilfs);
	struct nilfs_sc_info *sci;
	unsigned long val;
	ssize_t ret = -EINVAL;

	/*
	 * val * HZ must neither wrap around nor exceed what jiffies
	 * arithmetic (jiffies + sc_interval, time_before()) can represent.
	 */
	if (sbi && (sci = NILFS_SC(sbi)) && sscanf(page, "%lu", &val) == 1 &&
	    val <= MAX_JIFFY_OFFSET / HZ) {
		if (!val)
			val = NILFS_SC_DEFAULT_TIMEOUT;
		spin_lock(&sci->sc_state_lock);
		sci->sc_interval = val * HZ;
		spin_unlock(&sci->sc_state_lock);
		ret = length;
	}
	nilfs_put_writer(nilfs);
	return ret;
}

static ssize_t nilfs_show_sc_threshold(struct the_nilfs *nilfs, char *page)
{
	struct nilfs_sb_info *sbi = nilfs_get_writer(nilfs);
	struct nilfs_sc_info *sci;
	unsigned long val;
	ssize_t ret = 0;

	if (sbi && (sci = NILFS_SC(sbi))) {
		spin_lock(&sci->sc_state_lock);
		val = sci->sc_block_max;
		spin_unlock(&sci->sc_state_lock);
		ret = sprintf(page, "%lu\n", val);
	}
	nilfs_put_writer(nilfs);
	return ret;
}

/*
 * Store sc_block_max.  A value of 0 selects
 * NILFS_SC_DEFAULT_THRESHOLD_DATA_AMOUNT.  If the current dirty count
 * already exceeds the new limit, a data flush is requested right away.
 * Returns @length on success, -EINVAL otherwise.
 */
static ssize_t
nilfs_store_sc_threshold(struct the_nilfs *nilfs, const char *page, size_t length)
{
	struct nilfs_sb_info *sbi = nilfs_get_writer(nilfs);
	struct nilfs_sc_info *sci = sbi ? NILFS_SC(sbi) : NULL;
	unsigned long limit;
	ssize_t ret = -EINVAL;

	if (!sci || sscanf(page, "%lu", &limit) != 1)
		goto out;

	if (limit == 0)
		limit = NILFS_SC_DEFAULT_THRESHOLD_DATA_AMOUNT;

	spin_lock(&sci->sc_state_lock);
	sci->sc_block_max = limit;
	if (sci->sc_nr_dirty > sci->sc_block_max)
		__nilfs_segctor_flush(sci, NILFS_SEGCTOR_FLUSH_DATA);
	spin_unlock(&sci->sc_state_lock);
	ret = length;
 out:
	nilfs_put_writer(nilfs);
	return ret;
}

static ssize_t nilfs_show_sc_max_bio(struct the_nilfs *nilfs, char *page)
{
	struct nilfs_sb_info *sbi = nilfs_get_writer(nilfs);
	struct nilfs_sc_info *sci;
	ssize_t ret = 0;

	if (sbi && (sci = NILFS_SC(sbi)))
		ret = sprintf(page, "%u\n", atomic_read(&sci->sc_max_bio));
	nilfs_put_writer(nilfs);
	return ret;
}

/*
 * Store sc_max_bio.  A value of 0 selects NILFS_SC_DEFAULT_MAX_BIO.
 * Returns @length on success, -EINVAL otherwise.
 */
static ssize_t
nilfs_store_sc_max_bio(struct the_nilfs *nilfs, const char *page, size_t length)
{
	struct nilfs_sb_info *sbi = nilfs_get_writer(nilfs);
	struct nilfs_sc_info *sci = sbi ? NILFS_SC(sbi) : NULL;
	unsigned int max_bio;
	ssize_t ret = -EINVAL;

	if (!sci || sscanf(page, "%u", &max_bio) != 1)
		goto out;

	if (max_bio == 0)
		max_bio = NILFS_SC_DEFAULT_MAX_BIO;
	atomic_set(&sci->sc_max_bio, max_bio);
	ret = length;
 out:
	nilfs_put_writer(nilfs);
	return ret;
}

static ssize_t nilfs_show_sc_mjcp_freq(struct the_nilfs *nilfs, char *page)
{
	struct nilfs_sb_info *sbi = nilfs_get_writer(nilfs);
	struct nilfs_sc_info *sci;
	unsigned long val;
	ssize_t ret = 0;

	if (sbi && (sci = NILFS_SC(sbi))) {
		spin_lock(&sci->sc_state_lock);
		val = sci->sc_mjcp_freq / HZ;
		spin_unlock(&sci->sc_state_lock);
		ret = sprintf(page, "%lu\n", val);
	}
	nilfs_put_writer(nilfs);
	return ret;
}

/*
 * Store sc_mjcp_freq, given in seconds.
 *
 * A value of 0 selects NILFS_SC_DEFAULT_MJCP_FREQ.  Returns @length on
 * success and -EINVAL when no segment constructor is available, when the
 * input does not parse, or when the value would not fit into a jiffies
 * offset after the conversion.
 */
static ssize_t
nilfs_store_sc_mjcp_freq(struct the_nilfs *nilfs, const char *page, size_t length)
{
	struct nilfs_sb_info *sbi = nilfs_get_writer(nilfs);
	struct nilfs_sc_info *sci;
	unsigned long val;
	ssize_t ret = -EINVAL;

	/*
	 * val * HZ must neither wrap around nor exceed what the
	 * time_before(jiffies, sc_lseg_stime + sc_mjcp_freq) test can handle.
	 */
	if (sbi && (sci = NILFS_SC(sbi)) && sscanf(page, "%lu", &val) == 1 &&
	    val <= MAX_JIFFY_OFFSET / HZ) {
		if (!val)
			val = NILFS_SC_DEFAULT_MJCP_FREQ;
		spin_lock(&sci->sc_state_lock);
		sci->sc_mjcp_freq = val * HZ;
		spin_unlock(&sci->sc_state_lock);
		ret = length;
	}
	nilfs_put_writer(nilfs);
	return ret;
}

static ssize_t nilfs_show_sc_ctime(struct the_nilfs *nilfs, char *page)
{
	struct nilfs_sb_info *sbi = nilfs_get_writer(nilfs);
	struct nilfs_sc_info *sci;
	long val;
	ssize_t ret = 0;

	if (sbi && (sci = NILFS_SC(sbi))) {
		spin_lock(&sci->sc_state_lock);
		val = sci->sc_ctime;
		spin_unlock(&sci->sc_state_lock);
		ret = sprintf(page, "%ld\n", val);
	}
	nilfs_put_writer(nilfs);
	return ret;
}

/*
 * Store sc_ctime.  The value is applied only when NILFS_SC_PASSIVE is set;
 * otherwise a parsable input is accepted but leaves sc_ctime untouched.
 * Returns @length when the input parses, -EINVAL otherwise.
 */
static ssize_t
nilfs_store_sc_ctime(struct the_nilfs *nilfs, const char *page, size_t length)
{
	struct nilfs_sb_info *sbi = nilfs_get_writer(nilfs);
	struct nilfs_sc_info *sci = sbi ? NILFS_SC(sbi) : NULL;
	long ctime;
	ssize_t ret = -EINVAL;

	if (!sci || sscanf(page, "%ld", &ctime) != 1)
		goto out;

	if (test_bit(NILFS_SC_PASSIVE, &sci->sc_flags)) {
		spin_lock(&sci->sc_state_lock);
		sci->sc_ctime = (time_t)ctime;
		spin_unlock(&sci->sc_state_lock);
	}
	ret = length;
 out:
	nilfs_put_writer(nilfs);
	return ret;
}

static ssize_t nilfs_show_sync(struct the_nilfs *nilfs, char *page)
{
	struct nilfs_sb_info *sbi = nilfs_get_writer(nilfs);
	struct nilfs_sc_info *sci;
	int val;
	ssize_t ret = 0;

	if (sbi && (sci = NILFS_SC(sbi))) {
		spin_lock(&sci->sc_state_lock);
		val = sci->sc_sync;
		spin_unlock(&sci->sc_state_lock);
		ret = sprintf(page, "%d\n", val);
	}
	nilfs_put_writer(nilfs);
	return ret;
}

/*
 * Handle a write to the sync attribute.
 *
 * NILFS_SYNC_MJCP records the value in sc_sync and runs a construction
 * request to completion; an error from that request (NILFS_SEG_ERR) is
 * returned to the writer.  0 clears sc_sync.  Any other parsable value is
 * accepted and ignored.  Returns @length unless stated otherwise, and
 * -EINVAL when the input does not parse or no segment constructor exists.
 */
static ssize_t
nilfs_store_sync(struct the_nilfs *nilfs, const char *page, size_t length)
{
	struct nilfs_sb_info *sbi = nilfs_get_writer(nilfs);
	struct nilfs_sc_info *sci = sbi ? NILFS_SC(sbi) : NULL;
	int request, err;
	ssize_t ret = -EINVAL;

	if (!sci || sscanf(page, "%d", &request) != 1)
		goto out;

	ret = length;
	if (request == NILFS_SYNC_MJCP) {
		spin_lock(&sci->sc_state_lock);
		sci->sc_sync = request;
		err = __nilfs_construct_segment(sci);
		spin_unlock(&sci->sc_state_lock);
		if (NILFS_SEG_ERR(err))
			ret = err;
	} else if (request == 0) {
		spin_lock(&sci->sc_state_lock);
		sci->sc_sync = 0;
		spin_unlock(&sci->sc_state_lock);
	}
 out:
	nilfs_put_writer(nilfs);
	return ret;
}

/*
 * Attribute definitions for the show/store handlers above.
 * NOTE(review): define_nilfs_rw_attr() is defined outside this file;
 * presumably it binds nilfs_show_<name>/nilfs_store_<name> to an attribute
 * called <name> with the given permission bits -- confirm against the macro.
 */
define_nilfs_rw_attr(sc_interval, 0644)
define_nilfs_rw_attr(sc_threshold, 0644)
define_nilfs_rw_attr(sc_max_bio, 0644)
define_nilfs_rw_attr(sc_mjcp_freq, 0644)
define_nilfs_rw_attr(sc_ctime, 0644)
define_nilfs_rw_attr(sync, 0644)

/*
 * Setup & clean-up functions
 */
/*
 * nilfs_segctor_init - allocate and set up a struct nilfs_sc_info
 * @sbi: nilfs_sb_info
 * @max_blocks: number of slots in the buffer head array (sc_bh_arr)
 *
 * The structure, the buffer head array and the checkpoint buffer
 * (sc_cp_bh) are all zero-filled.  The state starts as NILFS_SEGCTOR_INIT.
 * Interval and threshold come from sbi->s_segctor when given there,
 * otherwise from the built-in defaults.
 *
 * Returns the new structure, or NULL if any allocation fails.
 */
static struct nilfs_sc_info *
nilfs_segctor_init(struct nilfs_sb_info *sbi, unsigned int max_blocks)
{
	struct nilfs_sc_info *sci;
	unsigned int arr_bytes;

	sci = kmalloc(sizeof(*sci), GFP_KERNEL);
	if (!sci)
		goto fail;
	memset(sci, 0, sizeof(*sci));

	arr_bytes = sizeof(struct buffer_head *) * max_blocks;
	sci->sc_bh_arr = kmalloc(arr_bytes, GFP_KERNEL);
	if (!sci->sc_bh_arr)
		goto fail_free_sci;
	memset(sci->sc_bh_arr, 0, arr_bytes);

	sci->sc_cp_bh = nilfs_getblk(sbi->s_super, 0);
	if (!sci->sc_cp_bh)
		goto fail_free_arr;
	memset(sci->sc_cp_bh->b_data, 0, sci->sc_cp_bh->b_size);

	sci->sc_ba_max = max_blocks;
	sci->sc_sb = sbi->s_super;
	init_completion(&sci->sc_bio_event);

	init_waitqueue_head(&sci->sc_wait_done);
	init_waitqueue_head(&sci->sc_wait_request);
	spin_lock_init(&sci->sc_state_lock);

	/* Values given in sbi->s_segctor take precedence over the defaults */
	if (sbi->s_segctor.interval)
		sci->sc_interval = sbi->s_segctor.interval;
	else
		sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;

	if (sbi->s_segctor.block_max)
		sci->sc_block_max = sbi->s_segctor.block_max;
	else
		sci->sc_block_max = NILFS_SC_DEFAULT_THRESHOLD_DATA_AMOUNT;

	sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_MJCP_FREQ;
	sci->sc_state = NILFS_SEGCTOR_INIT;
	atomic_set(&sci->sc_max_bio, NILFS_SC_DEFAULT_MAX_BIO);

	if (nilfs_test_opt(sbi, PASSIVE))
		set_bit(NILFS_SC_PASSIVE, &sci->sc_flags);
	return sci;

 fail_free_arr:
	kfree(sci->sc_bh_arr);
 fail_free_sci:
	kfree(sci);
 fail:
	return NULL;
}

/**
 * nilfs_segctor_destroy - destroy the segment constructor.
 * @sbi: nilfs_sb_info
 * @sci: nilfs_sc_info
 *
 * nilfs_segctor_destroy() kills the segctord thread and frees
 * the nilfs_sc_info struct.  If requests are still pending after the
 * thread has stopped, one last SC_LSEG_MJCP construction is run.
 *
 * Caller must hold the segment semaphore.  Unless the constructor is still
 * in the NILFS_SEGCTOR_INIT state, the semaphore is released while the
 * thread is shut down and taken again before the structures are freed.
 */
static void nilfs_segctor_destroy(struct nilfs_sb_info *sbi, struct nilfs_sc_info *sci)
{
	struct buffer_head **slot, **end;
	struct inode *sketch;
	int not_started, pending;

	spin_lock(&sci->sc_state_lock);
	not_started = (sci->sc_state & NILFS_SEGCTOR_INIT);
	spin_unlock(&sci->sc_state_lock);

	if (!not_started) {
		/* segctord is running: detach the sketch and stop the thread */
		sketch = sci->sc_sketch_inode;
		sci->sc_sketch_inode = NULL;
		up_write(&sbi->s_segctor.sem);
		if (sketch) {
			seg_debug(1, "The sketch was updated just before the last "
				  "segment construction. (abandoned)\n");
			iput(sketch);
		}

		spin_lock(&sci->sc_state_lock);
		nilfs_kill_segctor_thread(sci);
		pending = ((sci->sc_state &
			    (NILFS_SEGCTOR_COMMIT | NILFS_SEGCTOR_FLUSH)) ||
			   sci->sc_seq_request != sci->sc_seq_done);
		spin_unlock(&sci->sc_state_lock);

		/*
		 * The segctord thread was stopped and its timer was removed.
		 * But some tasks remain.
		 */
		if (pending)
			nilfs_segctor_construct(sci, SC_LSEG_MJCP);

		down_write(&sbi->s_segctor.sem);
	}

	/* Release every buffer still referenced, then the structures */
	nilfs_brelse(sci->sc_cp_bh);
	if (sci->sc_bh_arr != NULL) {
		end = sci->sc_bh_arr + sci->sc_ba_max;
		for (slot = sci->sc_bh_arr; slot < end; slot++)
			nilfs_brelse(*slot);
		kfree(sci->sc_bh_arr);
	}
	kfree(sci);
}

/**
 * nilfs_init_segment - initialize a segment constructor
 * @sbi: nilfs_sb_info
 * @max_blocks: maximum block count of a full segment.
 *
 * nilfs_init_segment() allocates a struct nilfs_sc_info and initilizes it.
 *
 * Return Value: On success, 0 is returned. On error, a following negative
 * error code is returned.
 *
 * %-ENOMEM - Insufficient memory available.
 */
int nilfs_init_segment(struct nilfs_sb_info *sbi, unsigned int max_blocks)
{
	/* Each field of nilfs_segctor is zero-cleared in the initialization of
	   super block info */
	if (!(sbi->s_super->s_flags & MS_RDONLY)) {
		sbi->s_segctor.info = nilfs_segctor_init(sbi, max_blocks);
		if (!sbi->s_segctor.info)
			return -ENOMEM;
	}
	init_rwsem(&sbi->s_segctor.sem);
	spin_lock_init(&sbi->s_segctor.dirty_files_lock);
	INIT_LIST_HEAD(&sbi->s_segctor.dirty_files);
	INIT_LIST_HEAD(&sbi->s_segctor.garbage_list);

	return 0;
}

/**
 * nilfs_clear_segment - destroy a segment constructor
 * @sbi: nilfs_sb_info
 *
 * nilfs_clear_segment() kills the segment constructor daemon,
 * frees the struct nilfs_sc_info, and destroys the dirty file list.
 * Files still on the dirty list afterwards are moved to the garbage list
 * (with a warning) and disposed of.
 */
void nilfs_clear_segment(struct nilfs_sb_info *sbi)
{
	struct nilfs_sc_info *sci;

	/*
	 * The destruction below is not guarded against concurrent users.
	 * Should we use an exclusion control?
	 */
	down_write(&sbi->s_segctor.sem);

	spin_lock_irq(&sbi->s_segctor.dirty_files_lock);
	if (!list_empty(&sbi->s_segctor.dirty_files))
		nilfs_warning(sbi->s_super, __FUNCTION__,
			      "Non empty dirty list before the last "
			      "segment construction\n");
	spin_unlock_irq(&sbi->s_segctor.dirty_files_lock);

	sci = NILFS_SC(sbi);
	if (sci) {
		nilfs_segctor_destroy(sbi, sci);
		sbi->s_segctor.info = NULL;
	}

	/* Force to free the list of dirty files */
	spin_lock_irq(&sbi->s_segctor.dirty_files_lock);
	if (!list_empty(&sbi->s_segctor.dirty_files)) {
		list_splice(&sbi->s_segctor.dirty_files,
			    &sbi->s_segctor.garbage_list);
		nilfs_warning(sbi->s_super, __FUNCTION__,
			      "Non empty dirty list after the last "
			      "segment construction\n");
	}
	spin_unlock_irq(&sbi->s_segctor.dirty_files_lock);

	up_write(&sbi->s_segctor.sem);

	dispose_garbage_list(sbi);
}


/* Local Variables:	*/
/* eval: (c-set-style "linux")	*/
/* End:			*/
