/*
 * nasd_cblock.c
 *
 * Individual block management for NASD on-disk cache.
 *
 * Author: Jim Zelenka
 */
/*
 * Copyright (c) of Carnegie Mellon University, 1997,1998,1999.
 *
 * Permission to reproduce, use, and prepare derivative works of
 * this software for internal use is granted provided the copyright
 * and "No Warranty" statements are included with all reproductions
 * and derivative works. This software may also be redistributed
 * without charge provided that the copyright and "No Warranty"
 * statements are included in all redistributions.
 *
 * NO WARRANTY. THIS SOFTWARE IS FURNISHED ON AN "AS IS" BASIS.
 * CARNEGIE MELLON UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER
 * EXPRESSED OR IMPLIED AS TO THE MATTER INCLUDING, BUT NOT LIMITED
 * TO: WARRANTY OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY
 * OF RESULTS OR RESULTS OBTAINED FROM USE OF THIS SOFTWARE. CARNEGIE
 * MELLON UNIVERSITY DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT
 * TO FREEDOM FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
 */


#include <nasd/nasd_options.h>
#include <nasd/nasd_drive_options.h>
#include <nasd/nasd_types.h>
#include <nasd/nasd_freelist.h>
#include <nasd/nasd_itypes.h>
#include <nasd/nasd_mem.h>
#include <nasd/nasd_cache.h>
#include <nasd/nasd_common.h>
#include <nasd/nasd_timer.h>

/*
 * Locking info:
 *
 * - refcnt on block protected by LRU lock, not block lock
 * - do not lock LRU, ent simultaneously
 * - LRU lock also protects hash buckets
 * - perform no blocking operations holding LRU lock
 */

/* Bucket counts for the two hash tables below; defined elsewhere. */
extern int nasd_odc_buckets;
extern int nasd_odc_logical_index_buckets;

/*
 * Hash bucket arrays, both allocated in nasd_odc_blocksys_init().
 * nasd_odc_bucket_ents is indexed with NASD_ODC_BUCKET() applied to a
 * block's real sector number; nasd_odc_logical_bucket_ents is indexed
 * with NASD_LOGICAL_INDEX_HASH() applied to (identifier, offset).
 */
nasd_odc_oq_t *nasd_odc_bucket_ents;
nasd_odc_oq_t *nasd_odc_logical_bucket_ents;

/* Map a real sector number to an index into nasd_odc_bucket_ents. */
#define NASD_ODC_BUCKET(_sectno_) (((_sectno_)>>4)%nasd_odc_buckets)

/*
 * Hash an (identifier, byte offset) pair to an index into
 * nasd_odc_logical_bucket_ents: the low 24 bits of the identifier
 * are added to the offset shifted right by NASD_OD_BLOCK_SHIFT, and
 * the absolute value of the sum is reduced modulo the bucket count.
 *
 * Both arguments are now taken from the macro parameters; the macro
 * used to read a variable literally named "nid" from the caller's
 * scope and ignore its first argument.
 */
#define NASD_LOGICAL_INDEX_HASH(_nid_,_offset_) \
  (NASD_ABS(((_nid_)&0xffffffUL) + ((_offset_)>>NASD_OD_BLOCK_SHIFT))%nasd_odc_logical_index_buckets)

/*
 * Link _ent_ into logical-index bucket number _ih_ and mark it
 * NASD_CL_LINDEX. Data blocks (NASD_ODC_T_DATA) are linked just
 * before the bucket head (the tail of the list); every other type is
 * linked just after the head (the front of the list). The entry must
 * not already carry NASD_CL_LINDEX.
 *
 * The bucket index is now taken from the _ih_ parameter; the macro
 * used to read a variable literally named "ih" from the caller's
 * scope and ignore its second argument.
 */
#define NASD_ODC_LOGICAL_INDEX_INS(_ent_,_ih_) { \
  NASD_ASSERT(!((_ent_)->lru_flags&NASD_CL_LINDEX)); \
  if ((_ent_)->type == NASD_ODC_T_DATA) { \
    (_ent_)->lhprev = nasd_odc_logical_bucket_ents[(_ih_)].head.lhprev; \
    (_ent_)->lhnext = &nasd_odc_logical_bucket_ents[(_ih_)].head; \
  } \
  else { \
    (_ent_)->lhnext = nasd_odc_logical_bucket_ents[(_ih_)].head.lhnext; \
    (_ent_)->lhprev = &nasd_odc_logical_bucket_ents[(_ih_)].head; \
  } \
  (_ent_)->lhprev->lhnext = (_ent_); \
  (_ent_)->lhnext->lhprev = (_ent_); \
  (_ent_)->lru_flags |= NASD_CL_LINDEX; \
}

/*
 * Unlink _ent_ from its logical-index bucket (lh links) and clear
 * NASD_CL_LINDEX. The entry must currently carry NASD_CL_LINDEX.
 *
 * The dequeue now operates on the _ent_ parameter; it used to operate
 * on a variable literally named "ent" in the caller's scope, which is
 * a different (possibly NULL) entry when the macro is invoked on any
 * other pointer.
 */
#define NASD_ODC_LOGICAL_INDEX_DEL(_ent_) { \
  NASD_ASSERT((_ent_)->lru_flags&NASD_CL_LINDEX); \
  NASD_ODC_Q_DEQ_NOLOCK((_ent_),lh); \
  (_ent_)->lru_flags &= ~NASD_CL_LINDEX; \
}

/*
 * Condition variable for threads waiting for a replaceable block.
 * nasd_odc_block_grab() waits on it (together with
 * nasd_odc_lru_mutex) when it cannot find a clean block to reuse.
 * nasd_odc_block_release_internal() broadcasts it every time a block
 * is inserted into an LRU queue, not only when the queue size goes
 * from zero to non-zero.
 */
NASD_DECLARE_COND(nasd_odc_lru_cond);

/*
 * Drive cache statistics. Zeroed in nasd_odc_blocksys_init(), which
 * also fills in ctrl_id, drive_cpu_speed and region_blocks.
 */
nasd_ctrl_cache_stat_t nasd_drive_cache_stats;

/*
 * Tunables defined elsewhere.
 * NOTE(review): presumably prefetch policy knobs; none of them is
 * referenced in the portion of the file reviewed here.
 */
extern int nasd_odc_pfcontig_only;
extern int nasd_odc_pfcontig_useless;
extern int nasd_odc_pfcontig_noskip;
extern int nasd_odc_pfcontig_afterskip_policy;

/*
 * Number of round trips through the LRU that nasd_odc_block_grab()
 * tolerates before it stops looking for a clean block.
 */
#define NASD_ODC_LRU_MAXTRIPS 5

/*
 * NOTE(review): presumably a debug switch for object ejection; no use
 * is visible in the portion of the file reviewed here.
 */
#define DBG_OBJ_EJECT 0

/*
 * Drop one internal reference (irefcnt) from node entry _ne_.
 *
 * If that leaves the node with no internal and no external references
 * while it is flagged NASD_CL_REMOVING, the macro takes a temporary
 * external reference, pulls the node off its LRU queue if it is on
 * one, and then releases the reference through
 * nasd_odc_block_release_internal() with lru_lock_held set, so the
 * caller must already hold the LRU lock.
 *
 * _ne_ is evaluated several times; pass an expression without side
 * effects.
 */
#define LOSE_IREF(_ne_) { \
  NASD_ODC_CHECK_NODE_ENT(_ne_); \
  (_ne_)->irefcnt--; \
  if (((_ne_)->irefcnt == 0) && ((_ne_)->refcnt == 0) && ((_ne_)->lru_flags&NASD_CL_REMOVING)) { \
    (_ne_)->refcnt = 1; \
    if ((_ne_)->lru_flags&NASD_CL_LRU_Q) { \
      NASD_ASSERT(!((_ne_)->dirty_flags&NASD_CR_DIRTY)); \
      NASD_ASSERT(!((_ne_)->dirty_flags&NASD_CR_DIRTY_Q)); \
      NASD_ASSERT((_ne_)->refcnt == 1); \
      (_ne_)->lru_flags &= ~NASD_CL_LRU_Q; \
      NASD_ODC_Q_DEQ_NOLOCK((_ne_),l); \
      nasd_odc_lru[(_ne_)->lru_num].size--; \
      (_ne_)->lru_num = NASD_ODC_LRU_NONE; \
    } \
    nasd_odc_block_release_internal(_ne_, 1, __FILE__, __LINE__); \
  } \
}

/*
 * Attach cache entry _ent_ (holding a block of type _type_) to node
 * entry _node_ent_: the node's internal reference count is bumped,
 * the node is recorded in _ent_->node_ent, and _ent_ is linked into
 * the node's onext/oprev list immediately before the node itself.
 *
 * _type_ must not be NODE, ANON, REFCNT, LAYOUT or LAYOUT_STATIC;
 * each of those is rejected by an assertion.
 */
#define NASD_ODC_ASSOC_ENT_WITH_NODE(_ent_,_type_,_node_ent_) { \
  NASD_ASSERT((_type_) != NASD_ODC_T_NODE); \
  NASD_ASSERT((_type_) != NASD_ODC_T_ANON); \
  NASD_ASSERT((_type_) != NASD_ODC_T_REFCNT); \
  NASD_ASSERT((_type_) != NASD_ODC_T_LAYOUT); \
  NASD_ASSERT((_type_) != NASD_ODC_T_LAYOUT_STATIC); \
  NASD_ODC_CHECK_NODE_ENT(_node_ent_); \
  (_node_ent_)->irefcnt++; \
  (_ent_)->node_ent = (_node_ent_); \
  (_ent_)->onext = (_node_ent_); \
  (_ent_)->oprev = (_node_ent_)->oprev; \
  (_ent_)->onext->oprev = (_ent_); \
  (_ent_)->oprev->onext = (_ent_); \
}

#if NASD_DRIVE_BLOCK_REPLACEMENT == 1
/*
 * Per-block-type replacement threshold, indexed by NASD_ODC_T_* type.
 * nasd_odc_block_grab() increments an entry's "chances" counter each
 * time it pulls the entry off the tail of an LRU queue, and sends the
 * entry back around the queue while that counter is still below the
 * value listed here for the entry's type. Larger values therefore
 * keep blocks of that type cached for more passes.
 */
int nasd_odc_chances[NASD_ODC_T_NUM] = {
  0, /* NASD_ODC_T_BOGUS */
  2, /* NASD_ODC_T_NODE */
  1, /* NASD_ODC_T_IND */
  0, /* NASD_ODC_T_FREE */
  3, /* NASD_ODC_T_REFCNT */
  3, /* NASD_ODC_T_NPT1 */
  1, /* NASD_ODC_T_DATA */
  1, /* NASD_ODC_T_ANON */
  2, /* NASD_ODC_T_LAYOUT */
  0, /* NASD_ODC_T_LAYOUT_STATIC */
  3 /* NASD_ODC_T_NPT2 */
};
#endif /* NASD_DRIVE_BLOCK_REPLACEMENT == 1 */

/*
 * Translate a block number of the given block type into the on-disk
 * sector number at which that block starts.
 *
 *  blkno  block number; 1-based for every type except refcnt blocks
 *  type   NASD_ODC_T_* block type
 *
 * Refcnt blocks are counted from zero relative to
 * nasd_firstref_offset. All other supported types are counted from
 * one relative to nasd_firstblock_offset, which is why 1 is
 * subtracted. For node, indirect, data and layout blocks the block
 * number is additionally asserted to lie beyond the end of the NPT2
 * extent. Any other type panics.
 */
nasd_sectno_t
nasd_odc_real_sectno(
  nasd_blkno_t  blkno,
  int           type)
{
  nasd_blkno_t ret;

  /* keep the result defined even if NASD_PANIC() should ever return */
  ret = 0;

  switch(type) {
    case NASD_ODC_T_NODE:
    case NASD_ODC_T_IND:
    case NASD_ODC_T_DATA:
    case NASD_ODC_T_LAYOUT:
    case NASD_ODC_T_LAYOUT_STATIC:
      NASD_ASSERT(blkno > nasd_odc_state->disk->npt2_ext.last);
      /* FALLTHROUGH */
    case NASD_ODC_T_ANON:
    case NASD_ODC_T_NPT1:
    case NASD_ODC_T_NPT2:
      ret = (blkno - 1) * NASD_OD_SECTORS_PER_BLK + nasd_firstblock_offset;
      break;
    case NASD_ODC_T_REFCNT:
      ret = blkno * NASD_OD_SECTORS_PER_BLK + nasd_firstref_offset;
      break;
    default:
      NASD_PANIC();
  }
  return(ret);
}

/*
 * Take one entry off the unused queue.
 *
 * On success *entp receives the entry and NASD_SUCCESS is returned.
 * When the unused queue is empty *entp is set to NULL and NASD_NO_MEM
 * is returned; clean blocks sitting in the LRU do not count as free.
 * The caller must hold the LRU lock, which protects the unused queue.
 */
nasd_status_t
nasd_odc_block_alloc_nolock(
  nasd_odc_ent_t  **entp)
{
  nasd_odc_ent_t *ent;

  if (NASD_ODC_Q_SIZE(&nasd_odc_unusedq) != 0) {
    /* something is available: hand out the tail of the unused queue */
    NASD_ODC_Q_DEQ_TAIL_NOLOCK(&nasd_odc_unusedq,ent,l);
    *entp = ent;
    NASD_ODC_CSINC(alloc_succeed);
    return(NASD_SUCCESS);
  }

  /* nothing free */
  *entp = NULL;
  NASD_ODC_CSINC(alloc_fail);
  return(NASD_NO_MEM);
}

/*
 * Locking wrapper around nasd_odc_block_alloc_nolock(): takes the
 * LRU lock, tries to pull an entry off the unused queue, and drops
 * the lock again. Returns NASD_NO_MEM when nothing is free (clean
 * blocks are not free).
 */
nasd_status_t
nasd_odc_block_alloc(
  nasd_odc_ent_t  **entp)
{
  nasd_status_t status;

  NASD_ODC_LRU_LOCK();
  status = nasd_odc_block_alloc_nolock(entp);
  NASD_ODC_LRU_UNLOCK();

  return(status);
}

/*
 * Get an empty block. Tries the unused queue first; if that is empty,
 * takes the oldest clean, unreferenced block from the LRU queues,
 * strips its identity (hash bucket, logical index, node membership)
 * and hands it back.
 *
 *  entp    receives the block on success, NULL otherwise
 *  blockp  non-zero: wait on nasd_odc_lru_cond until a block shows
 *          up; zero: give up as soon as no clean block is found
 *
 * Returns NASD_SUCCESS, or the failure code from
 * nasd_odc_block_alloc_nolock() when no block could be obtained
 * without blocking.
 *
 * Assumes caller holds lru lock. The lock is dropped and re-taken
 * while waiting on the condition.
 */
nasd_status_t
nasd_odc_block_grab(
  nasd_odc_ent_t  **entp,
  int               blockp)
{
  nasd_odc_ent_t *ent, *first_try, *ne;
  int retry, round_trips, lru_num;
  nasd_status_t rc;
  int otype;

  *entp = ent = NULL;

  /* try for freelist first */
  rc = nasd_odc_block_alloc_nolock(entp);
  if (rc == NASD_SUCCESS) {
    NASD_ODC_CSINC(grab_fl);
    return(rc);
  }
  /* rc now holds the failure code reported if nothing turns up below */

  retry = 1;
  round_trips = 0;

  do {
    /*
     * Mark down which block we would consider for replacement first.
     * If we get back to it, increment round_trips to indicate that
     * we've seen the whole LRU. If we go around NASD_ODC_LRU_MAXTRIPS
     * times, try to dump some dirty blocks. This is recomputed on
     * every pass of the outer loop because the queues may have
     * changed while we were waiting.
     */
    first_try = NULL;
    for(lru_num=NASD_ODC_NLRUS-1;lru_num>=0;lru_num--) {
      if (NASD_ODC_Q_SIZE(&nasd_odc_lru[lru_num])) {
        first_try = nasd_odc_lru[lru_num].head.lprev;
        break;
      }
    }

find_block:
    ent = NULL;
    for(lru_num=NASD_ODC_NLRUS-1;lru_num>=0;lru_num--) {
      if (NASD_ODC_Q_SIZE(&nasd_odc_lru[lru_num]) == 0)
        continue;
      /* found an entry, dequeue it into ent */
      NASD_ODC_Q_DEQ_TAIL_NOLOCK(&nasd_odc_lru[lru_num],ent,l);
      ent->chances++;
#if NASD_DRIVE_BLOCK_REPLACEMENT == 1
      /*
       * Found an entry. Be sure it doesn't need any more
       * trips around. We know that others aren't using it,
       * because it's in the LRU.
       */
      if (ent->chances < nasd_odc_chances[ent->type]) {
        NASD_ODC_Q_INS_NOLOCK(&nasd_odc_lru[lru_num],ent,l);
        goto find_block;
      }
#endif /* NASD_DRIVE_BLOCK_REPLACEMENT == 1 */
      ent->lru_flags &= ~NASD_CL_LRU_Q;
      NASD_ASSERT(ent->lru_num == lru_num);
      ent->lru_num = NASD_ODC_LRU_NONE;
      if ((ent == first_try) || (first_try == NULL))
        round_trips++;
      if (round_trips > 1) {
        nasd_odc_dirty_kick();
      }
      if (round_trips > NASD_ODC_LRU_MAXTRIPS) {
        /*
         * Giving up on this pass. The entry was already taken off
         * its queue above, so put it back the same way the
         * still-referenced case below does; otherwise it would stay
         * in its hash bucket without being on any LRU queue.
         */
        ent->lru_num = lru_num;
        NASD_ODC_Q_INS_NOLOCK(&nasd_odc_lru[lru_num],ent,l);
        ent->lru_flags |= NASD_CL_LRU_Q;
        ent = NULL;
        NASD_ODC_CSINC(grab_lru_maxtrips);
        goto noclean;
      }
      NASD_ASSERT(ent->refcnt == 0);
      /* send it back around? */
      NASD_ODC_CSINC_TYPE(grab_consider,ent->type);
      if (ent->irefcnt) {
        /* node block with child blocks in cache (or deleting) */
        NASD_ASSERT(ent->type == NASD_ODC_T_NODE);
        ent->lru_num = lru_num;
        NASD_ODC_Q_INS_NOLOCK(&nasd_odc_lru[lru_num],ent,l);
        ent->lru_flags |= NASD_CL_LRU_Q;
        NASD_ODC_CSINC(grab_stillref);
        goto find_block;
      }
      ent->chances = 0;
      /* remove from hash bucket */
      NASD_ODC_Q_DEQ_NOLOCK(ent,h);
      ent->hbucket->size--;
      ent->hbucket = NULL;
      *entp = ent;
      rc = NASD_SUCCESS;
      /* remove from identity hash */
      otype = ent->type;
      if (NASD_ODC_TYPE_INDEX_LOGICAL(otype)) {
        NASD_ODC_LOGICAL_INDEX_DEL(ent);
      }
      else {
        NASD_ASSERT(!(ent->lru_flags & NASD_CL_LINDEX));
      }
      /* remove from node-member list if applicable */
      if (ent->node_ent) {
        NASD_ODC_Q_DEQ_NOLOCK(ent,o);
        ne = ent->node_ent;
        ent->node_ent = NULL;
        LOSE_IREF(ne);
      }
      else {
        NASD_ASSERT(ent->irefcnt == 0);
      }
      if (ent->type == NASD_ODC_T_ANON) {
        NASD_ODC_CSINC(anon_notused);
      }
      retry = 0;
      break;
    }
    if (retry) {
      /*
       * No clean blocks. If there are dirty blocks,
       * kick the dirty-writer thread. We don't lock
       * the dirty queue to check this case, because
       * it's really just a performance optimization
       * (the dirty-writer thread automatically wakes
       * up periodically to do its thing).
       */
      NASD_ASSERT(lru_num < 0);
      NASD_ASSERT(ent == NULL);
      NASD_ODC_CSINC(grab_dirty_wait);
noclean:
      round_trips = 0;
      if (nasd_odc_dirtycnt) {
        nasd_odc_dirty_kick();
      }
      if (blockp == 0) {
        /* caller does not want to wait: report the failure in rc */
        retry = 0;
        break;
      }
      NASD_WAIT_COND(nasd_odc_lru_cond,nasd_odc_lru_mutex);
    }
  } while(retry);

  return(rc);
}

/*
 * Take an additional reference on a block the caller already holds
 * a reference to. The reference count is protected by the LRU lock,
 * which is taken here; the caller must hold neither the LRU lock nor
 * the block lock.
 */
void
nasd_odc_block_ref(
  nasd_odc_ent_t  *ent)
{
  NASD_ODC_LRU_LOCK();
  ent->refcnt += 1;
  NASD_ODC_LRU_UNLOCK();
}

/*
 * Turn one "internal" reference on a block into an "external" one:
 * under the LRU lock, refcnt gains one and irefcnt loses one.
 */
void
nasd_odc_block_iref_to_ref(
  nasd_odc_ent_t  *ent)
{
  NASD_ODC_LRU_LOCK();
  ent->refcnt += 1;
  ent->irefcnt -= 1;
  NASD_ODC_LRU_UNLOCK();
}

/*
 * Drop a reference on a block on behalf of a caller that holds
 * neither the LRU lock nor the block lock. file and line identify
 * the call site and are passed through to
 * nasd_odc_block_release_internal(), which does the work with
 * lru_lock_held set to 0.
 */
nasd_status_t
_nasd_odc_block_release(
  nasd_odc_ent_t  *ent,
  char            *file,
  int              line)
{
  nasd_status_t rc;

  rc = nasd_odc_block_release_internal(ent, 0, file, line);
  return(rc);
}

/*
 * Release a reference on a block. Assumes block lock _NOT_ held.
 *
 *  ent            block being released; must have refcnt > 0
 *  lru_lock_held  non-zero if the caller already holds the LRU lock;
 *                 if zero the lock is taken and dropped here
 *  file, line     call site, recorded in the entry only when
 *                 NASD_DRIVE_DEBUG_RELEASE > 0
 *
 * When the last reference goes away, exactly one of the following
 * happens, checked in this order:
 *  - entry flagged NOALLOC or AERROR: wake whoever waits on ent->cond
 *  - entry flagged REMOVING with no internal refs: eject it
 *  - entry flagged DELETING with a delete controller: zero the
 *    controller's counter and wake the delete requestor
 *  - entry dirty: put it on the dirty queue, which holds one ref
 *  - entry neither wired nor LAYOUT_STATIC: put it on an LRU queue
 *    and broadcast nasd_odc_lru_cond
 *  - otherwise: nothing further
 *
 * Always returns NASD_SUCCESS.
 */
nasd_status_t
nasd_odc_block_release_internal(
  nasd_odc_ent_t  *ent,
  int              lru_lock_held,
  char            *file,
  int              line)
{
  nasd_odc_flush_t *fl;
  int lru_num, on_dirtyq;

  if (ent->type == NASD_ODC_T_NODE) {
    NASD_ODC_CHECK_NODE_ENT(ent);
  }

  if (lru_lock_held == 0) {
    NASD_ODC_LRU_LOCK();
  }

  /* which LRU queue this block goes to if it ends up on one */
#if NASD_DRIVE_DATA_BLOCK_REPLACEMENT < 2
  lru_num = NASD_ODC_LRU_COMMON;
#else /* NASD_DRIVE_DATA_BLOCK_REPLACEMENT < 2 */
  if (ent->type == NASD_ODC_T_DATA)
    lru_num = NASD_ODC_LRU_USEDDATA;
  else
    lru_num = NASD_ODC_LRU_COMMON;
#endif /* NASD_DRIVE_DATA_BLOCK_REPLACEMENT < 2 */

  NASD_ASSERT(ent->refcnt);

  ent->refcnt--;

  if (ent->refcnt) {
    /* others still hold references */
    goto done;
  }

  /*
   * refcnt is now 0.
   */
#if NASD_DRIVE_DEBUG_RELEASE > 0
  ent->release_file = file;
  ent->release_line = line;
#endif /* NASD_DRIVE_DEBUG_RELEASE > 0 */

  if ((ent->lru_flags&NASD_CL_NOALLOC) || (ent->lru_flags&NASD_CL_AERROR)) {
    /* someone's waiting for this block back in nasd_odc_block_lookup */
    NASD_BROADCAST_COND(ent->cond);
    goto done;
  }

  if ((ent->lru_flags&NASD_CL_REMOVING) && (ent->irefcnt == 0)) {
    nasd_odc_block_eject_real(ent);
    goto done;
  }

  if ((ent->lru_flags&NASD_CL_DELETING) && ent->deletecp) {
    /*
     * Object being deleted- let deleters know that no one's using it.
     * We have a delete controller, so let the delete requestor know
     * that they may now proceed. The controller pointer is cleared
     * so that this only happens the first time through.
     */
    fl = ent->deletecp;
    ent->deletecp = NULL;
    NASD_ASSERT(fl);
    NASD_LOCK_MUTEX(fl->lock);
    fl->counter = 0;
    NASD_UNLOCK_MUTEX(fl->lock);
    NASD_BROADCAST_COND(fl->cond);
    /* object may have vanished from beneath us */
    goto done;
  }

  /*
   * A DELETING entry without a delete controller lands here too.
   * That is a release following an internal reference, which is
   * probably occurring as a result of unmapping cached data blocks
   * associated with this object. Although the delete requestor is
   * now free to "have at it," we still want to shove this block
   * onto the dirty or clean LRU below. Later, when the block is
   * ejected, we'll get it back off.
   */

  on_dirtyq = 0;
  NASD_DIRTY_LOCK();
  if (ent->dirty_flags&NASD_CR_DIRTY) {
    NASD_ASSERT(!(ent->dirty_flags&NASD_CR_DIRTY_Q));
    ent->refcnt = 1; /* dirty queue holds a ref */
    nasd_odc_dirty_ins_nolock(ent);
    on_dirtyq = 1;
  }
  NASD_DIRTY_UNLOCK();

  if ((on_dirtyq == 0) &&
    (!(ent->lru_flags & NASD_CL_WIRED)) &&
    (ent->type != NASD_ODC_T_LAYOUT_STATIC))
  {
    NASD_ASSERT(!(ent->lru_flags&NASD_CL_LRU_Q));
    ent->lru_num = lru_num;
    ent->chances = 0;
    NASD_ODC_Q_INS_NOLOCK(&nasd_odc_lru[lru_num],ent,l);
    ent->lru_flags |= NASD_CL_LRU_Q;
    /*
     * We used to only broadcast this when the queue size went
     * from zero to one here. This was incorrect; the queue
     * could have had a nonzero size of nonreplacable blocks,
     * causing us to bogusly not signal when someone was
     * waiting in nasd_odc_block_grab() for a clean, replacable
     * block.
     */
    NASD_BROADCAST_COND(nasd_odc_lru_cond);
  }

done:
  if (lru_lock_held == 0) {
    NASD_ODC_LRU_UNLOCK();
  }

  return(NASD_SUCCESS);
}

nasd_status_t
nasd_odc_blocksys_init()
{
  nasd_uint64 ticks;
  nasd_status_t rc;
  int i;

  rc = nasd_cond_init(&nasd_odc_lru_cond);
  if (rc)
    return(rc);
  rc = nasd_shutdown_cond(nasd_odc_shutdown, &nasd_odc_lru_cond);
  if (rc) {
    return(rc);
  }

  NASD_Malloc(nasd_odc_bucket_ents, nasd_odc_buckets*sizeof(nasd_odc_oq_t),
    (nasd_odc_oq_t *));
  if (nasd_odc_bucket_ents == NULL)
    return(NASD_NO_MEM);
  rc = nasd_shutdown_mem(nasd_odc_shutdown, nasd_odc_bucket_ents,
    nasd_odc_buckets*sizeof(nasd_odc_oq_t));
  if (rc) {
    NASD_Free(nasd_odc_bucket_ents, nasd_odc_buckets*sizeof(nasd_odc_oq_t));
    return(rc);
  }
  for(i=0;i<nasd_odc_buckets;i++) {
    rc = nasd_queue_init(&nasd_odc_bucket_ents[i]);
    if (rc)
      return(rc);
  }

  NASD_Malloc(nasd_odc_logical_bucket_ents,
    nasd_odc_logical_index_buckets*sizeof(nasd_odc_oq_t),
    (nasd_odc_oq_t *));
  if (nasd_odc_logical_bucket_ents == NULL)
    return(NASD_NO_MEM);
  rc = nasd_shutdown_mem(nasd_odc_shutdown, nasd_odc_logical_bucket_ents,
    nasd_odc_logical_index_buckets*sizeof(nasd_odc_oq_t));
  if (rc) {
    NASD_Free(nasd_odc_logical_bucket_ents,
      nasd_odc_logical_index_buckets*sizeof(nasd_odc_oq_t));
    return(rc);
  }
  for(i=0;i<nasd_odc_logical_index_buckets;i++) {
    rc = nasd_queue_init(&nasd_odc_logical_bucket_ents[i]);
    if (rc)
      return(rc);
  }

  bzero((char *)&nasd_drive_cache_stats, sizeof(nasd_drive_cache_stats));
  nasd_drive_cache_stats.ctrl_id = NASD_CTRL_DRIVE_INFO;
  rc = nasd_get_clock_speed(&ticks);
  if (rc) {
    return(rc);
  }
  nasd_drive_cache_stats.drive_cpu_speed = ticks;
  nasd_drive_cache_stats.region_blocks = nasd_od_region_blocks;

  return(NASD_SUCCESS);
}

/*
 * Find the entry for a block
 *
 * Flags:
 * NASD_ODC_L_FORCE - create an entry if one isn't there
 * NASD_ODC_L_BLOCK - block if necessary
 *
 * Parameters:
 * node_ent - node entry to associate the block with, or NULL
 * blkno    - block number (0-based for refcnt blocks, else 1-based)
 * flags    - NASD_ODC_L_* flags, see above
 * entp     - receives the entry on NASD_SUCCESS, NULL otherwise
 * nid      - identifier recorded in a new or re-typed entry
 * offset   - offset recorded in a new or re-typed entry
 * type     - block type
 * creatorp - set non-zero if this call triggered the allocation
 *
 * Returns NASD_SUCCESS with a reference held on *entp, or:
 * NASD_BAD_BLKNO        - blkno out of range for the type
 * NASD_OBJ_REMOVING     - entry found but flagged DELETING
 * NASD_EJECTING         - entry found flagged REMOVING and
 *                         NASD_ODC_L_BLOCK not given
 * NASD_BLOCK_NOT_CACHED - no entry and NASD_ODC_L_FORCE not given
 * or the failure status of nasd_odc_alloc_ent() or
 * nasd_odc_block_grab().
 *
 * This is where we use that extra per-thread cache entry.
 * If there's no such entry, we need to grab a free one.
 * However, we want to release the LRU lock, because we
 * may have to wait for I/Os to complete before we can
 * get one. If we don't mark something, however, other threads
 * may come and try to generate the same cache entry. So, we
 * take this extra entry that is guaranteed available for our
 * thread, and we stick it in, marked busy and alloc-in-progress.
 * When we actually get our block, we swipe its data buf, and drop
 * it into this bogus ent, which is now part of the bucket (and
 * may be ref'd by other threads), clearing the ALLOC bit.
 * The newly-allocated ent, now data-free, becomes our new
 * super-secret hidden ent for the next time we end up here.
 * If we fail getting a buf for this bogus ent, we mark it
 * alloc-failed, and signal the alloc cond. When we succeed, we
 * also signal the alloc-cond. So, others wanting to use an ent
 * that is alloc-in-progress must first wait on the alloc cond
 * for the alloc bit to clear, then check to see if it is marked
 * alloc-failed. If the latter, they _must_ release and fail.
 *
 * Returns block with a reference held on it. Remove from LRU
 * or dirty queue iff necessary.
 */
nasd_status_t
nasd_odc_block_lookup(
  nasd_odc_ent_t      *node_ent,
  nasd_blkno_t         blkno,
  int                  flags,
  nasd_odc_ent_t     **entp,
  nasd_identifier_t    nid,
  nasd_offset_t        offset,
  int                  type,     /* block type */
  int                 *creatorp) /* set non-zero if this triggered alloc */
{
  nasd_odc_ent_t *ent, *e;
  nasd_odc_oq_t *bucket;
  nasd_blkno_t real;
  int lk, otype, ih;
  nasd_status_t rc;

  NASD_ODC_CSINC_TYPE(lookup,type);

  /* give the outputs defined values on every return path */
  ent = *entp = NULL;
  *creatorp = 0;

  /*
   * Validate the block number and compute the real sector number.
   * This duplicates nasd_odc_real_sectno() without its NPT2-extent
   * assertion.
   */
  if (type == NASD_ODC_T_REFCNT) {
    if (blkno >= nasd_odc_refblocks)
      return(NASD_BAD_BLKNO);
    real = blkno * NASD_OD_SECTORS_PER_BLK + nasd_firstref_offset;
  }
  else {
    if (blkno == 0)
      return(NASD_BAD_BLKNO);
    if (blkno > nasd_od_blocks)
      return(NASD_BAD_BLKNO);
    NASD_ASSERT((type == NASD_ODC_T_NODE)
      || (type == NASD_ODC_T_IND)
      || (type == NASD_ODC_T_DATA)
      || (type == NASD_ODC_T_LAYOUT)
      || (type == NASD_ODC_T_LAYOUT_STATIC)
      || (type == NASD_ODC_T_ANON)
      || (type == NASD_ODC_T_NPT1)
      || (type == NASD_ODC_T_NPT2));
    real = (blkno - 1) * NASD_OD_SECTORS_PER_BLK + nasd_firstblock_offset;
  }

#if 0
  /* computation duplicated inline above */
  real = nasd_odc_real_sectno(blkno, type);
#endif
  bucket = &nasd_odc_bucket_ents[NASD_ODC_BUCKET(real)];
  rc = NASD_SUCCESS;

  NASD_ODC_LRU_LOCK();
  lk = 1;

  for(e=bucket->head.hnext;e!=&bucket->head;e=e->hnext) {
    if (real == e->real_sectno) {
      NASD_ASSERT(e->blkno == blkno);
      NASD_ASSERT(e->real_sectno == real);
      ent = e;
      rc = NASD_SUCCESS;
      ent->refcnt++;
      if (ent->lru_flags&NASD_CL_DELETING) {
#if 0
nasd_printf("grabbed removing block 0x%lx refcnts %d %d\n", ent, ent->refcnt, ent->irefcnt);
nasd_printf("ent->data_flags=0x%x\n", ent->data_flags);
nasd_printf("ent->lru_flags=0x%x\n", ent->lru_flags);
nasd_printf("ent->io_flags=0x%x\n", ent->io_flags);
nasd_printf("ent->dirty_flags=0x%x\n", ent->dirty_flags);
#endif
        rc = NASD_OBJ_REMOVING;
        ent->refcnt--;
      }
      else if (ent->lru_flags&NASD_CL_REMOVING) {
#if 0
nasd_printf("grabbed ejecting block 0x%lx refcnts %d %d\n", ent, ent->refcnt, ent->irefcnt);
nasd_printf("ent->data_flags=0x%x\n", ent->data_flags);
nasd_printf("ent->lru_flags=0x%x\n", ent->lru_flags);
nasd_printf("ent->io_flags=0x%x\n", ent->io_flags);
nasd_printf("ent->dirty_flags=0x%x\n", ent->dirty_flags);
#endif
        if (flags&NASD_ODC_L_BLOCK) {
          /*
           * The block was flagged for ejection, but now we've
           * changed our minds and want it back. This is
           * probably happening because the block was deallocated,
           * but was still busy so it couldn't be reclaimed immediately.
           */
          NASD_ODC_CSINC(rescue_eject);
          NASD_ASSERT(ent->irefcnt == 0);
          ent->lru_flags &= ~NASD_CL_REMOVING;
          if (ent->node_ent) {
            NASD_ODC_CHECK_NODE_ENT(ent->node_ent);
            NASD_ODC_Q_DEQ_NOLOCK(ent,o);
            LOSE_IREF(ent->node_ent);
            ent->node_ent = NULL;
          }
          otype = ent->type;
          ent->type = type;
          ent->identifier = nid;
          ent->offset = offset;
          /*
           * Remove from old identity hash, insert in new.
           */
          if (NASD_ODC_TYPE_INDEX_LOGICAL(otype)) {
            /* remove from old ident hash */
            NASD_ODC_LOGICAL_INDEX_DEL(ent);
          }
          if (NASD_ODC_TYPE_INDEX_LOGICAL(ent->type)) {
            ih = NASD_LOGICAL_INDEX_HASH(nid, offset);
            NASD_ODC_LOGICAL_INDEX_INS(ent, ih);
          }
          if (node_ent) {
            NASD_ODC_ASSOC_ENT_WITH_NODE(ent,type,node_ent);
          }
          else {
            ent->irefcnt = 0;
            ent->onext = ent->oprev = ent;
            ent->node_ent = NULL;
          }
          goto gfind;
        }
        rc = NASD_EJECTING;
        ent->refcnt--;
      }
      else {
gfind:
        if (ent->type == NASD_ODC_T_ANON) {
          /* anonymous block: give it the identity the caller asked for */
          NASD_ASSERT(!(ent->lru_flags & NASD_CL_LINDEX));
          ent->type = type;
          ent->identifier = nid;
          ent->offset = offset;
          NASD_ODC_CSINC_TYPE(anon_promoted,type);
#if NASD_DRIVE_WIRE_NPT == 1
          if (ent->type == NASD_ODC_T_NPT2) {
            /* mark it wired */
            ent->lru_flags |= NASD_CL_WIRED;
            /* record in wireq (list of wired blocks) */
            NASD_ODC_Q_INS_NOLOCK(&nasd_odc_wireq,ent,w);
          }
#endif /* NASD_DRIVE_WIRE_NPT == 1 */
#if NASD_DRIVE_WIRE_NPT == 2
          if ((ent->type == NASD_ODC_T_NPT1) || (ent->type == NASD_ODC_T_NPT2)) {
            /* mark it wired */
            ent->lru_flags |= NASD_CL_WIRED;
            /* record in wireq (list of wired blocks) */
            NASD_ODC_Q_INS_NOLOCK(&nasd_odc_wireq,ent,w);
          }
#endif /* NASD_DRIVE_WIRE_NPT == 2 */
#if NASD_DRIVE_WIRE_REFCNT > 0
          if (ent->type == NASD_ODC_T_REFCNT) {
            /* mark it wired */
            ent->lru_flags |= NASD_CL_WIRED;
            /* record in wireq (list of wired blocks) */
            NASD_ODC_Q_INS_NOLOCK(&nasd_odc_wireq,ent,w);
          }
#endif /* NASD_DRIVE_WIRE_REFCNT > 0 */
          if (node_ent) {
            NASD_ODC_ASSOC_ENT_WITH_NODE(ent,type,node_ent);
          }
          else {
            ent->irefcnt = 0;
            ent->onext = ent->oprev = ent;
            ent->node_ent = NULL;
          }
          /*
           * Do identity hash iff necessary
           */
          if (NASD_ODC_TYPE_INDEX_LOGICAL(ent->type)) {
            ih = NASD_LOGICAL_INDEX_HASH(ent->identifier, ent->offset);
            NASD_ODC_LOGICAL_INDEX_INS(ent, ih);
          }
        }
        if (ent->lru_flags&NASD_CL_LRU_Q) {
          /* ours is the only reference: take the block off the LRU */
          NASD_ASSERT(ent->lru_num >= 0);
          NASD_ASSERT(!(ent->dirty_flags&NASD_CR_DIRTY));
          NASD_ASSERT(!(ent->dirty_flags&NASD_CR_DIRTY_Q));
          NASD_ASSERT(ent->refcnt == 1);
          ent->lru_flags &= ~NASD_CL_LRU_Q;
          NASD_ODC_Q_DEQ_NOLOCK(ent,l);
          nasd_odc_lru[ent->lru_num].size--;
          ent->lru_num = NASD_ODC_LRU_NONE;
        }
        if (ent->dirty_flags&NASD_CR_DIRTY_Q) {
          /* take it off the dirty queue and drop the queue's reference */
          NASD_ASSERT(ent->dirty_flags&NASD_CR_DIRTY);
          NASD_ASSERT(!(ent->lru_flags&NASD_CL_LRU_Q));
          NASD_ASSERT(ent->refcnt == 2);
          ent->dirty_flags &= ~NASD_CR_DIRTY_Q;
          NASD_ODC_Q_DEQ_NOLOCK(ent,d);
          ent->refcnt--;
        }
      }
      NASD_ODC_LRU_UNLOCK();
      lk = 0;
      break;
    }
  }
  if ((ent != NULL) && (flags&NASD_ODC_L_FORCE) && (rc == NASD_SUCCESS) &&
    (ent->lru_flags & NASD_CL_GHOST))
  {
    /*
     * NOTE(review): the found-entry path above has already dropped
     * the LRU lock (lk == 0), yet unghost_lookup calls
     * nasd_odc_block_grab(), which expects the lock to be held, and
     * we_win unlocks it. Confirm whether this path needs to re-take
     * the lock (and re-check the GHOST flag) before jumping.
     */
    e = ent;
    goto unghost_lookup;
  }
  if ((ent == NULL) && (flags&NASD_ODC_L_FORCE) && (rc == NASD_SUCCESS)) {
    /*
     * No such entry in the cache, but finding one such
     * is mandatory. First, stick our magic entry in the
     * bucket so we can unlock it. Then try to get a real
     * entry.
     *
     * ODC_LRU_LOCK still held if we're here.
     */
    NASD_ASSERT(lk);
    NASD_ODC_CSINC_TYPE(miss,type);
    NASD_ODC_CSINC_TYPE(force,type);
    rc = nasd_odc_alloc_ent(&e);
    if (rc == NASD_SUCCESS) {
      /* init and insert in bucket */
      e->lru_flags = NASD_CL_ALLOC | NASD_CL_GHOST;
      e->data_flags = NASD_CD_INVALID;
      e->refcnt = 1;
      e->type = type;
      e->identifier = nid;
      e->offset = offset;
      if (NASD_ODC_TYPE_INDEX_LOGICAL(e->type)) {
        ih = NASD_LOGICAL_INDEX_HASH(e->identifier, e->offset);
        NASD_ODC_LOGICAL_INDEX_INS(e, ih);
      }

      e->dnext = e->dprev = NULL;
      e->lnext = e->lprev = NULL;
      e->snext = e->sprev = NULL;
      e->lru_num = NASD_ODC_LRU_NONE;
      e->data.buf = NULL;
      e->blkno = blkno;
      e->real_sectno = nasd_odc_real_sectno(blkno, type);
      e->invocation = NULL;
#if NASD_NL_REG_SCOREBOARD > 0
      nasd_nl_reg_init_ent(e);
#endif /* NASD_NL_REG_SCOREBOARD > 0 */
      e->hbucket = bucket;
      NASD_ODC_Q_INS_NOLOCK(bucket,e,h);
unghost_lookup:
      /* try to alloc an entry */
      rc = nasd_odc_block_grab(&ent, ((flags&NASD_ODC_L_BLOCK) ? 1 : 0));
      if (rc == NASD_SUCCESS) {
        /*
         * Got a block. Pull its data out and stick it on e
         * (our old bogus block), and send this new block back
         * to be our new squirrel block.
         *
         * This code duplicates that of nasd_odc_force_alloc().
         * Keep these codepaths in sync!
         * Also, nasd_odc_block_lookup_logical().
         */
        e->lru_flags &= ~(NASD_CL_ALLOC|NASD_CL_GHOST);
        nasd_odc_io_ent_swap_bufs(e, ent);
        *creatorp = 1;
        ent->lru_flags |= NASD_CL_GHOST;
        nasd_odc_put_ent(ent);
        ent = e;
        e = NULL;
        NASD_BROADCAST_COND(ent->acond);
        if (node_ent) {
          NASD_ODC_ASSOC_ENT_WITH_NODE(ent,type,node_ent);
        }
        else {
          ent->irefcnt = 0;
          ent->onext = ent->oprev = ent;
          ent->node_ent = NULL;
        }
#if NASD_DRIVE_WIRE_NPT == 1
        if (type == NASD_ODC_T_NPT2) {
          /* mark it wired */
          ent->lru_flags |= NASD_CL_WIRED;
          /* record in wireq (list of wired blocks) */
          NASD_ODC_Q_INS_NOLOCK(&nasd_odc_wireq,ent,w);
        }
#endif /* NASD_DRIVE_WIRE_NPT == 1 */
#if NASD_DRIVE_WIRE_NPT == 2
        if ((type == NASD_ODC_T_NPT1) || (type == NASD_ODC_T_NPT2)) {
          /* mark it wired */
          ent->lru_flags |= NASD_CL_WIRED;
          /* record in wireq (list of wired blocks) */
          NASD_ODC_Q_INS_NOLOCK(&nasd_odc_wireq,ent,w);
        }
#endif /* NASD_DRIVE_WIRE_NPT == 2 */
      }
      else {
        /*
         * No block. Pull our squirrel block back out, and release
         * it. If others are holding refs, wait for them to be done.
         */
        ent = NULL;
        e->lru_flags &= ~NASD_CL_ALLOC;
        e->lru_flags |= NASD_CL_NOALLOC;
        while (e->refcnt > 1) {
          NASD_BROADCAST_COND(e->acond);
          NASD_WAIT_COND(e->cond,nasd_odc_lru_mutex);
          if ((e->lru_flags&NASD_CL_FALLOC) || (e->lru_flags&NASD_CL_AERROR)) {
            /* someone else took over */
            break;
          }
        }
        while (e->lru_flags&NASD_CL_FALLOC) {
          /*
           * someone else forcing the allocation, we must wait for them
           * (ent is NULL here, so the wait has to go through e)
           */
          NASD_WAIT_COND(e->acond,nasd_odc_lru_mutex);
        }
        if (e->lru_flags&NASD_CL_AERROR) {
          /*
           * someone else tried to force the issue and failed-
           * since they took responsibility, we're done
           */
          goto we_give_up;
        }
        if ((e->lru_flags&(NASD_CL_NOALLOC|NASD_CL_AERROR)) == 0) {
          /*
           * Someone else completed the allocation for us. We still
           * hold our reference on e, so hand it back as a success
           * rather than returning the stale failure code from
           * nasd_odc_block_grab() and stranding that reference.
           */
          ent = e;
          rc = NASD_SUCCESS;
          goto we_win;
        }
we_give_up:
        NASD_ASSERT(e->refcnt == 1);
        e->refcnt = 0;
        NASD_ODC_Q_DEQ_NOLOCK(e,h);
        e->hbucket->size--;
        e->hbucket = NULL;
        otype = e->type;
        if (NASD_ODC_TYPE_INDEX_LOGICAL(otype)) {
          NASD_ODC_LOGICAL_INDEX_DEL(e);
        }
        else {
          NASD_ASSERT(!(e->lru_flags & NASD_CL_LINDEX));
        }
        nasd_odc_put_ent(e);
        e = NULL;
      }
we_win:
      NASD_ODC_LRU_UNLOCK();
      lk = 0;
      if (ent) {
        /* we created entry- no one else should've queued it */
        NASD_ASSERT(!(ent->lru_flags&NASD_CL_LRU_Q));
        NASD_ASSERT(!(ent->dirty_flags&NASD_CR_DIRTY_Q));
      }
    }
    else {
      nasd_printf("WARNING: %s:%d no thread ent rc=%lu (%s)\n",
        __FILE__, __LINE__, (u_long)rc, nasd_error_string(rc));
      NASD_ODC_LRU_UNLOCK();
      lk = 0;
    }
  }

  if (lk) {
    NASD_ODC_LRU_UNLOCK();
  }

  if ((ent == NULL) && (rc == NASD_SUCCESS)) {
    /* not cached and the caller did not ask us to create it */
    NASD_ODC_CSINC_TYPE(miss,type);
    rc = NASD_BLOCK_NOT_CACHED;
  }
  if (rc == NASD_SUCCESS) {
#if NASD_NL_REG_SCOREBOARD > 0
    if (ent->reg_flags&NASD_ODC_R_VALID) {
      nasd_nl_reg_access_ent(ent);
    }
#endif /* NASD_NL_REG_SCOREBOARD > 0 */
    if (type != NASD_ODC_T_ANON) {
      NASD_ASSERT(ent->type == type);
    }
    *entp = ent;
  }
  return(rc);
}

/*
 * Call with LRU lock held.
 *
 * Complete the allocation of an entry that another thread left in
 * the NASD_CL_NOALLOC state. That happens when two threads want the
 * same block, the one doing the allocation asked not to block, and
 * its attempt failed even though waiting would have let it succeed.
 * The second thread, which is willing to wait, calls this routine to
 * carry the allocation through itself.
 *
 *  e         the placeholder entry; must have data_flags equal to
 *            NASD_CD_INVALID and no page attached
 *  node_ent  node to associate e with, or NULL for none
 *
 * Returns NASD_SUCCESS, or the status from nasd_odc_block_grab() if
 * no block could be obtained (the grab is done in blocking mode).
 */
nasd_status_t
nasd_odc_force_alloc(
  nasd_odc_ent_t  *e,
  nasd_odc_ent_t  *node_ent)
{
  nasd_odc_ent_t *ent;
  nasd_status_t rc;

  NASD_ODC_CSINC_TYPE(falloc,e->type);

  /* blocking grab */
  rc = nasd_odc_block_grab(&ent, 1);
  if (rc) {
    return(rc);
  }

  NASD_ASSERT(e->data_flags == NASD_CD_INVALID);
  NASD_ODC_IO_ENT_ASSERT_NO_PAGE(e);

  /*
   * The grabbed block donates its buffer to e and is then handed
   * back, marked as a ghost, to serve as the next spare entry.
   *
   * The same sequence appears in nasd_odc_block_lookup() and
   * nasd_odc_block_lookup_logical(); keep all three in sync!
   */
  nasd_odc_io_ent_swap_bufs(e, ent);
  ent->lru_flags |= NASD_CL_GHOST;
  nasd_odc_put_ent(ent);

  /* from here on we work on the placeholder, now a real entry */
  ent = e;
  ent->lru_flags &= ~NASD_CL_GHOST;

  if (node_ent) {
    NASD_ODC_ASSOC_ENT_WITH_NODE(ent,ent->type,node_ent);
  }
  else {
    /* no owning node, whatever the block type: stand alone */
    ent->irefcnt = 0;
    ent->onext = ent->oprev = ent;
    ent->node_ent = NULL;
  }

#if NASD_DRIVE_WIRE_NPT == 1
  if (ent->type == NASD_ODC_T_NPT2) {
    /* mark it wired */
    ent->lru_flags |= NASD_CL_WIRED;
    /* record in wireq (list of wired blocks) */
    NASD_ODC_Q_INS_NOLOCK(&nasd_odc_wireq,ent,w);
  }
#endif /* NASD_DRIVE_WIRE_NPT == 1 */
#if NASD_DRIVE_WIRE_NPT == 2
  if ((ent->type == NASD_ODC_T_NPT1) || (ent->type == NASD_ODC_T_NPT2)) {
    /* mark it wired */
    ent->lru_flags |= NASD_CL_WIRED;
    /* record in wireq (list of wired blocks) */
    NASD_ODC_Q_INS_NOLOCK(&nasd_odc_wireq,ent,w);
  }
#endif /* NASD_DRIVE_WIRE_NPT == 2 */

  return(NASD_SUCCESS);
}

/*
 * Perform a lookup of a block by logical address
 * (object identifier, offset, and block type).
 * Caller MUST hold LRU lock.
 *
 * node_ent  node entry the block should be associated with, or NULL
 * flags     NASD_ODC_L_* flags. NASD_ODC_L_FORCE asks that a ghosted
 *           entry be given a page; NASD_ODC_L_BLOCK allows the grab
 *           for that page to block.
 * entp      set to NULL on a miss or when the block is being deleted;
 *           otherwise set to the matching entry, on which a reference
 *           has been taken
 * nid       object identifier to match
 * offset    offset to match
 * type      block type to match
 * crp       set to 1 when a page was newly attached to a ghosted
 *           entry here, 0 otherwise
 *
 * Returns:
 *   NASD_SUCCESS           entry found (and given a page if forced)
 *   NASD_BLOCK_NOT_CACHED  type is not logically indexed, or no match
 *   NASD_OBJ_REMOVING      matching entry is flagged NASD_CL_DELETING
 *   other                  status of nasd_odc_block_grab() when the
 *                          forced page grab fails; *entp still names
 *                          the entry in that case
 */
nasd_status_t
nasd_odc_block_lookup_logical(
  nasd_odc_ent_t      *node_ent,
  int                  flags,
  nasd_odc_ent_t     **entp,
  nasd_identifier_t    nid,
  nasd_offset_t        offset,
  int                  type,
  int                 *crp)
{
  nasd_odc_ent_t *ent, *grab_ent;
  nasd_status_t rc;
  int ih;

  NASD_ODC_CSINC_TYPE(lookup_logical,type);

  *entp = NULL;
  *crp = 0;

  if (!NASD_ODC_TYPE_INDEX_LOGICAL(type))
    return(NASD_BLOCK_NOT_CACHED);

  ih = NASD_LOGICAL_INDEX_HASH(nid, offset);

  /* walk the logical hash chain looking for an exact match */
  for(ent=nasd_odc_logical_bucket_ents[ih].head.lhnext;
    ent!=&nasd_odc_logical_bucket_ents[ih].head;
    ent=ent->lhnext)
  {
    if ((ent->identifier == nid)
      && (ent->offset == offset)
      && (ent->type == type))
    {
      break;
    }
  }

  if (ent == &nasd_odc_logical_bucket_ents[ih].head) {
    NASD_ODC_CSINC_TYPE(miss,type);
    return(NASD_BLOCK_NOT_CACHED);
  }

  /*
   * Found the block. What follows duplicates the logic in
   * nasd_odc_block_lookup() when we have found a block.
   */
  if (ent->lru_flags&NASD_CL_DELETING) {
    /*
     * The block is on its way out for good. Report that to the
     * caller without taking a reference and without handing the
     * entry back. (Previously this status was overwritten with
     * NASD_SUCCESS and the unreferenced entry was returned.)
     */
    return(NASD_OBJ_REMOVING);
  }

  ent->refcnt++;

  if (ent->lru_flags&NASD_CL_REMOVING) {
    /*
     * The block was flagged for ejection, but now we've
     * changed our minds and want it back.
     */
    NASD_ODC_CSINC(rescue_eject);
    NASD_ASSERT(ent->irefcnt == 0);
    ent->lru_flags &= ~NASD_CL_REMOVING;
    if (ent->node_ent && (ent->node_ent != node_ent)) {
      /* detach from the old node, then attach to the new one (if any) */
      NASD_ODC_CHECK_NODE_ENT(ent->node_ent);
      NASD_ODC_Q_DEQ_NOLOCK(ent,o);
      LOSE_IREF(ent->node_ent);
      ent->node_ent = NULL;
      if (node_ent) {
        NASD_ODC_ASSOC_ENT_WITH_NODE(ent,type,node_ent);
      }
      else {
        ent->irefcnt = 0;
        ent->onext = ent->oprev = ent;
        ent->node_ent = NULL;
      }
    }
  }

  if (ent->lru_flags&NASD_CL_LRU_Q) {
    /* remove from LRU */
    NASD_ASSERT(ent->lru_num >= 0);
    NASD_ASSERT(!(ent->dirty_flags&NASD_CR_DIRTY));
    NASD_ASSERT(!(ent->dirty_flags&NASD_CR_DIRTY_Q));
    NASD_ASSERT(ent->refcnt == 1);
    ent->lru_flags &= ~NASD_CL_LRU_Q;
    NASD_ODC_Q_DEQ_NOLOCK(ent,l);
    nasd_odc_lru[ent->lru_num].size--;
    ent->lru_num = NASD_ODC_LRU_NONE;
  }
  if (ent->dirty_flags&NASD_CR_DIRTY_Q) {
    /*
     * Remove from the dirty queue. The queue held its own ref
     * (hence refcnt == 2 including ours); drop that one.
     */
    NASD_ASSERT(ent->dirty_flags&NASD_CR_DIRTY);
    NASD_ASSERT(!(ent->lru_flags&NASD_CL_LRU_Q));
    NASD_ASSERT(ent->refcnt == 2);
    ent->dirty_flags &= ~NASD_CR_DIRTY_Q;
    NASD_ODC_Q_DEQ_NOLOCK(ent,d);
    ent->refcnt--;
  }

  NASD_ODC_CSINC_TYPE(logical_found,type);
  *entp = ent;
  rc = NASD_SUCCESS;

  if ((ent->lru_flags&NASD_CL_GHOST) && (flags&NASD_ODC_L_FORCE)) {
    /*
     * Entry is currently ghosted, so we need to go get a page
     * for it.
     * Keep this codepath in sync with nasd_odc_block_lookup(),
     * nasd_odc_force_alloc().
     *
     * NOTE(review): NASD_CL_FALLOC is set here but only
     * NASD_CL_ALLOC is cleared on success, and neither flag is
     * cleared when the grab fails. Confirm against
     * nasd_odc_block_lookup() that this is intended.
     */
    NASD_ODC_CSINC_TYPE(miss,type);
    NASD_ODC_CSINC_TYPE(force,type);
    ent->lru_flags |= NASD_CL_ALLOC|NASD_CL_FALLOC;
    rc = nasd_odc_block_grab(&grab_ent, ((flags&NASD_ODC_L_BLOCK) ? 1 : 0));
    if (rc == NASD_SUCCESS) {
      ent->lru_flags &= ~(NASD_CL_ALLOC|NASD_CL_GHOST);
      /* take the grabbed block's buffers; it becomes the ghost */
      nasd_odc_io_ent_swap_bufs(ent, grab_ent);
      *crp = 1;
      grab_ent->lru_flags |= NASD_CL_GHOST;
      nasd_odc_put_ent(grab_ent);
      /* wake anyone waiting for this allocation to finish */
      NASD_BROADCAST_COND(ent->acond);
      if (node_ent) {
        NASD_ODC_ASSOC_ENT_WITH_NODE(ent,type,node_ent);
      }
      else {
        ent->irefcnt = 0;
        ent->onext = ent->oprev = ent;
        ent->node_ent = NULL;
      }
#if NASD_DRIVE_WIRE_NPT == 1
      if (type == NASD_ODC_T_NPT2) {
        /* mark it wired */
        ent->lru_flags |= NASD_CL_WIRED;
        /* record in wireq (list of wired blocks) */
        NASD_ODC_Q_INS_NOLOCK(&nasd_odc_wireq,ent,w);
      }
#endif /* NASD_DRIVE_WIRE_NPT == 1 */
#if NASD_DRIVE_WIRE_NPT == 2
      if ((type == NASD_ODC_T_NPT1) || (type == NASD_ODC_T_NPT2)) {
        /* mark it wired */
        ent->lru_flags |= NASD_CL_WIRED;
        /* record in wireq (list of wired blocks) */
        NASD_ODC_Q_INS_NOLOCK(&nasd_odc_wireq,ent,w);
      }
#endif /* NASD_DRIVE_WIRE_NPT == 2 */
    }
  }

  return(rc);
}

/*
 * Search for a block in the logical cache. Be sure to get
 * a page for it.
 *
 * node_ent  node entry the block belongs to, or NULL
 * flags     NASD_ODC_L_* flags, passed to the lookup and to
 *           nasd_odc_block_get_part2()
 * entp      receives the entry, or NULL when none is returned
 * nid       object identifier of the block
 * offset    offset of the block
 * type      block type
 * ichain    caller's I/O chain, handed to nasd_odc_block_get_part2()
 *
 * Takes and releases the LRU lock internally.
 *
 * Returns the status of the logical lookup if that fails,
 * NASD_NO_MEM if the entry's page allocation failed and could not
 * be (or was not asked to be) forced, and otherwise the status of
 * nasd_odc_block_get_part2().
 */
nasd_status_t
nasd_odc_block_get_logical(
  nasd_odc_ent_t      *node_ent,
  int                  flags,
  nasd_odc_ent_t     **entp,
  nasd_identifier_t    nid,
  nasd_offset_t        offset,
  int                  type,
  nasd_odc_ent_t      *ichain)
{
  nasd_odc_ent_t *ent;
  nasd_status_t rc;
  int cr;

  NASD_ODC_CSINC_TYPE(get_logical,type);
  if (node_ent) {
    NASD_ODC_CHECK_NODE_ENT(node_ent);
  }

  *entp = NULL;
  cr = 0;

  NASD_ODC_LRU_LOCK();

  rc = nasd_odc_block_lookup_logical(node_ent, flags, &ent,
    nid, offset, type, &cr);
  if (rc || (ent == NULL)) {
    goto done_get_logical;
  }
  /*
   * If block is being allocated, wait for that to complete.
   * If said alloc failed, bail.
   */
  while ((ent->lru_flags&NASD_CL_ALLOC)||(ent->lru_flags&NASD_CL_FALLOC)) {
    NASD_WAIT_COND(ent->acond,nasd_odc_lru_mutex);
  }
  if ((ent->lru_flags&NASD_CL_NOALLOC) || (ent->lru_flags & NASD_CL_AERROR)) {
    /*
     * Someone else started to create this ent, but failed
     * because they were unwilling to block in order to
     * complete the allocation. They were weak. We shall
     * seize control and block to force-complete the
     * allocation. If we fail, we will do so with a
     * hard error, and tell the world.
     */
    if ((flags&NASD_ODC_L_FORCE) && (!(ent->lru_flags&NASD_CL_AERROR))) {
      ent->lru_flags &= ~NASD_CL_NOALLOC;
      ent->lru_flags |= NASD_CL_FALLOC;
      cr = 1;
      rc = nasd_odc_force_alloc(ent, node_ent);
      if (rc == NASD_SUCCESS) {
        goto done_get_logical;
      }
      /*
       * We have failed. Since we took responsibility for
       * the block, we must wait until everyone else
       * understands that they must give up, then free
       * it up.
       */
      ent->lru_flags &= ~NASD_CL_FALLOC;
      ent->lru_flags |= NASD_CL_AERROR;
      while(ent->refcnt > 1) {
        /* wait for everyone else to get out */
        NASD_BROADCAST_COND(ent->acond);
        NASD_WAIT_COND(ent->cond,nasd_odc_lru_mutex);
      }
      ent->refcnt = 0;
      /* remove from hash bucket */
      NASD_ODC_Q_DEQ_NOLOCK(ent,h);
      ent->hbucket->size--;
      ent->hbucket = NULL;
      /*
       * Remove from the logical index as well, exactly as the
       * matching failure path in nasd_odc_block_get_part1() does.
       * We found this entry through the logical index, so leaving
       * it there after nasd_odc_put_ent() would leave a stale
       * entry on the logical hash chain.
       */
      if (NASD_ODC_TYPE_INDEX_LOGICAL(ent->type)) {
        NASD_ODC_LOGICAL_INDEX_DEL(ent);
      }
      else {
        NASD_ASSERT(!(ent->lru_flags & NASD_CL_LINDEX));
      }
      nasd_odc_put_ent(ent);
      ent = NULL;
      rc = NASD_NO_MEM;
      goto done_get_logical;
    }
    /* not allowed to force, or a hard error is already recorded */
    NASD_ODC_LRU_UNLOCK();
    nasd_odc_block_release(ent);
    *entp = NULL;
    return(NASD_NO_MEM);
  }

done_get_logical:
  NASD_ODC_LRU_UNLOCK();

  *entp = ent;

  if ((rc == NASD_SUCCESS) && ent) {
    /* load/validate phase, using the identity recorded in the entry */
    rc = nasd_odc_block_get_part2(node_ent, ent->blkno, flags,
      entp, ent->identifier, ent->offset, ent->type, ichain,
      &cr);
  }

  return(rc);
}

/*
 * Perform lookup and sanity check portion of block get.
 *
 * Looks the block up with nasd_odc_block_lookup(), waits for any
 * in-progress allocation of the entry to finish, and deals with
 * an allocation that failed (forcing it through when the caller
 * passed NASD_ODC_L_FORCE).
 *
 * entp  receives the entry on success, NULL on any failure
 * crp   set to 0 on entry, possibly updated by the lookup, and
 *       set to 1 when the allocation was forced through here
 *
 * Returns NASD_SUCCESS, the nonzero status of the lookup,
 * NASD_BLOCK_NOT_CACHED if the lookup yielded no entry, or
 * NASD_NO_MEM if the entry's allocation failed.
 */
nasd_status_t
nasd_odc_block_get_part1(
  nasd_odc_ent_t      *node_ent,
  nasd_blkno_t         blkno,
  int                  flags,
  nasd_odc_ent_t     **entp,
  nasd_identifier_t    nid,
  nasd_offset_t        offset,
  int                  type,     /* block type */
  nasd_odc_ent_t      *ichain,   /* I/O chain */
  int                 *crp)
{
  nasd_odc_ent_t *ent;
  nasd_status_t rc;

  NASD_ODC_CSINC_TYPE(get1,type);
  if (node_ent) {
    NASD_ODC_CHECK_NODE_ENT(node_ent);
  }

  *crp = 0;

  rc = nasd_odc_block_lookup(node_ent, blkno, flags, &ent,
    nid, offset, type, crp);
  if (rc || (ent == NULL)) {
    *entp = NULL;
    return(rc ? rc : NASD_BLOCK_NOT_CACHED);
  }

  NASD_ODC_LRU_LOCK();

  /* let any allocation already underway on this entry finish */
  while (ent->lru_flags&(NASD_CL_ALLOC|NASD_CL_FALLOC)) {
    NASD_WAIT_COND(ent->acond,nasd_odc_lru_mutex);
  }

  if (!(ent->lru_flags&(NASD_CL_NOALLOC|NASD_CL_AERROR))) {
    /* common case: the entry is properly allocated */
    NASD_ODC_LRU_UNLOCK();
    *entp = ent;
    return(NASD_SUCCESS);
  }

  /*
   * The allocation of this entry did not complete. Unless we
   * were asked to force it and no hard error has been recorded
   * yet, drop our reference and give up.
   */
  if (!(flags&NASD_ODC_L_FORCE) || (ent->lru_flags&NASD_CL_AERROR)) {
    NASD_ODC_LRU_UNLOCK();
    nasd_odc_block_release(ent);
    *entp = NULL;
    return(NASD_NO_MEM);
  }

  /*
   * Whoever began creating this entry was not willing to block
   * for the allocation. We are, so take over and force it.
   */
  ent->lru_flags &= ~NASD_CL_NOALLOC;
  ent->lru_flags |= NASD_CL_FALLOC;
  rc = nasd_odc_force_alloc(ent, node_ent);
  if (rc == NASD_SUCCESS) {
    *crp = 1;
    NASD_ODC_LRU_UNLOCK();
    *entp = ent;
    return(NASD_SUCCESS);
  }

  /*
   * The forced allocation failed too. Having claimed the entry,
   * it is our job to flag the hard error, wait for every other
   * holder to notice and let go, and then dispose of the entry.
   */
  ent->lru_flags &= ~NASD_CL_FALLOC;
  ent->lru_flags |= NASD_CL_AERROR;
  while(ent->refcnt > 1) {
    /* prod the allocation waiters, then wait for refs to drain */
    NASD_BROADCAST_COND(ent->acond);
    NASD_WAIT_COND(ent->cond,nasd_odc_lru_mutex);
  }
  ent->refcnt = 0;

  /* unhook from the hash bucket */
  NASD_ODC_Q_DEQ_NOLOCK(ent,h);
  ent->hbucket->size--;
  ent->hbucket = NULL;

  /* unhook from the logical index, if this type lives there */
  if (NASD_ODC_TYPE_INDEX_LOGICAL(ent->type)) {
    NASD_ODC_LOGICAL_INDEX_DEL(ent);
  }
  else {
    NASD_ASSERT(!(ent->lru_flags & NASD_CL_LINDEX));
  }

  nasd_odc_put_ent(ent);
  NASD_ODC_LRU_UNLOCK();

  *entp = NULL;
  return(NASD_NO_MEM);
}

/*
 * Perform load-and-validate phase of block_get
 *
 * node_ent     optional node entry; only sanity-checked here
 * blkno        block number of *entp; also selects the region
 *              considered for read-ahead
 * flags        NASD_ODC_L_* flags; LOAD, MLOAD, NOPRE and BLOCK are
 *              the ones examined in this function
 * entp         in: the entry obtained by the lookup phase
 * nid, offset  not referenced in this function
 * type         block type; LAYOUT_STATIC, REFCNT, NPT1 and NPT2
 *              never trigger read-ahead of neighboring blocks
 * ichain       caller's I/O chain, used when NASD_ODC_L_MLOAD is set
 * crp          in: as reported by the lookup phase; nonzero gates
 *              both the read-ahead and the start of I/O on *entp
 *
 * Outline:
 *  1. Work out the first/last block of the region holding blkno.
 *  2. If read-ahead applies, shrink that window past blocks whose
 *     refcount slot is zero.
 *  3. Queue anonymous (NASD_ODC_T_ANON) reads for blocks before
 *     blkno.
 *  4. Start or queue the read for the requested block itself.
 *  5. Queue anonymous reads for blocks after blkno.
 *  6. If the I/O chain is local, issue it.
 *
 * Always returns NASD_SUCCESS; failures while reading ahead only
 * end the read-ahead and bump statistics.
 */
nasd_status_t
nasd_odc_block_get_part2(
  nasd_odc_ent_t      *node_ent,
  nasd_blkno_t         blkno,
  int                  flags,
  nasd_odc_ent_t     **entp,
  nasd_identifier_t    nid,
  nasd_offset_t        offset,
  int                  type,     /* block type */
  nasd_odc_ent_t      *ichain,   /* I/O chain */
  int                 *crp)
{
  nasd_odc_ent_t *ent, *anon_ent, anon_ich, *anon_ichp, bich, *ref_ent;
  nasd_blkno_t anon_blk, base_blk, top_blk, ref_blk, cur_ref_blk;
  int cr, anon_flags, anon_cr, do_anon_loads, ref_slot, abf;
  nasd_blkno_t blk;
  nasd_status_t rc;

  NASD_ODC_CSINC_TYPE(get2,type);
  if (node_ent) {
    NASD_ODC_CHECK_NODE_ENT(node_ent);
  }

  cr = *crp;
  ent = *entp;

  /* ref_ent caches the most recently fetched refcount block */
  ref_ent = NULL;
  cur_ref_blk = 0;

  /* read-ahead lookups never get NASD_ODC_L_BLOCK */
  anon_flags = flags & ~NASD_ODC_L_BLOCK;
  /* first and last block of the region containing blkno */
  base_blk = blkno - ((blkno-1) % nasd_od_region_blocks);
  top_blk = base_blk + nasd_od_region_blocks - 1;
  if (flags&NASD_ODC_L_MLOAD) {
    /* caller issues the I/O: queue straight onto its chain */
    anon_ichp = ichain;
    do_anon_loads = 1;
  }
  else {
    /* local chain (empty circular list headed by anon_ich) */
    anon_ich.inext = anon_ich.iprev = &anon_ich;
    anon_ichp = &anon_ich;
    if (flags&NASD_ODC_L_LOAD)
      do_anon_loads = 1;
    else
      do_anon_loads = 0;
  }
  if ((type == NASD_ODC_T_LAYOUT_STATIC)
    || (type == NASD_ODC_T_REFCNT)
    || (type == NASD_ODC_T_NPT1)
    || (type == NASD_ODC_T_NPT2)
    || (flags & NASD_ODC_L_NOPRE))
  {
    do_anon_loads = 0;
  }

  if (do_anon_loads && cr && nasd_odc_read_regions) {
    /*
     * find first and last blocks in region that are ref'd
     */
    /* pull top_blk down past trailing blocks with a zero refcount */
    for(blk=top_blk;blk>blkno;blk--) {
      ref_blk = NASD_ODC_REFBLK_OF(blk);
      ref_slot = NASD_ODC_OFF_IN_REFBLK(blk);
      if ((ref_blk != cur_ref_blk) || (ref_ent == NULL)) {
        /* need a different refcount block: swap the cached one */
        if (ref_ent) {
          NASD_ODC_UNLOCK_BLOCK(ref_ent);
          nasd_odc_block_release(ref_ent);
        }
        ref_ent = NULL;
        cur_ref_blk = ref_blk;
        rc = nasd_odc_block_get(NULL, cur_ref_blk,
          NASD_ODC_L_FORCE|NASD_ODC_L_BLOCK|NASD_ODC_L_LOAD,
          &ref_ent, NASD_ID_NULL, 0,
          NASD_ODC_T_REFCNT, NULL);
        if (rc) {
          ref_ent = NULL;
          NASD_ODC_CSINC(anon_before_fail);
          break;
        }
        NASD_ODC_LOCK_BLOCK(ref_ent);
        nasd_odc_wait_not_busy_invalid(ref_ent);
      }
      if (ref_ent->data.cnt[ref_slot] == 0) {
        top_blk--;
      }
      else {
        break;
      }
    }
    /* push base_blk up past leading blocks with a zero refcount */
    for(blk=base_blk;blk<blkno;blk++) {
      ref_blk = NASD_ODC_REFBLK_OF(blk);
      ref_slot = NASD_ODC_OFF_IN_REFBLK(blk);
      if ((ref_blk != cur_ref_blk) || (ref_ent == NULL)) {
        if (ref_ent) {
          NASD_ODC_UNLOCK_BLOCK(ref_ent);
          nasd_odc_block_release(ref_ent);
        }
        ref_ent = NULL;
        cur_ref_blk = ref_blk;
        rc = nasd_odc_block_get(NULL, cur_ref_blk,
          NASD_ODC_L_FORCE|NASD_ODC_L_BLOCK|NASD_ODC_L_LOAD,
          &ref_ent, NASD_ID_NULL, 0, NASD_ODC_T_REFCNT, NULL);
        if (rc) {
          ref_ent = NULL;
          NASD_ODC_CSINC(anon_before_fail);
          break;
        }
        NASD_ODC_LOCK_BLOCK(ref_ent);
        nasd_odc_wait_not_busy_invalid(ref_ent);
      }
      if (ref_ent->data.cnt[ref_slot] == 0) {
        base_blk++;
      }
      else {
        break;
      }
    }
  }

  if (do_anon_loads && cr && nasd_odc_read_regions && (base_blk < blkno)) {
    /*
     * We missed finding this entry. Pull in other
     * blocks from the region. This chunk of code
     * pulls in blocks before blkno. Start at
     * blkno and work backwards. Form reversed
     * I/O list on bich (it will be forward-facing
     * when we're done, because we're going backwards),
     * and add the whole thing to anon_ichp.
     */
    bich.inext = bich.iprev = &bich;
    for(anon_blk=blkno-1;anon_blk>=base_blk;anon_blk--) {
      ref_blk = NASD_ODC_REFBLK_OF(anon_blk);
      ref_slot = NASD_ODC_OFF_IN_REFBLK(anon_blk);
      if ((ref_blk != cur_ref_blk) || (ref_ent == NULL)) {
        if (ref_ent) {
          NASD_ODC_UNLOCK_BLOCK(ref_ent);
          nasd_odc_block_release(ref_ent);
        }
        ref_ent = NULL;
        cur_ref_blk = ref_blk;
        rc = nasd_odc_block_get(NULL, cur_ref_blk,
          NASD_ODC_L_FORCE|NASD_ODC_L_BLOCK|NASD_ODC_L_LOAD,
          &ref_ent, NASD_ID_NULL, 0, NASD_ODC_T_REFCNT, NULL);
        if (rc) {
          ref_ent = NULL;
          NASD_ODC_CSINC(anon_before_fail);
          break;
        }
        NASD_ODC_LOCK_BLOCK(ref_ent);
        nasd_odc_wait_not_busy_invalid(ref_ent);
      }
      if (ref_ent->data.cnt[ref_slot] == 0) {
        /* block in question is unused, don't bother */
        NASD_ODC_CSINC(anon_before_noref);
        if (nasd_odc_pfcontig_only)
          break;
        if (nasd_odc_pfcontig_useless == 0)
          continue;
      }
      rc = nasd_odc_block_get_part1(NULL, anon_blk,
        anon_flags, &anon_ent, NASD_ID_NULL, 0,
        NASD_ODC_T_ANON, ichain, &anon_cr);
      if (rc) {
        NASD_ODC_CSINC(anon_before_fail);
        break;
      }
      if (anon_cr == 0) {
        /*
         * NOTE(review): when nasd_odc_pfcontig_useless is nonzero
         * control falls through and keeps using anon_ent after the
         * release below -- confirm that is intended.
         */
        NASD_ODC_CSINC(anon_before_incore);
        nasd_odc_block_release(anon_ent);
        if (nasd_odc_pfcontig_only || nasd_odc_pfcontig_noskip)
          break;
        if (nasd_odc_pfcontig_useless == 0)
          continue;
      }
      NASD_ODC_CSINC(anon_before);
      NASD_ODC_LOCK_BLOCK(anon_ent);
      if ((anon_ent->data_flags&NASD_CD_INVALID) &&
        (!(anon_ent->data_flags&(NASD_CD_BUSY|NASD_CD_MBUSY))))
      {
        /*
         * We've caused this block to be added.
         * We must start I/O. Queue onto I/O
         * chain (this will be the caller chain
         * if we're in MLOAD mode, otherwise
         * it's a local chain we'll deal with
         * later).
         */
        /* head-insert on bich */
        anon_ent->inext = bich.inext;
        anon_ent->iprev = &bich;
        anon_ent->iprev->inext = anon_ent;
        anon_ent->inext->iprev = anon_ent;
        anon_ent->iocb = NULL;
        anon_ent->iocb_arg = NULL;
        anon_ent->data_flags |= NASD_CD_BUSY|NASD_CD_MBUSY|NASD_CD_ANONF;
        NASD_ASSERT(!(anon_ent->io_flags&(NASD_CI_IOQ|NASD_CI_DISPATCH)));
        abf = 0;
      }
      else {
        /* block is valid or already has I/O pending: stop here */
        abf = 1;
      }
      NASD_ODC_UNLOCK_BLOCK(anon_ent);
      if (abf) {
        break;
      }
    }
    if (bich.inext != &bich) {
      /* tail insert our list on anon_ichp */
      bich.inext->iprev = anon_ichp->iprev;
      anon_ichp->iprev->inext = bich.inext;
      bich.iprev->inext = anon_ichp;
      anon_ichp->iprev = bich.iprev;
    }
  }

  NASD_ODC_LOCK_BLOCK(ent);
  /*
   * Block exists, we hold the lock. Must we start an I/O?
   */
  if (ent->data_flags&NASD_CD_INVALID) {
    if (cr || (ent->type == NASD_ODC_T_LAYOUT_STATIC)) {
      if (flags&NASD_ODC_L_LOAD) {
        /*
         * If caller does not specify load, it is responsible for
         * making the block become valid at some point. The only
         * case where that should happen is if the caller is
         * completely overwriting the old block, and therefore
         * doesn't want to burn time and cycles loading the data.
         *
         * We've generated this entry, so no one else could possibly
         * be starting an I/O on it, yet.
         */
        /*
         * We're not busy, so we get to touch the ilist bits here
         */
        if (do_anon_loads) {
          /* tail-insert on the chain shared with the read-ahead blocks */
          ent->iprev = anon_ichp->iprev;
          ent->inext = anon_ichp;
          ent->iprev->inext = ent;
          ent->inext->iprev = ent;
          ent->iocb = NULL;
          ent->iocb_arg = NULL;
          ent->data_flags |= NASD_CD_BUSY|NASD_CD_MBUSY;
          NASD_ASSERT(!(ent->io_flags&(NASD_CI_IOQ|NASD_CI_DISPATCH)));
        }
        else {
          /* no read-ahead: issue a lone read for this block right now */
          ent->inext = ent->iprev = NULL;
          ent->iocb = NULL;
          ent->iocb_arg = NULL;
          ent->data_flags |= NASD_CD_BUSY;
          NASD_ASSERT(!(ent->io_flags&(NASD_CI_IOQ|NASD_CI_DISPATCH)));
          /* the block lock is dropped around the enqueue */
          NASD_ODC_UNLOCK_BLOCK(ent);
          nasd_od_io_enq(ent, NASD_U_READ, NASD_IO_PRI_MED);
          NASD_ODC_LOCK_BLOCK(ent);
        }
      }
      else if (flags&NASD_ODC_L_MLOAD) {
        /*
         * Caller is now responsible for performing load
         * operation on block (and has promised to do so).
         * We'll just queue this onto the caller's mload chain.
         */
        /*
         * We're not busy, so we get to touch the ilist bits here
         */
        ent->iprev = ichain->iprev;
        ent->inext = ichain;
        ent->iprev->inext = ent;
        ent->inext->iprev = ent;
        ent->iocb = NULL;
        ent->iocb_arg = NULL;
        ent->data_flags |= NASD_CD_BUSY|NASD_CD_MBUSY;
        NASD_ASSERT(!(ent->io_flags&(NASD_CI_IOQ|NASD_CI_DISPATCH)));
      }
#if 0
      if ((flags&(NASD_ODC_L_LOAD|NASD_ODC_L_MLOAD)) && ((ent->type == NASD_ODC_T_NODE) || (ent->type == NASD_ODC_T_IND) || (ent->type == NASD_ODC_T_DATA))) {
        nasd_blkno_t regid;
        nasd_nl_reg_blkno_to_regid(ent->blkno, &regid);
        if (ent->type == NASD_ODC_T_NODE) {
          nasd_printf("fault reg %u node blk %u\n", regid, ent->blkno);
        }
        else {
          nasd_printf("fault reg %u\n", regid);
        }
      }
      if ((flags&(NASD_ODC_L_LOAD|NASD_ODC_L_MLOAD)) && ((ent->type == NASD_ODC_T_NPT1) || (ent->type == NASD_ODC_T_NPT2))) {
        nasd_printf("fault npt %u blk %u\n", ent->blkno/nasd_od_region_blocks, ent->blkno);
      }
#endif
    }
  }

  NASD_ODC_UNLOCK_BLOCK(ent);

  if (do_anon_loads && cr && nasd_odc_read_regions) {
    /*
     * We missed finding this entry. Pull in other
     * blocks from the region. This chunk of code
     * pulls in blocks after blkno.
     */
    for(anon_blk=blkno+1;
      (anon_blk<=top_blk)&&(anon_blk<=nasd_od_blocks);
      anon_blk++)
    {
      ref_blk = NASD_ODC_REFBLK_OF(anon_blk);
      ref_slot = NASD_ODC_OFF_IN_REFBLK(anon_blk);
      if ((ref_blk != cur_ref_blk) || (ref_ent == NULL)) {
        if (ref_ent) {
          NASD_ODC_UNLOCK_BLOCK(ref_ent);
          nasd_odc_block_release(ref_ent);
        }
        ref_ent = NULL;
        cur_ref_blk = ref_blk;
        rc = nasd_odc_block_get(NULL, cur_ref_blk,
          NASD_ODC_L_FORCE|NASD_ODC_L_BLOCK|NASD_ODC_L_LOAD,
          &ref_ent, NASD_ID_NULL, 0, NASD_ODC_T_REFCNT, NULL);
        if (rc) {
          ref_ent = NULL;
          /*
           * NOTE(review): this is the "after" loop but it bumps
           * anon_before_fail; the part1 failure below bumps
           * anon_after_fail. Possibly a copy/paste slip -- confirm.
           */
          NASD_ODC_CSINC(anon_before_fail);
          break;
        }
        NASD_ODC_LOCK_BLOCK(ref_ent);
        nasd_odc_wait_not_busy_invalid(ref_ent);
      }
      if (ref_ent->data.cnt[ref_slot] == 0) {
        /* block in question is unused, don't bother */
        NASD_ODC_CSINC(anon_after_noref);
        if (nasd_odc_pfcontig_afterskip_policy == 2)
          continue;
        if (nasd_odc_pfcontig_only)
          break;
        if (nasd_odc_pfcontig_useless == 0)
          continue;
      }
      rc = nasd_odc_block_get_part1(NULL, anon_blk,
        anon_flags, &anon_ent, NASD_ID_NULL, 0,
        NASD_ODC_T_ANON, ichain, &anon_cr);
      if (rc) {
        NASD_ODC_CSINC(anon_after_fail);
        break;
      }
      if (anon_cr == 0) {
        /*
         * NOTE(review): as in the "before" loop, a nonzero
         * nasd_odc_pfcontig_useless lets control fall through and
         * use anon_ent after it has been released -- confirm.
         */
        NASD_ODC_CSINC(anon_after_incore);
        nasd_odc_block_release(anon_ent);
        if (nasd_odc_pfcontig_afterskip_policy)
          continue;
        if (nasd_odc_pfcontig_only || nasd_odc_pfcontig_noskip)
          break;
        if (nasd_odc_pfcontig_useless == 0)
          continue;
      }
      NASD_ODC_CSINC(anon_after);
      NASD_ODC_LOCK_BLOCK(anon_ent);
      if ((anon_ent->data_flags&NASD_CD_INVALID) &&
        (!(anon_ent->data_flags&(NASD_CD_BUSY|NASD_CD_MBUSY))))
      {
        /*
         * We've caused this block to be added.
         * We must start I/O. Queue onto I/O
         * chain (this will be the caller chain
         * if we're in MLOAD mode, otherwise
         * it's a local chain we'll deal with
         * later).
         */
        /* tail-insert on anon_ichp */
        anon_ent->iprev = anon_ichp->iprev;
        anon_ent->inext = anon_ichp;
        anon_ent->iprev->inext = anon_ent;
        anon_ent->inext->iprev = anon_ent;
        anon_ent->iocb = NULL;
        anon_ent->iocb_arg = NULL;
        anon_ent->data_flags |= NASD_CD_BUSY|NASD_CD_MBUSY|NASD_CD_ANONF;
        NASD_ASSERT(!(anon_ent->io_flags&(NASD_CI_IOQ|NASD_CI_DISPATCH)));
        abf = 0;
      }
      else {
        /* block is valid or already has I/O pending: stop here */
        abf = 1;
      }
      NASD_ODC_UNLOCK_BLOCK(anon_ent);
      if (abf) {
        break;
      }
    }

    /* done with refcount blocks: drop the cached one, if any */
    if (ref_ent) {
      NASD_ODC_UNLOCK_BLOCK(ref_ent);
      nasd_odc_block_release(ref_ent);
    }
  }

  if (do_anon_loads && cr) {
    if (anon_ichp == &anon_ich) {
      /*
       * Our I/O list of anonymous blocks is locally created/maintained.
       * Issue the I/Os here. (If it is not local, then the caller is
       * getting the I/Os back sorted into the passed-in I/O list.)
       */
      if (anon_ich.inext != &anon_ich) {
        NASD_ODC_CSINC(anon_issue_noorder);
        /* list is non-empty */
        /* disconnect I/O chain */
        anon_ich.iprev->inext = NULL;
        anon_ich.inext->iprev = NULL;
        /* issue I/O */
        nasd_od_io_enq(anon_ich.inext, NASD_U_READ, NASD_IO_PRI_MED);
      }
    }
  }

  return(NASD_SUCCESS);
}

/*
 * The usual front-end to lookup: hands back a block
 * with data attached, optionally performing the I/O
 * needed to make that data valid.
 *
 * Runs nasd_odc_block_get_part1() (lookup and sanity
 * checks) followed by nasd_odc_block_get_part2()
 * (load and validate).
 *
 * Returns with only a ref held on the new ent.
 * On a part1 failure *entp is not written; on a part2
 * failure the ent is released and *entp is set to NULL.
 */
nasd_status_t
nasd_odc_block_get(
  nasd_odc_ent_t      *node_ent,
  nasd_blkno_t         blkno,
  int                  flags,
  nasd_odc_ent_t     **entp,
  nasd_identifier_t    nid,
  nasd_offset_t        offset,
  int                  type,     /* block type */
  nasd_odc_ent_t      *ichain)   /* I/O chain */
{
  nasd_odc_ent_t *ent;
  nasd_status_t rc;
  int cr;

  /* phase one: find (or create) the entry */
  rc = nasd_odc_block_get_part1(node_ent, blkno, flags, &ent,
    nid, offset, type, ichain, &cr);
  if (rc) {
    return(rc);
  }

  /* phase two: arrange for its data to be loaded */
  rc = nasd_odc_block_get_part2(node_ent, blkno, flags, &ent,
    nid, offset, type, ichain, &cr);
  if (rc != NASD_SUCCESS) {
    nasd_odc_block_release(ent);
    *entp = NULL;
    return(rc);
  }

  *entp = ent;
  return(NASD_SUCCESS);
}

/*
 * Actually eject a block from the cache: invalidate it,
 * put it on the unused queue, and unhook it from the hash
 * bucket, the logical index, and its node's member list.
 *
 * Assumes caller holds lru lock, refcnt is 0
 * (so no one should hold ent lock)
 */
void
nasd_odc_block_eject_real(
  nasd_odc_ent_t  *ent)
{
  nasd_odc_ent_t *ne;
  nasd_status_t rc;
  int otype;

  /* unreferenced, off the LRU, and flagged for removal */
  NASD_ASSERT(ent->refcnt == 0);
  NASD_ASSERT(ent->irefcnt == 0);
  NASD_ASSERT(!(ent->lru_flags&NASD_CL_LRU_Q));
  NASD_ASSERT(ent->lru_flags&NASD_CL_REMOVING);

  /* remember the type; it is reset to FREE further down */
  otype = ent->type;

  /*
   * Strictly speaking, taking the block lock here breaks
   * the locking protocol. It is tolerated because, with no
   * references outstanding, nobody else should be contending
   * for it.
   */
  NASD_ODC_LOCK_BLOCK(ent);
  if (ent->data_flags&NASD_CD_BUSY) {
    /*
     * A ref is held on a block for as long as it is dirty,
     * so a busy block should never get here.
     */
    NASD_PANIC();
    rc = NASD_FAIL;
    for(;;) {
      if (rc == NASD_FAIL) {
        rc = nasd_od_io_try_raise_pri(ent, NASD_IO_PRI_MED);
      }
      NASD_WAIT_COND(ent->cond,ent->lock);
      if (!(ent->data_flags&NASD_CD_BUSY)) {
        break;
      }
    }
  }
  NASD_ODC_UNLOCK_BLOCK(ent);

  NASD_DIRTY_LOCK();
  if (ent->dirty_flags&NASD_CR_DIRTY_Q) {
    NASD_PANIC();
    /* writeback still queued: cancel it */
    nasd_odc_dirty_eject(ent);
  }
  if (ent->dirty_flags&NASD_CR_DIRTY) {
    nasd_odc_dirty_eject(ent);
  }
  NASD_DIRTY_UNLOCK();

  /* the entry is now a free, data-less block on the unused queue */
  ent->data_flags |= NASD_CD_INVALID;
  ent->type = NASD_ODC_T_FREE;
  NASD_ODC_Q_INS(&nasd_odc_unusedq,ent,l);
  ent->lru_flags &= ~NASD_CL_REMOVING;

  /* unhook from the hash bucket */
  NASD_ODC_Q_DEQ_NOLOCK(ent,h);
  ent->hbucket->size--;
  ent->hbucket = NULL;

  /* unhook from the identity (logical) hash, by its former type */
  if (!NASD_ODC_TYPE_INDEX_LOGICAL(otype)) {
    NASD_ASSERT(!(ent->lru_flags & NASD_CL_LINDEX));
  }
  else {
    NASD_ODC_LOGICAL_INDEX_DEL(ent);
  }

  /* unhook from the node-member list, when there is one */
  if (ent->node_ent == NULL) {
    NASD_ASSERT(ent->irefcnt == 0);
    return;
  }
  NASD_ODC_Q_DEQ_NOLOCK(ent,o);
  ne = ent->node_ent;
  ent->node_ent = NULL;
  LOSE_IREF(ne);
}

/*
 * Flag a block for ejection.
 *
 * Assumes caller holds a ref on ent. Only the
 * NASD_CL_REMOVING flag is set here (under the LRU
 * lock); the block is to eject once that ref is gone.
 */
void
nasd_odc_block_eject(
  nasd_odc_ent_t  *ent)
{
  NASD_ODC_LRU_LOCK();
  ent->lru_flags = ent->lru_flags | NASD_CL_REMOVING;
  NASD_ODC_LRU_UNLOCK();
}

/*
 * If block #blkno is in the cache, get rid of it.
 * (Not for refblocks)
 *
 * Takes the LRU lock, searches the hash bucket for the block's
 * real sector number, and handles the first match by case:
 *  - external refs held: flag NASD_CL_REMOVING and leave it to
 *    the holders, except that a block whose only ref belongs to
 *    the dirty queue has its writeback cancelled and is treated
 *    as unreferenced;
 *  - only internal refs (irefcnt): pull it off the LRU and flag
 *    NASD_CL_REMOVING;
 *  - no refs at all: pull it off the LRU and eject immediately.
 * Does nothing if the block is not cached.
 */
void
nasd_odc_block_eject_by_num(
  nasd_blkno_t  blkno)
{
  nasd_odc_oq_t *bucket;
  nasd_odc_ent_t *e;
  nasd_blkno_t real;

  real = nasd_odc_real_sectno(blkno, NASD_ODC_T_DATA); /* cheat */
  bucket = &nasd_odc_bucket_ents[NASD_ODC_BUCKET(real)];

  NASD_ODC_LRU_LOCK();
  for(e=bucket->head.hnext;e!=&bucket->head;e=e->hnext) {
    if (e->real_sectno == real) {
      if (e->refcnt) {
        if (e->refcnt == 1) {
          /* the lone ref may be the one held for the dirty queue */
          NASD_DIRTY_LOCK();
          if (e->dirty_flags&NASD_CR_DIRTY_Q) {
            /*
             * Block is dirty, and awaits writing.
             * Never write it back.
             */
            NASD_ASSERT(!(e->dirty_flags&NASD_CR_DIRTYW_Q));
            e->refcnt = 0;
            nasd_odc_dirty_eject(e);
            NASD_ASSERT(!(e->lru_flags&NASD_CL_LRU_Q));
            NASD_DIRTY_UNLOCK();
            /* continue as for an unreferenced block */
            if (e->irefcnt)
              goto irefonly;
            else
              goto noref;
          }
          NASD_DIRTY_UNLOCK();
        }
        /* someone holds it: just flag it for removal */
        e->lru_flags |= NASD_CL_REMOVING;
      }
      else if (e->irefcnt) {
        /*
         * No external holders on this block, but
         * there is an internal holder.
         */
        NASD_ASSERT(e->lru_flags&NASD_CL_LRU_Q);
        e->lru_flags &= ~NASD_CL_LRU_Q;
        NASD_ODC_Q_DEQ_NOLOCK(e,l);
        /*
         * NOTE(review): the dirty-queue path above jumps here with
         * the block asserted NOT on an LRU queue, yet the code below
         * still requires a valid lru_num and decrements that LRU's
         * size. The "noref" label, by contrast, sits after the
         * equivalent decrement. Confirm this label placement.
         */
irefonly:
        NASD_ASSERT((e->lru_num >= 0) && (e->lru_num < NASD_ODC_NLRUS));
        nasd_odc_lru[e->lru_num].size--;
        e->lru_num = NASD_ODC_LRU_NONE;
        NASD_ASSERT(!(e->lru_flags&NASD_CL_LRU_Q));
        e->lru_flags |= NASD_CL_REMOVING;
      }
      else {
        /* completely unreferenced: take off the LRU and eject now */
        NASD_ASSERT(e->lru_flags&NASD_CL_LRU_Q);
        e->lru_flags &= ~NASD_CL_LRU_Q;
        NASD_ODC_Q_DEQ_NOLOCK(e,l);
        nasd_odc_lru[e->lru_num].size--;
        e->lru_num = NASD_ODC_LRU_NONE;
noref:
        /* nasd_odc_block_eject_real() asserts the REMOVING flag */
        e->lru_flags |= NASD_CL_REMOVING;
        nasd_odc_block_eject_real(e);
      }
      /* only the first matching entry is handled */
      break;
    }
  }
  NASD_ODC_LRU_UNLOCK();
}

/*
 * Insert an entry into the hash table, in the bucket
 * selected by its real sector number. The LRU lock
 * (which also guards the hash buckets) is taken and
 * released here.
 */
void
nasd_odc_block_hash_ins(
  nasd_odc_ent_t  *ent)
{
  nasd_odc_oq_t *bucket;

  bucket = nasd_odc_bucket_ents + NASD_ODC_BUCKET(ent->real_sectno);

  NASD_ODC_LRU_LOCK();
  ent->hbucket = bucket;
  NASD_ODC_Q_INS_NOLOCK(bucket,ent,h);
  NASD_ODC_LRU_UNLOCK();
}

#if 0
/*
 * (Disabled variant: scans every hash bucket instead of
 * walking the node's member list.)
 *
 * Marks object for ejection. Assumes caller holds
 * ref on object node entry (which is the argument
 * to this function). When caller releases this node
 * after calling this function, the node will be
 * ejected (assuming no one else is using it). Data
 * blocks will be ejected inline here, again provided
 * no sharing. If there's sharing, blocks should be
 * ejected when they become unused.
 *
 * Blocks that are dirty or have I/O in flight are
 * waited for (and flushed if dirty) with the LRU lock
 * dropped; the scan is then restarted from the top.
 *
 * Returns NASD_SUCCESS. When DBG_OBJ_EJECT > 0, returns
 * NASD_FAIL if the final scan saw blocks that still had
 * references.
 *
 * NOTE: this may not interact "correctly" with
 * blind prefetching- it assumes that all related
 * blocks have the node_ent field set correctly
 */
nasd_status_t
nasd_odc_obj_eject(
  nasd_odc_ent_t  *ne)
{
  nasd_odc_ent_t *e, *hn, *wait_ent;
  nasd_odc_oq_t *bucket;
  int b;
#if DBG_OBJ_EJECT > 0
  int not_done;
#endif /* DBG_OBJ_EJECT > 0 */

  NASD_ODC_LRU_LOCK();

  ne->lru_flags |= NASD_CL_REMOVING;

restart_flush:
  /* the LRU lock is held on every arrival at this label */
  wait_ent = NULL;
#if DBG_OBJ_EJECT > 0
  not_done = 0;
#endif /* DBG_OBJ_EJECT > 0 */

  for(b=0;b<nasd_odc_buckets;b++) {
    bucket = &nasd_odc_bucket_ents[b];
    /* hn is fetched up front because e may be ejected below */
    for(e=bucket->head.hnext;e!=&bucket->head;e=hn) {
      hn = e->hnext;
      if (e->node_ent == NULL)
        continue;
      if (e->node_ent->real_sectno == ne->real_sectno) {
        NASD_ASSERT(e->node_ent == ne);
        /* already on its way out; nothing more to do */
        if ((e->lru_flags&NASD_CL_DELETING)
          || (e->lru_flags&NASD_CL_REMOVING))
        {
          continue;
        }
        if (e->lru_flags&NASD_CL_LRU_Q) {
          /* idle and clean: pull it off its LRU queue */
          NASD_ASSERT(!(e->dirty_flags&NASD_CR_DIRTY));
          NASD_ASSERT(!(e->dirty_flags&NASD_CR_DIRTY_Q));
          NASD_ASSERT(e->refcnt == 0);
          e->lru_flags &= ~NASD_CL_LRU_Q;
          NASD_ODC_Q_DEQ_NOLOCK(e,l);
          nasd_odc_lru[e->lru_num].size--;
          e->lru_num = NASD_ODC_LRU_NONE;
        }
        if (e->dirty_flags&NASD_CR_DIRTY_Q) {
          NASD_ASSERT(e->dirty_flags&NASD_CR_DIRTY);
          NASD_ASSERT(!(e->lru_flags&NASD_CL_LRU_Q));
          NASD_ASSERT(e->refcnt == 1);
          e->dirty_flags &= ~NASD_CR_DIRTY_Q;
          NASD_ODC_Q_DEQ_NOLOCK(e,d);
          /* we now "own" the ref on this block */
          wait_ent = e;
          goto check_complete;
        }
        e->lru_flags |= NASD_CL_REMOVING;
        NASD_ODC_LOCK_BLOCK(e);
        if (e->data_flags&NASD_CD_BUSY) {
          wait_ent = e;
        }
        NASD_ODC_UNLOCK_BLOCK(e);
        if (wait_ent) {
          /* take a ref so the block survives while we wait on it */
          wait_ent->refcnt++;
          goto check_complete;
        }
        if (e->refcnt == 0) {
          /*
           * No one holds this block. We can check
           * the busy flag without the block lock
           * because no one should be able to transit
           * a block to BUSY without having a ref (we
           * caught the dirty case above).
           */
          if (e->data_flags & NASD_CD_BUSY) {
            /* I/O in flight */
            e->refcnt++;
            wait_ent = e;
            goto check_complete;
          }
          /* no holders, eject now */
          nasd_odc_block_eject_real(e);
        }
#if DBG_OBJ_EJECT > 0
        else {
          not_done++;
          nasd_printf("e->refcnt=%d\n", e->refcnt);
          nasd_printf("e->irefcnt=%d\n", e->irefcnt);
          nasd_printf("e->data_flags=0x%x\n", e->data_flags);
          nasd_printf("e->lru_flags=0x%x\n", e->lru_flags);
          nasd_printf("e->io_flags=0x%x\n", e->io_flags);
          nasd_printf("e->dirty_flags=0x%x\n", e->dirty_flags);
        }
#endif /* DBG_OBJ_EJECT > 0 */
      }
    }
  }
check_complete:
  NASD_ODC_LRU_UNLOCK();

  if (wait_ent) {
    NASD_ODC_LOCK_BLOCK(wait_ent);
    nasd_odc_wait_not_busy(wait_ent);
    if (wait_ent->dirty_flags&NASD_CR_DIRTY) {
      /* claim the block for the flush we are about to issue */
      wait_ent->data_flags |= NASD_CD_BUSY;
    }
    NASD_ODC_UNLOCK_BLOCK(wait_ent);
    if (wait_ent->dirty_flags&NASD_CR_DIRTY) {
      nasd_od_io_flush_block(wait_ent);
    }
    nasd_odc_block_release(wait_ent);
    /*
     * The LRU lock was dropped at check_complete. Take it
     * again before rescanning: the scan edits LRU/dirty queues
     * and refcounts, and check_complete will unlock once more.
     */
    NASD_ODC_LRU_LOCK();
    goto restart_flush;
  }

#if DBG_OBJ_EJECT > 0
  /* not_done only exists in debug builds, so this must be guarded */
  if (not_done) {
    nasd_printf("%d blocks not done\n", not_done);
    return(NASD_FAIL);
  }
#endif /* DBG_OBJ_EJECT > 0 */

  return(NASD_SUCCESS);
}
#else
/*
 * Marks object for ejection. Assumes caller holds
 * ref on object node entry (which is the argument
 * to this function). When caller releases this node
 * after calling this function, the node will be
 * ejected (assuming no one else is using it). Data
 * blocks will be ejected inline here, again provided
 * no sharing. If there's sharing, blocks should be
 * ejected when they become unused.
 *
 * Member blocks are found by walking the node's "o"
 * list (onext links). Blocks that are dirty or have
 * I/O in flight are waited for (and flushed if dirty)
 * with the LRU lock dropped; the walk is then restarted
 * from the head of the list.
 *
 * Returns NASD_SUCCESS. When DBG_OBJ_EJECT > 0, returns
 * NASD_FAIL if the final walk saw blocks that still had
 * references.
 *
 * NOTE: this may not interact "correctly" with
 * blind prefetching- it assumes that all related
 * blocks have the node_ent field set correctly
 */
nasd_status_t
nasd_odc_obj_eject(
  nasd_odc_ent_t  *ne)
{
  nasd_odc_ent_t *e, *next, *wait_ent;
#if DBG_OBJ_EJECT > 0
  int not_done;
#endif /* DBG_OBJ_EJECT > 0 */

  NASD_ODC_CHECK_NODE_ENT(ne);

  NASD_ODC_LRU_LOCK();

  ne->lru_flags |= NASD_CL_REMOVING;

restart_flush:
  /* the LRU lock is held on every arrival at this label */
  wait_ent = NULL;
#if DBG_OBJ_EJECT > 0
  not_done = 0;
#endif /* DBG_OBJ_EJECT > 0 */

  /* next is fetched up front because e may be ejected below */
  for(e=ne->onext;e!=ne;e=next) {
    next = e->onext;
    NASD_ASSERT(e->node_ent == ne);

    /* already on its way out; nothing more to do */
    if ((e->lru_flags&NASD_CL_DELETING)
      || (e->lru_flags&NASD_CL_REMOVING))
    {
      continue;
    }
    if (e->lru_flags&NASD_CL_LRU_Q) {
      /* idle and clean: pull it off its LRU queue */
      NASD_ASSERT(!(e->dirty_flags&NASD_CR_DIRTY));
      NASD_ASSERT(!(e->dirty_flags&NASD_CR_DIRTY_Q));
      NASD_ASSERT(e->refcnt == 0);
      e->lru_flags &= ~NASD_CL_LRU_Q;
      NASD_ODC_Q_DEQ_NOLOCK(e,l);
      nasd_odc_lru[e->lru_num].size--;
      e->lru_num = NASD_ODC_LRU_NONE;
    }
    if (e->dirty_flags&NASD_CR_DIRTY_Q) {
      NASD_ASSERT(e->dirty_flags&NASD_CR_DIRTY);
      NASD_ASSERT(!(e->lru_flags&NASD_CL_LRU_Q));
      NASD_ASSERT(e->refcnt == 1);
      e->dirty_flags &= ~NASD_CR_DIRTY_Q;
      NASD_ODC_Q_DEQ_NOLOCK(e,d);
      /* we now "own" the ref on this block */
      wait_ent = e;
      goto check_complete;
    }
    e->lru_flags |= NASD_CL_REMOVING;
    /*
     * NOTE(review): the block lock is taken here while the
     * LRU lock is held; the locking notes at the top of this
     * file say not to hold both - confirm this is intended.
     */
    NASD_ODC_LOCK_BLOCK(e);
    if (e->data_flags&NASD_CD_BUSY) {
      wait_ent = e;
    }
    NASD_ODC_UNLOCK_BLOCK(e);
    if (wait_ent) {
      /* take a ref so the block survives while we wait on it */
      wait_ent->refcnt++;
      goto check_complete;
    }
    if (e->refcnt == 0) {
      /*
       * No one holds this block. We can check
       * the busy flag without the block lock
       * because no one should be able to transit
       * a block to BUSY without having a ref (we
       * caught the dirty case above).
       */
      if (e->data_flags & NASD_CD_BUSY) {
        /* I/O in flight */
        e->refcnt++;
        wait_ent = e;
        goto check_complete;
      }
      /* no holders, eject now */
      nasd_odc_block_eject_real(e);
    }
#if DBG_OBJ_EJECT > 0
    else {
      not_done++;
      nasd_printf("e->refcnt=%d\n", e->refcnt);
      nasd_printf("e->irefcnt=%d\n", e->irefcnt);
      nasd_printf("e->data_flags=0x%x\n", e->data_flags);
      nasd_printf("e->lru_flags=0x%x\n", e->lru_flags);
      nasd_printf("e->io_flags=0x%x\n", e->io_flags);
      nasd_printf("e->dirty_flags=0x%x\n", e->dirty_flags);
    }
#endif /* DBG_OBJ_EJECT > 0 */
  }
check_complete:
  NASD_ODC_LRU_UNLOCK();

  if (wait_ent) {
    NASD_ODC_LOCK_BLOCK(wait_ent);
    nasd_odc_wait_not_busy(wait_ent);
    if (wait_ent->dirty_flags&NASD_CR_DIRTY) {
      /* claim the block for the flush we are about to issue */
      wait_ent->data_flags |= NASD_CD_BUSY;
    }
    NASD_ODC_UNLOCK_BLOCK(wait_ent);
    if (wait_ent->dirty_flags&NASD_CR_DIRTY) {
      nasd_od_io_flush_block(wait_ent);
    }
    nasd_odc_block_release(wait_ent);
    /*
     * The LRU lock was dropped at check_complete. Take it
     * again before rewalking: the walk edits LRU/dirty queues
     * and refcounts, and check_complete will unlock once more.
     */
    NASD_ODC_LRU_LOCK();
    goto restart_flush;
  }

#if DBG_OBJ_EJECT > 0
  if (not_done) {
    nasd_printf("%d blocks not done\n", not_done);
    return(NASD_FAIL);
  }
#endif /* DBG_OBJ_EJECT > 0 */

  return(NASD_SUCCESS);
}
#endif

/*
 * Caller holds ref on node cache block
 *
 * Detaches every member block from the node's cache entry:
 * each block on the node's "o" list is unlinked from that
 * list, the internal ref it represented on the node is
 * dropped, and its node_ent back-pointer is cleared.
 *
 * Used when deleting objects, so that the node block does
 * not have to stay wired into the cache merely so that its
 * member blocks can complete pending I/Os before
 * ejection/reuse.
 *
 * Takes and drops the LRU lock itself. Always returns
 * NASD_SUCCESS.
 */
nasd_status_t
nasd_odc_obj_disassoc(
  nasd_odc_ent_t *ne)
{
  nasd_odc_ent_t *member, *following;

  NASD_ODC_CHECK_NODE_ENT(ne);

  NASD_ODC_LRU_LOCK();

  member = ne->onext;
  while (member != ne) {
    /* remember the successor before member leaves the list */
    following = member->onext;

    NASD_ASSERT(member->node_ent == ne);
    NASD_ODC_Q_DEQ_NOLOCK(member,o);
    LOSE_IREF(ne);
    member->node_ent = NULL;

    member = following;
  }

  NASD_ODC_LRU_UNLOCK();

  return(NASD_SUCCESS);
}

/* Local Variables:  */
/* indent-tabs-mode: nil */
/* tab-width: 2 */
/* End: */
