www.pudn.com > efs.rar > fs_db.c


/*====*====*====*====*====*====*====*====*====*====*====*====*====*====*====* 
 
                   File System Database Code 
 
General Description 
  Database Code 
 
Copyright (c) 2002 -- 2006 by QUALCOMM, Incorporated.  All Rights Reserved. 
*====*====*====*====*====*====*====*====*====*====*====*====*====*====*====*/ 
 
/*=========================================================================== 
 
                           Edit History 
 
$Header: //depot/asic/MSMSHARED/services/efs/MSM_EFS.01.02/fs_db.c#9 $ $DateTime: 2006/08/31 17:09:53 $ $Author: davidb $ 
 
when       who     what, where, why 
--------   ---     ---------------------------------------------------------- 
08/31/06   dlb     Fix 'gid' usage, and add in-place upgrade code. 
04/13/06   dlb     Add 'const' to some pointers. 
04/07/06   dlb     Add query for iterator value. 
01/11/06   nrs     Fixed Copyright header 
12/23/05   nrs     Fixed bad declartion of node data 
12/09/05   dlb     Avoid memcpy to NULL. 
10/26/05    sh     Lint cleanup. 
10/18/05   nrs     Added uid hooks to db and removed erroneous gid parameter 
10/11/05   nrs     Remove FS_ALLOC_SOFT and FS_ALLOC_HARD 
09/21/05   nrs     Add checks for quota limit exceeded 
08/25/05   nrs     Implent buffer accounting for quotas and reservations 
05/26/05   sd      Compilation fixes for L4. 
04/26/05   dlb     Add 2K page support. 
03/07/05   nrs     Ignore irrelevant lint warnings 
10/15/04   dlb     Update copyright line. 
01/09/04   dlb     Fix update operation. 
06/17/03   jkl     Clean up code. 
05/01/03   dlb     Improved database algorithm. 
02/28/03   jkl     Clean Up code. 
===========================================================================*/ 
 
#ifdef FS_UNIT_TEST 
#error code not present 
#endif 
 
#ifdef FEATURE_IG_EFS_EXT_SERVER 
  #include "amssassert.h" 
#else 
  #include "assert.h" 
#endif 
 
#include  
#include  
 
#include "fs_sys_types.h" 
#include "fs_err.h" 
 
#include "fs_buffer.h" 
#include "fs_db.h" 
#include "fs_errno.h" 
 
#include "fs_upgrade.h" 
 
#include "msg.h" 
 
#ifdef FS_UNIT_TEST 
#error code not present 
#endif 
 
/*lint -save -e613 Ignoring all irrelevant lint warnings */ 
 
/* There are two types of nodes in the database, leaf and interior nodes. 
 * Both have different characteristics: 
 * 
 * Leaf nodes contain both keys and data.  Each key is associated with a 
 * since piece of data, and there is no data before the first key.  Both 
 * the key and the data are variable length. 
 * 
 * The key and value pairs are stored in the data region of the node.  Each 
 * pair has a 2 byte header, which is the length (in bytes) of the key, and 
 * the length (in bytes) of the value.  After this, are the immediate bytes 
 * of the key and the value.  The header for the next element immediately 
 * follows this. 
 * 
 * Interior nodes contain keys and cluster_ids.  There is one more 
 * cluster_id than there are keys (there is a cluster_id before the first 
 * id). 
 * 
 * The data region begins with an ID_SIZEd (4 bytes at the time of 
 * implementation) cluster_id for the leftmost child.  This is followed by 
 * pairs of key's and cluster_ids.  Since the values here are fixed size, 
 * no length is stored for them.  Also, the keys in the interior nodes are 
 * partial, only enough characters from the key is used to distinguish them 
 * one from another. 
 */ 
 
/* Active clusters. 
 * The database has the potential to look at, and manipulate a large number 
 * of clusters.  The active clusters structures (clust, node) keep track of 
 * cluster number and wired pointers for each of these nodes.  Leaf nodes 
 * are always manipulated at level zero, so this is fairly easy.  Inserts 
 * can touch at most 3 clusters per level, while deletes can touch 4. 
 * Only two need to stay wired, however.  So, for each level, we have a 
 * pair of clusters available to hold the wire in place.  There are also 
 * two aux clusters that are used during inserts and deletes (the left 
 * pointer afterward needs to be modified). 
 * 
 * The down_find search sets pos[] to the offset in the data area of the 
 * node where the appropriate index point is.  'matched' is only set on 
 * leaf nodes (since it isn't important in the interior). 
 */ 
 
/* Some convenient names for longer symbols. */ 
#define AUX1      FS_DB_AUX1 
 
/* Operations used to modify nodes. */ 
#define OP_ADD          1 
#define OP_REPLACE      2 
#define OP_REMOVE       3 
 
#define NODE_DATA_SIZE   (EFS_PAGE_SIZE-3*4-2-2-4) 
 
#ifdef FS_UNIT_TEST 
#error code not present 
#endif 
 
/* These are guesses about what are good points to try combining data.  The 
 * check threshold is the limit above which the combine isn't even 
 * considered.  Then the check threshold is the point at which a combine 
 * will be done (if the combined node is less than this size). */ 
#define COMBINE_THRESHOLD  (NODE_DATA_SIZE*2/3) 
#define COMBINE_CHECK_THRESHOLD  (NODE_DATA_SIZE/3) 
 
/* This is the format of a given node. */ 
struct node_data { 
  cluster_id    left; 
  cluster_id    right; 
  uint16        used; 
  uint32        gid; 
  uint8         bogus_count; 
  uint8         level; 
  uint8         data[NODE_DATA_SIZE]; 
}; 
 
#define ID_SIZE ((int) sizeof (cluster_id)) 
 
/* Since the cluster ID's are misaligned in the data, use these two 
 * routines to fetch and store them.  These always store little endian. */ 
static void 
put_id (uint8 *place, cluster_id id) 
{ 
  int i; 
  for (i = 0; i < ID_SIZE; i++) { 
    place[i] = id & 0xFF; 
    id >>= 8; 
  } 
} 
 
static cluster_id 
get_id (uint8 *place) 
{ 
  cluster_id res = 0; 
  int i; 
 
  for (i = ID_SIZE - 1; i >= 0; i--) { 
    res <<= 8; 
    res |= place[i]; 
  } 
 
  return res; 
} 
 
/********************************************************************** 
 * Utilities for wiring and unwiring clusters.  These are short, and should 
 * inline nicely. 
 */ 
 
/* Wire down a specific cluster. */ 
static void 
db_read_wire (fs_db_t db, int num) 
{ 
  db->node[num] = db->buf->read_wire (db->buf, db->clust[num]); 
} 
 
/* Wire for writing. */ 
static void 
db_write_wire (fs_db_t db, int num) 
{ 
  db->node[num] = db->buf->write_wire (db->buf, db->clust[num]); 
} 
 
/* Free a specified wire.  Clears the pointer, but the cluster number is 
 * still valid. */ 
static void 
db_unwire (fs_db_t db, int num) 
{ 
  if (db->node[num] != NULL) { 
    db->buf->free_wire (db->buf, db->clust[num]); 
    db->node[num] = NULL; 
  } 
} 
 
/* Unwire all of the clusters that are still wired down. */ 
static void 
db_unwire_all (fs_db_t db) 
{ 
  int i; 
 
  for (i = 0; i < FS_DB_ACTIVE_CLUSTERS; i++) { 
    db_unwire (db, i); 
  } 
} 
 
/********************************************************************** 
 * Utilities for manipulating the left/right cluster chain at each level. 
 */ 
 
/* Insert a new node into a chain.  'new_node' is inserted to the right of 
 * 'left'.  'old_right' can be used to wire down the old right (it should 
 * not be wired yet).  left, and new_node must both already be write wired. 
 */ 
static void 
chain_insert (fs_db_t db, int left, int new_node, int old_right) 
{ 
  node_t l = db->node[left]; 
  node_t n = db->node[new_node]; 
  node_t r; 
 
  /* Update the three pointers already wired. */ 
  n->right = l->right; 
  n->left = db->clust[left]; 
  l->right = db->clust[new_node]; 
 
  /* If there is a right sibling, its 'left' pointer needs to be adjusted 
   * to point to the new_node. */ 
  if (n->right != INVALID_CLUSTER_ID) { 
    db->clust[old_right] = n->right; 
    db_write_wire (db, old_right); 
    r = db->node[old_right]; 
    r->left = db->clust[new_node]; 
 
    /* Unwire the right pointer, since it won't get used any more. */ 
    db_unwire (db, old_right); 
  } 
} 
 
/* Delete a node in a chain.  Left is the left node, drop is the node to 
 * drop, and right is an (unwired) node that may need to be modified to do 
 * the delete.  This will change the chain from left->drop->right to 
 * left->right.  Drop can be unallocated once this is finished.  Left 
 * should be write wired, drop doesn't matter, and right is not wired at 
 * all.  The drop node will also be unwired, and deallocated. */ 
static void 
chain_delete (fs_db_t db, int left, int drop, int right) 
{ 
  cluster_id clust; 
 
  node_t l = db->node[left]; 
  node_t d = db->node[drop]; 
 
  uint32 gid = d->gid; 
 
  l->right = d->right; 
 
  if (l->right != INVALID_CLUSTER_ID) { 
    db->clust[right] = l->right; 
    db_write_wire (db, right); 
 
    db->node[right]->left = d->left; 
 
    db_unwire (db, right); 
  } 
 
  clust = db->clust[drop]; 
  db_unwire (db, drop); 
  db->buf->deallocate (db->buf, clust, gid); 
} 
 
/* Initialize a new nodes fields. Set it up according to the specified 
 * level. */ 
static void 
erase_node (node_t n, int level, uint32 gid) 
{ 
  n->left = INVALID_CLUSTER_ID; 
  n->right = INVALID_CLUSTER_ID; 
  n->used = 0; 
  /* n->count = 0; */ 
  n->level = level; 
  n->gid = gid; 
 
#ifdef FS_UNIT_TEST 
#error code not present 
#endif 
} 
 
/* Compare two keys.  Lots of room for optimization, but don't bother 
 * unless we waste a lot of time in here. */ 
static int 
compare_keys (const uint8 *left, unsigned left_length, 
    const uint8 *right, unsigned right_length) 
{ 
  unsigned pos = 0; 
 
  while (1) { 
    if (pos == left_length && pos == right_length) 
      return 0; 
 
    if (pos == left_length) 
      return -1; 
 
    if (pos == right_length) 
      return 1; 
 
    if (left[pos] < right[pos]) 
      return -1; 
 
    if (left[pos] > right[pos]) 
      return 1; 
 
    pos++; 
  } 
} 
 
/* Scan the "active" cluster for the given key.  db->pos will be left 
 * pointing to the first key that is greater than the given key. 
 * db->matched will be set iff db->pos points to a key that exactly matches 
 * the given key. */ 
static void 
scan (fs_db_t db, const uint8 *key, unsigned key_length) 
{ 
  unsigned pos = 0; 
  node_t n = db->node[db->active]; 
 
  if (n->level == 0) { 
    db->matched = 0; 
    pos = 0; 
    while (pos < n->used) { 
      unsigned len = n->data[pos]; 
      int res = compare_keys (key, key_length, &n->data[pos+2], len); 
      if (res == 0) { 
        db->matched = 1; 
        break; 
      } else if (res < 0) { 
        break; 
      } 
 
      pos += len + n->data[pos+1] + 2; 
    } 
  } else { 
    db->matched = 0; 
    pos = ID_SIZE; 
    while (pos < n->used) { 
      unsigned len = n->data[pos]; 
      int res = compare_keys (key, key_length, &n->data[pos+1], len); 
      if (res == 0) { 
        /* In index nodes, equal moves it to the right. */ 
        pos += len + 1 + ID_SIZE; 
        break; 
      } else if (res < 0) { 
        break; 
      } 
 
      pos += len + 1 + ID_SIZE; 
    } 
  } 
 
  db->pos[n->level] = pos; 
} 
 
/* Given two strings that are lexographically sorted, determine the minimal 
 * prefix of string 'b' that is needed to distinguish them.  If the strings 
 * are improperly lexigraphically sorted, the assertion will fail. */ 
static unsigned 
find_unique_length (uint8 *a, unsigned alen, uint8 *b, unsigned blen) 
{ 
  unsigned len = 0; 
 
  while (1) { 
    ASSERT (len < blen); 
 
    if (len >= alen) 
      return len + 1; 
 
    if (a[len] != b[len]) 
      return len + 1; 
 
    len++; 
  } 
} 
 
static void index_expand (fs_db_t db, int level, 
    uint8 *key, uint8 key_length, uint32 gid); 
 
/* Adjust the after its children have been modified.  The 'op' should 
 * either be OP_ADD to indicate a new field to be added, and OP_REMOVE to 
 * indicate a field to be removed.  'pos' should point to the beginning of 
 * a key in the index node (meaning we can't delete the first pointer).  If 
 * OP_ADD, then the given key and new_pointer will be inserted at this 
 * point. 
 */ 
static void 
index_adjust (fs_db_t db, int level, int op, 
    unsigned pos, uint8 *key, unsigned key_length, 
    cluster_id new_ptr, uint32 gid) 
{ 
  int base = level * FS_DB_CLUSTERS_PER_LEVEL; 
  node_t node; 
  node_t n2 = NULL; 
  uint8 *src[5]; 
  int slen[5]; 
  uint8 *dest; 
  unsigned dlen; 
  int nspans; 
  unsigned new_length; 
  int i; 
  int tmp; 
  unsigned spoint = 0; 
  int do_split = 0; 
  uint8 *split_key = NULL; 
  int do_combine = 0; 
  int phase; 
  int result; 
 
  db_write_wire (db, base); 
 
  node = db->node[base]; 
 
  /* Place the tmp key into the buffer.  It is important to do this in such 
   * a way that if it is already there, it won't be bothered. */ 
  if (key != NULL && key != db->tmp_key + 1) { 
    db->tmp_key[0] = key_length; 
    memcpy (&db->tmp_key[1], key, key_length); 
  } 
  put_id (&db->tmp_key[1] + key_length, new_ptr); 
 
#ifdef FS_UNIT_TEST 
#error code not present 
#endif 
 
  /* For simplicity, copy all of the data into a tmp buffer, and then copy 
   * it back as needed. */ 
  memcpy (db->tmp_buf[0], &node->data[0], node->used); 
 
  src[0] = &db->tmp_buf[0][0]; 
  slen[0] = pos; 
 
  /* ADD */ 
  if (op == OP_ADD) { 
    src[1] = &db->tmp_key[0]; 
    slen[1] = 1 + ID_SIZE + key_length; 
  } else { 
    ASSERT (pos < node->used); 
 
    /* Skip the key to remove. */ 
    src[1] = NULL; 
    slen[1] = 0; 
 
    pos += db->tmp_buf[0][pos] + 1 + ID_SIZE; 
  } 
 
  src[2] = &db->tmp_buf[0][pos]; 
  slen[2] = node->used - pos; 
 
  nspans = 3; 
 
  new_length = 0; 
  for (i = 0; i < nspans; i++) { 
    new_length += slen[i]; 
  } 
 
  if (new_length > NODE_DATA_SIZE) { 
    /* The node must be split.  The split point will be roughly half of the 
     * new length. */ 
    spoint = new_length / 2; 
    do_split = 1; 
 
    /* Create the new node. */ 
    result = db->buf->allocate (db->buf, gid, 
        &db->clust[base+1]); 
    if (result < 0) 
    { 
      db_unwire_all (db); 
      if (result == -ENOSPC) 
        FS_RAISE (-ENOSPC, "Space exhausted", 0, 0, 0); 
      else 
        FS_RAISE (-EDQUOT, "Quota limit exceeded", 0, 0, 0); 
    } 
    db_write_wire (db, base+1); 
 
    n2 = db->node[base+1]; 
    erase_node (n2, node->level, gid); 
  } else if (op != OP_ADD && new_length < COMBINE_CHECK_THRESHOLD 
      && node->right != INVALID_CLUSTER_ID 
      && db->pos[level+1] < db->node[base + FS_DB_CLUSTERS_PER_LEVEL]->used) 
  { 
    node_t parent; 
    /* Check for the possible need/benefit of combining with our right 
     * neighbor.  To combine index nodes, we must take the key from the 
     * parent index node and insert it. */ 
 
    db->clust[base+1] = node->right; 
    db_read_wire (db, base+1); 
 
    n2 = db->node[base+1]; 
 
    parent = db->node[base + FS_DB_CLUSTERS_PER_LEVEL]; 
    if (new_length + n2->used + parent->data[db->pos[level+1]] + 1 
        < COMBINE_THRESHOLD) 
    { 
      do_combine = 1; 
      src[3] = &parent->data[db->pos[level+1]]; 
      slen[3] = parent->data[db->pos[level+1]] + 1; 
      src[4] = &n2->data[0]; 
      slen[4] = n2->used; 
 
      nspans = 5; 
    } else { 
      /* Not necessary to combine. */ 
      db_unwire (db, base + 1); 
 
      n2 = NULL; 
    } 
  } 
 
  dest = &node->data[0]; 
  dlen = 0; 
 
  phase = 0; /* Start copying an ID. */ 
 
  for (i = 0; i < nspans; i++) { 
    while (slen[i] > 0) { 
      if (phase == 0) { 
        ASSERT (slen[i] >= ID_SIZE); 
        memcpy (dest, src[i], ID_SIZE); 
        dest += ID_SIZE; 
        dlen += ID_SIZE; 
        src[i] += ID_SIZE; 
        slen[i] -= ID_SIZE; 
 
        phase = 1; 
      } else { 
        /* If we are splitting, check if it is time to move to the other 
         * node. */ 
        if (do_split == 1 && dlen > spoint) { 
          do_split = 2; 
          spoint = dlen;  /* Adjust the split point. */ 
 
          split_key = &src[i][0]; 
          node->used = dlen; 
          dest = &n2->data[0]; 
 
          /* To split the index, the split key will be inserted above us. 
           * Its data value becomes the leftmost pointer in the new node. */ 
          tmp = src[i][0] + 1; 
          src[i] += tmp; 
          slen[i] -= tmp; 
          dlen = 0; 
        } else { 
          /* Otherwise, just copy over a single key. */ 
          tmp = 1 + src[i][0]; 
          memcpy (&dest[0], &src[i][0], tmp); 
          src[i] += tmp; 
          slen[i] -= tmp; 
          dlen += tmp; 
          dest += tmp; 
        } 
 
        phase = 0; 
      } 
    } 
  } 
 
#ifdef FS_UNIT_TEST 
#error code not present 
#endif 
 
  /* Special check for removing the top node in the tree. */ 
  if (new_length == ID_SIZE && node->level == db->top_level 
      && !do_combine) 
  { 
    cluster_id clust; 
    uint32 dealloc_gid = node->gid; 
 
    ASSERT (!do_split); 
    db->top = get_id (&node->data[0]); 
    db->buf->set_field (db->buf, FS_FIELD_DB_ROOT, db->top); 
    db->top_level = level - 1; 
 
    clust = db->clust[base]; 
    db_unwire (db, base); 
    db->buf->deallocate (db->buf, clust, dealloc_gid); 
 
    return; 
  } 
 
  if (do_split) { 
    n2->used = dlen; 
    chain_insert (db, base, base+1, AUX1); 
  } else { 
    node->used = dlen; 
  } 
 
#ifdef FS_UNIT_TEST 
#error code not present 
#endif 
 
  if (do_split) { 
    /* The split must be propagated. */ 
 
    if (split_key != db->tmp_key) 
      memcpy (db->tmp_key, split_key, split_key[0] + 1); 
 
    if (level + 1 > db->top_level) { 
      index_expand (db, level + 1, &db->tmp_key[1], db->tmp_key[0], gid); 
    } else { 
      index_adjust (db, level + 1, OP_ADD, db->pos[level + 1], 
          &db->tmp_key[1], db->tmp_key[0], db->clust[base+1], gid); 
    } 
  } 
 
  if (do_combine) { 
    chain_delete (db, base, base + 1, AUX1); 
    index_adjust (db, level + 1, OP_REMOVE, db->pos[level + 1], 
        0, 0, 0, gid); 
  } 
} 
 
/* Expand the current index up one level.  The tree will be re-rooted at 
 * this point. */ 
static void 
index_expand (fs_db_t db, int level, uint8 *key, uint8 key_length, uint32 gid) 
{ 
  int base = level * FS_DB_CLUSTERS_PER_LEVEL; 
  node_t n; 
  int result; 
 
  if (level >= FS_DB_MAX_DEPTH) { 
    FS_ERR_FATAL ("Database tree is not deep enough", 0, 0, 0); 
  } 
 
  result = db->buf->allocate (db->buf, gid, &db->clust[base]); 
  if (result < 0) 
  { 
    db_unwire_all (db); 
    if (result == -ENOSPC) 
      FS_RAISE (-ENOSPC, "Space exhausted", 0, 0, 0); 
    else 
      FS_RAISE (-EDQUOT, "Quota limit exceeded", 0, 0, 0); 
  } 
  db_write_wire (db, base); 
  n = db->node[base]; 
 
  erase_node (n, level, gid); 
 
  /* Fill it in, appropriately, with the specified key. */ 
  n->used = ID_SIZE + key_length + 1 + ID_SIZE; 
  put_id (&n->data[0], db->clust[base - FS_DB_CLUSTERS_PER_LEVEL]); 
  n->data[ID_SIZE] = key_length; 
  memcpy (&n->data[ID_SIZE + 1], key, key_length); 
  put_id (&n->data[ID_SIZE + key_length + 1], 
      db->clust[base - FS_DB_CLUSTERS_PER_LEVEL + 1]); 
 
  /* Re-root the database tree. */ 
  db->top = db->clust[base]; 
  db->buf->set_field (db->buf, FS_FIELD_DB_ROOT, db->top); 
  db->top_level = level; 
 
#ifdef FS_UNIT_TEST 
#error code not present 
#endif 
} 
 
/* Modify a leaf node, possibly splitting it, or combining it with one of 
 * its neighbors.  The operation must be one of OP_ADD, to add a new 
 * key/value pair at the marked position, OP_REMOVE to remove a key/value 
 * pair at the position, or OP_REPLACE to replace the key value pair with a 
 * different value.  For OP_REMOVE, the given key/value are not used.  For 
 * OP_REPLACE, the key is not used. 
 * 
 * If the modification results in a split or combine of two leaf nodes, 
 * index_adjust will be called to fixup the index. 
 * 
 * Any iterators that are live in this page will be adjusted to continue to 
 * be valid. 
 * 
 * This will use the [node] index into the active clusters.  If an 
 * additional node is needed [node+1] will be used.  AUX1 might also be 
 * used. 
 */ 
static void 
leaf_modify (fs_db_t db, cluster_id ind, int op, int pos, 
    const uint8 *key, unsigned key_length, 
    const uint8 *value, unsigned value_length, 
    uint32 gid, uint32 uid) 
{ 
  node_t node; 
  node_t n2 = NULL; 
  int new_length; 
  uint8 *src[4]; 
  int    slen[4]; 
  int dlen; 
  uint8 *dest; 
  uint8 *pre_key = NULL; 
  uint8 *post_key = NULL; 
  int nspans; 
  int i; 
  int tmp; 
  int spoint = 0; 
  int do_split = 0; 
  int do_combine = 0; 
  int uni_len; 
  int slide; 
  int old_used; 
  int result; 
 
  /* XXX: Use this parameter to implement UID's in the future if needed */ 
  (void) uid; 
  db_write_wire (db, ind); 
 
  node = db->node[ind]; 
 
  old_used = node->used; 
 
#ifdef FS_UNIT_TEST 
#error code not present 
#endif 
 
  /* For simplicity, copy all of the data into a tmp buffer, and then copy 
   * it back as needed. */ 
  memcpy (db->tmp_buf[0], &node->data[0], node->used); 
 
  /* Determine the spans to copy over. */ 
  src[0] = &db->tmp_buf[0][0]; 
  slen[0] = pos; 
 
  /* If we are replacing, set the key argument appropriately to avoid 
   * having to check elsewhere. */ 
  if (op == OP_REPLACE) { 
    key = &db->tmp_buf[0][pos+2]; 
    key_length = db->tmp_buf[0][pos]; 
  } 
 
  if (op == OP_ADD || op == OP_REPLACE) { 
    db->tmp_key[0] = key_length; 
    db->tmp_key[1] = value_length; 
    memcpy (&db->tmp_key[2], key, key_length); 
    memcpy (&db->tmp_key[2 + key_length], value, value_length); 
 
    src[1] = &db->tmp_key[0]; 
    slen[1] = 2 + key_length + value_length; 
  } else { 
    src[1] = NULL; 
    slen[1] = 0; 
  } 
 
  src[2] = &db->tmp_buf[0][pos]; 
  slen[2] = node->used - pos; 
  if (op != OP_ADD) { 
    tmp = 2 + node->data[pos] + node->data[pos+1]; 
    src[2] += tmp; 
    slen[2] -= tmp; 
  } 
 
  /* Slide is the amount of adjustment to make to the iterators if they are 
   * past pos. */ 
  if (op != OP_REMOVE) 
    slide = 2 + key_length + value_length; 
  else 
    slide = 0; 
  if (op != OP_ADD) 
    slide -= 2 + node->data[pos] + node->data[pos+1]; 
 
  nspans = 3; 
 
  new_length = 0; 
  for (i = 0; i < nspans; i++) { 
    new_length += slen[i]; 
  } 
 
  if (new_length > NODE_DATA_SIZE) { 
    /* The node must be split.  The split point will be roughly half of the 
     * new length. */ 
    spoint = new_length / 2; 
    do_split = 1; 
 
    /* Create the new node. */ 
    result = db->buf->allocate (db->buf, gid, 
        &db->clust[ind+1]); 
    if (result < 0) 
    { 
      db_unwire_all (db); 
      if (result == -ENOSPC) 
        FS_RAISE (-ENOSPC, "Space exhausted", 0, 0, 0); 
      else 
        FS_RAISE (-EDQUOT, "Quota limit exceeded", 0, 0, 0); 
    } 
    db_write_wire (db, ind+1); 
 
    n2 = db->node[ind+1]; 
    erase_node (n2, node->level, gid); 
 
  } else if (op != OP_ADD && new_length < COMBINE_CHECK_THRESHOLD 
      && node->right != INVALID_CLUSTER_ID 
      && db->pos[1] < db->node[ind + FS_DB_CLUSTERS_PER_LEVEL]->used) 
  { 
 
    if (db->pos[1] == db->node[ind + FS_DB_CLUSTERS_PER_LEVEL]->used) 
      FS_ERR_FATAL ("Note, handle this", 0, 0, 0); 
 
    /* The last check prevents us from merging two nodes if the 
     * distinguishing key is not in our parent (the pos should point to the 
     * distinguishing key. */ 
 
    /* If the new node is under the check threshold for combination with 
     * our right neighbor.  We only check the right neighbor, because it 
     * makes the propatagion up the tree much simpler. */ 
 
    db->clust[ind+1] = node->right; 
    db_read_wire (db, ind+1); 
 
    n2 = db->node[ind+1]; 
 
    if (new_length + n2->used < COMBINE_THRESHOLD) { 
      do_combine = 1; 
 
      /* The first source is the entire left node. */ 
      src[3] = &n2->data[0]; 
      slen[3] = n2->used; 
 
      nspans = 4; 
    } else { 
      /* Not necessary to combine. */ 
      db_unwire (db, ind+1); 
 
      n2 = NULL; 
    } 
  } 
 
  /* Note, that at the cost of making this code significantly more 
   * complicated, the data could be copied less.  This should only be done 
   * if profiling indicates we spend noticeable time here. */ 
 
  dest = &node->data[0]; 
  dlen = 0; 
  for (i = 0; i < nspans; i++) { 
    while (slen[i] > 0) { 
      /* Copy over one key/value pair. */ 
      tmp = 2 + src[i][0] + src[i][1]; 
      memcpy (&dest[0], &src[i][0], tmp); 
      src[i] += tmp; 
      slen[i] -= tmp; 
      dlen += tmp; 
      dest += tmp; 
 
      /* If we are splitting, check if it is time to move to the other 
       * node. */ 
      if (do_split == 1 && dlen > spoint) { 
        do_split = 2;  /* Make sure we don't split again. */ 
        spoint = dlen;  /* Adjust the split point. */ 
 
        pre_key = dest - tmp; 
        node->used = dlen; 
        dest = &n2->data[0]; 
        dlen = 0; 
        post_key = dest; 
      } 
    } 
  } 
 
  if (do_split) { 
    n2->used = dlen; 
    chain_insert (db, ind, ind+1, AUX1); 
  } else { 
    node->used = dlen; 
  } 
 
#ifdef FS_UNIT_TEST 
#error code not present 
#endif 
 
  /* Adjust all of the iterators, as necessary. */ 
  if (db->iter_count > 0) { 
    cluster_id a; 
    int tmp_pos; 
 
    a = db->clust[ind]; 
 
    for (i = 0; i < FS_DB_ACTIVE_ITERATORS; i++) { 
      if (db->iters[i].cluster == a) { 
        tmp_pos = db->iters[i].pos; 
 
        if (slide >= 0 ? tmp_pos >= pos : tmp_pos > pos) { 
          db->iters[i].pos += slide; 
        } 
 
        if (do_split && db->iters[i].pos >= spoint) { 
          db->iters[i].pos -= spoint; 
          db->iters[i].cluster = db->clust[ind+1]; 
        } 
      } 
 
      if (do_combine && db->iters[i].cluster == db->clust[ind+1]) { 
        db->iters[i].pos += old_used + slide; 
        db->iters[i].cluster = a; 
      } 
 
      if (slide < 0 && db->iters[i].cluster == a 
          && db->iters[i].pos == node->used 
          && node->right != INVALID_CLUSTER_ID) 
      { 
        db->iters[i].pos = 0; 
        db->iters[i].cluster = node->right; 
      } 
    } 
  } 
 
  if (do_split) { 
    /* The split must be propagated, figure out our distinguished key. */ 
    uni_len = find_unique_length (pre_key+2, pre_key[0], 
        post_key+2, post_key[0]); 
 
    if (1 > db->top_level) { 
      index_expand (db, 1, post_key+2, uni_len, gid); 
    } else { 
      index_adjust (db, 1, OP_ADD, db->pos[1], 
          post_key+2, uni_len, db->clust[ind+1], gid); 
    } 
  } 
 
  if (do_combine) { 
    chain_delete (db, ind, ind + 1, AUX1); 
    index_adjust (db, 1, OP_REMOVE, db->pos[1], 0, 0, 0, node->gid); 
  } 
 
  /* If there was a split or a combine, adjust the index above. */ 
} 
 
/* Search the tree and find the leaf node which either contains the given 
 * key, or contains the insertion point where this key should be inserted. 
 * When finished, db->active will be 0, and node[0] will be read wired to 
 * the appropriate cluster.  db->pos will be set to the index in this 
 * cluster where the key belongs, and db->matched will be set, if this key 
 * matches the one pointed to by db->pos. */ 
static void 
down_find (fs_db_t db, const uint8 *key, unsigned key_length) 
{ 
  int level; 
  int dest; 
  cluster_id next_cluster; 
 
  next_cluster = db->top; 
 
  for (level = db->top_level; level >= 0; level--) { 
    dest = FS_DB_CLUSTERS_PER_LEVEL * level; 
    ASSERT (dest < FS_DB_ACTIVE_CLUSTERS); 
 
    db->active = dest; 
    db->clust[dest] = next_cluster; 
    db_read_wire (db, dest); 
 
    ASSERT (level == db->node[dest]->level); 
 
    scan (db, key, key_length); 
 
    /* For interior nodes, get the appropriate next node. */ 
    if (level > 0) { 
      ASSERT (db->pos[level] >= ID_SIZE); 
      next_cluster = get_id (&db->node[dest]->data[db->pos[level] - ID_SIZE]); 
    } 
  } 
} 
 
/********************************************************************** 
 * Previous versions of this code erroneously handled the 'gid' field.  In 
 * order to upgrade from these versions, we set all of the 'gid' fields to 
 * FS_GROUP_ZERO.  This can cause space leaks of db nodes, but it is better 
 * than the assertion failures caused by the corrupt 'gid' fields. 
 */ 
 
static void upgrade_one_node (fs_db_t db, unsigned *transaction_count); 
static void 
fs_db_upgrade_gids (fs_db_t db) 
{ 
  cluster_id clust; 
  cluster_id nclust; 
  node_t n; 
  cluster_id next_level = db->top; 
  unsigned transaction_count = 0; 
 
  MSG_HIGH ("Upgrading database node gid fields", 0, 0, 0); 
 
  while (next_level != INVALID_CLUSTER_ID) { 
    clust = next_level; 
 
    db->clust[0] = clust; 
    db_read_wire (db, 0); 
    n = (node_t) db->node[0]; 
 
    if (n->level > 0) 
      next_level = get_id (&n->data[0]); 
    else 
      next_level = INVALID_CLUSTER_ID; 
 
    upgrade_one_node (db, &transaction_count); 
    n = (node_t) db->node[0]; 
    // dump_node (db, clust, n); 
 
    while (n->right != INVALID_CLUSTER_ID) { 
      db_unwire (db, 0); 
 
      nclust = n->right; 
      clust = nclust; 
 
      db->clust[0] = clust; 
      db_read_wire (db, 0); 
      n = (node_t) db->node[0]; 
 
      upgrade_one_node (db, &transaction_count); 
      n = (node_t) db->node[0]; 
      // dump_node (db, clust, n); 
    } 
    db_unwire (db, 0); 
  } 
 
  if (transaction_count > 0) { 
    db->buf->end_transaction (db->buf); 
  } 
 
  MSG_HIGH ("Done gid upgrade", 0, 0, 0); 
} 
 
/* Performs an upgrade of a single database node.  If the transaction count 
 * is zero, will start a transaction.  If it is non-zero after return, a 
 * transaction is open. 
 * May rewire the node as a write wire, so be sure to use the updated node 
 * pointer. */ 
static void 
upgrade_one_node (fs_db_t db, unsigned *transaction_count) 
{ 
  node_t n = (node_t) db->node[0]; 
 
  // MSG_HIGH ("Upgrade 0x%x", db->clust[0], 0, 0); 
  if (n->gid != FS_GROUP_ZERO) { 
    if (*transaction_count == 0) { 
      db->buf->start_transaction (db->buf); 
    } 
 
    db_write_wire (db, 0); 
    n = (node_t) db->node[0]; 
    n->gid = FS_GROUP_ZERO; 
    (*transaction_count)++; 
 
    if (*transaction_count > 16) { 
      db->buf->end_transaction (db->buf); 
      *transaction_count = 0; 
    } 
  } 
} 
 
/********************************************************************** 
 * Public interface 
 */ 
 
void 
fs_db_init (fs_db_t db, fsb_t buf) 
{ 
  node_t n; 
  int i; 
  int result; 
 
#ifdef FS_UNIT_TEST 
#error code not present 
#endif 
 
  db->buf = buf; 
 
  db->top = buf->get_field (buf, FS_FIELD_DB_ROOT); 
 
  if (db->top == INVALID_CLUSTER_ID) { 
    buf->start_transaction (buf); 
    result = buf->allocate (buf, 0, &db->top); 
 
    if (result < 0) 
    { 
      FS_ERR_FATAL ("Not enough space to create initial DB", 0, 0, 0); 
    } 
    n = (node_t) buf->write_wire (buf, db->top); 
    erase_node (n, 0, FS_GROUP_ZERO); 
    buf->free_wire (buf, db->top); 
    buf->set_field (buf, FS_FIELD_DB_ROOT, db->top); 
    buf->end_transaction (buf); 
  } 
 
  /* Clear out all of the wired clusters. */ 
  for (i = 0; i < FS_DB_ACTIVE_CLUSTERS; i++) { 
    db->clust[i] = INVALID_CLUSTER_ID; 
    db->node[i] = NULL; 
  } 
  db->active = 0; 
 
  /* Setup the iterators. */ 
  db->iter_count = 0; 
  for (i = 0; i < FS_DB_ACTIVE_ITERATORS; i++) { 
    db->iters[i].db = db; 
    db->iters[i].index = i; 
    db->iters[i].cluster = INVALID_CLUSTER_ID; 
    db->iters[i].pos = 0; 
  } 
 
  /* Determine the level of the top of the tree. */ 
  n = (node_t) buf->read_wire (buf, db->top); 
  db->top_level = n->level; 
  buf->free_wire (buf, db->top); 
 
  /* Perform a bug-fix upgrade of the database if needed. */ 
  if (fs_upgrade_check (FS_UPGRADE_DBGID_FIX)) { 
    fs_db_upgrade_gids (db); 
    fs_upgrade_finished (FS_UPGRADE_DBGID_FIX); 
  } 
} 
 
int 
fs_db_add (fs_db_t db, const void *key, unsigned key_length, 
    const void *value, unsigned value_length, uint32 gid, uint32 uid) 
{ 
  int result = 0; 
 
  down_find (db, key, key_length); 
 
  if (!db->matched) { 
    leaf_modify (db, 0, OP_ADD, 
        db->pos[0], key, key_length, value, value_length, gid, uid); 
  } else { 
    /* This key is already present. */ 
    result = -1; 
  } 
 
  db_unwire_all (db); 
 
  return result; 
} 
 
/* Search for the node and replace it.  The replacement does not need to be 
 * the same length as the original data. */ 
int 
fs_db_update (fs_db_t db, const void *key, unsigned key_length, 
    const void *value, unsigned value_length, uint32 gid, uint32 uid) 
{ 
  int result = 0; 
 
  down_find (db, key, key_length); 
 
  if (db->matched) { 
    leaf_modify (db, 0, OP_REPLACE, 
        db->pos[0], 0, 0, value, value_length, gid, uid); 
  } else { 
    result = -1; 
  } 
 
  db_unwire_all (db); 
 
  return result; 
} 
 
/* Delete an entry from the database. */ 
int 
fs_db_delete (fs_db_t db, void *key, unsigned key_length) 
{ 
  int result = 0; 
 
  down_find (db, key, key_length); 
 
  if (db->matched) { 
    leaf_modify (db, 0, OP_REMOVE, 
        db->pos[0], 0, 0, 0, 0, FS_GROUP_ZERO, FS_CURR_UID); 
    /* Pass in bogus GID to be set in leaf_modify */ 
  } else { 
    result = -1; 
  } 
 
  db_unwire_all (db); 
 
  return result; 
} 
 
int 
fs_db_lookup (fs_db_t db, const void *key, unsigned key_length, 
    void *value, unsigned *value_length) 
{ 
  int result = 0; 
  unsigned vlen; 
  unsigned klen; 
  int pos; 
  node_t n; 
 
  down_find (db, key, key_length); 
  pos = db->pos[0]; 
 
  if (db->matched) { 
    n = db->node[0]; 
 
    /* Check the size of the user's buffer.  If there isn't room, just 
     * error. */ 
    vlen = n->data[pos + 1]; 
    if (vlen <= *value_length) { 
      klen = n->data[pos]; 
      memcpy (value, &n->data[pos + 2 + klen], vlen); 
      *value_length = vlen; 
    } else { 
      /* It doesn't fit, copy over as much as there is room for, and then 
       * indicate the error condition. */ 
      klen = n->data[pos]; 
      memcpy (value, &n->data[pos + 2 + klen], *value_length); 
      *value_length = vlen; 
      result = -ENAMETOOLONG; 
    } 
  } else { 
    /* The key wasn't found, so this is also an error. */ 
    result = -ENOENT; 
  } 
 
  /* Free any used wires. */ 
  db_unwire_all (db); 
 
  return result; 
} 
 
/********************************************************************** 
 * Database iteration. 
 * 
 * The iterators themselves don't worry about the database being modified. 
 * Each routine that modifies leaf nodes will check the iterators and 
 * update any pointers if they have changed. 
 */ 
fs_db_iter_t 
fs_db_iter_start (fs_db_t db, void *key, unsigned key_length) 
{ 
  int i; 
  fs_db_iter_t iter; 
 
  /* Is there an available iterator? */ 
  if (db->iter_count == FS_DB_ACTIVE_ITERATORS) 
    return NULL; 
 
  /* Scan for the active one. */ 
  for (i = 0; i < FS_DB_ACTIVE_ITERATORS; i++) { 
    if (db->iters[i].cluster == INVALID_CLUSTER_ID) 
      break; 
  } 
 
  ASSERT (i < FS_DB_ACTIVE_ITERATORS); 
 
  iter = &db->iters[i]; 
  db->iter_count++; 
 
  /* Search for the appropriate cluster.  The iterator doesn't check for an 
   * exact match, so it doesn't matter if the key was found. */ 
  down_find (db, key, key_length); 
  iter->cluster = db->clust[0]; 
  iter->pos = db->pos[0]; 
 
  /* If the position is at the end of a cluster, and there are more 
   * clusters to our right, advance it to the beginning of the next 
   * cluster. */ 
  if (iter->pos == db->node[0]->used && 
      db->node[0]->right != INVALID_CLUSTER_ID) 
  { 
    iter->pos = 0; 
    iter->cluster = db->node[0]->right; 
  } 
 
  db_unwire_all (db); 
 
  return iter; 
} 
 
void 
fs_db_iter_free (fs_db_iter_t iter) 
{ 
  if (iter->cluster == INVALID_CLUSTER_ID) 
    return; 
 
  iter->db->iter_count--; 
  iter->cluster = INVALID_CLUSTER_ID; 
  iter->pos = 0; 
} 
 
/* XXX: Should have distinct return values that indicate between there 
 * being no more, and this key not fitting. */ 
int 
fs_db_iter_get_key (fs_db_iter_t iter, void *key, unsigned *key_length) 
{ 
  int result; 
  fs_db_t db = iter->db; 
  unsigned len; 
 
  ASSERT (iter->cluster != INVALID_CLUSTER_ID); 
 
  /* Get the cluster and wire it down. */ 
  db->clust[0] = iter->cluster; 
  db_read_wire (db, 0); 
 
  if (iter->pos == db->node[0]->used) { 
    /* EOF. */ 
    result = -EEOF; 
  } else if (db->node[0]->data[iter->pos] > *key_length) { 
    /* Key doesn't fit. */ 
    result = -2; /* Hmm. */ 
  } else { 
    len = db->node[0]->data[iter->pos]; 
 
    *key_length = len; 
    memcpy (key, &db->node[0]->data[iter->pos + 2], len); 
    result = 0; 
  } 
 
  db_unwire (db, 0); 
 
  return result; 
} 
 
int 
fs_db_iter_get_value (fs_db_iter_t iter, void *value, unsigned *value_length) 
{ 
  int result; 
  fs_db_t db = iter->db; 
  unsigned len; 
  unsigned pos; 
 
  ASSERT (iter->cluster != INVALID_CLUSTER_ID); 
 
  /* Get the cluster and wire it down. */ 
  db->clust[0] = iter->cluster; 
  db_read_wire (db, 0); 
 
  if (iter->pos == db->node[0]->used) { 
    /* EOF. */ 
    result = -EEOF; 
  } else if (db->node[0]->data[iter->pos + 1] > *value_length) { 
    /* Key doesn't fit. */ 
    result = -2; /* Hmm. */ 
  } else { 
    len = db->node[0]->data[iter->pos + 1]; 
    pos = iter->pos + 2 + db->node[0]->data[iter->pos]; 
 
    *value_length = len; 
    memcpy (value, &db->node[0]->data[pos], len); 
    result = 0; 
  } 
 
  db_unwire (db, 0); 
 
  return result; 
} 
 
int 
fs_db_iter_next (fs_db_iter_t iter) 
{ 
  int result; 
 
  fs_db_t db = iter->db; 
 
  ASSERT (iter->cluster != INVALID_CLUSTER_ID); 
 
  /* Get the current cluster and wire it down. */ 
  db->clust[0] = iter->cluster; 
  db_read_wire (db, 0); 
 
  if (iter->pos == db->node[0]->used) { 
    /* We're already at the end. */ 
    result = -1; 
  } else { 
    iter->pos += db->node[0]->data[iter->pos] + 
      db->node[0]->data[iter->pos + 1] + 2; 
 
    /* If we reach the end, go on. */ 
    if (iter->pos == db->node[0]->used && 
        db->node[0]->right != INVALID_CLUSTER_ID) 
    { 
      iter->pos = 0; 
      iter->cluster = db->node[0]->right; 
 
      /* Need to skip empty nodes. */ 
      while (1) { 
        db_unwire (db, 0); 
        db->clust[0] = iter->cluster; 
        db_read_wire (db, 0); 
 
        if (db->node[0]->right == INVALID_CLUSTER_ID || 
            db->node[0]->used != 0) 
          break; 
 
        iter->cluster = db->node[0]->right; 
      } 
    } 
 
    result = 0; 
  } 
 
  db_unwire (db, 0); 
 
  return result; 
} 
 
#ifdef FS_UNIT_TEST 
#error code not present 
#endif 
/*lint -restore */