[Ocfs2-tools-devel] [RFC 2/8] dx_dirs v4: indexed dirs core code in libocfs2
Coly Li
coly.li at suse.de
Wed Jan 20 07:23:31 PST 2010
This patch contains the core part of current indexed dirs support in libocfs2, includes,
- Indexed tree truncate.
- Build indexed tree.
- Insert dx record into indexed tree.
- Expand inlined dx_root to an extent tree.
- Rebalance indexed tree.
I worry about whether I use the extent tree code correctly, though I tried my best to understand the code, IMHO it's
still a gray area to me so far. Any review/comment is helpful !
Signed-off-by: Coly Li <coly.li at suse.de>
Cc: Tao Ma <tao.ma at oracle.com>
---
Makefile | 3
dir_indexed.c | 1117 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
dir_indexed.h | 25 +
inode.c | 4
4 files changed, 1148 insertions(+), 1 deletion(-)
diff --git a/libocfs2/Makefile b/libocfs2/Makefile
index 8e94cc3..6d3b563 100644
--- a/libocfs2/Makefile
+++ b/libocfs2/Makefile
@@ -76,7 +76,8 @@ CFILES = \
quota.c \
image.c \
xattr.c \
- extent_tree.c
+ extent_tree.c \
+ dir_indexed.c
HFILES = \
bitmap.h \
diff --git a/libocfs2/inode.c b/libocfs2/inode.c
index c1a8646..b41b286 100644
--- a/libocfs2/inode.c
+++ b/libocfs2/inode.c
@@ -139,6 +139,10 @@ static void ocfs2_swap_inode_second(struct ocfs2_dinode *di)
sb->s_uuid_hash = bswap_32(sb->s_uuid_hash);
sb->s_first_cluster_group = bswap_64(sb->s_first_cluster_group);
sb->s_xattr_inline_size = bswap_16(sb->s_xattr_inline_size);
+ sb->s_dx_seed[0] = bswap_32(sb->s_dx_seed[0]);
+ sb->s_dx_seed[1] = bswap_32(sb->s_dx_seed[1]);
+ sb->s_dx_seed[2] = bswap_32(sb->s_dx_seed[2]);
+ sb->s_dx_seed[3] = bswap_32(sb->s_dx_seed[3]);
} else if (di->i_flags & OCFS2_LOCAL_ALLOC_FL) {
struct ocfs2_local_alloc *la = &di->id2.i_lab;
diff --git a/libocfs2/dir_indexed.c b/libocfs2/dir_indexed.c
new file mode 100644
index 0000000..2d1d602
--- /dev/null
+++ b/libocfs2/dir_indexed.c
@@ -0,0 +1,1117 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2009 Novell. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <assert.h>
+#include <ocfs2/ocfs2.h>
+#include <ocfs2/bitops.h>
+#include <ocfs2/kernel-rbtree.h>
+#include "extent_tree.h"
+#include "dir_indexed.h"
+
+
+int ocfs2_dx_dir_truncate(ocfs2_filesys *fs,
+ uint64_t dir)
+{
+ struct ocfs2_dx_root_block *dx_root;
+ char *dx_root_buf = NULL, *di_buf = NULL;
+ struct ocfs2_dinode *di;
+ int ret = 0;
+
+ ret = ocfs2_malloc_block(fs->fs_io, &di_buf);
+ if (ret)
+ goto out;
+ ret = ocfs2_read_inode(fs, dir, di_buf);
+ if (ret)
+ goto out;
+ di = (struct ocfs2_dinode *)di_buf;
+
+ /* we have to trust i_dyn_features */
+ if ((!ocfs2_dir_indexed(di)) ||
+ (di->i_dyn_features & OCFS2_INLINE_DATA_FL))
+ goto out;
+
+ ret = ocfs2_malloc_block(fs->fs_io, &dx_root_buf);
+ if (ret)
+ goto out;
+ ret = ocfs2_read_dx_root(fs, (uint64_t)di->i_dx_root, dx_root_buf);
+ if (ret)
+ goto out;
+ dx_root = (struct ocfs2_dx_root_block *)dx_root_buf;
+
+ if (dx_root->dr_flags & OCFS2_DX_FLAG_INLINE)
+ goto remove_index;
+
+ ret = ocfs2_dir_indexed_tree_truncate(fs, dx_root);
+
+remove_index:
+ ret = ocfs2_delete_dx_root(fs, dx_root->dr_blkno);
+
+ di->i_dyn_features &= ~OCFS2_INDEXED_DIR_FL;
+ di->i_dx_root = 0;
+
+ ret = ocfs2_write_inode(fs, di->i_blkno, (char *)di);
+out:
+ if (di_buf)
+ ocfs2_free(&di_buf);
+ if (dx_root_buf)
+ ocfs2_free(&dx_root_buf);
+ return ret;
+}
+
+static unsigned int ocfs2_figure_dirent_hole(struct ocfs2_dir_entry *de)
+{
+ unsigned int hole;
+
+ if (de->inode == 0)
+ hole = de->rec_len;
+ else
+ hole = de->rec_len - OCFS2_DIR_REC_LEN(de->name_len);
+
+ return hole;
+}
+
+static int ocfs2_find_max_rec_len(ocfs2_filesys *fs,
+ char *buf)
+{
+ int size, this_hole, largest_hole = 0;
+ char *de_buf, *limit, *start;
+ struct ocfs2_dir_block_trailer *trailer;
+ struct ocfs2_dir_entry *de;
+
+ start = buf;
+ trailer = ocfs2_dir_trailer_from_block(fs, buf);
+ size = ocfs2_dir_trailer_blk_off(fs);
+ limit = start + size;
+ de_buf = start;
+ de = (struct ocfs2_dir_entry *)de_buf;
+ do {
+ if (de_buf != (char *)trailer) {
+ this_hole = ocfs2_figure_dirent_hole(de);
+ if (this_hole > largest_hole)
+ largest_hole = this_hole;
+ }
+
+ de_buf += de->rec_len;
+ de = (struct ocfs2_dir_entry *)de_buf;
+ } while (de_buf < limit);
+
+ if (largest_hole >= OCFS2_DIR_MIN_REC_LEN)
+ return largest_hole;
+ return 0;
+}
+
+struct trailer_ctxt {
+ struct ocfs2_dx_root_block *dx_root;
+ struct ocfs2_dinode *di;
+};
+
+
+static int dir_trailer_func(ocfs2_filesys *fs,
+ uint64_t blkno,
+ uint64_t bcount,
+ uint16_t ext_flags,
+ void *priv_data)
+{
+ struct trailer_ctxt *ctxt = (struct trailer_ctxt *)priv_data;
+ struct ocfs2_dinode *di = ctxt->di;
+ struct ocfs2_dx_root_block *dx_root = ctxt->dx_root;
+ struct ocfs2_dir_block_trailer *trailer;
+ int max_rec_len = 0;
+ errcode_t ret = 0;
+ char *blk = NULL;
+
+ ret = ocfs2_malloc_block(fs->fs_io, &blk);
+ if (ret)
+ goto out;
+
+ /* here we don't trust trailer, cannot use
+ * ocfs2_read_dir_block() */
+ ret = ocfs2_read_blocks(fs, blkno, 1, blk);
+ if (ret)
+ goto out;
+
+ max_rec_len = ocfs2_find_max_rec_len(fs, blk);
+ trailer = ocfs2_dir_trailer_from_block(fs, blk);
+ trailer->db_free_rec_len = max_rec_len;
+ ocfs2_init_dir_trailer(fs, di, blkno, blk);
+
+ if (max_rec_len) {
+ trailer->db_free_next = dx_root->dr_free_blk;
+ dx_root->dr_free_blk = blkno;
+ }
+
+ /* comput trailer->db_check here, after writes out,
+ * trailer is trustable */
+ ret = ocfs2_write_dir_block(fs, di, blkno, blk);
+out:
+ if (blk)
+ ocfs2_free(&blk);
+ return ret;
+}
+
+static errcode_t ocfs2_init_dir_trailers(ocfs2_filesys *fs,
+ struct ocfs2_dinode *di,
+ struct ocfs2_dx_root_block *dx_root)
+{
+ errcode_t ret = 0;
+ struct trailer_ctxt ctxt;
+
+ if (di->i_dyn_features & OCFS2_INLINE_DATA_FL) {
+ ret = OCFS2_ET_INODE_NOT_VALID;
+ goto out;
+ }
+
+ ctxt.di = di;
+ ctxt.dx_root = dx_root;
+
+ ret = ocfs2_block_iterate_inode(fs, di,
+ 0, dir_trailer_func, &ctxt);
+out:
+ return ret;
+}
+
+static void ocfs2_dx_entry_list_insert(struct ocfs2_dx_entry_list *entry_list,
+ struct ocfs2_dx_hinfo *hinfo,
+ uint64_t dirent_blk)
+{
+ int i;
+ struct ocfs2_dx_entry *dx_entry;
+
+ i = entry_list->de_num_used;
+ dx_entry = &entry_list->de_entries[i];
+
+ memset(dx_entry, 0, sizeof(struct ocfs2_dx_entry));
+ dx_entry->dx_major_hash = hinfo->major_hash;
+ dx_entry->dx_minor_hash = hinfo->minor_hash;
+ dx_entry->dx_dirent_blk = dirent_blk;
+
+ entry_list->de_num_used += 1;
+}
+
+struct dx_insert_ctxt {
+ uint64_t dir_blkno;
+ uint64_t dx_root_blkno;
+ ocfs2_filesys *fs;
+};
+
+
+inline static int ocfs2_inline_dx_has_space(struct ocfs2_dx_root_block *dx_root)
+{
+ struct ocfs2_dx_entry_list *entry_list;
+
+ entry_list = &dx_root->dr_entries;
+
+ if (entry_list->de_num_used >= entry_list->de_count)
+ return OCFS2_ET_DIR_NO_SPACE;
+
+ return 1;
+}
+
+static struct ocfs2_dx_leaf **ocfs2_dx_dir_alloc_leaves(ocfs2_filesys *fs,
+ int *ret_num_leaves)
+{
+ int ret, num_dx_leaves = ocfs2_clusters_to_blocks(fs, 1);
+ char *dx_leaves_buf = NULL;
+
+
+ ret = ocfs2_malloc0(num_dx_leaves * sizeof(void *), &dx_leaves_buf);
+ if (dx_leaves_buf && ret_num_leaves)
+ *ret_num_leaves = num_dx_leaves;
+
+ return (struct ocfs2_dx_leaf **)dx_leaves_buf;
+}
+
+static int ocfs2_dx_dir_format_cluster(ocfs2_filesys *fs,
+ struct ocfs2_dx_leaf **dx_leaves,
+ int num_dx_leaves,
+ uint64_t start_blk)
+{
+ int ret, i;
+ struct ocfs2_dx_leaf *dx_leaf;
+ char *blk;
+
+ for (i = 0; i < num_dx_leaves; i++) {
+ ret = ocfs2_malloc_block(fs->fs_io, &blk);
+ if (ret)
+ goto out;
+
+ dx_leaves[i] = (struct ocfs2_dx_leaf *)blk;
+ dx_leaf = (struct ocfs2_dx_leaf *)blk;
+
+ memset(dx_leaf, 0, fs->fs_blocksize);
+ strcpy((char *)dx_leaf->dl_signature, OCFS2_DX_LEAF_SIGNATURE);
+ dx_leaf->dl_fs_generation = fs->fs_super->i_fs_generation;
+ dx_leaf->dl_blkno = start_blk + i;
+ dx_leaf->dl_list.de_count = ocfs2_dx_entries_per_leaf(fs->fs_blocksize);
+
+ ret = ocfs2_write_dx_leaf(fs, dx_leaf->dl_blkno, dx_leaf);
+ if (ret)
+ goto out;
+ }
+ ret = 0;
+out:
+ return ret;
+}
+
+static inline unsigned int __ocfs2_dx_dir_hash_idx(ocfs2_filesys *fs,
+ uint32_t minor_hash)
+{
+ unsigned int cbits, bbits, dx_mask;
+
+ cbits = OCFS2_RAW_SB(fs->fs_super)->s_clustersize_bits;
+ bbits = OCFS2_RAW_SB(fs->fs_super)->s_blocksize_bits;
+ dx_mask = (1 << (cbits - bbits)) -1;
+
+ return (minor_hash & dx_mask);
+}
+
+static inline unsigned int ocfs2_dx_dir_hash_idx(ocfs2_filesys *fs,
+ struct ocfs2_dx_hinfo *hinfo)
+{
+ return __ocfs2_dx_dir_hash_idx(fs, hinfo->minor_hash);
+}
+
+static void ocfs2_dx_dir_leaf_insert_tail(struct ocfs2_dx_leaf *dx_leaf,
+ struct ocfs2_dx_entry *dx_new_entry)
+{
+ int i;
+
+ i = dx_leaf->dl_list.de_num_used;
+ dx_leaf->dl_list.de_entries[i] = *dx_new_entry;
+
+ dx_leaf->dl_list.de_num_used += 1;
+}
+
+/* XXX should add bail part */
+static int ocfs2_expand_inline_dx_root(ocfs2_filesys *fs,
+ struct ocfs2_dx_root_block *dx_root)
+{
+ int ret, num_dx_leaves, i, j;
+ uint64_t start_blkno = 0;
+ uint32_t clusters_found = 0;
+ struct ocfs2_dx_leaf **dx_leaves = NULL;
+ struct ocfs2_dx_leaf *target_leaf;
+ struct ocfs2_dx_entry_list *entry_list;
+ struct ocfs2_extent_tree et;
+ struct ocfs2_dx_entry *dx_entry;
+
+ dx_leaves = ocfs2_dx_dir_alloc_leaves(fs, &num_dx_leaves);
+ if (!dx_leaves) {
+ ret = OCFS2_ET_NO_MEMORY;
+ goto out;
+ }
+
+ ret = ocfs2_new_clusters(fs, 1, 1, &start_blkno, &clusters_found);
+ if (ret)
+ goto out;
+ assert(clusters_found == 1);
+ ret = ocfs2_dx_dir_format_cluster(fs, dx_leaves,
+ num_dx_leaves, start_blkno);
+
+ /* Transfer the entries from inline dx_root into the appropriate
+ * block
+ */
+ entry_list = &dx_root->dr_entries;
+
+ for (i = 0; i < entry_list->de_num_used; i++) {
+ dx_entry = &entry_list->de_entries[i];
+ j = __ocfs2_dx_dir_hash_idx(fs, dx_entry->dx_minor_hash);
+ target_leaf = (struct ocfs2_dx_leaf *)dx_leaves[j];
+
+ ocfs2_dx_dir_leaf_insert_tail(target_leaf, dx_entry);
+ ret = ocfs2_write_dx_leaf(fs, target_leaf->dl_blkno,
+ target_leaf);
+ if (ret)
+ goto out;
+ }
+
+ dx_root->dr_flags &= ~OCFS2_DX_FLAG_INLINE;
+ memset(&dx_root->dr_list, 0, fs->fs_blocksize -
+ offsetof(struct ocfs2_dx_root_block, dr_list));
+ dx_root->dr_list.l_count =
+ ocfs2_extent_recs_per_dx_root(fs->fs_blocksize);
+
+ /* This should never fail considering we start with an empty
+ * dx_root */
+ ocfs2_init_dx_root_extent_tree(&et, fs, (char *)dx_root, dx_root->dr_blkno);
+ ret = ocfs2_tree_insert_extent(fs, &et, 0, start_blkno, 1, 0);
+ if (ret)
+ goto out;
+
+out:
+ return ret;
+}
+
+static int ocfs2_dx_dir_lookup_rec(ocfs2_filesys *fs,
+ struct ocfs2_dx_root_block *dx_root,
+ struct ocfs2_extent_list *el,
+ uint32_t major_hash,
+ uint32_t *ret_cpos,
+ uint64_t *ret_phys_blkno,
+ unsigned int *ret_clen)
+{
+ int ret = 0, i, found;
+ struct ocfs2_extent_block *eb;
+ struct ocfs2_extent_rec *rec = NULL;
+ char *eb_buf = NULL;
+
+ if (el->l_tree_depth) {
+ ret = ocfs2_tree_find_leaf(fs,
+ &dx_root->dr_list,
+ dx_root->dr_blkno,
+ (char *)dx_root,
+ major_hash, &eb_buf);
+ if (ret)
+ goto out;
+
+ eb = (struct ocfs2_extent_block *)eb_buf;
+ el = &eb->h_list;
+
+ if (el->l_tree_depth) {
+ ret = OCFS2_ET_CORRUPT_EXTENT_BLOCK;
+ goto out;
+ }
+ }
+
+ found = 0;
+ for (i = el->l_next_free_rec - 1; i >= 0; i--) {
+ rec = &el->l_recs[i];
+ if (rec->e_cpos <= major_hash) {
+ found = 1;
+ break;
+ }
+ }
+ if (!found) {
+ ret = OCFS2_ET_CORRUPT_EXTENT_BLOCK;
+ goto out;
+ }
+
+ if (ret_phys_blkno)
+ *ret_phys_blkno = rec->e_blkno;
+ if (ret_cpos)
+ *ret_cpos = rec->e_cpos;
+ if (ret_clen)
+ *ret_clen = rec->e_leaf_clusters;
+
+out:
+ if (eb_buf)
+ ocfs2_free(&eb_buf);
+ return ret;
+}
+
+static int ocfs2_dx_dir_lookup(ocfs2_filesys *fs,
+ struct ocfs2_dx_root_block *dx_root,
+ struct ocfs2_extent_list *el,
+ struct ocfs2_dx_hinfo *hinfo,
+ uint32_t *ret_cpos,
+ uint64_t *ret_phys_blkno)
+{
+ int ret = 0;
+ unsigned int cend, clen;
+ uint32_t cpos = 0;
+ uint64_t blkno = 0;
+ uint32_t name_hash = hinfo->major_hash;
+
+ ret = ocfs2_dx_dir_lookup_rec(fs, dx_root, el,
+ name_hash, &cpos, &blkno, &clen);
+ if (ret)
+ goto out;
+ cend = cpos + clen;
+ if (name_hash >= cend) {
+ blkno += ocfs2_clusters_to_blocks(fs, clen - 1);
+ cpos += clen - 1;
+ } else {
+ blkno += ocfs2_clusters_to_blocks(fs, name_hash - cpos);
+ cpos = name_hash;
+ }
+
+ blkno += ocfs2_dx_dir_hash_idx(fs, hinfo);
+
+ if (ret_phys_blkno)
+ *ret_phys_blkno = blkno;
+ if (ret_cpos)
+ *ret_cpos = cpos;
+
+out:
+ return ret;
+}
+
+static int dx_leaf_sort_cmp(const void *a, const void *b)
+{
+ const struct ocfs2_dx_entry *e1 = a;
+ const struct ocfs2_dx_entry *e2 = b;
+ uint32_t major_hash1 = e1->dx_major_hash;
+ uint32_t major_hash2 = e2->dx_major_hash;
+ uint32_t minor_hash1 = e1->dx_minor_hash;
+ uint32_t minor_hash2 = e2->dx_minor_hash;
+
+ if (major_hash1 > major_hash2)
+ return 1;
+ if (major_hash1 < major_hash2)
+ return -1;
+
+ /* it is not strictly necessary to sort by minor */
+ if (minor_hash1 > minor_hash2)
+ return 1;
+ if (minor_hash1 < minor_hash2)
+ return -1;
+ return 0;
+}
+
+static void dx_leaf_sort_swap(void *a, void *b, int size)
+{
+ struct ocfs2_dx_entry *e1 = a;
+ struct ocfs2_dx_entry *e2 = b;
+ struct ocfs2_dx_entry tmp;
+
+ assert(size == sizeof (struct ocfs2_dx_entry));
+
+ tmp = *e1;
+ *e1 = *e2;
+ *e2 = tmp;
+}
+
+static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf)
+{
+ struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
+ int i, num = dl_list->de_num_used;
+
+ for (i = 0; i < (num - 1); i++) {
+ if (dl_list->de_entries[i].dx_major_hash !=
+ dl_list->de_entries[i + 1].dx_major_hash)
+ return 0;
+ }
+ return 1;
+}
+
+/*
+ * Find the optimal value to split this leaf on. This expects the leaf
+ * entries to be in sorted order.
+ *
+ * leaf_cpos is the cpos of the leaf we're splitting. insert_hash is
+ * the hash we want to insert.
+ *
+ * This function is only concerned with the major hash - that which
+ * determines which cluster an item belongs to.
+ */
+static int ocfs2_dx_dir_find_leaf_split(struct ocfs2_dx_leaf *dx_leaf,
+ uint32_t leaf_cpos,
+ uint32_t insert_hash,
+ uint32_t *split_hash)
+{
+ struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
+ int i, num_used = dl_list->de_num_used;
+ int allsame;
+
+ /*
+ * There's a couple rare, but nasty corner cases we have to
+ * check for here. All of them involve a leaf where all value
+ * have the same hash, which is what we look for first.
+ *
+ * Most of the time, all of the above is false, and we simply
+ * pick the median value for a split.
+ */
+ allsame = ocfs2_dx_leaf_same_major(dx_leaf);
+ if (allsame) {
+ uint32_t val = dl_list->de_entries[0].dx_major_hash;
+ if (val == insert_hash) {
+ /*
+ * No matter where we would choose to split,
+ * the new entry would want to occupy the same
+ * block as these. Since there's no space left
+ * in their existing block, we know there
+ * won't be space after the split.
+ */
+ return OCFS2_ET_DIR_NO_SPACE;
+ }
+
+ if (val == leaf_cpos) {
+ /*
+ * Because val is the same as leaf_cpos (which
+ * is the smallest value this leaf can have),
+ * yet is not equal to insert_hash, then we
+ * know that insert_hash *must* be larger than
+ * val (and leaf_cpos). At least cpos+1 in value.
+ *
+ * We also know then, that there cannot be an
+ * adjacent extent (otherwise we'd be looking
+ * at it). Choosing this value gives us a
+ * chance to get some continguousness.
+ */
+ *split_hash = leaf_cpos + 1;
+ return 0;
+ }
+
+ if (val > insert_hash) {
+ /*
+ * val can not be the same as insert_hash, and
+ * also must be larger than leaf_cpos. Also,
+ * we know that there can't be a leaf between
+ * cpos and val, otherwise the entries with
+ * hash 'val' would be there.
+ */
+ *split_hash = val;
+ return 0;
+ }
+
+ *split_hash = insert_hash;
+ return 0;
+ }
+
+ /*
+ * Since the records are sorted and the checks above
+ * guaranteed that not all records in this block are the same,
+ * we simple travel forward, from the median, and pick the 1st
+ * record whose value is larger than leaf_cpos.
+ */
+ for (i = (num_used /2); i < num_used; i++) {
+ if (dl_list->de_entries[i].dx_major_hash > leaf_cpos)
+ break;
+ }
+
+ assert(i < num_used); /* Should be impossible */
+ *split_hash = dl_list->de_entries[i].dx_major_hash;
+ return 0;
+}
+
+static int ocfs2_read_dx_leaves(ocfs2_filesys *fs,
+ uint64_t start,
+ int num,
+ struct ocfs2_dx_leaf **orig_dx_leaves)
+{
+ int ret, i;
+ struct ocfs2_dx_leaf *dx_leaf;
+ for (i = 0; i < num; i++) {
+ dx_leaf = orig_dx_leaves[i];
+ assert(dx_leaf);
+ ret = ocfs2_read_blocks(fs, start + i, 1, (char *)dx_leaf);
+ if (ret)
+ goto out;
+ }
+ ret = 0;
+out:
+ return ret;
+}
+
+static int __ocfs2_dx_dir_new_cluster(ocfs2_filesys *fs,
+ uint32_t cpos,
+ struct ocfs2_dx_leaf **dx_leaves,
+ int num_dx_leaves,
+ uint64_t *ret_phys_blkno)
+{
+ int ret;
+ uint32_t num;
+ uint64_t phys;
+
+ ret = ocfs2_new_clusters(fs, 1, 1, &phys, &num);
+ if (ret)
+ goto out;
+ assert(num == 1);
+ ret = ocfs2_dx_dir_format_cluster(fs, dx_leaves,
+ num_dx_leaves, phys);
+ if (ret)
+ goto out;
+
+ *ret_phys_blkno = phys;
+
+out:
+ return ret;
+}
+
+static int ocfs2_dx_dir_new_cluster(ocfs2_filesys *fs,
+ struct ocfs2_extent_tree *et,
+ uint32_t cpos,
+ struct ocfs2_dx_leaf **dx_leaves,
+ int num_dx_leaves)
+{
+ int ret;
+ uint64_t phys_blkno;
+
+ ret = __ocfs2_dx_dir_new_cluster(fs, cpos, dx_leaves,
+ num_dx_leaves, &phys_blkno);
+ if (ret)
+ goto out;
+
+ ret = ocfs2_tree_insert_extent(fs, et, cpos, phys_blkno, 1, 0);
+
+out:
+ return ret;
+}
+
+
+static int ocfs2_dx_dir_transfer_leaf(ocfs2_filesys *fs,
+ uint32_t split_hash,
+ struct ocfs2_dx_leaf *tmp_dx_leaf,
+ struct ocfs2_dx_leaf **orig_dx_leaves,
+ uint64_t orig_dx_leaves_blkno,
+ struct ocfs2_dx_leaf **new_dx_leaves,
+ uint64_t new_dx_leaves_blkno,
+ int num_dx_leaves)
+{
+ int ret, i, j, num_used;
+ uint32_t major_hash;
+ struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf;
+ struct ocfs2_dx_entry_list *orig_list, *new_list, *tmp_list;
+ struct ocfs2_dx_entry *dx_entry;
+
+ tmp_list = &tmp_dx_leaf->dl_list;
+
+ for (i = 0; i < num_dx_leaves; i++) {
+ orig_dx_leaf = orig_dx_leaves[i];
+ orig_list = &orig_dx_leaf->dl_list;
+ new_dx_leaf = new_dx_leaves[i];
+ new_list = &new_dx_leaf->dl_list;
+
+ num_used = orig_list->de_num_used;
+
+ memcpy(tmp_dx_leaf, orig_dx_leaf, fs->fs_blocksize);
+ tmp_list->de_num_used = 0;
+ memset(&tmp_list->de_entries, 0,
+ sizeof(struct ocfs2_dx_entry) * num_used);
+
+ for (j = 0; j < num_used; j++) {
+ dx_entry = &orig_list->de_entries[j];
+ major_hash = dx_entry->dx_major_hash;
+ if (major_hash >= split_hash)
+ ocfs2_dx_dir_leaf_insert_tail(new_dx_leaf,
+ dx_entry);
+ else
+ ocfs2_dx_dir_leaf_insert_tail(tmp_dx_leaf,
+ dx_entry);
+ }
+ memcpy(orig_dx_leaf, tmp_dx_leaf, fs->fs_blocksize);
+
+ ret = ocfs2_write_dx_leaf(fs, orig_dx_leaves_blkno + j,
+ (char *)orig_dx_leaf);
+ if (ret)
+ goto out;
+ ret = ocfs2_write_dx_leaf(fs, new_dx_leaves_blkno + j,
+ (char *)new_dx_leaf);
+ if (ret)
+ goto out;
+ }
+
+out:
+ return ret;
+}
+
+static int ocfs2_dx_dir_free_leaves(ocfs2_filesys *fs,
+ struct ocfs2_dx_leaf **dx_leaves)
+{
+ int i, num;
+
+ num = ocfs2_clusters_to_blocks(fs, 1);
+ for (i = 0; i < num; i++) {
+ if (dx_leaves[i])
+ ocfs2_free(&dx_leaves[i]);
+ }
+ ocfs2_free(dx_leaves);
+ return 0;
+}
+
+/* from Linux kernel lib/sort.c */
+static void ocfs2_sort(void *base, size_t num, size_t size,
+ int (*cmp_func)(const void *, const void *),
+ void (*swap_func)(void *, void *, int size))
+{
+ /* pre-scale counters for performance */
+ int i = (num/2 - 1) * size, n = num * size, c, r;
+
+ /* heapify */
+ for (; i >= 0; i -= size) {
+ for (r = i; r * 2 + size < n; r = c) {
+ c = r * 2 + size;
+ if (c < n - size &&
+ cmp_func(base + c, base + c + size) < 0)
+ c += size;
+ if (cmp_func(base + r, base + c) >= 0)
+ break;
+ swap_func(base + r, base + c, size);
+ }
+ }
+
+ /* sort */
+ for (i = n - size; i > 0; i -= size) {
+ swap_func(base, base + i, size);
+ for (r = 0; r * 2 + size < i; r = c) {
+ c = r * 2 + size;
+ if (c < i - size &&
+ cmp_func(base + c, base + c + size) < 0)
+ c += size;
+ if (cmp_func(base + r, base + c) >= 0)
+ break;
+ swap_func(base + r, base + c, size);
+ }
+ }
+
+}
+
+static int ocfs2_dx_dir_rebalance(ocfs2_filesys *fs,
+ struct ocfs2_dx_root_block *dx_root,
+ struct ocfs2_dx_leaf *dx_leaf,
+ struct ocfs2_dx_hinfo *hinfo,
+ uint32_t leaf_cpos,
+ uint64_t leaf_blkno)
+{
+ struct ocfs2_extent_tree et;
+ struct ocfs2_dx_leaf **orig_dx_leaves = NULL;
+ struct ocfs2_dx_leaf **new_dx_leaves = NULL;
+ struct ocfs2_dx_leaf *tmp_dx_leaf = NULL;
+ uint32_t insert_hash = hinfo->major_hash;
+ uint32_t split_hash, cpos;
+ uint64_t orig_leaves_start;
+ int ret, num_used, num_dx_leaves;
+
+ ocfs2_init_dx_root_extent_tree(&et, fs, (char *)dx_root, dx_root->dr_blkno);
+
+ if (dx_root->dr_clusters == UINT_MAX) {
+ ret = OCFS2_ET_DIR_NO_SPACE;
+ goto out;
+ }
+
+ num_used = dx_leaf->dl_list.de_num_used;
+ if (num_used < dx_leaf->dl_list.de_count) {
+ ret = OCFS2_ET_DX_BALANCE_EMPTY_LEAF;
+ goto out;
+ }
+
+ orig_dx_leaves = ocfs2_dx_dir_alloc_leaves(fs, &num_dx_leaves);
+ if (!orig_dx_leaves) {
+ ret = OCFS2_ET_NO_MEMORY;
+ goto out;
+ }
+
+ new_dx_leaves = ocfs2_dx_dir_alloc_leaves(fs, NULL);
+ if (!new_dx_leaves) {
+ ret = OCFS2_ET_NO_MEMORY;
+ goto out;
+ }
+
+ ocfs2_sort(dx_leaf->dl_list.de_entries, num_used,
+ sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp,
+ dx_leaf_sort_swap);
+
+ ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos,
+ insert_hash, &split_hash);
+ if (ret)
+ goto out;
+
+ ret = ocfs2_malloc_block(fs->fs_io, (char **)(&tmp_dx_leaf));
+ if (ret)
+ goto out;
+ orig_leaves_start = ocfs2_blocks_to_clusters(fs, leaf_blkno);
+ ret = ocfs2_read_dx_leaves(fs, orig_leaves_start, num_dx_leaves,
+ orig_dx_leaves);
+ if (ret)
+ goto out;
+
+ cpos = split_hash;
+ ret = ocfs2_dx_dir_new_cluster(fs, &et, cpos,
+ new_dx_leaves, num_dx_leaves);
+ if (ret)
+ goto out;
+ ret = ocfs2_dx_dir_transfer_leaf(fs, split_hash, tmp_dx_leaf,
+ orig_dx_leaves, orig_leaves_start,
+ new_dx_leaves, ocfs2_clusters_to_blocks(fs, cpos),
+ num_dx_leaves);
+
+out:
+ if (tmp_dx_leaf)
+ ocfs2_free((char **)(&tmp_dx_leaf));
+
+ if (orig_dx_leaves)
+ ocfs2_dx_dir_free_leaves(fs, orig_dx_leaves);
+ if (new_dx_leaves)
+ ocfs2_dx_dir_free_leaves(fs, new_dx_leaves);
+ return ret;
+}
+
+static int ocfs2_find_dir_space_dx(ocfs2_filesys *fs,
+ struct ocfs2_dx_root_block *dx_root,
+ const char *name, int namelen,
+ struct ocfs2_dir_lookup_result *lookup)
+{
+ int ret, rebalanced = 0;
+ struct ocfs2_dx_leaf *dx_leaf;
+ char *dx_leaf_buf = NULL;
+ uint64_t blkno;
+ uint32_t leaf_cpos;
+
+ ret = ocfs2_malloc_block(fs->fs_io, &dx_leaf_buf);
+ if (ret)
+ goto out;
+
+restart_search:
+ ret = ocfs2_dx_dir_lookup(fs, dx_root, &dx_root->dr_list,
+ &lookup->dl_hinfo, &leaf_cpos, &blkno);
+ if (ret)
+ goto out;
+ ret = ocfs2_read_dx_leaf(fs, blkno, dx_leaf_buf);
+ if (ret)
+ goto out;
+ dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_buf;
+ if (dx_leaf->dl_list.de_num_used >= dx_leaf->dl_list.de_count) {
+ if (rebalanced) {
+ /*
+ * Rebalancing should have provided us with
+ * space in an appropriate leaf.
+ */
+ ret = OCFS2_ET_DIR_NO_SPACE;
+ goto out;
+ }
+
+ ret = ocfs2_dx_dir_rebalance(fs, dx_root, dx_leaf,
+ &lookup->dl_hinfo, leaf_cpos, blkno);
+ if (ret)
+ goto out;
+ rebalanced = 1;
+ goto restart_search;
+ }
+ lookup->dl_dx_leaf_blkno = blkno;
+
+out:
+ if (dx_leaf_buf)
+ ocfs2_free(&dx_leaf_buf);
+ return ret;
+}
+
+/*
+ * Hashing code adapted from ext3
+ */
+#define DELTA 0x9E3779B9
+
+static void TEA_transform(uint32_t buf[4], uint32_t const in[])
+{
+ uint32_t sum = 0;
+ uint32_t b0 = buf[0], b1 = buf[1];
+ uint32_t a = in[0], b = in[1], c = in[2], d = in[3];
+ int n = 16;
+
+ do {
+ sum += DELTA;
+ b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
+ b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
+ } while (--n);
+
+ buf[0] += b0;
+ buf[1] += b1;
+}
+
+static void str2hashbuf(const char *msg, int len, uint32_t *buf, int num)
+{
+ uint32_t pad, val;
+ int i;
+
+ pad = (uint32_t)len | ((uint32_t)len << 8);
+ pad |= pad << 16;
+
+ val = pad;
+ if (len > (num * 4))
+ len = num * 4;
+ for (i = 0; i < len; i++) {
+ if ((i % 4) == 0)
+ val = pad;
+ val = msg[i] + (val << 8);
+ if ((i % 4) == 3) {
+ *buf++ = val;
+ val = pad;
+ num --;
+ }
+ }
+ if (--num >= 0)
+ *buf++ = val;
+ while(--num >= 0)
+ *buf++ = pad;
+}
+
+static void ocfs2_dx_dir_name_hash(ocfs2_filesys *fs,
+ const char *name,
+ int len,
+ struct ocfs2_dx_hinfo *hinfo)
+{
+ const char *p;
+ uint32_t in[8], buf[4];
+
+ /*
+ * XXX: Is this really necessary, if the index is never looked
+ * at by readdir? Is a hash value of '0' a bad idea ?
+ */
+ if ((len == 1 && !strncmp(".", name, 1)) ||
+ (len == 2 && !strncmp("..", name, 2))) {
+ buf[0] = buf[1] = 0;
+ goto out;
+ }
+
+ memcpy(buf, OCFS2_RAW_SB(fs->fs_super)->s_dx_seed, sizeof(buf));
+
+ p = name;
+ while(len > 0) {
+ str2hashbuf(p, len, in, 4);
+ TEA_transform(buf, in);
+ len -= 16;
+ p += 16;
+ }
+
+out:
+ hinfo->major_hash = buf[0];
+ hinfo->minor_hash = buf[1];
+}
+
+static int ocfs2_dx_dir_insert(struct ocfs2_dir_entry *dentry,
+ int offset,
+ int blocksize,
+ char *buf,
+ void *priv_data)
+{
+ errcode_t ret = 0;
+ char *dx_buf = NULL;
+ char *dx_leaf_buf = NULL;
+ struct ocfs2_dx_root_block *dx_root;
+ struct ocfs2_dx_leaf *dx_leaf;
+ struct ocfs2_dir_lookup_result lookup = {0,};
+ struct ocfs2_dx_entry_list *entry_list;
+ struct dx_insert_ctxt *ctxt = (struct dx_insert_ctxt *)priv_data;
+ ocfs2_filesys *fs = ctxt->fs;
+ uint64_t dx_root_blkno = ctxt->dx_root_blkno;
+
+ ret = ocfs2_malloc_block(fs->fs_io, &dx_buf);
+ if (ret)
+ goto out;
+
+ ret = ocfs2_malloc_block(fs->fs_io, &dx_leaf_buf);
+ if (ret)
+ goto out;
+
+ ret = ocfs2_read_dx_root(fs, dx_root_blkno, dx_buf);
+ if (ret)
+ goto out;
+ dx_root = (struct ocfs2_dx_root_block *)dx_buf;
+
+ if (dx_root->dr_flags & OCFS2_DX_FLAG_INLINE) {
+ ret = (ocfs2_inline_dx_has_space(dx_root));
+ if (ret == 0) {
+ entry_list = &dx_root->dr_entries;
+ goto insert_into_entries;
+ } else {
+ /* root block is full, expand it to an extent */
+ ret = ocfs2_expand_inline_dx_root(fs, dx_root);
+ if (ret)
+ goto out;
+ }
+ }
+
+ ocfs2_dx_dir_name_hash(fs, dentry->name,
+ dentry->name_len, &lookup.dl_hinfo);
+ ret = ocfs2_find_dir_space_dx(fs, dx_root,
+ dentry->name, dentry->name_len, &lookup);
+ if (ret)
+ goto out;
+ ret = ocfs2_read_dx_leaf(fs, lookup.dl_dx_leaf_blkno, dx_leaf_buf);
+ if (ret)
+ goto out;
+ dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_buf;
+ entry_list = &dx_leaf->dl_list;
+
+insert_into_entries:
+ ocfs2_dx_entry_list_insert(entry_list, &lookup.dl_hinfo, dentry->inode);
+ ret = ocfs2_write_dx_leaf(fs, dx_leaf->dl_blkno, dx_leaf);
+ if (ret)
+ goto out;
+ dx_root->dr_num_entries += 1;
+ ret = ocfs2_write_dx_root(fs, dx_root_blkno, dx_buf);
+out:
+ if (dx_leaf_buf)
+ ocfs2_free(&dx_leaf_buf);
+ if (dx_buf)
+ ocfs2_free(&dx_buf);
+ return ret;
+}
+
+
+errcode_t ocfs2_dx_dir_build(ocfs2_filesys *fs,
+ uint64_t dir)
+{
+ int ret = 0;
+ uint64_t dr_blkno;
+ char *dx_buf = NULL, *di_buf = NULL;
+ struct ocfs2_dinode *di;
+ struct ocfs2_dx_root_block *dx_root;
+ struct dx_insert_ctxt ctxt;
+
+ ret = ocfs2_malloc_block(fs->fs_io, &di_buf);
+ if (ret)
+ goto out;
+
+ ret = ocfs2_read_inode(fs, dir, di_buf);
+ if (ret)
+ goto out;
+ di = (struct ocfs2_dinode *)di_buf;
+
+ if ((!ocfs2_dir_indexed(di)) ||
+ (di->i_dyn_features & OCFS2_INLINE_DATA_FL))
+ goto out;
+
+ ret = ocfs2_new_dx_root(fs, di, &dr_blkno);
+ if (ret)
+ goto out;
+
+ ret = ocfs2_malloc_block(fs->fs_io, &dx_buf);
+ if (ret)
+ goto out;
+
+ ret = ocfs2_read_dx_root(fs, dr_blkno, dx_buf);
+ if (ret)
+ goto out;
+ dx_root = (struct ocfs2_dx_root_block *)dx_buf;
+
+ ret = ocfs2_init_dir_trailers(fs, di, dx_root);
+ if (ret)
+ goto out;
+
+ dx_root->dr_dir_blkno = di->i_blkno;
+ dx_root->dr_num_entries = 0;
+ dx_root->dr_flags |= OCFS2_DX_FLAG_INLINE;
+ dx_root->dr_entries.de_count = ocfs2_dx_entries_per_root(fs->fs_blocksize);
+
+ di->i_dx_root = dr_blkno;
+ di->i_dyn_features |= OCFS2_INDEXED_DIR_FL;
+
+ ret = ocfs2_write_dx_root(fs, dir, dx_buf);
+ if (ret)
+ goto out;
+ ret = ocfs2_write_inode(fs, dir, di_buf);
+ if (ret)
+ goto out;
+
+ ocfs2_free(&dx_buf);
+ ocfs2_free(&di_buf);
+
+ ctxt.dir_blkno = dir;
+ ctxt.dx_root_blkno = dr_blkno;
+ ctxt.fs = fs;
+ ret = ocfs2_dir_iterate(fs, dir, 0, NULL,
+ ocfs2_dx_dir_insert, &ctxt);
+out:
+ if (di_buf)
+ ocfs2_free(&di_buf);
+ if (dx_buf)
+ ocfs2_free(&dx_buf);
+
+ return ret;
+}
diff --git a/libocfs2/dir_indexed.h b/libocfs2/dir_indexed.h
new file mode 100644
index 0000000..66a1991
--- /dev/null
+++ b/libocfs2/dir_indexed.h
@@ -0,0 +1,25 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dir_indexed.h
+ *
+ * Copyright (C) 2009 Novell. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+struct ocfs2_dir_lookup_result {
+ uint64_t dl_dx_leaf_blkno; /* Indexed leaf block */
+ struct ocfs2_dx_hinfo dl_hinfo; /* Name hash results */
+};
+
+int __ocfs2_is_dir_trailer(ocfs2_filesys *fs, unsigned long de_off);
+int ocfs2_is_dir_trailer(ocfs2_filesys *fs, struct ocfs2_dinode *di,
+ unsigned long de_off);
--
Coly Li
SuSE Labs
More information about the Ocfs2-tools-devel
mailing list