[Ocfs2-devel] [PATCH 3/4] ocfs2: Add a name indexed b-tree to directory inodes
Joel Becker
Joel.Becker at oracle.com
Wed Nov 12 19:28:19 PST 2008
On Wed, Nov 12, 2008 at 06:24:07PM -0800, Mark Fasheh wrote:
> This patch makes use of Ocfs2's flexible btree code to add an additional
> tree to directory inodes. The new tree stores an array of small,
> fixed-length records in each leaf block. Each record stores a hash value,
> and pointer to a block in the traditional (unindexed) directory tree where a
> dirent with the given name hash resides. Lookup exclusively uses this tree
> to find dirents, thus providing us with constant time name lookups.
Whee!
> +static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
> + .eo_set_last_eb_blk = ocfs2_dx_root_set_last_eb_blk,
> + .eo_get_last_eb_blk = ocfs2_dx_root_get_last_eb_blk,
> + .eo_update_clusters = ocfs2_dx_root_update_clusters,
> + .eo_sanity_check = ocfs2_dx_root_sanity_check,
> + .eo_fill_root_el = ocfs2_dx_root_fill_root_el,
> +};
This makes me happy.
> +static int ocfs2_dx_dir_lookup(struct inode *inode,
> + struct ocfs2_extent_list *el,
> + struct ocfs2_dx_hinfo *hinfo,
> + u32 *ret_cpos,
> + u64 *ret_phys_blkno)
> +{
> + int ret = 0;
> + unsigned int cend, uninitialized_var(clen);
> + u32 uninitialized_var(cpos);
> + u64 uninitialized_var(blkno);
> + u32 name_hash = hinfo->major_hash;
> +
> + ret = ocfs2_dx_dir_lookup_rec(inode, el, name_hash, &cpos, &blkno,
> + &clen);
> + if (ret) {
> + mlog_errno(ret);
> + goto out;
> + }
> +
> + cend = cpos + clen;
> + if (name_hash >= cend) {
> + /* We want the last cluster */
> + blkno += ocfs2_clusters_to_blocks(inode->i_sb, clen - 1);
> + cpos += clen - 1;
I assume this means that we have an "anything at the end"
behavior going on here? Perhaps an explicit comment on that.
> +static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir,
> + struct ocfs2_dir_lookup_result *lookup)
> +{
> + int ret, index;
> + struct buffer_head *leaf_bh = lookup->dl_leaf_bh;
> + struct ocfs2_dx_leaf *dx_leaf;
> + struct ocfs2_dx_entry *dx_entry = lookup->dl_dx_entry;
> +
> + dx_leaf = (struct ocfs2_dx_leaf *) lookup->dl_dx_leaf_bh->b_data;
> + /* Neither of these are a disk corruption - that should have
> + * been caught by lookup, before we got here. */
> + BUG_ON(le16_to_cpu(dx_leaf->dl_count) <= 0);
> + BUG_ON(le16_to_cpu(dx_leaf->dl_num_used) <= 0);
> +
> + index = (char *)dx_entry - (char *)dx_leaf->dl_entries;
> + index /= sizeof(*dx_entry);
> +
> + if (index >= le16_to_cpu(dx_leaf->dl_num_used)) {
> + mlog(ML_ERROR, "Dir %llu: Bad dx_entry ptr idx %d, (%p, %p)\n",
> + (unsigned long long)OCFS2_I(dir)->ip_blkno, index, dx_leaf,
> + dx_entry);
> + return -EIO;
> + }
> +
> + mlog(0, "Dir %llu: delete entry at index: %d\n",
> + (unsigned long long)OCFS2_I(dir)->ip_blkno, index);
> +
> + /*
> + * Add the index leaf into the journal before removing the
> + * unindexed entry. If we get an error return from
> + * __ocfs2_delete_entry(), then it hasn't removed the entry
> + * yet. Likewise, successful return means we *must* remove the
> + * indexed entry.
> + */
> + ret = ocfs2_journal_access(handle, dir, lookup->dl_dx_leaf_bh,
> + OCFS2_JOURNAL_ACCESS_WRITE);
> + if (ret) {
> + mlog_errno(ret);
> + goto out;
> + }
> +
> + ret = __ocfs2_delete_entry(handle, dir, lookup->dl_entry,
> + leaf_bh, leaf_bh->b_data, leaf_bh->b_size);
> + if (ret) {
> + mlog_errno(ret);
> + goto out;
> + }
> +
> + ocfs2_dx_leaf_remove_entry(dx_leaf, index);
> +
> + ocfs2_journal_dirty(handle, lookup->dl_dx_leaf_bh);
> +
> +out:
> + return ret;
> +}
I love using indexed lookup for unlink(2).
> static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
> unsigned int blocks_wanted,
> + struct ocfs2_dir_lookup_result *lookup,
> struct buffer_head **first_block_bh)
> {
<snip>
> + if (ocfs2_supports_indexed_dirs(osb)) {
The multiple places you have to stick this check in for proper
ordering is fun. I understand why, though.
> /*
> + * A directory indexing block. Each indexed directory has one of these,
> + * pointed to by ocfs2_dinode.
> + *
> + * This block stores an indexed btree root, and a set of free space
> + * start-of-list pointers.
> + */
> +struct ocfs2_dx_root_block {
> + __u8 dr_signature[8]; /* Signature for verification */
> + __le64 dr_csum; /* Checksum (unused) */
> + __le16 dr_suballoc_slot; /* Slot suballocator this
> + * block belongs to. */
> + __le16 dr_suballoc_bit; /* Bit offset in suballocator
> + * block group */
> + __le32 dr_fs_generation; /* Must match super block */
> + __le64 dr_blkno; /* Offset on disk, in blocks */
> + __le64 dr_last_eb_blk; /* Pointer to last
> + * extent block */
> + __le32 dr_clusters; /* Clusters allocated
> + * to the indexed tree. */
> + __le32 dr_reserved1;
> + __le64 dr_dir_blkno; /* Pointer to parent inode */
> + __le64 dr_reserved2;
> + __le64 dr_reserved3[16];
> + struct ocfs2_extent_list dr_list; /* Keep this aligned to 128
> + * bits for maximum space
> + * efficiency. */
> +};
> +
> +/*
> + * A directory entry in the indexed tree. We don't store the full name here,
> + * but instead provide a pointer to the full dirent in the unindexed tree.
> + *
> + * We also store name_len here so as to reduce the number of leaf blocks we
> + * need to search in case of collisions.
> + */
> +struct ocfs2_dx_entry {
> + __le32 dx_major_hash; /* Used to find logical
> + * cluster in index */
> + __le32 dx_minor_hash; /* Lower bits used to find
> + * block in cluster */
> + __le64 dx_dirent_blk; /* Physical block in unindexed
> + * tree holding this dirent. */
> +};
> +
> +/*
> + * The header of a leaf block in the indexed tree.
> + */
> +struct ocfs2_dx_leaf {
> + __u8 dl_signature[8];/* Signature for verification */
> + __le64 dl_csum; /* Checksum (unused) */
> + __le64 dl_reserved1;
> + __le32 dl_reserved2;
> + __le16 dl_count; /* Maximum number of entries
> + * possible in ih_entries */
> + __le16 dl_num_used; /* Current number of
> + * ih_entries entries */
> + struct ocfs2_dx_entry dl_entries[0]; /* Indexed dir entries
> + * in a packed array of
> + * length ih_num_used */
> +};
I'd love to see offset annotations on these structures.
Otherwise, this is awesome!
Joel
--
"The first requisite of a good citizen in this republic of ours
is that he shall be able and willing to pull his weight."
- Theodore Roosevelt
Joel Becker
Principal Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127
More information about the Ocfs2-devel
mailing list