[Ocfs2-tools-devel] [PATCH 07/12] dx_dirs v7: more library support for directory indexing

Tue Feb 2 15:54:23 PST 2010

On Sat, Jan 30, 2010 at 02:18:26AM +0800, Coly Li wrote:
> This patch adds more library support for indexed dirs,
> - dx_root alloc/delete
> - dx_leaf alloc/delete
> - dx_root read/write
> - dx_leaf read/write
> - indexed tree insert/truncate
> - dx_root extent tree operations
> 
> With this patch, indexed dirs support in fsck.ocfs2 is possible.
> 
> Signed-off-by: Coly Li <coly.li at suse.de>
> Cc: Mark Fasheh <mfasheh at suse.com>
> ---
>  include/ocfs2-kernel/ocfs2_fs.h |   31 ++
>  include/ocfs2/ocfs2.h           |   23 +-
>  libocfs2/Makefile               |    3 +-
>  libocfs2/alloc.c                |   95 ++++
>  libocfs2/dir_indexed.c          | 1135 +++++++++++++++++++++++++++++++++++++++
>  libocfs2/dir_indexed.h          |   25 +
>  libocfs2/dir_iterate.c          |   16 +-
>  libocfs2/dir_iterate.h          |    1 +
>  libocfs2/dirblock.c             |  186 ++++++--
>  libocfs2/extent_tree.c          |   53 ++
>  libocfs2/extent_tree.h          |    4 +
>  libocfs2/inode.c                |    3 +
>  libocfs2/link.c                 |    1 +
>  libocfs2/lookup.c               |    1 +
>  libocfs2/ocfs2_err.et           |    3 +
>  libocfs2/truncate.c             |   16 +
>  libocfs2/unlink.c               |    1 +
>  sizetest/sizetest.c             |    2 +-
>  18 files changed, 1558 insertions(+), 41 deletions(-)
>  create mode 100644 libocfs2/dir_indexed.c
>  create mode 100644 libocfs2/dir_indexed.h

> 
> diff --git a/include/ocfs2/ocfs2.h b/include/ocfs2/ocfs2.h
> index b586bf2..1c2c69a 100644
> --- a/include/ocfs2/ocfs2.h
> +++ b/include/ocfs2/ocfs2.h
> @@ -1382,5 +1396,12 @@ errcode_t ocfs2_extent_iterate_xattr(ocfs2_filesys *fs,
>  				     void *priv_data,
>  				     int *changed);
>  errcode_t ocfs2_delete_xattr_block(ocfs2_filesys *fs, uint64_t blkno);
> +errcode_t ocfs2_dir_indexed_tree_truncate(ocfs2_filesys *fs,
> +					struct ocfs2_dx_root_block *dx_root);
> +errcode_t ocfs2_write_dx_root(ocfs2_filesys *fs, uint64_t block, char *buf);
> +int ocfs2_is_dir_trailer(ocfs2_filesys *fs, struct ocfs2_dinode *di, unsigned long de_off);

Duplicate definition, noted below.

> diff --git a/libocfs2/alloc.c b/libocfs2/alloc.c
> index 18c6e0e..4144539 100644
> --- a/libocfs2/alloc.c
> +++ b/libocfs2/alloc.c
> @@ -492,6 +492,100 @@ out:
>  	return ret;
>  }
> 
> +/* only initiate part of dx_root:
> + *   dr_subllaoc_slot
> + *   dr_sbualloc_bit
> + *   dr_fs_generation
> + *   dr_blkno
> + */
> +static void init_dx_root(ocfs2_filesys *fs,
> +			struct ocfs2_dx_root_block *dx_root,
> +			int slot, uint64_t gd_blkno, uint64_t dr_blkno)
> +{
> +
> +	memset(dx_root, 0, fs->fs_blocksize);
> +	strcpy((char *)dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
> +	dx_root->dr_suballoc_slot = slot;
> +	dx_root->dr_suballoc_bit = (uint16_t)(dr_blkno - gd_blkno);
> +	dx_root->dr_fs_generation = fs->fs_super->i_fs_generation;
> +	dx_root->dr_blkno = dr_blkno;

Why do you wait until build_dx_root to initialize dr_flags to
OCFS2_DX_FLAG_INLINE?

<snip alloc / del routines> - they look fine.

> @@ -636,6 +730,7 @@ out:
>  	return ret;
>  }
> 
> +

Added an extra newline here. No big deal, but thought I'd point it out.

>  #ifdef DEBUG_EXE
>  #include <stdio.h>
> 
> diff --git a/libocfs2/dir_indexed.c b/libocfs2/dir_indexed.c
> new file mode 100644
> index 0000000..e7c106d
> --- /dev/null
> +++ b/libocfs2/dir_indexed.c
> @@ -0,0 +1,1135 @@
> +/* -*- mode: c; c-basic-offset: 8; -*-
> + * vim: noexpandtab sw=8 ts=8 sts=0:
> + *
> + * Copyright (C) 2009, 2010 Novell.  All rights reserved.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public
> + * License, version 2,  as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * General Public License for more details.
> + */
> +#include <assert.h>
> +#include <ocfs2/ocfs2.h>
> +#include <ocfs2/bitops.h>
> +#include <ocfs2/kernel-rbtree.h>
> +#include "ocfs2_err.h"
> +#include "extent_tree.h"
> +#include "dir_indexed.h"
> +
> +
> +errcode_t ocfs2_dx_dir_truncate(ocfs2_filesys *fs,
> +			uint64_t dir)
> +{
> +	struct ocfs2_dx_root_block *dx_root;
> +	char *dx_root_buf = NULL, *di_buf = NULL;
> +	struct ocfs2_dinode *di;
> +	errcode_t ret = 0;
> +
> +	ret = ocfs2_malloc_block(fs->fs_io, &di_buf);
> +	if (ret)
> +		goto out;
> +	ret = ocfs2_read_inode(fs, dir, di_buf);
> +	if (ret)
> +		goto out;
> +	di = (struct ocfs2_dinode *)di_buf;
> +

This is a function intended to be called by users of libocfs2, right? We
should probably check S_ISDIR() here too then in case the user passes the
wrong inode # down.

> +	/* we have to trust i_dyn_features */
> +	if ((!ocfs2_dir_indexed(di)) ||
> +	    (di->i_dyn_features & OCFS2_INLINE_DATA_FL))
> +		goto out;
> +
> +	ret = ocfs2_malloc_block(fs->fs_io, &dx_root_buf);
> +	if (ret)
> +		goto out;
> +	ret = ocfs2_read_dx_root(fs, (uint64_t)di->i_dx_root, dx_root_buf);
> +	if (ret)
> +		goto out;
> +	dx_root = (struct ocfs2_dx_root_block *)dx_root_buf;
> +
> +	if (dx_root->dr_flags & OCFS2_DX_FLAG_INLINE)
> +		goto remove_index;
> +
> +	ret = ocfs2_dir_indexed_tree_truncate(fs, dx_root);
> +
> +remove_index:
> +	ret = ocfs2_delete_dx_root(fs, dx_root->dr_blkno);
> +
> +	di->i_dyn_features &= ~OCFS2_INDEXED_DIR_FL;
> +	di->i_dx_root = 0;
> +
> +	ret = ocfs2_write_inode(fs, di->i_blkno, (char *)di);

Return values from ocfs2_dir_indexed_tree_truncate(),
ocfs2_delete_dx_root() and ocfs2_write_inode() are being ignored here. If
there's a good reason, we should have a comment.

We could also re-order the operations so that errors in the tree truncate
and root delete aren't fatal. Remove any notion of the index from the inode
block first and then write it - if you fail the write, we can exit
gracefully. If any of the other steps (truncate, delete_dx_root) fail after
the inode has been updated, we *could* ignore those errors as non-fatal.

> +out:
> +	if (di_buf)
> +		ocfs2_free(&di_buf);
> +	if (dx_root_buf)
> +		ocfs2_free(&dx_root_buf);
> +	return ret;
> +}
> +
> +static unsigned int ocfs2_figure_dirent_hole(struct ocfs2_dir_entry *de)
> +{
> +	unsigned int hole;
> +
> +	if (de->inode == 0)
> +		hole = de->rec_len;
> +	else
> +		hole = de->rec_len - OCFS2_DIR_REC_LEN(de->name_len);
> +
> +	return hole;
> +}
> +
> +static int ocfs2_find_max_rec_len(ocfs2_filesys *fs,
> +				char *buf)
> +{
> +	int size, this_hole, largest_hole = 0;
> +	char *de_buf, *limit;
> +	struct ocfs2_dir_entry *de;
> +
> +	size = ocfs2_dir_trailer_blk_off(fs);
> +	limit = buf + size;
> +	de_buf = buf;
> +	de = (struct ocfs2_dir_entry *)de_buf;
> +	do {
> +		this_hole = ocfs2_figure_dirent_hole(de);
> +		if (this_hole > largest_hole)
> +			largest_hole = this_hole;
> +
> +		de_buf += de->rec_len;
> +		de = (struct ocfs2_dir_entry *)de_buf;
> +	} while (de_buf < limit);
> +
> +	if (largest_hole >= OCFS2_DIR_MIN_REC_LEN)
> +		return largest_hole;
> +	return 0;
> +}
> +
> +struct trailer_ctxt {
> +	struct ocfs2_dx_root_block *dx_root;
> +	struct ocfs2_dinode *di;
> +};
> +
> +
> +static int dir_trailer_func(ocfs2_filesys *fs,
> +				uint64_t blkno,
> +				uint64_t bcount,
> +				uint16_t ext_flags,
> +				void *priv_data)
> +{
> +	struct trailer_ctxt *ctxt = (struct trailer_ctxt *)priv_data;
> +	struct ocfs2_dinode *di = ctxt->di;
> +	struct ocfs2_dx_root_block *dx_root = ctxt->dx_root;
> +	struct ocfs2_dir_block_trailer *trailer;
> +	int max_rec_len = 0;
> +	errcode_t ret = 0;
> +	char *blk = NULL;
> +
> +	ret = ocfs2_malloc_block(fs->fs_io, &blk);
> +	if (ret)
> +		goto out;
> +
> +	/* here we don't trust trailer, cannot use
> +	 * ocfs2_read_dir_block() */
> +	ret = ocfs2_read_blocks(fs, blkno, 1, blk);
> +	if (ret)
> +		goto out;
> +
> +	max_rec_len = ocfs2_find_max_rec_len(fs, blk);
> +	trailer = ocfs2_dir_trailer_from_block(fs, blk);
> +	trailer->db_free_rec_len = max_rec_len;
> +	ocfs2_init_dir_trailer(fs, di, blkno, blk);

Doesn't ocfs2_init_dir_trailer() overwrite the db_free_rec_len set you just
did? Should you move the init a line or two up?

Also, I think we're missing a step in this function - you need to change the
rec_len of the dirent which comes before the trailer. Otherwise it will
still point to the end of the block, which means the trailer might later be
overwritten by a dirent insert.

> +	if (max_rec_len) {
> +		trailer->db_free_next = dx_root->dr_free_blk;
> +		dx_root->dr_free_blk = blkno;
> +	}
> +
> +	/* comput trailer->db_check here, after writes out,
> +	 * trailer is trustable */
> +	ret = ocfs2_write_dir_block(fs, di, blkno, blk);
> +out:
> +	if (blk)
> +		ocfs2_free(&blk);
> +	return ret;
> +}
> +
> +static errcode_t ocfs2_init_dir_trailers(ocfs2_filesys *fs,
> +				struct ocfs2_dinode *di,
> +				struct ocfs2_dx_root_block *dx_root)
> +{
> +	errcode_t ret = 0;
> +	struct trailer_ctxt ctxt;
> +
> +	if (di->i_dyn_features & OCFS2_INLINE_DATA_FL) {
> +		ret = OCFS2_ET_INODE_NOT_VALID;
> +		goto out;
> +	}
> +
> +	ctxt.di = di;
> +	ctxt.dx_root = dx_root;
> +
> +	ret = ocfs2_block_iterate_inode(fs, di,
> +			0, dir_trailer_func, &ctxt);
> +out:
> +	return ret;
> +}
> +
> +static void ocfs2_dx_entry_list_insert(struct ocfs2_dx_entry_list *entry_list,
> +					struct ocfs2_dx_hinfo *hinfo,
> +					uint64_t dirent_blk)
> +{
> +	int i;
> +	struct ocfs2_dx_entry *dx_entry;
> +
> +	i = entry_list->de_num_used;
> +	dx_entry = &entry_list->de_entries[i];
> +
> +	memset(dx_entry, 0, sizeof(struct ocfs2_dx_entry));
> +	dx_entry->dx_major_hash = hinfo->major_hash;
> +	dx_entry->dx_minor_hash = hinfo->minor_hash;
> +	dx_entry->dx_dirent_blk = dirent_blk;
> +
> +	entry_list->de_num_used += 1;
> +}
> +
> +struct dx_insert_ctxt {
> +	uint64_t dir_blkno;
> +	uint64_t dx_root_blkno;
> +	ocfs2_filesys *fs;
> +};
> +
> +
> +inline static int ocfs2_inline_dx_has_space(struct ocfs2_dx_root_block *dx_root)
> +{
> +	struct ocfs2_dx_entry_list *entry_list;
> +
> +	entry_list = &dx_root->dr_entries;
> +
> +	if (entry_list->de_num_used >= entry_list->de_count)
> +		return 0;
> +
> +	return 1;
> +}
> +
> +static struct ocfs2_dx_leaf **ocfs2_dx_dir_alloc_leaves(ocfs2_filesys *fs,
> +					int *ret_num_leaves)
> +{
> +	errcode_t num_dx_leaves = ocfs2_clusters_to_blocks(fs, 1);
> +	char **dx_leaves_buf = NULL;
> +
> +	dx_leaves_buf = calloc(num_dx_leaves, sizeof (void *));
> +	if (dx_leaves_buf && ret_num_leaves)
> +		*ret_num_leaves = num_dx_leaves;
> +
> +	return (struct ocfs2_dx_leaf **)dx_leaves_buf;
> +}
> +
> +static errcode_t ocfs2_dx_dir_format_cluster(ocfs2_filesys *fs,
> +				struct ocfs2_dx_leaf  **dx_leaves,
> +				int num_dx_leaves,
> +				uint64_t start_blk)
> +{
> +	errcode_t ret;
> +	int i;
> +	struct ocfs2_dx_leaf *dx_leaf;
> +	char *blk;
> +
> +	for (i = 0; i < num_dx_leaves; i++) {
> +		ret = ocfs2_malloc_block(fs->fs_io, &blk);
> +		if (ret)
> +			goto out;
> +
> +		dx_leaves[i] = (struct ocfs2_dx_leaf *)blk;
> +		dx_leaf = (struct ocfs2_dx_leaf *)blk;
> +
> +		memset(dx_leaf, 0, fs->fs_blocksize);
> +		strcpy((char *)dx_leaf->dl_signature, OCFS2_DX_LEAF_SIGNATURE);
> +		dx_leaf->dl_fs_generation = fs->fs_super->i_fs_generation;
> +		dx_leaf->dl_blkno = start_blk + i;
> +		dx_leaf->dl_list.de_count = ocfs2_dx_entries_per_leaf(fs->fs_blocksize);
> +
> +		ret = ocfs2_write_dx_leaf(fs, dx_leaf->dl_blkno, dx_leaf);
> +		if (ret)
> +			goto out;
> +	}
> +	ret = 0;
> +out:
> +	return ret;
> +}
> +
> +static inline unsigned int __ocfs2_dx_dir_hash_idx(ocfs2_filesys *fs,
> +						uint32_t minor_hash)
> +{
> +	unsigned int cbits, bbits, dx_mask;
> +
> +	cbits = OCFS2_RAW_SB(fs->fs_super)->s_clustersize_bits;
> +	bbits = OCFS2_RAW_SB(fs->fs_super)->s_blocksize_bits;
> +	dx_mask = (1 << (cbits - bbits)) -1;
> +
> +	return (minor_hash & dx_mask);
> +}
> +
> +static inline unsigned int ocfs2_dx_dir_hash_idx(ocfs2_filesys *fs,
> +					struct ocfs2_dx_hinfo *hinfo)
> +{
> +	return __ocfs2_dx_dir_hash_idx(fs, hinfo->minor_hash);
> +}
> +
> +static void ocfs2_dx_dir_leaf_insert_tail(struct ocfs2_dx_leaf *dx_leaf,
> +				struct ocfs2_dx_entry *dx_new_entry)
> +{
> +	int i;
> +
> +	i = dx_leaf->dl_list.de_num_used;
> +	dx_leaf->dl_list.de_entries[i] = *dx_new_entry;
> +
> +	dx_leaf->dl_list.de_num_used += 1;
> +}
> +
> +/* XXX should add bail part */

What does this comment mean?

> +static errcode_t ocfs2_expand_inline_dx_root(ocfs2_filesys *fs,
> +					struct ocfs2_dx_root_block *dx_root)
> +{
> +	errcode_t ret;
> +	int num_dx_leaves, i, j;
> +	uint64_t start_blkno = 0;
> +	uint32_t clusters_found = 0;
> +	struct ocfs2_dx_leaf **dx_leaves = NULL;
> +	struct ocfs2_dx_leaf *target_leaf;
> +	struct ocfs2_dx_entry_list *entry_list;
> +	struct ocfs2_extent_tree et;
> +	struct ocfs2_dx_entry *dx_entry;
> +
> +	dx_leaves = ocfs2_dx_dir_alloc_leaves(fs, &num_dx_leaves);
> +	if (!dx_leaves) {
> +		ret = OCFS2_ET_NO_MEMORY;
> +		goto out;
> +	}
> +
> +	ret = ocfs2_new_clusters(fs, 1, 1, &start_blkno, &clusters_found);
> +	if (ret)
> +		goto out;
> +	assert(clusters_found == 1);
> +	ret = ocfs2_dx_dir_format_cluster(fs, dx_leaves,
> +				num_dx_leaves, start_blkno);
> +	if (ret)
> +		goto out;
> +	/* Transfer the entries from inline dx_root into the appropriate
> +	 * block
> +	 */
> +	entry_list = &dx_root->dr_entries;
> +
> +	for (i = 0; i < entry_list->de_num_used; i++) {
> +		dx_entry = &entry_list->de_entries[i];
> +		j = __ocfs2_dx_dir_hash_idx(fs, dx_entry->dx_minor_hash);
> +		target_leaf = (struct ocfs2_dx_leaf *)dx_leaves[j];
> +
> +		ocfs2_dx_dir_leaf_insert_tail(target_leaf, dx_entry);
> +		ret = ocfs2_write_dx_leaf(fs, target_leaf->dl_blkno,
> +					target_leaf);
> +		if (ret)
> +			goto out;

This operation would be faster if we moved the writes out of this look.
Otherwise, we're doing a disk I/O for every entry...

> +	}
> +
> +	dx_root->dr_flags &= ~OCFS2_DX_FLAG_INLINE;
> +	memset(&dx_root->dr_list, 0, fs->fs_blocksize -
> +		offsetof(struct ocfs2_dx_root_block, dr_list));
> +	dx_root->dr_list.l_count =
> +		ocfs2_extent_recs_per_dx_root(fs->fs_blocksize);
> +
> +	/* This should never fail considering we start with an empty
> +	 * dx_root */
> +	ocfs2_init_dx_root_extent_tree(&et, fs, (char *)dx_root, dx_root->dr_blkno);
> +	ret = ocfs2_tree_insert_extent(fs, &et, 0, start_blkno, 1, 0);
> +	if (ret)
> +		goto out;
> +
> +out:
> +	return ret;
> +}
> +

Rest of the patch looks reasonable to me. One comment - I don't see where
the new dx functions (like insert entry, remove entry, etc) are being
plugged into the user-exported api for doing these things. Specifically, I'm
thinking ocfs2_link(), ocfs2_unlink(), ocfs2_init_dir(), ocfs2_lookup(),
ocfs2_lookup_system_inode(), etc. Basically, if the fs has indexed
directories, I want the existing tools to automatically use them without
much (if any) modification.

Generally, this probably just means that you add code to check whether the
directory is indexed and call the dx_* function if need be.
	--Mark

--
Mark Fasheh