[Ocfs2-tools-devel] [RFC 2/8] dx_dirs v4: indexed dirs core code in libocfs2

Fri Jan 22 00:21:45 PST 2010

Coly Li wrote:
> This patch contains the core part of current indexed dirs support in libocfs2, includes,
> - Indexed tree truncate.
> - Build indexed tree.
> - Insert dx record into indexed tree.
> - Expand inlined dx_root to an extent tree.
> - Rebalance indexed tree.
> 
> I worry about whether I use the extent tree code correctly, though I tried my best to understand the code, IMHO it's
> still a gray area to me so far. Any review/comment is helpful !
> 
> Signed-off-by: Coly Li <coly.li at suse.de>
> Cc: Tao Ma <tao.ma at oracle.com>
> ---
>  Makefile      |    3
>  dir_indexed.c | 1117 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  dir_indexed.h |   25 +
>  inode.c       |    4
>  4 files changed, 1148 insertions(+), 1 deletion(-)
> 
> diff --git a/libocfs2/Makefile b/libocfs2/Makefile
> index 8e94cc3..6d3b563 100644
> --- a/libocfs2/Makefile
> +++ b/libocfs2/Makefile
> @@ -76,7 +76,8 @@ CFILES = 		\
>  	quota.c		\
>  	image.c		\
>  	xattr.c		\
> -	extent_tree.c
> +	extent_tree.c	\
> +	dir_indexed.c
> 
>  HFILES =		\
>  	bitmap.h	\
> diff --git a/libocfs2/inode.c b/libocfs2/inode.c
> index c1a8646..b41b286 100644
> --- a/libocfs2/inode.c
> +++ b/libocfs2/inode.c
> @@ -139,6 +139,10 @@ static void ocfs2_swap_inode_second(struct ocfs2_dinode *di)
>  		sb->s_uuid_hash           = bswap_32(sb->s_uuid_hash);
>  		sb->s_first_cluster_group = bswap_64(sb->s_first_cluster_group);
>  		sb->s_xattr_inline_size   = bswap_16(sb->s_xattr_inline_size);
> +		sb->s_dx_seed[0]          = bswap_32(sb->s_dx_seed[0]);
> +		sb->s_dx_seed[1]          = bswap_32(sb->s_dx_seed[1]);
> +		sb->s_dx_seed[2]          = bswap_32(sb->s_dx_seed[2]);
> +		sb->s_dx_seed[3]          = bswap_32(sb->s_dx_seed[3]);
I just noticed that in ocfs2_super_block we define
	__le32 s_dx_seed[3];
So swap s_dx_seed[3] is wrong?
> 
>  	} else if (di->i_flags & OCFS2_LOCAL_ALLOC_FL) {
>  		struct ocfs2_local_alloc *la = &di->id2.i_lab;
> 
> diff --git a/libocfs2/dir_indexed.c b/libocfs2/dir_indexed.c
> new file mode 100644
> index 0000000..2d1d602
> --- /dev/null
> +++ b/libocfs2/dir_indexed.c
> @@ -0,0 +1,1117 @@
> +/* -*- mode: c; c-basic-offset: 8; -*-
> + * vim: noexpandtab sw=8 ts=8 sts=0:
> + *
> + * Copyright (C) 2009 Novell.  All rights reserved.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public
> + * License, version 2,  as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public
> + * License along with this program; if not, write to the
> + * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
> + * Boston, MA 021110-1307, USA.
Please use the right information.
See http://oss.oracle.com/pipermail/ocfs2-devel/2010-January/005711.html.
> + */
> +#include <assert.h>
> +#include <ocfs2/ocfs2.h>
> +#include <ocfs2/bitops.h>
> +#include <ocfs2/kernel-rbtree.h>
> +#include "extent_tree.h"
> +#include "dir_indexed.h"
> +
> +
> +int ocfs2_dx_dir_truncate(ocfs2_filesys *fs,
> +			uint64_t dir)
> +{
> +	struct ocfs2_dx_root_block *dx_root;
> +	char *dx_root_buf = NULL, *di_buf = NULL;
> +	struct ocfs2_dinode *di;
> +	int ret = 0;
> +
> +	ret = ocfs2_malloc_block(fs->fs_io, &di_buf);
> +	if (ret)
> +		goto out;
> +	ret = ocfs2_read_inode(fs, dir, di_buf);
> +	if (ret)
> +		goto out;
> +	di = (struct ocfs2_dinode *)di_buf;
> +
> +	/* we have to trust i_dyn_features */
> +	if ((!ocfs2_dir_indexed(di)) ||
> +	    (di->i_dyn_features & OCFS2_INLINE_DATA_FL))
> +		goto out;
Do we need to check S_ISDIR or only dir inode can call this function?
> +
> +	ret = ocfs2_malloc_block(fs->fs_io, &dx_root_buf);
> +	if (ret)
> +		goto out;
> +	ret = ocfs2_read_dx_root(fs, (uint64_t)di->i_dx_root, dx_root_buf);
> +	if (ret)
> +		goto out;
> +	dx_root = (struct ocfs2_dx_root_block *)dx_root_buf;
> +	
> +	if (dx_root->dr_flags & OCFS2_DX_FLAG_INLINE)
> +		goto remove_index;
> +
> +	ret = ocfs2_dir_indexed_tree_truncate(fs, dx_root);
> +
> +remove_index:
> +	ret = ocfs2_delete_dx_root(fs, dx_root->dr_blkno);
> +
> +	di->i_dyn_features &= ~OCFS2_INDEXED_DIR_FL;
> +	di->i_dx_root = 0;
> +
> +	ret = ocfs2_write_inode(fs, di->i_blkno, (char *)di);
> +out:
> +	if (di_buf)
> +		ocfs2_free(&di_buf);
> +	if (dx_root_buf)
> +		ocfs2_free(&dx_root_buf);
> +	return ret;
> +}
> +
> +static unsigned int ocfs2_figure_dirent_hole(struct ocfs2_dir_entry *de)
> +{
> +	unsigned int hole;
> +
> +	if (de->inode == 0)
> +		hole = de->rec_len;
> +	else
> +		hole = de->rec_len - OCFS2_DIR_REC_LEN(de->name_len);
> +
> +	return hole;
> +}
> +
> +static int ocfs2_find_max_rec_len(ocfs2_filesys *fs,
> +				char *buf)
> +{
> +	int size, this_hole, largest_hole = 0;
> +	char *de_buf, *limit, *start;
> +	struct ocfs2_dir_block_trailer *trailer;
> +	struct ocfs2_dir_entry *de;
> +
> +	start = buf;
> +	trailer = ocfs2_dir_trailer_from_block(fs, buf);
> +	size = ocfs2_dir_trailer_blk_off(fs);
> +	limit = start + size;
> +	de_buf = start;
> +	de = (struct ocfs2_dir_entry *)de_buf;
> +	do {
> +		if (de_buf != (char *)trailer) {
does this check redundant? if de_buf== trailer, we should already exit 
since de_buf = limit? btw, "start" is set to "buf" and never changed, I 
guess we can remove it.
> +			this_hole = ocfs2_figure_dirent_hole(de);
> +			if (this_hole > largest_hole)
> +				largest_hole = this_hole;
> +		}
> +
> +		de_buf += de->rec_len;
> +		de = (struct ocfs2_dir_entry *)de_buf;
> +	} while (de_buf < limit);
> +
> +	if (largest_hole >= OCFS2_DIR_MIN_REC_LEN)
> +		return largest_hole;
> +	return 0;
> +}
> +
> +struct trailer_ctxt {
> +	struct ocfs2_dx_root_block *dx_root;
> +	struct ocfs2_dinode *di;
> +};
> +
> +
> +static int dir_trailer_func(ocfs2_filesys *fs,
> +				uint64_t blkno,
> +				uint64_t bcount,
> +				uint16_t ext_flags,
> +				void *priv_data)
> +{
> +	struct trailer_ctxt *ctxt = (struct trailer_ctxt *)priv_data;
> +	struct ocfs2_dinode *di = ctxt->di;
> +	struct ocfs2_dx_root_block *dx_root = ctxt->dx_root;
> +	struct ocfs2_dir_block_trailer *trailer;
> +	int max_rec_len = 0;
> +	errcode_t ret = 0;
> +	char *blk = NULL;
> +
> +	ret = ocfs2_malloc_block(fs->fs_io, &blk);
> +	if (ret)
> +		goto out;
> +
> +	/* here we don't trust trailer, cannot use
> +	 * ocfs2_read_dir_block() */
> +	ret = ocfs2_read_blocks(fs, blkno, 1, blk);
> +	if (ret)
> +		goto out;
> +
> +	max_rec_len = ocfs2_find_max_rec_len(fs, blk);
> +	trailer = ocfs2_dir_trailer_from_block(fs, blk);
> +	trailer->db_free_rec_len = max_rec_len;
> +	ocfs2_init_dir_trailer(fs, di, blkno, blk);
sorry for a silly qs. How could you make sure the dir_block has enough 
space for adding a trailer? What if the block is already full?
> +
> +	if (max_rec_len) {
> +		trailer->db_free_next = dx_root->dr_free_blk;
> +		dx_root->dr_free_blk = blkno;
> +	}
> +
> +	/* comput trailer->db_check here, after writes out,
> +	 * trailer is trustable */
> +	ret = ocfs2_write_dir_block(fs, di, blkno, blk);
> +out:
> +	if (blk)
> +		ocfs2_free(&blk);
> +	return ret;
> +}
> +
> +static errcode_t ocfs2_init_dir_trailers(ocfs2_filesys *fs,
> +				struct ocfs2_dinode *di,
> +				struct ocfs2_dx_root_block *dx_root)
> +{
> +	errcode_t ret = 0;
> +	struct trailer_ctxt ctxt;
> +
> +	if (di->i_dyn_features & OCFS2_INLINE_DATA_FL) {
> +		ret = OCFS2_ET_INODE_NOT_VALID;
> +		goto out;
why not return 0 here since actually there is no error?
> +	}
> +
> +	ctxt.di = di;
> +	ctxt.dx_root = dx_root;
> +
> +	ret = ocfs2_block_iterate_inode(fs, di,
> +			0, dir_trailer_func, &ctxt);
> +out:
> +	return ret;
> +}
> +
> +static void ocfs2_dx_entry_list_insert(struct ocfs2_dx_entry_list *entry_list,
> +					struct ocfs2_dx_hinfo *hinfo,
> +					uint64_t dirent_blk)
> +{
> +	int i;
> +	struct ocfs2_dx_entry *dx_entry;
> +
> +	i = entry_list->de_num_used;
> +	dx_entry = &entry_list->de_entries[i];
> +
> +	memset(dx_entry, 0, sizeof(struct ocfs2_dx_entry));
> +	dx_entry->dx_major_hash = hinfo->major_hash;
> +	dx_entry->dx_minor_hash = hinfo->minor_hash;
> +	dx_entry->dx_dirent_blk = dirent_blk;
> +
> +	entry_list->de_num_used += 1;
> +}
> +
> +struct dx_insert_ctxt {
> +	uint64_t dir_blkno;
> +	uint64_t dx_root_blkno;
> +	ocfs2_filesys *fs;
> +};
> +
> +
> +inline static int ocfs2_inline_dx_has_space(struct ocfs2_dx_root_block *dx_root)
> +{
> +	struct ocfs2_dx_entry_list *entry_list;
> +
> +	entry_list = &dx_root->dr_entries;
> +
> +	if (entry_list->de_num_used >= entry_list->de_count)
> +		return OCFS2_ET_DIR_NO_SPACE;
hey, the below you check ocfs2_inline_dx_has_space with 0, here neither 
of the 2 returns have the value 0.
> +
> +	return 1;
> +}
> +
> +static struct ocfs2_dx_leaf **ocfs2_dx_dir_alloc_leaves(ocfs2_filesys *fs,
> +					int *ret_num_leaves)
> +{
> +	int ret, num_dx_leaves = ocfs2_clusters_to_blocks(fs, 1);
> +	char *dx_leaves_buf = NULL;
> +
> +
> +	ret = ocfs2_malloc0(num_dx_leaves * sizeof(void *),  &dx_leaves_buf);
> +	if (dx_leaves_buf && ret_num_leaves)
> +		*ret_num_leaves = num_dx_leaves;
> +
> +	return (struct ocfs2_dx_leaf **)dx_leaves_buf;
&dx_leaves_buf?

I just wonder whether you have written a test case for your patch? These 
errors in the patch should be pretty easy to find out.

Regards,
Tao