[Ocfs2-tools-devel] [PATCH 09/11] ocfs2-tools: add xattr support in fsck.ocfs2

Tue Mar 3 14:49:56 PST 2009

On Fri, Feb 27, 2009 at 03:54:49PM +0800, Tiger Yang wrote:
> This patch will check all extended attributes related with each inode,
> It will check and try to fix xattr structures in inode, block and bucket,
> extent tree in index block and xattr's value tree, mark all xattr
> block/cluster as used.

	This is much better.  I've copied Tao as well because there are
a couple of questions I have about our xattr scheme.

> +enum xattr_location {
> +	IN_INODE = 0,
> +	IN_BLOCK,
> +	IN_BUCKET
> +};
> +
> +static const char *xattr_object[] = {
> +	"inode",
> +	"block",
> +	"bucket",
> +};
> +
> +struct xattr_info {
> +	uint16_t location;

This should be 'enum xattr_location location'.  The compiler can then
check that only valid enum values are used, and the debugger can print
the enum values as strings.

> +static uint16_t detect_xattr_count_in_bucket(struct ocfs2_xattr_header *xh,
> +					     struct xattr_info *xi)
> +{
> +	uint16_t count = 0;
> +	struct ocfs2_xattr_entry *entry = xh->xh_entries;
> +	struct ocfs2_xattr_entry *pre_xe = entry;
> +	uint16_t max_count = (xi->max_offset - HEADER_SIZE) / ENTRY_SIZE;
> +
> +	if (xh->xh_count == 0 && xh->xh_free_start == OCFS2_XATTR_BUCKET_SIZE)
> +		return 0;
> +
> +	while (!IS_LAST_ENTRY(entry)) {
> +		/*
> +		 * If we could not get the reasonable count,
> +		 * we will remove them by setting count to zero
> +		 */
> +		if (count >= max_count)
> +			return 0;
> +		/* xattr entries in bucket had sorted by name_hash */
> +		if (entry->xe_name_hash < pre_xe->xe_name_hash)
> +			return count;
> +		count++;
> +		pre_xe = entry;
> +		entry++;
> +	}
> +
> +	return count;
> +}

	Boy, I really want this to use the pre_xe->xe_name_offset to
verify that the entry hasn't walked past the free_start.  There is no
guarantee that the start of a name value won't be greater than the
name_hash of the last valid entry.  This would give a false extra entry.
	This is why I wish we had a consistent free_start in all types
of xattr structures, but it's too late for that.  I realize you didn't
use the xe_name_offset fields here because you haven't validated them
yet.  So on to that part.

> +static int check_xattr_count(o2fsck_state *ost,
> +			     struct ocfs2_dinode *di,
> +			     struct ocfs2_xattr_header *xh,
> +			     int *changed,
> +			     struct xattr_info *xi)
> +{
> +	uint16_t count = 0;
> +	uint16_t max_count = (xi->max_offset - HEADER_SIZE) / ENTRY_SIZE;
> +	struct ocfs2_xattr_entry *entry = xh->xh_entries;
> +
> +	/*
> +	 * The last entry's name_offset is not always the
> +	 * minimum offset of all entry in inode, block or bucket.
> +	 * And xattr entries in bucket may not have GAP between
> +	 * entry and name/value, we have to use different way
> +	 * to detect the count of the entries.
> +	 */
> +	if (xi->location == IN_BUCKET)
> +		count = detect_xattr_count_in_bucket(xh, xi);
> +	else {
> +		/*
> +		 * The gap between entry and name/value is the only way
> +		 * for us to know the total count of the entries when
> +		 * them in inode, block.
> +		 */
> +		while (!IS_LAST_ENTRY(entry)) {
> +			if (count >= max_count) {
> +				/*
> +				 * If we could not get the reasonable count,
> +				 * we will remove them by setting count to zero
> +				 */
> +				count = 0;
> +				break;
> +			}
> +			count++;
> +			entry++;
> +		}
> +	}

	And here, the gap could be corrupt, so we have a similar problem
to the bucket count code.

> +
> +	if (xh->xh_count != count) {
> +		if (prompt(ost, PY, PR_XATTR_COUNT_INVALID,
> +			   "Extended attributes in %s #%"PRIu64" claims to"
> +			   " have %u entries, but fsck believes it is %u,"
> +			   " Fix the entries count?",
> +			   xattr_object[xi->location], xi->blkno,
> +			   xh->xh_count, count)) {
> +			xh->xh_count = count;
> +			if (!count && xi->location == IN_BUCKET) {
> +				xh->xh_free_start = OCFS2_XATTR_BUCKET_SIZE;
> +				xh->xh_name_value_len = 0;
> +			}
> +			*changed = 1;
> +		} else
> +			return -1;
> +	}
> +
> +	return 0;
> +}

	I have an idea.  We can't use the offsets to check the count,
because they might be invalid.  But we can't use the count to check the
offsets, because it might be invalid.  Tiger, you chose to check the
count first without the offsets, and then use that count to validate the
offsets.  How about we do this in two passes.
	First, change the above code to merely check a *maximum* count.
That is, instead of returning 0 when count >= max_count, you just return
max_count.  Then instead of comparing "if (xh->xh_cout != count)", you
compare "if (xh->xh_count > count)".  if it happens, you set xh_count to
count, which will be an upper bound.  That is, as long as xh_count is
less than or equal to the detected count, we can trust xh_count.  Call
this check_xattr_count_bounds().
	Next, you use this updated xh_count to check the entries.  This
is safe, because your xh_count is either the same as the detected count
or less.
	After you've checked all the entries, you can use the offsets to
re-walk the entries and detect if an entry crosses over with the
name-value section.  Call that check_xattr_count_strict().  You can then
shrink xh_count if there is a problem.
	Finally, you can check the values.

> +static errcode_t check_xattr_entry(o2fsck_state *ost,
> +				   struct ocfs2_dinode *di,
> +				   struct ocfs2_xattr_header *xh,
> +				   int *changed,
> +				   struct xattr_info *xi)
> +{
> +	int i;
> +
> +	for (i = 0 ; i < xh->xh_count; i++) {
> +		struct ocfs2_xattr_entry *xe = &xh->xh_entries[i];
> +		uint32_t hash;
> +
> +		/* check and fix name_offset */
> +		if (xe->xe_name_offset >= xi->max_offset) {
> +			if (prompt(ost, PY, PR_XATTR_OFFSET_INVALID,
> +				   "Extended attribute entry in %s #%"PRIu64
> +				   " refers to an invalid name offset %u,"
> +				   " clear this entry?",
> +				   xattr_object[xi->location], xi->blkno,
> +				   xe->xe_name_offset)) {
> +				/*
> +				 * remove the entry and left the name/value,
> +				 * because we don't know the correct offset
> +				 * of them.
> +				 */
> +				memmove((void *)xe, (void *)(xe + 1),
> +					(xh->xh_count - i - 1) * ENTRY_SIZE);
> +				memset(&xh->xh_entries[xh->xh_count - 1], 0,
> +					ENTRY_SIZE);
> +				xh->xh_count -= 1;
> +				i -= 1;
> +				*changed = 1;
> +				continue;
> +			} else
> +				return -1;
> +		}

	When you find invalid entries, you leave them in place.  This
means that name_value_len and free_start get out of sync, because
name_value_len only covers the good values, but free_start covers all
values.  I think the kernel handles this ok because it never compares
free_start and name_value_len, but what happens if someone doesn't
realize this and changes it?
	I suppose we all just watch out for that in future patches.

> +static errcode_t ocfs2_check_xattr_buckets(o2fsck_state *ost,
> +					   struct ocfs2_dinode *di,
> +					   uint64_t blkno,
> +					   uint32_t clusters)
> +{
> +	int i;
> +	errcode_t ret = 0;
> +	char *bucket = NULL;
> +	char *bucket_buf = NULL;
> +	struct ocfs2_xattr_header *xh;
> +	int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(ost->ost_fs);
> +	uint32_t bpc = ocfs2_xattr_buckets_per_cluster(ost->ost_fs);
> +	uint32_t max_buckets = clusters * bpc;
> +	uint32_t max_blocks = max_buckets * blk_per_bucket;
> +	uint32_t num_buckets = 0;
> +	uint64_t blk = 0;
> +
> +	/* malloc space for all buckets */
> +	ret = ocfs2_malloc_blocks(ost->ost_fs->fs_io, max_blocks, &bucket);
> +	if (ret) {
> +		com_err(whoami, ret, "while allocating room to read"
> +			" extended attributes bucket");
> +		goto out;
> +	}
> +
> +	/* read all buckets for detect (some of them may not be used) */
> +	bucket_buf = bucket;
> +	blk = blkno;
> +	for (i = 0; i < max_buckets; i++) {
> +		ret = ocfs2_read_xattr_bucket(ost->ost_fs, blk, bucket_buf);
> +		if (ret) {
> +			com_err(whoami, ret, "while reading bucket of"
> +				" extended attributes ");
> +			goto out;
> +		}
> +		blk += blk_per_bucket;
> +		bucket_buf += OCFS2_XATTR_BUCKET_SIZE;
> +	}

	This code reads all the possible buckets in a cluster, but the
actual number of used buckets might be less.  If that's the case,
ocfs2_read_xattr_bucket() is going to return an error for the invalid
bucket (at least in the metaecc case).  So you can't just exit out.
Instead, why not just change max_buckets = i and continue on?  You'll
just be validating the buckets you could read, which should, in fact,
match the buckets you expected.

> +static errcode_t o2fsck_check_xattr_ibody(o2fsck_state *ost,
> +					  struct ocfs2_dinode *di,
> +					  int *i_changed)
> +{
> +	errcode_t ret;
> +	struct ocfs2_xattr_header *xh = NULL;
> +	struct xattr_info xi = {
> +		.location = IN_INODE,
> +		.max_offset = di->i_xattr_inline_size,
> +		.blkno = di->i_blkno,
> +	};
> +
> +	xh = (struct ocfs2_xattr_header *)
> +		 ((void *)di + ost->ost_fs->fs_blocksize -
> +		  di->i_xattr_inline_size);
> +
> +	ocfs2_swap_xattrs_to_cpu(xh);

	As I commented on the tunefs patch, this swapping should have
been handled inside ocfs2_swap_inode_{to,from}_cpu().

Joel

-- 

"In a crisis, don't hide behind anything or anybody. They're going
 to find you anyway."
	- Paul "Bear" Bryant

Joel Becker
Principal Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127