[Ocfs2-tools-devel] [PATCH 1/2 v4] fsck: supporting fixing inode alloc group desc

Fri Feb 23 00:38:51 PST 2018

Hi,

On 02/14/2018 11:38 AM, piaojun wrote:
> When inode_alloc's gd is corrupted, we may reinitialize it and then set
> its bitmap by iterating all files of root dir.
>
> How to create a corrupted gd?
> 1. Find the gd blkno of inode_alloc with debugfs.ocfs2:
>      # debugfs.ocfs2 -R "stat //inode_alloc:0000" /dev/mapper/xxxx
>      ...
>      ##   Block#            Total    Used     Free     Contig   Size
>      0    167424            1024     3        1021     1021     4032
> 2. clear the gd with 'dd' command:
>      # dd if=/dev/zero of=/dev/mapper/xxxx bs=4k count=1 seek=167424
>      oflag=direct
>
> How to fix a corrupted gd?
> 1. Identify the corrupted gd by generation and magic.
> 2. Initialize the corrupted gd with ocfs2_init_group_desc().
> 3. Iterate all files in root dir, and set inode_alloc's bitmap by inode
>     blknum.
> 4. Write back the good gd to disk.
>
> Currently we can only fix the case where there is exactly one gd in each
> chain, because we can hardly rebuild a gd that is far from the chain
> header. The key problem is that we cannot trust the gds anymore once they
> have been corrupted, so we must rely on the ocfs2_chain_list struct to
> restore all gds.
>
> Signed-off-by: Jun Piao <piaojun at huawei.com>
> ---
>   fsck.ocfs2/pass0.c | 182 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 182 insertions(+)
>
> diff --git a/fsck.ocfs2/pass0.c b/fsck.ocfs2/pass0.c
> index bfd11fb..25cb779 100644
> --- a/fsck.ocfs2/pass0.c
> +++ b/fsck.ocfs2/pass0.c
> @@ -1308,6 +1308,184 @@ static errcode_t verify_bitmap_descs(o2fsck_state *ost,
>   	return ret;
>   }
>
> +struct walk_path {
> +	const char *argv0;
> +	char *path;
> +	ocfs2_filesys *fs;
> +	struct ocfs2_group_desc *bgs;
> +	int corrupted_bgs;
> +};
> +
> +static int set_bitmap_func(struct ocfs2_dir_entry *dentry,
> +			  uint64_t blocknr,
> +			  int offset,
> +			  int blocksize,
> +			  char *buf,
> +			  void *priv_data)
> +{
> +	struct walk_path *wp = priv_data;
> +	struct ocfs2_group_desc *bg;
> +	__le64 inode = dentry->inode;
> +	__le64 bg_blkno;
> +	errcode_t ret;
> +	int len;
> +	int reti = 0;
> +	int i = 0;
> +	char *old_path, *path = NULL;
> +
> +	if (!strncmp(dentry->name, ".", dentry->name_len) ||
> +	    !strncmp(dentry->name, "..", dentry->name_len))
> +		return 0;
> +
> +	ret = ocfs2_malloc0(PATH_MAX, &path);
> +	if (ret) {
> +		com_err(wp->argv0, ret,
> +			"while allocating path memory in %s\n", wp->path);
> +		return OCFS2_DIRENT_ABORT;
> +	}
> +
> +	len = strlen(wp->path);
> +	memcpy(path, wp->path, len);
> +	memcpy(path + len, dentry->name, dentry->name_len);
> +	if (dentry->file_type == OCFS2_FT_DIR)
> +		path[len + dentry->name_len] = '/';
> +
> +	/* set group desc bitmap */
> +	for (i = 0; i < wp->corrupted_bgs; i++) {
> +		bg = &wp->bgs[i];
> +		bg_blkno = bg->bg_blkno;
> +		if (inode > bg_blkno && inode <= bg_blkno + bg->bg_bits) {
> +			ocfs2_set_bit(inode - bg_blkno, bg->bg_bitmap);
> +			bg->bg_free_bits_count--;
> +		}
> +	}
> +
> +	if (dentry->file_type == OCFS2_FT_DIR) {
> +		old_path = wp->path;
> +		wp->path = path;
> +		ret = ocfs2_dir_iterate(wp->fs, inode, 0, NULL,
> +					set_bitmap_func, wp);
> +		if (ret) {
> +			com_err(wp->argv0, ret, "while walking %s", wp->path);
> +			reti = OCFS2_DIRENT_ABORT;
> +		}
> +		wp->path = old_path;
> +	}
> +
> +	ocfs2_free(&path);
> +
> +	return reti;
> +}
> +
> +static errcode_t verify_group_desc(o2fsck_state *ost,
> +				     struct ocfs2_dinode *di, int type)
> +{
> +	uint16_t bits;
> +	uint64_t blkno;
> +	errcode_t ret = 0;
> +	int corrupted_bgs = 0, i;
> +	struct ocfs2_chain_list *cl = &di->id2.i_chain;
> +	struct ocfs2_chain_rec *rec;
> +	struct ocfs2_group_desc *bgs = NULL;
> +
> +	ret = ocfs2_malloc_blocks(ost->ost_fs->fs_io,
> +			cl->cl_next_free_rec, &bgs);
> +	if (ret) {
> +		com_err(whoami, ret, "while allocating block group descriptors");
> +		goto out;
> +	}
> +	memset(bgs, 0, ost->ost_fs->fs_blocksize * cl->cl_next_free_rec);
> +
> +	/*
> +	 * Currently we could only fix the situation that there is one gd
> +	 * in each chain, because we can hardly rebuild the gd far from
> +	 * chain header. The key problem is that we can not trust gd
> +	 * anymore as they have been corrupted. So we must relay on
> +	 * the ocfs2_chain_list struct to restore all gds.
> +	 */

As discussed in the v3 thread (quoted below):

===

>I still think it's not proper to silently assume that there is only one gd in each chain.
>So, could we warn the user by checking how many gds the allocator has?
>If the number of gds > the number of chain recs, give some warnings?

> Eric

Agreed, we could give the user a choice of whether to fix this problem if
the number of gds > the number of chain recs.

===

I cannot see where this is enforced in this version. Where is the check?

Eric

> +	for (i = 0; i < cl->cl_next_free_rec; i++) {
> +		rec = &cl->cl_recs[i];
> +		blkno = rec->c_blkno;
> +		bits = rec->c_total;
> +
> +		ret = ocfs2_read_group_desc(ost->ost_fs, blkno,
> +				(char *)&bgs[corrupted_bgs]);
> +		if ((ret == OCFS2_ET_BAD_GROUP_DESC_MAGIC) ||
> +		    (!ret && bgs[corrupted_bgs].bg_generation != ost->ost_fs_generation)) {
> +			if (!prompt(ost, PY, PR_GROUP_EXPECTED_DESC,
> +			    "Block %"PRIu64" should be a group "
> +			    "descriptor for the bitmap chain allocator "
> +			    "but it was corrupted.  Reinitialize it as "
> +			    "a group desc and link it into the bitmap "
> +			    "allocator? Note that we could only fix the "
> +			    "situation that there is one gd in each chain",
> +			    blkno))
> +				continue;
> +			ocfs2_init_group_desc(ost->ost_fs,
> +					&bgs[corrupted_bgs],
> +					blkno, ost->ost_fs_generation,
> +					di->i_blkno, bits, i, 1);
> +			corrupted_bgs++;
> +		} else if (ret) {
> +			com_err(whoami, ret, "while reading a block bitmap "
> +				"group descriptor from block %"PRIu64,
> +				blkno);
> +		}
> +	}
> +
> +	/* traverse all inodes, and set group desc bitmap */
> +	if (corrupted_bgs) {
> +		/* Walk root dir */
> +		struct walk_path wp;
> +		uint64_t root_blkno;
> +		char *path = NULL;
> +
> +		switch (type) {
> +		case GLOBAL_INODE_ALLOC_SYSTEM_INODE:
> +			path = "//";
> +			root_blkno = ost->ost_fs->fs_sysdir_blkno;
> +			break;
> +		case INODE_ALLOC_SYSTEM_INODE:
> +			path = "/";
> +			root_blkno = ost->ost_fs->fs_root_blkno;
> +			break;
> +		default:
> +			ret = OCFS2_ET_INTERNAL_FAILURE;
> +			com_err(whoami, ret, "while verifying group desc");
> +			goto out;
> +		}
> +
> +		wp.argv0 = whoami;
> +		wp.path = path;
> +		wp.fs = ost->ost_fs;
> +		wp.bgs = bgs;
> +		wp.corrupted_bgs = corrupted_bgs;
> +		ret = ocfs2_dir_iterate(ost->ost_fs,
> +				root_blkno, 0, NULL,
> +				set_bitmap_func, &wp);
> +		if (ret) {
> +			com_err(whoami, ret, "while walking root dir");
> +			goto out;
> +		}
> +	}
> +
> +	/* write back fixed bgs */
> +	for (i = 0; i < corrupted_bgs; i++) {
> +		ret = ocfs2_write_group_desc(ost->ost_fs,
> +				bgs[i].bg_blkno,
> +				(char *)&bgs[i]);
> +		if (ret) {
> +			com_err(whoami, ret, "while writing a block group "
> +				"descriptor at block %"PRIu64, blkno);
> +			ost->ost_saw_error = 1;
> +		}
> +	}
> +
> +out:
> +	ocfs2_free(&bgs);
> +	return ret;
> +}
> +
>   /* this returns an error if it didn't leave the allocators in a state that
>    * the iterators will be able to work with.  There is probably some room
>    * for more resiliance here. */
> @@ -1483,6 +1661,10 @@ errcode_t o2fsck_pass0(o2fsck_state *ost)
>   			verbosef("Caching inode alloc failed, err %d\n",
>   				 (int)ret);
>
> +		ret = verify_group_desc(ost, di, type);
> +		if (ret)
> +			goto out;
> +
>   		ret = verify_chain_alloc(ost, di,
>   					 blocks + ost->ost_fs->fs_blocksize,
>   					 blocks +