[Ocfs2-devel] [PATCH 3/3] Implement "GROUP_ADD" for online resize, take 2

Mark Fasheh mark.fasheh at oracle.com
Fri Nov 30 15:20:04 PST 2007


On Tue, Nov 27, 2007 at 04:21:49PM +0800, tao.ma wrote:
> diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
> index 8a6925b..805a76e 100644
> --- a/fs/ocfs2/ocfs2.h
> +++ b/fs/ocfs2/ocfs2.h
> @@ -531,6 +531,7 @@ static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
>   * and return that block offset. */
>  u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
>  int ocfs2_group_extend(struct inode * inode, u32 new_clusters);
> +int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input);

resize.h please


>  #define ocfs2_set_bit ext2_set_bit
>  #define ocfs2_clear_bit ext2_clear_bit
> diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
> index 4b5813d..969c310 100644
> --- a/fs/ocfs2/ocfs2_fs.h
> +++ b/fs/ocfs2/ocfs2_fs.h
> @@ -231,7 +231,18 @@ struct ocfs2_space_resv {
>  #define OCFS2_IOC_RESVSP64	_IOW ('X', 42, struct ocfs2_space_resv)
>  #define OCFS2_IOC_UNRESVSP64	_IOW ('X', 43, struct ocfs2_space_resv)
>  
> +/* Used to pass group descriptor data when online resize is done */
> +struct ocfs2_new_group_input {
> +	__u64 group;		/* Group descriptor's blkno. */
> +	__u32 clusters;		/* Total number of clusters in this group */
> +	__u32 frees;		/* Total free clusters in this group */
> +	__u16 chain;		/* Chain for this group */
> +	__u16 reserved1;
> +	__u32 reserved2;
> +};
> +
>  #define OCFS2_IOC_GROUP_EXTEND	_IOW('f', 7, unsigned long)
> +#define OCFS2_IOC_GROUP_ADD	_IOW('f', 8,struct ocfs2_new_group_input)
>  
>  /*
>   * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
> diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
> index 5c863af..efd7805 100644
> --- a/fs/ocfs2/resize.c
> +++ b/fs/ocfs2/resize.c
> @@ -364,3 +364,234 @@ out:
>  	mlog_exit_void();
>  	return ret;
>  }
> +
> +static int ocfs2_check_new_group(struct inode *inode,
> +				 struct ocfs2_dinode *di,
> +				 struct ocfs2_new_group_input *input)
> +{
> +	int ret;
> +	struct ocfs2_group_desc *gd;
> +	struct buffer_head *group_bh = NULL;
> +	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
> +	u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc);
> +	u64 cr_blkno;
> +	unsigned int max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) *
> +				le16_to_cpu(di->id2.i_chain.cl_bpc);
> +
> +	ret = ocfs2_read_block(osb, input->group, &group_bh, 0, NULL);
> +	if (ret < 0)
> +		goto out;

You should pass the inode so that the group gets added to the caching info.
We're fine if the checks here fail and userspace tries again since this read
(that doesn't use the caching flag) always goes to disk.


> +	gd = (struct ocfs2_group_desc *)group_bh->b_data;
> +	cr_blkno = le64_to_cpu(di->id2.i_chain.cl_recs[input->chain].c_blkno);
> +
> +	ret = -EIO;
> +	if (!OCFS2_IS_VALID_GROUP_DESC(gd))
> +		OCFS2_RO_ON_INVALID_GROUP_DESC(inode->i_sb, gd);

We can probably just throw an error instead of going read-only since an
invalid group descriptor here isn't referenced by the fs yet...


> +	else if (di->i_blkno != gd->bg_parent_dinode)
> +		mlog(0, "Group descriptor # %llu has bad parent "
> +		     "pointer (%llu, expected %llu)\n",
> +		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
> +		     (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
> +		     (unsigned long long)le64_to_cpu(di->i_blkno));
> +	else if (le16_to_cpu(gd->bg_bits) > max_bits)
> +		mlog(0, "Group descriptor # %llu has bit count of %u\n",
> +		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
> +		     le16_to_cpu(gd->bg_bits));
> +	else if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits))
> +		mlog(0, "Group descriptor # %llu has bit count %u but "
> +		     "claims that %u are free\n",
> +		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
> +		     le16_to_cpu(gd->bg_bits),
> +		     le16_to_cpu(gd->bg_free_bits_count));
> +	else if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size)))
> +		mlog(0, "Group descriptor # %llu has bit count %u but "
> +		     "max bitmap bits of %u\n",
> +		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
> +		     le16_to_cpu(gd->bg_bits),
> +		     8 * le16_to_cpu(gd->bg_size));
> +	else if (le16_to_cpu(gd->bg_chain) != input->chain)
> +		mlog(0, "Group descriptor # %llu has bad chain %u "
> +		     "while input has %u set.\n",
> +		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
> +		    le16_to_cpu(gd->bg_chain), input->chain);
> +	else if (le16_to_cpu(gd->bg_bits) != input->clusters * cl_bpc)
> +		mlog(0, "Group descriptor # %llu has bit count %u but "
> +		     "input has %u clusters set\n",
> +		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
> +		     le16_to_cpu(gd->bg_bits), input->clusters);
> +	else if (le16_to_cpu(gd->bg_free_bits_count) != input->frees * cl_bpc)
> +		mlog(0, "Group descriptor # %llu has free bit count %u but "
> +		     "it should have %u set\n",
> +		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
> +		     le16_to_cpu(gd->bg_bits),
> +		     input->frees * cl_bpc);
> +	else if (le64_to_cpu(gd->bg_next_group) != cr_blkno)
> +		mlog(0, "Group descriptor # %llu has next group set as %llu "
> +		     "while the chain head has  %llu set\n",
> +		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
> +		     le64_to_cpu(gd->bg_next_group), cr_blkno);

The 1st block pointer in a chain can change when we re-link groups during
allocation, so this check is invalid.


> +	else
> +		ret = 0;
> +out:
> +	if (group_bh)
> +		brelse(group_bh);
> +
> +	return ret;
> +}
> +
> +static int ocfs2_verify_group_input(struct inode *inode,
> +				    struct ocfs2_dinode *di,
> +				    struct ocfs2_new_group_input *input)
> +{
> +	u16 cl_count = le16_to_cpu(di->id2.i_chain.cl_count);
> +	u16 cl_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
> +	u16 next_free = le16_to_cpu(di->id2.i_chain.cl_next_free_rec);
> +	u32 cluster = ocfs2_blocks_to_clusters(inode->i_sb, input->group);
> +	u32 total_clusters = le32_to_cpu(di->i_clusters);
> +	int ret = -EINVAL;
> +
> +	if (cluster < total_clusters)
> +		mlog(0, "add a group which is in the current volume.\n");
> +	else if (input->chain >= cl_count)
> +		mlog(0, "input chain exceeds the limit.\n");
> +	else if (next_free != cl_count && next_free != input->chain)
> +		mlog(0, "the add group should be in chain %u\n", next_free);

If I read this properly, then it's insisting that new groups always be added
to the rightmost chain... I'm not sure that's correct - we want to fill
chains which have fewer groups than the others first.

Anyway, weren't we trying to avoid enforcing group placement policy in the
kernel, short of making sure that the requested chain is a valid one?


> +	else if (total_clusters + input->clusters < total_clusters)
> +		mlog(0, "add group's clusters overflow.\n");
> +	else if (input->clusters > cl_cpg)
> +		mlog(0, "the cluster exceeds the maximum of a group\n");
> +	else if (input->frees > input->clusters)
> +		mlog(0, "the free cluster exceeds the total clusters\n");
> +	else if (total_clusters % cl_cpg != 0)
> +		mlog(0, "the last group isn't full. Use group extend first.\n");
> +	else if (input->group != ocfs2_which_cluster_group(inode, cluster))
> +		mlog(0, "group blkno is invalid\n");
> +	else if ((ret = ocfs2_check_new_group(inode, di, input)))
> +		mlog(0, "group descriptor check failed.\n");
> +	else
> +		ret = 0;
> +
> +	return ret;
> +}
> +
> +/* Add a new group descriptor to global_bitmap. */
> +int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
> +{
> +	int ret;
> +	handle_t *handle;
> +	struct buffer_head *main_bm_bh = NULL;
> +	struct inode *main_bm_inode = NULL;
> +	struct ocfs2_dinode *fe = NULL;
> +	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
> +	struct ocfs2_chain_list *cl;
> +	struct ocfs2_chain_rec *cr;
> +	u16 cl_bpc;
> +
> +	mlog_entry_void();
> +
> +	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
> +		return -EROFS;
> +
> +	main_bm_inode = ocfs2_get_system_file_inode(osb,
> +						    GLOBAL_BITMAP_SYSTEM_INODE,
> +						    OCFS2_INVALID_SLOT);
> +	if (!main_bm_inode) {
> +		ret = -EINVAL;
> +		mlog_errno(ret);
> +		goto out;
> +	}
> +
> +	mutex_lock(&main_bm_inode->i_mutex);
> +
> +	ret = ocfs2_meta_lock(main_bm_inode, &main_bm_bh, 1);
> +	if (ret < 0) {
> +		mlog_errno(ret);
> +		goto out_mutex;
> +	}
> +
> +	fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
> +
> +	if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
> +				 ocfs2_group_bitmap_size(osb->sb) * 8) {
> +		mlog(0, "The disk is too old and small."
> +		     " Force to do offline resize.");
> +		ret = -EINVAL;
> +		goto out_unlock;
> +	}
> +
> +	ret = ocfs2_verify_group_input(main_bm_inode, fe, input);
> +	if (ret) {
> +		mlog_errno(ret);
> +		goto out_unlock;
> +	}
> +
> +	mlog(0, "Add a new group  %llu in chain = %u, length = %u\n",
> +	     input->group, input->chain, input->clusters);
> +
> +	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
> +	if (IS_ERR(handle)) {
> +		mlog_errno(PTR_ERR(handle));
> +		handle = NULL;
> +		ret = -EINVAL;
> +		goto out_unlock;
> +	}
> +
> +	ret = ocfs2_journal_access(handle, main_bm_inode, main_bm_bh,
> +				   OCFS2_JOURNAL_ACCESS_WRITE);
> +	if (ret < 0) {
> +		mlog_errno(ret);
> +		goto out_unlock;
> +	}
> +
> +	cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
> +	cl = &fe->id2.i_chain;
> +	cr = &cl->cl_recs[input->chain];
> +
> +	if (input->chain == le16_to_cpu(cl->cl_next_free_rec)) {
> +		le16_add_cpu(&cl->cl_next_free_rec, 1);
> +		memset(cr, 0, sizeof(struct ocfs2_chain_rec));
> +	}
> +
> +	cr->c_blkno = le64_to_cpu(input->group);

Before this, you need to set the new group descriptors bg_next_group value
to cr->c_blkno.

By the way, this probably means you need to read the group descriptor near
the top of this function instead of within ocfs2_check_new_group().


> +	le32_add_cpu(&cr->c_total, input->clusters * cl_bpc);
> +	le32_add_cpu(&cr->c_free, input->frees * cl_bpc);
> +
> +	le32_add_cpu(&fe->id1.bitmap1.i_total, input->clusters *cl_bpc);
> +	le32_add_cpu(&fe->id1.bitmap1.i_used,
> +		     (input->clusters - input->frees) * cl_bpc);
> +	le32_add_cpu(&fe->i_clusters, input->clusters);
> +	le64_add_cpu(&fe->i_size, input->clusters << osb->s_clustersize_bits);
> +
> +	ret = ocfs2_journal_dirty(handle, main_bm_bh);
> +	if (ret < 0) {
> +		mlog_errno(ret);
> +		goto out_unlock;
> +	}
> +
> +	ret = ocfs2_commit_trans(osb, handle);
> +	if (ret < 0) {
> +		mlog_errno(ret);
> +		goto out_unlock;
> +	}
> +
> +	spin_lock(&OCFS2_I(main_bm_inode)->ip_lock);
> +	OCFS2_I(main_bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
> +	spin_unlock(&OCFS2_I(main_bm_inode)->ip_lock);

Need to also update i_size. I think the other patch was missing
that too...
	--Mark

--
Mark Fasheh
Senior Software Developer, Oracle
mark.fasheh at oracle.com



More information about the Ocfs2-devel mailing list