[Ocfs2-devel] [PATCH 1/2] Add support for cross extent block merge.V1

Mark Fasheh mark.fasheh at oracle.com
Sun Jan 6 21:51:16 PST 2008


On Fri, Jan 04, 2008 at 04:45:38PM +0800, tao.ma wrote:
> In ocfs2_merge_rec_left, when we find the merge extent is "CONTIG_RIGHT"
> with the first extent record of the next extent block, we will merge it to
> the next extent block and change all the related extent blocks accordingly.
> 
> In ocfs2_merge_rec_right, when we find the merge extent is "CONTIG_LEFT"
> with the last extent record of the previous extent block, we will merge
> it to the prevoius extent block and change all the related extent blocks
> accordingly.
> 
> As for CONTIG_LEFTRIGHT, we will handle CONTIG_RIGHT first when the split_index
> is zero since it is more efficient and easier.
> 
> Signed-off-by: Tao Ma <tao.ma at oracle.com>
> ---
>  fs/ocfs2/alloc.c |  367 ++++++++++++++++++++++++++++++++++++++++++++++++-----
>  1 files changed, 332 insertions(+), 35 deletions(-)
> 
> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
> index 23c8cda..c256750 100644
> --- a/fs/ocfs2/alloc.c
> +++ b/fs/ocfs2/alloc.c
> @@ -1450,6 +1450,8 @@ static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
>   *   - When our insert into the right path leaf is at the leftmost edge
>   *     and requires an update of the path immediately to it's left. This
>   *     can occur at the end of some types of rotation and appending inserts.
> + *   - When we've adjusted the last extent record in the left path leaf and the
> + *     1st extent record in the right path leaf during cross extent block merge.
>   */
>  static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
>  				       struct ocfs2_path *left_path,

Ok, great. I like that we're keeping these functions documented... It's hard
enough to follow as-is!



>  /*
>   * Remove split_rec clusters from the record at index and merge them
>   * onto the beginning of the record at index + 1.
>   */

Should update this comment though...

> -static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh,
> -				handle_t *handle,
> -				struct ocfs2_extent_rec *split_rec,
> -				struct ocfs2_extent_list *el, int index)
> +static int ocfs2_merge_rec_right(struct inode *inode,
> +				 struct ocfs2_path *left_path,
> +				 handle_t *handle,
> +				 struct ocfs2_extent_rec *split_rec,
> +				 struct ocfs2_extent_list *el, int index)
>  {


> @@ -2751,7 +2848,90 @@ static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh,
>  	if (ret)
>  		mlog_errno(ret);
>  
> +	if (right_path) {
> +		ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
> +		if (ret)
> +			mlog_errno(ret);
> +
> +		root_bh = left_path->p_node[subtree_index].bh;
> +		BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
> +
> +		ret = ocfs2_journal_access(handle, inode, root_bh,
> +					   OCFS2_JOURNAL_ACCESS_WRITE);
> +		if (ret) {
> +			mlog_errno(ret);
> +			goto out;
> +		}
> +
> +		for (i = subtree_index + 1;
> +		     i < path_num_items(right_path); i++) {
> +			ret = ocfs2_journal_access(handle, inode,
> +						   right_path->p_node[i].bh,
> +						   OCFS2_JOURNAL_ACCESS_WRITE);
> +			if (ret) {
> +				mlog_errno(ret);
> +				goto out;
> +			}
> +
> +			ret = ocfs2_journal_access(handle, inode,
> +						   left_path->p_node[i].bh,
> +						   OCFS2_JOURNAL_ACCESS_WRITE);
> +			if (ret) {
> +				mlog_errno(ret);
> +				goto out;
> +			}
> +		}

I'd move the journal_access() calls here up before we actually change the
records. That way we can catch journal errors before the tree gets into an
transient state.


> @@ -2759,22 +2939,64 @@ out:
>   * Remove split_rec clusters from the record at index and merge them
>   * onto the tail of the record at index - 1.
>   */

Should update this comment.


> -static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh,
> +static int ocfs2_merge_rec_left(struct inode *inode,
> +				struct ocfs2_path *right_path,
>  				handle_t *handle,
>  				struct ocfs2_extent_rec *split_rec,
>  				struct ocfs2_extent_list *el, int index)
>  {
> -	int ret, has_empty_extent = 0;
> +	int ret, i, subtree_index = 0, has_empty_extent = 0;
>  	unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
>  	struct ocfs2_extent_rec *left_rec;
>  	struct ocfs2_extent_rec *right_rec;
> +	struct buffer_head *bh = path_leaf_bh(right_path);
> +	struct buffer_head *root_bh = NULL;
> +	struct ocfs2_path *left_path = NULL;
> +	struct ocfs2_extent_list *left_el;
>  
> -	BUG_ON(index <= 0);
> +	BUG_ON(index < 0);
>  
> -	left_rec = &el->l_recs[index - 1];
>  	right_rec = &el->l_recs[index];
> -	if (ocfs2_is_empty_extent(&el->l_recs[0]))
> -		has_empty_extent = 1;
> +	if (index == 0) {
> +		/* we meet with a cross extent block merge. */
> +		ret = ocfs2_get_left_path(inode, right_path, &left_path);
> +		if (ret) {
> +			mlog_errno(ret);
> +			goto out;
> +		}
> +
> +		left_el = path_leaf_el(left_path);
> +		BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
> +		       le16_to_cpu(left_el->l_count));
> +
> +		left_rec = &left_el->l_recs[le16_to_cpu(left_el->l_count) - 1];

Using left_el->l_count to index the array is correct here, but would you
mind changing this to use l_next_free_rec so it's more consistent with what
we usually do? It actually reads a bit weird to my eyes because I'm used to
seeing l_next_free_rec inside of array derefs!


> @@ -2802,24 +3024,77 @@ static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh,
>  	ocfs2_cleanup_merge(el, index);
>  
>  	ret = ocfs2_journal_dirty(handle, bh);
> -	if (ret)
> +	if (ret) {
>  		mlog_errno(ret);
> +		goto out;
> +	}
>  
> +	if (left_path) {
> +		ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
> +		if (ret) {
> +			mlog_errno(ret);
> +			goto out;
> +		}
> +
> +		/*
> +		 * In the situation that the right_rec is empty and the extent
> +		 * block is empty also, ocfs2_complete_edge_insert can't handle
> +		 * it, while ocfs2_rotate_tree_left can be called and the extent
> +		 * block will be deleted since c_split_covers_rec is set.
> +		 */
> +		if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
> +		    le16_to_cpu(el->l_next_free_rec) == 1)
> +			goto out;
>

Hmm, I'm worried that we're exiting this function with a path whose upper
cpos values are inconsistent. An error before we get the chance to actually
do the rotate / removal could leave us with an inconsistent tree.

I'm not sure that we can just slap some code to handle it here though - from
what I can tell, the callers are expecting that we'd leave an empty extent
for them to handle.

I see that ocfs2_rotate_tree_left _almost_ handles this situation
correctly, except the transaction extend could fail before we get to remove
the blocks. That might actually be a bug, though I have to think about it
some more. Generally if ocfs2_rotate_tree_left handled it 100% correctly, we
_could_ just comment the heck out of this function that callers *must* call
the rotate function later. Since calling ocfs2_rotate_tree_left() is
implicitely decided by the callers, I'd also want to audit the code to make
sure that it always gets called in the new merge situations we've
introduced.


> +		root_bh = left_path->p_node[subtree_index].bh;
> +		BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
> +
> +		ret = ocfs2_journal_access(handle, inode, root_bh,
> +					   OCFS2_JOURNAL_ACCESS_WRITE);
> +		if (ret) {
> +			mlog_errno(ret);
> +			goto out;
> +		}
> +
> +		for (i = subtree_index + 1;
> +		     i < path_num_items(right_path); i++) {
> +			ret = ocfs2_journal_access(handle, inode,
> +						   right_path->p_node[i].bh,
> +						   OCFS2_JOURNAL_ACCESS_WRITE);
> +			if (ret) {
> +				mlog_errno(ret);
> +				goto out;
> +			}
> +
> +			ret = ocfs2_journal_access(handle, inode,
> +						   left_path->p_node[i].bh,
> +						   OCFS2_JOURNAL_ACCESS_WRITE);
> +			if (ret) {
> +				mlog_errno(ret);
> +				goto out;
> +			}
> +		}

Same comment as before regarding the journal_access calls.


> @@ -2858,9 +3132,21 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
>  		 * Since the adding of an empty extent shifts
>  		 * everything back to the right, there's no need to
>  		 * update split_index here.
> -		 */
> -		ret = ocfs2_merge_rec_left(inode, path_leaf_bh(left_path),
> -					   handle, split_rec, el, split_index);
> +		 *
> + 		 * When the split_index is zero, we need to merge it to the
> + 		 * prevoius extent block. It is more efficient and easier
> + 		 * if we do merge_right first and merge_left later.
> + 		 */

So why don't we just always do the left merge 1st, instead of switching on
them like this?
	--Mark

--
Mark Fasheh
Principal Software Developer, Oracle
mark.fasheh at oracle.com



More information about the Ocfs2-devel mailing list