[Ocfs2-devel] [PATCH] ocfs2: fix deadlock due to wrong locking order

alex chen alex.chen at huawei.com
Wed Sep 24 01:49:08 PDT 2014


Hi, Junxiao:

On 2014/9/10 9:48, Junxiao Bi wrote:
> For commit ocfs2 journal, ocfs2 journal thread will acquire the mutex
> osb->journal->j_trans_barrier and wake up jbd2 commit thread, then it
> will wait until jbd2 commit thread done. In order journal mode, jbd2
> needs flushing dirty data pages first, and this needs get page lock.
> So osb->journal->j_trans_barrier should be got before page lock.
> 
> But ocfs2_write_zero_page() and ocfs2_write_begin_inline() obey this
> locking order, and this will cause deadlock and hung the whole cluster.
> 
> One deadlock catched is the following:
> 
> PID: 13449  TASK: ffff8802e2f08180  CPU: 31  COMMAND: "oracle"
>  #0 [ffff8802ee3f79b0] __schedule at ffffffff8150a524
>  #1 [ffff8802ee3f7a58] schedule at ffffffff8150acbf
>  #2 [ffff8802ee3f7a68] rwsem_down_failed_common at ffffffff8150cb85
>  #3 [ffff8802ee3f7ad8] rwsem_down_read_failed at ffffffff8150cc55
>  #4 [ffff8802ee3f7ae8] call_rwsem_down_read_failed at ffffffff812617a4
>  #5 [ffff8802ee3f7b50] ocfs2_start_trans at ffffffffa0498919 [ocfs2]
>  #6 [ffff8802ee3f7ba0] ocfs2_zero_start_ordered_transaction at ffffffffa048b2b8 [ocfs2]
>  #7 [ffff8802ee3f7bf0] ocfs2_write_zero_page at ffffffffa048e9bd [ocfs2]
>  #8 [ffff8802ee3f7c80] ocfs2_zero_extend_range at ffffffffa048ec83 [ocfs2]
>  #9 [ffff8802ee3f7ce0] ocfs2_zero_extend at ffffffffa048edfd [ocfs2]
>  #10 [ffff8802ee3f7d50] ocfs2_extend_file at ffffffffa049079e [ocfs2]
>  #11 [ffff8802ee3f7da0] ocfs2_setattr at ffffffffa04910ed [ocfs2]
>  #12 [ffff8802ee3f7e70] notify_change at ffffffff81187d29
>  #13 [ffff8802ee3f7ee0] do_truncate at ffffffff8116bbc1
>  #14 [ffff8802ee3f7f50] sys_ftruncate at ffffffff8116bcbd
>  #15 [ffff8802ee3f7f80] system_call_fastpath at ffffffff81515142
>     RIP: 00007f8de750c6f7  RSP: 00007fffe786e478  RFLAGS: 00000206
>     RAX: 000000000000004d  RBX: ffffffff81515142  RCX: 0000000000000000
>     RDX: 0000000000000200  RSI: 0000000000028400  RDI: 000000000000000d
>     RBP: 00007fffe786e040   R8: 0000000000000000   R9: 000000000000000d
>     R10: 0000000000000000  R11: 0000000000000206  R12: 000000000000000d
>     R13: 00007fffe786e710  R14: 00007f8de70f8340  R15: 0000000000028400
>     ORIG_RAX: 000000000000004d  CS: 0033  SS: 002b
> 
> crash64> bt
> PID: 7610   TASK: ffff88100fd56140  CPU: 1   COMMAND: "ocfs2cmt"
>  #0 [ffff88100f4d1c50] __schedule at ffffffff8150a524
>  #1 [ffff88100f4d1cf8] schedule at ffffffff8150acbf
>  #2 [ffff88100f4d1d08] jbd2_log_wait_commit at ffffffffa01274fd [jbd2]
>  #3 [ffff88100f4d1d98] jbd2_journal_flush at ffffffffa01280b4 [jbd2]
>  #4 [ffff88100f4d1dd8] ocfs2_commit_cache at ffffffffa0499b14 [ocfs2]
>  #5 [ffff88100f4d1e38] ocfs2_commit_thread at ffffffffa0499d38 [ocfs2]
>  #6 [ffff88100f4d1ee8] kthread at ffffffff81090db6
>  #7 [ffff88100f4d1f48] kernel_thread_helper at ffffffff81516284
> 
> crash64> bt
> PID: 7609   TASK: ffff88100f2d4480  CPU: 0   COMMAND: "jbd2/dm-20-86"
>  #0 [ffff88100def3920] __schedule at ffffffff8150a524
>  #1 [ffff88100def39c8] schedule at ffffffff8150acbf
>  #2 [ffff88100def39d8] io_schedule at ffffffff8150ad6c
>  #3 [ffff88100def39f8] sleep_on_page at ffffffff8111069e
>  #4 [ffff88100def3a08] __wait_on_bit_lock at ffffffff8150b30a
>  #5 [ffff88100def3a58] __lock_page at ffffffff81110687
>  #6 [ffff88100def3ab8] write_cache_pages at ffffffff8111b752
>  #7 [ffff88100def3be8] generic_writepages at ffffffff8111b901
>  #8 [ffff88100def3c48] journal_submit_data_buffers at ffffffffa0120f67 [jbd2]
>  #9 [ffff88100def3cf8] jbd2_journal_commit_transaction at ffffffffa0121372[jbd2]
>  #10 [ffff88100def3e68] kjournald2 at ffffffffa0127a86 [jbd2]
>  #11 [ffff88100def3ee8] kthread at ffffffff81090db6
>  #12 [ffff88100def3f48] kernel_thread_helper at ffffffff81516284
> 
> Signed-off-by: Junxiao Bi <junxiao.bi at oracle.com>
> ---
>  fs/ocfs2/aops.c |   15 ++++++++-------
>  fs/ocfs2/file.c |   52 ++++++++++++++++++++++++----------------------------
>  2 files changed, 32 insertions(+), 35 deletions(-)
> 
> diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
> index c04183b..c71174a 100644
> --- a/fs/ocfs2/aops.c
> +++ b/fs/ocfs2/aops.c
> @@ -1485,8 +1485,16 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
>  	handle_t *handle;
>  	struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
>  
> +	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
> +	if (IS_ERR(handle)) {
> +		ret = PTR_ERR(handle);
> +		mlog_errno(ret);
> +		goto out;
> +	}
> +
>  	page = find_or_create_page(mapping, 0, GFP_NOFS);
>  	if (!page) {
> +		ocfs2_commit_trans(osb, handle);
>  		ret = -ENOMEM;
>  		mlog_errno(ret);
>  		goto out;
> @@ -1498,13 +1506,6 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
>  	wc->w_pages[0] = wc->w_target_page = page;
>  	wc->w_num_pages = 1;
>  
> -	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
> -	if (IS_ERR(handle)) {
> -		ret = PTR_ERR(handle);
> -		mlog_errno(ret);
> -		goto out;
> -	}
> -
>  	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
>  				      OCFS2_JOURNAL_ACCESS_WRITE);
>  	if (ret) {
> diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
> index 2930e23..c534cb0 100644
> --- a/fs/ocfs2/file.c
> +++ b/fs/ocfs2/file.c
> @@ -760,7 +760,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
>  	struct address_space *mapping = inode->i_mapping;
>  	struct page *page;
>  	unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
> -	handle_t *handle = NULL;
> +	handle_t *handle;
>  	int ret = 0;
>  	unsigned zero_from, zero_to, block_start, block_end;
>  	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
> @@ -769,11 +769,17 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
>  	BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
>  	BUG_ON(abs_from & (inode->i_blkbits - 1));
>  
> +	handle = ocfs2_zero_start_ordered_transaction(inode, di_bh);
> +	if (IS_ERR(handle)) {
> +		ret = PTR_ERR(handle);
> +		goto out;
> +	}
> +

In ocfs2_zero_start_ordered_transaction, the inode will be filed into
inode list of transaction by ocfs2_jbd2_file_inode.
The relationship between ocfs2_jbd2_file_inode and lock_page as
follow:
	find_or_create_page
		->lock_page
	ocfs2_zero_start_ordered_transaction
		->ocfs2_jbd2_file_inode
	unlock_page

But in your patch, the inode will be filed into inode list before the
lock_page, is there a problem?

Thanks.
	--Alex

>  	page = find_or_create_page(mapping, index, GFP_NOFS);
>  	if (!page) {
>  		ret = -ENOMEM;
>  		mlog_errno(ret);
> -		goto out;
> +		goto out_commit_trans;
>  	}
>  
>  	/* Get the offsets within the page that we want to zero */
> @@ -805,15 +811,6 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
>  			goto out_unlock;
>  		}
>  
> -		if (!handle) {
> -			handle = ocfs2_zero_start_ordered_transaction(inode,
> -								      di_bh);
> -			if (IS_ERR(handle)) {
> -				ret = PTR_ERR(handle);
> -				handle = NULL;
> -				break;
> -			}
> -		}
>  
>  		/* must not update i_size! */
>  		ret = block_commit_write(page, block_start + 1,
> @@ -824,27 +821,26 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
>  			ret = 0;
>  	}
>  
> -	if (handle) {
> -		/*
> -		 * fs-writeback will release the dirty pages without page lock
> -		 * whose offset are over inode size, the release happens at
> -		 * block_write_full_page().
> -		 */
> -		i_size_write(inode, abs_to);
> -		inode->i_blocks = ocfs2_inode_sector_count(inode);
> -		di->i_size = cpu_to_le64((u64)i_size_read(inode));
> -		inode->i_mtime = inode->i_ctime = CURRENT_TIME;
> -		di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
> -		di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
> -		di->i_mtime_nsec = di->i_ctime_nsec;
> -		ocfs2_journal_dirty(handle, di_bh);
> -		ocfs2_update_inode_fsync_trans(handle, inode, 1);
> -		ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
> -	}
> +	/*
> +	 * fs-writeback will release the dirty pages without page lock
> +	 * whose offset are over inode size, the release happens at
> +	 * block_write_full_page().
> +	 */
> +	i_size_write(inode, abs_to);
> +	inode->i_blocks = ocfs2_inode_sector_count(inode);
> +	di->i_size = cpu_to_le64((u64)i_size_read(inode));
> +	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
> +	di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
> +	di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
> +	di->i_mtime_nsec = di->i_ctime_nsec;
> +	ocfs2_journal_dirty(handle, di_bh);
> +	ocfs2_update_inode_fsync_trans(handle, inode, 1);
>  
>  out_unlock:
>  	unlock_page(page);
>  	page_cache_release(page);
> +out_commit_trans:
> +	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
>  out:
>  	return ret;
>  }
> 




More information about the Ocfs2-devel mailing list