[Ocfs2-devel] [PATCH] ocfs2: fix a tiny race that leads to a read-only file system

Joseph Qi joseph.qi at huawei.com
Wed Mar 2 17:07:20 PST 2016


On 2016/3/2 17:29, xuejiufei wrote:
> When o2hb detects a node down, it first sets the dead node in the
> recovery map and creates ocfs2rec, which will replay the journal of
> the dead node. The o2hb thread then calls
> dlm_do_local_recovery_cleanup() to delete the locks held by the dead
> node. Once the dead node's locks are gone, locks for other nodes can
> be granted, and those nodes may modify the metadata without the dead
> node's journal having been replayed. The details are described
> below; a condensed view of the cleanup path follows the diagram.
> 
>      N1                         N2                   N3(master)
> modifies the extent tree of
> an inode, commits the
> dirty metadata to its journal,
> then goes down.
>                                                  o2hb thread detects
>                                                  that N1 is down, sets
>                                                  the recovery map and
>                                                  deletes the locks of N1.
> 
>                                                  dlm_thread flushes the
>                                                  ast for the lock of N2.
>                         does not detect the death
>                         of N1, so its recovery
>                         map is empty.
> 
>                         reads the inode from disk
>                         without replaying the
>                         journal of N1, and modifies
>                         the extent tree of the
>                         inode that N1 had
>                         modified.
>                                                  ocfs2rec recovers the
>                                                  journal of N1.
>                                                  The modification of N2
>                                                  is lost.
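> 
> A condensed view of the cleanup path above, with error handling and
> the $RECOVERY special case omitted (roughly what happens in
> fs/ocfs2/dlm/dlmrecovery.c when the o2hb node-down callback,
> dlm_hb_node_down_cb() -> __dlm_hb_node_down(), fires):
> 
> 	static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm,
> 						  u8 dead_node)
> 	{
> 		struct dlm_lock_resource *res;
> 		struct hlist_head *bucket;
> 		int i;
> 
> 		for (i = 0; i < DLM_HASH_BUCKETS; i++) {
> 			bucket = dlm_lockres_hash(dlm, i);
> 			hlist_for_each_entry(res, bucket, hash_node) {
> 				spin_lock(&res->spinlock);
> 				if (res->owner == dlm->node_num)
> 					/* frees the dead node's locks;
> 					 * without this patch, blocked
> 					 * locks of live nodes can be
> 					 * granted right after this */
> 					dlm_free_dead_locks(dlm, res,
> 							    dead_node);
> 				spin_unlock(&res->spinlock);
> 			}
> 		}
> 	}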
> 
> The modifications from N1 and N2 are not serialized, and the
> resulting corruption will drive the file system read-only. We can set
> the DLM_LOCK_RES_RECOVERY_WAITING flag on the lock resource after
> deleting the dead node's locks, to prevent other nodes from getting
> the lock before dlm recovery finishes. After dlm recovery, the
> recovery map on N2 is no longer empty, so
> ocfs2_inode_lock_full_nested() will wait for ocfs2 recovery.
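> 
> For reference, the wait on the fs side looks roughly like this (the
> predicate name recovery_map_is_empty() is illustrative, not the exact
> dlmglue.c helper):
> 
> 	/* called from ocfs2_inode_lock_full_nested() unless
> 	 * OCFS2_META_LOCK_RECOVERY is passed in arg_flags */
> 	static void ocfs2_wait_for_recovery(struct ocfs2_super *osb)
> 	{
> 		/* sleeps until ocfs2rec has replayed the journals
> 		 * of all nodes currently in the recovery map */
> 		wait_event(osb->recovery_event,
> 			   recovery_map_is_empty(osb));
> 	}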
> 
> Signed-off-by: Jiufei Xue <xuejiufei at huawei.com>

The flag must not be set on lockres $RECOVERY. So I agree with setting
the flag in dlm_free_dead_locks(), which already excludes lockres
$RECOVERY. This fix looks good to me.
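
To illustrate, condensed from dlm_do_local_recovery_cleanup() (details
trimmed): the $RECOVERY lockres takes the early branch and never
reaches dlm_free_dead_locks(), so the new flag is only ever set on
ordinary lock resources:

	hlist_for_each_entry(res, bucket, hash_node) {
		/* $RECOVERY entries for dead nodes are pruned by
		 * hand in this branch */
		if (dlm_is_recovery_lock(res->lockname.name,
					 res->lockname.len)) {
			...
			continue;	/* skips dlm_free_dead_locks() */
		}
		...
	}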

Reviewed-by: Joseph Qi <joseph.qi at huawei.com>

> ---
>  fs/ocfs2/dlm/dlmcommon.h   | 5 ++++-
>  fs/ocfs2/dlm/dlmmaster.c   | 3 ++-
>  fs/ocfs2/dlm/dlmrecovery.c | 8 ++++++++
>  fs/ocfs2/dlm/dlmthread.c   | 6 ++++--
>  4 files changed, 18 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
> index 68c607e..1276d91 100644
> --- a/fs/ocfs2/dlm/dlmcommon.h
> +++ b/fs/ocfs2/dlm/dlmcommon.h
> @@ -282,6 +282,7 @@ static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm,
>  #define DLM_LOCK_RES_DROPPING_REF         0x00000040
>  #define DLM_LOCK_RES_BLOCK_DIRTY          0x00001000
>  #define DLM_LOCK_RES_SETREF_INPROG        0x00002000
> +#define DLM_LOCK_RES_RECOVERY_WAITING     0x00004000
> 
>  /* max milliseconds to wait to sync up a network failure with a node death */
>  #define DLM_NODE_DEATH_WAIT_MAX (5 * 1000)
> @@ -789,7 +790,8 @@ __dlm_lockres_state_to_status(struct dlm_lock_resource *res)
> 
>  	assert_spin_locked(&res->spinlock);
> 
> -	if (res->state & DLM_LOCK_RES_RECOVERING)
> +	if (res->state & (DLM_LOCK_RES_RECOVERING|
> +			DLM_LOCK_RES_RECOVERY_WAITING))
>  		status = DLM_RECOVERING;
>  	else if (res->state & DLM_LOCK_RES_MIGRATING)
>  		status = DLM_MIGRATING;
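
(With DLM_LOCK_RES_RECOVERY_WAITING included here, a lock request that
races with the cleanup is answered with DLM_RECOVERING and the
requesting node simply retries. Condensed and paraphrased from
dlmlock() in fs/ocfs2/dlm/dlmlock.c:)

	retry_lock:
		status = dlmlock_remote(dlm, res, lock, flags);
		if (status == DLM_RECOVERING || status == DLM_MIGRATING ||
		    status == DLM_FORWARD) {
			msleep(100);
			/* wait until recovery finishes, then retry */
			dlm_wait_for_recovery(dlm);
			goto retry_lock;
		}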
> @@ -1009,6 +1011,7 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
>  {
>  	__dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_IN_PROGRESS|
>  				    	  DLM_LOCK_RES_RECOVERING|
> +					  DLM_LOCK_RES_RECOVERY_WAITING|
>  					  DLM_LOCK_RES_MIGRATING));
>  }
> 
> diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
> index 9477d6e..12d385b 100644
> --- a/fs/ocfs2/dlm/dlmmaster.c
> +++ b/fs/ocfs2/dlm/dlmmaster.c
> @@ -2432,7 +2432,8 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
>  		return 0;
> 
>  	/* delay migration when the lockres is in RECOCERING state */
> -	if (res->state & DLM_LOCK_RES_RECOVERING)
> +	if (res->state & (DLM_LOCK_RES_RECOVERING|
> +			DLM_LOCK_RES_RECOVERY_WAITING))
>  		return 0;
> 
>  	if (res->owner != dlm->node_num)
> diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
> index b94a425..099714e 100644
> --- a/fs/ocfs2/dlm/dlmrecovery.c
> +++ b/fs/ocfs2/dlm/dlmrecovery.c
> @@ -2163,6 +2163,13 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
>  	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
>  		bucket = dlm_lockres_hash(dlm, i);
>  		hlist_for_each_entry(res, bucket, hash_node) {
> +			if (res->state & DLM_LOCK_RES_RECOVERY_WAITING) {
> +				spin_lock(&res->spinlock);
> +				res->state &= ~DLM_LOCK_RES_RECOVERY_WAITING;
> +				spin_unlock(&res->spinlock);
> +				wake_up(&res->wq);
> +			}
> +
>  			if (!(res->state & DLM_LOCK_RES_RECOVERING))
>  				continue;
> 
> @@ -2300,6 +2307,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
>  			     res->lockname.len, res->lockname.name, freed, dead_node);
>  			__dlm_print_one_lock_resource(res);
>  		}
> +		res->state |= DLM_LOCK_RES_RECOVERY_WAITING;
>  		dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
>  	} else if (test_bit(dead_node, res->refmap)) {
>  		mlog(0, "%s:%.*s: dead node %u had a ref, but had "
> diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
> index c5f6c24..73c03be 100644
> --- a/fs/ocfs2/dlm/dlmthread.c
> +++ b/fs/ocfs2/dlm/dlmthread.c
> @@ -106,7 +106,8 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res)
>  	if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
>  		return 0;
> 
> -	if (res->state & DLM_LOCK_RES_RECOVERING)
> +	if (res->state & (DLM_LOCK_RES_RECOVERING|
> +			DLM_LOCK_RES_RECOVERY_WAITING))
>  		return 0;
> 
>  	/* Another node has this resource with this node as the master */
> @@ -700,7 +701,8 @@ static int dlm_thread(void *data)
>  			 * dirty for a short while. */
>  			BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
>  			if (res->state & (DLM_LOCK_RES_IN_PROGRESS |
> -					  DLM_LOCK_RES_RECOVERING)) {
> +					  DLM_LOCK_RES_RECOVERING |
> +					  DLM_LOCK_RES_RECOVERY_WAITING)) {
>  				/* move it to the tail and keep going */
>  				res->state &= ~DLM_LOCK_RES_DIRTY;
>  				spin_unlock(&res->spinlock);
> 