[Ocfs2-devel] [PATCH] ocfs2: fix a tiny race that leads file system read-only
Joseph Qi
joseph.qi at huawei.com
Wed Mar 2 17:07:20 PST 2016
On 2016/3/2 17:29, xuejiufei wrote:
> When o2hb detects a node down, it first sets the dead node in the recovery
> map and creates ocfs2rec, which will replay the journal for the dead node.
> The o2hb thread then calls dlm_do_local_recovery_cleanup() to delete the
> locks for the dead node. After the locks of the dead node are gone, locks
> for other nodes can be granted and may modify the metadata without replaying
> the journal of the dead node. The details are described as follows.
>
> N1 N2 N3(master)
> modify the extent tree of
> inode, and commit
> dirty metadata to journal,
> then goes down.
> o2hb thread detects
> N1 goes down, set
> recovery map and
> delete the lock of N1.
>
> dlm_thread flush ast
> for the lock of N2.
> do not detect the death
> of N1, so recovery map is
> empty.
>
> read inode from disk
> without replaying
> the journal of N1 and
> modify the extent tree
> of the inode that N1
> had modified.
> ocfs2rec recover the
> journal of N1.
> The modification of N2
> is lost.
>
> The modifications of N1 and N2 are not serialized, which will lead to a
> read-only file system. We can set a recovery_waiting flag on the lock
> resource after deleting the locks for the dead node, to prevent other nodes
> from getting the lock before dlm recovery. After dlm recovery, the
> recovery map on N2 is not empty, so ocfs2_inode_lock_full_nested() will
> wait for ocfs2 recovery.
>
> Signed-off-by: Jiufei Xue <xuejiufei at huawei.com>
The lockres $RECOVERY cannot have this flag set. So I agree with setting the
flag in dlm_free_dead_locks(), which has already excluded the lockres $RECOVERY.
This fix looks good to me.
Reviewed-by: Joseph Qi <joseph.qi at huawei.com>
> ---
> fs/ocfs2/dlm/dlmcommon.h | 5 ++++-
> fs/ocfs2/dlm/dlmmaster.c | 3 ++-
> fs/ocfs2/dlm/dlmrecovery.c | 8 ++++++++
> fs/ocfs2/dlm/dlmthread.c | 6 ++++--
> 4 files changed, 18 insertions(+), 4 deletions(-)
>
> diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
> index 68c607e..1276d91 100644
> --- a/fs/ocfs2/dlm/dlmcommon.h
> +++ b/fs/ocfs2/dlm/dlmcommon.h
> @@ -282,6 +282,7 @@ static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm,
> #define DLM_LOCK_RES_DROPPING_REF 0x00000040
> #define DLM_LOCK_RES_BLOCK_DIRTY 0x00001000
> #define DLM_LOCK_RES_SETREF_INPROG 0x00002000
> +#define DLM_LOCK_RES_RECOVERY_WAITING 0x00004000
>
> /* max milliseconds to wait to sync up a network failure with a node death */
> #define DLM_NODE_DEATH_WAIT_MAX (5 * 1000)
> @@ -789,7 +790,8 @@ __dlm_lockres_state_to_status(struct dlm_lock_resource *res)
>
> assert_spin_locked(&res->spinlock);
>
> - if (res->state & DLM_LOCK_RES_RECOVERING)
> + if (res->state & (DLM_LOCK_RES_RECOVERING|
> + DLM_LOCK_RES_RECOVERY_WAITING))
> status = DLM_RECOVERING;
> else if (res->state & DLM_LOCK_RES_MIGRATING)
> status = DLM_MIGRATING;
> @@ -1009,6 +1011,7 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
> {
> __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_IN_PROGRESS|
> DLM_LOCK_RES_RECOVERING|
> + DLM_LOCK_RES_RECOVERY_WAITING|
> DLM_LOCK_RES_MIGRATING));
> }
>
> diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
> index 9477d6e..12d385b 100644
> --- a/fs/ocfs2/dlm/dlmmaster.c
> +++ b/fs/ocfs2/dlm/dlmmaster.c
> @@ -2432,7 +2432,8 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
> return 0;
>
> /* delay migration when the lockres is in RECOCERING state */
> - if (res->state & DLM_LOCK_RES_RECOVERING)
> + if (res->state & (DLM_LOCK_RES_RECOVERING|
> + DLM_LOCK_RES_RECOVERY_WAITING))
> return 0;
>
> if (res->owner != dlm->node_num)
> diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
> index b94a425..099714e 100644
> --- a/fs/ocfs2/dlm/dlmrecovery.c
> +++ b/fs/ocfs2/dlm/dlmrecovery.c
> @@ -2163,6 +2163,13 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
> for (i = 0; i < DLM_HASH_BUCKETS; i++) {
> bucket = dlm_lockres_hash(dlm, i);
> hlist_for_each_entry(res, bucket, hash_node) {
> + if (res->state & DLM_LOCK_RES_RECOVERY_WAITING) {
> + spin_lock(&res->spinlock);
> + res->state &= ~DLM_LOCK_RES_RECOVERY_WAITING;
> + spin_unlock(&res->spinlock);
> + wake_up(&res->wq);
> + }
> +
> if (!(res->state & DLM_LOCK_RES_RECOVERING))
> continue;
>
> @@ -2300,6 +2307,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
> res->lockname.len, res->lockname.name, freed, dead_node);
> __dlm_print_one_lock_resource(res);
> }
> + res->state |= DLM_LOCK_RES_RECOVERY_WAITING;
> dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
> } else if (test_bit(dead_node, res->refmap)) {
> mlog(0, "%s:%.*s: dead node %u had a ref, but had "
> diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
> index c5f6c24..73c03be 100644
> --- a/fs/ocfs2/dlm/dlmthread.c
> +++ b/fs/ocfs2/dlm/dlmthread.c
> @@ -106,7 +106,8 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res)
> if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
> return 0;
>
> - if (res->state & DLM_LOCK_RES_RECOVERING)
> + if (res->state & (DLM_LOCK_RES_RECOVERING|
> + DLM_LOCK_RES_RECOVERY_WAITING))
> return 0;
>
> /* Another node has this resource with this node as the master */
> @@ -700,7 +701,8 @@ static int dlm_thread(void *data)
> * dirty for a short while. */
> BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
> if (res->state & (DLM_LOCK_RES_IN_PROGRESS |
> - DLM_LOCK_RES_RECOVERING)) {
> + DLM_LOCK_RES_RECOVERING |
> + DLM_LOCK_RES_RECOVERY_WAITING)) {
> /* move it to the tail and keep going */
> res->state &= ~DLM_LOCK_RES_DIRTY;
> spin_unlock(&res->spinlock);
>
More information about the Ocfs2-devel
mailing list