[Ocfs2-devel] [PATCH] ocfs2: avoid occurring deadlock by changing ocfs2_wq from global to local.

Joseph Qi joseph.qi at huawei.com
Fri Oct 23 02:18:42 PDT 2015


Initialize osb->ocfs2_wq can be moved up and it should be checked if it
is NULL when flush work queue. Because it may fail in
ocfs2_initialize_super before osb->ocfs2_wq is initialized.

On 2015/10/5 18:03, jiangyiwen wrote:
> This patch is solving a scene of deadlock, as follows:
>   Node 1                Node 2                  Node 3
> 1)volume a and b are    only mount vol a        only mount vol b
>   mounted
> 
> 2)                      start to mount b        start to mount a
> 
> 3)                      check hb of Node 3      check hb of Node 2
>                         in vol a, qs_holds++    in vol b, qs_holds++
> 
> 4) -------------------- all nodes' network down --------------------
> 
> 5)                      progress of mount b     the same situation as
>                         failed, and then call   Node 2
>                         ocfs2_dismount_volume.
>                         but the process is hung,
>                         since there is a work
>                         in ocfs2_wq cannot beo
>                         completed. This work is
>                         about vol a, because
>                         ocfs2_wq is global wq.
>                         BTW, this work which is
>                         scheduled in ocfs2_wq is
>                         ocfs2_orphan_scan_work,
>                         and the context in this work
>                         needs to take inode lock
>                         of orphan_dir, because
>                         lockres owner are Node 1 and
>                         all nodes' nework has been down
>                         at the same time, so it can't
>                         get the inode lock.
> 
> 6)                      Why can't this node be fenced
>                         when network disconnected?
>                         Because the process of
>                         mount is hung what caused qs_holds
>                         is not equal 0.
> 
> Because all works in the ocfs2_wq are relative to the super block,
> so the solution is to change the ocfs2_wq from global to local,
> in other words, moving it into the structure of ocfs2 super.
> 
> Signed-off-by: Yiwen Jiang <jiangyiwen at huawei.com>
> ---
>  fs/ocfs2/alloc.c        |  4 ++--
>  fs/ocfs2/journal.c      |  8 ++++----
>  fs/ocfs2/localalloc.c   |  4 ++--
>  fs/ocfs2/ocfs2.h        |  6 ++++++
>  fs/ocfs2/quota_global.c |  2 +-
>  fs/ocfs2/super.c        | 37 +++++++++++++++----------------------
>  fs/ocfs2/super.h        |  2 --
>  7 files changed, 30 insertions(+), 33 deletions(-)
> 
> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
> index 86181d6..f9e0ae7 100644
> --- a/fs/ocfs2/alloc.c
> +++ b/fs/ocfs2/alloc.c
> @@ -6079,7 +6079,7 @@ void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
>  		if (cancel)
>  			cancel_delayed_work(&osb->osb_truncate_log_wq);
> 
> -		queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
> +		queue_delayed_work(osb->ocfs2_wq, &osb->osb_truncate_log_wq,
>  				   OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
>  	}
>  }
> @@ -6254,7 +6254,7 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
> 
>  	if (tl_inode) {
>  		cancel_delayed_work(&osb->osb_truncate_log_wq);
> -		flush_workqueue(ocfs2_wq);
> +		flush_workqueue(osb->ocfs2_wq);
> 
>  		status = ocfs2_flush_truncate_log(osb);
>  		if (status < 0)
> diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
> index ff82b28..93acf2d 100644
> --- a/fs/ocfs2/journal.c
> +++ b/fs/ocfs2/journal.c
> @@ -231,7 +231,7 @@ void ocfs2_recovery_exit(struct ocfs2_super *osb)
>  	/* At this point, we know that no more recovery threads can be
>  	 * launched, so wait for any recovery completion work to
>  	 * complete. */
> -	flush_workqueue(ocfs2_wq);
> +	flush_workqueue(osb->ocfs2_wq);
> 
>  	/*
>  	 * Now that recovery is shut down, and the osb is about to be
> @@ -1327,7 +1327,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
> 
>  	spin_lock(&journal->j_lock);
>  	list_add_tail(&item->lri_list, &journal->j_la_cleanups);
> -	queue_work(ocfs2_wq, &journal->j_recovery_work);
> +	queue_work(journal->j_osb->ocfs2_wq, &journal->j_recovery_work);
>  	spin_unlock(&journal->j_lock);
>  }
> 
> @@ -1972,7 +1972,7 @@ static void ocfs2_orphan_scan_work(struct work_struct *work)
>  	mutex_lock(&os->os_lock);
>  	ocfs2_queue_orphan_scan(osb);
>  	if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE)
> -		queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
> +		queue_delayed_work(osb->ocfs2_wq, &os->os_orphan_scan_work,
>  				      ocfs2_orphan_scan_timeout());
>  	mutex_unlock(&os->os_lock);
>  }
> @@ -2012,7 +2012,7 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
>  		atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
>  	else {
>  		atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE);
> -		queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
> +		queue_delayed_work(osb->ocfs2_wq, &os->os_orphan_scan_work,
>  				   ocfs2_orphan_scan_timeout());
>  	}
>  }
> diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
> index 0a4457f..3e19312 100644
> --- a/fs/ocfs2/localalloc.c
> +++ b/fs/ocfs2/localalloc.c
> @@ -387,7 +387,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
>  	struct ocfs2_dinode *alloc = NULL;
> 
>  	cancel_delayed_work(&osb->la_enable_wq);
> -	flush_workqueue(ocfs2_wq);
> +	flush_workqueue(osb->ocfs2_wq);
> 
>  	if (osb->local_alloc_state == OCFS2_LA_UNUSED)
>  		goto out;
> @@ -1087,7 +1087,7 @@ static int ocfs2_recalc_la_window(struct ocfs2_super *osb,
>  		} else {
>  			osb->local_alloc_state = OCFS2_LA_DISABLED;
>  		}
> -		queue_delayed_work(ocfs2_wq, &osb->la_enable_wq,
> +		queue_delayed_work(osb->ocfs2_wq, &osb->la_enable_wq,
>  				   OCFS2_LA_ENABLE_INTERVAL);
>  		goto out_unlock;
>  	}
> diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
> index 7a01262..2ce47bf 100644
> --- a/fs/ocfs2/ocfs2.h
> +++ b/fs/ocfs2/ocfs2.h
> @@ -464,6 +464,12 @@ struct ocfs2_super
>  	struct ocfs2_refcount_tree *osb_ref_tree_lru;
> 
>  	struct mutex system_file_mutex;
> +
> +	/* OCFS2 needs to schedule several different types of work which
> +	 * require cluster locking, disk I/O, recovery waits, etc. Since these
> +	 * types of work tend to be heavy we avoid using the kernel events
> +	 * workqueue and schedule on our own. */
> +	struct workqueue_struct *ocfs2_wq;
>  };
> 
>  #define OCFS2_SB(sb)	    ((struct ocfs2_super *)(sb)->s_fs_info)
> diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
> index c93d672..44df24b 100644
> --- a/fs/ocfs2/quota_global.c
> +++ b/fs/ocfs2/quota_global.c
> @@ -726,7 +726,7 @@ static int ocfs2_release_dquot(struct dquot *dquot)
>  		dqgrab(dquot);
>  		/* First entry on list -> queue work */
>  		if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list))
> -			queue_work(ocfs2_wq, &osb->dquot_drop_work);
> +			queue_work(osb->ocfs2_wq, &osb->dquot_drop_work);
>  		goto out;
>  	}
>  	status = ocfs2_lock_global_qf(oinfo, 1);
> diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
> index 2de4c8a..130a17f 100644
> --- a/fs/ocfs2/super.c
> +++ b/fs/ocfs2/super.c
> @@ -79,12 +79,6 @@ static struct kmem_cache *ocfs2_inode_cachep;
>  struct kmem_cache *ocfs2_dquot_cachep;
>  struct kmem_cache *ocfs2_qf_chunk_cachep;
> 
> -/* OCFS2 needs to schedule several different types of work which
> - * require cluster locking, disk I/O, recovery waits, etc. Since these
> - * types of work tend to be heavy we avoid using the kernel events
> - * workqueue and schedule on our own. */
> -struct workqueue_struct *ocfs2_wq = NULL;
> -
>  static struct dentry *ocfs2_debugfs_root;
> 
>  MODULE_AUTHOR("Oracle");
> @@ -1612,33 +1606,25 @@ static int __init ocfs2_init(void)
>  	if (status < 0)
>  		goto out2;
> 
> -	ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
> -	if (!ocfs2_wq) {
> -		status = -ENOMEM;
> -		goto out3;
> -	}
> -
>  	ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
>  	if (!ocfs2_debugfs_root) {
>  		status = -ENOMEM;
>  		mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
> -		goto out4;
> +		goto out3;
>  	}
> 
>  	ocfs2_set_locking_protocol();
> 
>  	status = register_quota_format(&ocfs2_quota_format);
>  	if (status < 0)
> -		goto out4;
> +		goto out3;
>  	status = register_filesystem(&ocfs2_fs_type);
>  	if (!status)
>  		return 0;
> 
>  	unregister_quota_format(&ocfs2_quota_format);
> -out4:
> -	destroy_workqueue(ocfs2_wq);
> -	debugfs_remove(ocfs2_debugfs_root);
>  out3:
> +	debugfs_remove(ocfs2_debugfs_root);
>  	ocfs2_free_mem_caches();
>  out2:
>  	exit_ocfs2_uptodate_cache();
> @@ -1649,11 +1635,6 @@ out1:
> 
>  static void __exit ocfs2_exit(void)
>  {
> -	if (ocfs2_wq) {
> -		flush_workqueue(ocfs2_wq);
> -		destroy_workqueue(ocfs2_wq);
> -	}
> -
>  	unregister_quota_format(&ocfs2_quota_format);
> 
>  	debugfs_remove(ocfs2_debugfs_root);
> @@ -2348,6 +2329,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
>  	}
>  	cleancache_init_shared_fs(sb);
> 
> +	osb->ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
> +	if (!osb->ocfs2_wq) {
> +		status = -ENOMEM;
> +		mlog_errno(status);
> +	}
> +
>  bail:
>  	return status;
>  }
> @@ -2535,6 +2522,12 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
>  {
>  	/* This function assumes that the caller has the main osb resource */
> 
> +	/* ocfs2_initializer_super have already created this workqueue */
> +	if (osb->ocfs2_wq) {
> +		flush_workqueue(osb->ocfs2_wq);
> +		destroy_workqueue(osb->ocfs2_wq);
> +	}
> +
>  	ocfs2_free_slot_info(osb);
> 
>  	kfree(osb->osb_orphan_wipes);
> diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
> index b477d0b..b023e4f 100644
> --- a/fs/ocfs2/super.h
> +++ b/fs/ocfs2/super.h
> @@ -26,8 +26,6 @@
>  #ifndef OCFS2_SUPER_H
>  #define OCFS2_SUPER_H
> 
> -extern struct workqueue_struct *ocfs2_wq;
> -
>  int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
>  				  int node_num);
> 





More information about the Ocfs2-devel mailing list