[Ocfs2-devel] [PATCH 2/2] Thread recovery

Sunil Mushran sunil.mushran at oracle.com
Wed Dec 1 21:13:37 PST 2010


One problem I see is that we may spawn too many threads. Imagine
a setup with 50 mounts and 16 nodes. Not uncommon at all. If 5 nodes
die, that's 300 threads, 50 of which will be coordinating threads.

One solution is to have a universal mount count and use that and the
number of cpus to come up with a per-mount parallel reco count.

Thoughts, anyone?

On 11/17/2010 07:50 AM, Goldwyn Rodrigues wrote:
> This threads the recovery procedure so recovery of different nodes can be done
> in parallel.
>
> Signed-off-by: Goldwyn Rodrigues<rgoldwyn at suse.de>
> ---
>   fs/ocfs2/journal.c |   35 ++++++++++++++++++++++-------------
>   fs/ocfs2/journal.h |    2 ++
>   fs/ocfs2/ocfs2.h   |    2 ++
>   fs/ocfs2/super.c   |   15 +++++++++++++++
>   4 files changed, 41 insertions(+), 13 deletions(-)
>
> diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
> index 277b810..4eb2af8 100644
> --- a/fs/ocfs2/journal.c
> +++ b/fs/ocfs2/journal.c
> @@ -58,7 +58,7 @@ DEFINE_SPINLOCK(trans_inc_lock);
>   #define ORPHAN_SCAN_SCHEDULE_TIMEOUT 300000
>
>   static int ocfs2_force_read_journal(struct inode *inode);
> -static int ocfs2_recover_one_node(struct ocfs2_recover_node *rn);
> +static int ocfs2_recover_one_node(void *);
>   static int __ocfs2_recovery_thread(void *arg);
>   static int ocfs2_commit_cache(struct ocfs2_super *osb);
>   static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota);
> @@ -183,6 +183,8 @@ int ocfs2_recovery_init(struct ocfs2_super *osb)
>   	osb->recovery_thread_task = NULL;
>   	init_waitqueue_head(&osb->recovery_event);
>   	INIT_LIST_HEAD(&osb->s_recovery_nodes);
> +	init_waitqueue_head(&osb->recovery_wait);
> +	atomic_set(&osb->num_recovery_threads, 0);
>
>   	return 0;
>   }
> @@ -1372,21 +1374,16 @@ restart:
>   		if (i == rm_quota_used)
>   			rm_quota[rm_quota_used++] = rn->rn_slot_num;
>
> -		status = ocfs2_recover_one_node(rn);
> +		sprintf(rn->rn_str, "ocfs2rec%d", rn->rn_slot_num);

snprintf always. But wondering whether this adds any value
considering we cannot differentiate between volumes. And osb_dump
has the details. I would suggest leaving it as ocfs2rec.

> +		rn->rn_thread = kthread_run(ocfs2_recover_one_node,
> +				(void *)rn, rn->rn_str);
> +		atomic_inc(&osb->num_recovery_threads);
>   skip_recovery:
> -		if (!status) {
> -			ocfs2_recovery_node_clear(osb, rn);
> -		} else {
> -			mlog(ML_ERROR,
> -			     "Error %d recovering node %d on device (%u,%u)!\n",
> -			     status, rn->rn_node_num,
> -			     MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
> -			mlog(ML_ERROR, "Volume requires unmount.\n");
> -		}
> -
>   		spin_lock(&osb->osb_lock);
>   	}
>   	spin_unlock(&osb->osb_lock);
> +	wait_event(osb->recovery_wait,
> +			!atomic_read(&osb->num_recovery_threads));
>   	mlog(0, "All nodes recovered\n");
>
>   	/* Refresh all journal recovery generations from disk */
> @@ -1666,11 +1663,12 @@ done:
>    * second part of a nodes recovery process (local alloc recovery) is
>    * far less concerning.
>    */
> -static int ocfs2_recover_one_node(struct ocfs2_recover_node *rn)
> +static int ocfs2_recover_one_node(void *arg)
>   {
>   	int status = 0;
>   	struct ocfs2_dinode *la_copy = NULL;
>   	struct ocfs2_dinode *tl_copy = NULL;
> +	struct ocfs2_recover_node *rn = (struct ocfs2_recover_node *)arg;
>   	struct ocfs2_super *osb = rn->rn_osb;
>
>   	mlog_entry("(node_num=%d, slot_num=%d, osb->node_num = %d)\n",
> @@ -1723,6 +1721,17 @@ static int ocfs2_recover_one_node(struct
> ocfs2_recover_node *rn)
>   	status = 0;
>   done:
>   	rn->rn_status = status;
> +	if (!status) {
> +		ocfs2_recovery_node_clear(osb, rn);
> +	} else {
> +		mlog(ML_ERROR,
> +			"Error %d recovering node %d on device (%u,%u)!\n",
> +			rn->rn_status, rn->rn_node_num,
> +			MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
> +		mlog(ML_ERROR, "Volume requires unmount.\n");
> +	}
> +	if (atomic_dec_and_test(&osb->num_recovery_threads))
> +		wake_up(&osb->recovery_wait);
>
>   	mlog_exit(status);
>   	return status;
> diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
> index 0325d81..247f3b1 100644
> --- a/fs/ocfs2/journal.h
> +++ b/fs/ocfs2/journal.h
> @@ -49,6 +49,8 @@ struct ocfs2_recover_node {
>   	int rn_slot_num;
>   	int rn_status;
>   	struct list_head rn_list;
> +	struct task_struct *rn_thread;
> +	char rn_str[13];
>   };
>
>
> diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
> index f70c25a..f4ee02b 100644
> --- a/fs/ocfs2/ocfs2.h
> +++ b/fs/ocfs2/ocfs2.h
> @@ -338,6 +338,8 @@ struct ocfs2_super
>   	atomic_t vol_state;
>   	struct mutex recovery_lock;
>   	struct list_head s_recovery_nodes;
> +	atomic_t num_recovery_threads;
> +	wait_queue_head_t recovery_wait;
>
>   	struct ocfs2_replay_map *replay_map;
>   	struct task_struct *recovery_thread_task;
> diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
> index 478715b..1661ab8 100644
> --- a/fs/ocfs2/super.c
> +++ b/fs/ocfs2/super.c
> @@ -222,6 +222,8 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb,
> char *buf, int len)
>   	struct ocfs2_cluster_connection *cconn = osb->cconn;
>   	struct ocfs2_orphan_scan *os =&osb->osb_orphan_scan;
>   	int i, out = 0;
> +	struct list_head *iter;
> +	struct ocfs2_recover_node *rn;
>
>   	out += snprintf(buf + out, len - out,
>   			"%10s =>  Id: %-s  Uuid: %-s  Gen: 0x%X  Label: %-s\n",
> @@ -266,6 +268,19 @@ static int ocfs2_osb_dump(struct ocfs2_super
> *osb, char *buf, int len)
>   			osb->dc_work_sequence);
>   	spin_unlock(&osb->dc_task_lock);
>
> +	spin_lock(&osb->osb_lock);
> +	out += snprintf(buf + out, len - out, "Recovery(main) Pid: %d\n",
> +			(osb->recovery_thread_task ?
> +			task_pid_nr(osb->recovery_thread_task) : -1));
> +	list_for_each(iter,&osb->s_recovery_nodes) {
> +		rn = list_entry(iter, struct ocfs2_recover_node, rn_list);
> +		out += snprintf(buf + out, len - out,
> +			"Recovery(%d) =>  Pid: %d Node:%d\n", rn->rn_slot_num,
> +			task_pid_nr(rn->rn_thread), rn->rn_node_num);
> +	}
> +	spin_unlock(&osb->osb_lock);
> +
> +
>   	out += snprintf(buf + out, len - out,
>   			"%10s =>  Pid: %d  Interval: %lu  Needs: %d\n", "Commit",
>   			(osb->commit_task ? task_pid_nr(osb->commit_task) : -1),




More information about the Ocfs2-devel mailing list