[Ocfs2-devel] [PATCH 01/11] ocfs2: event-driven quorum

Mon Jan 9 16:39:42 CST 2006

This patch decouples o2net and o2quo so that they know as little about one
 another as possible. This is the first in a series of patches that will allow
 userspace cluster interaction. Quorum is separated out first, and will
 ultimately only be associated with the disk heartbeat as a separate module.

 To do so, this patch performs the following changes:
 * o2hb_notify() is added to handle injection of events in a synchronous
   manner. All locking is preserved as expected.
 * Disk heartbeat timeouts now inject an event for this node being down. This
   event is treated specially by o2quo, which fences the node.
 * o2quo callbacks are now called directly by heartbeat rather than going
   through o2net. Previously, o2net callbacks called o2quo callbacks
   immediately. This ordering is preserved by increasing o2quo's priority over
   o2net.
 * Two new heartbeat event types are added: O2HB_CONN_{UP,DOWN}_CB, which
   correspond to tcp connections being established and terminated.
 * Outside of callbacks, where o2net used to call o2quo functions directly,
   it now injects the O2HB_CONN_{UP,DOWN}_CB events.
 * o2net's knowledge of o2quo in header files has been moved to quorum.h.
 * o2net's handling of quorum decisions on connection failure has been
   moved to o2quo.
 * o2quo is initialized by the nodemanager rather than by o2net.

 *******
 Unfortunately, this code is actually broken. It will deadlock on
 o2hb_callback_sem when unmounting the last file system.

 Don't actually use this code; it's just posted for review.
 *******

 fs/ocfs2/cluster/heartbeat.c    |   14 ++++++
 fs/ocfs2/cluster/heartbeat.h    |    5 ++
 fs/ocfs2/cluster/nodemanager.c  |    3 +
 fs/ocfs2/cluster/quorum.c       |   82 +++++++++++++++++++++++++++++++++++++---
 fs/ocfs2/cluster/quorum.h       |   13 ++----
 fs/ocfs2/cluster/tcp.c          |   36 +++++------------
 fs/ocfs2/cluster/tcp_internal.h |   12 -----
 7 files changed, 117 insertions(+), 48 deletions(-)

Signed-off-by: Jeff Mahoney <jeffm at suse.com>
diff -ruNpX dontdiff linux-2.6.15-staging1/fs/ocfs2/cluster/heartbeat.c linux-2.6.15-staging2/fs/ocfs2/cluster/heartbeat.c

--- linux-2.6.15-staging1/fs/ocfs2/cluster/heartbeat.c	2006-01-08 18:23:29.376721976 -0500
+++ linux-2.6.15-staging2/fs/ocfs2/cluster/heartbeat.c	2006-01-08 18:15:23.647564032 -0500
@@ -158,6 +158,7 @@ struct o2hb_bio_wait_ctxt {
 static void o2hb_write_timeout(void *arg)
 {
 	struct o2hb_region *reg = arg;
+	struct o2nm_node *node = o2nm_get_node_by_num(o2nm_this_node());
 
 	mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u "
 	     "milliseconds\n", reg->hr_dev_name,
@@ -588,6 +589,7 @@ static void o2hb_queue_node_event(struct
 {
 	assert_spin_locked(&o2hb_live_lock);
 
+	INIT_LIST_HEAD(&event->hn_item);
 	event->hn_event_type = type;
 	event->hn_node = node;
 	event->hn_node_num = node_num;
@@ -598,6 +600,18 @@ static void o2hb_queue_node_event(struct
 	list_add_tail(&event->hn_item, &o2hb_node_events);
 }
 
+void o2hb_notify(enum o2hb_callback_type type, struct o2nm_node *node,
+                 int node_num)
+{
+	struct o2hb_node_event event;
+
+	spin_lock(&o2hb_live_lock);
+	o2hb_queue_node_event(&event, type, node, node_num);
+	spin_unlock(&o2hb_live_lock);
+	o2hb_run_event_list(&event);
+}
+EXPORT_SYMBOL_GPL(o2hb_notify);
+
 static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
 {
 	struct o2hb_node_event event =
diff -ruNpX dontdiff linux-2.6.15-staging1/fs/ocfs2/cluster/heartbeat.h linux-2.6.15-staging2/fs/ocfs2/cluster/heartbeat.h
--- linux-2.6.15-staging1/fs/ocfs2/cluster/heartbeat.h	2006-01-08 18:23:29.376721976 -0500
+++ linux-2.6.15-staging2/fs/ocfs2/cluster/heartbeat.h	2006-01-08 18:13:52.643398768 -0500
@@ -46,6 +46,8 @@ extern unsigned int o2hb_dead_threshold;
 enum o2hb_callback_type {
 	O2HB_NODE_DOWN_CB = 0,
 	O2HB_NODE_UP_CB,
+	O2HB_CONN_DOWN_CB,		/* When a TCP connection fails */
+	O2HB_CONN_UP_CB,		/* When a TCP connection is made */
 	O2HB_NUM_CB
 };
 
@@ -78,5 +80,8 @@ int o2hb_check_node_heartbeating(u8 node
 int o2hb_check_node_heartbeating_from_callback(u8 node_num);
 int o2hb_check_local_node_heartbeating(void);
 void o2hb_stop_all_regions(void);
+void o2hb_notify(enum o2hb_callback_type type, struct o2nm_node *node,
+                 int node_num);
+
 
 #endif /* O2CLUSTER_HEARTBEAT_H */
diff -ruNpX dontdiff linux-2.6.15-staging1/fs/ocfs2/cluster/nodemanager.c linux-2.6.15-staging2/fs/ocfs2/cluster/nodemanager.c
--- linux-2.6.15-staging1/fs/ocfs2/cluster/nodemanager.c	2006-01-08 18:23:29.377721824 -0500
+++ linux-2.6.15-staging2/fs/ocfs2/cluster/nodemanager.c	2006-01-08 18:13:52.644398616 -0500
@@ -27,6 +27,7 @@
 #include "endian.h"
 #include "tcp.h"
 #include "nodemanager.h"
+#include "quorum.h"
 #include "heartbeat.h"
 #include "masklog.h"
 #include "sys.h"
@@ -740,6 +741,7 @@ static void __exit exit_o2nm(void)
 	configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys);
 	o2cb_sys_shutdown();
 
+	o2quo_exit();
 	o2net_exit();
 }
 
@@ -750,6 +752,7 @@ static int __init init_o2nm(void)
 	cluster_print_version();
 
 	o2hb_init();
+	o2quo_init();
 	o2net_init();
 
 	ocfs2_table_header = register_sysctl_table(ocfs2_root_table, 0);
diff -ruNpX dontdiff linux-2.6.15-staging1/fs/ocfs2/cluster/quorum.c linux-2.6.15-staging2/fs/ocfs2/cluster/quorum.c
--- linux-2.6.15-staging1/fs/ocfs2/cluster/quorum.c	2006-01-08 18:23:29.377721824 -0500
+++ linux-2.6.15-staging2/fs/ocfs2/cluster/quorum.c	2006-01-08 18:17:37.908153320 -0500
@@ -63,8 +63,14 @@ static struct o2quo_state {
 	unsigned long		qs_conn_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
 	int			qs_holds;
 	unsigned long		qs_hold_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	struct work_struct	qs_node_work[O2NM_MAX_NODES];
 } o2quo_state;
 
+static struct o2hb_callback_func o2quo_hb_up_cb, o2quo_hb_down_cb;
+static struct o2hb_callback_func o2quo_hb_conn_up, o2quo_hb_conn_down;
+#define O2QUO_HB_PRI 0x1
+#define O2QUO_DELAY_MS   ((o2hb_dead_threshold + 2) * O2HB_REGION_TIMEOUT_MS)
+
 /* this is horribly heavy-handed.  It should instead flip the file
  * system RO and call some userspace script. */
 static void o2quo_fence_self(void)
@@ -184,7 +190,7 @@ static void o2quo_clear_hold(struct o2qu
  * the connection.  the hold will be droped in conn_up or hb_down.  it might be
  * perpetuated by con_err until hb_down.  if we already have a conn, we might
  * be dropping a hold that conn_up got. */
-void o2quo_hb_up(u8 node)
+void o2quo_hb_up(struct o2nm_node *_node, int node, void *data)
 {
 	struct o2quo_state *qs = &o2quo_state;
 
@@ -208,7 +214,7 @@ void o2quo_hb_up(u8 node)
 
 /* hb going down releases any holds we might have had due to this node from
  * conn_up, conn_err, or hb_up */
-void o2quo_hb_down(u8 node)
+void o2quo_hb_down(struct o2nm_node *_node, int node, void *data)
 {
 	struct o2quo_state *qs = &o2quo_state;
 
@@ -226,6 +237,8 @@ void o2quo_hb_down(u8 node)
 	o2quo_clear_hold(qs, node);
 
 	spin_unlock(&qs->qs_lock);
+
+	cancel_delayed_work(&qs->qs_node_work[node]);
 }
 
 /* this tells us that we've decided that the node is still heartbeating
@@ -233,9 +246,10 @@ void o2quo_hb_down(u8 node)
  * and indicates that we must now make a quorum decision in the future,
  * though we might be doing so after waiting for holds to drain.  Here
  * we'll be dropping the hold from conn_err. */
-void o2quo_hb_still_up(u8 node)
+void o2quo_hb_still_up(void *arg)
 {
 	struct o2quo_state *qs = &o2quo_state;
+	u8 node = (u8)(long)arg;
 
 	spin_lock(&qs->qs_lock);
 
@@ -252,7 +266,7 @@ void o2quo_hb_still_up(u8 node)
  * hb_up or hb_down.  it might be perpetuated by con_err until hb_down.  if
  * it's already heartbeating we we might be dropping a hold that conn_up got.
  * */
-void o2quo_conn_up(u8 node)
+void o2quo_conn_up(struct o2nm_node *_node, int node, void *data)
 {
 	struct o2quo_state *qs = &o2quo_state;
 
@@ -278,7 +292,7 @@ void o2quo_conn_up(u8 node)
  * still heartbeating we grab a hold that will delay decisions until either the
  * node stops heartbeating from hb_down or the caller decides that the node is
  * still up and calls still_up */
-void o2quo_conn_err(u8 node)
+void o2quo_conn_err(struct o2nm_node *_node, int node, void *data)
 {
 	struct o2quo_state *qs = &o2quo_state;
 
@@ -299,17 +313,78 @@ void o2quo_conn_err(u8 node)
 		o2quo_set_hold(qs, node);
 
 	spin_unlock(&qs->qs_lock);
+
+	schedule_delayed_work(&qs->qs_node_work[node],
+	                      msecs_to_jiffies(O2QUO_DELAY_MS));
 }
 
-void o2quo_init(void)
+static void o2quo_unregister_hb_callbacks(void)
+{
+	int ret;
+
+	ret = o2hb_unregister_callback(&o2quo_hb_conn_up);
+	if (ret < 0)
+		mlog(ML_ERROR, "Status return %d unregistering heartbeat "
+		     "conn up callback!\n", ret);
+
+	ret = o2hb_unregister_callback(&o2quo_hb_conn_down);
+	if (ret < 0)
+		mlog(ML_ERROR, "Status return %d unregistering heartbeat "
+		     "conn down callback!\n", ret);
+	ret = o2hb_unregister_callback(&o2quo_hb_up_cb);
+	if (ret < 0)
+		mlog(ML_ERROR, "Status return %d unregistering heartbeat up "
+		     "callback!\n", ret);
+
+	ret = o2hb_unregister_callback(&o2quo_hb_down_cb);
+	if (ret < 0)
+		mlog(ML_ERROR, "Status return %d unregistering heartbeat down "
+		     "callback!\n", ret);
+}
+
+static int o2quo_register_hb_callbacks(void)
+{
+	int ret;
+
+	o2hb_setup_callback(&o2quo_hb_down_cb, O2HB_NODE_DOWN_CB,
+	                    o2quo_hb_down, NULL, O2QUO_HB_PRI);
+	o2hb_setup_callback(&o2quo_hb_up_cb, O2HB_NODE_UP_CB,
+	                    o2quo_hb_up, NULL, O2QUO_HB_PRI);
+	o2hb_setup_callback(&o2quo_hb_conn_down, O2HB_CONN_DOWN_CB,
+	                    o2quo_conn_err, NULL, O2QUO_HB_PRI);
+	o2hb_setup_callback(&o2quo_hb_conn_up, O2HB_CONN_UP_CB,
+	                    o2quo_conn_up, NULL, O2QUO_HB_PRI);
+
+	ret = o2hb_register_callback(&o2quo_hb_up_cb);
+	if (ret == 0)
+		ret = o2hb_register_callback(&o2quo_hb_down_cb);
+	if (ret == 0)
+		ret = o2hb_register_callback(&o2quo_hb_conn_up);
+	if (ret == 0)
+		ret = o2hb_register_callback(&o2quo_hb_conn_down);
+
+	if (ret)
+		o2quo_unregister_hb_callbacks();
+
+	return ret;
+}
+
+
+int o2quo_init(void)
 {
 	struct o2quo_state *qs = &o2quo_state;
+	int i;
 
 	spin_lock_init(&qs->qs_lock);
 	INIT_WORK(&qs->qs_work, o2quo_make_decision, NULL);
+	for (i = 0; i < O2NM_MAX_NODES; i++)
+		INIT_WORK(&qs->qs_node_work[i], o2quo_hb_still_up, (void *)i);
+
+	return o2quo_register_hb_callbacks();
 }
 
 void o2quo_exit(void)
 {
 	flush_scheduled_work();
+	o2quo_unregister_hb_callbacks();
 }
diff -ruNpX dontdiff linux-2.6.15-staging1/fs/ocfs2/cluster/quorum.h linux-2.6.15-staging2/fs/ocfs2/cluster/quorum.h
--- linux-2.6.15-staging1/fs/ocfs2/cluster/quorum.h	2006-01-08 18:23:29.378721672 -0500
+++ linux-2.6.15-staging2/fs/ocfs2/cluster/quorum.h	2006-01-08 18:23:55.863695344 -0500
@@ -23,14 +23,13 @@
 #ifndef O2CLUSTER_QUORUM_H
 #define O2CLUSTER_QUORUM_H
 
-void o2quo_init(void);
+int o2quo_init(void);
 void o2quo_exit(void);
-
-void o2quo_hb_up(u8 node);
-void o2quo_hb_down(u8 node);
-void o2quo_hb_still_up(u8 node);
-void o2quo_conn_up(u8 node);
-void o2quo_conn_err(u8 node);
 void o2quo_disk_timeout(void);
 
+/* we're delaying our quorum decision so that heartbeat will have timed
+ * out truly dead nodes by the time we come around to making decisions
+ * on their number */
+#define O2NET_QUORUM_DELAY_MS	((o2hb_dead_threshold + 2) * O2HB_REGION_TIMEOUT_MS)
+
 #endif /* O2CLUSTER_QUORUM_H */
diff -ruNpX dontdiff linux-2.6.15-staging1/fs/ocfs2/cluster/tcp.c linux-2.6.15-staging2/fs/ocfs2/cluster/tcp.c
--- linux-2.6.15-staging1/fs/ocfs2/cluster/tcp.c	2006-01-08 18:23:29.379721520 -0500
+++ linux-2.6.15-staging2/fs/ocfs2/cluster/tcp.c	2006-01-08 18:13:52.646398312 -0500
@@ -67,7 +67,6 @@
 #include "nodemanager.h"
 #define MLOG_MASK_PREFIX ML_TCP
 #include "masklog.h"
-#include "quorum.h"
 
 #include "tcp_internal.h"
 
@@ -128,7 +127,7 @@ static struct workqueue_struct *o2net_wq
 static struct work_struct o2net_listen_work;
 
 static struct o2hb_callback_func o2net_hb_up, o2net_hb_down;
-#define O2NET_HB_PRI 0x1
+#define O2NET_HB_PRI 0x2
 
 static struct o2net_handshake *o2net_hand;
 static struct o2net_msg *o2net_keep_req, *o2net_keep_resp;
@@ -390,9 +389,9 @@ static void o2net_set_nn_state(struct o2
 		wake_up(&nn->nn_sc_wq);
 
 	if (!was_err && nn->nn_persistent_error) {
-		o2quo_conn_err(o2net_num_from_nn(nn));
-		queue_delayed_work(o2net_wq, &nn->nn_still_up,
-				   msecs_to_jiffies(O2NET_QUORUM_DELAY_MS));
+		u8 node_num = o2net_num_from_nn(nn);
+		struct o2nm_node *node = o2nm_get_node_by_num(node_num);
+		o2hb_notify(O2HB_CONN_DOWN_CB, node, node_num);
 	}
 
 	if (was_valid && !valid) {
@@ -402,7 +401,11 @@ static void o2net_set_nn_state(struct o2
 	}
 
 	if (!was_valid && valid) {
-		o2quo_conn_up(o2net_num_from_nn(nn));
+		u8 node_num = o2net_num_from_nn(nn);
+		struct o2nm_node *node = o2nm_get_node_by_num(node_num);
+
+		o2hb_notify(O2HB_CONN_UP_CB, node, node_num);
+
 		/* this is a bit of a hack.  we only try reconnecting
 		 * when heartbeating starts until we get a connection.
 		 * if that connection then dies we don't try reconnecting.
@@ -1424,13 +1427,6 @@ static void o2net_connect_expired(void *
 	spin_unlock(&nn->nn_lock);
 }
 
-static void o2net_still_up(void *arg)
-{
-	struct o2net_node *nn = arg;
-
-	o2quo_hb_still_up(o2net_num_from_nn(nn));
-}
-
 /* ------------------------------------------------------------ */
 
 void o2net_disconnect_node(struct o2nm_node *node)
@@ -1445,7 +1441,6 @@ void o2net_disconnect_node(struct o2nm_n
 	if (o2net_wq) {
 		cancel_delayed_work(&nn->nn_connect_expired);
 		cancel_delayed_work(&nn->nn_connect_work);
-		cancel_delayed_work(&nn->nn_still_up);
 		flush_workqueue(o2net_wq);
 	}
 }
@@ -1453,8 +1448,6 @@ void o2net_disconnect_node(struct o2nm_n
 static void o2net_hb_node_down_cb(struct o2nm_node *node, int node_num,
 				  void *data)
 {
-	o2quo_hb_down(node_num);
-
 	if (node_num != o2nm_this_node())
 		o2net_disconnect_node(node);
 }
@@ -1464,8 +1457,6 @@ static void o2net_hb_node_up_cb(struct o
 {
 	struct o2net_node *nn = o2net_nn_from_num(node_num);
 
-	o2quo_hb_up(node_num);
-
 	/* ensure an immediate connect attempt */
 	nn->nn_last_connect_attempt = jiffies -
 		(msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS) + 1);
@@ -1739,7 +1730,7 @@ int o2net_start_listening(struct o2nm_no
 		destroy_workqueue(o2net_wq);
 		o2net_wq = NULL;
 	} else
-		o2quo_conn_up(node->nd_num);
+		o2hb_notify(O2HB_CONN_UP_CB, node, node->nd_num);
 
 	return ret;
 }
@@ -1776,7 +1767,7 @@ void o2net_stop_listening(struct o2nm_no
 	sock_release(o2net_listen_sock);
 	o2net_listen_sock = NULL;
 
-	o2quo_conn_err(node->nd_num);
+	o2hb_notify(O2HB_CONN_DOWN_CB, node, node->nd_num);
 }
 
 /* ------------------------------------------------------------ */
@@ -1785,8 +1776,6 @@ int o2net_init(void)
 {
 	unsigned long i;
 
-	o2quo_init();
-
 	o2net_hand = kcalloc(1, sizeof(struct o2net_handshake), GFP_KERNEL);
 	o2net_keep_req = kcalloc(1, sizeof(struct o2net_msg), GFP_KERNEL);
 	o2net_keep_resp = kcalloc(1, sizeof(struct o2net_msg), GFP_KERNEL);
@@ -1805,11 +1794,11 @@ int o2net_init(void)
 
 	for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) {
 		struct o2net_node *nn = o2net_nn_from_num(i);
+		memset(nn, 0, sizeof (*nn));
 
 		spin_lock_init(&nn->nn_lock);
 		INIT_WORK(&nn->nn_connect_work, o2net_start_connect, nn);
 		INIT_WORK(&nn->nn_connect_expired, o2net_connect_expired, nn);
-		INIT_WORK(&nn->nn_still_up, o2net_still_up, nn);
 		/* until we see hb from a node we'll return einval */
 		nn->nn_persistent_error = -ENOTCONN;
 		init_waitqueue_head(&nn->nn_sc_wq);
@@ -1822,7 +1811,6 @@ int o2net_init(void)
 
 void o2net_exit(void)
 {
-	o2quo_exit();
 	kfree(o2net_hand);
 	kfree(o2net_keep_req);
 	kfree(o2net_keep_resp);
diff -ruNpX dontdiff linux-2.6.15-staging1/fs/ocfs2/cluster/tcp_internal.h linux-2.6.15-staging2/fs/ocfs2/cluster/tcp_internal.h
--- linux-2.6.15-staging1/fs/ocfs2/cluster/tcp_internal.h	2006-01-08 18:23:29.379721520 -0500
+++ linux-2.6.15-staging2/fs/ocfs2/cluster/tcp_internal.h	2006-01-08 18:13:52.646398312 -0500
@@ -28,12 +28,7 @@
 #define O2NET_MSG_KEEP_RESP_MAGIC ((u16)0xfa58)
 
 /* same as hb delay, we're waiting for another node to recognize our hb */
-#define O2NET_RECONNECT_DELAY_MS	O2HB_REGION_TIMEOUT_MS
-
-/* we're delaying our quorum decision so that heartbeat will have timed
- * out truly dead nodes by the time we come around to making decisions
- * on their number */
-#define O2NET_QUORUM_DELAY_MS	((o2hb_dead_threshold + 2) * O2HB_REGION_TIMEOUT_MS)
+#define O2NET_RECONNECT_DELAY_MS	2000	
 
 #define O2NET_KEEPALIVE_DELAY_SECS	5
 #define O2NET_IDLE_TIMEOUT_SECS		10
@@ -87,11 +82,6 @@ struct o2net_node {
 	 * established.  this expiring gives up on the node and errors out
 	 * transmits */
 	struct work_struct		nn_connect_expired;
-
-	/* after we give up on a socket we wait a while before deciding
-	 * that it is still heartbeating and that we should do some
-	 * quorum work */
-	struct work_struct		nn_still_up;
 };
 
 struct o2net_sock_container {