[Ocfs2-commits] zab commits r2444 - trunk/fs/ocfs2/cluster

svn-commits at oss.oracle.com svn-commits at oss.oracle.com
Tue Jun 28 17:50:20 CDT 2005


Author: zab
Signed-off-by: mfasheh
Date: 2005-06-28 17:50:18 -0500 (Tue, 28 Jun 2005)
New Revision: 2444

Modified:
   trunk/fs/ocfs2/cluster/tcp.c
   trunk/fs/ocfs2/cluster/tcp_internal.h
Log:
o the kernel's tcp keepalives don't consider a socket idle while it has
  unacked packets in flight, even when not a single packet has been traded
  for well beyond our idle timeout.  this implements our own messaging-level
  keepalives, which more aggressively require active communication and also
  make sure that the remote servicing thread is participating, not just the
  socket.

Signed-off-by: mfasheh



Modified: trunk/fs/ocfs2/cluster/tcp.c
===================================================================
--- trunk/fs/ocfs2/cluster/tcp.c	2005-06-28 19:04:16 UTC (rev 2443)
+++ trunk/fs/ocfs2/cluster/tcp.c	2005-06-28 22:50:18 UTC (rev 2444)
@@ -114,6 +114,7 @@
 #define O2NET_HB_PRI 0x1
 
 static struct o2net_handshake *o2net_hand;
+static o2net_msg *o2net_keep_req, *o2net_keep_resp;
 
 /* these node totals include our node.  I think the hb and net threads
  * sufficiently serialize things so that these don't need locking */
@@ -131,6 +132,9 @@
 static void o2net_rx_until_empty(void *arg);
 static void o2net_shutdown_sc(void *arg);
 static void o2net_listen_data_ready(struct sock *sk, int bytes);
+static void o2net_sc_send_keep_req(void *arg);
+static void o2net_idle_timer(unsigned long data);
+static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);
 
 static inline int o2net_sys_err_to_errno(enum o2net_system_error err)
 {
@@ -327,7 +331,12 @@
 	INIT_WORK(&sc->sc_connect_work, o2net_sc_connect_completed, sc);
 	INIT_WORK(&sc->sc_rx_work, o2net_rx_until_empty, sc);
 	INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc, sc);
+	INIT_WORK(&sc->sc_keepalive_work, o2net_sc_send_keep_req, sc);
 
+	init_timer(&sc->sc_idle_timeout);
+	sc->sc_idle_timeout.function = o2net_idle_timer;
+	sc->sc_idle_timeout.data = (unsigned long)sc;
+
 	sclog(sc, "alloced\n");
 
 	ret = sc;
@@ -353,6 +362,20 @@
 	if (!queue_work(o2net_wq, work))
 		sc_put(sc);
 }
+static void o2net_sc_queue_delayed_work(struct o2net_sock_container *sc,
+					struct work_struct *work,
+					int delay)
+{
+	sc_get(sc);
+	if (!queue_delayed_work(o2net_wq, work, delay))
+		sc_put(sc);
+}
+static void o2net_sc_cancel_delayed_work(struct o2net_sock_container *sc,
+					 struct work_struct *work)
+{
+	if (cancel_delayed_work(work))
+		sc_put(sc);
+}
 
 static void o2net_mod_connected_nodes(u8 node, int delta)
 {
@@ -506,14 +529,6 @@
 			o2net_sc_queue_work(sc, &sc->sc_connect_work);
 			break;
 		default:
-			if (sk->sk_err == ETIMEDOUT)
-				mlog(ML_NOTICE, "connection to node %s num %u "
-				     "at %u.%u.%u.%u:%d has been idle for 10 "
-				     "seconds, shutting it down.\n",
-				     sc->sc_node->nd_name,
-				     sc->sc_node->nd_num,
-				     NIPQUAD(sc->sc_node->nd_ipv4_address), 
-				     ntohs(sc->sc_node->nd_ipv4_port));
 			o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
 			break;
 	}
@@ -602,6 +617,7 @@
 	if (o2net_unregister_callbacks(sc->sc_sock->sk, sc)) {
 		/* we shouldn't flush as we're in the thread, the
 		 * races with pending sc work structs are harmless */
+		del_timer_sync(&sc->sc_idle_timeout);
 		sc_put(sc);
 		sc->sc_sock->ops->shutdown(sc->sc_sock,
 					   RCV_SHUTDOWN|SEND_SHUTDOWN);
@@ -835,6 +851,25 @@
 	return ret;
 }
 
+static void o2net_sendpage(struct o2net_sock_container *sc,
+			   void *kmalloced_virt,
+			   size_t size)
+{
+	struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
+	ssize_t ret;
+
+
+	ret = sc->sc_sock->ops->sendpage(sc->sc_sock, 
+					 virt_to_page(kmalloced_virt),
+					 (long)kmalloced_virt & ~PAGE_MASK,
+					 size, MSG_DONTWAIT);
+	if (ret != size) {
+		mlog(ML_ERROR, "sendpage of size %zu failed with %zu\n",
+		     size, ret);
+		o2net_ensure_shutdown(nn, sc, 0);
+	}
+}
+
 static void o2net_init_msg(o2net_msg *msg, u16 data_len, u16 msg_type, u32 key)
 {
 	memset(msg, 0, sizeof(o2net_msg));
@@ -1033,26 +1068,36 @@
 				 o2net_msg *hdr)
 {
 	struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
-	int ret, handler_status;
+	int ret = 0, handler_status;
 	enum  o2net_system_error syserr;
 	struct o2net_msg_handler *nmh = NULL;
 
 	msglog(hdr, "processing message\n");
 
-	if (hdr->magic == O2NET_MSG_STATUS_MAGIC) {
-		/* special type for returning message status */
-		o2net_complete_nsw(nn, NULL, hdr->msg_num, hdr->sys_status,
-				   hdr->status);
-		ret = 0;
-		goto out;
-	} else if (hdr->magic != O2NET_MSG_MAGIC) {
-		msglog(hdr, "bad magic\n");
-		ret = -EINVAL;
-		goto out;
+	o2net_sc_postpone_idle(sc);
+
+	switch(hdr->magic) {
+		case O2NET_MSG_STATUS_MAGIC:
+			/* special type for returning message status */
+			o2net_complete_nsw(nn, NULL, hdr->msg_num,
+					   hdr->sys_status, hdr->status);
+			goto out;
+		case O2NET_MSG_KEEP_REQ_MAGIC:
+			o2net_sendpage(sc, o2net_keep_resp,
+				       sizeof(*o2net_keep_resp));
+			goto out;
+		case O2NET_MSG_KEEP_RESP_MAGIC:
+			goto out;
+		case O2NET_MSG_MAGIC:
+			break;
+		default:
+			msglog(hdr, "bad magic\n");
+			ret = -EINVAL;
+			goto out;
+			break;
 	}
 
 	/* find a handler for it */
-	ret = 0;
 	handler_status = 0;
 	nmh = o2net_handler_get(hdr->msg_type, hdr->key);
 	if (!nmh) {
@@ -1112,9 +1157,12 @@
 	sc->sc_handshake_ok = 1;
 
 	spin_lock(&nn->nn_lock);
-	/* set valid if it hasn't been shutdown already.. */
-	if (nn->nn_sc == sc) 
+	/* set valid and queue the idle timers only if it hasn't been
+	 * shut down already */
+	if (nn->nn_sc == sc) {
+		o2net_sc_postpone_idle(sc);
 		o2net_set_nn_state(nn, sc, 1, 0);
+	}
 	spin_unlock(&nn->nn_lock);
 
 	/* shift everything up as though it wasn't there */
@@ -1227,35 +1275,16 @@
 	sc_put(sc);
 }
 
-static int o2net_set_options(struct socket *sock)
+static int o2net_set_nodelay(struct socket *sock)
 {
-	int ret, i;
+	int ret, val = 1;
 	mm_segment_t oldfs;
-	static struct optpairs {
-		int opt, val;
-	} pairs[] = {
-		{TCP_NODELAY, 1}, 
-		{TCP_KEEPCNT, O2NET_KEEPCNT}, 
-		{TCP_KEEPIDLE, O2NET_KEEPIDLE}, 
-		{TCP_KEEPINTVL, O2NET_KEEPINTVL}, 
-	}; 
 
 	oldfs = get_fs();
 	set_fs(KERNEL_DS);
 
-	i = 1;
-	/* SOL_SOCKET is magical */
-	ret = sock_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
-			      (char __user *)&i, sizeof(i));
-
-	for (i = 0; ret == 0 && i < ARRAY_SIZE(pairs); i++) {
-		ret = sock->ops->setsockopt(sock, SOL_TCP, pairs[i].opt,
-					    (char __user *)&pairs[i].val,
-					    sizeof(pairs[i].val));
-		if (ret)
-			break;
-	}
-
+	ret = sock_setsockopt(sock, SOL_TCP, TCP_NODELAY,
+			      (char __user *)&val, sizeof(val));
 	set_fs(oldfs);
 	return ret;
 }
@@ -1264,35 +1293,52 @@
 
 /* called when a connect completes and after a sock is accepted.  the
  * rx path will see the response and mark the sc valid */
-static void o2net_sc_send_handshake(struct o2net_sock_container *sc)
+static void o2net_sc_connect_completed(void *arg)
 {
-	struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
-	ssize_t ret;
+	struct o2net_sock_container *sc = arg;
 
 	mlog(ML_MSG, "sc sending handshake with ver %llu id %llx\n",
               (unsigned long long)O2NET_PROTOCOL_VERSION,
 	      (unsigned long long)o2net_hand->connector_id);
 
-	ret = sc->sc_sock->ops->sendpage(sc->sc_sock, 
-					 virt_to_page(o2net_hand),
-					 (long)o2net_hand & ~PAGE_MASK,
-					 sizeof(*o2net_hand), MSG_DONTWAIT);
-	if (ret != sizeof(*o2net_hand)) {
-		if (ret >= 0)
-			ret = -EBADE;
-		mlog(ML_CONN, "sendpage failed with %zu\n", ret);
-		o2net_ensure_shutdown(nn, sc, 0);
-	}
+	o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand));
+	sc_put(sc);
 }
 
-static void o2net_sc_connect_completed(void *arg)
+/* this is called as a work_struct func. */
+static void o2net_sc_send_keep_req(void *arg)
 {
 	struct o2net_sock_container *sc = arg;
 
-	o2net_sc_send_handshake(sc);
+	o2net_sendpage(sc, o2net_keep_req, sizeof(*o2net_keep_req));
 	sc_put(sc);
 }
 
+/* socket shutdown does a del_timer_sync against this as it tears down.
+ * we can't start this timer until we've got to the point in sc buildup
+ * where shutdown is going to be involved */
+static void o2net_idle_timer(unsigned long data)
+{
+	struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
+
+	mlog(ML_NOTICE, "connection to node %s num %u at %u.%u.%u.%u:%d has "
+	     "been idle for 10 seconds, shutting it down.\n",
+	     sc->sc_node->nd_name, sc->sc_node->nd_num,
+	     NIPQUAD(sc->sc_node->nd_ipv4_address), 
+	     ntohs(sc->sc_node->nd_ipv4_port));
+
+	o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
+}
+
+static void o2net_sc_postpone_idle(struct o2net_sock_container *sc)
+{
+	o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
+	o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work,
+				    O2NET_KEEPALIVE_DELAY_SECS * HZ);
+	mod_timer(&sc->sc_idle_timeout,
+		  jiffies + (O2NET_IDLE_TIMEOUT_SECS * HZ));
+}
+
 /* this work func is kicked whenever a path sets the nn state which doesn't
  * have valid set.  This includes seeing hb come up, losing a connection,
  * having a connect attempt fail, etc. This centralizes the logic which decides
@@ -1352,7 +1398,12 @@
 		goto out;
 	}
 	
-	o2net_set_options(sc->sc_sock);
+	ret = o2net_set_nodelay(sc->sc_sock);
+	if (ret) {
+		mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret);
+		goto out;
+	}
+
 	o2net_register_callbacks(sc->sc_sock->sk, sc);
 
 	spin_lock(&nn->nn_lock);
@@ -1399,7 +1450,7 @@
 	if (!nn->nn_sc_valid) {
 		mlog(ML_ERROR, "no connection established with node %u after "
 		     "%u seconds, giving up and returning errors.\n",
-		     o2net_num_from_nn(nn), O2NET_CONN_IDLE_SECS);
+		     o2net_num_from_nn(nn), O2NET_IDLE_TIMEOUT_SECS);
 
 		o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
 	}
@@ -1535,7 +1586,7 @@
 		 * configured and doing so brings up the o2net_wq, so we can
 		 * use it.. */
 		queue_delayed_work(o2net_wq, &nn->nn_connect_expired,
-				   O2NET_CONN_IDLE_SECS * HZ);
+				   O2NET_IDLE_TIMEOUT_SECS * HZ);
 
 		/* believe it or not, accept and node hearbeating testing
 		 * can succeed for this node before we got here.. so
@@ -1621,7 +1672,11 @@
 	if (ret < 0)
 		goto out;
 
-	o2net_set_options(new_sock);
+	ret = o2net_set_nodelay(new_sock);
+	if (ret) {
+		mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret);
+		goto out;
+	}
 
 	slen = sizeof(sin);
 	ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin,
@@ -1690,7 +1745,7 @@
 	o2net_register_callbacks(sc->sc_sock->sk, sc);
 	o2net_sc_queue_work(sc, &sc->sc_rx_work);
 
-	o2net_sc_send_handshake(sc);
+	o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand));
 
 out:
 	if (new_sock)
@@ -1855,12 +1910,21 @@
 	unsigned long i;
 
 	o2net_hand = kcalloc(1, sizeof(struct o2net_handshake), GFP_KERNEL);
-	if (o2net_hand == NULL)
+	o2net_keep_req = kcalloc(1, sizeof(o2net_msg), GFP_KERNEL);
+	o2net_keep_resp = kcalloc(1, sizeof(o2net_msg), GFP_KERNEL);
+	if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp) {
+		kfree(o2net_hand);
+		kfree(o2net_keep_req);
+		kfree(o2net_keep_resp);
 		return -ENOMEM;
+	}
 
 	o2net_hand->protocol_version = cpu_to_be64(O2NET_PROTOCOL_VERSION);
 	o2net_hand->connector_id = cpu_to_be64(1);
 
+	o2net_keep_req->magic = cpu_to_be16(O2NET_MSG_KEEP_REQ_MAGIC);
+	o2net_keep_resp->magic = cpu_to_be16(O2NET_MSG_KEEP_RESP_MAGIC);
+
 	INIT_WORK(&o2net_quorum_work, o2net_check_quorum, NULL);
 
 	for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) {
@@ -1882,4 +1946,6 @@
 void o2net_exit(void)
 {
 	kfree(o2net_hand);
+	kfree(o2net_keep_req);
+	kfree(o2net_keep_resp);
 }

Modified: trunk/fs/ocfs2/cluster/tcp_internal.h
===================================================================
--- trunk/fs/ocfs2/cluster/tcp_internal.h	2005-06-28 19:04:16 UTC (rev 2443)
+++ trunk/fs/ocfs2/cluster/tcp_internal.h	2005-06-28 22:50:18 UTC (rev 2444)
@@ -26,6 +26,8 @@
 
 #define O2NET_MSG_MAGIC           ((u16)0xfa55)
 #define O2NET_MSG_STATUS_MAGIC    ((u16)0xfa56)
+#define O2NET_MSG_KEEP_REQ_MAGIC  ((u16)0xfa57)
+#define O2NET_MSG_KEEP_RESP_MAGIC ((u16)0xfa58)
 
 /* same as hb delay, we're waiting for another node to recognize our hb */
 #define O2NET_RECONNECT_DELAY_MS	O2HB_REGION_TIMEOUT_MS
@@ -34,15 +36,10 @@
  * out truly dead nodes by the time we come around to making decisions
  * on their number */
 #define O2NET_QUORUM_DELAY_MS	((O2HB_DEAD_THRESHOLD + 2) * O2HB_REGION_TIMEOUT_MS)
-/* send 5 keepalives every 1 second after 5 seconds of idle.  this 
- * is *so short* because we have to wait for quorum then wait for hb
- * timeouts again and have recovery complete all within 45 seconds */
-#define O2NET_KEEPCNT		(5)
-#define O2NET_KEEPIDLE		(5)
-#define O2NET_KEEPINTVL		(1)
-#define O2NET_CONN_IDLE_SECS	(O2NET_KEEPIDLE + \
-					(O2NET_KEEPCNT * O2NET_KEEPINTVL))
 
+#define O2NET_KEEPALIVE_DELAY_SECS	5
+#define O2NET_IDLE_TIMEOUT_SECS		10
+
 #define O2NET_PROTOCOL_VERSION 1ULL
 struct o2net_handshake {
 	u64	protocol_version;
@@ -112,6 +109,9 @@
 	 */
 	struct work_struct	sc_shutdown_work;
 
+	struct timer_list	sc_idle_timeout;
+	struct work_struct	sc_keepalive_work;
+
 	unsigned		sc_handshake_ok:1;
 
 	struct page 		*sc_page;



More information about the Ocfs2-commits mailing list