[Ocfs2-commits] khackel commits r2752 - branches/ocfs2-1.2/fs/ocfs2/dlm

svn-commits@oss.oracle.com svn-commits at oss.oracle.com
Wed Jan 18 19:05:32 CST 2006


Author: khackel
Signed-off-by: khackel
Signed-off-by: mfasheh
Date: 2006-01-18 19:05:22 -0600 (Wed, 18 Jan 2006)
New Revision: 2752

Modified:
   branches/ocfs2-1.2/fs/ocfs2/dlm/dlmcommon.h
   branches/ocfs2-1.2/fs/ocfs2/dlm/dlmconvert.c
   branches/ocfs2-1.2/fs/ocfs2/dlm/dlmlock.c
   branches/ocfs2-1.2/fs/ocfs2/dlm/dlmrecovery.c
Log:
* Add dlm_wait_for_node_death function to be used after receiving a network error.
  This will wait up to the given timeout to allow the heartbeat callbacks to update
  the domain map.  Without this, some paths may spin and consume enough CPU that
  the heartbeat gets starved and never updates.

Signed-off-by: khackel
Signed-off-by: mfasheh



Modified: branches/ocfs2-1.2/fs/ocfs2/dlm/dlmcommon.h
===================================================================
--- branches/ocfs2-1.2/fs/ocfs2/dlm/dlmcommon.h	2006-01-19 01:04:13 UTC (rev 2751)
+++ branches/ocfs2-1.2/fs/ocfs2/dlm/dlmcommon.h	2006-01-19 01:05:22 UTC (rev 2752)
@@ -208,6 +208,9 @@
 #define DLM_LOCK_RES_IN_PROGRESS          0x00000010
 #define DLM_LOCK_RES_MIGRATING            0x00000020
 
+/* max milliseconds to wait to sync up a network failure with a node death */
+#define DLM_NODE_DEATH_WAIT_MAX (5 * 1000)
+
 #define DLM_PURGE_INTERVAL_MS   (8 * 1000)
 
 struct dlm_lock_resource
@@ -663,6 +666,7 @@
 void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
 void dlm_wait_for_recovery(struct dlm_ctxt *dlm);
 int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node);
+int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout);
 
 void dlm_get(struct dlm_ctxt *dlm);
 void dlm_put(struct dlm_ctxt *dlm);

Modified: branches/ocfs2-1.2/fs/ocfs2/dlm/dlmconvert.c
===================================================================
--- branches/ocfs2-1.2/fs/ocfs2/dlm/dlmconvert.c	2006-01-19 01:04:13 UTC (rev 2751)
+++ branches/ocfs2-1.2/fs/ocfs2/dlm/dlmconvert.c	2006-01-19 01:05:22 UTC (rev 2752)
@@ -392,6 +392,11 @@
 	} else {
 		mlog_errno(tmpret);
 		if (dlm_is_host_down(tmpret)) {
+			/* instead of logging the same network error over
+			 * and over, sleep here and wait for the heartbeat
+			 * to notice the node is dead.  times out after 5s. */
+			dlm_wait_for_node_death(dlm, res->owner, 
+						DLM_NODE_DEATH_WAIT_MAX);
 			ret = DLM_RECOVERING;
 			mlog(0, "node %u died so returning DLM_RECOVERING "
 			     "from convert message!\n", res->owner);

Modified: branches/ocfs2-1.2/fs/ocfs2/dlm/dlmlock.c
===================================================================
--- branches/ocfs2-1.2/fs/ocfs2/dlm/dlmlock.c	2006-01-19 01:04:13 UTC (rev 2751)
+++ branches/ocfs2-1.2/fs/ocfs2/dlm/dlmlock.c	2006-01-19 01:05:22 UTC (rev 2752)
@@ -645,7 +645,19 @@
 			mlog(0, "retrying lock with migration/"
 			     "recovery/in progress\n");
 			msleep(100);
-			dlm_wait_for_recovery(dlm);
+			/* no waiting for dlm_reco_thread */
+			if (recovery) {
+				if (status == DLM_RECOVERING) {
+					mlog(0, "%s: got RECOVERING "
+					     "for $REOCVERY lock, master "
+					     "was %u\n", dlm->name, 
+					     res->owner);
+					dlm_wait_for_node_death(dlm, res->owner, 
+							DLM_NODE_DEATH_WAIT_MAX);
+				}
+			} else {
+				dlm_wait_for_recovery(dlm);
+			}
 			goto retry_lock;
 		}
 

Modified: branches/ocfs2-1.2/fs/ocfs2/dlm/dlmrecovery.c
===================================================================
--- branches/ocfs2-1.2/fs/ocfs2/dlm/dlmrecovery.c	2006-01-19 01:04:13 UTC (rev 2751)
+++ branches/ocfs2-1.2/fs/ocfs2/dlm/dlmrecovery.c	2006-01-19 01:05:22 UTC (rev 2752)
@@ -278,6 +278,24 @@
 	return dead;
 }
 
+int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
+{
+	if (timeout) {
+		mlog(ML_NOTICE, "%s: waiting %dms for notification of "
+		     "death of node %u\n", dlm->name, timeout, node);
+		wait_event_timeout(dlm->dlm_reco_thread_wq,
+			   dlm_is_node_dead(dlm, node),
+			   msecs_to_jiffies(timeout));
+	} else {
+		mlog(ML_NOTICE, "%s: waiting indefinitely for notification "
+		     "of death of node %u\n", dlm->name, node);
+		wait_event(dlm->dlm_reco_thread_wq,
+			   dlm_is_node_dead(dlm, node));
+	}
+	/* for now, always return 0: the wait result is discarded, so callers cannot tell a timeout from an observed death */
+	return 0;
+}
+
 /* callers of the top-level api calls (dlmlock/dlmunlock) should
  * block on the dlm->reco.event when recovery is in progress.
  * the dlm recovery thread will set this state when it begins



More information about the Ocfs2-commits mailing list