[Ocfs2-commits] mfasheh commits r2408 - trunk/fs/ocfs2/cluster

Fri Jun 17 19:45:13 CDT 2005

Author: mfasheh
Signed-off-by: zab
Date: 2005-06-17 19:45:11 -0500 (Fri, 17 Jun 2005)
New Revision: 2408

Modified:
   trunk/fs/ocfs2/cluster/heartbeat.c
   trunk/fs/ocfs2/cluster/ocfs2_heartbeat.h
Log:
* Write out a block CRC in each live heartbeat slot, and verify it on
  all nodes.

* Fix a bug where the slot data wasn't being initialized to the values on
  disk, which caused heartbeat to initially see a change that may not have
  actually occurred.

* Write a current generation value. This helps heartbeat on other nodes
  detect the case where a heartbeating node reboots and comes back before
  its timeout expires -- a change in generation (for an otherwise 'good'
  block) will immediately trigger a node down event.

Signed-off-by: zab



Modified: trunk/fs/ocfs2/cluster/heartbeat.c
===================================================================

--- trunk/fs/ocfs2/cluster/heartbeat.c	2005-06-18 00:06:24 UTC (rev 2407)
+++ trunk/fs/ocfs2/cluster/heartbeat.c	2005-06-18 00:45:11 UTC (rev 2408)
@@ -28,6 +28,8 @@
 #include <linux/file.h>
 #include <linux/kthread.h>
 #include <linux/configfs.h>
+#include <linux/random.h>
+#include <linux/crc32.h>
 
 #include "heartbeat.h"
 #include "tcp.h"
@@ -53,6 +55,7 @@
 static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
 static LIST_HEAD(o2hb_node_events);
 static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);
+static u64 o2hb_generation;
 
 static struct o2hb_callback {
 	struct list_head list;
@@ -79,6 +82,7 @@
 	struct o2hb_disk_heartbeat_block *ds_raw_block;
 	u8			ds_node_num;
 	unsigned long		ds_last_time;
+	u64			ds_last_generation;
 	u16			ds_equal_samples;
 	u16			ds_changed_samples;
 	struct list_head	ds_live_item;
@@ -366,6 +370,35 @@
 	return status;
 }
 
+/* crc32_le() over hr_block_bytes of hb_block with hb_cksum treated as 0;
+ * hb_cksum is restored before returning. The result is returned in
+ * little-endian (on-disk) byte order: callers store it directly into
+ * hb_cksum or convert it with le32_to_cpu() before comparing. */
+static u32 o2hb_compute_block_crc_le(struct o2hb_region *reg,
+				     struct o2hb_disk_heartbeat_block *hb_block)
+{
+	u32 old_cksum, crc;
+
+	old_cksum = hb_block->hb_cksum;
+	hb_block->hb_cksum = 0;
+	crc = crc32_le(0, (unsigned char *) hb_block, reg->hr_block_bytes);
+	hb_block->hb_cksum = old_cksum;
+
+	/* crc32_le() yields a CPU-order value; convert it for disk. */
+	return cpu_to_le32(crc);
+}
+
+/* Check the cksum stored in hb_block against a freshly computed one.
+ * Returns nonzero when they match, 0 otherwise. */
+static int o2hb_verify_crc(struct o2hb_region *reg,
+			   struct o2hb_disk_heartbeat_block *hb_block)
+{
+	u32 stored = le32_to_cpu(hb_block->hb_cksum);
+	u32 expect = le32_to_cpu(o2hb_compute_block_crc_le(reg, hb_block));
+
+	return stored == expect;
+}
+
 /* We want to make sure that nobody is heartbeating on top of us --
  * this will help detect an invalid configuration. */
 static int o2hb_check_last_timestamp(struct o2hb_region *reg)
@@ -389,7 +422,7 @@
 	return ret;
 }
 
-static inline void o2hb_set_local_node_timestamp(struct o2hb_region *reg)
+static inline void o2hb_prepare_block(struct o2hb_region *reg)
 {
 	int node_num;
 	u64 cputime;
@@ -405,8 +438,16 @@
 	cputime = CURRENT_TIME.tv_sec;
 	if (!cputime)
 		cputime = 1;
+
 	hb_block->hb_seq = cpu_to_le64(cputime);
 	hb_block->hb_node = node_num;
+	hb_block->hb_generation = cpu_to_le64(o2hb_generation);
+
+	/* This step must always happen last! */
+	hb_block->hb_cksum = o2hb_compute_block_crc_le(reg, hb_block);
+
+	mlog(ML_HB_BIO, "our node generation = 0x%"MLFx64", cksum = 0x%x\n",
+	     cpu_to_le64(o2hb_generation), le32_to_cpu(hb_block->hb_cksum));
 }
 
 static void o2hb_fire_callbacks(struct o2hb_callback *hbcall,
@@ -521,7 +562,7 @@
 static int o2hb_check_slot(struct o2hb_region *reg,
 			   struct o2hb_disk_slot *slot)
 {
-	int changed = 0;
+	int changed = 0, gen_changed = 0;
 	struct o2hb_node_event event = 
 		{ .hn_item = LIST_HEAD_INIT(event.hn_item), };
 	struct o2nm_node *node;
@@ -534,6 +575,27 @@
 	if (!node)
 		return 0;
 
+	if (!o2hb_verify_crc(reg, hb_block)) {
+		/* all paths from here will drop o2hb_live_lock for
+		 * us. */
+		spin_lock(&o2hb_live_lock);
+
+		/* Don't print an error on the console in this case -
+		 * a freshly formatted heartbeat area will not have a
+		 * crc set on it. */
+		if (list_empty(&slot->ds_live_item))
+			goto out;
+
+		/* The node is live but pushed out a bad crc. We
+		 * consider it a transient miss but don't populate any
+		 * other values as they may be junk. */
+		mlog(ML_ERROR, "Node %d has written a bad crc to %s\n",
+		     slot->ds_node_num, reg->hr_dev_name);
+
+		slot->ds_equal_samples++;
+		goto fire_callbacks;
+	}
+
 	/* we don't care if these wrap.. the state transitions below
 	 * clear at the right places */
 	cputime = le64_to_cpu(hb_block->hb_seq);
@@ -543,13 +605,37 @@
 		slot->ds_equal_samples++;
 	slot->ds_last_time = cputime;
 
+	/* The node changed heartbeat generations. We assume this to
+	 * mean it dropped off but came back before we timed out. We
+	 * want to consider it down for the time being but don't want
+	 * to lose any changed_samples state we might build up to
+	 * considering it live again. */
+	if (slot->ds_last_generation != le64_to_cpu(hb_block->hb_generation)) {
+		gen_changed = 1;
+		slot->ds_equal_samples = 0;
+		mlog(ML_HEARTBEAT, "Node %d changed generation (0x%"MLFx64" "
+		     "to 0x%"MLFx64")\n", slot->ds_node_num,
+		     slot->ds_last_generation,
+		     le64_to_cpu(hb_block->hb_generation));
+	}
+
+	slot->ds_last_generation = le64_to_cpu(hb_block->hb_generation);
+
+	mlog(ML_HEARTBEAT, "Slot %d has generation 0x%"MLFx64", cksum 0x%x "
+	     "changed samples %u, equal samples %u\n",
+	     slot->ds_node_num, slot->ds_last_generation,
+	     le32_to_cpu(hb_block->hb_cksum), slot->ds_changed_samples,
+	     slot->ds_equal_samples);
+
 	spin_lock(&o2hb_live_lock);
+
+fire_callbacks:
 	/* dead nodes only come to life after some number of 
 	 * changes at any time during their dead time */
 	if (list_empty(&slot->ds_live_item) &&
 	    slot->ds_changed_samples >= O2HB_LIVE_THRESHOLD) {
-		mlog(ML_HEARTBEAT, "Node %d joined my region\n",
-		     slot->ds_node_num);
+		mlog(ML_HEARTBEAT, "Node %d (id 0x%"MLFx64") joined my "
+		     "region\n", slot->ds_node_num, slot->ds_last_generation);
 
 		/* first on the list generates a callback */
 		if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
@@ -557,7 +643,7 @@
 
 			o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node,
 					      slot->ds_node_num);
-	
+
 			changed = 1;
 		}
 
@@ -575,7 +661,7 @@
 	/* live nodes only go dead after enough consequtive missed
 	 * samples..  reset the missed counter whenever we see 
 	 * activity */
-	if (slot->ds_equal_samples >= reg->hr_dead_iter) {
+	if (slot->ds_equal_samples >= reg->hr_dead_iter || gen_changed) {
 		mlog(ML_HEARTBEAT, "Node %d left my region\n",
 		     slot->ds_node_num);
 
@@ -590,7 +676,10 @@
 			changed = 1;
 		}
 
-		slot->ds_changed_samples = 0;
+		/* We don't clear this because the node is still
+		 * actually writing new blocks. */
+		if (!gen_changed)
+			slot->ds_changed_samples = 0;
 		goto out;
 	}
 	if (slot->ds_changed_samples) {
@@ -658,8 +747,8 @@
 		mlog(ML_ERROR, "Device \"%s\": another node is heartbeating "
 		     "in our slot!\n", reg->hr_dev_name);
 
-	/* Set our raw timestamp */
-	o2hb_set_local_node_timestamp(reg);
+	/* fill in the proper info for our next heartbeat */
+	o2hb_prepare_block(reg);
 
 	/* And fire off the write. Note that we don't wait on this I/O
 	 * until later. */
@@ -731,6 +820,11 @@
 	INIT_LIST_HEAD(&o2hb_node_events);
 
 	memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
+
+	o2hb_generation = 0;
+	/* Generation of zero is invalid */
+	while (!o2hb_generation)
+		get_random_bytes(&o2hb_generation, sizeof(o2hb_generation));
 }
 
 /*
@@ -1036,6 +1130,42 @@
 	return 0;
 }
 
+/* Load every slot from disk once and record its current sequence and
+ * generation, giving the tracking structures a baseline to compare
+ * against. Returns 0 or the error reported by o2hb_read_slots(). */
+static int o2hb_populate_slot_data(struct o2hb_region *reg)
+{
+	int status;
+	int slot_num;
+
+	mlog_entry_void();
+
+	status = o2hb_read_slots(reg, reg->hr_blocks);
+	if (status) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* No verification is done here; we only sample the initial
+	 * values. o2hb_check_slot decides later whether a configured
+	 * slot is valid and whether its values have changed. */
+	for (slot_num = 0; slot_num < reg->hr_blocks; slot_num++) {
+		struct o2hb_disk_slot *slot = &reg->hr_slots[slot_num];
+		struct o2hb_disk_heartbeat_block *hb_block =
+			(struct o2hb_disk_heartbeat_block *) slot->ds_raw_block;
+
+		/* Fill only the values o2hb_check_slot uses to detect
+		 * a changing slot. */
+		slot->ds_last_time = le64_to_cpu(hb_block->hb_seq);
+		slot->ds_last_generation =
+			le64_to_cpu(hb_block->hb_generation);
+	}
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
 /* this is acting as commit; we set up all of hr_bdev and hr_task or nothing */
 static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
 				     const char *page,
@@ -1106,6 +1236,12 @@
 		goto out;
 	}
 
+	ret = o2hb_populate_slot_data(reg);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
 	/*
 	 * A node is considered live after it has beat LIVE_THRESHOLD
 	 * times.  We're not steady until we've given them a chance

Modified: trunk/fs/ocfs2/cluster/ocfs2_heartbeat.h
===================================================================
--- trunk/fs/ocfs2/cluster/ocfs2_heartbeat.h	2005-06-18 00:06:24 UTC (rev 2407)
+++ trunk/fs/ocfs2/cluster/ocfs2_heartbeat.h	2005-06-18 00:45:11 UTC (rev 2408)
@@ -31,6 +31,7 @@
 	__u8  hb_node;
 	__u8  hb_pad1[3];
 	__u32 hb_cksum;
+	__u64 hb_generation;
 };
 
 #define O2HB_DEFAULT_TIMEOUT_MS		2000