[Ocfs2-commits] mfasheh commits r2408 - trunk/fs/ocfs2/cluster
svn-commits at oss.oracle.com
svn-commits at oss.oracle.com
Fri Jun 17 19:45:13 CDT 2005
Author: mfasheh
Signed-off-by: zab
Date: 2005-06-17 19:45:11 -0500 (Fri, 17 Jun 2005)
New Revision: 2408
Modified:
trunk/fs/ocfs2/cluster/heartbeat.c
trunk/fs/ocfs2/cluster/ocfs2_heartbeat.h
Log:
* Write out a block CRC in each live heartbeat slot and verify it on all
nodes.
* Fix a bug where the slot data wasn't being initialized to the values on
disk, which caused heartbeat to initially see a change that may not have
actually occurred.
* Write a current generation value. This helps heartbeat on other nodes
detect a heartbeating node that reboots and comes back before the
timeout -- a change in generation (for an otherwise 'good' block) will
immediately trigger a node down event.
Signed-off-by: zab
Modified: trunk/fs/ocfs2/cluster/heartbeat.c
===================================================================
--- trunk/fs/ocfs2/cluster/heartbeat.c 2005-06-18 00:06:24 UTC (rev 2407)
+++ trunk/fs/ocfs2/cluster/heartbeat.c 2005-06-18 00:45:11 UTC (rev 2408)
@@ -28,6 +28,8 @@
#include <linux/file.h>
#include <linux/kthread.h>
#include <linux/configfs.h>
+#include <linux/random.h>
+#include <linux/crc32.h>
#include "heartbeat.h"
#include "tcp.h"
@@ -53,6 +55,7 @@
static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
static LIST_HEAD(o2hb_node_events);
static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);
+static u64 o2hb_generation;
static struct o2hb_callback {
struct list_head list;
@@ -79,6 +82,7 @@
struct o2hb_disk_heartbeat_block *ds_raw_block;
u8 ds_node_num;
unsigned long ds_last_time;
+ u64 ds_last_generation;
u16 ds_equal_samples;
u16 ds_changed_samples;
struct list_head ds_live_item;
@@ -366,6 +370,35 @@
return status;
}
+/*
+ * Compute the crc32 of one heartbeat block as it would look with a
+ * zeroed hb_cksum field. The crc covers reg->hr_block_bytes bytes
+ * starting at hb_block. Whatever was stored in hb_cksum on entry is put
+ * back before returning, so the caller sees the block unchanged.
+ *
+ * Returns the value produced by crc32_le() with a seed of 0.
+ */
+static u32 o2hb_compute_block_crc_le(struct o2hb_region *reg,
+ struct o2hb_disk_heartbeat_block *hb_block)
+{
+ u32 saved_cksum = hb_block->hb_cksum;
+ u32 crc;
+
+ /* The checksum field itself must not contribute to the crc. */
+ hb_block->hb_cksum = 0;
+ crc = crc32_le(0, (unsigned char *) hb_block, reg->hr_block_bytes);
+
+ /* Restore the field we temporarily cleared. */
+ hb_block->hb_cksum = saved_cksum;
+
+ return crc;
+}
+
+/*
+ * Check the checksum stored in a heartbeat block against the crc
+ * computed over that block.
+ *
+ * hb_cksum is kept little-endian on disk, so it is converted to CPU
+ * order exactly once here. o2hb_compute_block_crc_le() already returns
+ * a CPU-order value and must not be byte-swapped again, otherwise nodes
+ * of different endianness would never agree on a checksum.
+ *
+ * Returns nonzero if the stored and computed values match, 0 otherwise.
+ */
+static int o2hb_verify_crc(struct o2hb_region *reg,
+ struct o2hb_disk_heartbeat_block *hb_block)
+{
+ u32 computed, read;
+
+ read = le32_to_cpu(hb_block->hb_cksum);
+ computed = o2hb_compute_block_crc_le(reg, hb_block);
+
+ return read == computed;
+}
+
/* We want to make sure that nobody is heartbeating on top of us --
* this will help detect an invalid configuration. */
static int o2hb_check_last_timestamp(struct o2hb_region *reg)
@@ -389,7 +422,7 @@
return ret;
}
-static inline void o2hb_set_local_node_timestamp(struct o2hb_region *reg)
+/* Fill in our heartbeat block for the next write: sequence (current
+ * time in seconds, never zero), node number, generation and, last of
+ * all, the block checksum. Multi-byte fields are stored little-endian. */
+static inline void o2hb_prepare_block(struct o2hb_region *reg)
{
int node_num;
u64 cputime;
@@ -405,8 +438,16 @@
cputime = CURRENT_TIME.tv_sec;
if (!cputime)
cputime = 1;
+
hb_block->hb_seq = cpu_to_le64(cputime);
hb_block->hb_node = node_num;
+ hb_block->hb_generation = cpu_to_le64(o2hb_generation);
+
+ /* This step must always happen last! The crc is computed over the
+ * rest of the block, and is stored little-endian so that readers can
+ * le32_to_cpu() it regardless of their own byte order. */
+ hb_block->hb_cksum = cpu_to_le32(o2hb_compute_block_crc_le(reg,
+ hb_block));
+
+ /* Log CPU-order values; the on-disk copies are little-endian. */
+ mlog(ML_HB_BIO, "our node generation = 0x%"MLFx64", cksum = 0x%x\n",
+ o2hb_generation, le32_to_cpu(hb_block->hb_cksum));
}
static void o2hb_fire_callbacks(struct o2hb_callback *hbcall,
@@ -521,7 +562,7 @@
static int o2hb_check_slot(struct o2hb_region *reg,
struct o2hb_disk_slot *slot)
{
- int changed = 0;
+ int changed = 0, gen_changed = 0;
struct o2hb_node_event event =
{ .hn_item = LIST_HEAD_INIT(event.hn_item), };
struct o2nm_node *node;
@@ -534,6 +575,27 @@
if (!node)
return 0;
+ if (!o2hb_verify_crc(reg, hb_block)) {
+ /* all paths from here will drop o2hb_live_lock for
+ * us. */
+ spin_lock(&o2hb_live_lock);
+
+ /* Don't print an error on the console in this case -
+ * a freshly formatted heartbeat area will not have a
+ * crc set on it. */
+ if (list_empty(&slot->ds_live_item))
+ goto out;
+
+ /* The node is live but pushed out a bad crc. We
+ * consider it a transient miss but don't populate any
+ * other values as they may be junk. */
+ mlog(ML_ERROR, "Node %d has written a bad crc to %s\n",
+ slot->ds_node_num, reg->hr_dev_name);
+
+ slot->ds_equal_samples++;
+ goto fire_callbacks;
+ }
+
/* we don't care if these wrap.. the state transitions below
* clear at the right places */
cputime = le64_to_cpu(hb_block->hb_seq);
@@ -543,13 +605,37 @@
slot->ds_equal_samples++;
slot->ds_last_time = cputime;
+ /* The node changed heartbeat generations. We assume this to
+ * mean it dropped off but came back before we timed out. We
+ * want to consider it down for the time being but don't want
+ * to lose any changed_samples state we might build up to
+ * considering it live again. */
+ if (slot->ds_last_generation != le64_to_cpu(hb_block->hb_generation)) {
+ gen_changed = 1;
+ slot->ds_equal_samples = 0;
+ mlog(ML_HEARTBEAT, "Node %d changed generation (0x%"MLFx64" "
+ "to 0x%"MLFx64")\n", slot->ds_node_num,
+ slot->ds_last_generation,
+ le64_to_cpu(hb_block->hb_generation));
+ }
+
+ slot->ds_last_generation = le64_to_cpu(hb_block->hb_generation);
+
+ mlog(ML_HEARTBEAT, "Slot %d has generation 0x%"MLFx64", cksum 0x%x "
+ "changed samples %u, equal samples %u\n",
+ slot->ds_node_num, slot->ds_last_generation,
+ le32_to_cpu(hb_block->hb_cksum), slot->ds_changed_samples,
+ slot->ds_equal_samples);
+
spin_lock(&o2hb_live_lock);
+
+fire_callbacks:
/* dead nodes only come to life after some number of
* changes at any time during their dead time */
if (list_empty(&slot->ds_live_item) &&
slot->ds_changed_samples >= O2HB_LIVE_THRESHOLD) {
- mlog(ML_HEARTBEAT, "Node %d joined my region\n",
- slot->ds_node_num);
+ mlog(ML_HEARTBEAT, "Node %d (id 0x%"MLFx64") joined my "
+ "region\n", slot->ds_node_num, slot->ds_last_generation);
/* first on the list generates a callback */
if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
@@ -557,7 +643,7 @@
o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node,
slot->ds_node_num);
-
+
changed = 1;
}
@@ -575,7 +661,7 @@
/* live nodes only go dead after enough consequtive missed
* samples.. reset the missed counter whenever we see
* activity */
- if (slot->ds_equal_samples >= reg->hr_dead_iter) {
+ if (slot->ds_equal_samples >= reg->hr_dead_iter || gen_changed) {
mlog(ML_HEARTBEAT, "Node %d left my region\n",
slot->ds_node_num);
@@ -590,7 +676,10 @@
changed = 1;
}
- slot->ds_changed_samples = 0;
+ /* We don't clear this because the node is still
+ * actually writing new blocks. */
+ if (!gen_changed)
+ slot->ds_changed_samples = 0;
goto out;
}
if (slot->ds_changed_samples) {
@@ -658,8 +747,8 @@
mlog(ML_ERROR, "Device \"%s\": another node is heartbeating "
"in our slot!\n", reg->hr_dev_name);
- /* Set our raw timestamp */
- o2hb_set_local_node_timestamp(reg);
+ /* fill in the proper info for our next heartbeat */
+ o2hb_prepare_block(reg);
/* And fire off the write. Note that we don't wait on this I/O
* until later. */
@@ -731,6 +820,11 @@
INIT_LIST_HEAD(&o2hb_node_events);
memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
+
+ o2hb_generation = 0;
+ /* Generation of zero is invalid */
+ while (!o2hb_generation)
+ get_random_bytes(&o2hb_generation, sizeof(o2hb_generation));
}
/*
@@ -1036,6 +1130,42 @@
return 0;
}
+/* Read in all the slots available and populate the tracking
+ * structures so that we can start with a baseline idea of what's
+ * there.
+ *
+ * Returns 0 on success, or the error returned by o2hb_read_slots(), in
+ * which case no slot is touched. */
+static int o2hb_populate_slot_data(struct o2hb_region *reg)
+{
+ int ret, i;
+ struct o2hb_disk_slot *slot;
+ struct o2hb_disk_heartbeat_block *hb_block;
+
+ mlog_entry_void();
+
+ ret = o2hb_read_slots(reg, reg->hr_blocks);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* We only want to get an idea of the values initially in each
+ * slot, so we do no verification - o2hb_check_slot will
+ * actually determine if each configured slot is valid and
+ * whether any values have changed. */
+ for (i = 0; i < reg->hr_blocks; i++) {
+ slot = &reg->hr_slots[i];
+ hb_block = (struct o2hb_disk_heartbeat_block *) slot->ds_raw_block;
+
+ /* Only fill the values that o2hb_check_slot uses to
+ * determine changing slots. Both are little-endian in the
+ * raw block and are cached in CPU order. */
+ slot->ds_last_time = le64_to_cpu(hb_block->hb_seq);
+ slot->ds_last_generation = le64_to_cpu(hb_block->hb_generation);
+ }
+
+out:
+ mlog_exit(ret);
+ return ret;
+}
+
+
/* this is acting as commit; we set up all of hr_bdev and hr_task or nothing */
static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
const char *page,
@@ -1106,6 +1236,12 @@
goto out;
}
+ ret = o2hb_populate_slot_data(reg);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
/*
* A node is considered live after it has beat LIVE_THRESHOLD
* times. We're not steady until we've given them a chance
Modified: trunk/fs/ocfs2/cluster/ocfs2_heartbeat.h
===================================================================
--- trunk/fs/ocfs2/cluster/ocfs2_heartbeat.h 2005-06-18 00:06:24 UTC (rev 2407)
+++ trunk/fs/ocfs2/cluster/ocfs2_heartbeat.h 2005-06-18 00:45:11 UTC (rev 2408)
@@ -31,6 +31,7 @@
__u8 hb_node;
__u8 hb_pad1[3];
__u32 hb_cksum;
+ __u64 hb_generation;
};
#define O2HB_DEFAULT_TIMEOUT_MS 2000
More information about the Ocfs2-commits
mailing list