[Ocfs2-commits] mfasheh commits r2501 - trunk/fs/ocfs2/cluster
svn-commits at oss.oracle.com
svn-commits at oss.oracle.com
Mon Aug 8 15:31:19 CDT 2005
Author: mfasheh
Signed-off-by: manish
Date: 2005-08-08 15:31:18 -0500 (Mon, 08 Aug 2005)
New Revision: 2501
Modified:
trunk/fs/ocfs2/cluster/heartbeat.c
Log:
* Set a higher priority for the heartbeat thread. This hopefully alleviates
a problem with heartbeat where we were seeing the thread not being
scheduled for over 12 seconds (the current heartbeat timeout).
* Report the actual elapsed time when a heartbeat write timeout fires,
instead of always printing the configured maximum timeout.
* Report more information about a bad slot write; this may help in future
debugging.
* Use a block copy when computing checksums as we have to temporarily zero
out the hb_cksum field. This fixes a bug seen where our own write would hit
disk with a zero checksum.
Signed-off-by: manish
Modified: trunk/fs/ocfs2/cluster/heartbeat.c
===================================================================
--- trunk/fs/ocfs2/cluster/heartbeat.c 2005-08-08 20:12:46 UTC (rev 2500)
+++ trunk/fs/ocfs2/cluster/heartbeat.c 2005-08-08 20:31:18 UTC (rev 2501)
@@ -20,6 +20,7 @@
*/
#include <linux/kernel.h>
+#include <linux/sched.h>
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/fs.h>
@@ -124,6 +125,12 @@
u64 hr_generation;
struct work_struct hr_write_timeout_work;
+ unsigned long hr_last_timeout_start;
+
+ /* Used during o2hb_check_slot to hold a copy of the block
+ * being checked because we temporarily have to zero out the
+ * crc field. */
+ struct o2hb_disk_heartbeat_block *hr_tmp_block;
};
struct o2hb_bio_wait_ctxt {
@@ -136,8 +143,8 @@
struct o2hb_region *reg = arg;
mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u "
- "milliseconds\n", reg->hr_dev_name, O2HB_MAX_WRITE_TIMEOUT_MS);
-
+ "milliseconds\n", reg->hr_dev_name,
+ jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
o2quo_disk_timeout();
}
@@ -419,6 +426,15 @@
return ret;
}
+static void o2hb_dump_slot(struct o2hb_disk_heartbeat_block *hb_block)
+{
+ mlog(ML_ERROR, "Dump slot information: seq = 0x%"MLFx64", node = %u, "
+ "cksum = 0x%x, generation 0x%"MLFx64"\n",
+ le64_to_cpu(hb_block->hb_seq), hb_block->hb_node,
+ le32_to_cpu(hb_block->hb_cksum),
+ le64_to_cpu(hb_block->hb_generation));
+}
+
static int o2hb_verify_crc(struct o2hb_region *reg,
struct o2hb_disk_heartbeat_block *hb_block)
{
@@ -598,9 +614,11 @@
struct o2hb_node_event event =
{ .hn_item = LIST_HEAD_INIT(event.hn_item), };
struct o2nm_node *node;
- struct o2hb_disk_heartbeat_block *hb_block = slot->ds_raw_block;
+ struct o2hb_disk_heartbeat_block *hb_block = reg->hr_tmp_block;
u64 cputime;
+ memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes);
+
/* Is this correct? Do we assume that the node doesn't exist
* if we're not configured for him? */
node = o2nm_get_node_by_num(slot->ds_node_num);
@@ -623,6 +641,7 @@
* other values as they may be junk. */
mlog(ML_ERROR, "Node %d has written a bad crc to %s\n",
slot->ds_node_num, reg->hr_dev_name);
+ o2hb_dump_slot(hb_block);
slot->ds_equal_samples++;
goto fire_callbacks;
@@ -826,6 +845,8 @@
mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n");
+ set_user_nice(current, -20);
+
while (!kthread_should_stop() && !reg->hr_unclean_stop) {
o2hb_do_disk_heartbeat(reg);
@@ -917,6 +938,9 @@
struct page *page;
struct o2hb_region *reg = to_o2hb_region(item);
+ if (reg->hr_tmp_block)
+ kfree(reg->hr_tmp_block);
+
if (reg->hr_slot_data) {
for (i = 0; i < reg->hr_num_pages; i++) {
page = reg->hr_slot_data[i];
@@ -1081,6 +1105,12 @@
char *raw;
struct o2hb_disk_slot *slot;
+ reg->hr_tmp_block = kmalloc(reg->hr_block_bytes, GFP_KERNEL);
+ if (reg->hr_tmp_block == NULL) {
+ mlog_errno(-ENOMEM);
+ return -ENOMEM;
+ }
+
reg->hr_slots = kcalloc(reg->hr_blocks,
sizeof(struct o2hb_disk_slot), GFP_KERNEL);
if (reg->hr_slots == NULL) {
More information about the Ocfs2-commits
mailing list