[Ocfs2-commits] smushran commits r2794 - branches/ocfs2-1.2-cert/patches

svn-commits@oss.oracle.com svn-commits at oss.oracle.com
Mon Mar 27 18:55:08 CST 2006


Author: smushran
Signed-off-by: mfasheh
Date: 2006-03-27 18:55:07 -0600 (Mon, 27 Mar 2006)
New Revision: 2794

Added:
   branches/ocfs2-1.2-cert/patches/ocfs2_heartbeat-better_I_O_error_handling.patch
Modified:
   branches/ocfs2-1.2-cert/patches/series
Log:
ocfs2_heartbeat-better_I_O_error_handling.patch added
Signed-off-by: mfasheh

Added: branches/ocfs2-1.2-cert/patches/ocfs2_heartbeat-better_I_O_error_handling.patch
===================================================================
--- branches/ocfs2-1.2-cert/patches/ocfs2_heartbeat-better_I_O_error_handling.patch	2006-03-28 00:45:22 UTC (rev 2793)
+++ branches/ocfs2-1.2-cert/patches/ocfs2_heartbeat-better_I_O_error_handling.patch	2006-03-28 00:55:07 UTC (rev 2794)
@@ -0,0 +1,129 @@
+Propagate I/O errors received in o2hb_bio_end_io() back to the heartbeat thread
+so it can skip re-arming the write timeout and retry the heartbeat once.
+
+Index: fs/ocfs2/cluster/heartbeat.c
+===================================================================
+--- fs/ocfs2/cluster/heartbeat.c	(revision 2788)
++++ fs/ocfs2/cluster/heartbeat.c	(working copy)
+@@ -165,6 +165,7 @@ static spinlock_t o2hb_blocker_lock = SP
+ struct o2hb_bio_wait_ctxt {
+ 	atomic_t          wc_num_reqs;
+ 	struct completion wc_io_complete;
++	int               wc_error;
+ };
+ 
+ static unsigned int o2hb_elapsed_msecs(struct timeval *start,
+@@ -293,6 +294,7 @@ static inline void o2hb_bio_wait_init(st
+ {
+ 	atomic_set(&wc->wc_num_reqs, num_ios);
+ 	init_completion(&wc->wc_io_complete);
++	wc->wc_error = 0;
+ }
+ 
+ /* Used in error paths too */
+@@ -325,8 +327,10 @@ static int o2hb_bio_end_io(struct bio *b
+ {
+ 	struct o2hb_bio_wait_ctxt *wc = bio->bi_private;
+ 
+-	if (error)
++	if (error) {
+ 		mlog(ML_ERROR, "IO Error %d\n", error);
++		wc->wc_error = error;
++	}
+ 
+ 	if (bio->bi_size)
+ 		return 1;
+@@ -515,6 +519,8 @@ static int o2hb_read_slots(struct o2hb_r
+ bail_and_wait:
+ 	o2hb_mlog_blocking(reg, &start, "waiting for read completion");
+ 	o2hb_wait_on_io(reg, &wc);
++	if (wc.wc_error && !status)
++		status = wc.wc_error;
+ 	o2hb_mlog_blocking_done(reg, &start);
+ 
+ 	if (bios) {
+@@ -917,7 +923,7 @@ static int o2hb_highest_node(unsigned lo
+ 	return highest;
+ }
+ 
+-static void o2hb_do_disk_heartbeat(struct o2hb_region *reg)
++static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
+ {
+ 	int i, ret, highest_node, change = 0;
+ 	unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
+@@ -925,13 +931,17 @@ static void o2hb_do_disk_heartbeat(struc
+ 	struct o2hb_bio_wait_ctxt write_wc;
+ 	struct timeval start;
+ 
+-	if (o2nm_configured_node_map(configured_nodes, sizeof(configured_nodes)))
+-		return;
++	ret = o2nm_configured_node_map(configured_nodes,
++				       sizeof(configured_nodes));
++	if (ret) {
++		mlog_errno(ret);
++		return ret;
++	}
+ 
+ 	highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
+ 	if (highest_node >= O2NM_MAX_NODES) {
+ 		mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n");
+-		return;
++		return -EINVAL;
+ 	}
+ 
+ 	/* No sense in reading the slots of nodes that don't exist
+@@ -941,7 +951,7 @@ static void o2hb_do_disk_heartbeat(struc
+ 	ret = o2hb_read_slots(reg, highest_node + 1);
+ 	if (ret < 0) {
+ 		mlog_errno(ret);
+-		return;
++		return ret;
+ 	}
+ 
+ 	/* With an up to date view of the slots, we can check that no
+@@ -959,7 +969,7 @@ static void o2hb_do_disk_heartbeat(struc
+ 	ret = o2hb_issue_node_write(reg, &write_bio, &write_wc);
+ 	if (ret < 0) {
+ 		mlog_errno(ret);
+-		return;
++		return ret;
+ 	}
+ 
+ 	o2hb_mlog_blocking(reg, &start, "checking slots");
+@@ -979,6 +989,15 @@ static void o2hb_do_disk_heartbeat(struc
+ 	o2hb_wait_on_io(reg, &write_wc);
+ 	o2hb_mlog_blocking_done(reg, &start);
+ 	bio_put(write_bio);
++	if (write_wc.wc_error) {
++		/* Do not re-arm the write timeout on I/O error - we
++		 * can't be sure that the new block ever made it to
++		 * disk */
++		mlog(ML_ERROR, "Write error %d on device \"%s\"\n",
++		     write_wc.wc_error, reg->hr_dev_name);
++		return write_wc.wc_error;
++	}
++
+ 	o2hb_arm_write_timeout(reg);
+ 
+ 	/* let the person who launched us know when things are steady */
+@@ -986,6 +1005,8 @@ static void o2hb_do_disk_heartbeat(struc
+ 		if (atomic_dec_and_test(&reg->hr_steady_iterations))
+ 			wake_up(&o2hb_steady_queue);
+ 	}
++
++	return 0;
+ }
+ 
+ /* Subtract b from a, storing the result in a. a *must* have a larger
+@@ -1045,7 +1066,10 @@ static int o2hb_thread(void *data)
+ 		 * likely to time itself out. */
+ 		do_gettimeofday(&before_hb);
+ 
+-		o2hb_do_disk_heartbeat(reg);
++		i = 0;
++		do {
++			ret = o2hb_do_disk_heartbeat(reg);
++		} while (ret && ++i < 2);
+ 
+ 		do_gettimeofday(&after_hb);
+ 		elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);

Modified: branches/ocfs2-1.2-cert/patches/series
===================================================================
--- branches/ocfs2-1.2-cert/patches/series	2006-03-28 00:45:22 UTC (rev 2793)
+++ branches/ocfs2-1.2-cert/patches/series	2006-03-28 00:55:07 UTC (rev 2794)
@@ -17,3 +17,4 @@
 hold-dirty-ref.patch
 fix-recovery-spin.patch
 fix-dlmlock_remote.patch
+ocfs2_heartbeat-better_I_O_error_handling.patch -p0




More information about the Ocfs2-commits mailing list