[Ocfs2-commits] seeda commits r3114 - branches/ocfs2-1.2/fs/ocfs2/cluster
svn-commits at oss.oracle.com
Wed Mar 11 14:06:50 PDT 2009
Author: seeda
Date: 2009-03-11 14:06:49 -0700 (Wed, 11 Mar 2009)
New Revision: 3114
Modified:
branches/ocfs2-1.2/fs/ocfs2/cluster/heartbeat.c
Log:
This fixes the issue reported in bugzilla 944.
OCFS2 expects bio_add_page() to add pages to BIOs in an easily
predictable manner. That is not true, especially for devices with their
own merge_bvec_fn(), so OCFS2's heartbeat code is very likely to fail on
such devices.
Move the bio_put() call into the bio's bi_end_io() function. This makes the
whole idea of trying to predict the behaviour of bio_add_page() unnecessary.
Also remove compute_max_sectors() and o2hb_compute_request_limits(), which are no longer needed.
Signed-off-by: smushran
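In rough terms, the pattern the patch switches to looks like the sketch below. This is a simplified illustration against the 2.6-era block API used by the ocfs2-1.2 branch, not the heartbeat code itself; the example_* names, the example_wait struct, and the fixed bio_alloc(GFP_ATOMIC, 16) sizing are stand-ins for the o2hb_* equivalents.

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/completion.h>

struct example_wait {
	atomic_t		pending;	/* biased by 1, like wc_num_reqs */
	struct completion	done;
	int			error;
};

static void example_wait_dec(struct example_wait *ew)
{
	if (atomic_dec_and_test(&ew->pending))
		complete(&ew->done);
}

/* Old-style bi_end_io: returns 1 while the bio is only partially done. */
static int example_end_io(struct bio *bio, unsigned int bytes_done, int error)
{
	struct example_wait *ew = bio->bi_private;

	if (bio->bi_size)
		return 1;

	if (error)
		ew->error = error;
	example_wait_dec(ew);
	bio_put(bio);			/* the bio frees itself here, not in the caller */
	return 0;
}

static int example_read_pages(struct block_device *bdev, struct page **pages,
			      unsigned int nr_pages)
{
	struct example_wait ew = { .pending = ATOMIC_INIT(1), .error = 0 };
	unsigned int i = 0;

	init_completion(&ew.done);

	while (i < nr_pages) {
		struct bio *bio = bio_alloc(GFP_ATOMIC, 16);

		if (!bio) {
			ew.error = -ENOMEM;
			break;
		}

		bio->bi_bdev = bdev;
		bio->bi_sector = (sector_t)i << (PAGE_SHIFT - 9);
		bio->bi_end_io = example_end_io;
		bio->bi_private = &ew;

		/* Add pages until the device (or its merge_bvec_fn) refuses
		 * one; whatever did not fit goes into the next bio. */
		while (i < nr_pages &&
		       bio_add_page(bio, pages[i], PAGE_SIZE, 0) == PAGE_SIZE)
			i++;

		if (!bio->bi_size) {		/* nothing fit at all, give up */
			bio_put(bio);
			ew.error = -EIO;
			break;
		}

		atomic_inc(&ew.pending);
		submit_bio(READ, bio);
	}

	example_wait_dec(&ew);			/* drop the initial bias */
	wait_for_completion(&ew.done);
	return ew.error;
}

The pending count starts biased at 1 and is only dropped by the submitter after the last bio has been issued, mirroring the wc_num_reqs initialisation to 1 and the extra o2hb_bio_wait_dec() added to o2hb_wait_on_io() below, so the completion cannot fire while bios are still being submitted.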
Modified: branches/ocfs2-1.2/fs/ocfs2/cluster/heartbeat.c
===================================================================
--- branches/ocfs2-1.2/fs/ocfs2/cluster/heartbeat.c 2008-11-12 19:49:37 UTC (rev 3113)
+++ branches/ocfs2-1.2/fs/ocfs2/cluster/heartbeat.c 2009-03-11 21:06:49 UTC (rev 3114)
@@ -289,10 +289,9 @@
flush_scheduled_work();
}
-static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc,
- unsigned int num_ios)
+static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc)
{
- atomic_set(&wc->wc_num_reqs, num_ios);
+ atomic_set(&wc->wc_num_reqs, 1);
init_completion(&wc->wc_io_complete);
wc->wc_error = 0;
}
@@ -317,6 +316,7 @@
struct address_space *mapping = reg->hr_bdev->bd_inode->i_mapping;
blk_run_address_space(mapping);
+ o2hb_bio_wait_dec(wc, 1);
wait_for_completion(&wc->wc_io_complete);
}
@@ -336,6 +336,7 @@
return 1;
o2hb_bio_wait_dec(wc, 1);
+ bio_put(bio);
return 0;
}
@@ -343,20 +344,19 @@
* start_slot. */
static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
struct o2hb_bio_wait_ctxt *wc,
- unsigned int start_slot,
- unsigned int num_slots,
+ unsigned int *current_slot,
+ unsigned int max_slots,
int write)
{
- int i, nr_vecs, len, first_page, last_page;
+ int len, current_page;
unsigned int vec_len, vec_start;
unsigned int bits = reg->hr_block_bits;
unsigned int spp = reg->hr_slots_per_page;
+ unsigned int cs = *current_slot;
struct bio *bio;
struct page *page;
struct timeval start;
- nr_vecs = (num_slots + spp - 1) / spp;
-
if (write)
o2hb_mlog_blocking(reg, &start, "bio alloc write");
else
@@ -365,7 +365,7 @@
* GFP_KERNEL that the local node can get fenced. It would be
* nicest if we could pre-allocate these bios and avoid this
* all together. */
- bio = bio_alloc(GFP_ATOMIC, nr_vecs);
+ bio = bio_alloc(GFP_ATOMIC, 16);
o2hb_mlog_blocking_done(reg, &start);
if (!bio) {
mlog(ML_ERROR, "Could not alloc slots BIO!\n");
@@ -374,141 +374,60 @@
}
/* Must put everything in 512 byte sectors for the bio... */
- bio->bi_sector = (reg->hr_start_block + start_slot) << (bits - 9);
+ bio->bi_sector = (reg->hr_start_block + cs) << (bits - 9);
bio->bi_bdev = reg->hr_bdev;
bio->bi_private = wc;
bio->bi_end_io = o2hb_bio_end_io;
- first_page = start_slot / spp;
- last_page = first_page + nr_vecs;
- vec_start = (start_slot << bits) % PAGE_CACHE_SIZE;
- for(i = first_page; i < last_page; i++) {
- page = reg->hr_slot_data[i];
+ vec_start = (cs << bits) % PAGE_CACHE_SIZE;
+ while(cs < max_slots) {
+ current_page = cs / spp;
+ page = reg->hr_slot_data[current_page];
- vec_len = PAGE_CACHE_SIZE;
- /* last page might be short */
- if (((i + 1) * spp) > (start_slot + num_slots))
- vec_len = ((num_slots + start_slot) % spp) << bits;
- vec_len -= vec_start;
+ vec_len = min(PAGE_CACHE_SIZE,
+ (max_slots-cs) * (PAGE_CACHE_SIZE/spp) );
- mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n",
- i, vec_len, vec_start);
+ mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n",
+ current_page, vec_len, vec_start);
- if (write)
+ if (write)
o2hb_mlog_blocking(reg, &start, "bio add page write");
else
o2hb_mlog_blocking(reg, &start, "bio add page read");
- len = bio_add_page(bio, page, vec_len, vec_start);
+ len = bio_add_page(bio, page, vec_len, vec_start);
o2hb_mlog_blocking_done(reg, &start);
- if (len != vec_len) {
- bio_put(bio);
- bio = ERR_PTR(-EIO);
+ if (len != vec_len) break;
- mlog(ML_ERROR, "Error adding page to bio i = %d, "
- "vec_len = %u, len = %d\n, start = %u\n",
- i, vec_len, len, vec_start);
- goto bail;
- }
+ cs += vec_len / (PAGE_CACHE_SIZE/spp);
+ vec_start = 0;
+ }
- vec_start = 0;
- }
-
bail:
+ *current_slot = cs;
return bio;
}
-/*
- * Compute the maximum number of sectors the bdev can handle in one bio,
- * as a power of two.
- *
- * Stolen from oracleasm, thanks Joel!
- */
-static int compute_max_sectors(struct block_device *bdev)
-{
- int max_pages, max_sectors, pow_two_sectors;
-
- struct request_queue *q;
-
- q = bdev_get_queue(bdev);
- max_pages = q->max_sectors >> (PAGE_SHIFT - 9);
- if (max_pages > BIO_MAX_PAGES)
- max_pages = BIO_MAX_PAGES;
- if (max_pages > q->max_phys_segments)
- max_pages = q->max_phys_segments;
- if (max_pages > q->max_hw_segments)
- max_pages = q->max_hw_segments;
- max_pages--; /* Handle I/Os that straddle a page */
-
- max_sectors = max_pages << (PAGE_SHIFT - 9);
-
- /* Why is fls() 1-based???? */
- pow_two_sectors = 1 << (fls(max_sectors) - 1);
-
- return pow_two_sectors;
-}
-
-static inline void o2hb_compute_request_limits(struct o2hb_region *reg,
- unsigned int num_slots,
- unsigned int *num_bios,
- unsigned int *slots_per_bio)
-{
- unsigned int max_sectors, io_sectors;
-
- max_sectors = compute_max_sectors(reg->hr_bdev);
-
- io_sectors = num_slots << (reg->hr_block_bits - 9);
-
- *num_bios = (io_sectors + max_sectors - 1) / max_sectors;
- *slots_per_bio = max_sectors >> (reg->hr_block_bits - 9);
-
- mlog(ML_HB_BIO, "My io size is %u sectors for %u slots. This "
- "device can handle %u sectors of I/O\n", io_sectors, num_slots,
- max_sectors);
- mlog(ML_HB_BIO, "Will need %u bios holding %u slots each\n",
- *num_bios, *slots_per_bio);
-}
-
static int o2hb_read_slots(struct o2hb_region *reg,
unsigned int max_slots)
{
- unsigned int num_bios, slots_per_bio, start_slot, num_slots;
- int i, status;
+ unsigned int current_slot=0;
+ int status;
struct o2hb_bio_wait_ctxt wc;
- struct bio **bios;
struct bio *bio;
struct timeval start;
- o2hb_compute_request_limits(reg, max_slots, &num_bios, &slots_per_bio);
+ o2hb_bio_wait_init(&wc);
- o2hb_mlog_blocking(reg, &start, "allocating bios for read");
- bios = kcalloc(num_bios, sizeof(struct bio *), GFP_KERNEL);
- o2hb_mlog_blocking_done(reg, &start);
- if (!bios) {
- status = -ENOMEM;
- mlog_errno(status);
- return status;
- }
-
- o2hb_bio_wait_init(&wc, num_bios);
-
- num_slots = slots_per_bio;
- for(i = 0; i < num_bios; i++) {
- start_slot = i * slots_per_bio;
-
- /* adjust num_slots at last bio */
- if (max_slots < (start_slot + num_slots))
- num_slots = max_slots - start_slot;
-
- bio = o2hb_setup_one_bio(reg, &wc, start_slot, num_slots, 0);
+ while(current_slot < max_slots) {
+ bio = o2hb_setup_one_bio(reg, &wc, &current_slot, max_slots, 0);
if (IS_ERR(bio)) {
- o2hb_bio_wait_dec(&wc, num_bios - i);
-
status = PTR_ERR(bio);
mlog_errno(status);
goto bail_and_wait;
}
- bios[i] = bio;
+ atomic_inc(&wc.wc_num_reqs);
+
o2hb_mlog_blocking(reg, &start, "submit_bio for read");
submit_bio(READ, bio);
o2hb_mlog_blocking_done(reg, &start);
@@ -523,18 +442,10 @@
status = wc.wc_error;
o2hb_mlog_blocking_done(reg, &start);
- if (bios) {
- for(i = 0; i < num_bios; i++)
- if (bios[i])
- bio_put(bios[i]);
- kfree(bios);
- }
-
return status;
}
static int o2hb_issue_node_write(struct o2hb_region *reg,
- struct bio **write_bio,
struct o2hb_bio_wait_ctxt *write_wc)
{
int status;
@@ -542,22 +453,22 @@
struct bio *bio;
struct timeval start;
- o2hb_bio_wait_init(write_wc, 1);
+ o2hb_bio_wait_init(write_wc);
slot = o2nm_this_node();
- bio = o2hb_setup_one_bio(reg, write_wc, slot, 1, 1);
+ bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1, 1);
if (IS_ERR(bio)) {
status = PTR_ERR(bio);
mlog_errno(status);
goto bail;
}
+ atomic_inc(&write_wc->wc_num_reqs);
o2hb_mlog_blocking(reg, &start, "submit_bio for write");
submit_bio(WRITE, bio);
o2hb_mlog_blocking_done(reg, &start);
- *write_bio = bio;
status = 0;
bail:
return status;
@@ -835,7 +746,7 @@
mlog(ML_HEARTBEAT, "Slot %d gen 0x%"MLFx64" cksum 0x%x "
"seq %"MLFu64" last %"MLFu64" changed %u equal %u\n",
slot->ds_node_num, slot->ds_last_generation,
- le32_to_cpu(hb_block->hb_cksum), le64_to_cpu(hb_block->hb_seq),
+ le32_to_cpu(hb_block->hb_cksum), le64_to_cpu(hb_block->hb_seq),
slot->ds_last_time, slot->ds_changed_samples,
slot->ds_equal_samples);
@@ -947,7 +858,6 @@
{
int i, ret, highest_node, change = 0;
unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
- struct bio *write_bio;
struct o2hb_bio_wait_ctxt write_wc;
struct timeval start;
@@ -986,7 +896,7 @@
/* And fire off the write. Note that we don't wait on this I/O
* until later. */
- ret = o2hb_issue_node_write(reg, &write_bio, &write_wc);
+ ret = o2hb_issue_node_write(reg, &write_wc);
if (ret < 0) {
mlog_errno(ret);
return ret;
@@ -1008,7 +918,6 @@
o2hb_mlog_blocking(reg, &start, "waiting for write completion");
o2hb_wait_on_io(reg, &write_wc);
o2hb_mlog_blocking_done(reg, &start);
- bio_put(write_bio);
if (write_wc.wc_error) {
/* Do not re-arm the write timeout on I/O error - we
* can't be sure that the new block ever made it to
@@ -1069,7 +978,6 @@
{
int i, ret;
struct o2hb_region *reg = data;
- struct bio *write_bio;
struct o2hb_bio_wait_ctxt write_wc;
struct timeval before_hb, after_hb;
unsigned int elapsed_msec;
@@ -1121,10 +1029,9 @@
*
* XXX: Should we skip this on unclean_stop? */
o2hb_prepare_block(reg, 0);
- ret = o2hb_issue_node_write(reg, &write_bio, &write_wc);
+ ret = o2hb_issue_node_write(reg, &write_wc);
if (ret == 0) {
o2hb_wait_on_io(reg, &write_wc);
- bio_put(write_bio);
} else {
mlog_errno(ret);
}