[Ocfs2-tools-devel] [PATCH 02/11] fsck.ocfs2: Reverse earlier commits wrt pre-caching

Sunil Mushran sunil.mushran at oracle.com
Thu Sep 22 19:04:30 PDT 2011


This patch reverses three earlier commits. This has been done because the
information gleaned from the added statistics shows a significant performance
hit when the file system metadata is larger than the cache size.

The test was run on a 2TB volume having 15M files. The cache size was 820MB.

Before:
  I/O read disk/cache: 138736MB / 793MB, write: 0MB, rate: 17.46MB/s
  Times real: 7989.577s, user: 430.803s, sys: 110.852s

After:
  I/O read disk/cache: 68724MB / 261MB, write: 0MB, rate: 12.65MB/s
  Times real: 5452.819s, user: 368.226s, sys: 73.978s

Later patches will address the performance issue, again mostly by pre-caching,
but done slightly differently.

The commits being (mostly) reversed are:

commit 7fd354d5bd63370316088267fb9832800f4c9b53
fsck.ocfs2: Pre-fill the I/O cache with metadata.

commit 69223be5af7868605c5d681ad64ccf63838f4858
fsck.ocfs2: Pre-cache inodes in reverse order.

commit 1fa5d9dea32caf99efb4e0811a48655f24938468
fsck.ocfs2: Pre-cache dirblocks before we go through them.

Signed-off-by: Sunil Mushran <sunil.mushran at oracle.com>
---
 fsck.ocfs2/dirblocks.c |   88 ------------------------------------------------
 fsck.ocfs2/pass0.c     |   62 +++++-----------------------------
 2 files changed, 9 insertions(+), 141 deletions(-)

diff --git a/fsck.ocfs2/dirblocks.c b/fsck.ocfs2/dirblocks.c
index 085dd1f..1e9fbb1 100644
--- a/fsck.ocfs2/dirblocks.c
+++ b/fsck.ocfs2/dirblocks.c
@@ -70,69 +70,6 @@ out:
 	return ret;
 }
 
-/*
- * Go through the dirblocks pre-filling them.  We try to coalesce adjacent
- * ones.  Don't care to return errors, because it's a cache pre-fill.
- */
-static int try_to_cache(ocfs2_filesys *fs, struct rb_node *node,
-			char *pre_cache_buf, int pre_cache_blocks)
-{
-	int cached_blocks = 0;
-	o2fsck_dirblock_entry *dbe;
-	uint64_t io_blkno = 0, next_blkno = 0;
-	int count = 0;
-	errcode_t err;
-	uint64_t blocks_seen = 0;
-
-	o2fsck_reset_blocks_cached();
-	for (; node; node = rb_next(node)) {
-		blocks_seen++;
-		dbe = rb_entry(node, o2fsck_dirblock_entry, e_node);
-		if (io_blkno) {
-			assert(count);
-			assert(next_blkno > io_blkno);
-
-			if ((next_blkno == dbe->e_blkno) &&
-			    (count < pre_cache_blocks)) {
-				count++;
-				next_blkno++;
-				continue;
-			}
-
-			if (!o2fsck_worth_caching(count)) {
-				io_blkno = 0;
-				break;
-			}
-
-			err = ocfs2_read_blocks(fs, io_blkno, count,
-						pre_cache_buf);
-			io_blkno = 0;
-			next_blkno = 0;
-
-			if (err)
-				break;
-
-			cached_blocks += count;
-			count = 0;
-		}
-
-		assert(!io_blkno);
-		io_blkno = dbe->e_blkno;
-		next_blkno = io_blkno + 1;
-		count = 1;
-	}
-
-	/* Catch the last pre-fill buffer */
-	if (io_blkno && o2fsck_worth_caching(count)) {
-		assert(count);
-		err = ocfs2_read_blocks(fs, io_blkno, count, pre_cache_buf);
-		if (!err)
-			cached_blocks += count;
-	}
-
-	return cached_blocks;
-}
-
 uint64_t o2fsck_search_reidx_dir(struct rb_root *root, uint64_t dino)
 {
 	struct rb_node *node = root->rb_node;
@@ -204,41 +141,16 @@ void o2fsck_dir_block_iterate(o2fsck_state *ost, dirblock_iterator func,
 			      void *priv_data)
 {
 	o2fsck_dirblocks *db = &ost->ost_dirblocks;
-	ocfs2_filesys *fs = ost->ost_fs;
 	o2fsck_dirblock_entry *dbe;
 	struct rb_node *node;
 	unsigned ret;
-	errcode_t err;
-	char *pre_cache_buf = NULL;
-	int pre_cache_blocks = ocfs2_blocks_in_bytes(fs, 1024 * 1024);
-	int cached_blocks = 0;
-
-	o2fsck_reset_blocks_cached();
-	if (o2fsck_worth_caching(1)) {
-		err = ocfs2_malloc_blocks(fs->fs_io, pre_cache_blocks,
-					  &pre_cache_buf);
-		if (err)
-			verbosef("Unable to allocate dirblock pre-cache "
-				 "buffer, %s\n",
-				 "ignoring");
-	}
 
 	for (node = rb_first(&db->db_root); node; node = rb_next(node)) {
-		if (!cached_blocks && pre_cache_buf)
-			cached_blocks = try_to_cache(fs, node, pre_cache_buf,
-						     pre_cache_blocks);
-
 		dbe = rb_entry(node, o2fsck_dirblock_entry, e_node);
 		ret = func(dbe, priv_data);
 		if (ret & OCFS2_DIRENT_ABORT)
 			break;
-
-		if (cached_blocks)
-			cached_blocks--;
 	}
-
-	if (pre_cache_buf)
-		ocfs2_free(&pre_cache_buf);
 }
 
 static errcode_t ocfs2_rebuild_indexed_dir(ocfs2_filesys *fs, uint64_t ino)
diff --git a/fsck.ocfs2/pass0.c b/fsck.ocfs2/pass0.c
index 3d8957e..6187d77 100644
--- a/fsck.ocfs2/pass0.c
+++ b/fsck.ocfs2/pass0.c
@@ -673,7 +673,6 @@ static errcode_t check_chain(o2fsck_state *ost,
 			     struct ocfs2_chain_rec *chain,
 			     char *buf1,
 			     char *buf2,
-			     char *pre_cache_buf,
 			     int *chain_changed,
 			     ocfs2_bitmap *allowed,
 			     ocfs2_bitmap *forbidden)
@@ -683,8 +682,6 @@ static errcode_t check_chain(o2fsck_state *ost,
 	uint64_t blkno;
 	errcode_t ret = 0;
 	int depth = 0, clear_ref = 0;
-	int blocks_per_group = ocfs2_clusters_to_blocks(ost->ost_fs,
-							cs->cs_cpg);
 
 	verbosef("free %u total %u blkno %"PRIu64"\n", chain->c_free,
 		 chain->c_total, (uint64_t)chain->c_blkno);
@@ -739,15 +736,6 @@ static errcode_t check_chain(o2fsck_state *ost,
 			 * the read below.. */
 		}
 
-		/*
-		 * Pre-cache the entire group.  Don't care about failure.
-		 * If it works, the following ocfs2_read_group_desc() will
-		 * get the block out of the cache.
-		 */
-		if (pre_cache_buf)
-			ocfs2_read_blocks(ost->ost_fs, blkno,
-					  blocks_per_group, pre_cache_buf);
-
 		ret = ocfs2_read_group_desc(ost->ost_fs, blkno, (char *)bg2);
 		if (ret == OCFS2_ET_BAD_GROUP_DESC_MAGIC) {
 			if (prompt(ost, PY, PR_CHAIN_LINK_MAGIC,
@@ -855,13 +843,12 @@ out:
 static errcode_t verify_chain_alloc(o2fsck_state *ost,
 				    struct ocfs2_dinode *di,
 				    char *buf1, char *buf2,
-				    char *pre_cache_buf,
 				    ocfs2_bitmap *allowed,
 				    ocfs2_bitmap *forbidden)
 {
 	struct chain_state cs = {0, };
 	struct ocfs2_chain_list *cl;
-	int i, max_count;
+	uint16_t i, max_count;
 	struct ocfs2_chain_rec *cr;
 	uint32_t free = 0, total = 0;
 	int changed = 0, trust_next_free = 1;
@@ -952,13 +939,7 @@ static errcode_t verify_chain_alloc(o2fsck_state *ost,
 	if (trust_next_free)
 		max_count = cl->cl_next_free_rec;
 
-	/*
-	 * We walk the chains backwards for caching reasons.  Basically,
-	 * at the end the last blocks we read will be the most recently
-	 * used in the cache.  We want that to be the first chains,
-	 * especially for the inode scan, which will read forwards.
-	 */
-	for (i = max_count - 1; i >= 0; i--) {
+	for (i = 0; i < max_count; i++) {
 		cr = &cl->cl_recs[i];
 
 		/* reset for each run */
@@ -966,8 +947,8 @@ static errcode_t verify_chain_alloc(o2fsck_state *ost,
 			.cs_chain_no = i,
 			.cs_cpg = cl->cl_cpg,
 		};
-		ret = check_chain(ost, di, &cs, cr, buf1, buf2, pre_cache_buf,
-				  &changed, allowed, forbidden);
+		ret = check_chain(ost, di, &cs, cr, buf1, buf2, &changed,
+				  allowed, forbidden);
 		/* XXX what?  not checking ret? */
 
 		if (cr->c_blkno != 0) {
@@ -994,13 +975,12 @@ static errcode_t verify_chain_alloc(o2fsck_state *ost,
 			 * we copy the last chain into the missing spot
 			 * instead of shifting everyone over a spot 
 			 * to minimize the number of chains we have to
-			 * update.  we then reset i so that we can go
-			 * over that chain and fix bg_chain */
+			 * update */
 			if (i < (cl->cl_next_free_rec - 1)) {
 				*cr = cl->cl_recs[cl->cl_next_free_rec - 1];
 				memset(&cl->cl_recs[cl->cl_next_free_rec - 1],
 					0, sizeof(struct ocfs2_chain_rec));
-				i++;
+				i--;
 			}
 
 			cl->cl_next_free_rec--;
@@ -1120,7 +1100,7 @@ static errcode_t verify_bitmap_descs(o2fsck_state *ost,
 		o2fsck_bitmap_set(allowed, blkno, NULL);
 	}
 
-	ret = verify_chain_alloc(ost, di, buf1, buf2, NULL, allowed, forbidden);
+	ret = verify_chain_alloc(ost, di, buf1, buf2, allowed, forbidden);
 	if (ret) {
 		com_err(whoami, ret, "while looking up chain allocator inode "
 			"%"PRIu64, (uint64_t)di->i_blkno);
@@ -1275,7 +1255,6 @@ errcode_t o2fsck_pass0(o2fsck_state *ost)
 	uint64_t blkno;
 	uint32_t pre_repair_clusters;
 	char *blocks = NULL;
-	char *pre_cache_buf = NULL;
 	struct ocfs2_dinode *di = NULL;
 	ocfs2_filesys *fs = ost->ost_fs;
 	ocfs2_cached_inode **ci;
@@ -1302,27 +1281,6 @@ errcode_t o2fsck_pass0(o2fsck_state *ost)
 	}
 	di = (struct ocfs2_dinode *)blocks;
 
-	/*
-	 * We also allocate a pre-cache buffer of 4MB for reading entire
-	 * suballocator groups.  Some blocksizes have smaller groups, but
-	 * none have larger (see
-	 * libocfs2/alloc.c:ocfs2_clusters_per_group()).  This allows
-	 * us to pre-fill the I/O cache; we're already reading the group
-	 * descriptor, so slurping the whole thing shouldn't hurt.
-	 *
-	 * If this allocation fails, we just ignore it.  It's a cache.
-	 */
-	o2fsck_reset_blocks_cached();
-	if (o2fsck_worth_caching(1)) {
-		ret = ocfs2_malloc_blocks(fs->fs_io,
-					  ocfs2_blocks_in_bytes(fs, 4 * 1024 * 1024),
-					  &pre_cache_buf);
-		if (ret)
-			verbosef("Unable to allocate group pre-cache "
-				 "buffer, %s\n",
-				 "ignoring");
-	}
-
 	ret = ocfs2_malloc0(max_slots * sizeof(ocfs2_cached_inode *), 
 			    &ost->ost_inode_allocs);
 	if (ret) {
@@ -1456,7 +1414,7 @@ retry_bitmap:
 					 blocks + ost->ost_fs->fs_blocksize,
 					 blocks + 
 					 (ost->ost_fs->fs_blocksize * 2), 
-					 pre_cache_buf, NULL, NULL);
+					 NULL, NULL);
 
 		/* XXX maybe helped by the alternate super block */
 		if (ret)
@@ -1515,7 +1473,7 @@ retry_bitmap:
 					 blocks + ost->ost_fs->fs_blocksize,
 					 blocks + 
 					 (ost->ost_fs->fs_blocksize * 2), 
-					 pre_cache_buf, NULL, NULL);
+					 NULL, NULL);
 
 		/* XXX maybe helped by the alternate super block */
 		if (ret)
@@ -1527,8 +1485,6 @@ retry_bitmap:
 	o2fsck_add_resource_track(&ost->ost_rt, &rt);
 
 out:
-	if (pre_cache_buf)
-		ocfs2_free(&pre_cache_buf);
 	if (blocks)
 		ocfs2_free(&blocks);
 	if (ret)
-- 
1.7.4.1




More information about the Ocfs2-tools-devel mailing list