[Ocfs2-tools-devel] [PATCH 11/13] fsck.ocfs2: Pre-fill the I/O cache with metadata.

Sunil Mushran sunil.mushran at oracle.com
Fri Jun 19 15:19:16 PDT 2009


Joel Becker wrote:
> In pass0, we walk all of the suballocators to verify they look OK.  In
> the walk, we read each group descriptor.  Because each group is a linear
> hunk of disk, reading the entire group in one slurp is about the same
> amount of effort for the disk.  The big problem is the seek, not the
> data.  So with almost no impact to pass0, we now pre-fill the I/O cache
> will all of our inodes and metadata blocks.
>
> In pass1, this should mean almost everything is in cache if we had a big
> enough cache.  If we didn't, oh well.  The worst case is about identical
> to the uncached case.
>
> Signed-off-by: Joel Becker <joel.becker at oracle.com>
> ---
>  fsck.ocfs2/pass0.c |   55 +++++++++++++++++++++++++++++++++++++++++++++++----
>  1 files changed, 50 insertions(+), 5 deletions(-)
>
> diff --git a/fsck.ocfs2/pass0.c b/fsck.ocfs2/pass0.c
> index 90c0ef9..1961aae 100644
> --- a/fsck.ocfs2/pass0.c
> +++ b/fsck.ocfs2/pass0.c
> @@ -462,6 +462,7 @@ static errcode_t check_chain(o2fsck_state *ost,
>  			     struct ocfs2_chain_rec *chain,
>  			     char *buf1,
>  			     char *buf2,
> +			     char *pre_cache_buf,
>  			     int *chain_changed,
>  			     ocfs2_bitmap *allowed,
>  			     ocfs2_bitmap *forbidden)
> @@ -471,6 +472,8 @@ static errcode_t check_chain(o2fsck_state *ost,
>  	uint64_t blkno;
>  	errcode_t ret = 0;
>  	int depth = 0, clear_ref = 0;
> +	int blocks_per_group = ocfs2_clusters_to_blocks(ost->ost_fs,
> +							cs->cs_cpg);
>  
>  	verbosef("free %u total %u blkno %"PRIu64"\n", chain->c_free,
>  		 chain->c_total, (uint64_t)chain->c_blkno);
> @@ -525,6 +528,15 @@ static errcode_t check_chain(o2fsck_state *ost,
>  			 * the read below.. */
>  		}
>  
> +		/*
> +		 * Pre-cache the entire group.  Don't care about failure.
> +		 * If it works, the following ocfs2_read_group_desc() will
> +		 * get the block out of the cache.
> +		 */
> +		if (pre_cache_buf)
> +			ocfs2_read_blocks(ost->ost_fs, blkno,
> +					  blocks_per_group, pre_cache_buf);
> +
>  		ret = ocfs2_read_group_desc(ost->ost_fs, blkno, (char *)bg2);
>  		if (ret == OCFS2_ET_BAD_GROUP_DESC_MAGIC) {
>  			if (prompt(ost, PY, PR_CHAIN_LINK_MAGIC,
> @@ -632,6 +644,7 @@ out:
>  static errcode_t verify_chain_alloc(o2fsck_state *ost,
>  				    struct ocfs2_dinode *di,
>  				    char *buf1, char *buf2,
> +				    char *pre_cache_buf,
>  				    ocfs2_bitmap *allowed,
>  				    ocfs2_bitmap *forbidden)
>  {
> @@ -736,8 +749,8 @@ static errcode_t verify_chain_alloc(o2fsck_state *ost,
>  			.cs_chain_no = i,
>  			.cs_cpg = cl->cl_cpg,
>  		};
> -		ret = check_chain(ost, di, &cs, cr, buf1, buf2, &changed,
> -				  allowed, forbidden);
> +		ret = check_chain(ost, di, &cs, cr, buf1, buf2, pre_cache_buf,
> +				  &changed, allowed, forbidden);
>  		/* XXX what?  not checking ret? */
>  
>  		if (cr->c_blkno != 0) {
> @@ -889,7 +902,7 @@ static errcode_t verify_bitmap_descs(o2fsck_state *ost,
>  		ocfs2_bitmap_set(allowed, blkno, NULL);
>  	}
>  
> -	ret = verify_chain_alloc(ost, di, buf1, buf2, allowed, forbidden);
> +	ret = verify_chain_alloc(ost, di, buf1, buf2, NULL, allowed, forbidden);
>   

Isn't this verify_chain_alloc() run first()? Any reason why we
are not passing the pre_cache_buf here too.

>  	if (ret) {
>  		com_err(whoami, ret, "while looking up chain allocator inode "
>  			"%"PRIu64, (uint64_t)di->i_blkno);
> @@ -1041,6 +1054,7 @@ errcode_t o2fsck_pass0(o2fsck_state *ost)
>  	errcode_t ret;
>  	uint64_t blkno;
>  	char *blocks = NULL;
> +	char *pre_cache_buf = NULL;
>  	struct ocfs2_dinode *di = NULL;
>  	ocfs2_filesys *fs = ost->ost_fs;
>  	ocfs2_cached_inode **ci;
> @@ -1049,6 +1063,14 @@ errcode_t o2fsck_pass0(o2fsck_state *ost)
>  
>  	printf("Pass 0a: Checking cluster allocation chains\n");
>  
> +	/*
> +	 * The I/O buffer is 3 blocks. We apportion our I/O buffer
> +	 * thusly:
> +	 *
> +	 * blocks[0] is the allocator inode we're working on.
> +	 * blocks[1] & blocks[2] are used to hold group descriptors
> +	 *     in functions below this one.
> +	 */
>  	ret = ocfs2_malloc_blocks(fs->fs_io, 3, &blocks);
>  	if (ret) {
>  		com_err(whoami, ret, "while allocating block buffers");
> @@ -1056,6 +1078,27 @@ errcode_t o2fsck_pass0(o2fsck_state *ost)
>  	}
>  	di = (struct ocfs2_dinode *)blocks;
>  
> +	/*
> +	 * We also allocate a pre-cache buffer of 4MB for reading entire
> +	 * suballocator groups.  Some blocksizes have smaller groups, but
> +	 * none have larger (see
> +	 * libocfs2/alloc.c:ocfs2_clusters_per_group()).  This allows
> +	 * us to pre-fill the I/O cache; we're already reading the group
> +	 * descriptor, so slurping the whole thing shouldn't hurt.
> +	 *
> +	 * If this allocation fails, we just ignore it.  It's a cache.
> +	 */
> +	o2fsck_reset_blocks_cached();
> +	if (o2fsck_worth_caching(1)) {
> +		ret = ocfs2_malloc_blocks(fs->fs_io,
> +					  ocfs2_blocks_in_bytes(fs, 4 * 1024 * 1024),
> +					  &pre_cache_buf);
> +		if (ret)
> +			verbosef("Unable to allocate group pre-cache "
> +				 "buffer, %s\n",
> +				 "ignoring");
> +	}
> +
>  	ret = ocfs2_malloc0(max_slots * sizeof(ocfs2_cached_inode *), 
>  			    &ost->ost_inode_allocs);
>  	if (ret) {
> @@ -1144,7 +1187,7 @@ errcode_t o2fsck_pass0(o2fsck_state *ost)
>  					 blocks + ost->ost_fs->fs_blocksize,
>  					 blocks + 
>  					 (ost->ost_fs->fs_blocksize * 2), 
> -					 NULL, NULL);
> +					 pre_cache_buf, NULL, NULL);
>  
>  		/* XXX maybe helped by the alternate super block */
>  		if (ret)
> @@ -1197,7 +1240,7 @@ errcode_t o2fsck_pass0(o2fsck_state *ost)
>  					 blocks + ost->ost_fs->fs_blocksize,
>  					 blocks + 
>  					 (ost->ost_fs->fs_blocksize * 2), 
> -					 NULL, NULL);
> +					 pre_cache_buf, NULL, NULL);
>  
>  		/* XXX maybe helped by the alternate super block */
>  		if (ret)
> @@ -1205,6 +1248,8 @@ errcode_t o2fsck_pass0(o2fsck_state *ost)
>  	}
>  
>  out:
> +	if (pre_cache_buf)
> +		ocfs2_free(&pre_cache_buf);
>  	if (blocks)
>  		ocfs2_free(&blocks);
>  	if (ret)
>   




More information about the Ocfs2-tools-devel mailing list