[Ocfs2-tools-devel] [PATCH 10/13] fsck.ocfs2: Use the I/O cache.

Sunil Mushran sunil.mushran at oracle.com
Tue Jun 2 21:45:49 PDT 2009


Possible typo in the patch; see the inline comment in o2fsck_init_cache() below.

Signed-off-by: Sunil Mushran <sunil.mushran at oracle.com>


Joel Becker wrote:
> fsck.ocfs2 travels the filesystem multiple times.  The I/O cache should
> make this faster.  Since read-write fsck is only allowed when there are
> no other users or mounters of the device, the cache should be safe.
>
> We use two caches.  First, we allocate a cache big enough for all the
> journals.  Since we don't know their size at the start, we guess the
> default 256MB.  The hope is that we cache the journal blocks on the
> first pass when we check their contents and avoid having to re-read them
> on the second pass when we replay them.
>
> Once the journals are replayed, we drop this cache and try to allocate a
> cache equal to the number of blocks in the filesystem.  This should,
> hopefully, keep all of fsck in cache.
>
> We make sure to mlock() our cache, because it's pointless to swap out
> cache data; we'd rather just read it from the device.  Now, obviously,
> we can't allocate and lock more memory than the system has available.
> fsck will keep shrinking the cache size until it gets an allocation.
>
> For the main fsck operation, we don't just get the largest cache
> available.  We will need memory for the fsck accounting structures too.
> fsck will start with a cache _larger_ than needed.  If this
> succeeds, fsck knows that the needed size is safe to allocate.  fsck
> will actually use a cache smaller than the largest cache it could get,
> ensuring available memory.
>
> Signed-off-by: Joel Becker <joel.becker at oracle.com>
> ---
>  fsck.ocfs2/fsck.c         |    6 +++
>  fsck.ocfs2/include/util.h |   12 +++++
>  fsck.ocfs2/util.c         |  105 +++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 123 insertions(+), 0 deletions(-)
>
> diff --git a/fsck.ocfs2/fsck.c b/fsck.ocfs2/fsck.c
> index a686886..b269e75 100644
> --- a/fsck.ocfs2/fsck.c
> +++ b/fsck.ocfs2/fsck.c
> @@ -837,6 +837,9 @@ int main(int argc, char **argv)
>  	printf("  max slots:          %u\n\n", 
>  	       OCFS2_RAW_SB(ost->ost_fs->fs_super)->s_max_slots);
>  
> +	/* Let's get enough of a cache to replay the journals */
> +	o2fsck_init_cache(ost, O2FSCK_CACHE_MODE_JOURNAL);
> +
>  	if (open_flags & OCFS2_FLAG_RW) {
>  		ret = o2fsck_check_journals(ost);
>  		if (ret) {
> @@ -854,6 +857,9 @@ int main(int argc, char **argv)
>  		goto unlock;
>  	}
>  
> +	/* Grow the cache */
> +	o2fsck_init_cache(ost, O2FSCK_CACHE_MODE_FULL);
> +
>  	/* allocate all this junk after we've replayed the journal and the
>  	 * sb should be stable */
>  	if (o2fsck_state_init(ost->ost_fs, ost)) {
> diff --git a/fsck.ocfs2/include/util.h b/fsck.ocfs2/include/util.h
> index 98fc24a..77d36e4 100644
> --- a/fsck.ocfs2/include/util.h
> +++ b/fsck.ocfs2/include/util.h
> @@ -37,6 +37,18 @@
>  #define FSCK_CANCELED    32     /* Aborted with a signal or ^C */
>  #define FSCK_LIBRARY     128    /* Shared library error */
>  
> +/* Managing the I/O cache */
> +enum o2fsck_cache_hint {
> +	O2FSCK_CACHE_MODE_NONE = 0,
> +	O2FSCK_CACHE_MODE_JOURNAL,	/* Enough of a cache to replay a
> +					   journal */
> +	O2FSCK_CACHE_MODE_FULL,		/* Enough of a cache to recover the
> +					   filesystem */
> +};
> +void o2fsck_init_cache(o2fsck_state *ost, enum o2fsck_cache_hint hint);
> +int o2fsck_worth_caching(int blocks_to_read);
> +void o2fsck_reset_blocks_cached(void);
> +
>  void o2fsck_write_inode(o2fsck_state *ost, uint64_t blkno,
>                          struct ocfs2_dinode *di);
>  void o2fsck_mark_cluster_allocated(o2fsck_state *ost, uint32_t cluster);
> diff --git a/fsck.ocfs2/util.c b/fsck.ocfs2/util.c
> index 86d5972..54ad322 100644
> --- a/fsck.ocfs2/util.c
> +++ b/fsck.ocfs2/util.c
> @@ -27,6 +27,7 @@
>   */
>  #include <inttypes.h>
>  #include <string.h>
> +#include <assert.h>
>  #include "ocfs2/ocfs2.h"
>  
>  #include "util.h"
> @@ -169,3 +170,107 @@ bail:
>  		ocfs2_free(&buf);
>  	return ret;
>  }
> +
> +/* Number of blocks available in the I/O cache */
> +static int cache_blocks;
> +/*
> + * Number of blocks we've currently cached.  This is an imperfect guess
> + * designed for pre-caching.  Code can keep slurping blocks until
> + * o2fsck_worth_caching() returns 0.
> + */
> +static int blocks_cached;
> +
> +void o2fsck_init_cache(o2fsck_state *ost, enum o2fsck_cache_hint hint)
> +{
> +	errcode_t ret;
> +	uint64_t blocks_wanted;
> +	int leave_room;
> +	ocfs2_filesys *fs = ost->ost_fs;
> +	int max_slots = OCFS2_RAW_SB(fs->fs_super)->s_max_slots;
> +
> +	switch (hint) {
> +		case O2FSCK_CACHE_MODE_FULL:
> +			leave_room = 1;
> +			blocks_wanted = fs->fs_blocks;
> +			break;
> +		case O2FSCK_CACHE_MODE_JOURNAL:
> +			/*
> +			 * We need enough blocks for all the journal
> +			 * data.  Let's guess at 256M journals.
> +			 */
> +			leave_room = 0;
> +			blocks_wanted = ocfs2_blocks_in_bytes(fs,
> +					max_slots * 1024 * 1024 * 256);
> +			break;
> +		case O2FSCK_CACHE_MODE_NONE:
> +			return;
> +		default:
> +			assert(0);
> +	}
> +
> +	verbosef("Want %"PRIu64" blocks for the I/O cache\n",
> +		 blocks_wanted);
> +	/*
> +	 * leave_room means that we don't want our cache to be taking
> +	 * all available memory.  So we try to get twice as much as we
> +	 * want; if that works, we know that getting exactly as much as
> +	 * we want is going to be safe.
> +	 */
> +	if (leave_room)
> +		blocks_wanted <<= 2;
>   

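This is 4 times what we want. The comment above says we try to get "twice as much as we want", but shifting left by 2 multiplies by four; doubling would be "blocks_wanted <<= 1".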

> +
> +	if (blocks_wanted > INT_MAX)
> +		blocks_wanted = INT_MAX;
> +
> +	while (blocks_wanted > 0) {
> +		io_destroy_cache(fs->fs_io);
> +		verbosef("Asking for %"PRIu64" blocks of I/O cache\n",
> +			 blocks_wanted);
> +		ret = io_init_cache(fs->fs_io, blocks_wanted);
> +		if (!ret) {
> +			/*
> +			 * We want to pin our cache; there's no point in
> +			 * having a large cache if half of it is in swap.
> +			 * However, some callers may not be privileged
> +			 * enough, so once we get down to a small enough
> +			 * number (512 blocks), we'll stop caring.
> +			 */
> +			ret = io_mlock_cache(fs->fs_io);
> +			if (ret && (blocks_wanted <= 512))
> +				ret = 0;
> +		}
> +		if (!ret) {
> +			verbosef("Got %"PRIu64" blocks\n", blocks_wanted);
> +			/*
> +			 * We've found an allocation that works.  If
> +			 * we're not leaving room, we're done.  But if
> +			 * we're leaving room, we clear leave_room and go
> +			 * around again.  We expect to succeed there.
> +			 */
> +			if (!leave_room) {
> +				cache_blocks = blocks_wanted;
> +				break;
> +			}
> +
> +			verbosef("Leaving room for other %s\n",
> +				 "allocations");
> +			leave_room = 0;
> +		}
> +
> +		blocks_wanted >>= 1;
> +	}
> +}
> +
> +int o2fsck_worth_caching(int blocks_to_read)
> +{
> +	if ((blocks_to_read + blocks_cached) > cache_blocks)
> +		return 0;
> +
> +	blocks_cached += blocks_to_read;
> +	return 1;
> +}
> +
> +void o2fsck_reset_blocks_cached(void)
> +{
> +	blocks_cached = 0;
> +}
>   




More information about the Ocfs2-tools-devel mailing list