[Ocfs2-tools-devel] [PATCH 10/13] fsck.ocfs2: Use the I/O cache.

Joel Becker Joel.Becker at oracle.com
Tue Jun 2 22:21:09 PDT 2009


On Tue, Jun 02, 2009 at 09:45:49PM -0700, Sunil Mushran wrote:
> Possible typo in the patch.

	Definite typo.  Thanks.

Joel

> Signed-off-by: Sunil Mushran <sunil.mushran at oracle.com>
>
>
> Joel Becker wrote:
>> fsck.ocfs2 travels the filesystem multiple times.  The I/O cache should
>> make this faster.  Since read-write fsck is only allowed when there are
>> no other users or mounters of the device, the cache should be safe.
>>
>> We use two caches.  First, we allocate a cache big enough for all the
>> journals.  Since we don't know their size at the start, we guess the
>> default 256MB.  The hope is that we cache the journal blocks on the
>> first pass when we check their contents and avoid having to re-read them
>> on the second pass when we replay them.
>>
>> Once the journals are replayed, we drop this cache and try to allocate a
>> cache equal to the number of blocks in the filesystem.  This should,
>> hopefully, keep all of fsck in cache.
>>
>> We make sure to mlock() our cache, because it's pointless to swap out
>> cache data; we'd rather just read it from the device.  Now, obviously,
>> we can't allocate and lock more memory than the system has available.
>> fsck will keep shrinking the cache size until it gets an allocation.
>>
>> For the main fsck operation, we don't just get the largest cache
>> available.  We will need memory for the fsck accounting structures too.
>> fsck will start with a cache _larger_ than needed.  If this
>> succeeds, fsck knows that the needed size is safe to allocate.  fsck
>> will actually use a cache smaller than the largest cache it could get,
>> ensuring available memory.
>>
>> Signed-off-by: Joel Becker <joel.becker at oracle.com>
>> ---
>>  fsck.ocfs2/fsck.c         |    6 +++
>>  fsck.ocfs2/include/util.h |   12 +++++
>>  fsck.ocfs2/util.c         |  105 +++++++++++++++++++++++++++++++++++++++++++++
>>  3 files changed, 123 insertions(+), 0 deletions(-)
>>
>> diff --git a/fsck.ocfs2/fsck.c b/fsck.ocfs2/fsck.c
>> index a686886..b269e75 100644
>> --- a/fsck.ocfs2/fsck.c
>> +++ b/fsck.ocfs2/fsck.c
>> @@ -837,6 +837,9 @@ int main(int argc, char **argv)
>>  	printf("  max slots:          %u\n\n",
>>  	       OCFS2_RAW_SB(ost->ost_fs->fs_super)->s_max_slots);
>>
>> +	/* Let's get enough of a cache to replay the journals */
>> +	o2fsck_init_cache(ost, O2FSCK_CACHE_MODE_JOURNAL);
>> +
>>  	if (open_flags & OCFS2_FLAG_RW) {
>>  		ret = o2fsck_check_journals(ost);
>>  		if (ret) {
>> @@ -854,6 +857,9 @@ int main(int argc, char **argv)
>>  		goto unlock;
>>  	}
>>
>> +	/* Grow the cache */
>> +	o2fsck_init_cache(ost, O2FSCK_CACHE_MODE_FULL);
>> +
>>  	/* allocate all this junk after we've replayed the journal and the
>>  	 * sb should be stable */
>>  	if (o2fsck_state_init(ost->ost_fs, ost)) {
>> diff --git a/fsck.ocfs2/include/util.h b/fsck.ocfs2/include/util.h
>> index 98fc24a..77d36e4 100644
>> --- a/fsck.ocfs2/include/util.h
>> +++ b/fsck.ocfs2/include/util.h
>> @@ -37,6 +37,18 @@
>>  #define FSCK_CANCELED    32     /* Aborted with a signal or ^C */
>>  #define FSCK_LIBRARY     128    /* Shared library error */
>>
>> +/* Managing the I/O cache */
>> +enum o2fsck_cache_hint {
>> +	O2FSCK_CACHE_MODE_NONE = 0,
>> +	O2FSCK_CACHE_MODE_JOURNAL,	/* Enough of a cache to replay a
>> +					   journal */
>> +	O2FSCK_CACHE_MODE_FULL,		/* Enough of a cache to recover the
>> +					   filesystem */
>> +};
>> +void o2fsck_init_cache(o2fsck_state *ost, enum o2fsck_cache_hint hint);
>> +int o2fsck_worth_caching(int blocks_to_read);
>> +void o2fsck_reset_blocks_cached(void);
>> +
>>  void o2fsck_write_inode(o2fsck_state *ost, uint64_t blkno,
>>                          struct ocfs2_dinode *di);
>>  void o2fsck_mark_cluster_allocated(o2fsck_state *ost, uint32_t cluster);
>> diff --git a/fsck.ocfs2/util.c b/fsck.ocfs2/util.c
>> index 86d5972..54ad322 100644
>> --- a/fsck.ocfs2/util.c
>> +++ b/fsck.ocfs2/util.c
>> @@ -27,6 +27,7 @@
>>   */
>>  #include <inttypes.h>
>>  #include <string.h>
>> +#include <assert.h>
>>  #include "ocfs2/ocfs2.h"
>>
>>  #include "util.h"
>> @@ -169,3 +170,107 @@ bail:
>>  		ocfs2_free(&buf);
>>  	return ret;
>>  }
>> +
>> +/* Number of blocks available in the I/O cache */
>> +static int cache_blocks;
>> +/*
>> + * Number of blocks we've currently cached.  This is an imperfect guess
>> + * designed for pre-caching.  Code can keep slurping blocks until
>> + * o2fsck_worth_caching() returns 0.
>> + */
>> +static int blocks_cached;
>> +
>> +void o2fsck_init_cache(o2fsck_state *ost, enum o2fsck_cache_hint hint)
>> +{
>> +	errcode_t ret;
>> +	uint64_t blocks_wanted;
>> +	int leave_room;
>> +	ocfs2_filesys *fs = ost->ost_fs;
>> +	int max_slots = OCFS2_RAW_SB(fs->fs_super)->s_max_slots;
>> +
>> +	switch (hint) {
>> +		case O2FSCK_CACHE_MODE_FULL:
>> +			leave_room = 1;
>> +			blocks_wanted = fs->fs_blocks;
>> +			break;
>> +		case O2FSCK_CACHE_MODE_JOURNAL:
>> +			/*
>> +			 * We need enough blocks for all the journal
>> +			 * data.  Let's guess at 256M journals.
>> +			 */
>> +			leave_room = 0;
>> +			blocks_wanted = ocfs2_blocks_in_bytes(fs,
>> +					max_slots * 1024 * 1024 * 256);
>> +			break;
>> +		case O2FSCK_CACHE_MODE_NONE:
>> +			return;
>> +		default:
>> +			assert(0);
>> +	}
>> +
>> +	verbosef("Want %"PRIu64" blocks for the I/O cache\n",
>> +		 blocks_wanted);
>> +	/*
>> +	 * leave_room means that we don't want our cache to be taking
>> +	 * all available memory.  So we try to get twice as much as we
>> +	 * want; if that works, we know that getting exactly as much as
>> +	 * we want is going to be safe.
>> +	 */
>> +	if (leave_room)
>> +		blocks_wanted <<= 2;
>>   
>
> This is 4 times what we want.
>
>> +
>> +	if (blocks_wanted > INT_MAX)
>> +		blocks_wanted = INT_MAX;
>> +
>> +	while (blocks_wanted > 0) {
>> +		io_destroy_cache(fs->fs_io);
>> +		verbosef("Asking for %"PRIu64" blocks of I/O cache\n",
>> +			 blocks_wanted);
>> +		ret = io_init_cache(fs->fs_io, blocks_wanted);
>> +		if (!ret) {
>> +			/*
>> +			 * We want to pin our cache; there's no point in
>> +			 * having a large cache if half of it is in swap.
>> +			 * However, some callers may not be privileged
>> +			 * enough, so once we get down to a small enough
>> +			 * number (512 blocks), we'll stop caring.
>> +			 */
>> +			ret = io_mlock_cache(fs->fs_io);
>> +			if (ret && (blocks_wanted <= 512))
>> +				ret = 0;
>> +		}
>> +		if (!ret) {
>> +			verbosef("Got %"PRIu64" blocks\n", blocks_wanted);
>> +			/*
>> +			 * We've found an allocation that works.  If
>> +			 * we're not leaving room, we're done.  But if
>> +			 * we're leaving room, we clear leave_room and go
>> +			 * around again.  We expect to succeed there.
>> +			 */
>> +			if (!leave_room) {
>> +				cache_blocks = blocks_wanted;
>> +				break;
>> +			}
>> +
>> +			verbosef("Leaving room for other %s\n",
>> +				 "allocations");
>> +			leave_room = 0;
>> +		}
>> +
>> +		blocks_wanted >>= 1;
>> +	}
>> +}
>> +
>> +int o2fsck_worth_caching(int blocks_to_read)
>> +{
>> +	if ((blocks_to_read + blocks_cached) > cache_blocks)
>> +		return 0;
>> +
>> +	blocks_cached += blocks_to_read;
>> +	return 1;
>> +}
>> +
>> +void o2fsck_reset_blocks_cached(void)
>> +{
>> +	blocks_cached = 0;
>> +}
>>   
>

-- 

"Baby, even the losers
 Get lucky sometimes.
 Even the losers
 Keep a little bit of pride."

Joel Becker
Principal Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127



More information about the Ocfs2-tools-devel mailing list