[Ocfs2-tools-devel] [PATCH 10/13] fsck.ocfs2: Use the I/O cache.
Joel Becker
Joel.Becker at oracle.com
Tue Jun 2 22:21:09 PDT 2009
On Tue, Jun 02, 2009 at 09:45:49PM -0700, Sunil Mushran wrote:
> Possible typo in the patch.
Definite typo. Thanks.
Joel
> Signed-off-by: Sunil Mushran <sunil.mushran at oracle.com>
>
>
> Joel Becker wrote:
>> fsck.ocfs2 travels the filesystem multiple times. The I/O cache should
>> make this faster. Since read-write fsck is only allowed when there are
>> no other users or mounters of the device, the cache should be safe.
>>
>> We use two caches. First, we allocate a cache big enough for all the
>> journals. Since we don't know their size at the start, we guess the
>> default 256MB. The hope is that we cache the journal blocks on the
>> first pass when we check their contents and avoid having to re-read them
>> on the second pass when we replay them.
>>
>> Once the journals are replayed, we drop this cache and try to allocate a
>> cache equal to the number of blocks in the filesystem. This should,
>> hopefully, keep all of fsck in cache.
>>
>> We make sure to mlock() our cache, because it's pointless to swap out
>> cache data; we'd rather just read it from the device. Now, obviously,
>> we can't allocate and lock more memory than the system has available.
>> fsck will keep shrinking the cache size until it gets an allocation.
>>
>> For the main fsck operation, we don't just get the largest cache
>> available. We will need memory for the fsck accounting structures too.
>> fsck will start with a cache _larger_ than needed. If this
>> succeeds, fsck knows that the needed size is safe to allocate. fsck
>> will actually use a cache smaller than the largest cache it could get,
>> ensuring available memory.
>>
>> Signed-off-by: Joel Becker <joel.becker at oracle.com>
>> ---
>> fsck.ocfs2/fsck.c | 6 +++
>> fsck.ocfs2/include/util.h | 12 +++++
>> fsck.ocfs2/util.c | 105 +++++++++++++++++++++++++++++++++++++++++++++
>> 3 files changed, 123 insertions(+), 0 deletions(-)
>>
>> diff --git a/fsck.ocfs2/fsck.c b/fsck.ocfs2/fsck.c
>> index a686886..b269e75 100644
>> --- a/fsck.ocfs2/fsck.c
>> +++ b/fsck.ocfs2/fsck.c
>> @@ -837,6 +837,9 @@ int main(int argc, char **argv)
>> printf(" max slots: %u\n\n",
>> OCFS2_RAW_SB(ost->ost_fs->fs_super)->s_max_slots);
>> + /* Let's get enough of a cache to replay the journals */
>> + o2fsck_init_cache(ost, O2FSCK_CACHE_MODE_JOURNAL);
>> +
>> if (open_flags & OCFS2_FLAG_RW) {
>> ret = o2fsck_check_journals(ost);
>> if (ret) {
>> @@ -854,6 +857,9 @@ int main(int argc, char **argv)
>> goto unlock;
>> }
>> + /* Grow the cache */
>> + o2fsck_init_cache(ost, O2FSCK_CACHE_MODE_FULL);
>> +
>> /* allocate all this junk after we've replayed the journal and the
>> * sb should be stable */
>> if (o2fsck_state_init(ost->ost_fs, ost)) {
>> diff --git a/fsck.ocfs2/include/util.h b/fsck.ocfs2/include/util.h
>> index 98fc24a..77d36e4 100644
>> --- a/fsck.ocfs2/include/util.h
>> +++ b/fsck.ocfs2/include/util.h
>> @@ -37,6 +37,18 @@
>> #define FSCK_CANCELED 32 /* Aborted with a signal or ^C */
>> #define FSCK_LIBRARY 128 /* Shared library error */
>> +/* Managing the I/O cache */
>> +enum o2fsck_cache_hint {
>> + O2FSCK_CACHE_MODE_NONE = 0,
>> + O2FSCK_CACHE_MODE_JOURNAL, /* Enough of a cache to replay a
>> + journal */
>> + O2FSCK_CACHE_MODE_FULL, /* Enough of a cache to recover the
>> + filesystem */
>> +};
>> +void o2fsck_init_cache(o2fsck_state *ost, enum o2fsck_cache_hint hint);
>> +int o2fsck_worth_caching(int blocks_to_read);
>> +void o2fsck_reset_blocks_cached(void);
>> +
>> void o2fsck_write_inode(o2fsck_state *ost, uint64_t blkno,
>> struct ocfs2_dinode *di);
>> void o2fsck_mark_cluster_allocated(o2fsck_state *ost, uint32_t cluster);
>> diff --git a/fsck.ocfs2/util.c b/fsck.ocfs2/util.c
>> index 86d5972..54ad322 100644
>> --- a/fsck.ocfs2/util.c
>> +++ b/fsck.ocfs2/util.c
>> @@ -27,6 +27,7 @@
>> */
>> #include <inttypes.h>
>> #include <string.h>
>> +#include <assert.h>
>> #include "ocfs2/ocfs2.h"
>> #include "util.h"
>> @@ -169,3 +170,107 @@ bail:
>> ocfs2_free(&buf);
>> return ret;
>> }
>> +
>> +/* Number of blocks available in the I/O cache */
>> +static int cache_blocks;
>> +/*
>> + * Number of blocks we've currently cached. This is an imperfect guess
>> + * designed for pre-caching. Code can keep slurping blocks until
>> + * o2fsck_worth_caching() returns 0.
>> + */
>> +static int blocks_cached;
>> +
>> +void o2fsck_init_cache(o2fsck_state *ost, enum o2fsck_cache_hint hint)
>> +{
>> + errcode_t ret;
>> + uint64_t blocks_wanted;
>> + int leave_room;
>> + ocfs2_filesys *fs = ost->ost_fs;
>> + int max_slots = OCFS2_RAW_SB(fs->fs_super)->s_max_slots;
>> +
>> + switch (hint) {
>> + case O2FSCK_CACHE_MODE_FULL:
>> + leave_room = 1;
>> + blocks_wanted = fs->fs_blocks;
>> + break;
>> + case O2FSCK_CACHE_MODE_JOURNAL:
>> + /*
>> + * We need enough blocks for all the journal
>> + * data. Let's guess at 256M journals.
>> + */
>> + leave_room = 0;
>> + blocks_wanted = ocfs2_blocks_in_bytes(fs,
>> + max_slots * 1024 * 1024 * 256);
>> + break;
>> + case O2FSCK_CACHE_MODE_NONE:
>> + return;
>> + default:
>> + assert(0);
>> + }
>> +
>> + verbosef("Want %"PRIu64" blocks for the I/O cache\n",
>> + blocks_wanted);
>> + /*
>> + * leave_room means that we don't want our cache to be taking
>> + * all available memory. So we try to get twice as much as we
>> + * want; if that works, we know that getting exactly as much as
>> + * we want is going to be safe.
>> + */
>> + if (leave_room)
>> + blocks_wanted <<= 2;
>>
>
> This is 4 times what we want.
>
>> +
>> + if (blocks_wanted > INT_MAX)
>> + blocks_wanted = INT_MAX;
>> +
>> + while (blocks_wanted > 0) {
>> + io_destroy_cache(fs->fs_io);
>> + verbosef("Asking for %"PRIu64" blocks of I/O cache\n",
>> + blocks_wanted);
>> + ret = io_init_cache(fs->fs_io, blocks_wanted);
>> + if (!ret) {
>> + /*
>> + * We want to pin our cache; there's no point in
>> + * having a large cache if half of it is in swap.
>> + * However, some callers may not be privileged
>> + * enough, so once we get down to a small enough
>> + * number (512 blocks), we'll stop caring.
>> + */
>> + ret = io_mlock_cache(fs->fs_io);
>> + if (ret && (blocks_wanted <= 512))
>> + ret = 0;
>> + }
>> + if (!ret) {
>> + verbosef("Got %"PRIu64" blocks\n", blocks_wanted);
>> + /*
>> + * We've found an allocation that works. If
>> + * we're not leaving room, we're done. But if
>> + * we're leaving room, we clear leave_room and go
>> + * around again. We expect to succeed there.
>> + */
>> + if (!leave_room) {
>> + cache_blocks = blocks_wanted;
>> + break;
>> + }
>> +
>> + verbosef("Leaving room for other %s\n",
>> + "allocations");
>> + leave_room = 0;
>> + }
>> +
>> + blocks_wanted >>= 1;
>> + }
>> +}
>> +
>> +int o2fsck_worth_caching(int blocks_to_read)
>> +{
>> + if ((blocks_to_read + blocks_cached) > cache_blocks)
>> + return 0;
>> +
>> + blocks_cached += blocks_to_read;
>> + return 1;
>> +}
>> +
>> +void o2fsck_reset_blocks_cached(void)
>> +{
>> + blocks_cached = 0;
>> +}
>>
>
--
"Baby, even the losers
Get lucky sometimes.
Even the losers
Keep a little bit of pride."
Joel Becker
Principal Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127
More information about the Ocfs2-tools-devel
mailing list