[Ocfs2-tools-devel] [PATCH 10/13] fsck.ocfs2: Use the I/O cache.
Sunil Mushran
sunil.mushran at oracle.com
Tue Jun 2 21:45:49 PDT 2009
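Possible typo in the patch: the leave_room shift in o2fsck_init_cache() quadruples the cache size instead of doubling it (see the inline comment below).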
Signed-off-by: Sunil Mushran <sunil.mushran at oracle.com>
Joel Becker wrote:
> fsck.ocfs2 travels the filesystem multiple times. The I/O cache should
> make this faster. Since read-write fsck is only allowed when there are
> no other users or mounters of the device, the cache should be safe.
>
> We use two caches. First, we allocate a cache big enough for all the
> journals. Since we don't know their size at the start, we guess the
> default 256MB. The hope is that we cache the journal blocks on the
> first pass when we check their contents and avoid having to re-read them
> on the second pass when we replay them.
>
> Once the journals are replayed, we drop this cache and try to allocate a
> cache equal to the number of blocks in the filesystem. This should,
> hopefully, keep all of fsck in cache.
>
> We make sure to mlock() our cache, because it's pointless to swap out
> cache data; we'd rather just read it from the device. Now, obviously,
> we can't allocate and lock more memory than the system has available.
> fsck will keep shrinking the cache size until it gets an allocation.
>
> For the main fsck operation, we don't just get the largest cache
> available. We will need memory for the fsck accounting structures too.
> fsck will start with a cache _larger_ than needed. If this
> succeeds, fsck knows that the needed size is safe to allocate. fsck
> will actually use a cache smaller than the largest cache it could get,
> ensuring available memory.
>
> Signed-off-by: Joel Becker <joel.becker at oracle.com>
> ---
> fsck.ocfs2/fsck.c | 6 +++
> fsck.ocfs2/include/util.h | 12 +++++
> fsck.ocfs2/util.c | 105 +++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 123 insertions(+), 0 deletions(-)
>
> diff --git a/fsck.ocfs2/fsck.c b/fsck.ocfs2/fsck.c
> index a686886..b269e75 100644
> --- a/fsck.ocfs2/fsck.c
> +++ b/fsck.ocfs2/fsck.c
> @@ -837,6 +837,9 @@ int main(int argc, char **argv)
> printf(" max slots: %u\n\n",
> OCFS2_RAW_SB(ost->ost_fs->fs_super)->s_max_slots);
>
> + /* Let's get enough of a cache to replay the journals */
> + o2fsck_init_cache(ost, O2FSCK_CACHE_MODE_JOURNAL);
> +
> if (open_flags & OCFS2_FLAG_RW) {
> ret = o2fsck_check_journals(ost);
> if (ret) {
> @@ -854,6 +857,9 @@ int main(int argc, char **argv)
> goto unlock;
> }
>
> + /* Grow the cache */
> + o2fsck_init_cache(ost, O2FSCK_CACHE_MODE_FULL);
> +
> /* allocate all this junk after we've replayed the journal and the
> * sb should be stable */
> if (o2fsck_state_init(ost->ost_fs, ost)) {
> diff --git a/fsck.ocfs2/include/util.h b/fsck.ocfs2/include/util.h
> index 98fc24a..77d36e4 100644
> --- a/fsck.ocfs2/include/util.h
> +++ b/fsck.ocfs2/include/util.h
> @@ -37,6 +37,18 @@
> #define FSCK_CANCELED 32 /* Aborted with a signal or ^C */
> #define FSCK_LIBRARY 128 /* Shared library error */
>
> +/* Managing the I/O cache */
> +enum o2fsck_cache_hint {
> + O2FSCK_CACHE_MODE_NONE = 0,
> + O2FSCK_CACHE_MODE_JOURNAL, /* Enough of a cache to replay a
> + journal */
> + O2FSCK_CACHE_MODE_FULL, /* Enough of a cache to recover the
> + filesystem */
> +};
> +void o2fsck_init_cache(o2fsck_state *ost, enum o2fsck_cache_hint hint);
> +int o2fsck_worth_caching(int blocks_to_read);
> +void o2fsck_reset_blocks_cached(void);
> +
> void o2fsck_write_inode(o2fsck_state *ost, uint64_t blkno,
> struct ocfs2_dinode *di);
> void o2fsck_mark_cluster_allocated(o2fsck_state *ost, uint32_t cluster);
> diff --git a/fsck.ocfs2/util.c b/fsck.ocfs2/util.c
> index 86d5972..54ad322 100644
> --- a/fsck.ocfs2/util.c
> +++ b/fsck.ocfs2/util.c
> @@ -27,6 +27,7 @@
> */
> #include <inttypes.h>
> #include <string.h>
> +#include <assert.h>
> #include "ocfs2/ocfs2.h"
>
> #include "util.h"
> @@ -169,3 +170,107 @@ bail:
> ocfs2_free(&buf);
> return ret;
> }
> +
> +/* Number of blocks available in the I/O cache */
> +static int cache_blocks;
> +/*
> + * Number of blocks we've currently cached. This is an imperfect guess
> + * designed for pre-caching. Code can keep slurping blocks until
> + * o2fsck_worth_caching() returns 0.
> + */
> +static int blocks_cached;
> +
> +void o2fsck_init_cache(o2fsck_state *ost, enum o2fsck_cache_hint hint)
> +{
> + errcode_t ret;
> + uint64_t blocks_wanted;
> + int leave_room;
> + ocfs2_filesys *fs = ost->ost_fs;
> + int max_slots = OCFS2_RAW_SB(fs->fs_super)->s_max_slots;
> +
> + switch (hint) {
> + case O2FSCK_CACHE_MODE_FULL:
> + leave_room = 1;
> + blocks_wanted = fs->fs_blocks;
> + break;
> + case O2FSCK_CACHE_MODE_JOURNAL:
> + /*
> + * We need enough blocks for all the journal
> + * data. Let's guess at 256M journals.
> + */
> + leave_room = 0;
> + blocks_wanted = ocfs2_blocks_in_bytes(fs,
> + max_slots * 1024 * 1024 * 256);
> + break;
> + case O2FSCK_CACHE_MODE_NONE:
> + return;
> + default:
> + assert(0);
> + }
> +
> + verbosef("Want %"PRIu64" blocks for the I/O cache\n",
> + blocks_wanted);
> + /*
> + * leave_room means that we don't want our cache to be taking
> + * all available memory. So we try to get twice as much as we
> + * want; if that works, we know that getting exactly as much as
> + * we want is going to be safe.
> + */
> + if (leave_room)
> + blocks_wanted <<= 2;
>
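This is 4 times what we want, not twice: "<<= 2" multiplies by four, while the
comment above says we try to get twice as much. As written, once the probe
succeeds the loop halves blocks_wanted only once, so the cache we keep is twice
the wanted size. Should this be "<<= 1"?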
> +
> + if (blocks_wanted > INT_MAX)
> + blocks_wanted = INT_MAX;
> +
> + while (blocks_wanted > 0) {
> + io_destroy_cache(fs->fs_io);
> + verbosef("Asking for %"PRIu64" blocks of I/O cache\n",
> + blocks_wanted);
> + ret = io_init_cache(fs->fs_io, blocks_wanted);
> + if (!ret) {
> + /*
> + * We want to pin our cache; there's no point in
> + * having a large cache if half of it is in swap.
> + * However, some callers may not be privileged
> + * enough, so once we get down to a small enough
> + * number (512 blocks), we'll stop caring.
> + */
> + ret = io_mlock_cache(fs->fs_io);
> + if (ret && (blocks_wanted <= 512))
> + ret = 0;
> + }
> + if (!ret) {
> + verbosef("Got %"PRIu64" blocks\n", blocks_wanted);
> + /*
> + * We've found an allocation that works. If
> + * we're not leaving room, we're done. But if
> + * we're leaving room, we clear leave_room and go
> + * around again. We expect to succeed there.
> + */
> + if (!leave_room) {
> + cache_blocks = blocks_wanted;
> + break;
> + }
> +
> + verbosef("Leaving room for other %s\n",
> + "allocations");
> + leave_room = 0;
> + }
> +
> + blocks_wanted >>= 1;
> + }
> +}
> +
> +int o2fsck_worth_caching(int blocks_to_read)
> +{
> + if ((blocks_to_read + blocks_cached) > cache_blocks)
> + return 0;
> +
> + blocks_cached += blocks_to_read;
> + return 1;
> +}
> +
> +void o2fsck_reset_blocks_cached(void)
> +{
> + blocks_cached = 0;
> +}
>
More information about the Ocfs2-tools-devel
mailing list