[Ocfs2-tools-devel] [PATCH 07/11] libocfs2: Add aio read support

Goldwyn Rodrigues rgoldwyn at gmail.com
Fri Sep 23 09:49:44 PDT 2011


On Thu, Sep 22, 2011 at 9:04 PM, Sunil Mushran <sunil.mushran at oracle.com> wrote:
> Added public function io_aio_read_blocks() that performs aio reads on the
> provided set of blocks. It is io cache friendly. One use case is to use this
> to warm the cache, which has proven to be very useful in fsck.
>
> Signed-off-by: Sunil Mushran <sunil.mushran at oracle.com>
> ---
>  debugfs.ocfs2/Makefile |    2 +-
>  extras/Makefile        |    2 +-
>  fsck.ocfs2/Makefile    |    2 +-
>  fswreck/Makefile       |    2 +-
>  include/ocfs2/ocfs2.h  |    9 ++++
>  libocfs2/unix_io.c     |  114 ++++++++++++++++++++++++++++++++++++++++++++++++
>  listuuid/Makefile      |    2 +-
>  mkfs.ocfs2/Makefile    |    2 +-
>  mount.ocfs2/Makefile   |    2 +-
>  mounted.ocfs2/Makefile |    2 +-
>  o2cb_ctl/Makefile      |    2 +-
>  o2image/Makefile       |    2 +-
>  o2info/Makefile        |    2 +-
>  ocfs2_hb_ctl/Makefile  |    2 +-
>  tunefs.ocfs2/Makefile  |    2 +-
>  15 files changed, 136 insertions(+), 13 deletions(-)
>
> diff --git a/debugfs.ocfs2/Makefile b/debugfs.ocfs2/Makefile
> index 556d284..d2ce1a9 100644
> --- a/debugfs.ocfs2/Makefile
> +++ b/debugfs.ocfs2/Makefile
> @@ -31,7 +31,7 @@ HFILES =                              \
>
>  OBJS = $(subst .c,.o,$(CFILES))
>
> -LIBOCFS2_LIBS = -L$(TOPDIR)/libocfs2 -locfs2
> +LIBOCFS2_LIBS = -L$(TOPDIR)/libocfs2 -locfs2 -laio
>  LIBO2CB_LIBS = -L$(TOPDIR)/libo2cb -lo2cb
>
>  MANS = debugfs.ocfs2.8
> diff --git a/extras/Makefile b/extras/Makefile
> index 2d30004..7f90404 100644
> --- a/extras/Makefile
> +++ b/extras/Makefile
> @@ -33,7 +33,7 @@ CHECK_METAECC_OBJS = $(subst .c,.o,$(CHECK_METAECC_CFILES))
>  RESIZE_SLOTMAP_OBJS = $(subst .c,.o,$(RESIZE_SLOTMAP_CFILES))
>
>  LIBOCFS2 = ../libocfs2/libocfs2.a
> -EXTRAS_LIBS = $(LIBOCFS2) $(COM_ERR_LIBS)
> +EXTRAS_LIBS = $(LIBOCFS2) $(COM_ERR_LIBS) -laio
>
>  find_hardlinks: $(FIND_HARDLINKS_OBJS) $(LIBOCFS2)
>        $(LINK) $(EXTRAS_LIBS)
> diff --git a/fsck.ocfs2/Makefile b/fsck.ocfs2/Makefile
> index f806ba6..36f9dbc 100644
> --- a/fsck.ocfs2/Makefile
> +++ b/fsck.ocfs2/Makefile
> @@ -8,7 +8,7 @@ SBIN_PROGRAMS = fsck.ocfs2
>  DEFINES += -DVERSION=\"$(VERSION)\"
>
>  INCLUDES = -I$(TOPDIR)/include -Iinclude
> -LIBOCFS2_LIBS = -L$(TOPDIR)/libocfs2 -locfs2
> +LIBOCFS2_LIBS = -L$(TOPDIR)/libocfs2 -locfs2 -laio
>  LIBOCFS2_DEPS = $(TOPDIR)/libocfs2/libocfs2.a
>  LIBO2DLM_LIBS = -L$(TOPDIR)/libo2dlm -lo2dlm $(DL_LIBS)
>  LIBO2DLM_DEPS = $(TOPDIR)/libo2dlm/libo2dlm.a
> diff --git a/fswreck/Makefile b/fswreck/Makefile
> index b1ee546..53c2dc0 100644
> --- a/fswreck/Makefile
> +++ b/fswreck/Makefile
> @@ -35,7 +35,7 @@ DIST_RULES = dist-subdircreate
>
>  OBJS = $(subst .c,.o,$(CFILES))
>
> -LIBOCFS2_LIBS = -L$(TOPDIR)/libocfs2 -locfs2
> +LIBOCFS2_LIBS = -L$(TOPDIR)/libocfs2 -locfs2 -laio
>  LIBOCFS2_DEPS = $(TOPDIR)/libocfs2/libocfs2.a
>
>  LIBO2DLM_LIBS = -L$(TOPDIR)/libo2dlm -lo2dlm $(DL_LIBS)
> diff --git a/include/ocfs2/ocfs2.h b/include/ocfs2/ocfs2.h
> index 9fcb2ce..c44b764 100644
> --- a/include/ocfs2/ocfs2.h
> +++ b/include/ocfs2/ocfs2.h
> @@ -354,6 +354,15 @@ errcode_t io_share_cache(io_channel *from, io_channel *to);
>  errcode_t io_mlock_cache(io_channel *channel);
>  void io_destroy_cache(io_channel *channel);
>
> +
> +struct io_aio_unit {
> +       int64_t         aio_blkno;
> +       char            *aio_buf;
> +};
> +
> +errcode_t io_aio_read_blocks(io_channel *channel, struct io_aio_unit *aios,
> +                            int count);
> +
>  errcode_t ocfs2_read_super(ocfs2_filesys *fs, uint64_t superblock, char *sb);
>  /* Writes the main superblock at OCFS2_SUPER_BLOCK_BLKNO */
>  errcode_t ocfs2_write_primary_super(ocfs2_filesys *fs);
> diff --git a/libocfs2/unix_io.c b/libocfs2/unix_io.c
> index a805ffc..369fc0f 100644
> --- a/libocfs2/unix_io.c
> +++ b/libocfs2/unix_io.c
> @@ -42,6 +42,7 @@
>  #include <sys/resource.h>
>  #include <sys/utsname.h>
>  #include <linux/fs.h>
> +#include <libaio.h>
>  #endif
>  #include <sys/mman.h>
>  #include <inttypes.h>
> @@ -119,6 +120,66 @@ static inline int one_meg_of_blocks(io_channel *channel)
>        return count / channel->io_blksize;
>  }
>
> +static errcode_t unix_aio_read_blocks(io_channel *channel,
> +                                     struct io_aio_unit *aios, int count)
> +{
> +       int i;
> +       int ret;
> +       io_context_t io_ctx;
> +       struct iocb *iocb = NULL, **iocbs = NULL;
> +       struct io_event *events = NULL;
> +       int64_t offset;
> +       int submitted, completed = 0;
> +
> +       ret = OCFS2_ET_NO_MEMORY;
> +       iocb = malloc((sizeof(struct iocb) * count));
> +       iocbs = malloc((sizeof(struct iocb *) * count));
> +       events = malloc((sizeof(struct io_event) * count));
> +       if (!iocb || !iocbs || !events)
> +               goto out;
> +
> +       memset(&io_ctx, 0, sizeof(io_ctx));
> +       ret = io_queue_init(count, &io_ctx);
> +       if (ret)
> +               return ret;
> +
> +       for (i = 0; i < count; ++i) {
> +               offset = aios[i].aio_blkno * channel->io_blksize;
> +               io_prep_pread(&(iocb[i]), channel->io_fd,
> +                             aios[i].aio_buf,
> +                             channel->io_blksize, offset);
> +               iocbs[i] = &iocb[i];
> +       }
> +
> +resubmit:
> +       ret = io_submit(io_ctx, count - completed, &iocbs[completed]);
> +       if (!ret && (count - completed))
> +               ret = OCFS2_ET_SHORT_READ;
> +       if (ret < 0)
> +               goto out;
> +       submitted = ret;
> +
> +       ret = io_getevents(io_ctx, submitted, submitted, events, NULL);
> +       if (ret < 0)
> +               goto out;
> +
> +       completed += submitted;
> +       if (completed < count)
> +               goto resubmit;

You can replace the resubmit label and goto with a while loop.

However, are you using the full potential of asynchronous reads?
This seems like a function for performing bulk I/O of different
blocks.
By collecting the results immediately, you block until all the
submitted I/O has completed. How about breaking the whole thing into
two parts, i.e. submission and collection? Submit the I/O when you
know what is going to be read, and collect the events when you
actually need them.

Taking the example of inodes, you can io_submit the inode blocks when
you read the inode_alloc file in pass 0, and io_getevents in pass 1
when you actually need them.

> +
> +out:
> +       if (ret >= 0)
> +               ret = 0;
> +       if (!ret)
> +               channel->io_bytes_read += (count * channel->io_blksize);
> +       free(iocb);
> +       free(iocbs);
> +       free(events);
> +       io_queue_release(io_ctx);
> +
> +       return ret;
> +}
> +
>  static errcode_t unix_io_read_block(io_channel *channel, int64_t blkno,
>                                    int count, char *data)
>  {
> @@ -299,6 +360,49 @@ static struct io_cache_block *io_cache_pop_lru(struct io_cache *ic)
>  }
>
>  /*
> + * Unlike its sync counterpart, this function issues ios even for cached blocks.
> + */
> +static errcode_t io_cache_aio_read_blocks(io_channel *channel,
> +                                         struct io_aio_unit *aios,
> +                                         int count, bool nocache)
> +{
> +       struct io_cache *ic = channel->io_cache;
> +       struct io_cache_block *icb;
> +       errcode_t ret = 0;
> +       int i;
> +
> +       /*
> +        * Read all blocks. We could extend this to not issue ios for already
> +        * cached blocks. But is it worth the effort?
> +        */
> +       ret = unix_aio_read_blocks(channel, aios, count);
> +       if (ret)
> +               goto out;
> +
> +       /* refresh cache */
> +       for (i = 0; i < count; i++) {
> +               icb = io_cache_lookup(ic, aios[i].aio_blkno);
> +               if (!icb) {
> +                       if (nocache)
> +                               continue;
> +                       icb = io_cache_pop_lru(ic);
> +                       icb->icb_blkno = aios[i].aio_blkno;
> +                       io_cache_insert(ic, icb);
> +               }
> +
> +               memcpy(icb->icb_buf, aios[i].aio_buf, channel->io_blksize);
> +
> +               if (nocache)
> +                       io_cache_unsee(ic, icb);
> +               else
> +                       io_cache_seen(ic, icb);
> +       }
> +
> +out:
> +       return ret;
> +}
> +
> +/*
>  * This relies on the fact that our cache is always up to date.  If a
>  * block is in the cache, the same thing is on disk.  Even if we re-read
>  * the disk block, we don't need to update the cache.  This allows us
> @@ -822,6 +926,16 @@ void io_set_nocache(io_channel *channel, bool nocache)
>        channel->io_nocache = nocache;
>  }
>
> +errcode_t io_aio_read_blocks(io_channel *channel, struct io_aio_unit *aios,
> +                            int count)
> +{
> +       if (channel->io_cache)
> +               return io_cache_aio_read_blocks(channel, aios, count,
> +                                               channel->io_nocache);
> +       else
> +               return unix_aio_read_blocks(channel, aios, count);
> +}
> +
>  errcode_t io_read_block(io_channel *channel, int64_t blkno, int count,
>                        char *data)
>  {
> diff --git a/listuuid/Makefile b/listuuid/Makefile
> index 784e804..cda4232 100644
> --- a/listuuid/Makefile
> +++ b/listuuid/Makefile
> @@ -4,7 +4,7 @@ include $(TOPDIR)/Preamble.make
>
>  INCLUDES = -I$(TOPDIR)/include
>
> -LIBOCFS2_LIBS = -L$(TOPDIR)/libocfs2 -locfs2
> +LIBOCFS2_LIBS = -L$(TOPDIR)/libocfs2 -locfs2 -laio
>  LIBOCFS2_DEPS = $(TOPDIR)/libocfs2/libocfs2.a
>
>  LIBO2DLM_LIBS = -L$(TOPDIR)/libo2dlm -lo2dlm $(DL_LIBS)
> diff --git a/mkfs.ocfs2/Makefile b/mkfs.ocfs2/Makefile
> index b80b8b7..179b145 100644
> --- a/mkfs.ocfs2/Makefile
> +++ b/mkfs.ocfs2/Makefile
> @@ -5,7 +5,7 @@ include $(TOPDIR)/Preamble.make
>  sbindir = $(root_sbindir)
>  SBIN_PROGRAMS = mkfs.ocfs2
>
> -LIBOCFS2_LIBS = -L$(TOPDIR)/libocfs2 -locfs2
> +LIBOCFS2_LIBS = -L$(TOPDIR)/libocfs2 -locfs2 -laio
>  LIBOCFS2_DEPS = $(TOPDIR)/libocfs2/libocfs2.a
>
>  LIBO2CB_LIBS = -L$(TOPDIR)/libo2cb -lo2cb
> diff --git a/mount.ocfs2/Makefile b/mount.ocfs2/Makefile
> index 1f0e688..7b43bd0 100644
> --- a/mount.ocfs2/Makefile
> +++ b/mount.ocfs2/Makefile
> @@ -6,7 +6,7 @@ sbindir = $(root_sbindir)
>  SBIN_PROGRAMS = mount.ocfs2
>
>  INCLUDES = -I$(TOPDIR)/include
> -LIBOCFS2_LIBS = -L$(TOPDIR)/libocfs2 -locfs2
> +LIBOCFS2_LIBS = -L$(TOPDIR)/libocfs2 -locfs2 -laio
>  LIBOCFS2_DEPS = $(TOPDIR)/libocfs2/libocfs2.a
>  LIBO2DLM_LIBS = -L$(TOPDIR)/libo2dlm -lo2dlm $(DL_LIBS)
>  LIBO2DLM_DEPS = $(TOPDIR)/libo2dlm/libo2dlm.a
> diff --git a/mounted.ocfs2/Makefile b/mounted.ocfs2/Makefile
> index e63414a..039cf48 100644
> --- a/mounted.ocfs2/Makefile
> +++ b/mounted.ocfs2/Makefile
> @@ -2,7 +2,7 @@ TOPDIR = ..
>
>  include $(TOPDIR)/Preamble.make
>
> -LIBOCFS2_LIBS = -L$(TOPDIR)/libocfs2 -locfs2
> +LIBOCFS2_LIBS = -L$(TOPDIR)/libocfs2 -locfs2 -laio
>  LIBOCFS2_DEPS = $(TOPDIR)/libocfs2/libocfs2.a
>
>  LIBO2DLM_LIBS = -L$(TOPDIR)/libo2dlm -lo2dlm $(DL_LIBS)
> diff --git a/o2cb_ctl/Makefile b/o2cb_ctl/Makefile
> index 0db99c6..9c73d45 100644
> --- a/o2cb_ctl/Makefile
> +++ b/o2cb_ctl/Makefile
> @@ -10,7 +10,7 @@ INCLUDES = -I$(TOPDIR)/include
>  LIBTOOLS_INTERNAL_LIBS = -L$(TOPDIR)/libtools-internal -ltools-internal
>  LIBTOOLS_INTERNAL_DEPS = $(TOPDIR)/libtools-internal/libtools-internal.a
>
> -LIBOCFS2_LIBS = -L$(TOPDIR)/libocfs2 -locfs2
> +LIBOCFS2_LIBS = -L$(TOPDIR)/libocfs2 -locfs2 -laio
>  LIBOCFS2_DEPS = $(TOPDIR)/libocfs2/libocfs2.a
>
>  LIBO2CB_LIBS  = -L$(TOPDIR)/libo2cb -lo2cb
> diff --git a/o2image/Makefile b/o2image/Makefile
> index eed2e0d..491ceec 100644
> --- a/o2image/Makefile
> +++ b/o2image/Makefile
> @@ -7,7 +7,7 @@ WARNINGS = -Wall -Wstrict-prototypes -Wno-format -Wmissing-prototypes \
>
>  CFLAGS += $(WARNINGS)
>
> -LIBOCFS2_LIBS = -L$(TOPDIR)/libocfs2 -locfs2
> +LIBOCFS2_LIBS = -L$(TOPDIR)/libocfs2 -locfs2 -laio
>  LIBOCFS2_DEPS = $(TOPDIR)/libocfs2/libocfs2.a
>
>  LIBO2DLM_LIBS = -L$(TOPDIR)/libo2dlm -lo2dlm
> diff --git a/o2info/Makefile b/o2info/Makefile
> index 579223a..25e4b3d 100644
> --- a/o2info/Makefile
> +++ b/o2info/Makefile
> @@ -10,7 +10,7 @@ CFLAGS += $(WARNINGS)
>  LIBTOOLS_INTERNAL_LIBS = -L$(TOPDIR)/libtools-internal -ltools-internal
>  LIBTOOLS_INTERNAL_DEPS = $(TOPDIR)/libtools-internal/libtools-internal.a
>
> -LIBOCFS2_LIBS = -L$(TOPDIR)/libocfs2 -locfs2
> +LIBOCFS2_LIBS = -L$(TOPDIR)/libocfs2 -locfs2 -laio
>  LIBOCFS2_DEPS = $(TOPDIR)/libocfs2/libocfs2.a
>
>  BIN_PROGRAMS = o2info
> diff --git a/ocfs2_hb_ctl/Makefile b/ocfs2_hb_ctl/Makefile
> index 0e1f583..e52d422 100644
> --- a/ocfs2_hb_ctl/Makefile
> +++ b/ocfs2_hb_ctl/Makefile
> @@ -6,7 +6,7 @@ sbindir = $(root_sbindir)
>  SBIN_PROGRAMS = ocfs2_hb_ctl
>
>  INCLUDES = -I$(TOPDIR)/include
> -LIBOCFS2_LIBS = -L$(TOPDIR)/libocfs2 -locfs2
> +LIBOCFS2_LIBS = -L$(TOPDIR)/libocfs2 -locfs2 -laio
>  LIBOCFS2_DEPS = $(TOPDIR)/libocfs2/libocfs2.a
>  LIBO2DLM_LIBS = -L$(TOPDIR)/libo2dlm -lo2dlm $(DL_LIBS)
>  LIBO2DLM_DEPS = $(TOPDIR)/libo2dlm/libo2dlm.a
> diff --git a/tunefs.ocfs2/Makefile b/tunefs.ocfs2/Makefile
> index 3847d0f..81cf108 100644
> --- a/tunefs.ocfs2/Makefile
> +++ b/tunefs.ocfs2/Makefile
> @@ -5,7 +5,7 @@ include $(TOPDIR)/Preamble.make
>  LIBTOOLS_INTERNAL_LIBS = -L$(TOPDIR)/libtools-internal -ltools-internal
>  LIBTOOLS_INTERNAL_DEPS = $(TOPDIR)/libtools-internal/libtools-internal.a
>
> -LIBOCFS2_LIBS = -L$(TOPDIR)/libocfs2 -locfs2
> +LIBOCFS2_LIBS = -L$(TOPDIR)/libocfs2 -locfs2 -laio
>  LIBOCFS2_DEPS = $(TOPDIR)/libocfs2/libocfs2.a
>
>  LIBO2DLM_LIBS = -L$(TOPDIR)/libo2dlm -lo2dlm $(DL_LIBS)
> --
> 1.7.4.1
>
>
> _______________________________________________
> Ocfs2-tools-devel mailing list
> Ocfs2-tools-devel at oss.oracle.com
> http://oss.oracle.com/mailman/listinfo/ocfs2-tools-devel
>



-- 
Goldwyn



More information about the Ocfs2-tools-devel mailing list