[Ocfs2-tools-devel] [PATCH 3/6] libocfs2: Large I/Os in the cache.

Thu May 21 14:26:05 PDT 2009

Our I/O cache is dumb.  It works one block at a time.  We really want
large I/Os to go out as single large I/Os, not block by block.

We change the write case to write the I/O first, as big as it can.  Then
it runs through each completed block and updates the cache.  If there
was a short write, it will still update the cache for the blocks that
were written.

The read code has even more smarts.  First, it checks to see if the
entire read is in cache.  If not, it does I/O from the start of the
first uncached block; it skips cached blocks at the front of the buffer.
Then it runs through each block and syncs the cache to the buffer.

We do the reads in 1MB hunks.  This gives us the opportunity to check
for cached blocks every megabyte.  Imagine a 10MB buffer with only one
uncached block - the very first one.  Doing it all at once will trigger
a 10MB read.  But doing it in 1MB hunks will read the first 1MB, then
discover the remaining 9MB are all in cache.

Signed-off-by: Joel Becker <joel.becker at oracle.com>
---
 libocfs2/unix_io.c |  191 +++++++++++++++++++++++++++++++++++++--------------
 1 files changed, 138 insertions(+), 53 deletions(-)

diff --git a/libocfs2/unix_io.c b/libocfs2/unix_io.c
index 411c3e1..4c6b759 100644
--- a/libocfs2/unix_io.c
+++ b/libocfs2/unix_io.c
@@ -49,6 +49,14 @@
 
 
 /*
+ * We do cached I/O in 1MB hunks, so we need this constant.
+ */
+#ifndef ONE_MEGABYTE
+# define ONE_MEGABYTE (1024 * 1024)
+#endif
+
+
+/*
  * The cache looks up blocks in two ways:
  *
  * 1) If it needs a new block, it gets one off of ic->ic_lru.  The blocks
@@ -83,6 +91,16 @@ struct _io_channel {
 	struct io_cache *io_cache;
 };
 
+/*
+ * We open code this because we don't have the ocfs2_filesys to call
+ * ocfs2_blocks_in_bytes().
+ */
+static inline int one_meg_of_blocks(io_channel *channel)
+{
+	int count = ONE_MEGABYTE + channel->io_blksize - 1;
+	return count / channel->io_blksize;
+}
+
 static errcode_t unix_io_read_block(io_channel *channel, int64_t blkno,
 				    int count, char *data)
 {
@@ -121,8 +139,9 @@ out:
 	return ret;
 }
 
-static errcode_t unix_io_write_block(io_channel *channel, int64_t blkno,
-				     int count, const char *data)
+static errcode_t unix_io_write_block_full(io_channel *channel, int64_t blkno,
+					  int count, const char *data,
+					  int *completed)
 {
 	int ret;
 	ssize_t size, tot, wr;
@@ -150,13 +169,19 @@ static errcode_t unix_io_write_block(io_channel *channel, int64_t blkno,
 
 	ret = 0;
 out:
+	if (completed)
+		*completed = tot / channel->io_blksize;
 	if (!ret && (tot != size))
 		ret = OCFS2_ET_SHORT_WRITE;
 
 	return ret;
 }
 
-
+static errcode_t unix_io_write_block(io_channel *channel, int64_t blkno,
+				     int count, const char *data)
+{
+	return unix_io_write_block_full(channel, blkno, count, data, NULL);
+}
 
 /*
  * See if the rbtree has a block for the given block number.
@@ -238,34 +263,76 @@ static struct io_cache_block *io_cache_pop_lru(struct io_cache *ic)
 	return icb;
 }
 
-static errcode_t io_cache_read_one_block(io_channel *channel, int64_t blkno,
-					 char *data)
+/*
+ * This relies on the fact that our cache is always up to date.  If a
+ * block is in the cache, the same thing is on disk.  Even if we re-read
+ * the disk block, we don't need to update the cache.  This allows us
+ * to look for optimal I/O sizes; it's better to issue one 1MB read of
+ * half-cached blocks than to read every other block individually.
+ */
+static errcode_t io_cache_read_blocks(io_channel *channel, int64_t blkno,
+				      int count, char *data)
 {
+	int i, good_blocks;
 	errcode_t ret = 0;
 	struct io_cache *ic = channel->io_cache;
 	struct io_cache_block *icb;
 
-	icb = io_cache_lookup(ic, blkno);
-	if (icb)
-		goto found;
-
-	/* Ok, this blkno isn't in the cache.  Steal something. */
-	icb = io_cache_pop_lru(ic);
-
 	/*
-	 * If the read fails, we leave the block at the end of the LRU
-	 * and out of the lookup tree.
+	 * Here we check two things:
+	 *
+	 * 1) Are all the blocks cached?  If so, we can skip I/O.
+	 * 2) If they are not all cached, we want to start our read at the
+	 *    first uncached blkno.
 	 */
-	ret = unix_io_read_block(channel, blkno, 1, icb->icb_buf);
-	if (ret)
-		goto out;
+	for (good_blocks = 0; good_blocks < count; good_blocks++) {
+		icb = io_cache_lookup(ic, blkno + good_blocks);
+		if (!icb)
+			break;
+	}
 
-	icb->icb_blkno = blkno;
-	io_cache_insert(ic, icb);
+	/* Read any blocks not in the cache */
+	if (good_blocks < count) {
+		ret = unix_io_read_block(channel, blkno + good_blocks,
+					 count - good_blocks,
+					 data + (channel->io_blksize *
+						 good_blocks));
+		if (ret)
+			goto out;
+	}
 
-found:
-	memcpy(data, icb->icb_buf, channel->io_blksize);
-	io_cache_seen(ic, icb);
+	/* Now we sync up the cache with the data buffer */
+	for (i = 0; i < count; i++, data += channel->io_blksize) {
+		icb = io_cache_lookup(ic, blkno + i);
+		if (i < good_blocks) {
+			/*
+			 * We skipped reading this because it was in the
+			 * cache.  Copy it to the data buffer.
+			 */
+			assert(icb);
+			memcpy(data, icb->icb_buf, channel->io_blksize);
+		} else if (!icb) {
+			/* Steal the LRU buffer */
+			icb = io_cache_pop_lru(ic);
+			icb->icb_blkno = blkno + i;
+			io_cache_insert(ic, icb);
+
+			/*
+			 * We did I/O into the data buffer, now update
+			 * the cache.
+			 */
+			memcpy(icb->icb_buf, data, channel->io_blksize);
+		}
+		/*
+		 * What about if ((i >= good_blocks) && icb)?  That means
+		 * we had the buffer in the cache, but we read it anyway
+		 * to get a single I/O.  Our cache guarantees that the
+		 * contents will match, so we just skip to marking the
+		 * buffer seen.
+		 */
+
+		io_cache_seen(ic, icb);
+	}
 
 out:
 	return ret;
@@ -275,60 +342,78 @@ static errcode_t io_cache_read_block(io_channel *channel, int64_t blkno,
 				     int count, char *data)
 
 {
-	int i;
+	int todo = one_meg_of_blocks(channel);
 	errcode_t ret = 0;
 
-	for (i = 0; i < count; i++, blkno++, data += channel->io_blksize) {
-		ret = io_cache_read_one_block(channel, blkno, data);
+	/*
+	 * We do this in one meg hunks so that each hunk has an
+	 * opportunity to be in cache, but we still get good throughput.
+	 */
+	while (count) {
+		if (todo > count)
+			todo = count;
+		ret = io_cache_read_blocks(channel, blkno, todo, data);
 		if (ret)
 			break;
+
+		blkno += todo;
+		count -= todo;
+		data += (channel->io_blksize * todo);
 	}
 
 	return ret;
 }
 
-static errcode_t io_cache_write_one_block(io_channel *channel,
-					  int64_t blkno, const char *data)
+/*
+ * This relies on the fact that our cache is always up to date.  If a
+ * block is in the cache, the same thing is on disk.  So here we'll write
+ * a whole stream and update the cache as needed.
+ */
+static errcode_t io_cache_write_blocks(io_channel *channel, int64_t blkno,
+				       int count, const char *data)
 {
+	int i, completed = 0;
 	errcode_t ret;
 	struct io_cache *ic = channel->io_cache;
 	struct io_cache_block *icb;
 
-	icb = io_cache_lookup(ic, blkno);
-	if (icb)
-		goto found;
-
-	/* Ok, this blkno isn't in the cache.  Steal something. */
-	icb = io_cache_pop_lru(ic);
-
-	icb->icb_blkno = blkno;
-	io_cache_insert(ic, icb);
+	/* Get the write out of the way */
+	ret = unix_io_write_block_full(channel, blkno, count, data,
+				       &completed);
 
-found:
-	memcpy(icb->icb_buf, data, channel->io_blksize);
-	io_cache_seen(ic, icb);
+	/*
+	 * Now we sync up the cache with the data buffer.  We have
+	 * to sync up I/O that completed, even if the entire I/O did not.
+	 */
+	for (i = 0; i < completed; i++, data += channel->io_blksize) {
+		icb = io_cache_lookup(ic, blkno + i);
+		if (!icb) {
+			/*
+			 * Steal the LRU buffer.  We can't error here, so
+			 * we can safely insert it before we copy the data.
+			 */
+			icb = io_cache_pop_lru(ic);
+			icb->icb_blkno = blkno + i;
+			io_cache_insert(ic, icb);
+		}
 
-	ret = unix_io_write_block(channel, blkno, 1, icb->icb_buf);
-	if (ret)
-		io_cache_disconnect(ic, icb);
+		memcpy(icb->icb_buf, data, channel->io_blksize);
+		io_cache_seen(ic, icb);
+	}
 
 	return ret;
 }
 
 static errcode_t io_cache_write_block(io_channel *channel, int64_t blkno,
 				      int count, const char *data)
-
 {
-	int i;
-	errcode_t ret = 0;
-
-	for (i = 0; i < count; i++, blkno++, data += channel->io_blksize) {
-		ret = io_cache_write_one_block(channel, blkno, data);
-		if (ret)
-			break;
-	}
-
-	return ret;
+	/*
+	 * Unlike io_cache_read_block(), we're going to do all of the
+	 * I/O no matter what.  We keep the separation of
+	 * io_cache_write_block() and io_cache_write_blocks() for
+	 * consistency.
+	 */
+	return io_cache_write_blocks(channel, blkno, count, data);
 }
 
 static void io_free_cache(struct io_cache *ic)
-- 
1.6.3