[Ocfs2-devel] [PATCH 4/4] ocfs2-1.6: add splice read/write support

Tiger Yang tiger.yang at oracle.com
Fri Mar 12 01:07:36 PST 2010


This patch copies the splice code from mainline 2.6.29
to support splice I/O on enterprise
kernels based on 2.6.18.

Signed-off-by: Tiger Yang <tiger.yang at oracle.com>
---
 Config.make.in               |    1 +
 configure.in                 |   10 ++-
 fs/ocfs2/Makefile            |    7 +
 fs/ocfs2/compat_splice.c     |  251 ++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/file.c              |   90 ++++-----------
 fs/ocfs2/xattr.c             |    2 +-
 kapi-compat/include/splice.h |   21 ++++
 7 files changed, 314 insertions(+), 68 deletions(-)
 create mode 100644 fs/ocfs2/compat_splice.c
 create mode 100644 kapi-compat/include/splice.h

diff --git a/Config.make.in b/Config.make.in
index 6c556d6..adebda0 100644
--- a/Config.make.in
+++ b/Config.make.in
@@ -81,6 +81,7 @@ HAS_FOPS_SENDFILE = @HAS_FOPS_SENDFILE@
 NO_CONFIRM_IN_PIPE_OPERATIONS = @NO_CONFIRM_IN_PIPE_OPERATIONS@
 NO_INODE_DOUBLE_LOCK = @NO_INODE_DOUBLE_LOCK@
 NO_FILE_REMOVE_SUID = @NO_FILE_REMOVE_SUID@
+NO_SPLICE_HEADER = @NO_SPLICE_HEADER@
 SKIP_SPLICE = @SKIP_SPLICE@
 SKIP_BUFFER_TRIGGERS = @SKIP_BUFFER_TRIGGERS@
 NO_NAME_IN_BACKING_DEV_INFO=@NO_NAME_IN_BACKING_DEV_INFO@
diff --git a/configure.in b/configure.in
index 36a1867..c71fc00 100644
--- a/configure.in
+++ b/configure.in
@@ -432,9 +432,15 @@ OCFS2_CHECK_KERNEL([file_remove_suid() in fs.h], fs.h,
 AC_SUBST(NO_FILE_REMOVE_SUID)
 KAPI_COMPAT_HEADERS="$KAPI_COMPAT_HEADERS $NO_FILE_REMOVE_SUID"
 
+NO_SPLICE_HEADER=
+OCFS2_CHECK_KERNEL([struct splice_desc in splice.h], splice.h,
+ , NO_SPLICE_HEADER=splice.h, [^struct splice_desc ])
+AC_SUBST(NO_SPLICE_HEADER)
+KAPI_COMPAT_HEADERS="$KAPI_COMPAT_HEADERS $NO_SPLICE_HEADER"
+
 SKIP_SPLICE=
-OCFS2_CHECK_KERNEL([splice.h], splice.h,
-  , SKIP_SPLICE=yes, [struct splice_desc {])
+OCFS2_CHECK_KERNEL([splice_read() in fs.h], fs.h,
+ , SKIP_SPLICE=yes, [ssize_t (\*splice_read)])
 AC_SUBST(SKIP_SPLICE)
 
 mnt_want_write=
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 866f87e..13a9d07 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -124,8 +124,15 @@ ifdef NO_FILE_REMOVE_SUID
 EXTRA_CFLAGS += -DNO_FILE_REMOVE_SUID
 endif
 
+ifdef NO_SPLICE_HEADER
+EXTRA_CFLAGS += -DNO_SPLICE_HEADER
+endif
+
+COMPAT_SOURCES += compat_splice.c
 ifdef SKIP_SPLICE
 EXTRA_CFLAGS += -DSKIP_SPLICE
+else
+FS_SOURCES += compat_splice.c
 endif
 
 ifdef SKIP_BUFFER_TRIGGERS
diff --git a/fs/ocfs2/compat_splice.c b/fs/ocfs2/compat_splice.c
new file mode 100644
index 0000000..9533d37
--- /dev/null
+++ b/fs/ocfs2/compat_splice.c
@@ -0,0 +1,251 @@
+/*
+ * compat_splice.c
+ *
+ * This code has been copied from mainline linux kernel 2.6.29
+ * to allow ocfs2 to build against older kernels. For license,
+ * refer to fs/splice.c in mainline linux kernel.
+ */
+
+void pipe_wait(struct pipe_inode_info *pipe)
+{
+	DEFINE_WAIT(wait);
+
+	/*
+	 * Interruptible sleep on pipe->wait; the pipe inode's i_mutex (if
+	 * the pipe has an inode) is released across schedule():
+	 */
+	prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
+	if (pipe->inode)
+		mutex_unlock(&pipe->inode->i_mutex);
+	schedule();
+	finish_wait(&pipe->wait, &wait);
+	if (pipe->inode)
+		mutex_lock(&pipe->inode->i_mutex);
+}
+
+/*
+ * This is a little more tricky than the file -> pipe splicing. There are
+ * basically three cases:
+ *
+ *	- Destination page already exists in the address space and there
+ *	  are users of it. For that case we have no other option than
+ *	  copying the data. Tough luck.
+ *	- Destination page already exists in the address space, but there
+ *	  are no users of it. Make sure it's uptodate, then drop it. Fall
+ *	  through to last case.
+ *	- Destination page does not exist, we can add the pipe page to
+ *	  the page cache and avoid the copy.
+ *
+ * If asked to move pages to the output file (SPLICE_F_MOVE is set in
+ * sd->flags), we attempt to migrate pages from the pipe to the output
+ * file address space page cache. This is possible if no one else has
+ * the pipe page referenced outside of the pipe and page cache. If
+ * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
+ * a new page in the output file page cache and fill/dirty that.
+ */
+static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
+			struct splice_desc *sd)
+{
+	struct file *file = sd_file(sd);
+	struct address_space *mapping = file->f_mapping;
+	unsigned int offset, this_len;
+	struct page *page;
+	void *fsdata;
+	int ret;
+
+	/*
+	 * Make sure the data in this pipe buffer is up to date before using it.
+	 */
+	ret = buf->ops->kapi_confirm(pipe, buf);
+	if (unlikely(ret))
+		return ret;
+
+	offset = sd->pos & ~PAGE_CACHE_MASK;
+
+	this_len = sd->len;
+	if (this_len + offset > PAGE_CACHE_SIZE)
+		this_len = PAGE_CACHE_SIZE - offset;
+
+	ret = pagecache_write_begin(file, mapping, sd->pos, this_len,
+				AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
+	if (unlikely(ret))
+		goto out;
+
+	if (buf->page != page) {
+		/*
+		 * Careful, ->map() uses KM_USER0! The target page takes KM_USER1.
+		 */
+		char *src = buf->ops->map(pipe, buf, 1);
+		char *dst = kmap_atomic(page, KM_USER1);
+
+		memcpy(dst + offset, src + buf->offset, this_len);
+		flush_dcache_page(page);
+		kunmap_atomic(dst, KM_USER1);
+		buf->ops->unmap(pipe, buf, src);
+	}
+	ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len,
+				page, fsdata);
+out:
+	return ret;
+}
+
+/**
+ * __splice_from_pipe - splice data from a pipe to given actor
+ * @pipe:	pipe to splice from
+ * @sd:		state handed to @actor; its len, pos and total_len are updated
+ * @actor:	handler that splices the data
+ *
+ * Description:
+ *    This function does little more than loop over the pipe and call
+ *    @actor to do the actual moving of a single struct pipe_buffer to
+ *    the desired destination. pipe_to_file is the actor used in this file.
+ *    Returns the bytes moved; when nothing was moved: 0, the actor's error
+ *    (not -ENODATA), -EAGAIN (SPLICE_F_NONBLOCK) or -ERESTARTSYS (signal).
+ */
+ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
+			   splice_actor *actor)
+{
+	int ret, do_wakeup, err;
+
+	ret = 0;
+	do_wakeup = 0;
+
+	for (;;) {
+		if (pipe->nrbufs) {
+			struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
+			const struct pipe_buf_operations *ops = buf->ops;
+
+			sd->len = buf->len;
+			if (sd->len > sd->total_len)
+				sd->len = sd->total_len;
+
+			err = actor(pipe, buf, sd);
+			if (err <= 0) {
+				if (!ret && err != -ENODATA)
+					ret = err;
+
+				break;
+			}
+
+			ret += err;
+			buf->offset += err;
+			buf->len -= err;
+
+			sd->len -= err;
+			sd->pos += err;
+			sd->total_len -= err;
+			if (sd->len)
+				continue;
+
+			if (!buf->len) {
+				buf->ops = NULL;
+				ops->release(pipe, buf);
+				pipe->curbuf = (pipe->curbuf + 1) &
+						(PIPE_BUFFERS - 1);
+				pipe->nrbufs--;
+				if (pipe->inode)
+					do_wakeup = 1;
+			}
+
+			if (!sd->total_len)
+				break;
+		}
+
+		if (pipe->nrbufs)
+			continue;
+		if (!pipe->writers)
+			break;
+		if (!pipe->waiting_writers) {
+			if (ret)
+				break;
+		}
+
+		if (sd->flags & SPLICE_F_NONBLOCK) {
+			if (!ret)
+				ret = -EAGAIN;
+			break;
+		}
+
+		if (signal_pending(current)) {
+			if (!ret)
+				ret = -ERESTARTSYS;
+			break;
+		}
+
+		if (do_wakeup) {
+			smp_mb();
+			if (waitqueue_active(&pipe->wait))
+				wake_up_interruptible_sync(&pipe->wait);
+			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
+			do_wakeup = 0;
+		}
+
+		pipe_wait(pipe);
+	}
+
+	if (do_wakeup) {
+		smp_mb();
+		if (waitqueue_active(&pipe->wait))
+			wake_up_interruptible(&pipe->wait);
+		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
+	}
+
+	return ret;
+}
+
+/**
+ * generic_file_splice_write_nolock - generic_file_splice_write without mutexes
+ * @pipe:	pipe info
+ * @out:	file to write to
+ * @ppos:	position in @out; advanced by the number of bytes spliced
+ * @len:	number of bytes to splice
+ * @flags:	splice modifier flags
+ *
+ * Description:
+ *    Will either move or copy pages (determined by @flags options) from
+ *    the given pipe inode to the given file. The caller is responsible
+ *    for acquiring i_mutex on both inodes.
+ *    Returns the number of bytes spliced, or a negative error code.
+ */
+ssize_t
+generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out,
+				 loff_t *ppos, size_t len, unsigned int flags)
+{
+	struct address_space *mapping = out->f_mapping;
+	struct inode *inode = mapping->host;
+	struct splice_desc sd = {
+		.total_len = len,
+		.flags = flags,
+		.pos = *ppos,
+	};
+	ssize_t ret;
+	int err;
+
+	sd_file(&sd) = out;
+	err = file_remove_suid(out);
+	if (unlikely(err))
+		return err;
+
+	ret = __splice_from_pipe(pipe, &sd, pipe_to_file);
+	if (ret > 0) {
+		unsigned long nr_pages;
+
+		*ppos += ret;
+		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+		/*
+		 * If file or inode is SYNC and we actually wrote some data,
+		 * sync it; a sync failure replaces the byte count as the result.
+		 */
+		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
+			err = generic_osync_inode(inode, mapping,
+						  OSYNC_METADATA|OSYNC_DATA);
+
+			if (err)
+				ret = err;
+		}
+		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
+	}
+
+	return ret;
+}
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index ed8a49d..6c1af9e 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -31,8 +31,10 @@
 #include <linux/pagemap.h>
 #include <linux/uio.h>
 #include <linux/sched.h>
-#ifndef SKIP_SPLICE
+#ifndef NO_SPLICE_HEADER
 #include <linux/splice.h>
+#else
+#include <linux/pipe_fs_i.h>
 #endif
 #include <linux/mount.h>
 #include <linux/writeback.h>
@@ -2077,22 +2079,7 @@ out_sems:
 }
 
 #ifndef SKIP_SPLICE
-static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
-				struct file *out,
-				struct splice_desc *sd)
-{
-	int ret;
-
-	ret = ocfs2_prepare_inode_for_write(filp_dentry(out), &sd->pos,
-					    sd->total_len, 0, NULL, NULL);
-	if (ret < 0) {
-		mlog_errno(ret);
-		return ret;
-	}
-
-	return splice_from_pipe_feed(pipe, sd, pipe_to_file);
-}
-
+/* These two functions have been copied from mainline linux kernel 2.6.29. */
 static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
 				       struct file *out,
 				       loff_t *ppos,
@@ -2100,61 +2087,34 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
 				       unsigned int flags)
 {
 	int ret;
-	struct address_space *mapping = out->f_mapping;
-	struct inode *inode = mapping->host;
-	struct splice_desc sd = {
-		.total_len = len,
-		.flags = flags,
-		.pos = *ppos,
-		.u.file = out,
-	};
+	struct inode *inode = filp_dentry(out)->d_inode;
 
 	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe,
 		   (unsigned int)len,
 		   filp_dentry(out)->d_name.len,
 		   filp_dentry(out)->d_name.name);
 
-	if (pipe->inode)
-		mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT);
-
-	splice_from_pipe_begin(&sd);
-	do {
-		ret = splice_from_pipe_next(pipe, &sd);
-		if (ret <= 0)
-			break;
-
-		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
-		ret = ocfs2_rw_lock(inode, 1);
-		if (ret < 0)
-			mlog_errno(ret);
-		else {
-			ret = ocfs2_splice_to_file(pipe, out, &sd);
-			ocfs2_rw_unlock(inode, 1);
-		}
-		mutex_unlock(&inode->i_mutex);
-	} while (ret > 0);
-	splice_from_pipe_end(pipe, &sd);
-
-	if (pipe->inode)
-		mutex_unlock(&pipe->inode->i_mutex);
+	inode_double_lock(inode, pipe->inode);
 
-	if (sd.num_spliced)
-		ret = sd.num_spliced;
-
-	if (ret > 0) {
-		unsigned long nr_pages;
-		int err;
+	ret = ocfs2_rw_lock(inode, 1);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
 
-		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	ret = ocfs2_prepare_inode_for_write(filp_dentry(out), ppos, len, 0,
+					    NULL, NULL);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_unlock;
+	}
 
-		err = generic_write_sync(out, *ppos, ret);
-		if (err)
-			ret = err;
-		else
-			*ppos += ret;
+	ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags);
 
-		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
-	}
+out_unlock:
+	ocfs2_rw_unlock(inode, 1);
+out:
+	inode_double_unlock(inode, pipe->inode);
 
 	mlog_exit(ret);
 	return ret;
@@ -2166,7 +2126,7 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
 				      size_t len,
 				      unsigned int flags)
 {
-	int ret = 0, lock_level = 0;
+	int ret = 0;
 	struct inode *inode = filp_dentry(in)->d_inode;
 
 	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe,
@@ -2177,12 +2137,12 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
 	/*
 	 * See the comment in ocfs2_file_aio_read()
 	 */
-	ret = ocfs2_inode_lock_atime(inode, in->f_vfsmnt, &lock_level);
+	ret = ocfs2_inode_lock(inode, NULL, 0);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto bail;
 	}
-	ocfs2_inode_unlock(inode, lock_level);
+	ocfs2_inode_unlock(inode, 0);
 
 	ret = generic_file_splice_read(in, ppos, pipe, len, flags);
 
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 3cedf95..b782673 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -27,7 +27,7 @@
 #include <linux/pagemap.h>
 #include <linux/uio.h>
 #include <linux/sched.h>
-#ifndef SKIP_SPLICE
+#ifndef NO_SPLICE_HEADER
 #include <linux/splice.h>
 #endif
 #include <linux/mount.h>
diff --git a/kapi-compat/include/splice.h b/kapi-compat/include/splice.h
new file mode 100644
index 0000000..4aa2e72
--- /dev/null
+++ b/kapi-compat/include/splice.h
@@ -0,0 +1,21 @@
+#ifndef KAPI_SPLICE_H
+#define KAPI_SPLICE_H
+
+#ifndef SKIP_SPLICE
+#include <linux/pipe_fs_i.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+
+ssize_t generic_file_splice_write_nolock(struct pipe_inode_info *pipe,
+					 struct file *out, loff_t *ppos,
+					 size_t len, unsigned int flags);
+
+#ifdef NO_SPLICE_HEADER
+# define sd_file(i) ((i)->file)	/* target file is a direct member */
+#else
+# define sd_file(i) ((i)->u.file)	/* target file sits in the 'u' member */
+#endif
+
+#endif /* SKIP_SPLICE */
+
+#endif /* KAPI_SPLICE_H */
-- 
1.5.2.3




More information about the Ocfs2-devel mailing list