[Ocfs2-commits] zab commits r2544 - branches/locking-changes/fs/ocfs2

svn-commits at oss.oracle.com svn-commits at oss.oracle.com
Thu Aug 25 14:21:40 CDT 2005


Author: zab
Date: 2005-08-25 14:21:37 -0500 (Thu, 25 Aug 2005)
New Revision: 2544

Removed:
   branches/locking-changes/fs/ocfs2/aio.c
   branches/locking-changes/fs/ocfs2/aio.h
Modified:
   branches/locking-changes/fs/ocfs2/Makefile
   branches/locking-changes/fs/ocfs2/aops.c
   branches/locking-changes/fs/ocfs2/file.c
   branches/locking-changes/fs/ocfs2/mmap.c
   branches/locking-changes/fs/ocfs2/mmap.h
   branches/locking-changes/fs/ocfs2/ocfs2.h
   branches/locking-changes/fs/ocfs2/super.c
Log:
This moves aio and file read and write into one path.  Locks are acquired
and released around the generic helpers.  Because locks are not held across
aio ops, we need i_alloc_sem: we use blockdev_direct_IO and therefore need
to introduce a clustered i_alloc_sem.

This doesn't try to retry (preserving partial read/write buildup state) if the
dlm returns -EIOCBRETRY, which will let us get rid of a bunch of dlm-related
code soon.

vmas are no longer walked around read/write; more locking work needs to be
done.



Modified: branches/locking-changes/fs/ocfs2/Makefile
===================================================================
--- branches/locking-changes/fs/ocfs2/Makefile	2005-08-25 19:02:58 UTC (rev 2543)
+++ branches/locking-changes/fs/ocfs2/Makefile	2005-08-25 19:21:37 UTC (rev 2544)
@@ -36,7 +36,6 @@
 SAFE_SUBDIRS = cluster dlm
 
 SOURCES =			\
-	aio.c 			\
 	alloc.c 		\
 	aops.c 			\
 	buffer_head_io.c	\
@@ -67,7 +66,6 @@
 	ocfs2_lockid.h		\
 	ocfs2.h			\
 	buffer_head_io.h	\
-	aio.h			\
 	alloc.h			\
 	dcache.h		\
 	dir.h			\

Deleted: branches/locking-changes/fs/ocfs2/aio.c
===================================================================
--- branches/locking-changes/fs/ocfs2/aio.c	2005-08-25 19:02:58 UTC (rev 2543)
+++ branches/locking-changes/fs/ocfs2/aio.c	2005-08-25 19:21:37 UTC (rev 2544)
@@ -1,389 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * aio.c
- *
- * aio read and write
- *
- * Copyright (C) 2002, 2004, 2005 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/fs.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/highmem.h>
-#include <linux/pagemap.h>
-#include <linux/uio.h>
-
-#define MLOG_MASK_PREFIX ML_FILE_IO|ML_AIO
-#include <cluster/masklog.h>
-
-#include "ocfs2.h"
-
-#include "aio.h"
-#include "alloc.h"
-#include "dir.h"
-#include "dlmglue.h"
-#include "extent_map.h"
-#include "file.h"
-#include "sysfile.h"
-#include "inode.h"
-#include "mmap.h"
-#include "suballoc.h"
-
-
-struct ocfs2_kiocb_private {
-	struct ocfs2_kiocb_private	*kp_teardown_next;
-	ocfs2_super			*kp_osb;
-	unsigned			kp_have_alloc_sem:1,
-					kp_have_write_locks:1;
-	struct inode			*kp_inode;
-	struct ocfs2_buffer_lock_ctxt	kp_ctxt;
-	struct ocfs2_write_lock_info	kp_info;
-};
-
-static void okp_teardown(struct ocfs2_kiocb_private *okp)
-{
-	mlog(0, "okp %p\n", okp);
-
-	BUG_ON(okp->kp_inode == NULL);
-
-	if (okp->kp_info.wl_unlock_ctxt)
-		ocfs2_unlock_buffer_inodes(&okp->kp_ctxt);
-	if (okp->kp_have_alloc_sem)
-		up_read(&OCFS2_I(okp->kp_inode)->ip_alloc_sem);
-
-	iput(okp->kp_inode);
-	kfree(okp);
-}
-
-void okp_teardown_from_list(void *data)
-{
-	ocfs2_super *osb = data;
-	struct ocfs2_kiocb_private *okp, *next;
-
-	for (okp = xchg(&osb->osb_okp_teardown_next, NULL); okp != NULL;
-	     okp = next) {
-
-		next = okp->kp_teardown_next;
-		okp_teardown(okp);
-	}
-}
-
-/*
- * This releases the dlm locks we held across an aio operation and frees the
- * space we were tracking them in.
- *
- * While aio operations are in flight they have a vfsmnt reference for the file
- * which prevents unmount.  This dtor gets called *after* that ref is dropped,
- * however, so we have to make sure to account for pending work we have here in
- * the unmount path.  The race starts when aio does its fputs, before it calls
- * dtor which queues work, so just synchronizing with the work queue could miss
- * that first phase.  So unmount first waits for the pending count to drop.
- * Then it has to wait for keventd to finish the work freeing the okps.
- *
- * _dtor can be called from just about any context and lock teardown is
- * anything but interrupt safe.  We used to hand the okps to
- * okp_teardown_from_list with a normal list_head and irq masking lock but we
- * want to avoid masking interrupts so it was shifted to the {cmp,}xchg() and
- * atomic_t.
- *
- * Adding to the singly linked ->next list is only a little tricky.  We have to
- * watch for races between sampling the head to assign ->next in the inserting
- * okp and a new head being written before we point the head to the inserting
- * okp.
- */
-static void ocfs2_ki_dtor(struct kiocb *iocb)
-{
-	struct ocfs2_kiocb_private *next, *okp = iocb->private;
-	ocfs2_super *osb = okp->kp_osb;
-
-	mlog(0, "iocb %p okp %p\n", iocb, okp);
-
-	/* okp_alloc only assigns the iocb->private and ->ki_dtor pointers if
-	 * it was able to alloc the okp and get an inode reference */
-	BUG_ON(okp == NULL);
-	BUG_ON(okp->kp_inode == NULL);
-
-	/* we had better not try to work with this iocb again */
-	iocb->private = NULL;
-
-	 /* once this cmpxchg succeeds the okp can be freed so we have to be
-	  * careful not to deref it when testing success */
-	do {
-		next = osb->osb_okp_teardown_next;
-		okp->kp_teardown_next = next;
-	} while (cmpxchg(&osb->osb_okp_teardown_next, next, okp) != next);
-
-	schedule_work(&osb->osb_okp_teardown_work);
-
-	if (atomic_dec_and_test(&osb->osb_okp_pending))
-		wake_up(&osb->osb_okp_pending_wq);
-}
-
-/* see ocfs2_ki_dtor() */
-void ocfs2_wait_for_okp_destruction(ocfs2_super *osb)
-{
-	/* first wait for okps to enter the work queue */
-	wait_event(osb->osb_okp_pending_wq,
-		   atomic_read(&osb->osb_okp_pending) == 0);
-	/*
-	 * then wait for keventd to finish with all its work, including ours.
-	 *
-	 * XXX this makes me very nervous.  what if our work blocks keventd
-	 * during an unlock and the unlock can only proceed if keventd
-	 * can get to some more work that the dlm might have queued?
-	 * do we push any dlm work to keventd?
-	 */
-	flush_scheduled_work();
-}
-
-/* just to stop sys_io_cancel() from spewing to the console when it sees an
- * iocb without ki_cancel */
-static int ocfs2_ki_cancel(struct kiocb *iocb, struct io_event *ev)
-{
-	mlog(0, "iocb %p\n", iocb);
-	aio_put_req(iocb);
-	return -EAGAIN;
-}
-
-static struct ocfs2_kiocb_private *okp_alloc(struct kiocb *iocb)
-{
-	struct inode *inode = iocb->ki_filp->f_dentry->d_inode;
-	struct ocfs2_kiocb_private *okp;
-	ocfs2_super *osb;
-
-	okp = kcalloc(1, sizeof(*okp), GFP_KERNEL);
-	if (okp == NULL) {
-		okp = ERR_PTR(-ENOMEM);
-		goto out;
-	}
-
-	/* our dtor only gets registerd if we can guarantee that it holds
-	 * a reference to the inode */
-	okp->kp_inode = igrab(inode);
-	if (okp->kp_inode == NULL) {
-		kfree(okp);
-		okp = ERR_PTR(-EINVAL);
-		goto out;
-	}
-	/* unmount syncs with work using this ref before destroying the osb */
-	osb = OCFS2_SB(inode->i_sb);
-	okp->kp_osb = osb;
-
-	iocb->private = okp;
-	iocb->ki_dtor = ocfs2_ki_dtor;
-	iocb->ki_cancel = ocfs2_ki_cancel;
-	INIT_BUFFER_LOCK_CTXT(&okp->kp_ctxt);
-
-	atomic_inc(&osb->osb_okp_pending);
-out:
-	mlog(0, "iocb %p returning %p\n", iocb, okp);
-	return okp;
-}
-
-/* The DLM supports a minimal notion of AIO lock acquiry.  Instead of testing
- * the iocb or current-> like kernel fs/block paths tend to, it takes an
- * explicit callback which it calls when a lock state attempt makes forward
- * progress.  It would be better if it worked with the native
- * kernel AIO mechanics */
-static void ocfs2_aio_kick(int status, unsigned long data)
-{
-	struct kiocb *iocb = (struct kiocb *)data;
-	/* XXX worry about racing with ki_cancel once we set it */
-	mlog(0, "iocb %p\n", iocb);
-	kick_iocb(iocb);
-}
-
-/* this is called as iocb->ki_retry so it is careful to only repeat
- * what is needed */
-ssize_t ocfs2_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count,
-			    loff_t pos)
-{
-	struct ocfs2_kiocb_private *okp = iocb->private;
-	struct file *filp = iocb->ki_filp;
-	struct inode *inode = filp->f_dentry->d_inode;
-	struct ocfs2_backing_inode *target_binode;
-	ssize_t ret, ret2;
-	sigset_t blocked, oldset;
-
-	/*
-	 * The DLM doesn't block waiting for network traffic or anything, it
-	 * modifies state and calls our callback when things have changed.
-	 * However, it still likes to check signals and return ERESTARTSYS.
-	 * The AIO core does not appreciate ERESTARTSYS as its semantics are
-	 * not exactly clear for submission, etc.  So we block signals and
-	 * ensure that the DLM won't notice them.  The caller, particularly
-	 * sys_io_getevents(), will eventually check signals before sleeping
-	 * and so things should still work as expected, if perhaps with
-	 * slightly higher signal delivery latency.
-	 */
-	sigfillset(&blocked);
-	ret = sigprocmask(SIG_BLOCK, &blocked, &oldset);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out;
-	}
-
-	mlog(0, "iocb %p okp %p\n", iocb, okp);
-
-	if (okp == NULL) {
-		okp = okp_alloc(iocb);
-		if (IS_ERR(okp)) {
-			ret = PTR_ERR(okp);
-			mlog_errno(ret);
-			goto setmask;
-		}
-
-		ret = ocfs2_setup_io_locks(inode->i_sb, inode, buf, count,
-					   &okp->kp_ctxt, &target_binode);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto setmask;
-		}
-
-		okp->kp_ctxt.b_cb = ocfs2_aio_kick;
-		okp->kp_ctxt.b_cb_data = (unsigned long)iocb;
-		target_binode->ba_lock_data = filp->f_flags & O_DIRECT ? 0 : 1;
-	}
-
-	/* this might return EIOCBRETRY and we'll come back again to
-	 * continue the locking.  It's harmless to call it once it has
-	 * returned success.. */
-	okp->kp_info.wl_unlock_ctxt = 1; /* re-use the write info path */
-	ret = ocfs2_lock_buffer_inodes(&okp->kp_ctxt, NULL);
-	if (ret < 0) {
-		if (ret != -EIOCBRETRY)
-			mlog_errno(ret);
-		goto setmask;
-	}
-
-	/* hold the ip_alloc_sem across the op */
-	if (!okp->kp_have_alloc_sem) {
-		down_read(&OCFS2_I(inode)->ip_alloc_sem);
-		okp->kp_have_alloc_sem = 1;
-	}
-
-	ret = generic_file_aio_read(iocb, buf, count, pos);
-
-setmask:
-	ret2 = sigprocmask(SIG_SETMASK, &oldset, NULL);
-	if (ret2 < 0) {
-		mlog_errno(ret2);
-		if (ret == 0)
-			ret = ret2;
-	}
-
-out:
-	/* ki_dtor will always be called eventually, no tear down here */
-	mlog(0, "iocb %p returning %lld\n", iocb, (long long)ret);
-	return ret;
-}
-
-/* this is called as iocb->ki_retry so it is careful to only repeat
- * what is needed */
-ssize_t ocfs2_file_aio_write(struct kiocb *iocb, const char __user *buf,
-			     size_t count, loff_t pos)
-{
-	struct ocfs2_kiocb_private *okp = iocb->private;
-	struct file *filp = iocb->ki_filp;
-	struct inode *inode = filp->f_dentry->d_inode;
-	ssize_t ret = 0, ret2;
-	sigset_t blocked, oldset;
-	struct iovec local_iov = { .iov_base = (void __user *)buf,
-				   .iov_len = count };
-
-	/* explained up in ocfs2_file_aio_read() */
-	sigfillset(&blocked);
-	ret = sigprocmask(SIG_BLOCK, &blocked, &oldset);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out;
-	}
-
-	mlog(0, "iocb %p okp %p\n", iocb, okp);
-
-	if (okp == NULL) {
-		okp = okp_alloc(iocb);
-		if (IS_ERR(okp)) {
-			ret = PTR_ERR(okp);
-			mlog_errno(ret);
-			goto up_io;
-		}
-
-		okp->kp_ctxt.b_cb = ocfs2_aio_kick;
-		okp->kp_ctxt.b_cb_data = (unsigned long)iocb;
-	}
-
-	if (!okp->kp_have_write_locks) {
-		ret = ocfs2_write_lock_maybe_extend(filp, buf, count,
-						    &iocb->ki_pos,
-						    &okp->kp_info,
-						    &okp->kp_ctxt);
-		okp->kp_have_write_locks = 1;
-		if (okp->kp_info.wl_extended) {
-			/*
-			 * this is not a particularly nice place to do this but
-			 * extending aio in ocfs2 is not yet a priority.  it
-			 * means that we'll write zeros in the buffered case
-			 * before then over-writing them with the real op.  It
-			 * also sleeps in the aio submission context.
-			 */
-			ocfs2_file_finish_extension(inode,
-						    !okp->kp_info.wl_newsize,
-						    okp->kp_info.wl_do_direct_io);
-			okp->kp_info.wl_extended = 0;
-		}
-		if (ret) {
-			mlog_errno(ret);
-			goto up_io;
-		}
-	}
-
-	/* hold the ip_alloc_sem across the op */
-	if (!okp->kp_have_alloc_sem) {
-		down_read(&OCFS2_I(inode)->ip_alloc_sem);
-		okp->kp_have_alloc_sem = 1;
-	}
-
-up_io:
-	/*
-	 * never hold i_sem when we leave this function, nor when we call
-	 * g_f_a_w().  we've done all extending and inode field updating under
-	 * the i_sem and we hold the ip_alloc_sem for reading across the ops.
-	 * ocfs2_direct_IO calls blockdev_direct_IO with NO_LOCKING.
-	 */
-	if (okp->kp_info.wl_have_i_sem) {
-		up(&inode->i_sem);
-		okp->kp_info.wl_have_i_sem = 0;
-	}
-	if (ret == 0)
-		ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
-						    &iocb->ki_pos);
-
-	ret2 = sigprocmask(SIG_SETMASK, &oldset, NULL);
-	if (ret2 < 0) {
-		mlog_errno(ret2);
-		if (ret == 0)
-			ret = ret2;
-	}
-out:
-	/* ki_dtor will always be called eventually, no tear down here */
-	mlog(0, "iocb %p returning %lld\n", iocb, (long long)ret);
-	return ret;
-}

Deleted: branches/locking-changes/fs/ocfs2/aio.h
===================================================================
--- branches/locking-changes/fs/ocfs2/aio.h	2005-08-25 19:02:58 UTC (rev 2543)
+++ branches/locking-changes/fs/ocfs2/aio.h	2005-08-25 19:21:37 UTC (rev 2544)
@@ -1,37 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * aio.h
- *
- * Function prototypes
- *
- * Copyright (C) 2002, 2004, 2005 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef OCFS2_AIO_H
-#define OCFS2_AIO_H
-
-ssize_t ocfs2_file_aio_write(struct kiocb *iocb, const char __user *buf,
-			     size_t count, loff_t pos);
-ssize_t ocfs2_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count,
-			    loff_t pos);
-
-void okp_teardown_from_list(void *data);
-void ocfs2_wait_for_okp_destruction(ocfs2_super *osb);
-
-#endif /* OCFS2_AIO_H */

Modified: branches/locking-changes/fs/ocfs2/aops.c
===================================================================
--- branches/locking-changes/fs/ocfs2/aops.c	2005-08-25 19:02:58 UTC (rev 2543)
+++ branches/locking-changes/fs/ocfs2/aops.c	2005-08-25 19:21:37 UTC (rev 2544)
@@ -388,13 +388,9 @@
 	int ret;
 
 	mlog_entry_void();
-
-	/* blockdev_direct_IO checks alignment for us, using */
-	ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
-					    inode->i_sb->s_bdev, iov, offset,
-					    nr_segs, ocfs2_direct_IO_get_blocks,
-					    NULL);
-
+	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+				 offset, nr_segs, ocfs2_direct_IO_get_blocks,
+				 NULL);
 	mlog_exit(ret);
 	return ret;
 }

Modified: branches/locking-changes/fs/ocfs2/file.c
===================================================================
--- branches/locking-changes/fs/ocfs2/file.c	2005-08-25 19:02:58 UTC (rev 2543)
+++ branches/locking-changes/fs/ocfs2/file.c	2005-08-25 19:21:37 UTC (rev 2544)
@@ -35,7 +35,6 @@
 
 #include "ocfs2.h"
 
-#include "aio.h"
 #include "alloc.h"
 #include "dir.h"
 #include "dlmglue.h"
@@ -158,7 +157,7 @@
 				 loff_t newsize,
 				 unsigned direct_extend)
 {
-	int status;
+	int ret;
 
 	mlog(0, "inode %"MLFu64", newsize = %lld, direct_extend = %u\n",
 	     OCFS2_I(inode)->ip_blkno, (long long)newsize, direct_extend);
@@ -176,29 +175,26 @@
 	}
 #endif
 
-	status = ocfs2_zero_extend(inode);
-	/*
-	 * Don't overwrite the result of
-	 * generic_file_write
-	 */
-	if (status)
-		mlog(ML_ERROR, "Unable to pre-zero extension of inode "
-		     "(%d)\n", status);
+	/* caller won't overwrite return from g_f_w so we don't return */
+	ret = ocfs2_zero_extend(inode);
+	if (ret)
+		mlog(ML_ERROR, "Unable to pre-zero extension of inode (%d)\n",
+		     ret);
 }
 
-static ssize_t ocfs2_file_write(struct file *filp,
-				const char __user *buf,
-				size_t count,
-				loff_t *ppos)
+static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
+				    const char __user *buf,
+				    size_t count,
+				    loff_t pos)
 {
 	struct iovec local_iov = { .iov_base = (void __user *)buf,
 				   .iov_len = count };
-	int ret = 0;
+	int ret, level;
 	ocfs2_super *osb = NULL;
-	struct dentry *dentry = filp->f_dentry;
-	struct inode *inode = dentry->d_inode;
-	struct ocfs2_write_lock_info info = {0, };
-	DECLARE_BUFFER_LOCK_CTXT(ctxt);
+	struct file *filp = iocb->ki_filp;
+	struct inode *inode = filp->f_dentry->d_inode;
+	int do_direct = 0, extended = 0;
+	loff_t newsize, saved_pos;
 
 	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
 		   (unsigned int)count,
@@ -219,57 +215,161 @@
 
 	osb = OCFS2_SB(inode->i_sb);
 
-	ret = ocfs2_write_lock_maybe_extend(filp, buf, count, ppos, &info,
-					    &ctxt);
-	if (ret)
-		goto bail;
+	down(&inode->i_sem);
 
+	/* this ginormous block is in here because it has so many inputs
+	 * and outputs from this function.. */
+	level = !!(filp->f_flags & O_APPEND);
+	for(;;) {
+		u64 bytes_added;
+
+		ret = ocfs2_meta_lock(inode, NULL, NULL, level);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto bail_i_sem;
+		}
+
+		/* work on a copy of ppos until we're sure that we won't have
+		 * to recalculate it due to relocking. */
+		if (filp->f_flags & O_APPEND) {
+			saved_pos = i_size_read(inode);
+			mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+			if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
+				/* ugh, work around some applications which
+				 * open everything O_DIRECT + O_APPEND and
+				 * really don't mean to use O_DIRECT. */
+				filp->f_flags &= ~O_DIRECT;
+			}
+#endif
+		} else {
+			saved_pos = iocb->ki_pos;
+		}
+		newsize = count + saved_pos;
+
+		mlog(0, "pos=%lld newsize=%"MLFu64" cursize=%lld\n",
+		     saved_pos, newsize, i_size_read(inode));
+
+		if (newsize <= i_size_read(inode))
+			break;
+
+		if (level == 0) {
+			ocfs2_meta_unlock(inode, level);
+			level = 1;
+			continue;
+		}
+
+		mlog(0, "Writing at EOF, will need more allocation: "
+		     "i_size=%lld, " "need=%"MLFu64"\n", i_size_read(inode),
+		     newsize);
+
+		/* If we extend AT ALL here then we update our state
+		 * and continue the write call, regardless of error --
+		 * this is basically a short write. */
+		ret = ocfs2_extend_file(osb, inode, newsize, &bytes_added);
+		if (ret < 0 &&
+		    ret != -ERESTARTSYS && ret != -EINTR && ret != -ENOSPC) {
+			mlog_errno(ret);
+			mlog(ML_ERROR, "Failed to extend inode %"MLFu64
+			     " from %lld to %"MLFu64, OCFS2_I(inode)->ip_blkno,
+			     i_size_read(inode), newsize);
+		}
+		if (ret < 0 && (!bytes_added)) 
+			goto bail_meta_unlock;
+
+		extended = 1;
+
+		/* We need to recalulate newsize and count according
+		 * to what extend could give us. If we got the whole
+		 * extend then this doesn't wind up changing the
+		 * values. */
+		newsize = i_size_read(inode) + bytes_added;
+		count = newsize - saved_pos;
+		ret = 0;
+		break;
+	}
+
+	/* we've got whatever cluster lock is appropriate now, so we
+	 * can stuff *ppos back. */
+	iocb->ki_pos = saved_pos;
+
+	if (filp->f_flags & O_DIRECT) {
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+		if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
+			int sector_size = 1 << osb->s_sectsize_bits;
+
+			if ((saved_pos & (sector_size - 1)) ||
+			    (count & (sector_size - 1)) ||
+			    ((unsigned long)buf & (sector_size - 1))) {
+				do_direct = 0;
+				filp->f_flags |= O_SYNC;
+			} else {
+				do_direct = 1;
+			}
+		} else
+#endif
+			do_direct = 1;
+
+		mlog(0, "O_DIRECT\n");
+	}
+
+	if (!do_direct) {
+		ret = ocfs2_data_lock(inode, 1);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto bail_extend;
+		}
+	}
+
 	down_read(&OCFS2_I(inode)->ip_alloc_sem);
 
 #ifdef OCFS2_ORACORE_WORKAROUNDS
 	if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
 		unsigned int saved_flags = filp->f_flags;
 
-		if (info.wl_do_direct_io)
+		if (do_direct)
 			filp->f_flags |= O_DIRECT;
 		else
 			filp->f_flags &= ~O_DIRECT;
 
-		ret = generic_file_write_nolock(filp, &local_iov, 1, ppos);
+		ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
+						    &iocb->ki_pos);
 
 		filp->f_flags = saved_flags;
 	} else
 #endif
-		ret = generic_file_write_nolock(filp, &local_iov, 1, ppos);
+		ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
+						    &iocb->ki_pos);
 
 	up_read(&OCFS2_I(inode)->ip_alloc_sem);
 
-bail:
+	if (!do_direct)
+		ocfs2_data_unlock(inode, 1);
+
 	/* we might have to finish up extentions that were performed before
 	 * an error was returned by, say, data locking */
-	if (info.wl_extended)
-		ocfs2_file_finish_extension(inode, info.wl_newsize,
-					    info.wl_do_direct_io);
-	if (info.wl_unlock_ctxt)
-		ocfs2_unlock_buffer_inodes(&ctxt);
-	if (info.wl_have_i_sem)
-		up(&inode->i_sem);
+bail_extend:
+	if (extended)
+		ocfs2_file_finish_extension(inode, newsize, do_direct);
+bail_meta_unlock:
+	ocfs2_meta_unlock(inode, level);
+bail_i_sem:
+	up(&inode->i_sem);
+
+bail:
 	mlog_exit(ret);
-
 	return ret;
 }
 
-static ssize_t ocfs2_file_read(struct file *filp,
-			       char __user *buf,
-			       size_t count,
-			       loff_t *ppos)
+static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
+				   char __user *buf,
+				   size_t count,
+				   loff_t pos)
 {
 	int ret = 0;
 	ocfs2_super *osb = NULL;
-	struct dentry *dentry = filp->f_dentry;
-	struct inode *inode = dentry->d_inode;
-	struct ocfs2_backing_inode *target_binode;
-	DECLARE_BUFFER_LOCK_CTXT(ctxt);
+	struct file *filp = iocb->ki_filp;
+	struct inode *inode = filp->f_dentry->d_inode;
 
 	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
 		   (unsigned int)count,
@@ -289,7 +389,7 @@
 		if (filp->f_flags & O_DIRECT) {
 			int sector_size = 1 << osb->s_sectsize_bits;
 
-			if (((*ppos) & (sector_size - 1)) ||
+			if ((pos & (sector_size - 1)) ||
 			    (count & (sector_size - 1)) ||
 			    ((unsigned long)buf & (sector_size - 1)) ||
 			    (i_size_read(inode) & (sector_size -1))) {
@@ -299,32 +399,33 @@
 	}
 #endif
 
-	ret = ocfs2_setup_io_locks(inode->i_sb, inode, buf, count, &ctxt,
-				   &target_binode);
+	ret = ocfs2_meta_lock(inode, NULL, NULL, 0);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto bail;
 	}
 
-	target_binode->ba_lock_data = (filp->f_flags & O_DIRECT) ? 0 : 1;
-
-	ret = ocfs2_lock_buffer_inodes(&ctxt, NULL);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto bail_unlock;
+	if (!(filp->f_flags & O_DIRECT)) {
+		ret = ocfs2_data_lock(inode, 0);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto bail_unlock_meta;
+		}
 	}
 
 	down_read(&OCFS2_I(inode)->ip_alloc_sem);
 
-	ret = generic_file_read(filp, buf, count, ppos);
+	ret = generic_file_aio_read(iocb, buf, count, iocb->ki_pos);
 
 	up_read(&OCFS2_I(inode)->ip_alloc_sem);
 
 	if (ret == -EINVAL)
-		mlog(ML_ERROR, "Generic_file_read returned -EINVAL\n");
+		mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n");
 
-bail_unlock:
-	ocfs2_unlock_buffer_inodes(&ctxt);
+	if (!(filp->f_flags & O_DIRECT))
+		ocfs2_data_unlock(inode, 0);
+bail_unlock_meta:
+	ocfs2_meta_unlock(inode, 0);
 
 bail:
 	mlog_exit(ret);
@@ -379,8 +480,8 @@
 }
 
 struct file_operations ocfs2_fops = {
-	.read		= ocfs2_file_read,
-	.write		= ocfs2_file_write,
+	.read		= do_sync_read,
+	.write		= do_sync_write,
 	.sendfile	= ocfs2_file_sendfile,
 	.mmap		= ocfs2_mmap,
 	.fsync		= ocfs2_sync_file,

Modified: branches/locking-changes/fs/ocfs2/mmap.c
===================================================================
--- branches/locking-changes/fs/ocfs2/mmap.c	2005-08-25 19:02:58 UTC (rev 2543)
+++ branches/locking-changes/fs/ocfs2/mmap.c	2005-08-25 19:21:37 UTC (rev 2544)
@@ -42,100 +42,66 @@
 #include "inode.h"
 #include "mmap.h"
 
-static inline u64 ocfs2_binode_blkno(struct ocfs2_backing_inode *binode);
-static inline struct rb_node * __ocfs2_buffer_lock_ctxt_root(
-	struct ocfs2_buffer_lock_ctxt *ctxt);
-static int ocfs2_buffer_lock_ctxt_insert(struct ocfs2_buffer_lock_ctxt *ctxt,
-					 struct inode *inode,
-					 struct ocfs2_backing_inode **binode_ret);
-static int ocfs2_fill_ctxt_from_buf(struct super_block *sb,
-				    struct inode *target_inode,
-				    char __user *buf,
-				    size_t size,
-				    struct ocfs2_buffer_lock_ctxt *ctxt);
-
 static struct page *ocfs2_nopage(struct vm_area_struct * area,
 				 unsigned long address,
 				 int *type)
 {
-	int status, tmpstat, locked;
 	struct inode *inode = area->vm_file->f_dentry->d_inode;
 	struct page *page;
 	sigset_t blocked, oldset;
-	DECLARE_IO_MARKER(io_marker);
+	int ret;
 
-	mlog_entry("(inode %lu, address %lu)\n", inode->i_ino,
-		   address);
+	mlog_entry("(inode %lu, address %lu)\n", inode->i_ino, address);
 
-	locked = ocfs2_is_in_io_marker_list(inode, current);
+	/* For lack of a better error... Unfortunately returns
+	 * from nopage aren't very expressive right now. */
+	page = NOPAGE_SIGBUS;
 
-	if (!locked) {
-		/* For lack of a better error... Unfortunately returns
-		 * from nopage aren't very expressive right now. */
-		page = NOPAGE_SIGBUS;
+	/* The best way to deal with signals in this path is
+	 * to block them upfront, rather than allowing the
+	 * locking paths to return -ERESTARTSYS. */
+	sigfillset(&blocked);
 
-		/* The best way to deal with signals in this path is
-		 * to block them upfront, rather than allowing the
-		 * locking paths to return -ERESTARTSYS. */
-		sigfillset(&blocked);
+	/* We should technically never get a bad ret return
+	 * from sigprocmask */
+	ret = sigprocmask(SIG_BLOCK, &blocked, &oldset);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto bail;
+	}
 
-		/* We should technically never get a bad status return
-		 * from sigprocmask */
-		status = sigprocmask(SIG_BLOCK, &blocked, &oldset);
-		if (status < 0) {
-			mlog_errno(status);
-			goto bail;
-		}
+	/* Since we don't allow shared writable, we need only
+	 * worry about read locking here. */
+	ret = ocfs2_meta_lock(inode, NULL, NULL, 0);
+	if (ret < 0) {
+		mlog_errno(ret);
 
-		/* Since we don't allow shared writable, we need only
-		 * worry about read locking here. */
-		status = ocfs2_meta_lock(inode, NULL, NULL, 0);
-		if (status < 0) {
-			mlog_errno(status);
+		if (ret == -ENOMEM)
+			page = NOPAGE_OOM;
+		goto bail_setmask;
+	}
 
-			if (status == -ENOMEM)
-				page = NOPAGE_OOM;
-			goto bail_setmask;
-		}
+	ret = ocfs2_data_lock(inode, 0);
+	if (ret < 0) {
+		mlog_errno(ret);
 
-		status = ocfs2_data_lock(inode, 0);
-		if (status < 0) {
-			mlog_errno(status);
-
-			if (status == -ENOMEM)
-				page = NOPAGE_OOM;
-			goto bail_unlock;
-		}
-
-		tmpstat = sigprocmask(SIG_SETMASK, &oldset, NULL);
-		if (tmpstat < 0)
-			mlog_errno(tmpstat);
-
-		/* I'm not sure if we can somehow recurse back into
-		 * nopage or not, but this doesn't cost us anything,
-		 * so lets do it for now. */
-		ocfs2_add_io_marker(inode, &io_marker);
+		if (ret == -ENOMEM)
+			page = NOPAGE_OOM;
+		goto bail_unlock;
 	}
 
 	page = filemap_nopage(area, address, type);
 
-	if (!locked) {
-		ocfs2_del_io_marker(inode, &io_marker);
-		ocfs2_data_unlock(inode, 0);
-		ocfs2_meta_unlock(inode, 0);
-	}
-bail:
-	mlog_exit_ptr(page);
-	return page;
+	ocfs2_data_unlock(inode, 0);
 
 bail_unlock:
 	ocfs2_meta_unlock(inode, 0);
 
 bail_setmask:
-	tmpstat = sigprocmask(SIG_SETMASK, &oldset, NULL);
-	if (tmpstat < 0)
-		mlog_errno(tmpstat);
-
+	ret = sigprocmask(SIG_SETMASK, &oldset, NULL);
+	if (ret < 0)
+		mlog_errno(ret);
+bail:
 	mlog_exit_ptr(page);
 	return page;
 }
@@ -164,516 +130,3 @@
 	return 0;
 }
 
-static inline u64 ocfs2_binode_blkno(struct ocfs2_backing_inode *binode)
-{
-	struct inode *inode = binode->ba_inode;
-
-	BUG_ON(!inode);
-
-	return OCFS2_I(inode)->ip_blkno;
-}
-
-static inline struct rb_node * __ocfs2_buffer_lock_ctxt_root(
-	struct ocfs2_buffer_lock_ctxt *ctxt)
-{
-	return ctxt->b_inodes.rb_node;
-}
-
-static int ocfs2_buffer_lock_ctxt_insert(struct ocfs2_buffer_lock_ctxt *ctxt,
-					 struct inode *inode,
-					 struct ocfs2_backing_inode **binode_ret)
-{
-	u64 blkno;
-	struct ocfs2_backing_inode *tmp, *binode;
-	struct rb_node * parent = NULL;
-	struct rb_node ** p = &ctxt->b_inodes.rb_node;
-
-	BUG_ON(!ctxt);
-	BUG_ON(!inode);
-
-	blkno = OCFS2_I(inode)->ip_blkno;
-
-	while(*p) {
-		parent = *p;
-		tmp = rb_entry(parent, struct ocfs2_backing_inode, ba_node);
-
-		if (blkno < ocfs2_binode_blkno(tmp))
-			p = &(*p)->rb_left;
-		else if (blkno > ocfs2_binode_blkno(tmp))
-			p = &(*p)->rb_right;
-		else
-			return 0; /* Don't insert duplicates */
-	}
-
-	binode = kcalloc(1, sizeof(struct ocfs2_backing_inode), GFP_KERNEL);
-	if (!binode)
-		return -ENOMEM;
-	binode->ba_inode = inode;
-	ocfs2_init_io_marker(&binode->ba_task);
-
-	if (binode_ret)
-		*binode_ret = binode;
-
-	rb_link_node(&binode->ba_node, parent, p);
-	rb_insert_color(&binode->ba_node, &ctxt->b_inodes);
-
-	return 0;
-}
-
-static int ocfs2_fill_ctxt_from_buf(struct super_block *sb,
-				    struct inode *target_inode,
-				    char __user *buf,
-				    size_t size,
-				    struct ocfs2_buffer_lock_ctxt *ctxt)
-{
-	int status;
-	unsigned long start = (unsigned long)buf;
-	unsigned long end = start + size;
-	struct inode *inode;
-	struct mm_struct *mm = current->mm;
-	struct vm_area_struct *vma;
-
-	for (vma = find_vma(mm, start); vma; vma = vma->vm_next) {
-		if (end <= vma->vm_start)
-			break;
-		if (vma->vm_ops == &ocfs2_file_vm_ops) {
-			if (!vma->vm_file)
-				continue;
-			inode = vma->vm_file->f_dentry->d_inode;
-			if (inode->i_sb == sb &&
-			    inode != target_inode) {
-				status = ocfs2_buffer_lock_ctxt_insert(ctxt,
-								       inode,
-								       NULL);
-				if (status < 0)
-					goto bail;
-			}
-		}
-	}
-	status = 0;
-bail:
-	return status;
-}
-
-int ocfs2_setup_io_locks(struct super_block *sb,
-			 struct inode *target_inode,
-			 char __user *buf,
-			 size_t size,
-			 struct ocfs2_buffer_lock_ctxt *ctxt,
-			 struct ocfs2_backing_inode **target_binode)
-{
-	struct mm_struct *mm = current->mm;
-	int skip_sem = (current->flags & PF_DUMPCORE) || !mm;
-	int status;
-
-	if (!skip_sem)
-		down_read(&mm->mmap_sem);
-
-	BUG_ON(__ocfs2_buffer_lock_ctxt_root(ctxt));
-
-	/* We always insert target because it might not be backing part of the
-	 * buffer - but it needs to be in there so that its lock gets ordered
-	 * with everything else */
-	status = ocfs2_buffer_lock_ctxt_insert(ctxt, target_inode,
-					       target_binode);
-
-	/* knfsd, which lacks an mm, may call us to do I/O. Since the buffer
-	 * is private to the kernel, there isn't any need to insert any other
-	 * locks, so we can skip it.
-	 *
-	 * The pile of duct tape and mixed nuts that is NFS 1, universe 0
-	 */
-	if (!status && mm) {
-		/* Now fill the tree with any inodes that back this
-		 * buffer. If target inode is in there, it will be
-		 * skipped over. */
-		status = ocfs2_fill_ctxt_from_buf(sb, target_inode, buf, size,
-						  ctxt);
-	}
-
-	if (!skip_sem)
-		up_read(&mm->mmap_sem);
-
-	if (status < 0) {
-		mlog_errno(status);
-		ocfs2_unlock_buffer_inodes(ctxt);
-		goto bail;
-	}
-
-	status = 0;
-bail:
-	return status;
-}
-
-/* starting from pos, which can be null for the first call, give the
- * next buffer that needs unlocking.  we return null when there are none
- * left or we see last_inode */
-static struct ocfs2_backing_inode *
-ocfs2_next_unlocked(struct ocfs2_buffer_lock_ctxt *ctxt,
-		    struct inode *last_inode,
-		    struct ocfs2_backing_inode *pos)
-{
-	struct ocfs2_backing_inode *binode = NULL;
-	struct rb_node *node = NULL;
-
-	if (pos == NULL) {
-		if (ctxt->b_next_unlocked)
-			binode = ctxt->b_next_unlocked;
-		else
-			node = rb_first(&ctxt->b_inodes);
-	} else
-		node = rb_next(&pos->ba_node);
-
-	if (node)
-		binode = rb_entry(node, struct ocfs2_backing_inode, ba_node);
-
-	if (binode && last_inode && binode->ba_inode == last_inode)
-		binode = NULL;
-
-	/* this is just an optimization to skip nodes in the tree
-	 * that we've already seen.  If we're moving from one we've locked
-	 * to one we haven't then we mark this node in the ctxt so that
-	 * we'll return to it in a future call after, say, hitting last_inode
-	 * or EIOCBRETRY in lock_buffer_inodes */
-	if (pos && pos->ba_locked && binode)
-		ctxt->b_next_unlocked = binode;
-
-	return binode;
-}
-
-/* Will take locks on all inodes in the ctxt up until 'last_inode'. If
- * last_inode is NULL, then we take locks on everything. We mark lock
- * status on the context so we skip any that have already been
- * locked. On error we will completely abort the context. */
-/* WARNING: If you get a failure case here, you *must* call
- * "ocfs2_unlock_buffer_inodes" as we may have left a few inodes under
- * cluster lock. */
-int ocfs2_lock_buffer_inodes(struct ocfs2_buffer_lock_ctxt *ctxt,
-			     struct inode *last_inode)
-{
-	int status, data_level;
-	struct ocfs2_backing_inode *binode = NULL;
-	struct inode *inode;
-
-	while((binode = ocfs2_next_unlocked(ctxt, last_inode, binode))) {
-		/* the tricksy caller might have locked inodes themselves
-		 * between calls. */
-		if (binode->ba_locked)
-			continue;
-		inode = binode->ba_inode;
-
-		if (!binode->ba_meta_locked) {
-			status = ocfs2_meta_lock_full(inode, NULL, NULL,
-						      binode->ba_lock_meta_level,
-						      0, ctxt->b_cb,
-						      ctxt->b_cb_data);
-
-			if (status < 0) {
-				if (status != -EIOCBRETRY)
-					mlog_errno(status);
-				goto bail;
-			}
-
-			binode->ba_meta_locked = 1;
-		}
-
-		/* ba_lock_data isn't set for direct io */
-		if (binode->ba_lock_data) {
-			data_level = binode->ba_lock_data_level;
-			status = ocfs2_data_lock(inode, data_level);
-			if (status < 0) {
-				if (status == -EIOCBRETRY)
-					goto bail;
-
-				/* clean up the metadata lock that we took
-				 * above
-				 */
-				ocfs2_meta_unlock(inode,
-						  binode->ba_lock_meta_level);
-				binode->ba_meta_locked = 0;
-
-				mlog_errno(status);
-				goto bail;
-			}
-		}
-		ocfs2_add_io_marker(inode, &binode->ba_task);
-		binode->ba_locked = 1;
-	}
-
-	status = 0;
-bail:
-	return status;
-}
-
-void ocfs2_unlock_buffer_inodes(struct ocfs2_buffer_lock_ctxt *ctxt)
-{
-	struct ocfs2_backing_inode *binode;
-	struct rb_node *node;
-
-	/* dlm locks don't mask ints.. this should be lower down */
-	BUG_ON(in_interrupt());
-
-	/* unlock in reverse order to minimize waking forward lockers */
-	while ((node = rb_last(&ctxt->b_inodes)) != NULL) {
-		binode = rb_entry(node, struct ocfs2_backing_inode, ba_node);
-
-		ocfs2_del_io_marker(binode->ba_inode, &binode->ba_task);
-
-		if (binode->ba_locked && binode->ba_lock_data)
-			ocfs2_data_unlock(binode->ba_inode,
-					  binode->ba_lock_data_level);
-
-		if (binode->ba_locked || binode->ba_meta_locked)
-			ocfs2_meta_unlock(binode->ba_inode,
-					  binode->ba_lock_meta_level);
-
-		rb_erase(node, &ctxt->b_inodes);
-		kfree(binode);
-	}
-
-	ctxt->b_next_unlocked = NULL;
-}
-
-/*
- * This builds up the locking state that will be used by a write.  both normal
- * file writes and AIO writes come in through here.  This function does no
- * teardown on its own.  The caller must examine the info struct to see if it
- * needs to release locks or i_sem, etc.  This function is also restartable in
- * that it can return EIOCBRETRY if it would have blocked in the dlm.  It
- * stores its partial progress in the info struct so the caller can call back
- * in when it thinks the dlm won't block any more.  Thus, the caller must zero
- * the info struct before calling in the first time.
- */
-ssize_t ocfs2_write_lock_maybe_extend(struct file *filp,
-				      const char __user *buf,
-				      size_t count,
-				      loff_t *ppos,
-				      struct ocfs2_write_lock_info *info,
-				      struct ocfs2_buffer_lock_ctxt *ctxt)
-{
-	int ret = 0;
-	ocfs2_super *osb = NULL;
-	struct dentry *dentry = filp->f_dentry;
-	struct inode *inode = dentry->d_inode;
-	int status;
-	int level = filp->f_flags & O_APPEND;
-	loff_t saved_ppos;
-	u64 bytes_added = 0;
-
-	osb = OCFS2_SB(inode->i_sb);
-
-	/* the target inode is different from the other inodes.  in o_direct it
-	 * doesn't get a data lock and when appending it gets a level 1 meta
-	 * lock.  we use target_binode to set its flags accordingly */
-	if (info->wl_target_binode == NULL) {
-		ret = ocfs2_setup_io_locks(inode->i_sb, inode,
-					   (char __user *) buf,
-					   count, ctxt,
-					   &info->wl_target_binode);
-		if (ret < 0) {
-			BUG_ON(ret == -EIOCBRETRY);
-			mlog_errno(ret);
-			goto bail;
-		}
-	}
-
-	/* This will lock everyone in the context whose order puts
-	 * them before us. */
-	if (!info->wl_have_before) {
-		info->wl_unlock_ctxt = 1;
-		ret = ocfs2_lock_buffer_inodes(ctxt, inode);
-		if (ret < 0) {
-			if (ret != -EIOCBRETRY)
-				mlog_errno(ret);
-			goto bail;
-		}
-		info->wl_have_before = 1;
-		/* we're writing so get an ex data cluster lock */
-		info->wl_target_binode->ba_lock_data_level = 1;
-	}
-
-	if (!info->wl_have_i_sem) {
-		down(&inode->i_sem);
-		info->wl_have_i_sem = 1;
-	}
-
-lock:
-	if (!info->wl_have_target_meta) {
-		status = ocfs2_meta_lock(inode, NULL, NULL, level);
-		if (status < 0) {
-			mlog_errno(status);
-			ret = status;
-			goto bail;
-		}
-		info->wl_have_target_meta = 1;
-	}
-	/* to handle extending writes, we do a bit of our own locking
-	 * here, but we set up the ctxt to unlock for us (as well as
-	 * handle locking everything else). */
-	if (level)
-		info->wl_target_binode->ba_lock_meta_level = 1;
-
-	/* work on a copy of ppos until we're sure that we won't have
-	 * to recalculate it due to relocking. */
-	saved_ppos = *ppos;
-
-	if (filp->f_flags & O_APPEND) {
-		saved_ppos = i_size_read(inode);
-		mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_ppos);
-
-#ifdef OCFS2_ORACORE_WORKAROUNDS
-		if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
-			/* ugh, work around some applications which open
-			 * everything O_DIRECT + O_APPEND and really don't
-			 * mean to use O_DIRECT. */
-			filp->f_flags &= ~O_DIRECT;
-		}
-#endif
-	}
-
-	if (filp->f_flags & O_DIRECT) {
-#ifdef OCFS2_ORACORE_WORKAROUNDS
-		if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
-			int sector_size = 1 << osb->s_sectsize_bits;
-
-			if ((saved_ppos & (sector_size - 1)) ||
-			    (count & (sector_size - 1)) ||
-			    ((unsigned long)buf & (sector_size - 1))) {
-				info->wl_do_direct_io = 0;
-				filp->f_flags |= O_SYNC;
-			} else {
-				info->wl_do_direct_io = 1;
-			}
-		} else
-#endif
-			info->wl_do_direct_io = 1;
-
-		mlog(0, "O_DIRECT\n");
-	}
-
-	info->wl_target_binode->ba_lock_data = info->wl_do_direct_io ? 0 : 1;
-
-	info->wl_newsize = count + saved_ppos;
-	if (filp->f_flags & O_APPEND)
-		info->wl_newsize = count + i_size_read(inode);
-
-	mlog(0, "ppos=%lld newsize=%"MLFu64" cursize=%lld\n", saved_ppos,
-	     info->wl_newsize, i_size_read(inode));
-
-	if (info->wl_newsize > i_size_read(inode)) {
-		if (!level) {
-			/* we want an extend, but need a higher
-			 * level cluster lock. */
-			mlog(0, "inode %"MLFu64", had a PR, looping back "
-			     "for EX\n", OCFS2_I(inode)->ip_blkno);
-			ocfs2_meta_unlock(inode, level);
-			info->wl_have_target_meta = 0;
-			level = 1;
-			goto lock;
-		}
-
-		mlog(0, "Writing at EOF, will need more allocation: "
-		     "i_size=%lld, need=%"MLFu64"\n", i_size_read(inode),
-		     info->wl_newsize);
-
-		/* If we extend AT ALL here then we update our state
-		 * and continue the write call, regardless of error --
-		 * this is basically a short write. */
-		status = ocfs2_extend_file(osb, inode, info->wl_newsize,
-					   &bytes_added);
-		if (status < 0 && (!bytes_added)) {
-			if (status != -ERESTARTSYS
-			    && status != -EINTR
-			    && status != -ENOSPC) {
-				mlog_errno(status);
-				mlog(ML_ERROR, "Failed to extend inode %"MLFu64
-				     " from %lld to %"MLFu64,
-				     OCFS2_I(inode)->ip_blkno,
-				     *ppos, info->wl_newsize);
-			}
-			ret = status;
-
-			info->wl_have_target_meta = 0;
-			ocfs2_meta_unlock(inode, level);
-			goto bail;
-		}
-
-		info->wl_extended = 1;
-
-		/* We need to recalculate newsize and count according
-		 * to what extend could give us. If we got the whole
-		 * extend then this doesn't wind up changing the
-		 * values. */
-		info->wl_newsize = i_size_read(inode) + bytes_added;
-		count = info->wl_newsize - saved_ppos;
-
-		if (status < 0
-		    && status != -ENOSPC
-		    && status != -EINTR
-		    && status != -ERESTARTSYS)
-			mlog(ML_ERROR, "status return of %d extending inode "
-			     "%"MLFu64"\n", status,
-			     OCFS2_I(inode)->ip_blkno);
-		status = 0;
-	}
-
-	/* we've got whatever cluster lock is appropriate now, so we
-	 * can stuff *ppos back. */
-	*ppos = saved_ppos;
-
-	if (!info->wl_do_direct_io && !info->wl_have_data_lock) {
-		status = ocfs2_data_lock(inode, 1);
-		if (status < 0) {
-			mlog_errno(status);
-			ret = status;
-
-			info->wl_have_target_meta = 0;
-			ocfs2_meta_unlock(inode, level);
-			goto bail;
-		}
-		info->wl_have_data_lock = 1;
-	}
-
-	/* Alright, fool the io locking stuff into thinking it's
-	 * handled our inode for us. We can now count on it to do the
-	 * unlock for us. */
-	info->wl_target_binode->ba_locked = 1;
-
-	/* This will lock everyone whose order puts them *after* our inode. */
-	ret = ocfs2_lock_buffer_inodes(ctxt, NULL);
-	if (ret < 0) {
-		if (ret != -EIOCBRETRY)
-			mlog_errno(ret);
-		goto bail;
-	}
-
-bail:
-	mlog_exit(ret);
-	return ret;
-}
-
-#if 0
-static void ocfs2_buffer_ctxt_debug(struct ocfs2_buffer_lock_ctxt *ctxt)
-{
-	struct ocfs2_backing_inode *binode;
-	struct inode *inode;
-	struct rb_node *node;
-
-	printk("(%u) ocfs2: buffer lock ctxt: direct io = %d\n",
-	       current->pid, ctxt->b_lock_direct);
-
-	node = rb_first(&ctxt->b_inodes);
-	while (node) {
-		binode = rb_entry(node, struct ocfs2_backing_inode, ba_node);
-		inode = binode->ba_inode;
-
-		printk("(%u) ocfs2: inode %llu, locked %d, is target? %s\n",
-		       current->pid, OCFS2_I(inode)->ip_blkno,
-		       binode->ba_locked,
-		       ocfs2_buffer_lock_is_target(ctxt, inode) ? "yes" :
-		       "no");
-
-		node = rb_next(node);
-	}
-}
-#endif

Modified: branches/locking-changes/fs/ocfs2/mmap.h
===================================================================
--- branches/locking-changes/fs/ocfs2/mmap.h	2005-08-25 19:02:58 UTC (rev 2543)
+++ branches/locking-changes/fs/ocfs2/mmap.h	2005-08-25 19:21:37 UTC (rev 2544)
@@ -1,131 +1,6 @@
 #ifndef OCFS2_MMAP_H
 #define OCFS2_MMAP_H
 
-int ocfs2_mmap(struct file *file,
-	       struct vm_area_struct *vma);
+int ocfs2_mmap(struct file *file, struct vm_area_struct *vma);
 
-/* used by file_read/file_write and nopage to coordinate file
- * locking. I keep this out of the dlmglue code, because quite frankly
- * I don't like that we have to do this stuff. */
-struct ocfs2_io_marker {
-	struct list_head io_list;
-	struct task_struct *io_task;
-};
-
-#define __IOMARKER_INITIALIZER(name) {					\
-	.io_list      = { &(name).io_list, &(name).io_list },		\
-	.io_task      = NULL }
-
-#define DECLARE_IO_MARKER(name)						\
-	struct ocfs2_io_marker name = __IOMARKER_INITIALIZER(name)
-
-static inline void ocfs2_init_io_marker(struct ocfs2_io_marker *task)
-{
-	INIT_LIST_HEAD(&task->io_list);
-	task->io_task = NULL;
-}
-
-static inline void ocfs2_add_io_marker(struct inode *inode,
-				       struct ocfs2_io_marker *task)
-{
-	struct ocfs2_inode_info *oi = OCFS2_I(inode);
-
-	task->io_task = current;
-	spin_lock(&oi->ip_lock);
-	list_add(&task->io_list, &oi->ip_io_markers);
-	spin_unlock(&oi->ip_lock);
-}
-
-static inline void ocfs2_del_io_marker(struct inode *inode,
-				       struct ocfs2_io_marker *task)
-{
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-	if (!list_empty(&task->io_list))
-		list_del_init(&task->io_list);
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
-}
-
-static inline int ocfs2_is_in_io_marker_list(struct inode *inode,
-					   struct task_struct *task)
-{
-	int ret = 0;
-	struct ocfs2_inode_info *oi = OCFS2_I(inode);
-	struct list_head *p;
-	struct ocfs2_io_marker *tmp;
-
-	spin_lock(&oi->ip_lock);
-	list_for_each(p, &oi->ip_io_markers) {
-		tmp = list_entry(p, struct ocfs2_io_marker, io_list);
-		if (tmp->io_task == task) {
-			ret = 1;
-			break;
-		}
-	}
-	spin_unlock(&oi->ip_lock);
-
-	return ret;
-}
-
-struct ocfs2_backing_inode {
-	struct rb_node           ba_node;
-	struct inode            *ba_inode;
-	unsigned		 ba_meta_locked:1, 	/* meta is locked */
-				 ba_locked:1,		/* both are locked */
-				 ba_lock_data:1,	/* should lock data */
-				 ba_lock_meta_level:1,
-				 ba_lock_data_level:1;
-	struct ocfs2_io_marker   ba_task;
-};
-
-/* Used to manage the locks taken during I/O. */
-struct ocfs2_buffer_lock_ctxt {
-	struct rb_root			b_inodes;
-	struct ocfs2_backing_inode	*b_next_unlocked;
-	ocfs2_lock_callback		b_cb;
-	unsigned long			b_cb_data;
-};
-
-#define __BUFFERLOCK_INITIALIZER {					\
-	.b_inodes               = RB_ROOT,				\
-	.b_next_unlocked	= NULL,					\
-	.b_cb			= NULL,					\
-	.b_cb_data		= 0 }
-
-#define DECLARE_BUFFER_LOCK_CTXT(name)					\
-	struct ocfs2_buffer_lock_ctxt name = __BUFFERLOCK_INITIALIZER
-
-#define INIT_BUFFER_LOCK_CTXT(ctxt)	\
-	*(ctxt) = (struct ocfs2_buffer_lock_ctxt) __BUFFERLOCK_INITIALIZER
-
-int ocfs2_setup_io_locks(struct super_block *sb,
-			 struct inode *target_inode,
-			 char __user *buf,
-			 size_t size,
-			 struct ocfs2_buffer_lock_ctxt *ctxt,
-			 struct ocfs2_backing_inode **target_binode);
-
-int ocfs2_lock_buffer_inodes(struct ocfs2_buffer_lock_ctxt *ctxt,
-			     struct inode *last_inode);
-
-void ocfs2_unlock_buffer_inodes(struct ocfs2_buffer_lock_ctxt *ctxt);
-
-struct ocfs2_write_lock_info {
-	u64				wl_newsize;
-	unsigned			wl_extended:1,
-					wl_do_direct_io:1,
-					wl_have_i_sem:1,
-					wl_unlock_ctxt:1,
-					wl_have_before:1,
-					wl_have_target_meta:1,
-					wl_have_data_lock:1;
-	struct ocfs2_backing_inode	*wl_target_binode;
-};
-
-ssize_t ocfs2_write_lock_maybe_extend(struct file *filp,
-				      const char __user *buf,
-				     size_t count,
-				      loff_t *ppos,
-				     struct ocfs2_write_lock_info *info,
-				     struct ocfs2_buffer_lock_ctxt *ctxt);
-
 #endif  /* OCFS2_MMAP_H */

Modified: branches/locking-changes/fs/ocfs2/ocfs2.h
===================================================================
--- branches/locking-changes/fs/ocfs2/ocfs2.h	2005-08-25 19:02:58 UTC (rev 2543)
+++ branches/locking-changes/fs/ocfs2/ocfs2.h	2005-08-25 19:21:37 UTC (rev 2544)
@@ -274,12 +274,6 @@
 
 	struct list_head	osb_net_handlers;
 
-	/* see ocfs2_ki_dtor() */
-	struct work_struct		osb_okp_teardown_work;
-	struct ocfs2_kiocb_private	*osb_okp_teardown_next;
-	atomic_t			osb_okp_pending;
-	wait_queue_head_t		osb_okp_pending_wq;
-
 	wait_queue_head_t		osb_mount_event;
 
 	/* Truncate log info */

Modified: branches/locking-changes/fs/ocfs2/super.c
===================================================================
--- branches/locking-changes/fs/ocfs2/super.c	2005-08-25 19:02:58 UTC (rev 2543)
+++ branches/locking-changes/fs/ocfs2/super.c	2005-08-25 19:21:37 UTC (rev 2544)
@@ -45,7 +45,6 @@
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
-#include "aio.h"
 
 /* this should be the only file to include a version 1 header */
 #include "ocfs1_fs_compat.h"
@@ -1001,8 +1000,6 @@
 	osb = OCFS2_SB(sb);
 	BUG_ON(!osb);
 
-	ocfs2_wait_for_okp_destruction(osb);
-
 	ocfs2_shutdown_local_alloc(osb);
 
 	ocfs2_truncate_log_shutdown(osb);
@@ -1122,12 +1119,6 @@
 	INIT_LIST_HEAD(&osb->vote_list);
 	spin_lock_init(&osb->s_next_gen_lock);
 
-	osb->osb_okp_teardown_next = NULL;
-	atomic_set(&osb->osb_okp_pending, 0);
-	init_waitqueue_head(&osb->osb_okp_pending_wq);
-	/* we sync with this work queue (and sb ref) on unmount */
-	INIT_WORK(&osb->osb_okp_teardown_work, okp_teardown_from_list, osb);
-
 	atomic_set(&osb->alloc_stats.moves, 0);
 	atomic_set(&osb->alloc_stats.local_data, 0);
 	atomic_set(&osb->alloc_stats.bitmap_data, 0);



More information about the Ocfs2-commits mailing list