[Ocfs2-commits] zab commits r2677 - in trunk: . Documentation Documentation/filesystems fs/ocfs2 fs/ocfs2/cluster

svn-commits at oss.oracle.com svn-commits at oss.oracle.com
Fri Nov 4 18:27:18 CST 2005


Author: zab
Signed-off-by: mfasheh
Signed-off-by: jlbec
Date: 2005-11-04 18:27:07 -0600 (Fri, 04 Nov 2005)
New Revision: 2677

Added:
   trunk/Documentation/
   trunk/Documentation/filesystems/
   trunk/Documentation/filesystems/ocfs2.txt
   trunk/fs/ocfs2/aops.h
Removed:
   trunk/Documentation/filesystems/
   trunk/Documentation/filesystems/ocfs2.txt
   trunk/fs/ocfs2/aio.c
   trunk/fs/ocfs2/aio.h
Modified:
   trunk/Config.make.in
   trunk/README
   trunk/configure.in
   trunk/fs/ocfs2/Makefile
   trunk/fs/ocfs2/aops.c
   trunk/fs/ocfs2/cluster/masklog.h
   trunk/fs/ocfs2/cluster/tcp_internal.h
   trunk/fs/ocfs2/dir.c
   trunk/fs/ocfs2/dlmglue.c
   trunk/fs/ocfs2/dlmglue.h
   trunk/fs/ocfs2/file.c
   trunk/fs/ocfs2/file.h
   trunk/fs/ocfs2/inode.c
   trunk/fs/ocfs2/inode.h
   trunk/fs/ocfs2/journal.c
   trunk/fs/ocfs2/journal.h
   trunk/fs/ocfs2/mmap.c
   trunk/fs/ocfs2/mmap.h
   trunk/fs/ocfs2/namei.c
   trunk/fs/ocfs2/ocfs2.h
   trunk/fs/ocfs2/ocfs2_lockid.h
   trunk/fs/ocfs2/super.c
Log:
o land [2543:2676] from branches/locking-changes

This shuffles dlm locking around.  Consistency is maintained by locks
acquired in prepare_write/commit_write/readpages.  A separate dlm lock
used to serialize file system operations (extend, write, truncate) is
added.

The results of this can be summarized as:

- locks in file read/write and nopage are acquired consistently: no deadlock
- buffered extending write updates i_size at commit_write, not at end of write
- provide fop aio_{read,write}, can use generic do_sync_{read,write},sendfile
- aio path less weird: dlm ip_alloc_sem work-a-like, unlock at dio->end_io
- ordered data mode introduced

Signed-off-by: mfasheh
Signed-off-by: jlbec


Modified: trunk/Config.make.in
===================================================================
--- trunk/Config.make.in	2005-11-03 22:39:00 UTC (rev 2676)
+++ trunk/Config.make.in	2005-11-05 00:27:07 UTC (rev 2677)
@@ -58,6 +58,7 @@
 HAVE_SPARSE_ENDIAN_TYPES = @HAVE_SPARSE_ENDIAN_TYPES@
 HAVE_GENERIC_READLINK = @HAVE_GENERIC_READLINK@
 NEW_FOLLOW_LINK_API = @NEW_FOLLOW_LINK_API@
+GENERIC_DELETE_INODE_TRUNCATES = @GENERIC_DELETE_INODE_TRUNCATES@
 
 OCFS_DEBUG = @OCFS_DEBUG@
 

Copied: trunk/Documentation (from rev 2676, branches/locking-changes/Documentation)

Copied: trunk/Documentation/filesystems (from rev 2676, branches/locking-changes/Documentation/filesystems)

Deleted: trunk/Documentation/filesystems/ocfs2.txt
===================================================================
--- branches/locking-changes/Documentation/filesystems/ocfs2.txt	2005-11-03 22:39:00 UTC (rev 2676)
+++ trunk/Documentation/filesystems/ocfs2.txt	2005-11-05 00:27:07 UTC (rev 2677)
@@ -1,126 +0,0 @@
-
-For now this just documents locking ordering.  The intent is to have a 
-dense enough markup that one can visualize an entire path in a reasonably
-sized terminal.  So:
-
- (  - lock
- )  - unlock
- () - lock and unlock without anything interesting in between
- +  - descending into another function
-
-worst case buffered file write (extending, buf from mmaped ocfs2)
-+ ocfs2_file_aio_write
-  ( i_sem
-  ( rw_lock
-  ( meta_lock
-  + ocfs2_extend_allocation
-    () ip_alloc_sem
-  ) meta_lock
-  + generic_file_aio_write_nolock
-    + ocfs2_prepare_write
-      ( meta_lock
-      ( ip_alloc_sem
-      ) ip_alloc_sem
-      ) meta_lock
-    + ocfs2_nopage
-      + filemap_nopage
-        + ocfs2_readpage
-          ( meta_lock
-          ( ip_alloc_sem
-          ( data_lock
-          ) data_lock
-          ) ip_alloc_sem
-          ) meta_lock
-    + ocfs2_commit_write
-      ( meta_lock
-      ( data_lock
-      ) data_lock
-      ) meta_lock
-  ) rw_lock
-  ) i_sem
-
-O_DIRECT file write:
-+ ocfs2_file_aio_write
-  ( i_sem
-  ( i_alloc_sem
-  ( rw_lock
-  () meta_lock
-  + __blkdev_direct_IO
-    + ocfs2_direct_io_get_blocks
-      ( meta_lock
-      ( ip_alloc_sem
-      ) ip_alloc_sem
-      ) meta_lock
-  ) i_sem
-+ dio_complete
-  + ocfs2_dio_end_io
-    ) rw_lock
-    ) i_alloc_sem
-
-buffered file read (nopage when buf is mmaped):
-+ ocfs2_file_aio_read
-  + generic_file_aio_read
-    + ocfs2_readpage
-      ( meta_lock
-      ( ip_alloc_sem
-      ( data_lock
-      ) data_lock
-      ) ip_alloc_sem
-      ) meta_lock
-    + ocfs2_nopage
-      + filemap_nopage
-        + ocfs2_readpage
-          ( meta_lock
-          ( ip_alloc_sem
-          ( data_lock
-          ) data_lock
-          ) ip_alloc_sem
-          ) meta_lock
-
-O_DIRECT file read:
-+ ocfs2_file_aio_read
-  ( i_alloc_sem
-  ) rw_lock
-  + __blkdev_direct_IO
-    + ocfs2_direct_io_get_blocks
-      ( meta_lock
-      ( ip_alloc_sem
-      ) ip_alloc_sem
-      ) meta_lock
-+ dio_complete
-  + ocfs2_dio_end_io
-    ) i_alloc_sem
-    ) rw_lock
-
-truncate:
-+ do_truncate
-  ( i_sem
-  + notify_change
-    ( i_alloc_sem
-    + ocfs2_setattr
-      ( rw_lock
-      ( meta_lock
-      + ocfs2_truncate_file
-        () data_lock
-      ) meta_lock
-      ) rw_lock
-    ) i_alloc_sem
-  ) i_sem
-
-readpage: (via sendfile, sys_readahead, fault->nopage)
-+ ocfs2_readpage
-      ( meta_lock
-      ( ip_alloc_sem
-      ( data_lock
-      ) data_lock
-      ) ip_alloc_sem
-      ) meta_lock
-
-Lock Ordering - The Short Story:
-i_sem -> i_alloc_sem -> rw_lock -> meta_lock -> ip_alloc_sem -> data_lock
-
-.. make this prettier ..
-
--- Locks and what they cover --
-
-

Copied: trunk/Documentation/filesystems/ocfs2.txt (from rev 2676, branches/locking-changes/Documentation/filesystems/ocfs2.txt)

Modified: trunk/README
===================================================================
--- trunk/README	2005-11-03 22:39:00 UTC (rev 2676)
+++ trunk/README	2005-11-05 00:27:07 UTC (rev 2677)
@@ -50,6 +50,12 @@
 			barrier=1 enables it.
 errors=remount-ro(*)	Remount the filesystem read-only on an error.
 errors=panic		Panic and halt the machine if an error occurs.
-intr		(*)	Allow signals to interrupt cluster operations.
-nointr			Do not allow signals to interrupt cluster
-			operations.
+data=ordered     (*)    All data are forced directly out to the main file
+                        system prior to its metadata being committed to
+                        the journal.
+data=writeback          Data ordering is not preserved, data may be
+                        written into the main file system after its
+                        metadata has been committed to the journal.
+intr             (*)    Allow signals to interrupt cluster operations.
+nointr                  Do not allow signals to interrupt cluster
+                        operations.

Modified: trunk/configure.in
===================================================================
--- trunk/configure.in	2005-11-03 22:39:00 UTC (rev 2676)
+++ trunk/configure.in	2005-11-05 00:27:07 UTC (rev 2677)
@@ -267,6 +267,32 @@
   NEW_FOLLOW_LINK_API=yes, , [void \* (\*follow_link)])
 AC_SUBST(NEW_FOLLOW_LINK_API)
 
+# previously the workaround was always enabled unless you edited
+# fs/ocfs2/Makefile -- reverting before commits, etc; totally unreasonable.
+# This at least has a chance of working without intervention when building
+# against a full source tree.  Someone who cares could replace this with
+# a version check, I guess, though getting the right -mm is irritating.
+GENERIC_DELETE_INODE_TRUNCATES=yes
+if [ test -f $kernelsourcedir/fs/inode.c ]; then
+	AC_MSG_CHECKING(truncate call in generic_delete_inode)
+	first=`awk '(/void generic_delete_inode/) { gdi=1 } 
+		(/\<delete\(inode\);/) {if (gdi == 1) { print "delete" ; exit }}
+		(/truncate_inode_pages\(&inode/) { if (gdi == 1) {print "truncate"; exit}}' $kernelsourcedir/fs/inode.c`
+	case "$first" in
+		delete)
+			AC_MSG_RESULT(no; disabling workaround)
+			GENERIC_DELETE_INODE_TRUNCATES=
+			;;
+		truncate)
+			AC_MSG_RESULT(yes; leaving workaround enabled)
+			;;
+		*)
+			AC_MSG_RESULT(unknown; leaving workaround enabled)
+			;;
+	esac
+fi
+AC_SUBST(GENERIC_DELETE_INODE_TRUNCATES)
+
 # using -include has two advantages:
 #  the source doesn't need to know to include compat headers
 #  the compat header file names don't go through the search path

Modified: trunk/fs/ocfs2/Makefile
===================================================================
--- trunk/fs/ocfs2/Makefile	2005-11-03 22:39:00 UTC (rev 2676)
+++ trunk/fs/ocfs2/Makefile	2005-11-05 00:27:07 UTC (rev 2677)
@@ -30,7 +30,10 @@
 EXTRA_CFLAGS += -DNEW_FOLLOW_LINK_API
 endif
 
+ifdef GENERIC_DELETE_INODE_TRUNCATES
 EXTRA_CFLAGS += -DOCFS2_DELETE_INODE_WORKAROUND
+endif
+
 EXTRA_CFLAGS += -DOCFS2_CDSL
 
 #
@@ -40,7 +43,6 @@
 SAFE_SUBDIRS = cluster dlm
 
 SOURCES =			\
-	aio.c 			\
 	alloc.c 		\
 	aops.c 			\
 	buffer_head_io.c	\
@@ -71,8 +73,8 @@
 	ocfs2_lockid.h		\
 	ocfs2.h			\
 	buffer_head_io.h	\
-	aio.h			\
 	alloc.h			\
+	aops.h			\
 	dcache.h		\
 	dir.h			\
 	dlmglue.h		\

Deleted: trunk/fs/ocfs2/aio.c
===================================================================
--- trunk/fs/ocfs2/aio.c	2005-11-03 22:39:00 UTC (rev 2676)
+++ trunk/fs/ocfs2/aio.c	2005-11-05 00:27:07 UTC (rev 2677)
@@ -1,389 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * aio.c
- *
- * aio read and write
- *
- * Copyright (C) 2002, 2004, 2005 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/fs.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/highmem.h>
-#include <linux/pagemap.h>
-#include <linux/uio.h>
-
-#define MLOG_MASK_PREFIX ML_FILE_IO|ML_AIO
-#include <cluster/masklog.h>
-
-#include "ocfs2.h"
-
-#include "aio.h"
-#include "alloc.h"
-#include "dir.h"
-#include "dlmglue.h"
-#include "extent_map.h"
-#include "file.h"
-#include "sysfile.h"
-#include "inode.h"
-#include "mmap.h"
-#include "suballoc.h"
-
-
-struct ocfs2_kiocb_private {
-	struct ocfs2_kiocb_private	*kp_teardown_next;
-	ocfs2_super			*kp_osb;
-	unsigned			kp_have_alloc_sem:1,
-					kp_have_write_locks:1;
-	struct inode			*kp_inode;
-	struct ocfs2_buffer_lock_ctxt	kp_ctxt;
-	struct ocfs2_write_lock_info	kp_info;
-};
-
-static void okp_teardown(struct ocfs2_kiocb_private *okp)
-{
-	mlog(0, "okp %p\n", okp);
-
-	BUG_ON(okp->kp_inode == NULL);
-
-	if (okp->kp_info.wl_unlock_ctxt)
-		ocfs2_unlock_buffer_inodes(&okp->kp_ctxt);
-	if (okp->kp_have_alloc_sem)
-		up_read(&OCFS2_I(okp->kp_inode)->ip_alloc_sem);
-
-	iput(okp->kp_inode);
-	kfree(okp);
-}
-
-void okp_teardown_from_list(void *data)
-{
-	ocfs2_super *osb = data;
-	struct ocfs2_kiocb_private *okp, *next;
-
-	for (okp = xchg(&osb->osb_okp_teardown_next, NULL); okp != NULL;
-	     okp = next) {
-
-		next = okp->kp_teardown_next;
-		okp_teardown(okp);
-	}
-}
-
-/*
- * This releases the dlm locks we held across an aio operation and frees the
- * space we were tracking them in.
- *
- * While aio operations are in flight they have a vfsmnt reference for the file
- * which prevents unmount.  This dtor gets called *after* that ref is dropped,
- * however, so we have to make sure to account for pending work we have here in
- * the unmount path.  The race starts when aio does its fputs, before it calls
- * dtor which queues work, so just synchronizing with the work queue could miss
- * that first phase.  So unmount first waits for the pending count to drop.
- * Then it has to wait for keventd to finish the work freeing the okps.
- *
- * _dtor can be called from just about any context and lock teardown is
- * anything but interrupt safe.  We used to hand the okps to
- * okp_teardown_from_list with a normal list_head and irq masking lock but we
- * want to avoid masking interrupts so it was shifted to the {cmp,}xchg() and
- * atomic_t.
- *
- * Adding to the singly linked ->next list is only a little tricky.  We have to
- * watch for races between sampling the head to assign ->next in the inserting
- * okp and a new head being written before we point the head to the inserting
- * okp.
- */
-static void ocfs2_ki_dtor(struct kiocb *iocb)
-{
-	struct ocfs2_kiocb_private *next, *okp = iocb->private;
-	ocfs2_super *osb = okp->kp_osb;
-
-	mlog(0, "iocb %p okp %p\n", iocb, okp);
-
-	/* okp_alloc only assigns the iocb->private and ->ki_dtor pointers if
-	 * it was able to alloc the okp and get an inode reference */
-	BUG_ON(okp == NULL);
-	BUG_ON(okp->kp_inode == NULL);
-
-	/* we had better not try to work with this iocb again */
-	iocb->private = NULL;
-
-	 /* once this cmpxchg succeeds the okp can be freed so we have to be
-	  * careful not to deref it when testing success */
-	do {
-		next = osb->osb_okp_teardown_next;
-		okp->kp_teardown_next = next;
-	} while (cmpxchg(&osb->osb_okp_teardown_next, next, okp) != next);
-
-	schedule_work(&osb->osb_okp_teardown_work);
-
-	if (atomic_dec_and_test(&osb->osb_okp_pending))
-		wake_up(&osb->osb_okp_pending_wq);
-}
-
-/* see ocfs2_ki_dtor() */
-void ocfs2_wait_for_okp_destruction(ocfs2_super *osb)
-{
-	/* first wait for okps to enter the work queue */
-	wait_event(osb->osb_okp_pending_wq,
-		   atomic_read(&osb->osb_okp_pending) == 0);
-	/*
-	 * then wait for keventd to finish with all its work, including ours.
-	 *
-	 * XXX this makes me very nervous.  what if our work blocks keventd
-	 * during an unlock and the unlock can only proceed if keventd
-	 * can get to some more work that the dlm might have queued?
-	 * do we push any dlm work to keventd?
-	 */
-	flush_scheduled_work();
-}
-
-/* just to stop sys_io_cancel() from spewing to the console when it sees an
- * iocb without ki_cancel */
-static int ocfs2_ki_cancel(struct kiocb *iocb, struct io_event *ev)
-{
-	mlog(0, "iocb %p\n", iocb);
-	aio_put_req(iocb);
-	return -EAGAIN;
-}
-
-static struct ocfs2_kiocb_private *okp_alloc(struct kiocb *iocb)
-{
-	struct inode *inode = iocb->ki_filp->f_dentry->d_inode;
-	struct ocfs2_kiocb_private *okp;
-	ocfs2_super *osb;
-
-	okp = kcalloc(1, sizeof(*okp), GFP_KERNEL);
-	if (okp == NULL) {
-		okp = ERR_PTR(-ENOMEM);
-		goto out;
-	}
-
-	/* our dtor only gets registerd if we can guarantee that it holds
-	 * a reference to the inode */
-	okp->kp_inode = igrab(inode);
-	if (okp->kp_inode == NULL) {
-		kfree(okp);
-		okp = ERR_PTR(-EINVAL);
-		goto out;
-	}
-	/* unmount syncs with work using this ref before destroying the osb */
-	osb = OCFS2_SB(inode->i_sb);
-	okp->kp_osb = osb;
-
-	iocb->private = okp;
-	iocb->ki_dtor = ocfs2_ki_dtor;
-	iocb->ki_cancel = ocfs2_ki_cancel;
-	INIT_BUFFER_LOCK_CTXT(&okp->kp_ctxt);
-
-	atomic_inc(&osb->osb_okp_pending);
-out:
-	mlog(0, "iocb %p returning %p\n", iocb, okp);
-	return okp;
-}
-
-/* The DLM supports a minimal notion of AIO lock acquiry.  Instead of testing
- * the iocb or current-> like kernel fs/block paths tend to, it takes an
- * explicit callback which it calls when a lock state attempt makes forward
- * progress.  It would be better if it worked with the native
- * kernel AIO mechanics */
-static void ocfs2_aio_kick(int status, unsigned long data)
-{
-	struct kiocb *iocb = (struct kiocb *)data;
-	/* XXX worry about racing with ki_cancel once we set it */
-	mlog(0, "iocb %p\n", iocb);
-	kick_iocb(iocb);
-}
-
-/* this is called as iocb->ki_retry so it is careful to only repeat
- * what is needed */
-ssize_t ocfs2_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count,
-			    loff_t pos)
-{
-	struct ocfs2_kiocb_private *okp = iocb->private;
-	struct file *filp = iocb->ki_filp;
-	struct inode *inode = filp->f_dentry->d_inode;
-	struct ocfs2_backing_inode *target_binode;
-	ssize_t ret, ret2;
-	sigset_t blocked, oldset;
-
-	/*
-	 * The DLM doesn't block waiting for network traffic or anything, it
-	 * modifies state and calls our callback when things have changed.
-	 * However, it still likes to check signals and return ERESTARTSYS.
-	 * The AIO core does not appreciate ERESTARTSYS as its semantics are
-	 * not exactly clear for submission, etc.  So we block signals and
-	 * ensure that the DLM won't notice them.  The caller, particularly
-	 * sys_io_getevents(), will eventually check signals before sleeping
-	 * and so things should still work as expected, if perhaps with
-	 * slightly higher signal delivery latency.
-	 */
-	sigfillset(&blocked);
-	ret = sigprocmask(SIG_BLOCK, &blocked, &oldset);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out;
-	}
-
-	mlog(0, "iocb %p okp %p\n", iocb, okp);
-
-	if (okp == NULL) {
-		okp = okp_alloc(iocb);
-		if (IS_ERR(okp)) {
-			ret = PTR_ERR(okp);
-			mlog_errno(ret);
-			goto setmask;
-		}
-
-		ret = ocfs2_setup_io_locks(inode->i_sb, inode, buf, count,
-					   &okp->kp_ctxt, &target_binode);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto setmask;
-		}
-
-		okp->kp_ctxt.b_cb = ocfs2_aio_kick;
-		okp->kp_ctxt.b_cb_data = (unsigned long)iocb;
-		target_binode->ba_lock_data = filp->f_flags & O_DIRECT ? 0 : 1;
-	}
-
-	/* this might return EIOCBRETRY and we'll come back again to
-	 * continue the locking.  It's harmless to call it once it has
-	 * returned success.. */
-	okp->kp_info.wl_unlock_ctxt = 1; /* re-use the write info path */
-	ret = ocfs2_lock_buffer_inodes(&okp->kp_ctxt, NULL);
-	if (ret < 0) {
-		if (ret != -EIOCBRETRY)
-			mlog_errno(ret);
-		goto setmask;
-	}
-
-	/* hold the ip_alloc_sem across the op */
-	if (!okp->kp_have_alloc_sem) {
-		down_read(&OCFS2_I(inode)->ip_alloc_sem);
-		okp->kp_have_alloc_sem = 1;
-	}
-
-	ret = generic_file_aio_read(iocb, buf, count, pos);
-
-setmask:
-	ret2 = sigprocmask(SIG_SETMASK, &oldset, NULL);
-	if (ret2 < 0) {
-		mlog_errno(ret2);
-		if (ret == 0)
-			ret = ret2;
-	}
-
-out:
-	/* ki_dtor will always be called eventually, no tear down here */
-	mlog(0, "iocb %p returning %lld\n", iocb, (long long)ret);
-	return ret;
-}
-
-/* this is called as iocb->ki_retry so it is careful to only repeat
- * what is needed */
-ssize_t ocfs2_file_aio_write(struct kiocb *iocb, const char __user *buf,
-			     size_t count, loff_t pos)
-{
-	struct ocfs2_kiocb_private *okp = iocb->private;
-	struct file *filp = iocb->ki_filp;
-	struct inode *inode = filp->f_dentry->d_inode;
-	ssize_t ret = 0, ret2;
-	sigset_t blocked, oldset;
-	struct iovec local_iov = { .iov_base = (void __user *)buf,
-				   .iov_len = count };
-
-	/* explained up in ocfs2_file_aio_read() */
-	sigfillset(&blocked);
-	ret = sigprocmask(SIG_BLOCK, &blocked, &oldset);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out;
-	}
-
-	mlog(0, "iocb %p okp %p\n", iocb, okp);
-
-	if (okp == NULL) {
-		okp = okp_alloc(iocb);
-		if (IS_ERR(okp)) {
-			ret = PTR_ERR(okp);
-			mlog_errno(ret);
-			goto up_io;
-		}
-
-		okp->kp_ctxt.b_cb = ocfs2_aio_kick;
-		okp->kp_ctxt.b_cb_data = (unsigned long)iocb;
-	}
-
-	if (!okp->kp_have_write_locks) {
-		ret = ocfs2_write_lock_maybe_extend(filp, buf, count,
-						    &iocb->ki_pos,
-						    &okp->kp_info,
-						    &okp->kp_ctxt);
-		okp->kp_have_write_locks = 1;
-		if (okp->kp_info.wl_extended) {
-			/*
-			 * this is not a particularly nice place to do this but
-			 * extending aio in ocfs2 is not yet a priority.  it
-			 * means that we'll write zeros in the buffered case
-			 * before then over-writing them with the real op.  It
-			 * also sleeps in the aio submission context.
-			 */
-			ocfs2_file_finish_extension(inode,
-						    !okp->kp_info.wl_newsize,
-						    okp->kp_info.wl_do_direct_io);
-			okp->kp_info.wl_extended = 0;
-		}
-		if (ret) {
-			mlog_errno(ret);
-			goto up_io;
-		}
-	}
-
-	/* hold the ip_alloc_sem across the op */
-	if (!okp->kp_have_alloc_sem) {
-		down_read(&OCFS2_I(inode)->ip_alloc_sem);
-		okp->kp_have_alloc_sem = 1;
-	}
-
-up_io:
-	/*
-	 * never hold i_sem when we leave this function, nor when we call
-	 * g_f_a_w().  we've done all extending and inode field updating under
-	 * the i_sem and we hold the ip_alloc_sem for reading across the ops.
-	 * ocfs2_direct_IO calls blockdev_direct_IO with NO_LOCKING.
-	 */
-	if (okp->kp_info.wl_have_i_sem) {
-		up(&inode->i_sem);
-		okp->kp_info.wl_have_i_sem = 0;
-	}
-	if (ret == 0)
-		ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
-						    &iocb->ki_pos);
-
-	ret2 = sigprocmask(SIG_SETMASK, &oldset, NULL);
-	if (ret2 < 0) {
-		mlog_errno(ret2);
-		if (ret == 0)
-			ret = ret2;
-	}
-out:
-	/* ki_dtor will always be called eventually, no tear down here */
-	mlog(0, "iocb %p returning %lld\n", iocb, (long long)ret);
-	return ret;
-}

Deleted: trunk/fs/ocfs2/aio.h
===================================================================
--- trunk/fs/ocfs2/aio.h	2005-11-03 22:39:00 UTC (rev 2676)
+++ trunk/fs/ocfs2/aio.h	2005-11-05 00:27:07 UTC (rev 2677)
@@ -1,37 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * aio.h
- *
- * Function prototypes
- *
- * Copyright (C) 2002, 2004, 2005 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef OCFS2_AIO_H
-#define OCFS2_AIO_H
-
-ssize_t ocfs2_file_aio_write(struct kiocb *iocb, const char __user *buf,
-			     size_t count, loff_t pos);
-ssize_t ocfs2_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count,
-			    loff_t pos);
-
-void okp_teardown_from_list(void *data);
-void ocfs2_wait_for_okp_destruction(ocfs2_super *osb);
-
-#endif /* OCFS2_AIO_H */

Modified: trunk/fs/ocfs2/aops.c
===================================================================
--- trunk/fs/ocfs2/aops.c	2005-11-03 22:39:00 UTC (rev 2676)
+++ trunk/fs/ocfs2/aops.c	2005-11-05 00:27:07 UTC (rev 2677)
@@ -31,11 +31,13 @@
 #include "ocfs2.h"
 
 #include "alloc.h"
+#include "aops.h"
 #include "dlmglue.h"
 #include "extent_map.h"
 #include "file.h"
 #include "inode.h"
 #include "journal.h"
+#include "super.h"
 #include "symlink.h"
 
 #include "buffer_head_io.h"
@@ -131,17 +133,15 @@
 static int ocfs2_get_block(struct inode *inode, sector_t iblock,
 			   struct buffer_head *bh_result, int create)
 {
-	int err = -EIO;
-	u64 vbo = 0;
-	u64 p_blkno;
+	int err = 0;
+	u64 p_blkno, past_eof;
 
 	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
 		   (unsigned long long)iblock, bh_result, create);
 
-	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
+	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
 		mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
 		     inode, inode->i_ino);
-	}
 
 	if (S_ISLNK(inode->i_mode)) {
 		/* this always does I/O for some reason. */
@@ -149,22 +149,17 @@
 		goto bail;
 	}
 
-	vbo = (u64)iblock << inode->i_sb->s_blocksize_bits;
-
 	/* this can happen if another node truncs after our extend! */
 	spin_lock(&OCFS2_I(inode)->ip_lock);
-	if (iblock >=
-	    ocfs2_clusters_to_blocks(inode->i_sb,
-				     OCFS2_I(inode)->ip_clusters)) {
-		spin_unlock(&OCFS2_I(inode)->ip_lock);
+	if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
+					       OCFS2_I(inode)->ip_clusters))
 		err = -EIO;
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+	if (err)
 		goto bail;
-	}
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
 
 	err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
 					  NULL);
-
 	if (err) {
 		mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
 		     "%"MLFu64", NULL)\n", err, inode,
@@ -174,8 +169,6 @@
 
 	map_bh(bh_result, inode->i_sb, p_blkno);
 
-	err = 0;
-
 	if (bh_result->b_blocknr == 0) {
 		err = -EIO;
 		mlog(ML_ERROR, "iblock = %llu p_blkno = %"MLFu64" "
@@ -183,22 +176,11 @@
 		     p_blkno, OCFS2_I(inode)->ip_blkno);
 	}
 
-	if (vbo < OCFS2_I(inode)->ip_mmu_private)
-		goto bail;
-	if (!create)
-		goto bail;
-	if (vbo != OCFS2_I(inode)->ip_mmu_private) {
-		mlog(ML_ERROR, "Uh-oh, vbo = %"MLFi64", i_size = %lld, "
-		     "mmu = %lld, inode = %"MLFu64"\n", vbo,
-		     i_size_read(inode), OCFS2_I(inode)->ip_mmu_private,
-		     OCFS2_I(inode)->ip_blkno);
-		BUG();
-		err = -EIO;
-		goto bail;
-	}
+	past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
+	mlog(0, "Inode %lu, past_eof = %"MLFu64"\n", inode->i_ino, past_eof);
 
-	set_buffer_new(bh_result);
-	OCFS2_I(inode)->ip_mmu_private += inode->i_sb->s_blocksize;
+	if (create && (iblock >= past_eof))
+		set_buffer_new(bh_result);
 
 bail:
 	if (err < 0)
@@ -210,17 +192,75 @@
 
 static int ocfs2_readpage(struct file *file, struct page *page)
 {
-	int ret;
+	struct inode *inode = page->mapping->host;
+	loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
+	int ret, unlock = 1;
 
 	mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
 
+	ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
+	if (ret != 0) {
+		if (ret == AOP_TRUNCATED_PAGE)
+			unlock = 0;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	/*
+	 * i_size might have just been updated as we grabed the meta lock.  We
+	 * might now be discovering a truncate that hit on another node.
+	 * block_read_full_page->get_block freaks out if it is asked to read
+	 * beyond the end of a file, so we check here.  Callers
+	 * (generic_file_read, fault->nopage) are clever enough to check i_size
+	 * and notice that the page they just read isn't needed.
+	 *
+	 * XXX sys_readahead() seems to get that wrong?
+	 */
+	if (start >= i_size_read(inode)) {
+		char *addr = kmap(page);
+		memset(addr, 0, PAGE_SIZE);
+		flush_dcache_page(page);
+		kunmap(page);
+		SetPageUptodate(page);
+		ret = 0;
+		goto out_alloc;
+	}
+
+	ret = ocfs2_data_lock_with_page(inode, 0, page);
+	if (ret != 0) {
+		if (ret == AOP_TRUNCATED_PAGE)
+			unlock = 0;
+		mlog_errno(ret);
+		goto out_alloc;
+	}
+
 	ret = block_read_full_page(page, ocfs2_get_block);
+	unlock = 0;
 
+	ocfs2_data_unlock(inode, 0);
+out_alloc:
+	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+	ocfs2_meta_unlock(inode, 0);
+out:
+	if (unlock)
+		unlock_page(page);
 	mlog_exit(ret);
-
 	return ret;
 }
 
+/* Note: Because we don't support holes, our allocation has
+ * already happened (allocation writes zeros to the file data)
+ * so we don't have to worry about ordered writes in
+ * ocfs2_writepage.
+ *
+ * ->writepage is called during the process of invalidating the page cache
+ * during blocked lock processing.  It can't block on any cluster locks
+ * to during block mapping.  It's relying on the fact that the block
+ * mapping can't have disappeared under the dirty pages that it is
+ * being asked to write back.
+ */
 static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
 {
 	int ret;
@@ -234,32 +274,206 @@
 	return ret;
 }
 
-static int ocfs2_prepare_write(struct file *file, struct page *page,
-		unsigned from, unsigned to)
+/*
+ * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called
+ * from loopback.  It must be able to perform its own locking around
+ * ocfs2_get_block().
+ */
+int ocfs2_prepare_write(struct file *file, struct page *page,
+			unsigned from, unsigned to)
 {
+	struct inode *inode = page->mapping->host;
 	int ret;
 
 	mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
 
-	ret = cont_prepare_write(page, from, to, ocfs2_get_block,
-		&(OCFS2_I(page->mapping->host)->ip_mmu_private));
+	ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
+	if (ret != 0) {
+		mlog_errno(ret);
+		goto out;
+	}
 
+	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	ret = block_prepare_write(page, from, to, ocfs2_get_block);
+
+	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	ocfs2_meta_unlock(inode, 0);
+out:
 	mlog_exit(ret);
+	return ret;
+}
 
+/* Taken from ext3. We don't necessarily need the full blown
+ * functionality yet, but IMHO it's better to cut and paste the whole
+ * thing so we can avoid introducing our own bugs (and easily pick up
+ * their fixes when they happen) --Mark */
+static int walk_page_buffers(	handle_t *handle,
+				struct buffer_head *head,
+				unsigned from,
+				unsigned to,
+				int *partial,
+				int (*fn)(	handle_t *handle,
+						struct buffer_head *bh))
+{
+	struct buffer_head *bh;
+	unsigned block_start, block_end;
+	unsigned blocksize = head->b_size;
+	int err, ret = 0;
+	struct buffer_head *next;
+
+	for (	bh = head, block_start = 0;
+		ret == 0 && (bh != head || !block_start);
+	    	block_start = block_end, bh = next)
+	{
+		next = bh->b_this_page;
+		block_end = block_start + blocksize;
+		if (block_end <= from || block_start >= to) {
+			if (partial && !buffer_uptodate(bh))
+				*partial = 1;
+			continue;
+		}
+		err = (*fn)(handle, bh);
+		if (!ret)
+			ret = err;
+	}
 	return ret;
 }
 
+ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode,
+						  struct page *page,
+						  unsigned from,
+						  unsigned to)
+{
+	ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	ocfs2_journal_handle *handle = NULL;
+	int ret = 0;
+
+	handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
+	if (!handle) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (ocfs2_should_order_data(inode)) {
+		ret = walk_page_buffers(handle->k_handle,
+					page_buffers(page),
+					from, to, NULL,
+					ocfs2_journal_dirty_data);
+		if (ret < 0) 
+			mlog_errno(ret);
+	}
+out:
+	if (ret) {
+		if (handle)
+			ocfs2_commit_trans(handle);
+		handle = ERR_PTR(ret);
+	}
+	return handle;
+}
+
 static int ocfs2_commit_write(struct file *file, struct page *page,
 			      unsigned from, unsigned to)
 {
-	int ret;
+	int ret, extending = 0, locklevel = 0;
+	loff_t new_i_size;
+	struct buffer_head *di_bh = NULL;
+	struct inode *inode = page->mapping->host;
+	ocfs2_journal_handle *handle = NULL;
 
 	mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
 
+	/* NOTE: ocfs2_file_aio_write has ensured that it's safe for
+	 * us to sample inode->i_size here without the metadata lock:
+	 *
+	 * 1) We're currently holding the inode alloc lock, so no
+	 *    nodes can change it underneath us.
+	 *
+	 * 2) We've had to take the metadata lock at least once
+	 *    already to check for extending writes, hence insuring
+	 *    that our current copy is also up to date.
+	 */
+	new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+	if (new_i_size > i_size_read(inode)) {
+		extending = 1;
+		locklevel = 1;
+	}
+
+	ret = ocfs2_meta_lock_with_page(inode, NULL, &di_bh, locklevel, page);
+	if (ret != 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_data_lock_with_page(inode, 1, page);
+	if (ret != 0) {
+		mlog_errno(ret);
+		goto out_unlock_meta;
+	}
+
+	if (extending) {
+		handle = ocfs2_start_walk_page_trans(inode, page, from, to);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			handle = NULL;
+			goto out_unlock_data;
+		}
+
+		/* Mark our buffer early. We'd rather catch this error up here
+		 * as opposed to after a successful commit_write which would
+		 * require us to set back inode->i_size. */
+		ret = ocfs2_journal_access(handle, inode, di_bh,
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+	}
+
+	/* might update i_size */
 	ret = generic_commit_write(file, page, from, to);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
 
+	if (extending) {
+		loff_t size = (u64) i_size_read(inode);
+		ocfs2_dinode *di = (ocfs2_dinode *)di_bh->b_data;
+
+		/* ocfs2_mark_inode_dirty is too heavy to use here. */
+		inode->i_blocks = ocfs2_align_bytes_to_sectors(size);
+		inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+
+		di->i_size = cpu_to_le64(size);
+		di->i_ctime = di->i_mtime = 
+				cpu_to_le64(inode->i_mtime.tv_sec);
+		di->i_ctime_nsec = di->i_mtime_nsec = 
+				cpu_to_le32(inode->i_mtime.tv_nsec);
+
+		ret = ocfs2_journal_dirty(handle, di_bh);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+	}
+
+	BUG_ON(extending && (i_size_read(inode) != new_i_size));
+
+out_commit:
+	if (handle)
+		ocfs2_commit_trans(handle);
+out_unlock_data:
+	ocfs2_data_unlock(inode, 1);
+out_unlock_meta:
+	ocfs2_meta_unlock(inode, locklevel);
+out:
+	if (di_bh)
+		brelse(di_bh);
+
 	mlog_exit(ret);
-
 	return ret;
 }
 
@@ -377,6 +591,26 @@
 	return ret;
 }
 
+/* 
+ * ocfs2_dio_end_io is called by the dio core when a dio is finished.  We're
+ * particularly interested in the aio/dio case.  Like the core uses
+ * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
+ * truncation on another.
+ */
+static void ocfs2_dio_end_io(struct kiocb *iocb,
+			     loff_t offset,
+			     ssize_t bytes,
+			     void *private)
+{
+	struct inode *inode = iocb->ki_filp->f_dentry->d_inode;
+
+	/* this io's submitter should not have unlocked this before we could */
+	BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
+	ocfs2_iocb_clear_rw_locked(iocb);
+	up_read(&inode->i_alloc_sem);
+	ocfs2_rw_unlock(inode, 0);
+}
+
 static ssize_t ocfs2_direct_IO(int rw,
 			       struct kiocb *iocb,
 			       const struct iovec *iov,
@@ -388,13 +622,11 @@
 	int ret;
 
 	mlog_entry_void();
-
-	/* blockdev_direct_IO checks alignment for us, using */
 	ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
 					    inode->i_sb->s_bdev, iov, offset,
-					    nr_segs, ocfs2_direct_IO_get_blocks,
-					    NULL);
-
+					    nr_segs, 
+					    ocfs2_direct_IO_get_blocks,
+					    ocfs2_dio_end_io);
 	mlog_exit(ret);
 	return ret;
 }

Copied: trunk/fs/ocfs2/aops.h (from rev 2676, branches/locking-changes/fs/ocfs2/aops.h)

Modified: trunk/fs/ocfs2/cluster/masklog.h
===================================================================
--- trunk/fs/ocfs2/cluster/masklog.h	2005-11-03 22:39:00 UTC (rev 2676)
+++ trunk/fs/ocfs2/cluster/masklog.h	2005-11-05 00:27:07 UTC (rev 2677)
@@ -198,8 +198,10 @@
 } while (0)
 
 #define mlog_errno(st) do {						\
-	if ((st) != -ERESTARTSYS && (st) != -EINTR)			\
-		mlog(ML_ERROR, "status = %lld\n", (long long)(st));	\
+	int _st = (st);							\
+	if (_st != -ERESTARTSYS && _st != -EINTR &&			\
+	    _st != AOP_TRUNCATED_PAGE)					\
+		mlog(ML_ERROR, "status = %lld\n", (long long)_st);	\
 } while (0)
 
 #define mlog_entry(fmt, args...) do {					\

Modified: trunk/fs/ocfs2/cluster/tcp_internal.h
===================================================================
--- trunk/fs/ocfs2/cluster/tcp_internal.h	2005-11-03 22:39:00 UTC (rev 2676)
+++ trunk/fs/ocfs2/cluster/tcp_internal.h	2005-11-05 00:27:07 UTC (rev 2677)
@@ -38,7 +38,17 @@
 #define O2NET_KEEPALIVE_DELAY_SECS	5
 #define O2NET_IDLE_TIMEOUT_SECS		10
 
-#define O2NET_PROTOCOL_VERSION 1ULL
+/* 
+ * This version number represents quite a lot, unfortunately.  It not
+ * only represents the raw network message protocol on the wire but also
+ * locking semantics of the file system using the protocol.  It should 
+ * be somewhere else, I'm sure, but right now it isn't.
+ *
+ * New in version 2:
+ * 	- full 64 bit i_size in the metadata lock lvbs
+ * 	- introduction of "rw" lock and pushing meta/data locking down
+ */
+#define O2NET_PROTOCOL_VERSION 2ULL
 struct o2net_handshake {
 	__be64	protocol_version;
 	__be64	connector_id;

Modified: trunk/fs/ocfs2/dir.c
===================================================================
--- trunk/fs/ocfs2/dir.c	2005-11-03 22:39:00 UTC (rev 2676)
+++ trunk/fs/ocfs2/dir.c	2005-11-05 00:27:07 UTC (rev 2677)
@@ -356,9 +356,9 @@
 	spin_unlock(&OCFS2_I(dir)->ip_lock);
 
 	if (extend) {
-		status = ocfs2_extend_allocation(OCFS2_SB(sb), dir, 1,
-						 parent_fe_bh, handle,
-						 data_ac, meta_ac, NULL);
+		status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, 1,
+						    parent_fe_bh, handle,
+						    data_ac, meta_ac, NULL);
 		BUG_ON(status == -EAGAIN);
 		if (status < 0) {
 			mlog_errno(status);

Modified: trunk/fs/ocfs2/dlmglue.c
===================================================================
--- trunk/fs/ocfs2/dlmglue.c	2005-11-03 22:39:00 UTC (rev 2676)
+++ trunk/fs/ocfs2/dlmglue.c	2005-11-05 00:27:07 UTC (rev 2677)
@@ -30,6 +30,7 @@
 #include <linux/smp_lock.h>
 #include <linux/crc32.h>
 #include <linux/kthread.h>
+#include <linux/pagemap.h>
 
 #include <cluster/heartbeat.h>
 #include <cluster/nodemanager.h>
@@ -55,6 +56,14 @@
 
 #include "buffer_head_io.h"
 
+struct ocfs2_mask_waiter {
+	struct list_head	mw_item;
+	int			mw_status;
+	struct completion	mw_complete;
+	unsigned long		mw_mask;
+	unsigned long		mw_goal;
+};
+
 static void ocfs2_inode_ast_func(void *opaque);
 static void ocfs2_inode_bast_func(void *opaque,
 				  int level);
@@ -74,6 +83,8 @@
 			      int *requeue);
 static int ocfs2_unblock_data(struct ocfs2_lock_res *lockres,
 			      int *requeue);
+static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres,
+			      int *requeue);
 static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres,
 				  int *requeue);
 typedef void (ocfs2_convert_worker_t)(struct ocfs2_lock_res *, int);
@@ -89,6 +100,13 @@
 	int  (*unblock)(struct ocfs2_lock_res *, int *);
 };
 
+static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = {
+	.ast		= ocfs2_inode_ast_func,
+	.bast		= ocfs2_inode_bast_func,
+	.unlock_ast	= ocfs2_unlock_ast_func,
+	.unblock	= ocfs2_unblock_inode_lock,
+};
+
 static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = {
 	.ast		= ocfs2_inode_ast_func,
 	.bast		= ocfs2_inode_bast_func,
@@ -123,7 +141,8 @@
 static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
 {
 	return lockres->l_type == OCFS2_LOCK_TYPE_META ||
-		lockres->l_type == OCFS2_LOCK_TYPE_DATA;
+		lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
+		lockres->l_type == OCFS2_LOCK_TYPE_RW;
 }
 
 static inline int ocfs2_is_super_lock(struct ocfs2_lock_res *lockres)
@@ -154,15 +173,9 @@
 static int ocfs2_lock_create(ocfs2_super *osb,
 			     struct ocfs2_lock_res *lockres,
 			     int level,
-			     int flags);
+			     int dlm_flags);
 static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
 						     int wanted);
-static int ocfs2_cluster_lock(ocfs2_super *osb,
-			      struct ocfs2_lock_res *lockres,
-			      int level,
-			      int lkm_flags,
-			      ocfs2_lock_callback cb,
-			      unsigned long cb_data);
 static void ocfs2_cluster_unlock(ocfs2_super *osb,
 				 struct ocfs2_lock_res *lockres,
 				 int level);
@@ -185,16 +198,26 @@
 				  struct buffer_head **bh);
 static void ocfs2_drop_osb_locks(ocfs2_super *osb);
 static inline int ocfs2_highest_compat_lock_level(int level);
-static int __ocfs2_downconvert_lock(ocfs2_super *osb,
-				    struct ocfs2_lock_res *lockres,
-				    int new_level,
-				    int lvb);
-static int __ocfs2_cancel_convert(ocfs2_super *osb,
-				  struct ocfs2_lock_res *lockres);
 static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode,
 						  struct ocfs2_lock_res *lockres,
 						  int new_level);
 
+static char *ocfs2_lock_type_strings[] = {
+	[OCFS2_LOCK_TYPE_META] = "Meta",
+	[OCFS2_LOCK_TYPE_DATA] = "Data",
+	[OCFS2_LOCK_TYPE_SUPER] = "Super",
+	[OCFS2_LOCK_TYPE_RENAME] = "Rename",
+	/* Need to differentiate from [R]ename.. serializing writes is the
+	 * important job it does, anyway. */
+	[OCFS2_LOCK_TYPE_RW] = "Write/Read",
+};
+
+static char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
+{
+	mlog_bug_on_msg(type >= OCFS2_NUM_LOCK_TYPES, "%d\n", type);
+	return ocfs2_lock_type_strings[type];
+}
+
 static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
 				  u64 blkno,
 				  u32 generation,
@@ -246,7 +269,7 @@
 	spin_lock_init(&res->l_lock);
 	init_waitqueue_head(&res->l_event);
 	INIT_LIST_HEAD(&res->l_blocked_list);
-	INIT_LIST_HEAD(&res->l_flag_cb_list);
+	INIT_LIST_HEAD(&res->l_mask_waiters);
 }
 
 void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
@@ -255,14 +278,22 @@
 {
 	struct ocfs2_lock_res_ops *ops;
 
-	BUG_ON(type != OCFS2_LOCK_TYPE_META &&
-	       type != OCFS2_LOCK_TYPE_DATA);
+	switch(type) {
+		case OCFS2_LOCK_TYPE_RW:
+			ops = &ocfs2_inode_rw_lops;
+			break;
+		case OCFS2_LOCK_TYPE_META:
+			ops = &ocfs2_inode_meta_lops;
+			break;
+		case OCFS2_LOCK_TYPE_DATA:
+			ops = &ocfs2_inode_data_lops;
+			break;
+		default:
+			mlog_bug_on_msg(1, "type: %d\n", type);
+			ops = NULL; /* thanks, gcc */
+			break;
+	};
 
-	if (type == OCFS2_LOCK_TYPE_META)
-		ops = &ocfs2_inode_meta_lops;
-	else
-		ops = &ocfs2_inode_data_lops;
-
 	ocfs2_lock_res_init_common(res, type, OCFS2_I(inode)->ip_blkno,
 				   inode->i_generation, ops, inode);
 }
@@ -298,8 +329,8 @@
 	mlog_bug_on_msg(!list_empty(&res->l_blocked_list),
 			"Lockres %s is on the blocked list\n",
 			res->l_name);
-	mlog_bug_on_msg(!list_empty(&res->l_flag_cb_list),
-			"Lockres %s has flag callbacks pending\n",
+	mlog_bug_on_msg(!list_empty(&res->l_mask_waiters),
+			"Lockres %s has mask waiters pending\n",
 			res->l_name);
 	mlog_bug_on_msg(spin_is_locked(&res->l_lock),
 			"Lockres %s is locked\n",
@@ -375,35 +406,32 @@
 	return new_level;
 }
 
-/* XXX must be called with lockres->l_lock held */
-static void lockres_set_flags(struct ocfs2_lock_res *lockres, unsigned long newflags)
+static void lockres_set_flags(struct ocfs2_lock_res *lockres,
+			      unsigned long newflags)
 {
 	struct list_head *pos, *tmp;
-	struct ocfs2_lockres_flag_callback *fcb;
+	struct ocfs2_mask_waiter *mw;
 
-	assert_spin_locked(&lockres->l_lock);
+ 	assert_spin_locked(&lockres->l_lock);
 
 	lockres->l_flags = newflags;
 
-	list_for_each_safe(pos, tmp, &lockres->l_flag_cb_list) {
-		fcb = list_entry(pos, struct ocfs2_lockres_flag_callback,
-				 fc_lockres_item);
-		if ((lockres->l_flags & fcb->fc_flag_mask) !=
-		    fcb->fc_flag_goal)
+	list_for_each_safe(pos, tmp, &lockres->l_mask_waiters) {
+		mw = list_entry(pos, struct ocfs2_mask_waiter, mw_item);
+		if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
 			continue;
 
-		list_del_init(&fcb->fc_lockres_item);
-		fcb->fc_cb(0, fcb->fc_data);
-		if (fcb->fc_free_once_called)
-			kfree(fcb);
+		list_del_init(&mw->mw_item);
+		mw->mw_status = 0;
+		complete(&mw->mw_complete);
 	}
 }
-
 static void lockres_or_flags(struct ocfs2_lock_res *lockres, unsigned long or)
 {
 	lockres_set_flags(lockres, lockres->l_flags | or);
 }
-static void lockres_clear_flags(struct ocfs2_lock_res *lockres, unsigned long clear)
+static void lockres_clear_flags(struct ocfs2_lock_res *lockres,
+				unsigned long clear)
 {
 	lockres_set_flags(lockres, lockres->l_flags & ~clear);
 }
@@ -471,6 +499,7 @@
 	struct ocfs2_lock_res *lockres = opaque;
 	struct inode *inode;
 	struct dlm_lockstatus *lksb;
+	unsigned long flags;
 
 	mlog_entry_void();
 
@@ -478,18 +507,18 @@
 
 	mlog(0, "AST fired for inode %"MLFu64", l_action = %u, type = %s\n",
 	     OCFS2_I(inode)->ip_blkno, lockres->l_action,
-	     (lockres->l_type == OCFS2_LOCK_TYPE_META) ? "Meta" : "Data");
+	     ocfs2_lock_type_string(lockres->l_type));
 
 	BUG_ON(!ocfs2_is_inode_lock(lockres));
 
-	spin_lock(&lockres->l_lock);
+	spin_lock_irqsave(&lockres->l_lock, flags);
 
 	lksb = &(lockres->l_lksb);
 	if (lksb->status != DLM_NORMAL) {
 		mlog(ML_ERROR, "ocfs2_inode_ast_func: lksb status value of %u "
 		     "on inode %"MLFu64"\n", lksb->status,
 		     OCFS2_I(inode)->ip_blkno);
-		spin_unlock(&lockres->l_lock);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
 		mlog_exit_void();
 		return;
 	}
@@ -514,14 +543,14 @@
 		BUG();
 	}
 
-	/* data locking ignores refresh flag for now. */
-	if (lockres->l_type == OCFS2_LOCK_TYPE_DATA)
+	/* data and rw locking ignores refresh flag for now. */
+	if (lockres->l_type != OCFS2_LOCK_TYPE_META)
 		lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
 
 	/* set it to something invalid so if we get called again we
 	 * can catch it. */
 	lockres->l_action = OCFS2_AST_INVALID;
-	spin_unlock(&lockres->l_lock);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
 	wake_up(&lockres->l_event);
 
 	mlog_exit_void();
@@ -558,16 +587,17 @@
 				    int level)
 {
 	int needs_downconvert;
+	unsigned long flags;
 
 	mlog_entry_void();
 
 	BUG_ON(level <= LKM_NLMODE);
 
-	spin_lock(&lockres->l_lock);
+	spin_lock_irqsave(&lockres->l_lock, flags);
 	needs_downconvert = ocfs2_generic_handle_bast(lockres, level);
 	if (needs_downconvert)
 		ocfs2_schedule_blocked_lock(osb, lockres);
-	spin_unlock(&lockres->l_lock);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
 
 	ocfs2_kick_vote_thread(osb);
 
@@ -591,7 +621,7 @@
 	mlog(0, "BAST fired for inode %"MLFu64", blocking = %d, level = %d "
 	     "type = %s\n", OCFS2_I(inode)->ip_blkno, level,
 	     lockres->l_level,
-	     (lockres->l_type == OCFS2_LOCK_TYPE_META) ? "Meta" : "Data");
+	     ocfs2_lock_type_string(lockres->l_type));
 
 	ocfs2_generic_bast_func(osb, lockres, level);
 
@@ -602,13 +632,14 @@
 				   int ignore_refresh)
 {
 	struct dlm_lockstatus *lksb = &lockres->l_lksb;
+	unsigned long flags;
 
-	spin_lock(&lockres->l_lock);
+	spin_lock_irqsave(&lockres->l_lock, flags);
 
 	if (lksb->status != DLM_NORMAL) {
 		mlog(ML_ERROR, "lockres %s: lksb status value of %u!\n",
 		     lockres->l_name, lksb->status);
-		spin_unlock(&lockres->l_lock);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
 		return;
 	}
 
@@ -632,7 +663,7 @@
 	/* set it to something invalid so if we get called again we
 	 * can catch it. */
 	lockres->l_action = OCFS2_AST_INVALID;
-	spin_unlock(&lockres->l_lock);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
 
 	wake_up(&lockres->l_event);
 }
@@ -702,14 +733,16 @@
 static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
 						int convert)
 {
+	unsigned long flags;
+
 	mlog_entry_void();
-	spin_lock(&lockres->l_lock);
+	spin_lock_irqsave(&lockres->l_lock, flags);
 	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
 	if (convert)
 		lockres->l_action = OCFS2_AST_INVALID;
 	else
 		lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
-	spin_unlock(&lockres->l_lock);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
 
 	wake_up(&lockres->l_event);
 	mlog_exit_void();
@@ -722,32 +755,33 @@
 static int ocfs2_lock_create(ocfs2_super *osb,
 			     struct ocfs2_lock_res *lockres,
 			     int level,
-			     int flags)
+			     int dlm_flags)
 {
 	int ret = 0;
 	enum dlm_status status;
+	unsigned long flags;
 
 	mlog_entry_void();
 
 	mlog(0, "lock %s, level = %d, flags = %d\n", lockres->l_name, level,
-	     flags);
+	     dlm_flags);
 
-	spin_lock(&lockres->l_lock);
+	spin_lock_irqsave(&lockres->l_lock, flags);
 	if ((lockres->l_flags & OCFS2_LOCK_ATTACHED) ||
 	    (lockres->l_flags & OCFS2_LOCK_BUSY)) {
-		spin_unlock(&lockres->l_lock);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
 		goto bail;
 	}
 
 	lockres->l_action = OCFS2_AST_ATTACH;
 	lockres->l_requested = level;
 	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
-	spin_unlock(&lockres->l_lock);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
 
 	status = dlmlock(osb->dlm,
 			 level,
 			 &lockres->l_lksb,
-			 flags,
+			 dlm_flags,
 			 lockres->l_name,
 			 lockres->l_ops->ast,
 			 lockres,
@@ -768,11 +802,12 @@
 static inline int ocfs2_check_wait_flag(struct ocfs2_lock_res *lockres,
 					int flag)
 {
+	unsigned long flags;
 	int ret;
 
-	spin_lock(&lockres->l_lock);
+	spin_lock_irqsave(&lockres->l_lock, flags);
 	ret = lockres->l_flags & flag;
-	spin_unlock(&lockres->l_lock);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
 
 	return ret;
 }
@@ -791,20 +826,6 @@
 		   !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_REFRESHING));
 }
 
-static void lockres_add_flag_callback(struct ocfs2_lock_res *lockres,
-				      struct ocfs2_lockres_flag_callback *fcb,
-				      unsigned long mask, unsigned long goal)
-{
-	BUG_ON(!list_empty(&fcb->fc_lockres_item));
-	BUG_ON(fcb->fc_cb == NULL);
-
-	assert_spin_locked(&lockres->l_lock);
-
-	list_add_tail(&fcb->fc_lockres_item, &lockres->l_flag_cb_list);
-	fcb->fc_flag_mask = mask;
-	fcb->fc_flag_goal = goal;
-}
-
 /* predict what lock level we'll be dropping down to on behalf
  * of another node, and return true if the currently wanted
  * level will be compatible with it. */
@@ -816,98 +837,85 @@
 	return wanted <= ocfs2_highest_compat_lock_level(lockres->l_blocking);
 }
 
-/* these are generic and could be used elsewhere */
-struct ocfs2_status_completion {
-	int			sc_status;
-	struct completion	sc_complete;
-};
-
-static void ocfs2_status_completion_cb(int rc, unsigned long data)
+static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw)
 {
-	struct ocfs2_status_completion *sc;
-
-	sc = (struct ocfs2_status_completion *)data;
-	sc->sc_status = rc;
-	complete(&sc->sc_complete);
+	INIT_LIST_HEAD(&mw->mw_item);
+	init_completion(&mw->mw_complete);
 }
 
-static int ocfs2_wait_for_status_completion(struct ocfs2_status_completion *sc)
+static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw)
 {
-	wait_for_completion(&sc->sc_complete);
+	wait_for_completion(&mw->mw_complete);
 	/* Re-arm the completion in case we want to wait on it again */
-	INIT_COMPLETION(sc->sc_complete);
-	return sc->sc_status;
+	INIT_COMPLETION(mw->mw_complete);
+	return mw->mw_status;
 }
 
-static void ocfs2_init_fcb(struct ocfs2_lockres_flag_callback *fcb,
-			   ocfs2_lock_callback cb,
-			   unsigned long cb_data,
-			   int stack_allocated)
+static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres,
+				    struct ocfs2_mask_waiter *mw,
+				    unsigned long mask,
+				    unsigned long goal)
 {
-	fcb->fc_cb = cb;
-	fcb->fc_data = cb_data;
-	fcb->fc_free_once_called = !stack_allocated;
-	INIT_LIST_HEAD(&fcb->fc_lockres_item);
+	BUG_ON(!list_empty(&mw->mw_item));
+
+	assert_spin_locked(&lockres->l_lock);
+
+	list_add_tail(&mw->mw_item, &lockres->l_mask_waiters);
+	mw->mw_mask = mask;
+	mw->mw_goal = goal;
 }
 
-/* Init a stack allocated FCB and an ocfs2_status_completion together. */
-static void ocfs2_init_completion_fcb(struct ocfs2_lockres_flag_callback *fcb,
-				      struct ocfs2_status_completion *sc)
+/* returns 0 if the mw that was removed was already satisfied, -EBUSY
+ * if the mask still hadn't reached its goal */
+static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
+				      struct ocfs2_mask_waiter *mw)
 {
-	init_completion(&sc->sc_complete);
-	ocfs2_init_fcb(fcb, ocfs2_status_completion_cb, (unsigned long) sc, 1);
+	unsigned long flags;
+	int ret = 0;
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	if (!list_empty(&mw->mw_item)) {
+		if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
+			ret = -EBUSY;
+
+		list_del_init(&mw->mw_item);
+		init_completion(&mw->mw_complete);
+	}
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	return ret;
+
 }
 
 static int ocfs2_cluster_lock(ocfs2_super *osb,
 			      struct ocfs2_lock_res *lockres,
 			      int level,
 			      int lkm_flags,
-			      ocfs2_lock_callback cb,
-			      unsigned long cb_data)
+			      int arg_flags)
 {
-	struct ocfs2_lockres_flag_callback sync_fcb, *fcb;
-	struct ocfs2_status_completion sc;
+	struct ocfs2_mask_waiter mw;
 	enum dlm_status status;
-	int ret;
-	int catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR);
-	int sync = 1;
+	int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR);
+	int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */
+	unsigned long flags;
 
 	mlog_entry_void();
 
-	if (cb != NULL) {
-		fcb = kmalloc(sizeof(*fcb), GFP_NOFS);
-		if (fcb == NULL) {
-			ret = -ENOMEM;
-			goto out;
-		}
+	ocfs2_init_mask_waiter(&mw);
 
-		ocfs2_init_fcb(fcb, cb, cb_data, 0);
+again:
+	wait = 0;
 
-		/* A callback passed in means we'll assume async
-		 * behavior - no waiting on dlm operations will be
-		 * done here and the allocated fcb will call the
-		 * callback when done. */
-		sync = 0;
-	} else {
-		/* No callback passed which means the caller wants
-		 * synchronous behavior - we avoid kmalloc and use a
-		 * stack allocated fcb for this. The status completion
-		 * helpers defined above come in handy here. */
-		fcb = &sync_fcb;
-		ocfs2_init_completion_fcb(fcb, &sc);
-	}
-
-again:
 	if (catch_signals && signal_pending(current)) {
 		ret = -ERESTARTSYS;
 		goto out;
 	}
 
-	spin_lock(&lockres->l_lock);
+	spin_lock_irqsave(&lockres->l_lock, flags);
 
 	mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING,
-			"Cluster lock called on freeing lockres %s! flags 0x%lx\n",
-			lockres->l_name, lockres->l_flags);
+			"Cluster lock called on freeing lockres %s! flags "
+			"0x%lx\n", lockres->l_name, lockres->l_flags);
 
 	/* We only compare against the currently granted level
 	 * here. If the lock is blocked waiting on a downconvert,
@@ -916,14 +924,14 @@
 	    level > lockres->l_level) {
 		/* is someone sitting in dlm_lock? If so, wait on
 		 * them. */
-		lockres_add_flag_callback(lockres, fcb, OCFS2_LOCK_BUSY, 0);
-		ret = -EIOCBRETRY;
+		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
+		wait = 1;
 		goto unlock;
 	}
 
 	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
 		/* lock has not been created yet. */
-		spin_unlock(&lockres->l_lock);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
 
 		ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
 		if (ret < 0) {
@@ -937,8 +945,8 @@
 	    !ocfs2_may_continue_on_blocked_lock(lockres, level)) {
 		/* is the lock is currently blocked on behalf of
 		 * another node */
-		lockres_add_flag_callback(lockres, fcb, OCFS2_LOCK_BLOCKED, 0);
-		ret = -EIOCBRETRY;
+		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BLOCKED, 0);
+		wait = 1;
 		goto unlock;
 	}
 
@@ -950,7 +958,7 @@
 		lockres->l_action = OCFS2_AST_CONVERT;
 		lockres->l_requested = level;
 		lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
-		spin_unlock(&lockres->l_lock);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
 
 		BUG_ON(level == LKM_IVMODE);
 		BUG_ON(level == LKM_NLMODE);
@@ -996,26 +1004,31 @@
 
 	ret = 0;
 unlock:
-	spin_unlock(&lockres->l_lock);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
 out:
-	/* Non-async callers will always wait here for dlm operations
-	 * to complete. We must be careful to re-initialize the
-	 * completion before looping back. */
-	if (ret == -EIOCBRETRY && sync) {
-		ret = ocfs2_wait_for_status_completion(&sc);
+	/*
+	 * This is helping work around a lock inversion between the page lock
+	 * and dlm locks.  One path holds the page lock while calling aops
+	 * which block acquiring dlm locks.  The voting thread holds dlm
+	 * locks while acquiring page locks while down converting data locks.
+	 * This block is helping an aop path notice the inversion and back
+	 * off to unlock its page lock before trying the dlm lock again.
+	 */
+	if (wait && arg_flags & OCFS2_LOCK_NONBLOCK &&
+	    mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) {
+		wait = 0;
+		if (lockres_remove_mask_waiter(lockres, &mw))
+			ret = -EAGAIN;
+		else
+			goto again;
+	}
+	if (wait) {
+		ret = ocfs2_wait_for_mask(&mw);
 		if (ret == 0)
 			goto again;
 		mlog_errno(ret);
 	}
 
-	/* Only free the async fcb on error. */
-	if (ret && ret != -EIOCBRETRY && !sync) {
-		mlog_bug_on_msg(!list_empty(&fcb->fc_lockres_item),
-				"Lockres %s, freeing flag callback in use\n",
-				lockres->l_name);
-		kfree(fcb);
-	}
-
 	mlog_exit(ret);
 	return ret;
 }
@@ -1024,14 +1037,30 @@
 				 struct ocfs2_lock_res *lockres,
 				 int level)
 {
+	unsigned long flags;
+
 	mlog_entry_void();
-	spin_lock(&lockres->l_lock);
+	spin_lock_irqsave(&lockres->l_lock, flags);
 	ocfs2_dec_holders(lockres, level);
 	ocfs2_vote_on_unlock(osb, lockres);
-	spin_unlock(&lockres->l_lock);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
 	mlog_exit_void();
 }
 
+static int ocfs2_create_new_inode_lock(struct inode *inode,
+				       struct ocfs2_lock_res *lockres)
+{
+	ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	unsigned long flags;
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
+	lockres_or_flags(lockres, OCFS2_LOCK_LOCAL);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	return ocfs2_lock_create(osb, lockres, LKM_EXMODE, LKM_LOCAL);
+}
+
 /* Grants us an EX lock on the data and metadata resources, skipping
  * the normal cluster directory lookup. Use this ONLY on newly created
  * inodes which other nodes can't possibly see, and which haven't been
@@ -1040,9 +1069,7 @@
  * with creating a new lock resource. */
 int ocfs2_create_new_inode_locks(struct inode *inode)
 {
-	int status;
-	ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct ocfs2_lock_res *lockres;
+	int ret;
 
 	BUG_ON(!inode);
 	BUG_ON(!ocfs2_inode_is_new(inode));
@@ -1059,41 +1086,78 @@
 	 * on a resource which has an invalid one -- we'll set it
 	 * valid when we release the EX. */
 
-	lockres = &OCFS2_I(inode)->ip_meta_lockres;
+	ret = ocfs2_create_new_inode_lock(inode,
+					  &OCFS2_I(inode)->ip_rw_lockres);
+	if (ret) {
+		mlog_errno(ret);
+		goto bail;
+	}
 
-	spin_lock(&lockres->l_lock);
-	BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
-	lockres_or_flags(lockres, OCFS2_LOCK_LOCAL);
-	spin_unlock(&lockres->l_lock);
+	ret = ocfs2_create_new_inode_lock(inode,
+					  &OCFS2_I(inode)->ip_meta_lockres);
+	if (ret) {
+		mlog_errno(ret);
+		goto bail;
+	}
 
-	status = ocfs2_lock_create(osb, lockres, LKM_EXMODE, LKM_LOCAL);
-	if (status < 0) {
-		mlog_errno(status);
+	ret = ocfs2_create_new_inode_lock(inode,
+					  &OCFS2_I(inode)->ip_data_lockres);
+	if (ret) {
+		mlog_errno(ret);
 		goto bail;
 	}
 
-	lockres = &OCFS2_I(inode)->ip_data_lockres;
+bail:
+	mlog_exit(ret);
+	return ret;
+}
 
-	spin_lock(&lockres->l_lock);
-	BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
-	lockres_or_flags(lockres, OCFS2_LOCK_LOCAL);
-	spin_unlock(&lockres->l_lock);
+int ocfs2_rw_lock(struct inode *inode, int write)
+{
+	int status, level;
+	struct ocfs2_lock_res *lockres;
 
-	status = ocfs2_lock_create(osb, lockres, LKM_EXMODE, LKM_LOCAL);
-	if (status < 0) {
+	BUG_ON(!inode);
+
+	mlog_entry_void();
+
+	mlog(0, "inode %"MLFu64" take %s RW lock\n",
+	     OCFS2_I(inode)->ip_blkno,
+	     write ? "EXMODE" : "PRMODE");
+
+	lockres = &OCFS2_I(inode)->ip_rw_lockres;
+
+	level = write ? LKM_EXMODE : LKM_PRMODE;
+
+	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0,
+				    0);
+	if (status < 0)
 		mlog_errno(status);
-		goto bail;
-	}
 
-	status = 0;
-bail:
 	mlog_exit(status);
 	return status;
 }
 
-int ocfs2_data_lock(struct inode *inode,
-		    int write)
+void ocfs2_rw_unlock(struct inode *inode, int write)
 {
+	int level = write ? LKM_EXMODE : LKM_PRMODE;
+	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres;
+
+	mlog_entry_void();
+
+	mlog(0, "inode %"MLFu64" drop %s RW lock\n",
+	     OCFS2_I(inode)->ip_blkno,
+	     write ? "EXMODE" : "PRMODE");
+
+	ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
+
+	mlog_exit_void();
+}
+
+int ocfs2_data_lock_full(struct inode *inode,
+			 int write,
+			 int arg_flags)
+{
 	int status = 0, level;
 	struct ocfs2_lock_res *lockres;
 
@@ -1119,8 +1183,8 @@
 
 	level = write ? LKM_EXMODE : LKM_PRMODE;
 
-	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0,
-				    NULL, 0);
+	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level,
+				    0, arg_flags);
 	if (status < 0)
 		mlog_errno(status);
 
@@ -1129,6 +1193,24 @@
 	return status;
 }
 
+/* see ocfs2_meta_lock_with_page() */
+int ocfs2_data_lock_with_page(struct inode *inode,
+			      int write,
+			      struct page *page)
+{
+	int ret;
+
+	ret = ocfs2_data_lock_full(inode, write, OCFS2_LOCK_NONBLOCK);
+	if (ret == -EAGAIN) {
+		unlock_page(page);
+		if (ocfs2_data_lock(inode, write) == 0)
+			ocfs2_data_unlock(inode, write);
+		ret = AOP_TRUNCATED_PAGE;
+	}
+
+	return ret;
+}
+
 static void ocfs2_vote_on_unlock(ocfs2_super *osb,
 				 struct ocfs2_lock_res *lockres)
 {
@@ -1208,12 +1290,8 @@
 
 	lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
 
-	/* Setting this to zero will ensure that old versions of the
-	 * LVB code don't trust our information. */
-	lvb->lvb_old_seq   = cpu_to_be32(0);
 	lvb->lvb_version   = cpu_to_be32(OCFS2_LVB_VERSION);
-
-	lvb->lvb_isize     = cpu_to_be64(i_size_read(inode));
+	lvb->lvb_isize	   = cpu_to_be64(i_size_read(inode));
 	lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters);
 	lvb->lvb_iuid      = cpu_to_be32(inode->i_uid);
 	lvb->lvb_igid      = cpu_to_be32(inode->i_gid);
@@ -1281,12 +1359,7 @@
 {
 	struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
 
-	/* Old OCFS2 versions stored a "sequence" in the lvb to
-	 * determine whether the information could be trusted. We
-	 * don't want to use an lvb populated from a node running the
-	 * old code, so check that sequence is not set. */
-	if (!lvb->lvb_old_seq &&
-	    be32_to_cpu(lvb->lvb_version) == OCFS2_LVB_VERSION)
+	if (be32_to_cpu(lvb->lvb_version) == OCFS2_LVB_VERSION)
 		return 1;
 	return 0;
 }
@@ -1300,19 +1373,20 @@
  *   ocfs2_complete_lock_res_refresh afterwards. */
 static int ocfs2_should_refresh_lock_res(struct ocfs2_lock_res *lockres)
 {
+	unsigned long flags;
+	int status = 0;
 
-	int status = 0;
 	mlog_entry_void();
 
 refresh_check:
-	spin_lock(&lockres->l_lock);
+	spin_lock_irqsave(&lockres->l_lock, flags);
 	if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
-		spin_unlock(&lockres->l_lock);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
 		goto bail;
 	}
 
 	if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
-		spin_unlock(&lockres->l_lock);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
 
 		ocfs2_wait_on_refreshing_lock(lockres);
 		goto refresh_check;
@@ -1320,7 +1394,7 @@
 
 	/* Ok, I'll be the one to refresh this lock. */
 	lockres_or_flags(lockres, OCFS2_LOCK_REFRESHING);
-	spin_unlock(&lockres->l_lock);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
 
 	status = 1;
 bail:
@@ -1333,13 +1407,14 @@
 static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockres,
 						   int status)
 {
+	unsigned long flags;
 	mlog_entry_void();
 
-	spin_lock(&lockres->l_lock);
+	spin_lock_irqsave(&lockres->l_lock, flags);
 	lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING);
 	if (!status)
 		lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
-	spin_unlock(&lockres->l_lock);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
 
 	wake_up(&lockres->l_event);
 
@@ -1482,9 +1557,7 @@
 			 ocfs2_journal_handle *handle,
 			 struct buffer_head **ret_bh,
 			 int ex,
-			 int flags,
-			 ocfs2_lock_callback cb,
-			 unsigned long cb_data)
+			 int arg_flags)
 {
 	int status, level, dlm_flags, acquired;
 	struct ocfs2_lock_res *lockres;
@@ -1509,7 +1582,7 @@
 		goto bail;
 	}
 
-	if (!(flags & OCFS2_META_LOCK_RECOVERY))
+	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
 		wait_event(osb->recovery_event,
 			   ocfs2_node_map_is_empty(osb, &osb->recovery_map));
 
@@ -1517,11 +1590,10 @@
 	lockres = &OCFS2_I(inode)->ip_meta_lockres;
 	level = ex ? LKM_EXMODE : LKM_PRMODE;
 	dlm_flags = 0;
-	if (flags & OCFS2_META_LOCK_NOQUEUE)
+	if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
 		dlm_flags |= LKM_NOQUEUE;
 
-	status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, cb,
-				    cb_data);
+	status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags);
 	if (status < 0) {
 		if (status != -EAGAIN && status != -EIOCBRETRY)
 			mlog_errno(status);
@@ -1535,7 +1607,7 @@
 	 * the lower dlm layers. The second time though, we've
 	 * committed to owning this lock so we don't allow signals to
 	 * abort the operation. */
-	if (!(flags & OCFS2_META_LOCK_RECOVERY))
+	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
 		wait_event(osb->recovery_event,
 			   ocfs2_node_map_is_empty(osb, &osb->recovery_map));
 
@@ -1582,6 +1654,47 @@
 	return status;
 }
 
+/*
+ * This is working around a lock inversion between tasks acquiring DLM locks
+ * while holding a page lock and the vote thread which blocks dlm lock acquiry
+ * while acquiring page locks.
+ *
+ * ** These _with_page variants are only intended to be called from aop
+ * methods that hold page locks and return a very specific *positive* error
+ * code that aop methods pass up to the VFS -- test for errors with != 0. **
+ *
+ * The DLM is called such that it returns -EAGAIN if it would have blocked
+ * waiting for the vote thread.  In that case we unlock our page so the vote
+ * thread can make progress.  Once we've done this we have to return
+ * AOP_TRUNCATED_PAGE so the aop method that called us can bubble that back up
+ * into the VFS, which will then immediately retry the aop call.
+ *
+ * We do a blocking lock and immediate unlock before returning, though, so that
+ * the lock has a great chance of being cached on this node by the time the VFS
+ * calls back to retry the aop. This has a potential to livelock as nodes
+ * ping locks back and forth, but that's a risk we're willing to take to avoid
+ * the lock inversion simply.
+ */
+int ocfs2_meta_lock_with_page(struct inode *inode,
+			      ocfs2_journal_handle *handle,
+			      struct buffer_head **ret_bh,
+			      int ex,
+			      struct page *page)
+{
+	int ret;
+
+	ret = ocfs2_meta_lock_full(inode, handle, ret_bh, ex,
+				   OCFS2_LOCK_NONBLOCK);
+	if (ret == -EAGAIN) {
+		unlock_page(page);
+		if (ocfs2_meta_lock(inode, handle, ret_bh, ex) == 0)
+			ocfs2_meta_unlock(inode, ex);
+		ret = AOP_TRUNCATED_PAGE;
+	}
+
+	return ret;
+}
+
 void ocfs2_meta_unlock(struct inode *inode,
 		       int ex)
 {
@@ -1614,7 +1727,7 @@
 	if (ocfs2_is_hard_readonly(osb))
 		return -EROFS;
 
-	status = ocfs2_cluster_lock(osb, lockres, level, 0, NULL, 0);
+	status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1663,7 +1776,7 @@
 	if (ocfs2_is_hard_readonly(osb))
 		return -EROFS;
 
-	status = ocfs2_cluster_lock(osb, lockres, LKM_EXMODE, 0, NULL, 0);
+	status = ocfs2_cluster_lock(osb, lockres, LKM_EXMODE, 0, 0);
 	if (status < 0)
 		mlog_errno(status);
 
@@ -1746,17 +1859,18 @@
 static void ocfs2_unlock_ast_func(void *opaque, enum dlm_status status)
 {
 	struct ocfs2_lock_res *lockres = opaque;
+	unsigned long flags;
 
 	mlog_entry_void();
 
 	mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name,
 	     lockres->l_unlock_action);
 
-	spin_lock(&lockres->l_lock);
+	spin_lock_irqsave(&lockres->l_lock, flags);
 	/* We tried to cancel a convert request, but it was already
 	 * granted. All we want to do here is clear our unlock
 	 * state. The wake_up call done at the bottom is redundant
-	 * (__ocfs2_cancel_convert doesn't sleep on this) but doesn't
+	 * (ocfs2_prepare_cancel_convert doesn't sleep on this) but doesn't
 	 * hurt anything anyway */
 	if (status == DLM_CANCELGRANT &&
 	    lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
@@ -1772,7 +1886,7 @@
 		mlog(ML_ERROR, "Dlm passes status %d for lock %s, "
 		     "unlock_action %d\n", status, lockres->l_name,
 		     lockres->l_unlock_action);
-		spin_unlock(&lockres->l_lock);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
 		return;
 	}
 
@@ -1791,65 +1905,13 @@
 	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
 complete_unlock:
 	lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
-	spin_unlock(&lockres->l_lock);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
 
 	wake_up(&lockres->l_event);
 
 	mlog_exit_void();
 }
 
-/* BEWARE: called with lockres lock, and always drops it. Caller
- * should not be calling us with a busy lock... */
-static int __ocfs2_drop_lock(ocfs2_super *osb,
-			     struct ocfs2_lock_res *lockres)
-{
-	int ret = 0;
-	enum dlm_status status;
-
-	if (lockres->l_flags & OCFS2_LOCK_BUSY)
-		mlog(ML_ERROR, "destroying busy lock: \"%s\"\n",
-		     lockres->l_name);
-	if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
-		mlog(0, "destroying blocked lock: \"%s\"\n", lockres->l_name);
-
-	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
-		spin_unlock(&lockres->l_lock);
-		goto bail;
-	}
-
-	lockres_clear_flags(lockres, OCFS2_LOCK_ATTACHED);
-
-	/* make sure we never get here while waiting for an ast to
-	 * fire. */
-	BUG_ON(lockres->l_action != OCFS2_AST_INVALID);
-
-	/* is this necessary? */
-	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
-	lockres->l_unlock_action = OCFS2_UNLOCK_DROP_LOCK;
-	spin_unlock(&lockres->l_lock);
-
-	mlog(0, "lock %s\n", lockres->l_name);
-
-	status = dlmunlock(osb->dlm,
-			   &lockres->l_lksb,
-			   LKM_VALBLK,
-			   lockres->l_ops->unlock_ast,
-			   lockres);
-	if (status != DLM_NORMAL) {
-		ocfs2_log_dlm_error("dlmunlock", status, lockres);
-		mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
-		dlm_print_one_lock(lockres->l_lksb.lockid);
-		BUG();
-	}
-	mlog(0, "lock %s, successfull return from dlmunlock\n",
-	     lockres->l_name);
-
-	ocfs2_wait_on_busy_lock(lockres);
-bail:
-	mlog_exit(ret);
-	return ret;
-}
-
 typedef void (ocfs2_pre_drop_cb_t)(struct ocfs2_lock_res *, void *);
 
 struct drop_lock_cb {
@@ -1861,11 +1923,14 @@
 			   struct ocfs2_lock_res *lockres,
 			   struct drop_lock_cb *dcb)
 {
+	enum dlm_status status;
+	unsigned long flags;
+
 	/* We didn't get anywhere near actually using this lockres. */
 	if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED))
-		return 0;
+		goto out;
 
-	spin_lock(&lockres->l_lock);
+	spin_lock_irqsave(&lockres->l_lock, flags);
 
 	mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_FREEING),
 			"lockres %s, flags 0x%lx\n",
@@ -1877,22 +1942,58 @@
 		     lockres->l_name, lockres->l_flags, lockres->l_action,
 		     lockres->l_unlock_action);
 
-		spin_unlock(&lockres->l_lock);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
 
 		/* XXX: Today we just wait on any busy
 		 * locks... Perhaps we need to cancel converts in the
 		 * future? */
 		ocfs2_wait_on_busy_lock(lockres);
 
-		spin_lock(&lockres->l_lock);
+		spin_lock_irqsave(&lockres->l_lock, flags);
 	}
 
 	if (dcb)
 		dcb->drop_func(lockres, dcb->drop_data);
 
-	/* This will drop the spinlock for us. Dur de dur, at least we
-	 * keep the ugliness in one place :) */
-	return  __ocfs2_drop_lock(osb, lockres);
+	if (lockres->l_flags & OCFS2_LOCK_BUSY)
+		mlog(ML_ERROR, "destroying busy lock: \"%s\"\n",
+		     lockres->l_name);
+	if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
+		mlog(0, "destroying blocked lock: \"%s\"\n", lockres->l_name);
+
+	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		goto out;
+	}
+
+	lockres_clear_flags(lockres, OCFS2_LOCK_ATTACHED);
+
+	/* make sure we never get here while waiting for an ast to
+	 * fire. */
+	BUG_ON(lockres->l_action != OCFS2_AST_INVALID);
+
+	/* is this necessary? */
+	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+	lockres->l_unlock_action = OCFS2_UNLOCK_DROP_LOCK;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	mlog(0, "lock %s\n", lockres->l_name);
+
+	status = dlmunlock(osb->dlm, &lockres->l_lksb, LKM_VALBLK,
+			   lockres->l_ops->unlock_ast, lockres);
+	if (status != DLM_NORMAL) {
+		ocfs2_log_dlm_error("dlmunlock", status, lockres);
+		mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
+		dlm_print_one_lock(lockres->l_lksb.lockid);
+		BUG();
+	}
+	mlog(0, "lock %s, successfull return from dlmunlock\n",
+	     lockres->l_name);
+
+	ocfs2_wait_on_busy_lock(lockres);
+out:
+	mlog_exit(0);
+	return 0;
 }
 
 /* Mark the lockres as being dropped. It will no longer be
@@ -1904,26 +2005,26 @@
 void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres)
 {
 	int status;
-	struct ocfs2_status_completion sc;
-	struct ocfs2_lockres_flag_callback fcb;
+	struct ocfs2_mask_waiter mw;
+	unsigned long flags;
 
-	ocfs2_init_completion_fcb(&fcb, &sc);
+	ocfs2_init_mask_waiter(&mw);
 
-	spin_lock(&lockres->l_lock);
+	spin_lock_irqsave(&lockres->l_lock, flags);
 	lockres->l_flags |= OCFS2_LOCK_FREEING;
 	while (lockres->l_flags & OCFS2_LOCK_QUEUED) {
-		lockres_add_flag_callback(lockres, &fcb, OCFS2_LOCK_QUEUED, 0);
-		spin_unlock(&lockres->l_lock);
+		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
 
 		mlog(0, "Waiting on lockres %s\n", lockres->l_name);
 
-		status = ocfs2_wait_for_status_completion(&sc);
+		status = ocfs2_wait_for_mask(&mw);
 		if (status)
 			mlog_errno(status);
 
-		spin_lock(&lockres->l_lock);
+		spin_lock_irqsave(&lockres->l_lock, flags);
 	}
-	spin_unlock(&lockres->l_lock);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
 }
 
 static void ocfs2_drop_osb_locks(ocfs2_super *osb)
@@ -1985,21 +2086,23 @@
 	if (err < 0 && !status)
 		status = err;
 
+	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
+			      &OCFS2_I(inode)->ip_rw_lockres,
+			      NULL);
+	if (err < 0)
+		mlog_errno(err);
+	if (err < 0 && !status)
+		status = err;
+
 	mlog_exit(status);
 	return status;
 }
 
-/* called with the spinlock held, and WILL drop it. */
-static int __ocfs2_downconvert_lock(ocfs2_super *osb,
-				    struct ocfs2_lock_res *lockres,
-				    int new_level,
-				    int lvb)
+static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
+				      int new_level)
 {
-	int ret, flags = LKM_CONVERT;
-	enum dlm_status status;
+	assert_spin_locked(&lockres->l_lock);
 
-	mlog_entry_void();
-
 	BUG_ON(lockres->l_blocking <= LKM_NLMODE);
 
 	if (lockres->l_level <= new_level) {
@@ -2008,21 +2111,31 @@
 		BUG();
 	}
 
-	mlog(0, "lock %s, new_level = %d, l_blocking = %d, lvb = %d\n",
-	     lockres->l_name, new_level, lockres->l_blocking, lvb);
+	mlog(0, "lock %s, new_level = %d, l_blocking = %d\n",
+	     lockres->l_name, new_level, lockres->l_blocking);
 
 	lockres->l_action = OCFS2_AST_DOWNCONVERT;
 	lockres->l_requested = new_level;
 	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
-	spin_unlock(&lockres->l_lock);
+}
 
+static int ocfs2_downconvert_lock(ocfs2_super *osb,
+				  struct ocfs2_lock_res *lockres,
+				  int new_level,
+				  int lvb)
+{
+	int ret, dlm_flags = LKM_CONVERT;
+	enum dlm_status status;
+
+	mlog_entry_void();
+
 	if (lvb)
-		flags |= LKM_VALBLK;
+		dlm_flags |= LKM_VALBLK;
 
 	status = dlmlock(osb->dlm,
 			 new_level,
 			 &lockres->l_lksb,
-			 flags,
+			 dlm_flags,
 			 lockres->l_name,
 			 lockres->l_ops->ast,
 			 lockres,
@@ -2040,17 +2153,24 @@
 	return ret;
 }
 
-/* called with the spinlock held, and WILL drop it. */
-static int __ocfs2_cancel_convert(ocfs2_super *osb,
-				  struct ocfs2_lock_res *lockres)
+/* returns 1 when the caller should unlock and call dlmunlock */
+static int ocfs2_prepare_cancel_convert(ocfs2_super *osb,
+				        struct ocfs2_lock_res *lockres)
 {
-	int ret;
-	enum dlm_status status;
+	assert_spin_locked(&lockres->l_lock);
 
 	mlog_entry_void();
-
 	mlog(0, "lock %s\n", lockres->l_name);
 
+	if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
+		/* If we're already trying to cancel a lock conversion
+		 * then just drop the spinlock and allow the caller to
+		 * requeue this lock. */
+
+		mlog(0, "Lockres %s, skip convert\n", lockres->l_name);
+		return 0;
+	}
+
 	/* were we in a convert when we got the bast fire? */
 	BUG_ON(lockres->l_action != OCFS2_AST_CONVERT &&
 	       lockres->l_action != OCFS2_AST_DOWNCONVERT);
@@ -2061,8 +2181,19 @@
 	mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_BUSY),
 			"lock %s, invalid flags: 0x%lx\n",
 			lockres->l_name, lockres->l_flags);
-	spin_unlock(&lockres->l_lock);
 
+	return 1;
+}
+
+static int ocfs2_cancel_convert(ocfs2_super *osb,
+				struct ocfs2_lock_res *lockres)
+{
+	int ret;
+	enum dlm_status status;
+
+	mlog_entry_void();
+	mlog(0, "lock %s\n", lockres->l_name);
+
 	ret = 0;
 	status = dlmunlock(osb->dlm,
 			   &lockres->l_lksb,
@@ -2081,25 +2212,6 @@
 	return ret;
 }
 
-static int ocfs2_cancel_convert(ocfs2_super *osb,
-				struct ocfs2_lock_res *lockres)
-{
-	assert_spin_locked(&lockres->l_lock);
-
-	if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
-		/* If we're already trying to cancel a lock conversion
-		 * then just drop the spinlock and allow the caller to
-		 * requeue this lock. */
-		spin_unlock(&lockres->l_lock);
-
-		mlog(0, "Lockres %s, skip convert\n", lockres->l_name);
-		return 0;
-	}
-
-	/* this will drop the spinlock for us. */
-	return __ocfs2_cancel_convert(osb, lockres);
-}
-
 static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode,
 						  struct ocfs2_lock_res *lockres,
 						  int new_level)
@@ -2132,11 +2244,13 @@
 	int set_lvb = 0;
 	int ret = 0;
 	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;
+	unsigned long flags;
+
 	ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	mlog_entry_void();
 
-	spin_lock(&lockres->l_lock);
+	spin_lock_irqsave(&lockres->l_lock, flags);
 
 	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
 
@@ -2148,9 +2262,13 @@
 
 	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
 		*requeue = 1;
-		ret = ocfs2_cancel_convert(osb, lockres);
-		if (ret < 0)
-			mlog_errno(ret);
+		ret = ocfs2_prepare_cancel_convert(osb, lockres);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		if (ret) {
+			ret = ocfs2_cancel_convert(osb, lockres);
+			if (ret < 0)
+				mlog_errno(ret);
+		}
 		goto leave;
 	}
 
@@ -2174,19 +2292,20 @@
 			mlog(0, "lockres %s: downconverting stale lock!\n",
 			     lockres->l_name);
 
-		mlog(0, "calling __ocfs2_downconvert_lock with "
-		     "l_level=%d, l_blocking=%d, new_level=%d\n",
-		     lockres->l_level, lockres->l_blocking,
-		     new_level);
-		ret = __ocfs2_downconvert_lock(osb, lockres, new_level,
-					       set_lvb);
+		mlog(0, "calling ocfs2_downconvert_lock with l_level=%d, "
+		     "l_blocking=%d, new_level=%d\n",
+		     lockres->l_level, lockres->l_blocking, new_level);
+
+		ocfs2_prepare_downconvert(lockres, new_level);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb);
 		goto leave;
 	}
 	if (!ocfs2_inode_fully_checkpointed(inode))
 		ocfs2_start_checkpoint(osb);
 
 	*requeue = 1;
-	spin_unlock(&lockres->l_lock);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
 	ret = 0;
 leave:
 	mlog_exit(ret);
@@ -2198,22 +2317,27 @@
 				      int *requeue,
 				      ocfs2_convert_worker_t *worker)
 {
+	unsigned long flags;
 	int blocking;
 	int new_level;
 	int ret = 0;
 
 	mlog_entry_void();
 
-	spin_lock(&lockres->l_lock);
+	spin_lock_irqsave(&lockres->l_lock, flags);
 
 	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
 
 recheck:
 	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
 		*requeue = 1;
-		ret = ocfs2_cancel_convert(osb, lockres);
-		if (ret < 0)
-			mlog_errno(ret);
+		ret = ocfs2_prepare_cancel_convert(osb, lockres);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		if (ret) {
+			ret = ocfs2_cancel_convert(osb, lockres);
+			if (ret < 0)
+				mlog_errno(ret);
+		}
 		goto leave;
 	}
 
@@ -2221,7 +2345,7 @@
 	 * then requeue. */
 	if ((lockres->l_blocking == LKM_EXMODE)
 	    && (lockres->l_ex_holders || lockres->l_ro_holders)) {
-		spin_unlock(&lockres->l_lock);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
 		*requeue = 1;
 		ret = 0;
 		goto leave;
@@ -2231,7 +2355,7 @@
 	 * requeue if we've got any EX holders */
 	if (lockres->l_blocking == LKM_PRMODE &&
 	    lockres->l_ex_holders) {
-		spin_unlock(&lockres->l_lock);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
 		*requeue = 1;
 		ret = 0;
 		goto leave;
@@ -2248,11 +2372,11 @@
 	 * may sleep, so we save off a copy of what we're blocking as
 	 * it may change while we're not holding the spin lock. */
 	blocking = lockres->l_blocking;
-	spin_unlock(&lockres->l_lock);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
 
 	worker(lockres, blocking);
 
-	spin_lock(&lockres->l_lock);
+	spin_lock_irqsave(&lockres->l_lock, flags);
 	if (blocking != lockres->l_blocking) {
 		/* If this changed underneath us, then we can't drop
 		 * it just yet. */
@@ -2263,7 +2387,9 @@
 	*requeue = 0;
 	new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
 
-	ret = __ocfs2_downconvert_lock(osb, lockres, new_level, 0);
+	ocfs2_prepare_downconvert(lockres, new_level);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+	ret = ocfs2_downconvert_lock(osb, lockres, new_level, 0);
 leave:
 	mlog_exit(ret);
 	return ret;
@@ -2328,6 +2454,30 @@
 	return status;
 }
 
+static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres,
+				    int *requeue)
+{
+	int status;
+	struct inode *inode;
+
+	mlog_entry_void();
+
+	mlog(0, "Unblock lockres %s\n", lockres->l_name);
+
+	inode  = ocfs2_lock_res_inode(lockres);
+
+	status = ocfs2_generic_unblock_lock(OCFS2_SB(inode->i_sb),
+					    lockres,
+					    requeue,
+					    NULL);
+	if (status < 0)
+		mlog_errno(status);
+
+	mlog_exit(status);
+	return status;
+}
+
+
 int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres,
 		       int *requeue)
 {
@@ -2381,6 +2531,7 @@
 {
 	int status;
 	int requeue = 0;
+	unsigned long flags;
 
 	/* Our reference to the lockres in this function can be
 	 * considered valid until we remove the OCFS2_LOCK_QUEUED
@@ -2399,16 +2550,16 @@
 	 * still be marked with OCFS2_LOCK_FREEING after this check,
 	 * but short circuiting here will still save us some
 	 * performance. */
-	spin_lock(&lockres->l_lock);
+	spin_lock_irqsave(&lockres->l_lock, flags);
 	if (lockres->l_flags & OCFS2_LOCK_FREEING)
 		goto unqueue;
-	spin_unlock(&lockres->l_lock);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
 
 	status = lockres->l_ops->unblock(lockres, &requeue);
 	if (status < 0)
 		mlog_errno(status);
 
-	spin_lock(&lockres->l_lock);
+	spin_lock_irqsave(&lockres->l_lock, flags);
 unqueue:
 	if (lockres->l_flags & OCFS2_LOCK_FREEING || !requeue) {
 		lockres_clear_flags(lockres, OCFS2_LOCK_QUEUED);
@@ -2417,7 +2568,7 @@
 
 	mlog(0, "lockres %s, requeue = %s.\n", lockres->l_name,
 	     requeue ? "yes" : "no");
-	spin_unlock(&lockres->l_lock);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
 
 	mlog_exit_void();
 }
@@ -2461,9 +2612,8 @@
 
 	mlog(level, "LVB information for %s (called from %s:%u):\n",
 	     lockres->l_name, function, line);
-	mlog(level, "old_seq: %u, version: %u, clusters: %u\n",
-	     be32_to_cpu(lvb->lvb_old_seq), be32_to_cpu(lvb->lvb_version),
-	     be32_to_cpu(lvb->lvb_iclusters));
+	mlog(level, "version: %u, clusters: %u\n",
+	     be32_to_cpu(lvb->lvb_version), be32_to_cpu(lvb->lvb_iclusters));
 	mlog(level, "size: %"MLFu64", uid %u, gid %u, mode 0x%x\n",
 	     be64_to_cpu(lvb->lvb_isize), be32_to_cpu(lvb->lvb_iuid),
 	     be32_to_cpu(lvb->lvb_igid), be16_to_cpu(lvb->lvb_imode));

Modified: trunk/fs/ocfs2/dlmglue.h
===================================================================
--- trunk/fs/ocfs2/dlmglue.h	2005-11-03 22:39:00 UTC (rev 2676)
+++ trunk/fs/ocfs2/dlmglue.h	2005-11-05 00:27:07 UTC (rev 2677)
@@ -27,23 +27,30 @@
 #ifndef DLMGLUE_H
 #define DLMGLUE_H
 
-#define OCFS2_LVB_VERSION 1
+#define OCFS2_LVB_VERSION 2
 
 struct ocfs2_meta_lvb {
-	__be32       lvb_old_seq;
 	__be32       lvb_version;
 	__be32       lvb_iclusters;
 	__be32       lvb_iuid;
 	__be32       lvb_igid;
-	__be16       lvb_imode;
-	__be16       lvb_inlink;
 	__be64       lvb_iatime_packed;
 	__be64       lvb_ictime_packed;
 	__be64       lvb_imtime_packed;
 	__be64       lvb_isize;
-	__be32       lvb_reserved[2];
+	__be16       lvb_imode;
+	__be16       lvb_inlink;
+	__be32       lvb_reserved[3];
 };
 
+/* ocfs2_meta_lock_full() and ocfs2_data_lock_full() 'arg_flags' flags */
+/* don't wait on recovery. */
+#define OCFS2_META_LOCK_RECOVERY	(0x01)
+/* Instruct the dlm not to queue ourselves on the other node. */
+#define OCFS2_META_LOCK_NOQUEUE		(0x02)
+/* don't block waiting for the vote thread, instead return -EAGAIN */
+#define OCFS2_LOCK_NONBLOCK		(0x04)
+
 int ocfs2_dlm_init(ocfs2_super *osb);
 void ocfs2_dlm_shutdown(ocfs2_super *osb);
 void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res);
@@ -53,24 +60,30 @@
 void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
 int ocfs2_create_new_inode_locks(struct inode *inode);
 int ocfs2_drop_inode_locks(struct inode *inode);
-int ocfs2_data_lock(struct inode *inode,
-		    int write);
+int ocfs2_data_lock_full(struct inode *inode,
+			 int write,
+			 int arg_flags);
+#define ocfs2_data_lock(inode, write) ocfs2_data_lock_full(inode, write, 0)
+int ocfs2_data_lock_with_page(struct inode *inode,
+			      int write,
+			      struct page *page);
 void ocfs2_data_unlock(struct inode *inode,
 		       int write);
-/* don't wait on recovery. */
-#define OCFS2_META_LOCK_RECOVERY	(0x01)
-/* Instruct the dlm not to queue ourselves on the other node. */
-#define OCFS2_META_LOCK_NOQUEUE		(0x02)
+int ocfs2_rw_lock(struct inode *inode, int write);
+void ocfs2_rw_unlock(struct inode *inode, int write);
 int ocfs2_meta_lock_full(struct inode *inode,
 			 ocfs2_journal_handle *handle,
 			 struct buffer_head **ret_bh,
 			 int ex,
-			 int flags,
-			 ocfs2_lock_callback cb,
-			 unsigned long cb_data);
+			 int arg_flags);
+int ocfs2_meta_lock_with_page(struct inode *inode,
+			      ocfs2_journal_handle *handle,
+			      struct buffer_head **ret_bh,
+			      int ex,
+			      struct page *page);
 /* 99% of the time we don't want to supply any additional flags --
  * those are for very specific cases only. */
-#define ocfs2_meta_lock(i, h, b, e) ocfs2_meta_lock_full(i, h, b, e, 0, NULL, 0)
+#define ocfs2_meta_lock(i, h, b, e) ocfs2_meta_lock_full(i, h, b, e, 0)
 void ocfs2_meta_unlock(struct inode *inode,
 		       int ex);
 int ocfs2_super_lock(ocfs2_super *osb,

Modified: trunk/fs/ocfs2/file.c
===================================================================
--- trunk/fs/ocfs2/file.c	2005-11-03 22:39:00 UTC (rev 2676)
+++ trunk/fs/ocfs2/file.c	2005-11-05 00:27:07 UTC (rev 2677)
@@ -35,8 +35,8 @@
 
 #include "ocfs2.h"
 
-#include "aio.h"
 #include "alloc.h"
+#include "aops.h"
 #include "dir.h"
 #include "dlmglue.h"
 #include "extent_map.h"
@@ -50,12 +50,6 @@
 
 #include "buffer_head_io.h"
 
-static int ocfs2_zero_extend(struct inode *inode);
-static int ocfs2_orphan_for_truncate(ocfs2_super *osb,
-				     struct inode *inode,
-				     struct buffer_head *fe_bh,
-				     u64 new_i_size);
-
 static int ocfs2_sync_inode(struct inode *inode)
 {
 	filemap_fdatawrite(inode->i_mapping);
@@ -148,286 +142,56 @@
 	return (err < 0) ? -EIO : 0;
 }
 
-static void ocfs2_update_inode_size(struct inode *inode,
-				    u64 new_size)
+int ocfs2_set_inode_size(ocfs2_journal_handle *handle,
+			 struct inode *inode,
+			 struct buffer_head *fe_bh,
+			 u64 new_i_size)
 {
-	i_size_write(inode, new_size);
-	inode->i_blocks = ocfs2_align_bytes_to_sectors(new_size);
-}
-
-void ocfs2_file_finish_extension(struct inode *inode,
-				 loff_t newsize,
-				 unsigned direct_extend)
-{
 	int status;
 
-	mlog(0, "inode %"MLFu64", newsize = %lld, direct_extend = %u\n",
-	     OCFS2_I(inode)->ip_blkno, (long long)newsize, direct_extend);
+	mlog_entry_void();
 
-	ocfs2_update_inode_size(inode, newsize);
+	i_size_write(inode, new_i_size);
+	inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 
-#ifdef OCFS2_ORACORE_WORKAROUNDS
-	if (direct_extend) {
-		/*
-		 * This leaves dirty data in holes.
-		 * Caveat Emptor.
-		 */
-		OCFS2_I(inode)->ip_mmu_private = newsize;
-		return;
-	}
-#endif
-
-	status = ocfs2_zero_extend(inode);
-	/*
-	 * Don't overwrite the result of
-	 * generic_file_write
-	 */
-	if (status)
-		mlog(ML_ERROR, "Unable to pre-zero extension of inode "
-		     "(%d)\n", status);
-}
-
-static ssize_t ocfs2_file_write(struct file *filp,
-				const char __user *buf,
-				size_t count,
-				loff_t *ppos)
-{
-	struct iovec local_iov = { .iov_base = (void __user *)buf,
-				   .iov_len = count };
-	int ret = 0;
-	ocfs2_super *osb = NULL;
-	struct dentry *dentry = filp->f_dentry;
-	struct inode *inode = dentry->d_inode;
-	struct ocfs2_write_lock_info info = {0, };
-	DECLARE_BUFFER_LOCK_CTXT(ctxt);
-
-	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
-		   (unsigned int)count,
-		   filp->f_dentry->d_name.len,
-		   filp->f_dentry->d_name.name);
-
-	/* happy write of zero bytes */
-	if (count == 0) {
-		ret = 0;
+	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
 		goto bail;
 	}
 
-	if (!inode) {
-		mlog(0, "bad inode\n");
-		ret = -EIO;
-		goto bail;
-	}
-
-	osb = OCFS2_SB(inode->i_sb);
-
-	ret = ocfs2_write_lock_maybe_extend(filp, buf, count, ppos, &info,
-					    &ctxt);
-	if (ret)
-		goto bail;
-
-	down_read(&OCFS2_I(inode)->ip_alloc_sem);
-
-#ifdef OCFS2_ORACORE_WORKAROUNDS
-	if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
-		unsigned int saved_flags = filp->f_flags;
-
-		if (info.wl_do_direct_io)
-			filp->f_flags |= O_DIRECT;
-		else
-			filp->f_flags &= ~O_DIRECT;
-
-		ret = generic_file_write_nolock(filp, &local_iov, 1, ppos);
-
-		filp->f_flags = saved_flags;
-	} else
-#endif
-		ret = generic_file_write_nolock(filp, &local_iov, 1, ppos);
-
-	up_read(&OCFS2_I(inode)->ip_alloc_sem);
-
 bail:
-	/* we might have to finish up extentions that were performed before
-	 * an error was returned by, say, data locking */
-	if (info.wl_extended)
-		ocfs2_file_finish_extension(inode, info.wl_newsize,
-					    info.wl_do_direct_io);
-	if (info.wl_unlock_ctxt)
-		ocfs2_unlock_buffer_inodes(&ctxt);
-	if (info.wl_have_i_sem)
-		up(&inode->i_sem);
-	mlog_exit(ret);
-
-	return ret;
+	mlog_exit(status);
+	return status;
 }
 
-static ssize_t ocfs2_file_read(struct file *filp,
-			       char __user *buf,
-			       size_t count,
-			       loff_t *ppos)
+static int ocfs2_simple_size_update(struct inode *inode,
+				    struct buffer_head *di_bh,
+				    u64 new_i_size)
 {
-	int ret = 0;
-	ocfs2_super *osb = NULL;
-	struct dentry *dentry = filp->f_dentry;
-	struct inode *inode = dentry->d_inode;
-	struct ocfs2_backing_inode *target_binode;
-	DECLARE_BUFFER_LOCK_CTXT(ctxt);
-
-	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
-		   (unsigned int)count,
-		   filp->f_dentry->d_name.len,
-		   filp->f_dentry->d_name.name);
-
-	if (!inode) {
-		ret = -EINVAL;
-		mlog_errno(ret);
-		goto bail;
-	}
-
-	osb = OCFS2_SB(inode->i_sb);
-
-#ifdef OCFS2_ORACORE_WORKAROUNDS
-	if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
-		if (filp->f_flags & O_DIRECT) {
-			int sector_size = 1 << osb->s_sectsize_bits;
-
-			if (((*ppos) & (sector_size - 1)) ||
-			    (count & (sector_size - 1)) ||
-			    ((unsigned long)buf & (sector_size - 1)) ||
-			    (i_size_read(inode) & (sector_size -1))) {
-				filp->f_flags &= ~O_DIRECT;
-			}
-		}
-	}
-#endif
-
-	ret = ocfs2_setup_io_locks(inode->i_sb, inode, buf, count, &ctxt,
-				   &target_binode);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto bail;
-	}
-
-	target_binode->ba_lock_data = (filp->f_flags & O_DIRECT) ? 0 : 1;
-
-	ret = ocfs2_lock_buffer_inodes(&ctxt, NULL);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto bail_unlock;
-	}
-
-	down_read(&OCFS2_I(inode)->ip_alloc_sem);
-
-	ret = generic_file_read(filp, buf, count, ppos);
-
-	up_read(&OCFS2_I(inode)->ip_alloc_sem);
-
-	if (ret == -EINVAL)
-		mlog(ML_ERROR, "Generic_file_read returned -EINVAL\n");
-
-bail_unlock:
-	ocfs2_unlock_buffer_inodes(&ctxt);
-
-bail:
-	mlog_exit(ret);
-
-	return ret;
-}
-
-static ssize_t ocfs2_file_sendfile(struct file *in_file,
-				   loff_t *ppos,
-				   size_t count,
-				   read_actor_t actor,
-				   void *target)
-{
 	int ret;
-	struct inode *inode = in_file->f_mapping->host;
+	ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	ocfs2_journal_handle *handle = NULL;
 
-	mlog_entry("inode %"MLFu64", ppos %lld, count = %u\n",
-		   OCFS2_I(inode)->ip_blkno, (long long) *ppos,
-		   (unsigned int) count);
-
-	/* Obviously, there is no user buffer to worry about here --
-	 * this simplifies locking, so no need to walk vmas a la
-	 * read/write. We take a simple set of cluster locks against
-	 * the inode and call generic_file_sendfile. */
-	ret = ocfs2_meta_lock(inode, NULL, NULL, 0);
-	if (ret < 0) {
+	handle = ocfs2_start_trans(osb, NULL,
+				   OCFS2_INODE_UPDATE_CREDITS);
+	if (handle == NULL) {
+		ret = -ENOMEM;
 		mlog_errno(ret);
-		goto bail;
+		goto out;
 	}
 
-	ret = ocfs2_data_lock(inode, 0);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto bail_unlock_meta;
-	}
-
-	down_read(&OCFS2_I(inode)->ip_alloc_sem);
-
-	ret = generic_file_sendfile(in_file, ppos, count, actor, target);
+	ret = ocfs2_set_inode_size(handle, inode, di_bh,
+				   new_i_size);
 	if (ret < 0)
 		mlog_errno(ret);
 
-	up_read(&OCFS2_I(inode)->ip_alloc_sem);
-
-	ocfs2_data_unlock(inode, 0);
-bail_unlock_meta:
-	ocfs2_meta_unlock(inode, 0);
-
-bail:
-	mlog_exit(ret);
+	ocfs2_commit_trans(handle);
+out:
 	return ret;
 }
 
-struct file_operations ocfs2_fops = {
-	.read		= ocfs2_file_read,
-	.write		= ocfs2_file_write,
-	.sendfile	= ocfs2_file_sendfile,
-	.mmap		= ocfs2_mmap,
-	.fsync		= ocfs2_sync_file,
-	.release	= ocfs2_file_release,
-	.open		= ocfs2_file_open,
-	.aio_read	= ocfs2_file_aio_read,
-	.aio_write	= ocfs2_file_aio_write,
-};
-
-struct file_operations ocfs2_dops = {
-	.read		= generic_read_dir,
-	.readdir	= ocfs2_readdir,
-	.fsync		= ocfs2_sync_file,
-};
-
-int ocfs2_set_inode_size(ocfs2_journal_handle *handle,
-			 struct inode *inode,
-			 struct buffer_head *fe_bh,
-			 u64 new_i_size)
-{
-	int status, grow;
-
-	mlog_entry_void();
-
-	grow = new_i_size > inode->i_size;
-	i_size_write(inode, new_i_size);
-	inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
-	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
-
-	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
-
-	/* FIXME: I think this should all be in the caller */
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-	if (!grow)
-		OCFS2_I(inode)->ip_mmu_private = i_size_read(inode);
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
-
-bail:
-	mlog_exit(status);
-	return status;
-}
-
 static int ocfs2_orphan_for_truncate(ocfs2_super *osb,
 				     struct inode *inode,
 				     struct buffer_head *fe_bh,
@@ -458,14 +222,13 @@
 	return status;
 }
 
-static int ocfs2_truncate_file(ocfs2_super *osb,
-			       u64 new_i_size,
-			       struct inode *inode)
+static int ocfs2_truncate_file(struct inode *inode,
+			       struct buffer_head *di_bh,
+			       u64 new_i_size)
 {
 	int status = 0;
 	ocfs2_dinode *fe = NULL;
-	struct buffer_head *fe_bh = NULL;
-	ocfs2_journal_handle *handle = NULL;
+	ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_truncate_context *tc = NULL;
 
 	mlog_entry("(inode = %"MLFu64", new_i_size = %"MLFu64"\n",
@@ -473,19 +236,13 @@
 
 	truncate_inode_pages(inode->i_mapping, new_i_size);
 
-	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &fe_bh,
-				  OCFS2_BH_CACHED, inode);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
-
-	fe = (ocfs2_dinode *) fe_bh->b_data;
+	fe = (ocfs2_dinode *) di_bh->b_data;
 	if (!OCFS2_IS_VALID_DINODE(fe)) {
 		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
 		status = -EIO;
 		goto bail;
 	}
+
 	mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
 			"Inode %"MLFu64", inode i_size = %lld != di "
 			"i_size = %"MLFu64", i_flags = 0x%x\n",
@@ -516,17 +273,7 @@
 		     fe->i_clusters);
 		/* No allocation change is required, so lets fast path
 		 * this truncate. */
-		handle = ocfs2_start_trans(osb, NULL,
-					  OCFS2_INODE_UPDATE_CREDITS);
-		if (IS_ERR(handle)) {
-			status = PTR_ERR(handle);
-			handle = NULL;
-			mlog_errno(status);
-			goto bail;
-		}
-
-		status = ocfs2_set_inode_size(handle, inode, fe_bh,
-					      new_i_size);
+		status = ocfs2_simple_size_update(inode, di_bh, new_i_size);
 		if (status < 0)
 			mlog_errno(status);
 		goto bail;
@@ -544,19 +291,19 @@
 	 * change. Orphan the inode so that recovery can complete the
 	 * truncate if necessary. This does the task of marking
 	 * i_size. */
-	status = ocfs2_orphan_for_truncate(osb, inode, fe_bh, new_i_size);
+	status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 
-	status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
+	status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 
-	status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
+	status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -564,74 +311,34 @@
 
 	/* TODO: orphan dir cleanup here. */
 bail:
-	if (handle)
-		ocfs2_commit_trans(handle);
 
-	if (fe_bh)
-		brelse(fe_bh);
-
 	mlog_exit(status);
 	return status;
 }
 
-static int ocfs2_zero_extend(struct inode *inode)
-{
-	struct address_space *mapping = inode->i_mapping;
-	struct page *page;
-	u64 size = i_size_read(inode) - 1;
-	unsigned int offset;
-	int res = 0;
-
-	/* Start the zeroing of blocks */
-	if (i_size_read(inode) > OCFS2_I(inode)->ip_mmu_private) {
-		page = grab_cache_page(mapping,
-				       size >> PAGE_CACHE_SHIFT);
-		if (!page) {
-			res = -ENOMEM;
-			mlog_errno(res);
-			return res;
-		}
-		offset = (unsigned int)(size & (PAGE_CACHE_SIZE - 1)) + 1;
-		res = mapping->a_ops->prepare_write(NULL, page, offset,
-						    offset);
-		if (res < 0) {
-			mlog_errno(res);
-			goto bail_unlock;
-		}
-
-		res = mapping->a_ops->commit_write(NULL, page, offset, offset);
-		if (res < 0)
-			mlog_errno(res);
-
-bail_unlock:
-		unlock_page(page);
-		page_cache_release(page);
-		mark_inode_dirty(inode);
-	}
-
-	return res;
-}
-
 /*
  * extend allocation only here.
  * we'll update all the disk stuff, and oip->alloc_size
  *
  * expect stuff to be locked, a transaction started and enough data /
- * metadata reservations in the contexts. I'll return -EAGAIN, if we
- * run out of transaction credits, so the caller can restart us.
+ * metadata reservations in the contexts.
+ *
+ * Will return -EAGAIN, and a reason if a restart is needed.
+ * If passed in, *reason will always be set, even in error.
  */
-int ocfs2_extend_allocation(ocfs2_super *osb,
-			    struct inode *inode,
-			    u32 clusters_to_add,
-			    struct buffer_head *fe_bh,
-			    ocfs2_journal_handle *handle,
-			    ocfs2_alloc_context *data_ac,
-			    ocfs2_alloc_context *meta_ac,
-			    enum ocfs2_alloc_restarted *reason)
+int ocfs2_do_extend_allocation(ocfs2_super *osb,
+			       struct inode *inode,
+			       u32 clusters_to_add,
+			       struct buffer_head *fe_bh,
+			       ocfs2_journal_handle *handle,
+			       ocfs2_alloc_context *data_ac,
+			       ocfs2_alloc_context *meta_ac,
+			       enum ocfs2_alloc_restarted *reason_ret)
 {
 	int status = 0;
 	int free_extents;
 	ocfs2_dinode *fe = (ocfs2_dinode *) fe_bh->b_data;
+	enum ocfs2_alloc_restarted reason = RESTART_NONE;
 	u32 bit_off, num_bits;
 	u64 block;
 
@@ -652,16 +359,14 @@
 	if (!free_extents && !meta_ac) {
 		mlog(0, "we haven't reserved any metadata!\n");
 		status = -EAGAIN;
-		if (reason)
-			*reason = RESTART_META;
+		reason = RESTART_META;
 		goto leave;
 	} else if ((!free_extents)
 		   && (ocfs2_alloc_context_bits_left(meta_ac)
 		       < ocfs2_extend_meta_needed(fe))) {
 		mlog(0, "filesystem is really fragmented...\n");
 		status = -EAGAIN;
-		if (reason)
-			*reason = RESTART_META;
+		reason = RESTART_META;
 		goto leave;
 	}
 
@@ -710,57 +415,34 @@
 		mlog(0, "need to alloc once more, clusters = %u, wanted = "
 		     "%u\n", fe->i_clusters, clusters_to_add);
 		status = -EAGAIN;
-		if (reason)
-			*reason = RESTART_TRANS;
+		reason = RESTART_TRANS;
 	}
 
 leave:
 	mlog_exit(status);
+	if (reason_ret)
+		*reason_ret = reason;
 	return status;
 }
 
-/*
- * Ok, this function is heavy on the goto's - we need to clean it up a
- * bit.
- *
- * *bytes_extended is a measure of how much was added to
- * dinode->i_size, NOT how much allocated was actually added to the
- * file. It will always be correct, even when we return an error.
- */
-int ocfs2_extend_file(ocfs2_super *osb,
-		      struct inode *inode,
-		      u64 new_i_size,
-		      u64 *bytes_extended)
+int ocfs2_extend_allocation(struct inode *inode,
+			    u32 clusters_to_add)
 {
 	int status = 0;
 	int restart_func = 0;
 	int drop_alloc_sem = 0;
 	int credits, num_free_extents;
-	u32 clusters_to_add;
-	u64 new_fe_size;
+	u32 prev_clusters;
 	struct buffer_head *bh = NULL;
-	ocfs2_dinode *fe;
+	ocfs2_dinode *fe = NULL;
 	ocfs2_journal_handle *handle = NULL;
 	ocfs2_alloc_context *data_ac = NULL;
 	ocfs2_alloc_context *meta_ac = NULL;
 	enum ocfs2_alloc_restarted why;
+	ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
-	mlog_entry("(new_i_size=%"MLFu64")\n", new_i_size);
+	mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
 
-	*bytes_extended = 0;
-
-	/* setattr sometimes calls us like this. */
-	if (new_i_size == 0)
-		goto leave;
-
-restart_all:
-	handle = ocfs2_alloc_handle(osb);
-	if (handle == NULL) {
-		status = -ENOMEM;
-		mlog_errno(status);
-		goto leave;
-	}
-
 	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
 				  OCFS2_BH_CACHED, inode);
 	if (status < 0) {
@@ -774,23 +456,21 @@
 		status = -EIO;
 		goto leave;
 	}
-	BUG_ON(i_size_read(inode) !=
-	       (le64_to_cpu(fe->i_size) - *bytes_extended));
-	BUG_ON(new_i_size < i_size_read(inode));
 
-	if (i_size_read(inode) == new_i_size)
-  		goto leave;
+restart_all:
+	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
 
-	clusters_to_add = ocfs2_clusters_for_bytes(osb->sb, new_i_size) -
-			  le32_to_cpu(fe->i_clusters);
-
-	mlog(0, "extend inode %"MLFu64", new_i_size = %"MLFu64", "
-		"i_size = %lld, fe->i_clusters = %u, clusters_to_add = %u\n",
-	     OCFS2_I(inode)->ip_blkno, new_i_size, i_size_read(inode),
+	mlog(0, "extend inode %"MLFu64", i_size = %lld, fe->i_clusters = %u, "
+	     "clusters_to_add = %u\n",
+	     OCFS2_I(inode)->ip_blkno, i_size_read(inode),
 	     fe->i_clusters, clusters_to_add);
 
-	if (!clusters_to_add)
-		goto do_start_trans;
+	handle = ocfs2_alloc_handle(osb);
+	if (handle == NULL) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto leave;
+	}
 
 	num_free_extents = ocfs2_num_free_extents(osb,
 						  inode,
@@ -829,7 +509,7 @@
 	 * start_trans is important here -- always do it before! */
 	down_write(&OCFS2_I(inode)->ip_alloc_sem);
 	drop_alloc_sem = 1;
-do_start_trans:
+
 	credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
 	handle = ocfs2_start_trans(osb, handle, credits);
 	if (IS_ERR(handle)) {
@@ -850,53 +530,39 @@
 		goto leave;
 	}
 
-	if (!clusters_to_add)
-		goto no_alloc;
+	prev_clusters = OCFS2_I(inode)->ip_clusters;
 
-	status = ocfs2_extend_allocation(osb,
-					 inode,
-					 clusters_to_add,
-					 bh,
-					 handle,
-					 data_ac,
-					 meta_ac,
-					 &why);
+	status = ocfs2_do_extend_allocation(osb,
+					    inode,
+					    clusters_to_add,
+					    bh,
+					    handle,
+					    data_ac,
+					    meta_ac,
+					    &why);
 	if ((status < 0) && (status != -EAGAIN)) {
 		if (status != -ENOSPC)
 			mlog_errno(status);
 		goto leave;
 	}
 
-	if (status == -EAGAIN && (new_i_size >
-	    ocfs2_clusters_to_bytes(osb->sb, le32_to_cpu(fe->i_clusters)))) {
+	status = ocfs2_journal_dirty(handle, bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
 
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	if (why != RESTART_NONE && clusters_to_add) {
 		if (why == RESTART_META) {
 			mlog(0, "restarting function.\n");
 			restart_func = 1;
 		} else {
 			BUG_ON(why != RESTART_TRANS);
 
-			new_fe_size = ocfs2_clusters_to_bytes(osb->sb,
-						le32_to_cpu(fe->i_clusters));
-			*bytes_extended += new_fe_size -
-					   le64_to_cpu(fe->i_size);
-			/* update i_size in case we crash after the
-			 * extend_trans */
-			fe->i_size = cpu_to_le64(new_fe_size);
-
-			fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec);
-			fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec);
-
-			status = ocfs2_journal_dirty(handle, bh);
-			if (status < 0) {
-				mlog_errno(status);
-				goto leave;
-			}
-
-			clusters_to_add =
-				ocfs2_clusters_for_bytes(osb->sb,
-							 new_i_size)
-				- le32_to_cpu(fe->i_clusters);
 			mlog(0, "restarting transaction.\n");
 			/* TODO: This can be more intelligent. */
 			credits = ocfs2_calc_extend_credits(osb->sb,
@@ -913,34 +579,12 @@
 			goto restarted_transaction;
 		}
 	}
-	status = 0;
 
-no_alloc:
-	/* this may not be the end of our allocation so only update
-	 * i_size to what's appropriate. */
-	new_fe_size = ocfs2_clusters_to_bytes(osb->sb,
-					      le32_to_cpu(fe->i_clusters));
-	if (new_i_size < new_fe_size)
-		new_fe_size = new_i_size;
-
-	*bytes_extended += new_fe_size - le64_to_cpu(fe->i_size);
-	fe->i_size = cpu_to_le64(new_fe_size);
-
 	mlog(0, "fe: i_clusters = %u, i_size=%"MLFu64"\n",
 	     fe->i_clusters, fe->i_size);
-
 	mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
 	     OCFS2_I(inode)->ip_clusters, i_size_read(inode));
 
-	fe->i_ctime = fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec);
-	fe->i_ctime_nsec = fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec);
-
-	status = ocfs2_journal_dirty(handle, bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
-
 leave:
 	if (drop_alloc_sem) {
 		up_write(&OCFS2_I(inode)->ip_alloc_sem);
@@ -958,23 +602,150 @@
 		ocfs2_free_alloc_context(meta_ac);
 		meta_ac = NULL;
 	}
+	if ((!status) && restart_func) {
+		restart_func = 0;
+		goto restart_all;
+	}
 	if (bh) {
 		brelse(bh);
 		bh = NULL;
 	}
-	if ((!status) && restart_func) {
-		restart_func = 0;
-		goto restart_all;
-	}
 
 	mlog_exit(status);
 	return status;
 }
 
+/* Some parts of this taken from generic_cont_expand, which turned out
+ * to be too fragile to do exactly what we need without us having to
+ * worry about recursive locking in ->commit_write(). */
+static int ocfs2_write_zero_page(struct inode *inode,
+				 u64 size)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct page *page;
+	unsigned long index;
+	unsigned int offset;
+	ocfs2_journal_handle *handle = NULL;
+	int ret;
+
+	offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
+	/* ugh.  in prepare/commit_write, if from==to==start of block, we 
+	** skip the prepare.  make sure we never send an offset for the start
+	** of a block
+	*/
+	if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
+		offset++;
+	}
+	index = size >> PAGE_CACHE_SHIFT;
+
+	page = grab_cache_page(mapping, index);
+	if (!page) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_prepare_write(NULL, page, offset, offset);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
+	if (ocfs2_should_order_data(inode)) {
+		handle = ocfs2_start_walk_page_trans(inode, page, offset,
+						     offset);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			handle = NULL;
+			goto out_unlock;
+		}
+	}
+
+	/* must not update i_size! */
+	ret = block_commit_write(page, offset, offset);
+	if (ret < 0)
+		mlog_errno(ret);
+	else
+		ret = 0;
+
+	if (handle)
+		ocfs2_commit_trans(handle);
+out_unlock:
+	unlock_page(page);
+	page_cache_release(page);
+out:
+	return ret;
+}
+
+static int ocfs2_zero_extend(struct inode *inode,
+			     u64 zero_to_size)
+{
+	int ret = 0;
+	u64 start_off;
+	struct super_block *sb = inode->i_sb;
+
+	start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
+	while (start_off < zero_to_size) {
+		ret = ocfs2_write_zero_page(inode, start_off);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		start_off += sb->s_blocksize;
+	}
+
+out:
+	return ret;
+}
+
+static int ocfs2_extend_file(struct inode *inode,
+			     struct buffer_head *di_bh,
+			     u64 new_i_size)
+{
+	int ret = 0;
+	u32 clusters_to_add;
+
+	/* setattr sometimes calls us like this. */
+	if (new_i_size == 0)
+		goto out;
+
+	if (i_size_read(inode) == new_i_size)
+  		goto out;
+	BUG_ON(new_i_size < i_size_read(inode));
+
+	clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - 
+		OCFS2_I(inode)->ip_clusters;
+
+	if (clusters_to_add) {
+		ret = ocfs2_extend_allocation(inode, clusters_to_add);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_zero_extend(inode, new_i_size);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	} 
+
+	/* No allocation required, we just use this helper to
+	 * do a trivial update of i_size. */
+	ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+out:
+	return ret;
+}
+
 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 {
-	int status = 0;
-	u64 newsize, bytes_added;
+	int status = 0, size_change;
 	struct inode *inode = dentry->d_inode;
 	struct super_block *sb = inode->i_sb;
 	ocfs2_super *osb = OCFS2_SB(sb);
@@ -1006,60 +777,33 @@
 	if (status)
 		return status;
 
-	newsize = attr->ia_size;
+	size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
+	if (size_change) {
+		status = ocfs2_rw_lock(inode, 1);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
 
 	status = ocfs2_meta_lock(inode, NULL, &bh, 1);
 	if (status < 0) {
 		if (status != -ENOENT)
 			mlog_errno(status);
-		goto bail;
+		goto bail_unlock_rw;
 	}
 
-	if (S_ISREG(inode->i_mode) &&
-	    attr->ia_valid & ATTR_SIZE &&
-	    newsize != i_size_read(inode)) {
-		bytes_added = 0;
-
-		if (i_size_read(inode) > newsize)
-			status = ocfs2_truncate_file(osb, newsize, inode);
+	if (size_change && attr->ia_size != i_size_read(inode)) {
+		if (i_size_read(inode) > attr->ia_size)
+			status = ocfs2_truncate_file(inode, bh, attr->ia_size);
 		else
-			status = ocfs2_extend_file(osb, inode, newsize,
-						   &bytes_added);
-		if (status < 0 && (!bytes_added)) {
+			status = ocfs2_extend_file(inode, bh, attr->ia_size);
+		if (status < 0) {
 			if (status != -ENOSPC)
 				mlog_errno(status);
 			status = -ENOSPC;
 			goto bail_unlock;
 		}
-
-		/* partial extend, we continue with what we've got. */
-		if (status < 0
-		    && status != -ENOSPC
-		    && status != -EINTR
-		    && status != -ERESTARTSYS)
-			mlog(ML_ERROR,
-			     "status return of %d extending inode "
-			     "%"MLFu64"\n", status,
-			     OCFS2_I(inode)->ip_blkno);
-		status = 0;
-
-		newsize = bytes_added + i_size_read(inode);
-		if (bytes_added)
-			ocfs2_update_inode_size(inode, newsize);
-
-#ifdef OCFS2_ORACORE_WORKAROUNDS
-		spin_lock(&OCFS2_I(inode)->ip_lock);
-		if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_OPEN_DIRECT) {
-			/* This is a total broken hack for O_DIRECT crack */
-			OCFS2_I(inode)->ip_mmu_private = i_size_read(inode);
-		}
-		spin_unlock(&OCFS2_I(inode)->ip_lock);
-#endif
-		status = ocfs2_zero_extend(inode);
-		if (status < 0) {
-			mlog_errno(status);
-			goto bail_unlock;
-		}
 	}
 
 	handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
@@ -1083,6 +827,9 @@
 	ocfs2_commit_trans(handle);
 bail_unlock:
 	ocfs2_meta_unlock(inode, 1);
+bail_unlock_rw:
+	if (size_change)
+		ocfs2_rw_unlock(inode, 1);
 bail:
 	if (bh)
 		brelse(bh);
@@ -1120,6 +867,272 @@
 	return err;
 }
 
+static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
+				    const char __user *buf,
+				    size_t count,
+				    loff_t pos)
+{
+	struct iovec local_iov = { .iov_base = (void __user *)buf,
+				   .iov_len = count };
+	int ret, rw_level = -1, meta_level = -1, have_alloc_sem = 0;
+	u32 clusters;
+	struct file *filp = iocb->ki_filp;
+	struct inode *inode = filp->f_dentry->d_inode;
+	loff_t newsize, saved_pos;
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+	ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+#endif
+
+	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
+		   (unsigned int)count,
+		   filp->f_dentry->d_name.len,
+		   filp->f_dentry->d_name.name);
+
+	/* happy write of zero bytes */
+	if (count == 0)
+		return 0;
+
+	if (!inode) {
+		mlog(0, "bad inode\n");
+		return -EIO;
+	}
+
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+	/* ugh, work around some applications which open everything O_DIRECT +
+	 * O_APPEND and really don't mean to use O_DIRECT. */
+	if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS &&
+	    (filp->f_flags & O_APPEND) && (filp->f_flags & O_DIRECT)) 
+		filp->f_flags &= ~O_DIRECT;
+#endif
+
+
+	down(&inode->i_sem);
+	/* to match setattr's i_sem -> i_alloc_sem -> rw_lock ordering */
+	if (filp->f_flags & O_DIRECT) {
+		have_alloc_sem = 1;
+		down_read(&inode->i_alloc_sem);
+	}
+
+	/* concurrent O_DIRECT writes are allowed */
+	rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1;
+	ret = ocfs2_rw_lock(inode, rw_level);
+	if (ret < 0) {
+		rw_level = -1;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* 
+	 * We sample i_size under a read level meta lock to see if our write
+	 * is extending the file, if it is we back off and get a write level
+	 * meta lock.
+	 */
+	meta_level = (filp->f_flags & O_APPEND) ? 1 : 0;
+	for(;;) {
+		ret = ocfs2_meta_lock(inode, NULL, NULL, meta_level);
+		if (ret < 0) {
+			meta_level = -1;
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/* work on a copy of ppos until we're sure that we won't have
+		 * to recalculate it due to relocking. */
+		if (filp->f_flags & O_APPEND) {
+			saved_pos = i_size_read(inode);
+			mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);
+		} else {
+			saved_pos = iocb->ki_pos;
+		}
+		newsize = count + saved_pos;
+
+		mlog(0, "pos=%lld newsize=%"MLFu64" cursize=%lld\n",
+		     saved_pos, newsize, i_size_read(inode));
+
+		/* No need for a higher level metadata lock if we're
+		 * never going past i_size. */
+		if (newsize <= i_size_read(inode))
+			break;
+
+		if (meta_level == 0) {
+			ocfs2_meta_unlock(inode, meta_level);
+			meta_level = 1;
+			continue;
+		}
+
+		spin_lock(&OCFS2_I(inode)->ip_lock);
+		clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) -
+			OCFS2_I(inode)->ip_clusters;
+		spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+		mlog(0, "Writing at EOF, may need more allocation: "
+		     "i_size = %lld, newsize = %"MLFu64", need %u clusters\n",
+		     i_size_read(inode), newsize, clusters);
+
+		/* We only want to continue the rest of this loop if
+		 * our extend will actually require more
+		 * allocation. */
+		if (!clusters)
+			break;
+
+		ret = ocfs2_extend_allocation(inode, clusters);
+		if (ret < 0) {
+			if (ret != -ENOSPC)
+				mlog_errno(ret);
+			goto out;
+		}
+
+		/* Fill any holes which would've been created by this
+		 * write. If we're O_APPEND, this will wind up
+		 * (correctly) being a noop. */
+		ret = ocfs2_zero_extend(inode, (u64) newsize - count);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+		break;
+	}
+
+	/* ok, we're done with i_size and alloc work */
+	iocb->ki_pos = saved_pos;
+	ocfs2_meta_unlock(inode, meta_level);
+	meta_level = -1;
+
+	/* communicate with ocfs2_dio_end_io */
+	ocfs2_iocb_set_rw_locked(iocb);
+
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+	if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS &&
+	    filp->f_flags & O_DIRECT) {
+		unsigned int saved_flags = filp->f_flags;
+		int sector_size = 1 << osb->s_sectsize_bits;
+
+		if ((saved_pos & (sector_size - 1)) ||
+		    (count & (sector_size - 1)) ||
+		    ((unsigned long)buf & (sector_size - 1))) {
+			filp->f_flags |= O_SYNC;
+			filp->f_flags &= ~O_DIRECT;
+		}
+
+		ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
+						    &iocb->ki_pos);
+
+		filp->f_flags = saved_flags;
+	} else
+#endif
+		ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
+						    &iocb->ki_pos);
+
+	/* buffered aio wouldn't have proper lock coverage today */
+	BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
+
+	/* 
+	 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
+	 * function pointer which is called when o_direct io completes so that
+	 * it can unlock our rw lock.  (it's the clustered equivalent of
+	 * i_alloc_sem; protects truncate from racing with pending ios).
+	 * Unfortunately there are error cases which call end_io and others
+	 * that don't.  so we don't have to unlock the rw_lock if either an
+	 * async dio is going to do it in the future or an end_io after an
+	 * error has already done it.
+	 */
+	if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
+		rw_level = -1;
+		have_alloc_sem = 0;
+	}
+
+out:
+	if (meta_level != -1)
+		ocfs2_meta_unlock(inode, meta_level);
+	if (have_alloc_sem)
+		up_read(&inode->i_alloc_sem);
+	if (rw_level != -1) 
+		ocfs2_rw_unlock(inode, rw_level);
+	up(&inode->i_sem);
+
+	mlog_exit(ret);
+	return ret;
+}
+
+static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
+				   char __user *buf,
+				   size_t count,
+				   loff_t pos)
+{
+	int ret = 0, rw_level = -1, have_alloc_sem = 0;
+	struct file *filp = iocb->ki_filp;
+	struct inode *inode = filp->f_dentry->d_inode;
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+	ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+#endif
+
+	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
+		   (unsigned int)count,
+		   filp->f_dentry->d_name.len,
+		   filp->f_dentry->d_name.name);
+
+	if (!inode) {
+		ret = -EINVAL;
+		mlog_errno(ret);
+		goto bail;
+	}
+
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+	if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
+		if (filp->f_flags & O_DIRECT) {
+			int sector_size = 1 << osb->s_sectsize_bits;
+
+			if ((pos & (sector_size - 1)) ||
+			    (count & (sector_size - 1)) ||
+			    ((unsigned long)buf & (sector_size - 1)) ||
+			    (i_size_read(inode) & (sector_size -1))) {
+				filp->f_flags &= ~O_DIRECT;
+			}
+		}
+	}
+#endif
+
+	/* 
+	 * buffered reads protect themselves in ->readpage().  O_DIRECT reads
+	 * need locks to protect pending reads from racing with truncate.
+	 */
+	if (filp->f_flags & O_DIRECT) {
+		down_read(&inode->i_alloc_sem);
+		have_alloc_sem = 1;
+
+		ret = ocfs2_rw_lock(inode, 0);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto bail;
+		}
+		rw_level = 0;
+		/* communicate with ocfs2_dio_end_io */
+		ocfs2_iocb_set_rw_locked(iocb);
+	}
+
+	ret = generic_file_aio_read(iocb, buf, count, iocb->ki_pos);
+	if (ret == -EINVAL)
+		mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n");
+
+	/* buffered aio wouldn't have proper lock coverage today */
+	BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
+
+	/* see ocfs2_file_aio_write */
+	if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
+		rw_level = -1;
+		have_alloc_sem = 0;
+	}
+
+bail:
+	if (have_alloc_sem)
+		up_read(&inode->i_alloc_sem);
+	if (rw_level != -1) 
+		ocfs2_rw_unlock(inode, rw_level);
+	mlog_exit(ret);
+
+	return ret;
+}
+
 struct inode_operations ocfs2_file_iops = {
 	.setattr	= ocfs2_setattr,
 	.getattr	= ocfs2_getattr,
@@ -1129,3 +1142,21 @@
 	.setattr	= ocfs2_setattr,
 	.getattr	= ocfs2_getattr,
 };
+
+struct file_operations ocfs2_fops = {
+	.read		= do_sync_read,
+	.write		= do_sync_write,
+	.sendfile	= generic_file_sendfile,
+	.mmap		= ocfs2_mmap,
+	.fsync		= ocfs2_sync_file,
+	.release	= ocfs2_file_release,
+	.open		= ocfs2_file_open,
+	.aio_read	= ocfs2_file_aio_read,
+	.aio_write	= ocfs2_file_aio_write,
+};
+
+struct file_operations ocfs2_dops = {
+	.read		= generic_read_dir,
+	.readdir	= ocfs2_readdir,
+	.fsync		= ocfs2_sync_file,
+};

Modified: trunk/fs/ocfs2/file.h
===================================================================
--- trunk/fs/ocfs2/file.h	2005-11-03 22:39:00 UTC (rev 2676)
+++ trunk/fs/ocfs2/file.h	2005-11-05 00:27:07 UTC (rev 2677)
@@ -33,31 +33,25 @@
 struct _ocfs2_alloc_context;
 
 enum ocfs2_alloc_restarted {
-	RESTART_TRANS = 0,
+	RESTART_NONE = 0,
+	RESTART_TRANS,
 	RESTART_META
 };
-int ocfs2_extend_allocation(ocfs2_super *osb,
-			    struct inode *inode,
-			    u32 clusters_to_add,
-			    struct buffer_head *fe_bh,
-			    ocfs2_journal_handle *handle,
-			    struct _ocfs2_alloc_context *data_ac,
-			    struct _ocfs2_alloc_context *meta_ac,
-			    enum ocfs2_alloc_restarted *reason);
+int ocfs2_do_extend_allocation(ocfs2_super *osb,
+			       struct inode *inode,
+			       u32 clusters_to_add,
+			       struct buffer_head *fe_bh,
+			       ocfs2_journal_handle *handle,
+			       struct _ocfs2_alloc_context *data_ac,
+			       struct _ocfs2_alloc_context *meta_ac,
+			       enum ocfs2_alloc_restarted *reason);
 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
 int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		  struct kstat *stat);
-int ocfs2_extend_file(ocfs2_super *osb,
-		      struct inode *inode,
-		      u64 new_i_size,
-		      u64 *bytes_extended);
 
 int ocfs2_set_inode_size(ocfs2_journal_handle *handle,
 			 struct inode *inode,
 			 struct buffer_head *fe_bh,
 			 u64 new_i_size);
 
-void ocfs2_file_finish_extension(struct inode *inode, loff_t newsize,
-				 unsigned direct_extend);
-
 #endif /* OCFS2_FILE_H */

Modified: trunk/fs/ocfs2/inode.c
===================================================================
--- trunk/fs/ocfs2/inode.c	2005-11-03 22:39:00 UTC (rev 2676)
+++ trunk/fs/ocfs2/inode.c	2005-11-05 00:27:07 UTC (rev 2677)
@@ -299,7 +299,6 @@
 		    inode->i_fop = &ocfs2_fops;
 		    inode->i_op = &ocfs2_file_iops;
 		    i_size_write(inode, le64_to_cpu(fe->i_size));
-		    OCFS2_I(inode)->ip_mmu_private = inode->i_size;
 		    break;
 	    case S_IFDIR:
 		    inode->i_op = &ocfs2_dir_iops;
@@ -320,6 +319,8 @@
 		    break;
 	}
 
+	ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres,
+				  OCFS2_LOCK_TYPE_RW, inode);
 	ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
 				  OCFS2_LOCK_TYPE_META, inode);
 	ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_data_lockres,
@@ -788,7 +789,12 @@
 
 	/* Lock down the inode. This gives us an up to date view of
 	 * it's metadata (for verification), and allows us to
-	 * serialize delete_inode votes. */
+	 * serialize delete_inode votes. 
+	 *
+	 * Even though we might be doing a truncate, we don't take the
+	 * allocation lock here as it won't be needed - nobody will
+	 * have the file open.
+	 */
 	status = ocfs2_meta_lock(inode, NULL, &di_bh, 1);
 	if (status < 0) {
 		if (status != -ENOENT)
@@ -857,6 +863,7 @@
 
 	/* Do these before all the other work so that we don't bounce
 	 * the vote thread while waiting to destroy the locks. */
+	ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
 	ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres);
 	ocfs2_mark_lockres_freeing(&oi->ip_data_lockres);
 
@@ -880,6 +887,7 @@
 	if (status < 0)
 		mlog_errno(status);
 
+	ocfs2_lock_res_free(&oi->ip_rw_lockres);
 	ocfs2_lock_res_free(&oi->ip_meta_lockres);
 	ocfs2_lock_res_free(&oi->ip_data_lockres);
 
@@ -1114,9 +1122,6 @@
 
 	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
 	i_size_write(inode, le64_to_cpu(fe->i_size));
-	if (S_ISREG(inode->i_mode)) {
-		OCFS2_I(inode)->ip_mmu_private = i_size_read(inode);
-	}
 	inode->i_nlink = le16_to_cpu(fe->i_links_count);
 	inode->i_uid = le32_to_cpu(fe->i_uid);
 	inode->i_gid = le32_to_cpu(fe->i_gid);

Modified: trunk/fs/ocfs2/inode.h
===================================================================
--- trunk/fs/ocfs2/inode.h	2005-11-03 22:39:00 UTC (rev 2676)
+++ trunk/fs/ocfs2/inode.h	2005-11-05 00:27:07 UTC (rev 2677)
@@ -31,6 +31,7 @@
 {
 	u64			ip_blkno;
 
+	struct ocfs2_lock_res	ip_rw_lockres;
 	struct ocfs2_lock_res	ip_meta_lockres;
 	struct ocfs2_lock_res	ip_data_lockres;
 
@@ -41,7 +42,6 @@
 	spinlock_t		ip_lock;
 	u32			ip_open_count;
 	u32			ip_clusters;
-	loff_t			ip_mmu_private;
 	struct ocfs2_extent_map	ip_map;
 	struct list_head	ip_io_markers;
 	int			ip_orphaned_slot;

Modified: trunk/fs/ocfs2/journal.c
===================================================================
--- trunk/fs/ocfs2/journal.c	2005-11-03 22:39:00 UTC (rev 2676)
+++ trunk/fs/ocfs2/journal.c	2005-11-05 00:27:07 UTC (rev 2677)
@@ -446,6 +446,18 @@
 	return status;
 }
 
+int ocfs2_journal_dirty_data(handle_t *handle,
+			     struct buffer_head *bh)
+{
+	int err = journal_dirty_data(handle, bh);
+	if (err)
+		mlog_errno(err);
+	/* TODO: When we can handle it, abort the handle and go RO on
+	 * error here. */
+
+	return err;
+}
+
 /* We always assume you're adding a metadata lock at level 'ex' */
 int ocfs2_handle_add_lock(ocfs2_journal_handle *handle,
 			  struct inode *inode)
@@ -1144,7 +1156,7 @@
 	SET_INODE_JOURNAL(inode);
 
 	status = ocfs2_meta_lock_full(inode, NULL, &bh, 1,
-				      OCFS2_META_LOCK_RECOVERY, NULL, 0);
+				      OCFS2_META_LOCK_RECOVERY);
 	if (status < 0) {
 		mlog(0, "status returned from ocfs2_meta_lock=%d\n", status);
 		if (status != -ERESTARTSYS)
@@ -1334,7 +1346,7 @@
 	SET_INODE_JOURNAL(inode);
 
 	flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE;
-	status = ocfs2_meta_lock_full(inode, NULL, NULL, 1, flags, NULL, 0);
+	status = ocfs2_meta_lock_full(inode, NULL, NULL, 1, flags);
 	if (status < 0) {
 		if (status != -EAGAIN)
 			mlog_errno(status);

Modified: trunk/fs/ocfs2/journal.h
===================================================================
--- trunk/fs/ocfs2/journal.h	2005-11-03 22:39:00 UTC (rev 2676)
+++ trunk/fs/ocfs2/journal.h	2005-11-05 00:27:07 UTC (rev 2677)
@@ -251,6 +251,8 @@
  *                          buffer. Will have to call ocfs2_journal_dirty once
  *                          we've actually dirtied it. Type is one of . or .
  *  ocfs2_journal_dirty    - Mark a journalled buffer as having dirty data.
+ *  ocfs2_journal_dirty_data - Indicate that a data buffer should go out before
+ *                             the current handle commits.
  *  ocfs2_handle_add_lock  - Sometimes we need to delay lock release
  *                          until after a transaction has been completed. Use
  *                          ocfs2_handle_add_lock to indicate that a lock needs
@@ -310,6 +312,8 @@
  */
 int                  ocfs2_journal_dirty(ocfs2_journal_handle *handle,
 					 struct buffer_head *bh);
+int                  ocfs2_journal_dirty_data(handle_t *handle,
+					      struct buffer_head *bh);
 int                  ocfs2_handle_add_lock(ocfs2_journal_handle *handle,
 					   struct inode *inode);
 /*

Modified: trunk/fs/ocfs2/mmap.c
===================================================================
--- trunk/fs/ocfs2/mmap.c	2005-11-03 22:39:00 UTC (rev 2676)
+++ trunk/fs/ocfs2/mmap.c	2005-11-05 00:27:07 UTC (rev 2677)
@@ -42,102 +42,38 @@
 #include "inode.h"
 #include "mmap.h"
 
-static inline u64 ocfs2_binode_blkno(struct ocfs2_backing_inode *binode);
-static inline struct rb_node * __ocfs2_buffer_lock_ctxt_root(
-	struct ocfs2_buffer_lock_ctxt *ctxt);
-static int ocfs2_buffer_lock_ctxt_insert(struct ocfs2_buffer_lock_ctxt *ctxt,
-					 struct inode *inode,
-					 struct ocfs2_backing_inode **binode_ret);
-static int ocfs2_fill_ctxt_from_buf(struct super_block *sb,
-				    struct inode *target_inode,
-				    char __user *buf,
-				    size_t size,
-				    struct ocfs2_buffer_lock_ctxt *ctxt);
-
 static struct page *ocfs2_nopage(struct vm_area_struct * area,
 				 unsigned long address,
 				 int *type)
 {
-	int status, tmpstat, locked;
 	struct inode *inode = area->vm_file->f_dentry->d_inode;
-	struct page *page;
+	struct page *page = NOPAGE_SIGBUS;
 	sigset_t blocked, oldset;
-	DECLARE_IO_MARKER(io_marker);
+	int ret;
 
-	mlog_entry("(inode %lu, address %lu)\n", inode->i_ino,
-		   address);
+	mlog_entry("(inode %lu, address %lu)\n", inode->i_ino, address);
 
-	locked = ocfs2_is_in_io_marker_list(inode, current);
+	/* The best way to deal with signals in this path is
+	 * to block them upfront, rather than allowing the
+	 * locking paths to return -ERESTARTSYS. */
+	sigfillset(&blocked);
 
-	if (!locked) {
-		/* For lack of a better error... Unfortunately returns
-		 * from nopage aren't very expressive right now. */
-		page = NOPAGE_SIGBUS;
-
-		/* The best way to deal with signals in this path is
-		 * to block them upfront, rather than allowing the
-		 * locking paths to return -ERESTARTSYS. */
-		sigfillset(&blocked);
-
-		/* We should technically never get a bad status return
-		 * from sigprocmask */
-		status = sigprocmask(SIG_BLOCK, &blocked, &oldset);
-		if (status < 0) {
-			mlog_errno(status);
-			goto bail;
-		}
-
-		/* Since we don't allow shared writable, we need only
-		 * worry about read locking here. */
-		status = ocfs2_meta_lock(inode, NULL, NULL, 0);
-		if (status < 0) {
-			mlog_errno(status);
-
-			if (status == -ENOMEM)
-				page = NOPAGE_OOM;
-			goto bail_setmask;
-		}
-
-		status = ocfs2_data_lock(inode, 0);
-		if (status < 0) {
-			mlog_errno(status);
-
-			if (status == -ENOMEM)
-				page = NOPAGE_OOM;
-			goto bail_unlock;
-		}
-
-		tmpstat = sigprocmask(SIG_SETMASK, &oldset, NULL);
-		if (tmpstat < 0)
-			mlog_errno(tmpstat);
-
-		/* I'm not sure if we can somehow recurse back into
-		 * nopage or not, but this doesn't cost us anything,
-		 * so lets do it for now. */
-		ocfs2_add_io_marker(inode, &io_marker);
+	/* We should technically never get a bad return
+	 * from sigprocmask */
+	ret = sigprocmask(SIG_BLOCK, &blocked, &oldset);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
 	}
 
 	page = filemap_nopage(area, address, type);
 
-	if (!locked) {
-		ocfs2_del_io_marker(inode, &io_marker);
-		ocfs2_data_unlock(inode, 0);
-		ocfs2_meta_unlock(inode, 0);
-	}
-bail:
+	ret = sigprocmask(SIG_SETMASK, &oldset, NULL);
+	if (ret < 0)
+		mlog_errno(ret);
+out:
 	mlog_exit_ptr(page);
 	return page;
-
-bail_unlock:
-	ocfs2_meta_unlock(inode, 0);
-
-bail_setmask:
-	tmpstat = sigprocmask(SIG_SETMASK, &oldset, NULL);
-	if (tmpstat < 0)
-		mlog_errno(tmpstat);
-
-	mlog_exit_ptr(page);
-	return page;
 }
 
 static struct vm_operations_struct ocfs2_file_vm_ops = {
@@ -164,516 +100,3 @@
 	return 0;
 }
 
-static inline u64 ocfs2_binode_blkno(struct ocfs2_backing_inode *binode)
-{
-	struct inode *inode = binode->ba_inode;
-
-	BUG_ON(!inode);
-
-	return OCFS2_I(inode)->ip_blkno;
-}
-
-static inline struct rb_node * __ocfs2_buffer_lock_ctxt_root(
-	struct ocfs2_buffer_lock_ctxt *ctxt)
-{
-	return ctxt->b_inodes.rb_node;
-}
-
-static int ocfs2_buffer_lock_ctxt_insert(struct ocfs2_buffer_lock_ctxt *ctxt,
-					 struct inode *inode,
-					 struct ocfs2_backing_inode **binode_ret)
-{
-	u64 blkno;
-	struct ocfs2_backing_inode *tmp, *binode;
-	struct rb_node * parent = NULL;
-	struct rb_node ** p = &ctxt->b_inodes.rb_node;
-
-	BUG_ON(!ctxt);
-	BUG_ON(!inode);
-
-	blkno = OCFS2_I(inode)->ip_blkno;
-
-	while(*p) {
-		parent = *p;
-		tmp = rb_entry(parent, struct ocfs2_backing_inode, ba_node);
-
-		if (blkno < ocfs2_binode_blkno(tmp))
-			p = &(*p)->rb_left;
-		else if (blkno > ocfs2_binode_blkno(tmp))
-			p = &(*p)->rb_right;
-		else
-			return 0; /* Don't insert duplicates */
-	}
-
-	binode = kcalloc(1, sizeof(struct ocfs2_backing_inode), GFP_KERNEL);
-	if (!binode)
-		return -ENOMEM;
-	binode->ba_inode = inode;
-	ocfs2_init_io_marker(&binode->ba_task);
-
-	if (binode_ret)
-		*binode_ret = binode;
-
-	rb_link_node(&binode->ba_node, parent, p);
-	rb_insert_color(&binode->ba_node, &ctxt->b_inodes);
-
-	return 0;
-}
-
-static int ocfs2_fill_ctxt_from_buf(struct super_block *sb,
-				    struct inode *target_inode,
-				    char __user *buf,
-				    size_t size,
-				    struct ocfs2_buffer_lock_ctxt *ctxt)
-{
-	int status;
-	unsigned long start = (unsigned long)buf;
-	unsigned long end = start + size;
-	struct inode *inode;
-	struct mm_struct *mm = current->mm;
-	struct vm_area_struct *vma;
-
-	for (vma = find_vma(mm, start); vma; vma = vma->vm_next) {
-		if (end <= vma->vm_start)
-			break;
-		if (vma->vm_ops == &ocfs2_file_vm_ops) {
-			if (!vma->vm_file)
-				continue;
-			inode = vma->vm_file->f_dentry->d_inode;
-			if (inode->i_sb == sb &&
-			    inode != target_inode) {
-				status = ocfs2_buffer_lock_ctxt_insert(ctxt,
-								       inode,
-								       NULL);
-				if (status < 0)
-					goto bail;
-			}
-		}
-	}
-	status = 0;
-bail:
-	return status;
-}
-
-int ocfs2_setup_io_locks(struct super_block *sb,
-			 struct inode *target_inode,
-			 char __user *buf,
-			 size_t size,
-			 struct ocfs2_buffer_lock_ctxt *ctxt,
-			 struct ocfs2_backing_inode **target_binode)
-{
-	struct mm_struct *mm = current->mm;
-	int skip_sem = (current->flags & PF_DUMPCORE) || !mm;
-	int status;
-
-	if (!skip_sem)
-		down_read(&mm->mmap_sem);
-
-	BUG_ON(__ocfs2_buffer_lock_ctxt_root(ctxt));
-
-	/* We always insert target because it might not be backing part of the
-	 * buffer - but it needs to be in there so that it's lock gets ordered
-	 * with everything else */
-	status = ocfs2_buffer_lock_ctxt_insert(ctxt, target_inode,
-					       target_binode);
-
-	/* knfsd, which lacks an mm, may call us to do I/O. Since the buffer
-	 * is private to the kernel, there isn't any need to insert any other
-	 * locks, so we can skip it.
-	 *
-	 * The pile of duct tape and mixed nuts that is NFS 1, universe 0
-	 */
-	if (!status && mm) {
-		/* Now fill the tree with any inodes that back this
-		 * buffer. If target inode is in there, it will be
-		 * skipped over. */
-		status = ocfs2_fill_ctxt_from_buf(sb, target_inode, buf, size,
-						  ctxt);
-	}
-
-	if (!skip_sem)
-		up_read(&mm->mmap_sem);
-
-	if (status < 0) {
-		mlog_errno(status);
-		ocfs2_unlock_buffer_inodes(ctxt);
-		goto bail;
-	}
-
-	status = 0;
-bail:
-	return status;
-}
-
-/* starting from pos, which can be null for the first call, give the
- * next buffer that needs unlocking.  we return null when there are none
- * left or we see last_inode */
-static struct ocfs2_backing_inode *
-ocfs2_next_unlocked(struct ocfs2_buffer_lock_ctxt *ctxt,
-		    struct inode *last_inode,
-		    struct ocfs2_backing_inode *pos)
-{
-	struct ocfs2_backing_inode *binode = NULL;
-	struct rb_node *node = NULL;
-
-	if (pos == NULL) {
-		if (ctxt->b_next_unlocked)
-			binode = ctxt->b_next_unlocked;
-		else
-			node = rb_first(&ctxt->b_inodes);
-	} else
-		node = rb_next(&pos->ba_node);
-
-	if (node)
-		binode = rb_entry(node, struct ocfs2_backing_inode, ba_node);
-
-	if (binode && last_inode && binode->ba_inode == last_inode)
-		binode = NULL;
-
-	/* this is just an optimization to skip nodes in the tree
-	 * that we've already seen.  If we're moving from one we've locked
-	 * to one we haven't then we mark this node in the ctxt so that
-	 * we'll return to it in a future after, say, hitting last_inode
-	 * or EIOCBRETRY in lock_buffer_inodes */
-	if (pos && pos->ba_locked && binode)
-		ctxt->b_next_unlocked = binode;
-
-	return binode;
-}
-
-/* Will take locks on all inodes in the ctxt up until 'last_inode'. If
- * last_inode is NULL, then we take locks on everything. We mark lock
- * status on the context so we skip any that have already been
- * locked. On error we will completely abort the context. */
-/* WARNING: If you get a failure case here, you *must* call
- * "ocfs2_unlock_buffer_inodes" as we may have left a few inodes under
- * cluster lock. */
-int ocfs2_lock_buffer_inodes(struct ocfs2_buffer_lock_ctxt *ctxt,
-			     struct inode *last_inode)
-{
-	int status, data_level;
-	struct ocfs2_backing_inode *binode = NULL;
-	struct inode *inode;
-
-	while((binode = ocfs2_next_unlocked(ctxt, last_inode, binode))) {
-		/* the tricksy caller might have locked inodes themselves
-		 * between calls. */
-		if (binode->ba_locked)
-			continue;
-		inode = binode->ba_inode;
-
-		if (!binode->ba_meta_locked) {
-			status = ocfs2_meta_lock_full(inode, NULL, NULL,
-						      binode->ba_lock_meta_level,
-						      0, ctxt->b_cb,
-						      ctxt->b_cb_data);
-
-			if (status < 0) {
-				if (status != -EIOCBRETRY)
-					mlog_errno(status);
-				goto bail;
-			}
-
-			binode->ba_meta_locked = 1;
-		}
-
-		/* ba_lock_data isn't set for direct io */
-		if (binode->ba_lock_data) {
-			data_level = binode->ba_lock_data_level;
-			status = ocfs2_data_lock(inode, data_level);
-			if (status < 0) {
-				if (status == -EIOCBRETRY)
-					goto bail;
-
-				/* clean up the metadata lock that we took
-				 * above
-				 */
-				ocfs2_meta_unlock(inode,
-						  binode->ba_lock_meta_level);
-				binode->ba_meta_locked = 0;
-
-				mlog_errno(status);
-				goto bail;
-			}
-		}
-		ocfs2_add_io_marker(inode, &binode->ba_task);
-		binode->ba_locked = 1;
-	}
-
-	status = 0;
-bail:
-	return status;
-}
-
-void ocfs2_unlock_buffer_inodes(struct ocfs2_buffer_lock_ctxt *ctxt)
-{
-	struct ocfs2_backing_inode *binode;
-	struct rb_node *node;
-
-	/* dlm locks don't mask ints.. this should be lower down */
-	BUG_ON(in_interrupt());
-
-	/* unlock in reverse order to minimize waking forward lockers */
-	while ((node = rb_last(&ctxt->b_inodes)) != NULL) {
-		binode = rb_entry(node, struct ocfs2_backing_inode, ba_node);
-
-		ocfs2_del_io_marker(binode->ba_inode, &binode->ba_task);
-
-		if (binode->ba_locked && binode->ba_lock_data)
-			ocfs2_data_unlock(binode->ba_inode,
-					  binode->ba_lock_data_level);
-
-		if (binode->ba_locked || binode->ba_meta_locked)
-			ocfs2_meta_unlock(binode->ba_inode,
-					  binode->ba_lock_meta_level);
-
-		rb_erase(node, &ctxt->b_inodes);
-		kfree(binode);
-	}
-
-	ctxt->b_next_unlocked = NULL;
-}
-
-/*
- * This builds up the locking state that will be used by a write.  both normal
- * file writes and AIO writes come in through here.  This function does no
- * teardown on its own.  The caller must examine the info struct to see if it
- * needs to release locks or i_sem, etc.  This function is also restartable in
- * that it can return EIOCBRETRY if it would have blocked in the dlm.  It
- * stores its partial progress in the info struct so the caller can call back
- * in when it thinks the dlm won't block any more.  Thus, the caller must zero
- * the info struct before calling in the first time.
- */
-ssize_t ocfs2_write_lock_maybe_extend(struct file *filp,
-				      const char __user *buf,
-				      size_t count,
-				      loff_t *ppos,
-				      struct ocfs2_write_lock_info *info,
-				      struct ocfs2_buffer_lock_ctxt *ctxt)
-{
-	int ret = 0;
-	ocfs2_super *osb = NULL;
-	struct dentry *dentry = filp->f_dentry;
-	struct inode *inode = dentry->d_inode;
-	int status;
-	int level = filp->f_flags & O_APPEND;
-	loff_t saved_ppos;
-	u64 bytes_added = 0;
-
-	osb = OCFS2_SB(inode->i_sb);
-
-	/* the target inode is different from the other inodes.  in o_direct it
-	 * doesn't get a data lock and when appending it gets a level 1 meta
-	 * lock.  we use target_binode to set its flags accordingly */
-	if (info->wl_target_binode == NULL) {
-		ret = ocfs2_setup_io_locks(inode->i_sb, inode,
-					   (char __user *) buf,
-					   count, ctxt,
-					   &info->wl_target_binode);
-		if (ret < 0) {
-			BUG_ON(ret == -EIOCBRETRY);
-			mlog_errno(ret);
-			goto bail;
-		}
-	}
-
-	/* This will lock everyone in the context who's order puts
-	 * them before us. */
-	if (!info->wl_have_before) {
-		info->wl_unlock_ctxt = 1;
-		ret = ocfs2_lock_buffer_inodes(ctxt, inode);
-		if (ret < 0) {
-			if (ret != -EIOCBRETRY)
-				mlog_errno(ret);
-			goto bail;
-		}
-		info->wl_have_before = 1;
-		/* we're writing so get an ex data cluster lock */
-		info->wl_target_binode->ba_lock_data_level = 1;
-	}
-
-	if (!info->wl_have_i_sem) {
-		down(&inode->i_sem);
-		info->wl_have_i_sem = 1;
-	}
-
-lock:
-	if (!info->wl_have_target_meta) {
-		status = ocfs2_meta_lock(inode, NULL, NULL, level);
-		if (status < 0) {
-			mlog_errno(status);
-			ret = status;
-			goto bail;
-		}
-		info->wl_have_target_meta = 1;
-	}
-	/* to handle extending writes, we do a bit of our own locking
-	 * here, but we setup the ctxt do unlock for us (as well as
-	 * handle locking everything else. */
-	if (level)
-		info->wl_target_binode->ba_lock_meta_level = 1;
-
-	/* work on a copy of ppos until we're sure that we won't have
-	 * to recalculate it due to relocking. */
-	saved_ppos = *ppos;
-
-	if (filp->f_flags & O_APPEND) {
-		saved_ppos = i_size_read(inode);
-		mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_ppos);
-
-#ifdef OCFS2_ORACORE_WORKAROUNDS
-		if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
-			/* ugh, work around some applications which open
-			 * everything O_DIRECT + O_APPEND and really don't
-			 * mean to use O_DIRECT. */
-			filp->f_flags &= ~O_DIRECT;
-		}
-#endif
-	}
-
-	if (filp->f_flags & O_DIRECT) {
-#ifdef OCFS2_ORACORE_WORKAROUNDS
-		if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
-			int sector_size = 1 << osb->s_sectsize_bits;
-
-			if ((saved_ppos & (sector_size - 1)) ||
-			    (count & (sector_size - 1)) ||
-			    ((unsigned long)buf & (sector_size - 1))) {
-				info->wl_do_direct_io = 0;
-				filp->f_flags |= O_SYNC;
-			} else {
-				info->wl_do_direct_io = 1;
-			}
-		} else
-#endif
-			info->wl_do_direct_io = 1;
-
-		mlog(0, "O_DIRECT\n");
-	}
-
-	info->wl_target_binode->ba_lock_data = info->wl_do_direct_io ? 0 : 1;
-
-	info->wl_newsize = count + saved_ppos;
-	if (filp->f_flags & O_APPEND)
-		info->wl_newsize = count + i_size_read(inode);
-
-	mlog(0, "ppos=%lld newsize=%"MLFu64" cursize=%lld\n", saved_ppos,
-	     info->wl_newsize, i_size_read(inode));
-
-	if (info->wl_newsize > i_size_read(inode)) {
-		if (!level) {
-			/* we want an extend, but need a higher
-			 * level cluster lock. */
-			mlog(0, "inode %"MLFu64", had a PR, looping back "
-			     "for EX\n", OCFS2_I(inode)->ip_blkno);
-			ocfs2_meta_unlock(inode, level);
-			info->wl_have_target_meta = 0;
-			level = 1;
-			goto lock;
-		}
-
-		mlog(0, "Writing at EOF, will need more allocation: "
-		     "i_size=%lld, need=%"MLFu64"\n", i_size_read(inode),
-		     info->wl_newsize);
-
-		/* If we extend AT ALL here then we update our state
-		 * and continue the write call, regardless of error --
-		 * this is basically a short write. */
-		status = ocfs2_extend_file(osb, inode, info->wl_newsize,
-					   &bytes_added);
-		if (status < 0 && (!bytes_added)) {
-			if (status != -ERESTARTSYS
-			    && status != -EINTR
-			    && status != -ENOSPC) {
-				mlog_errno(status);
-				mlog(ML_ERROR, "Failed to extend inode %"MLFu64
-				     " from %lld to %"MLFu64,
-				     OCFS2_I(inode)->ip_blkno,
-				     *ppos, info->wl_newsize);
-			}
-			ret = status;
-
-			info->wl_have_target_meta = 0;
-			ocfs2_meta_unlock(inode, level);
-			goto bail;
-		}
-
-		info->wl_extended = 1;
-
-		/* We need to recalulate newsize and count according
-		 * to what extend could give us. If we got the whole
-		 * extend then this doesn't wind up changing the
-		 * values. */
-		info->wl_newsize = i_size_read(inode) + bytes_added;
-		count = info->wl_newsize - saved_ppos;
-
-		if (status < 0
-		    && status != -ENOSPC
-		    && status != -EINTR
-		    && status != -ERESTARTSYS)
-			mlog(ML_ERROR, "status return of %d extending inode "
-			     "%"MLFu64"\n", status,
-			     OCFS2_I(inode)->ip_blkno);
-		status = 0;
-	}
-
-	/* we've got whatever cluster lock is appropriate now, so we
-	 * can stuff *ppos back. */
-	*ppos = saved_ppos;
-
-	if (!info->wl_do_direct_io && !info->wl_have_data_lock) {
-		status = ocfs2_data_lock(inode, 1);
-		if (status < 0) {
-			mlog_errno(status);
-			ret = status;
-
-			info->wl_have_target_meta = 0;
-			ocfs2_meta_unlock(inode, level);
-			goto bail;
-		}
-		info->wl_have_data_lock = 1;
-	}
-
-	/* Alright, fool the io locking stuff into thinking it's
-	 * handled our inode for us. We can now count on it to do the
-	 * unlock for us. */
-	info->wl_target_binode->ba_locked = 1;
-
-	/* This will lock everyone who's order puts them *after* our inode. */
-	ret = ocfs2_lock_buffer_inodes(ctxt, NULL);
-	if (ret < 0) {
-		if (ret != -EIOCBRETRY)
-			mlog_errno(ret);
-		goto bail;
-	}
-
-bail:
-	mlog_exit(ret);
-	return ret;
-}
-
-#if 0
-static void ocfs2_buffer_ctxt_debug(struct ocfs2_buffer_lock_ctxt *ctxt)
-{
-	struct ocfs2_backing_inode *binode;
-	struct inode *inode;
-	struct rb_node *node;
-
-	printk("(%u) ocfs2: buffer lock ctxt: direct io = %d\n",
-	       current->pid, ctxt->b_lock_direct);
-
-	node = rb_first(&ctxt->b_inodes);
-	while (node) {
-		binode = rb_entry(node, struct ocfs2_backing_inode, ba_node);
-		inode = binode->ba_inode;
-
-		printk("(%u) ocfs2: inode %llu, locked %d, is target? %s\n",
-		       current->pid, OCFS2_I(inode)->ip_blkno,
-		       binode->ba_locked,
-		       ocfs2_buffer_lock_is_target(ctxt, inode) ? "yes" :
-		       "no");
-
-		node = rb_next(node);
-	}
-}
-#endif

Modified: trunk/fs/ocfs2/mmap.h
===================================================================
--- trunk/fs/ocfs2/mmap.h	2005-11-03 22:39:00 UTC (rev 2676)
+++ trunk/fs/ocfs2/mmap.h	2005-11-05 00:27:07 UTC (rev 2677)
@@ -1,131 +1,6 @@
 #ifndef OCFS2_MMAP_H
 #define OCFS2_MMAP_H
 
-int ocfs2_mmap(struct file *file,
-	       struct vm_area_struct *vma);
+int ocfs2_mmap(struct file *file, struct vm_area_struct *vma);
 
-/* used by file_read/file_write and nopage to coordinate file
- * locking. I keep this out of the dlmglue code, because quite frankly
- * I don't like that we have to do this stuff. */
-struct ocfs2_io_marker {
-	struct list_head io_list;
-	struct task_struct *io_task;
-};
-
-#define __IOMARKER_INITIALIZER(name) {					\
-	.io_list      = { &(name).io_list, &(name).io_list },		\
-	.io_task      = NULL }
-
-#define DECLARE_IO_MARKER(name)						\
-	struct ocfs2_io_marker name = __IOMARKER_INITIALIZER(name)
-
-static inline void ocfs2_init_io_marker(struct ocfs2_io_marker *task)
-{
-	INIT_LIST_HEAD(&task->io_list);
-	task->io_task = NULL;
-}
-
-static inline void ocfs2_add_io_marker(struct inode *inode,
-				       struct ocfs2_io_marker *task)
-{
-	struct ocfs2_inode_info *oi = OCFS2_I(inode);
-
-	task->io_task = current;
-	spin_lock(&oi->ip_lock);
-	list_add(&task->io_list, &oi->ip_io_markers);
-	spin_unlock(&oi->ip_lock);
-}
-
-static inline void ocfs2_del_io_marker(struct inode *inode,
-				       struct ocfs2_io_marker *task)
-{
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-	if (!list_empty(&task->io_list))
-		list_del_init(&task->io_list);
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
-}
-
-static inline int ocfs2_is_in_io_marker_list(struct inode *inode,
-					   struct task_struct *task)
-{
-	int ret = 0;
-	struct ocfs2_inode_info *oi = OCFS2_I(inode);
-	struct list_head *p;
-	struct ocfs2_io_marker *tmp;
-
-	spin_lock(&oi->ip_lock);
-	list_for_each(p, &oi->ip_io_markers) {
-		tmp = list_entry(p, struct ocfs2_io_marker, io_list);
-		if (tmp->io_task == task) {
-			ret = 1;
-			break;
-		}
-	}
-	spin_unlock(&oi->ip_lock);
-
-	return ret;
-}
-
-struct ocfs2_backing_inode {
-	struct rb_node           ba_node;
-	struct inode            *ba_inode;
-	unsigned		 ba_meta_locked:1, 	/* meta is locked */
-				 ba_locked:1,		/* both are locked */
-				 ba_lock_data:1,	/* should lock data */
-				 ba_lock_meta_level:1,
-				 ba_lock_data_level:1;
-	struct ocfs2_io_marker   ba_task;
-};
-
-/* Used to manage the locks taken during I/O. */
-struct ocfs2_buffer_lock_ctxt {
-	struct rb_root			b_inodes;
-	struct ocfs2_backing_inode	*b_next_unlocked;
-	ocfs2_lock_callback		b_cb;
-	unsigned long			b_cb_data;
-};
-
-#define __BUFFERLOCK_INITIALIZER {					\
-	.b_inodes               = RB_ROOT,				\
-	.b_next_unlocked	= NULL,					\
-	.b_cb			= NULL,					\
-	.b_cb_data		= 0 }
-
-#define DECLARE_BUFFER_LOCK_CTXT(name)					\
-	struct ocfs2_buffer_lock_ctxt name = __BUFFERLOCK_INITIALIZER
-
-#define INIT_BUFFER_LOCK_CTXT(ctxt)	\
-	*(ctxt) = (struct ocfs2_buffer_lock_ctxt) __BUFFERLOCK_INITIALIZER
-
-int ocfs2_setup_io_locks(struct super_block *sb,
-			 struct inode *target_inode,
-			 char __user *buf,
-			 size_t size,
-			 struct ocfs2_buffer_lock_ctxt *ctxt,
-			 struct ocfs2_backing_inode **target_binode);
-
-int ocfs2_lock_buffer_inodes(struct ocfs2_buffer_lock_ctxt *ctxt,
-			     struct inode *last_inode);
-
-void ocfs2_unlock_buffer_inodes(struct ocfs2_buffer_lock_ctxt *ctxt);
-
-struct ocfs2_write_lock_info {
-	u64				wl_newsize;
-	unsigned			wl_extended:1,
-					wl_do_direct_io:1,
-					wl_have_i_sem:1,
-					wl_unlock_ctxt:1,
-					wl_have_before:1,
-					wl_have_target_meta:1,
-					wl_have_data_lock:1;
-	struct ocfs2_backing_inode	*wl_target_binode;
-};
-
-ssize_t ocfs2_write_lock_maybe_extend(struct file *filp,
-				      const char __user *buf,
-				     size_t count,
-				      loff_t *ppos,
-				     struct ocfs2_write_lock_info *info,
-				     struct ocfs2_buffer_lock_ctxt *ctxt);
-
 #endif  /* OCFS2_MMAP_H */

Modified: trunk/fs/ocfs2/namei.c
===================================================================
--- trunk/fs/ocfs2/namei.c	2005-11-03 22:39:00 UTC (rev 2676)
+++ trunk/fs/ocfs2/namei.c	2005-11-05 00:27:07 UTC (rev 2677)
@@ -1626,8 +1626,9 @@
 	newsize = l - 1;
 	if (l > ocfs2_fast_symlink_chars(sb)) {
 		inode->i_op = &ocfs2_symlink_inode_operations;
-		status = ocfs2_extend_allocation(osb, inode, 1, new_fe_bh,
-						 handle, data_ac, NULL, NULL);
+		status = ocfs2_do_extend_allocation(osb, inode, 1, new_fe_bh,
+						    handle, data_ac, NULL,
+						    NULL);
 		if (status < 0) {
 			if (status != -ENOSPC && status != -EINTR) {
 				mlog(ML_ERROR, "Failed to extend file to "

Modified: trunk/fs/ocfs2/ocfs2.h
===================================================================
--- trunk/fs/ocfs2/ocfs2.h	2005-11-03 22:39:00 UTC (rev 2676)
+++ trunk/fs/ocfs2/ocfs2.h	2005-11-05 00:27:07 UTC (rev 2677)
@@ -108,24 +108,13 @@
 
 typedef void (*ocfs2_lock_callback)(int status, unsigned long data);
 
-struct ocfs2_lockres_flag_callback {
-	struct list_head	fc_lockres_item;
-	unsigned		fc_free_once_called:1;
-
-	unsigned long		fc_flag_mask;
-	unsigned long		fc_flag_goal;
-
-	ocfs2_lock_callback	fc_cb;
-	unsigned long		fc_data;
-};
-
 struct ocfs2_lock_res {
 	void                    *l_priv;
 	struct ocfs2_lock_res_ops *l_ops;
 	spinlock_t               l_lock;
 
 	struct list_head         l_blocked_list;
-	struct list_head         l_flag_cb_list;
+	struct list_head         l_mask_waiters;
 
 	enum ocfs2_lock_type     l_type;
 	unsigned long		 l_flags;
@@ -174,6 +163,7 @@
 	OCFS2_MOUNT_BARRIER = 1 << 1,	/* Use block barriers */
 	OCFS2_MOUNT_NOINTR  = 1 << 2,   /* Don't catch signals */
 	OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
+	OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
 #ifdef OCFS2_ORACORE_WORKAROUNDS
 	OCFS2_MOUNT_COMPAT_OCFS = 1 << 30, /* ocfs1 compatibility mode */
 #endif
@@ -282,12 +272,6 @@
 
 	struct list_head	osb_net_handlers;
 
-	/* see ocfs2_ki_dtor() */
-	struct work_struct		osb_okp_teardown_work;
-	struct ocfs2_kiocb_private	*osb_okp_teardown_next;
-	atomic_t			osb_okp_pending;
-	wait_queue_head_t		osb_okp_pending_wq;
-
 	wait_queue_head_t		osb_mount_event;
 
 	/* Truncate log info */
@@ -299,6 +283,15 @@
 #define OCFS2_SB(sb)	    ((ocfs2_super *)(sb)->s_fs_info)
 #define OCFS2_MAX_OSB_ID             65536
 
+static inline int ocfs2_should_order_data(struct inode *inode)
+{
+	if (!S_ISREG(inode->i_mode))
+		return 0;
+	if (OCFS2_SB(inode->i_sb)->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK)
+		return 0;
+	return 1;
+}
+
 /* set / clear functions because cluster events can make these happen
  * in parallel so we want the transitions to be atomic. this also
  * means that any future flags osb_flags must be protected by spinlock
@@ -415,6 +408,13 @@
 	return clusters;
 }
 
+static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb,
+					 u64 bytes)
+{
+	bytes += sb->s_blocksize - 1;
+	return bytes >> sb->s_blocksize_bits;
+}
+
 static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb,
 					  u32 clusters)
 {
@@ -431,6 +431,15 @@
 	return (u64)clusters << cl_bits;
 }
 
+static inline u64 ocfs2_align_bytes_to_blocks(struct super_block *sb,
+					      u64 bytes)
+{
+	u64 blocks;
+
+        blocks = ocfs2_blocks_for_bytes(sb, bytes);
+	return blocks << sb->s_blocksize_bits;
+}
+
 static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes)
 {
 	return (unsigned long)((bytes + 511) >> 9);

Modified: trunk/fs/ocfs2/ocfs2_lockid.h
===================================================================
--- trunk/fs/ocfs2/ocfs2_lockid.h	2005-11-03 22:39:00 UTC (rev 2676)
+++ trunk/fs/ocfs2/ocfs2_lockid.h	2005-11-05 00:27:07 UTC (rev 2677)
@@ -40,6 +40,7 @@
 	OCFS2_LOCK_TYPE_DATA,
 	OCFS2_LOCK_TYPE_SUPER,
 	OCFS2_LOCK_TYPE_RENAME,
+	OCFS2_LOCK_TYPE_RW,
 	OCFS2_NUM_LOCK_TYPES
 };
 
@@ -59,6 +60,9 @@
 		case OCFS2_LOCK_TYPE_RENAME:
 			c = 'R';
 			break;
+		case OCFS2_LOCK_TYPE_RW:
+			c = 'W';
+			break;
 		default:
 			c = '\0';
 	}

Modified: trunk/fs/ocfs2/super.c
===================================================================
--- trunk/fs/ocfs2/super.c	2005-11-03 22:39:00 UTC (rev 2676)
+++ trunk/fs/ocfs2/super.c	2005-11-05 00:27:07 UTC (rev 2677)
@@ -45,7 +45,6 @@
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
-#include "aio.h"
 
 /* this should be the only file to include a version 1 header */
 #include "ocfs1_fs_compat.h"
@@ -151,6 +150,8 @@
 	Opt_nointr,
 	Opt_hb_none,
 	Opt_hb_local,
+	Opt_data_ordered,
+	Opt_data_writeback,
 	Opt_err,
 };
 
@@ -165,6 +166,8 @@
 	{Opt_nointr, "nointr"},
 	{Opt_hb_none, OCFS2_HB_NONE},
 	{Opt_hb_local, OCFS2_HB_LOCAL},
+	{Opt_data_ordered, "data=ordered"},
+	{Opt_data_writeback, "data=writeback"},
 	{Opt_err, NULL}
 };
 
@@ -380,6 +383,13 @@
 		goto out;
 	}
 
+	if ((osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) !=
+	    (parsed_options & OCFS2_MOUNT_DATA_WRITEBACK)) {
+		ret = -EINVAL;
+		mlog(ML_ERROR, "Cannot change data mode on remount\n");
+		goto out;
+	}
+
 	/* We're going to/from readonly mode. */
 	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
 		/* Lock here so the check of HARD_RO and the potential
@@ -636,9 +646,12 @@
 
 	ocfs2_complete_mount_recovery(osb);
 
-	printk("ocfs2: Mounting device (%u,%u) on (node %d, slot %d)\n",
+	printk("ocfs2: Mounting device (%u,%u) on (node %d, slot %d) with %s "
+	       "data mode.\n",
 	       MAJOR(sb->s_dev), MINOR(sb->s_dev), osb->node_num,
-	       osb->slot_num);
+	       osb->slot_num,
+	       osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" :
+	       "ordered");
 
 	atomic_set(&osb->vol_state, VOLUME_MOUNTED);
 	wake_up(&osb->osb_mount_event);
@@ -738,6 +751,12 @@
 		case Opt_err_ro:
 			*mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC;
 			break;
+		case Opt_data_ordered:
+			*mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK;
+			break;
+		case Opt_data_writeback:
+			*mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK;
+			break;
 #ifdef OCFS2_ORACORE_WORKAROUNDS
 		case Opt_datavolume:
 			if (is_remount) {
@@ -929,8 +948,8 @@
 
 		oi->ip_blkno = 0ULL;
 		oi->ip_clusters = 0;
-		oi->ip_mmu_private = 0LL;
 
+		ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
 		ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
 		ocfs2_lock_res_init_once(&oi->ip_data_lockres);
 
@@ -1123,8 +1142,6 @@
 	osb = OCFS2_SB(sb);
 	BUG_ON(!osb);
 
-	ocfs2_wait_for_okp_destruction(osb);
-
 	ocfs2_shutdown_local_alloc(osb);
 
 	ocfs2_truncate_log_shutdown(osb);
@@ -1264,12 +1281,6 @@
 	INIT_LIST_HEAD(&osb->vote_list);
 	spin_lock_init(&osb->osb_lock);
 
-	osb->osb_okp_teardown_next = NULL;
-	atomic_set(&osb->osb_okp_pending, 0);
-	init_waitqueue_head(&osb->osb_okp_pending_wq);
-	/* we sync with this work queue (and sb ref) on unmount */
-	INIT_WORK(&osb->osb_okp_teardown_work, okp_teardown_from_list, osb);
-
 	atomic_set(&osb->alloc_stats.moves, 0);
 	atomic_set(&osb->alloc_stats.local_data, 0);
 	atomic_set(&osb->alloc_stats.bitmap_data, 0);



More information about the Ocfs2-commits mailing list