[Ocfs2-commits] mfasheh commits r2557 - branches/locking-changes/fs/ocfs2

svn-commits at oss.oracle.com
Wed Aug 31 13:03:10 CDT 2005


Author: mfasheh
Date: 2005-08-31 13:03:07 -0500 (Wed, 31 Aug 2005)
New Revision: 2557

Added:
   branches/locking-changes/fs/ocfs2/aops.h
Modified:
   branches/locking-changes/fs/ocfs2/Makefile
   branches/locking-changes/fs/ocfs2/aops.c
   branches/locking-changes/fs/ocfs2/dir.c
   branches/locking-changes/fs/ocfs2/file.c
   branches/locking-changes/fs/ocfs2/file.h
   branches/locking-changes/fs/ocfs2/inode.c
   branches/locking-changes/fs/ocfs2/inode.h
   branches/locking-changes/fs/ocfs2/journal.c
   branches/locking-changes/fs/ocfs2/journal.h
   branches/locking-changes/fs/ocfs2/namei.c
   branches/locking-changes/fs/ocfs2/ocfs2.h
   branches/locking-changes/fs/ocfs2/super.c
Log:
* separate out the extend API some more. All i_size changes now happen
  completely independently of allocation changes. More could probably be
  done here, but this should suffice as a first pass.

* start shoving our cluster locking into the commit_write path. No
  locking update has happened in the upper layers yet.

* as a result of the locking change, we must update i_size in the
  ocfs2_commit_write path now. This, however, makes it trivial for us to
  do ordered data writes on extends, via a small journal.[ch] update.
  Thanks to ext3 for a small amount of the code used here.

* new strategy for zeroing holes: cont_prepare_write didn't give us enough
  flexibility with locking and journalling, so we now manually zero holes up
  front (still in a similar fashion), which allows us to do the right thing
  with respect to cluster locking and the journal.

* remove ip_mmu_private and teach ocfs2_get_block to mark blocks as "new"
  under similar boundary conditions by sampling i_size (sketched below).
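
A minimal standalone sketch of that boundary check (plain C; the function name,
signature, and helper arithmetic are made up for illustration and are not the
ocfs2 code itself): because holes are always zero-filled up front, a logical
block is only ever marked "new" when it is the first block past the current EOF
as computed from i_size.

#include <stdint.h>

/* Sketch of the i_size-based check that replaces the old ip_mmu_private
 * comparison: blocks strictly inside i_size are never "new"; only the block
 * immediately past EOF may legitimately be written as new (the real code
 * BUGs on anything further out). */
int block_is_new(uint64_t iblock, uint64_t i_size, unsigned int blocksize_bits)
{
	uint64_t blocksize = 1ULL << blocksize_bits;
	/* Blocks needed to hold i_size bytes, rounded up -- mirrors
	 * ocfs2_blocks_for_bytes(). */
	uint64_t size_blocks = (i_size + blocksize - 1) >> blocksize_bits;

	if (iblock < size_blocks)
		return 0;		/* inside the file: no holes, never new */
	return iblock == size_blocks;	/* the one block just past EOF */
}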



Modified: branches/locking-changes/fs/ocfs2/Makefile
===================================================================
--- branches/locking-changes/fs/ocfs2/Makefile	2005-08-30 00:26:33 UTC (rev 2556)
+++ branches/locking-changes/fs/ocfs2/Makefile	2005-08-31 18:03:07 UTC (rev 2557)
@@ -67,6 +67,7 @@
 	ocfs2.h			\
 	buffer_head_io.h	\
 	alloc.h			\
+	aops.h			\
 	dcache.h		\
 	dir.h			\
 	dlmglue.h		\

Modified: branches/locking-changes/fs/ocfs2/aops.c
===================================================================
--- branches/locking-changes/fs/ocfs2/aops.c	2005-08-30 00:26:33 UTC (rev 2556)
+++ branches/locking-changes/fs/ocfs2/aops.c	2005-08-31 18:03:07 UTC (rev 2557)
@@ -132,8 +132,8 @@
 			   struct buffer_head *bh_result, int create)
 {
 	int err = -EIO;
-	u64 vbo = 0;
 	u64 p_blkno;
+	u64 eof_blkno;
 
 	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
 		   (unsigned long long)iblock, bh_result, create);
@@ -149,8 +149,6 @@
 		goto bail;
 	}
 
-	vbo = (u64)iblock << inode->i_sb->s_blocksize_bits;
-
 	/* this can happen if another node truncs after our extend! */
 	spin_lock(&OCFS2_I(inode)->ip_lock);
 	if (iblock >=
@@ -183,22 +181,20 @@
 		     p_blkno, OCFS2_I(inode)->ip_blkno);
 	}
 
-	if (vbo < OCFS2_I(inode)->ip_mmu_private)
+	eof_blkno = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)) - 1;
+	mlog(0, "Inode %lu, eof_blkno = %"MLFu64"\n", inode->i_ino, eof_blkno);
+	/* We don't support holes, so I/O inside of i_size can't be
+	 * marked 'new' */
+	if (iblock <= eof_blkno)
 		goto bail;
 	if (!create)
 		goto bail;
-	if (vbo != OCFS2_I(inode)->ip_mmu_private) {
-		mlog(ML_ERROR, "Uh-oh, vbo = %"MLFi64", i_size = %lld, "
-		     "mmu = %lld, inode = %"MLFu64"\n", vbo,
-		     i_size_read(inode), OCFS2_I(inode)->ip_mmu_private,
-		     OCFS2_I(inode)->ip_blkno);
-		BUG();
-		err = -EIO;
-		goto bail;
-	}
+	mlog_bug_on_msg(iblock != (eof_blkno + 1),
+			"Inode %"MLFu64": I/O past tail of file! (i_size = "
+			"%lld, iblock = %llu)\n", OCFS2_I(inode)->ip_blkno,
+			i_size_read(inode), (unsigned long long) iblock);
 
 	set_buffer_new(bh_result);
-	OCFS2_I(inode)->ip_mmu_private += inode->i_sb->s_blocksize;
 
 bail:
 	if (err < 0)
@@ -221,6 +217,10 @@
 	return ret;
 }
 
+/* Note: Because we don't support holes, our allocation has
+ * already happened (allocation writes zeros to the file data)
+ * so we don't have to worry about ordered writes in
+ * ocfs2_writepage. */
 static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
 {
 	int ret;
@@ -234,32 +234,193 @@
 	return ret;
 }
 
-static int ocfs2_prepare_write(struct file *file, struct page *page,
-		unsigned from, unsigned to)
+int ocfs2_prepare_write(struct file *file, struct page *page,
+			unsigned from, unsigned to)
 {
 	int ret;
 
 	mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
 
-	ret = cont_prepare_write(page, from, to, ocfs2_get_block,
-		&(OCFS2_I(page->mapping->host)->ip_mmu_private));
+	ret = block_prepare_write(page, from, to, ocfs2_get_block);
 
 	mlog_exit(ret);
 
 	return ret;
 }
 
+/* Taken from ext3. We don't necessarily need the full blown
+ * functionality yet, but IMHO it's better to cut and paste the whole
+ * thing so we can avoid introducing our own bugs (and easily pick up
+ * their fixes when they happen) --Mark */
+static int walk_page_buffers(	handle_t *handle,
+				struct buffer_head *head,
+				unsigned from,
+				unsigned to,
+				int *partial,
+				int (*fn)(	handle_t *handle,
+						struct buffer_head *bh))
+{
+	struct buffer_head *bh;
+	unsigned block_start, block_end;
+	unsigned blocksize = head->b_size;
+	int err, ret = 0;
+	struct buffer_head *next;
+
+	for (	bh = head, block_start = 0;
+		ret == 0 && (bh != head || !block_start);
+	    	block_start = block_end, bh = next)
+	{
+		next = bh->b_this_page;
+		block_end = block_start + blocksize;
+		if (block_end <= from || block_start >= to) {
+			if (partial && !buffer_uptodate(bh))
+				*partial = 1;
+			continue;
+		}
+		err = (*fn)(handle, bh);
+		if (!ret)
+			ret = err;
+	}
+	return ret;
+}
+
+/* Does the actual work of a commit_write. No cluster locks are
+ * taken. If extending is nonzero, it's assumed that di_bh is non-NULL
+ * and that the caller holds the proper cluster locks. */
+int ocfs2_commit_write_nolocks(struct file *file, struct page *page,
+			       unsigned from, unsigned to,
+			       struct buffer_head *di_bh,
+			       unsigned extending)
+{
+	int ret;
+	u64 size;
+	ocfs2_dinode *di = NULL;
+	ocfs2_journal_handle *handle = NULL;
+	struct inode *inode = page->mapping->host;
+	ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (extending) {
+		handle = ocfs2_start_trans(osb, NULL,
+					   OCFS2_INODE_UPDATE_CREDITS);
+		if (!handle) {
+			ret = -ENOMEM;
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ocfs2_set_inode_lock_trans(osb->journal, inode);
+
+		/* Only do this on extending writes - we don't
+		 * hold the right cluster locks otherwise. */
+		if (ocfs2_should_order_data(inode)) {
+			ret = walk_page_buffers(handle->k_handle,
+						page_buffers(page),
+						from, to, NULL,
+						ocfs2_journal_dirty_data);
+			if (ret < 0) {
+				mlog_errno(ret);
+				goto out;
+			}
+		}
+
+		di = (ocfs2_dinode *) di_bh->b_data;
+		/* Mark our buffer early. We'd rather catch this error
+		 * up here as opposed to after a successful
+		 * commit_write which would require us to set back
+		 * inode->i_size. */
+		ret = ocfs2_journal_access(handle, inode, di_bh,
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	ret = generic_commit_write(file, page, from, to);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (extending) {
+		size = (u64) i_size_read(inode);
+		/* ocfs2_mark_inode_dirty is too heavy to use here. */
+		inode->i_blocks = ocfs2_align_bytes_to_sectors(size);
+		inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+
+		di->i_size = cpu_to_le64(size);
+		di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
+		di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+
+		ret = ocfs2_journal_dirty(handle, di_bh);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+out:
+	if (handle)
+		ocfs2_commit_trans(handle);
+	return ret;
+}
+
 static int ocfs2_commit_write(struct file *file, struct page *page,
 			      unsigned from, unsigned to)
 {
-	int ret;
+	int ret, extending = 0, locklevel = 0;
+	loff_t new_i_size;
+	struct buffer_head *di_bh;
+	struct inode *inode = page->mapping->host;
 
 	mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
 
-	ret = generic_commit_write(file, page, from, to);
+	/* NOTE: ocfs2_file_aio_write has ensured that it's safe for
+	 * us to sample inode->i_size here without the metadata lock:
+	 *
+	 * 1) We're currently holding the inode alloc lock, so no
+	 *    nodes can change it underneath us.
+	 *
+	 * 2) We've had to take the metadata lock at least once
+	 *    already to check for extending writes, hence ensuring
+	 *    that our current copy is also up to date.
+	 */
+	new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+	if (new_i_size > i_size_read(inode)) {
+		extending = 1;
+		locklevel = 1;
+	}
 
+	ret = ocfs2_meta_lock(inode, NULL, &di_bh, locklevel);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_data_lock(inode, 1);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_unlock_meta;
+	}
+
+	ret = ocfs2_commit_write_nolocks(file, page, from, to, di_bh,
+					 extending);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_unlock_data;
+	}
+
+	BUG_ON(i_size_read(inode) != new_i_size);
+
+out_unlock_data:
+	ocfs2_data_unlock(inode, 1);
+out_unlock_meta:
+	ocfs2_meta_unlock(inode, locklevel);
+out:
+	if (di_bh)
+		brelse(di_bh);
+
 	mlog_exit(ret);
-
 	return ret;
 }
 

Added: branches/locking-changes/fs/ocfs2/aops.h
===================================================================
--- branches/locking-changes/fs/ocfs2/aops.h	2005-08-30 00:26:33 UTC (rev 2556)
+++ branches/locking-changes/fs/ocfs2/aops.h	2005-08-31 18:03:07 UTC (rev 2557)
@@ -0,0 +1,33 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2002, 2004, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_AOPS_H
+#define OCFS2_AOPS_H
+
+int ocfs2_prepare_write(struct file *file, struct page *page,
+			unsigned from, unsigned to);
+
+int ocfs2_commit_write_nolocks(struct file *file, struct page *page,
+			       unsigned from, unsigned to,
+			       struct buffer_head *di_bh,
+			       unsigned extending);
+
+#endif /* OCFS2_AOPS_H */

Modified: branches/locking-changes/fs/ocfs2/dir.c
===================================================================
--- branches/locking-changes/fs/ocfs2/dir.c	2005-08-30 00:26:33 UTC (rev 2556)
+++ branches/locking-changes/fs/ocfs2/dir.c	2005-08-31 18:03:07 UTC (rev 2557)
@@ -356,9 +356,9 @@
 	spin_unlock(&OCFS2_I(dir)->ip_lock);
 
 	if (extend) {
-		status = ocfs2_extend_allocation(OCFS2_SB(sb), dir, 1,
-						 parent_fe_bh, handle,
-						 data_ac, meta_ac, NULL);
+		status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, 1,
+						    parent_fe_bh, handle,
+						    data_ac, meta_ac, NULL);
 		BUG_ON(status == -EAGAIN);
 		if (status < 0) {
 			mlog_errno(status);

Modified: branches/locking-changes/fs/ocfs2/file.c
===================================================================
--- branches/locking-changes/fs/ocfs2/file.c	2005-08-30 00:26:33 UTC (rev 2556)
+++ branches/locking-changes/fs/ocfs2/file.c	2005-08-31 18:03:07 UTC (rev 2557)
@@ -36,6 +36,7 @@
 #include "ocfs2.h"
 
 #include "alloc.h"
+#include "aops.h"
 #include "dir.h"
 #include "dlmglue.h"
 #include "extent_map.h"
@@ -48,12 +49,6 @@
 
 #include "buffer_head_io.h"
 
-static int ocfs2_zero_extend(struct inode *inode);
-static int ocfs2_orphan_for_truncate(ocfs2_super *osb,
-				     struct inode *inode,
-				     struct buffer_head *fe_bh,
-				     u64 new_i_size);
-
 int ocfs2_sync_inode(struct inode *inode)
 {
 	filemap_fdatawrite(inode->i_mapping);
@@ -146,367 +141,15 @@
 	return (err < 0) ? -EIO : 0;
 }
 
-static void ocfs2_update_inode_size(struct inode *inode,
-				    u64 new_size)
-{
-	i_size_write(inode, new_size);
-	inode->i_blocks = ocfs2_align_bytes_to_sectors(new_size);
-}
-
-void ocfs2_file_finish_extension(struct inode *inode,
-				 loff_t newsize,
-				 unsigned direct_extend)
-{
-	int ret;
-
-	mlog(0, "inode %"MLFu64", newsize = %lld, direct_extend = %u\n",
-	     OCFS2_I(inode)->ip_blkno, (long long)newsize, direct_extend);
-
-	ocfs2_update_inode_size(inode, newsize);
-
-#ifdef OCFS2_ORACORE_WORKAROUNDS
-	if (direct_extend) {
-		/*
-		 * This leaves dirty data in holes.
-		 * Caveat Emptor.
-		 */
-		OCFS2_I(inode)->ip_mmu_private = newsize;
-		return;
-	}
-#endif
-
-	/* caller won't overwrite return from g_f_w so we don't return */
-	ret = ocfs2_zero_extend(inode);
-	if (ret)
-		mlog(ML_ERROR, "Unable to pre-zero extension of inode (%d)\n",
-		     ret);
-}
-
-static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
-				    const char __user *buf,
-				    size_t count,
-				    loff_t pos)
-{
-	struct iovec local_iov = { .iov_base = (void __user *)buf,
-				   .iov_len = count };
-	int ret, level;
-	ocfs2_super *osb = NULL;
-	struct file *filp = iocb->ki_filp;
-	struct inode *inode = filp->f_dentry->d_inode;
-	int do_direct = 0, extended = 0;
-	loff_t newsize, saved_pos;
-
-	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
-		   (unsigned int)count,
-		   filp->f_dentry->d_name.len,
-		   filp->f_dentry->d_name.name);
-
-	/* happy write of zero bytes */
-	if (count == 0) {
-		ret = 0;
-		goto bail;
-	}
-
-	if (!inode) {
-		mlog(0, "bad inode\n");
-		ret = -EIO;
-		goto bail;
-	}
-
-	osb = OCFS2_SB(inode->i_sb);
-
-	down(&inode->i_sem);
-
-	/* this ginormous block is in here because it has so many inputs
-	 * and outputs from this function.. */
-	level = !!(filp->f_flags & O_APPEND);
-	for(;;) {
-		u64 bytes_added;
-
-		ret = ocfs2_meta_lock(inode, NULL, NULL, level);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto bail_i_sem;
-		}
-
-		/* work on a copy of ppos until we're sure that we won't have
-		 * to recalculate it due to relocking. */
-		if (filp->f_flags & O_APPEND) {
-			saved_pos = i_size_read(inode);
-			mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);
-#ifdef OCFS2_ORACORE_WORKAROUNDS
-			if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
-				/* ugh, work around some applications which
-				 * open everything O_DIRECT + O_APPEND and
-				 * really don't mean to use O_DIRECT. */
-				filp->f_flags &= ~O_DIRECT;
-			}
-#endif
-		} else {
-			saved_pos = iocb->ki_pos;
-		}
-		newsize = count + saved_pos;
-
-		mlog(0, "pos=%lld newsize=%"MLFu64" cursize=%lld\n",
-		     saved_pos, newsize, i_size_read(inode));
-
-		if (newsize <= i_size_read(inode))
-			break;
-
-		if (level == 0) {
-			ocfs2_meta_unlock(inode, level);
-			level = 1;
-			continue;
-		}
-
-		mlog(0, "Writing at EOF, will need more allocation: "
-		     "i_size=%lld, " "need=%"MLFu64"\n", i_size_read(inode),
-		     newsize);
-
-		/* If we extend AT ALL here then we update our state
-		 * and continue the write call, regardless of error --
-		 * this is basically a short write. */
-		ret = ocfs2_extend_file(osb, inode, newsize, &bytes_added);
-		if (ret < 0 &&
-		    ret != -ERESTARTSYS && ret != -EINTR && ret != -ENOSPC) {
-			mlog_errno(ret);
-			mlog(ML_ERROR, "Failed to extend inode %"MLFu64
-			     " from %lld to %"MLFu64, OCFS2_I(inode)->ip_blkno,
-			     i_size_read(inode), newsize);
-		}
-		if (ret < 0 && (!bytes_added)) 
-			goto bail_meta_unlock;
-
-		extended = 1;
-
-		/* We need to recalulate newsize and count according
-		 * to what extend could give us. If we got the whole
-		 * extend then this doesn't wind up changing the
-		 * values. */
-		newsize = i_size_read(inode) + bytes_added;
-		count = newsize - saved_pos;
-		ret = 0;
-		break;
-	}
-
-	/* we've got whatever cluster lock is appropriate now, so we
-	 * can stuff *ppos back. */
-	iocb->ki_pos = saved_pos;
-
-	if (filp->f_flags & O_DIRECT) {
-#ifdef OCFS2_ORACORE_WORKAROUNDS
-		if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
-			int sector_size = 1 << osb->s_sectsize_bits;
-
-			if ((saved_pos & (sector_size - 1)) ||
-			    (count & (sector_size - 1)) ||
-			    ((unsigned long)buf & (sector_size - 1))) {
-				do_direct = 0;
-				filp->f_flags |= O_SYNC;
-			} else {
-				do_direct = 1;
-			}
-		} else
-#endif
-			do_direct = 1;
-
-		mlog(0, "O_DIRECT\n");
-	}
-
-	if (!do_direct) {
-		ret = ocfs2_data_lock(inode, 1);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto bail_extend;
-		}
-	}
-
-	down_read(&OCFS2_I(inode)->ip_alloc_sem);
-
-#ifdef OCFS2_ORACORE_WORKAROUNDS
-	if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
-		unsigned int saved_flags = filp->f_flags;
-
-		if (do_direct)
-			filp->f_flags |= O_DIRECT;
-		else
-			filp->f_flags &= ~O_DIRECT;
-
-		ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
-						    &iocb->ki_pos);
-
-		filp->f_flags = saved_flags;
-	} else
-#endif
-		ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
-						    &iocb->ki_pos);
-
-	up_read(&OCFS2_I(inode)->ip_alloc_sem);
-
-	if (!do_direct)
-		ocfs2_data_unlock(inode, 1);
-
-	/* we might have to finish up extentions that were performed before
-	 * an error was returned by, say, data locking */
-bail_extend:
-	if (extended)
-		ocfs2_file_finish_extension(inode, newsize, do_direct);
-bail_meta_unlock:
-	ocfs2_meta_unlock(inode, level);
-bail_i_sem:
-	up(&inode->i_sem);
-
-bail:
-	mlog_exit(ret);
-	return ret;
-}
-
-static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
-				   char __user *buf,
-				   size_t count,
-				   loff_t pos)
-{
-	int ret = 0;
-	ocfs2_super *osb = NULL;
-	struct file *filp = iocb->ki_filp;
-	struct inode *inode = filp->f_dentry->d_inode;
-
-	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
-		   (unsigned int)count,
-		   filp->f_dentry->d_name.len,
-		   filp->f_dentry->d_name.name);
-
-	if (!inode) {
-		ret = -EINVAL;
-		mlog_errno(ret);
-		goto bail;
-	}
-
-	osb = OCFS2_SB(inode->i_sb);
-
-#ifdef OCFS2_ORACORE_WORKAROUNDS
-	if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
-		if (filp->f_flags & O_DIRECT) {
-			int sector_size = 1 << osb->s_sectsize_bits;
-
-			if ((pos & (sector_size - 1)) ||
-			    (count & (sector_size - 1)) ||
-			    ((unsigned long)buf & (sector_size - 1)) ||
-			    (i_size_read(inode) & (sector_size -1))) {
-				filp->f_flags &= ~O_DIRECT;
-			}
-		}
-	}
-#endif
-
-	ret = ocfs2_meta_lock(inode, NULL, NULL, 0);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto bail;
-	}
-
-	if (!(filp->f_flags & O_DIRECT)) {
-		ret = ocfs2_data_lock(inode, 0);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto bail_unlock_meta;
-		}
-	}
-
-	down_read(&OCFS2_I(inode)->ip_alloc_sem);
-
-	ret = generic_file_aio_read(iocb, buf, count, iocb->ki_pos);
-
-	up_read(&OCFS2_I(inode)->ip_alloc_sem);
-
-	if (ret == -EINVAL)
-		mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n");
-
-	if (!(filp->f_flags & O_DIRECT))
-		ocfs2_data_unlock(inode, 0);
-bail_unlock_meta:
-	ocfs2_meta_unlock(inode, 0);
-
-bail:
-	mlog_exit(ret);
-
-	return ret;
-}
-
-static ssize_t ocfs2_file_sendfile(struct file *in_file,
-				   loff_t *ppos,
-				   size_t count,
-				   read_actor_t actor,
-				   void *target)
-{
-	int ret;
-	struct inode *inode = in_file->f_mapping->host;
-
-	mlog_entry("inode %"MLFu64", ppos %lld, count = %u\n",
-		   OCFS2_I(inode)->ip_blkno, (long long) *ppos,
-		   (unsigned int) count);
-
-	/* Obviously, there is no user buffer to worry about here --
-	 * this simplifies locking, so no need to walk vmas a la
-	 * read/write. We take a simple set of cluster locks against
-	 * the inode and call generic_file_sendfile. */
-	ret = ocfs2_meta_lock(inode, NULL, NULL, 0);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto bail;
-	}
-
-	ret = ocfs2_data_lock(inode, 0);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto bail_unlock_meta;
-	}
-
-	down_read(&OCFS2_I(inode)->ip_alloc_sem);
-
-	ret = generic_file_sendfile(in_file, ppos, count, actor, target);
-	if (ret < 0)
-		mlog_errno(ret);
-
-	up_read(&OCFS2_I(inode)->ip_alloc_sem);
-
-	ocfs2_data_unlock(inode, 0);
-bail_unlock_meta:
-	ocfs2_meta_unlock(inode, 0);
-
-bail:
-	mlog_exit(ret);
-	return ret;
-}
-
-struct file_operations ocfs2_fops = {
-	.read		= do_sync_read,
-	.write		= do_sync_write,
-	.sendfile	= ocfs2_file_sendfile,
-	.mmap		= ocfs2_mmap,
-	.fsync		= ocfs2_sync_file,
-	.release	= ocfs2_file_release,
-	.open		= ocfs2_file_open,
-	.aio_read	= ocfs2_file_aio_read,
-	.aio_write	= ocfs2_file_aio_write,
-};
-
-struct file_operations ocfs2_dops = {
-	.read		= generic_read_dir,
-	.readdir	= ocfs2_readdir,
-	.fsync		= ocfs2_sync_file,
-};
-
 int ocfs2_set_inode_size(ocfs2_journal_handle *handle,
 			 struct inode *inode,
 			 struct buffer_head *fe_bh,
 			 u64 new_i_size)
 {
-	int status, grow;
+	int status;
 
 	mlog_entry_void();
 
-	grow = new_i_size > inode->i_size;
 	i_size_write(inode, new_i_size);
 	inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
@@ -517,17 +160,41 @@
 		goto bail;
 	}
 
-	/* FIXME: I think this should all be in the caller */
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-	if (!grow)
-		OCFS2_I(inode)->ip_mmu_private = i_size_read(inode);
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
-
 bail:
 	mlog_exit(status);
 	return status;
 }
 
+static int ocfs2_simple_size_update(struct inode *inode,
+				    struct buffer_head *di_bh,
+				    u64 new_i_size)
+{
+	int ret;
+	ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	ocfs2_journal_handle *handle = NULL;
+
+	handle = ocfs2_start_trans(osb, NULL,
+				   OCFS2_INODE_UPDATE_CREDITS);
+	if (handle == NULL) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* Since we got our cluster lock from caller and we
+	 * don't add it to the handle: */
+	ocfs2_set_inode_lock_trans(osb->journal, inode);
+
+	ret = ocfs2_set_inode_size(handle, inode, di_bh,
+				   new_i_size);
+	if (ret < 0)
+		mlog_errno(ret);
+
+	ocfs2_commit_trans(handle);
+out:
+	return ret;
+}
+
 static int ocfs2_orphan_for_truncate(ocfs2_super *osb,
 				     struct inode *inode,
 				     struct buffer_head *fe_bh,
@@ -562,14 +229,13 @@
 	return status;
 }
 
-static int ocfs2_truncate_file(ocfs2_super *osb,
-			       u64 new_i_size,
-			       struct inode *inode)
+static int ocfs2_truncate_file(struct inode *inode,
+			       struct buffer_head *di_bh,
+			       u64 new_i_size)
 {
 	int status = 0;
 	ocfs2_dinode *fe = NULL;
-	struct buffer_head *fe_bh = NULL;
-	ocfs2_journal_handle *handle = NULL;
+	ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_truncate_context *tc = NULL;
 
 	mlog_entry("(inode = %"MLFu64", new_i_size = %"MLFu64"\n",
@@ -577,14 +243,7 @@
 
 	truncate_inode_pages(inode->i_mapping, new_i_size);
 
-	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &fe_bh,
-				  OCFS2_BH_CACHED, inode);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
-
-	fe = (ocfs2_dinode *) fe_bh->b_data;
+	fe = (ocfs2_dinode *) di_bh->b_data;
 	OCFS2_BUG_ON_INVALID_DINODE(fe);
 	mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
 			"Inode %"MLFu64", inode i_size = %lld != di "
@@ -616,20 +275,7 @@
 		     fe->i_clusters);
 		/* No allocation change is required, so lets fast path
 		 * this truncate. */
-		handle = ocfs2_start_trans(osb, NULL,
-					  OCFS2_INODE_UPDATE_CREDITS);
-		if (handle == NULL) {
-			status = -ENOMEM;
-			mlog_errno(status);
-			goto bail;
-		}
-
-		/* Since we got our cluster lock from caller and we
-		 * don't add it to the handle: */
-		ocfs2_set_inode_lock_trans(osb->journal, inode);
-
-		status = ocfs2_set_inode_size(handle, inode, fe_bh,
-					      new_i_size);
+		status = ocfs2_simple_size_update(inode, di_bh, new_i_size);
 		if (status < 0)
 			mlog_errno(status);
 		goto bail;
@@ -647,19 +293,19 @@
 	 * change. Orphan the inode so that recovery can complete the
 	 * truncate if necessary. This does the task of marking
 	 * i_size. */
-	status = ocfs2_orphan_for_truncate(osb, inode, fe_bh, new_i_size);
+	status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 
-	status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
+	status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 
-	status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
+	status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -667,70 +313,29 @@
 
 	/* TODO: orphan dir cleanup here. */
 bail:
-	if (handle)
-		ocfs2_commit_trans(handle);
 
-	if (fe_bh)
-		brelse(fe_bh);
-
 	mlog_exit(status);
 	return status;
 }
 
-static int ocfs2_zero_extend(struct inode *inode)
-{
-	struct address_space *mapping = inode->i_mapping;
-	struct page *page;
-	u64 size = i_size_read(inode) - 1;
-	unsigned int offset;
-	int res = 0;
-
-	/* Start the zeroing of blocks */
-	if (i_size_read(inode) > OCFS2_I(inode)->ip_mmu_private) {
-		page = grab_cache_page(mapping,
-				       size >> PAGE_CACHE_SHIFT);
-		if (!page) {
-			res = -ENOMEM;
-			mlog_errno(res);
-			return res;
-		}
-		offset = (unsigned int)(size & (PAGE_CACHE_SIZE - 1)) + 1;
-		res = mapping->a_ops->prepare_write(NULL, page, offset,
-						    offset);
-		if (res < 0) {
-			mlog_errno(res);
-			goto bail_unlock;
-		}
-
-		res = mapping->a_ops->commit_write(NULL, page, offset, offset);
-		if (res < 0)
-			mlog_errno(res);
-
-bail_unlock:
-		unlock_page(page);
-		page_cache_release(page);
-		mark_inode_dirty(inode);
-	}
-
-	return res;
-}
-
 /*
  * extend allocation only here.
  * we'll update all the disk stuff, and oip->alloc_size
  *
  * expect stuff to be locked, a transaction started and enough data /
- * metadata reservations in the contexts. I'll return -EAGAIN, if we
- * run out of transaction credits, so the caller can restart us.
+ * metadata reservations in the contexts.
+ *
+ * Will return -EAGAIN, and a reason if a restart is needed.
+ * If passed in, *reason will always be set, even in error.
  */
-int ocfs2_extend_allocation(ocfs2_super *osb,
-			    struct inode *inode,
-			    u32 clusters_to_add,
-			    struct buffer_head *fe_bh,
-			    ocfs2_journal_handle *handle,
-			    ocfs2_alloc_context *data_ac,
-			    ocfs2_alloc_context *meta_ac,
-			    enum ocfs2_alloc_restarted *reason)
+int ocfs2_do_extend_allocation(ocfs2_super *osb,
+			       struct inode *inode,
+			       u32 clusters_to_add,
+			       struct buffer_head *fe_bh,
+			       ocfs2_journal_handle *handle,
+			       ocfs2_alloc_context *data_ac,
+			       ocfs2_alloc_context *meta_ac,
+			       enum ocfs2_alloc_restarted *reason)
 {
 	int status = 0;
 	int free_extents;
@@ -740,6 +345,10 @@
 
 	BUG_ON(!clusters_to_add);
 
+	/* We always want to set this, even if we error later. */
+	if (reason)
+		*reason = RESTART_NONE;
+
 	free_extents = ocfs2_num_free_extents(osb, inode, fe);
 	if (free_extents < 0) {
 		status = free_extents;
@@ -822,48 +431,24 @@
 	return status;
 }
 
-/*
- * Ok, this function is heavy on the goto's - we need to clean it up a
- * bit.
- *
- * *bytes_extended is a measure of how much was added to
- * dinode->i_size, NOT how much allocated was actually added to the
- * file. It will always be correct, even when we return an error.
- */
-int ocfs2_extend_file(ocfs2_super *osb,
-		      struct inode *inode,
-		      u64 new_i_size,
-		      u64 *bytes_extended)
+int ocfs2_extend_allocation(struct inode *inode,
+			    u32 clusters_to_add)
 {
 	int status = 0;
 	int restart_func = 0;
 	int drop_alloc_sem = 0;
 	int credits, num_free_extents;
-	u32 clusters_to_add;
-	u64 new_fe_size;
+	u32 prev_clusters;
 	struct buffer_head *bh = NULL;
-	ocfs2_dinode *fe;
+	ocfs2_dinode *fe = NULL;
 	ocfs2_journal_handle *handle = NULL;
 	ocfs2_alloc_context *data_ac = NULL;
 	ocfs2_alloc_context *meta_ac = NULL;
 	enum ocfs2_alloc_restarted why;
+	ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
-	mlog_entry("(new_i_size=%"MLFu64")\n", new_i_size);
+	mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
 
-	*bytes_extended = 0;
-
-	/* setattr sometimes calls us like this. */
-	if (new_i_size == 0)
-		goto leave;
-
-restart_all:
-	handle = ocfs2_alloc_handle(osb);
-	if (handle == NULL) {
-		status = -ENOMEM;
-		mlog_errno(status);
-		goto leave;
-	}
-
 	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
 				  OCFS2_BH_CACHED, inode);
 	if (status < 0) {
@@ -873,23 +458,21 @@
 
 	fe = (ocfs2_dinode *) bh->b_data;
 	OCFS2_BUG_ON_INVALID_DINODE(fe);
-	BUG_ON(i_size_read(inode) !=
-	       (le64_to_cpu(fe->i_size) - *bytes_extended));
-	BUG_ON(new_i_size < i_size_read(inode));
 
-	if (i_size_read(inode) == new_i_size)
-  		goto leave;
+restart_all:
+	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
 
-	clusters_to_add = ocfs2_clusters_for_bytes(osb->sb, new_i_size) -
-			  le32_to_cpu(fe->i_clusters);
-
-	mlog(0, "extend inode %"MLFu64", new_i_size = %"MLFu64", "
-		"i_size = %lld, fe->i_clusters = %u, clusters_to_add = %u\n",
-	     OCFS2_I(inode)->ip_blkno, new_i_size, i_size_read(inode),
+	mlog(0, "extend inode %"MLFu64", i_size = %lld, fe->i_clusters = %u, "
+	     "clusters_to_add = %u\n",
+	     OCFS2_I(inode)->ip_blkno, i_size_read(inode),
 	     fe->i_clusters, clusters_to_add);
 
-	if (!clusters_to_add)
-		goto do_start_trans;
+	handle = ocfs2_alloc_handle(osb);
+	if (handle == NULL) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto leave;
+	}
 
 	num_free_extents = ocfs2_num_free_extents(osb,
 						  inode,
@@ -928,7 +511,7 @@
 	 * start_trans is important here -- always do it before! */
 	down_write(&OCFS2_I(inode)->ip_alloc_sem);
 	drop_alloc_sem = 1;
-do_start_trans:
+
 	credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
 	handle = ocfs2_start_trans(osb, handle, credits);
 	if (handle == NULL) {
@@ -951,53 +534,39 @@
 		goto leave;
 	}
 
-	if (!clusters_to_add)
-		goto no_alloc;
+	prev_clusters = OCFS2_I(inode)->ip_clusters;
 
-	status = ocfs2_extend_allocation(osb,
-					 inode,
-					 clusters_to_add,
-					 bh,
-					 handle,
-					 data_ac,
-					 meta_ac,
-					 &why);
+	status = ocfs2_do_extend_allocation(osb,
+					    inode,
+					    clusters_to_add,
+					    bh,
+					    handle,
+					    data_ac,
+					    meta_ac,
+					    &why);
 	if ((status < 0) && (status != -EAGAIN)) {
 		if (status != -ENOSPC)
 			mlog_errno(status);
 		goto leave;
 	}
 
-	if (status == -EAGAIN && (new_i_size >
-	    ocfs2_clusters_to_bytes(osb->sb, le32_to_cpu(fe->i_clusters)))) {
+	status = ocfs2_journal_dirty(handle, bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
 
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	if (why != RESTART_NONE && clusters_to_add) {
 		if (why == RESTART_META) {
 			mlog(0, "restarting function.\n");
 			restart_func = 1;
 		} else {
 			BUG_ON(why != RESTART_TRANS);
 
-			new_fe_size = ocfs2_clusters_to_bytes(osb->sb,
-						le32_to_cpu(fe->i_clusters));
-			*bytes_extended += new_fe_size -
-					   le64_to_cpu(fe->i_size);
-			/* update i_size in case we crash after the
-			 * extend_trans */
-			fe->i_size = cpu_to_le64(new_fe_size);
-
-			fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec);
-			fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec);
-
-			status = ocfs2_journal_dirty(handle, bh);
-			if (status < 0) {
-				mlog_errno(status);
-				goto leave;
-			}
-
-			clusters_to_add =
-				ocfs2_clusters_for_bytes(osb->sb,
-							 new_i_size)
-				- le32_to_cpu(fe->i_clusters);
 			mlog(0, "restarting transaction.\n");
 			/* TODO: This can be more intelligent. */
 			credits = ocfs2_calc_extend_credits(osb->sb,
@@ -1014,34 +583,12 @@
 			goto restarted_transaction;
 		}
 	}
-	status = 0;
 
-no_alloc:
-	/* this may not be the end of our allocation so only update
-	 * i_size to what's appropriate. */
-	new_fe_size = ocfs2_clusters_to_bytes(osb->sb,
-					      le32_to_cpu(fe->i_clusters));
-	if (new_i_size < new_fe_size)
-		new_fe_size = new_i_size;
-
-	*bytes_extended += new_fe_size - le64_to_cpu(fe->i_size);
-	fe->i_size = cpu_to_le64(new_fe_size);
-
 	mlog(0, "fe: i_clusters = %u, i_size=%"MLFu64"\n",
 	     fe->i_clusters, fe->i_size);
-
 	mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
 	     OCFS2_I(inode)->ip_clusters, i_size_read(inode));
 
-	fe->i_ctime = fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec);
-	fe->i_ctime_nsec = fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec);
-
-	status = ocfs2_journal_dirty(handle, bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
-
 leave:
 	if (drop_alloc_sem) {
 		up_write(&OCFS2_I(inode)->ip_alloc_sem);
@@ -1059,24 +606,148 @@
 		ocfs2_free_alloc_context(meta_ac);
 		meta_ac = NULL;
 	}
-	if (bh) {
-		brelse(bh);
-		bh = NULL;
-	}
 	if ((!status) && restart_func) {
 		restart_func = 0;
 		goto restart_all;
 	}
+	if (bh) {
+		brelse(bh);
+		bh = NULL;
+	}
 
 	mlog_exit(status);
 	return status;
 }
 
+/* Some parts of this taken from generic_cont_expand, which turned out
+ * to be too fragile to do exactly what we need without us having to
+ * worry about recursive locking in ->commit_write(). */
+static int ocfs2_write_zero_page(struct inode *inode,
+				 struct buffer_head *di_bh,
+				 u64 size)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct page *page;
+	unsigned long index, offset;
+	int ret;
+
+	offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
+	/* ugh.  in prepare/commit_write, if from==to==start of block, we 
+	** skip the prepare.  make sure we never send an offset for the start
+	** of a block
+	*/
+	if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
+		offset++;
+	}
+	index = size >> PAGE_CACHE_SHIFT;
+
+	page = grab_cache_page(mapping, index);
+	if (!page) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_prepare_write(NULL, page, offset, offset);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
+	ret = ocfs2_commit_write_nolocks(NULL, page, offset, offset, di_bh, 1);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
+	ret = 0;
+out_unlock:
+	unlock_page(page);
+	page_cache_release(page);
+out:
+	return ret;
+}
+
+static int ocfs2_zero_extend(struct inode *inode,
+			     u64 zero_to_size)
+{
+	int ret = 0;
+	u64 start_off;
+	struct buffer_head *di_bh = NULL;
+	struct super_block *sb = inode->i_sb;
+
+	ret = ocfs2_read_block(OCFS2_SB(sb), OCFS2_I(inode)->ip_blkno, &di_bh,
+			       OCFS2_BH_CACHED, inode);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	start_off = ocfs2_align_bytes_to_blocks(sb, zero_to_size);
+	while (start_off < zero_to_size) {
+		ret = ocfs2_write_zero_page(inode, di_bh, start_off);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		start_off += sb->s_blocksize;
+	}
+
+out:
+	return ret;
+}
+
+static int ocfs2_extend_file(struct inode *inode,
+			     struct buffer_head *di_bh,
+			     u64 new_i_size)
+{
+	int ret = 0;
+	u32 clusters_to_add;
+
+	/* setattr sometimes calls us like this. */
+	if (new_i_size == 0)
+		goto out;
+
+	if (i_size_read(inode) == new_i_size)
+  		goto out;
+	BUG_ON(new_i_size < i_size_read(inode));
+
+	clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - 
+		OCFS2_I(inode)->ip_clusters;
+
+	if (clusters_to_add) {
+		ret = ocfs2_extend_allocation(inode, clusters_to_add);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/* This will update i_size for us. */
+		ret = ocfs2_zero_extend(inode, new_i_size);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	} else {
+		/* No allocation required, we just use this helper to
+		 * do a trivial update of i_size. */
+		ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+out:
+	return ret;
+}
+
 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	int status = 0;
 	int unlock = 0;
-	u64 newsize, bytes_added;
+	u64 newsize;
 	struct inode *inode = dentry->d_inode;
 	struct super_block *sb = inode->i_sb;
 	ocfs2_super *osb = OCFS2_SB(sb);
@@ -1121,48 +792,16 @@
 	if (S_ISREG(inode->i_mode) &&
 	    attr->ia_valid & ATTR_SIZE &&
 	    newsize != i_size_read(inode)) {
-		bytes_added = 0;
-
 		if (i_size_read(inode) > newsize)
-			status = ocfs2_truncate_file(osb, newsize, inode);
+			status = ocfs2_truncate_file(inode, bh, newsize);
 		else
-			status = ocfs2_extend_file(osb, inode, newsize,
-						   &bytes_added);
-		if (status < 0 && (!bytes_added)) {
+			status = ocfs2_extend_file(inode, bh, newsize);
+		if (status < 0) {
 			if (status != -ENOSPC)
 				mlog_errno(status);
 			status = -ENOSPC;
 			goto bail;
 		}
-
-		/* partial extend, we continue with what we've got. */
-		if (status < 0
-		    && status != -ENOSPC
-		    && status != -EINTR
-		    && status != -ERESTARTSYS)
-			mlog(ML_ERROR,
-			     "status return of %d extending inode "
-			     "%"MLFu64"\n", status,
-			     OCFS2_I(inode)->ip_blkno);
-		status = 0;
-
-		newsize = bytes_added + i_size_read(inode);
-		if (bytes_added)
-			ocfs2_update_inode_size(inode, newsize);
-
-#ifdef OCFS2_ORACORE_WORKAROUNDS
-		spin_lock(&OCFS2_I(inode)->ip_lock);
-		if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_OPEN_DIRECT) {
-			/* This is a total broken hack for O_DIRECT crack */
-			OCFS2_I(inode)->ip_mmu_private = i_size_read(inode);
-		}
-		spin_unlock(&OCFS2_I(inode)->ip_lock);
-#endif
-		status = ocfs2_zero_extend(inode);
-		if (status < 0) {
-			mlog_errno(status);
-			goto bail;
-		}
 	}
 
 	handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
@@ -1232,6 +871,302 @@
 	return err;
 }
 
+static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
+				    const char __user *buf,
+				    size_t count,
+				    loff_t pos)
+{
+	struct iovec local_iov = { .iov_base = (void __user *)buf,
+				   .iov_len = count };
+	int ret, level;
+	u32 clusters;
+	ocfs2_super *osb = NULL;
+	struct file *filp = iocb->ki_filp;
+	struct inode *inode = filp->f_dentry->d_inode;
+	int do_direct = 0;
+	loff_t newsize, saved_pos;
+
+	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
+		   (unsigned int)count,
+		   filp->f_dentry->d_name.len,
+		   filp->f_dentry->d_name.name);
+
+	/* happy write of zero bytes */
+	if (count == 0) {
+		ret = 0;
+		goto out;
+	}
+
+	if (!inode) {
+		mlog(0, "bad inode\n");
+		ret = -EIO;
+		goto out;
+	}
+
+	osb = OCFS2_SB(inode->i_sb);
+
+	down(&inode->i_sem);
+
+	/* this ginormous block is in here because it has so many inputs
+	 * and outputs from this function.. */
+	level = !!(filp->f_flags & O_APPEND);
+	for(;;) {
+		ret = ocfs2_meta_lock(inode, NULL, NULL, level);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_i_sem;
+		}
+
+		/* work on a copy of ppos until we're sure that we won't have
+		 * to recalculate it due to relocking. */
+		if (filp->f_flags & O_APPEND) {
+			saved_pos = i_size_read(inode);
+			mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+			if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
+				/* ugh, work around some applications which
+				 * open everything O_DIRECT + O_APPEND and
+				 * really don't mean to use O_DIRECT. */
+				filp->f_flags &= ~O_DIRECT;
+			}
+#endif
+		} else {
+			saved_pos = iocb->ki_pos;
+		}
+		newsize = count + saved_pos;
+
+		mlog(0, "pos=%lld newsize=%"MLFu64" cursize=%lld\n",
+		     saved_pos, newsize, i_size_read(inode));
+
+		/* No need for a higher level metadata lock if we're
+		 * never going past i_size. */
+		if (newsize <= i_size_read(inode))
+			break;
+
+		if (level == 0) {
+			ocfs2_meta_unlock(inode, level);
+			level = 1;
+			continue;
+		}
+
+		spin_lock(&OCFS2_I(inode)->ip_lock);
+		clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) -
+			OCFS2_I(inode)->ip_clusters;
+		spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+		mlog(0, "Writing at EOF, may need more allocation: "
+		     "i_size = %lld, newsize = %"MLFu64", need %u clusters\n",
+		     i_size_read(inode), newsize, clusters);
+
+		/* We only want to continue the rest of this loop if
+		 * our extend will actually require more
+		 * allocation. */
+		if (!clusters)
+			break;
+
+		ret = ocfs2_extend_allocation(inode, clusters);
+		if (ret < 0) {
+			if (ret != -ENOSPC)
+				mlog_errno(ret);
+			goto out_meta_unlock;
+		}
+
+		/* Fill any holes which would've been created by this
+		 * write. If we're O_APPEND, this will wind up
+		 * (correctly) being a noop. */
+		ret = ocfs2_zero_extend(inode, (u64) newsize - count);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_meta_unlock;
+		}
+		break;
+	}
+
+	/* we've got whatever cluster lock is appropriate now, so we
+	 * can stuff *ppos back. */
+	iocb->ki_pos = saved_pos;
+
+	if (filp->f_flags & O_DIRECT) {
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+		if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
+			int sector_size = 1 << osb->s_sectsize_bits;
+
+			if ((saved_pos & (sector_size - 1)) ||
+			    (count & (sector_size - 1)) ||
+			    ((unsigned long)buf & (sector_size - 1))) {
+				do_direct = 0;
+				filp->f_flags |= O_SYNC;
+			} else {
+				do_direct = 1;
+			}
+		} else
+#endif
+			do_direct = 1;
+
+		mlog(0, "O_DIRECT\n");
+	}
+
+	if (!do_direct) {
+		ret = ocfs2_data_lock(inode, 1);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_meta_unlock;
+		}
+	}
+
+	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+	if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
+		unsigned int saved_flags = filp->f_flags;
+
+		if (do_direct)
+			filp->f_flags |= O_DIRECT;
+		else
+			filp->f_flags &= ~O_DIRECT;
+
+		ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
+						    &iocb->ki_pos);
+
+		filp->f_flags = saved_flags;
+	} else
+#endif
+		ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
+						    &iocb->ki_pos);
+
+	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	if (!do_direct)
+		ocfs2_data_unlock(inode, 1);
+
+out_meta_unlock:
+	ocfs2_meta_unlock(inode, level);
+out_i_sem:
+	up(&inode->i_sem);
+
+out:
+	mlog_exit(ret);
+	return ret;
+}
+
+static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
+				   char __user *buf,
+				   size_t count,
+				   loff_t pos)
+{
+	int ret = 0;
+	ocfs2_super *osb = NULL;
+	struct file *filp = iocb->ki_filp;
+	struct inode *inode = filp->f_dentry->d_inode;
+
+	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
+		   (unsigned int)count,
+		   filp->f_dentry->d_name.len,
+		   filp->f_dentry->d_name.name);
+
+	if (!inode) {
+		ret = -EINVAL;
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	osb = OCFS2_SB(inode->i_sb);
+
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+	if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
+		if (filp->f_flags & O_DIRECT) {
+			int sector_size = 1 << osb->s_sectsize_bits;
+
+			if ((pos & (sector_size - 1)) ||
+			    (count & (sector_size - 1)) ||
+			    ((unsigned long)buf & (sector_size - 1)) ||
+			    (i_size_read(inode) & (sector_size -1))) {
+				filp->f_flags &= ~O_DIRECT;
+			}
+		}
+	}
+#endif
+
+	ret = ocfs2_meta_lock(inode, NULL, NULL, 0);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	if (!(filp->f_flags & O_DIRECT)) {
+		ret = ocfs2_data_lock(inode, 0);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto bail_unlock_meta;
+		}
+	}
+
+	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	ret = generic_file_aio_read(iocb, buf, count, iocb->ki_pos);
+
+	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	if (ret == -EINVAL)
+		mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n");
+
+	if (!(filp->f_flags & O_DIRECT))
+		ocfs2_data_unlock(inode, 0);
+bail_unlock_meta:
+	ocfs2_meta_unlock(inode, 0);
+
+bail:
+	mlog_exit(ret);
+
+	return ret;
+}
+
+static ssize_t ocfs2_file_sendfile(struct file *in_file,
+				   loff_t *ppos,
+				   size_t count,
+				   read_actor_t actor,
+				   void *target)
+{
+	int ret;
+	struct inode *inode = in_file->f_mapping->host;
+
+	mlog_entry("inode %"MLFu64", ppos %lld, count = %u\n",
+		   OCFS2_I(inode)->ip_blkno, (long long) *ppos,
+		   (unsigned int) count);
+
+	/* Obviously, there is no user buffer to worry about here --
+	 * this simplifies locking, so no need to walk vmas a la
+	 * read/write. We take a simple set of cluster locks against
+	 * the inode and call generic_file_sendfile. */
+	ret = ocfs2_meta_lock(inode, NULL, NULL, 0);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	ret = ocfs2_data_lock(inode, 0);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto bail_unlock_meta;
+	}
+
+	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	ret = generic_file_sendfile(in_file, ppos, count, actor, target);
+	if (ret < 0)
+		mlog_errno(ret);
+
+	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	ocfs2_data_unlock(inode, 0);
+bail_unlock_meta:
+	ocfs2_meta_unlock(inode, 0);
+
+bail:
+	mlog_exit(ret);
+	return ret;
+}
+
 struct inode_operations ocfs2_file_iops = {
 	.setattr	= ocfs2_setattr,
 	.getattr	= ocfs2_getattr,
@@ -1241,3 +1176,21 @@
 	.setattr	= ocfs2_setattr,
 	.getattr	= ocfs2_getattr,
 };
+
+struct file_operations ocfs2_fops = {
+	.read		= do_sync_read,
+	.write		= do_sync_write,
+	.sendfile	= ocfs2_file_sendfile,
+	.mmap		= ocfs2_mmap,
+	.fsync		= ocfs2_sync_file,
+	.release	= ocfs2_file_release,
+	.open		= ocfs2_file_open,
+	.aio_read	= ocfs2_file_aio_read,
+	.aio_write	= ocfs2_file_aio_write,
+};
+
+struct file_operations ocfs2_dops = {
+	.read		= generic_read_dir,
+	.readdir	= ocfs2_readdir,
+	.fsync		= ocfs2_sync_file,
+};

Modified: branches/locking-changes/fs/ocfs2/file.h
===================================================================
--- branches/locking-changes/fs/ocfs2/file.h	2005-08-30 00:26:33 UTC (rev 2556)
+++ branches/locking-changes/fs/ocfs2/file.h	2005-08-31 18:03:07 UTC (rev 2557)
@@ -33,32 +33,26 @@
 struct _ocfs2_alloc_context;
 
 enum ocfs2_alloc_restarted {
-	RESTART_TRANS = 0,
+	RESTART_NONE = 0,
+	RESTART_TRANS,
 	RESTART_META
 };
-int ocfs2_extend_allocation(ocfs2_super *osb,
-			    struct inode *inode,
-			    u32 clusters_to_add,
-			    struct buffer_head *fe_bh,
-			    ocfs2_journal_handle *handle,
-			    struct _ocfs2_alloc_context *data_ac,
-			    struct _ocfs2_alloc_context *meta_ac,
-			    enum ocfs2_alloc_restarted *reason);
+int ocfs2_do_extend_allocation(ocfs2_super *osb,
+			       struct inode *inode,
+			       u32 clusters_to_add,
+			       struct buffer_head *fe_bh,
+			       ocfs2_journal_handle *handle,
+			       struct _ocfs2_alloc_context *data_ac,
+			       struct _ocfs2_alloc_context *meta_ac,
+			       enum ocfs2_alloc_restarted *reason);
 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
 int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		  struct kstat *stat);
 int ocfs2_sync_inode(struct inode *inode);
-int ocfs2_extend_file(ocfs2_super *osb,
-		      struct inode *inode,
-		      u64 new_i_size,
-		      u64 *bytes_extended);
 
 int ocfs2_set_inode_size(ocfs2_journal_handle *handle,
 			 struct inode *inode,
 			 struct buffer_head *fe_bh,
 			 u64 new_i_size);
 
-void ocfs2_file_finish_extension(struct inode *inode, loff_t newsize,
-				 unsigned direct_extend);
-
 #endif /* OCFS2_FILE_H */

Modified: branches/locking-changes/fs/ocfs2/inode.c
===================================================================
--- branches/locking-changes/fs/ocfs2/inode.c	2005-08-30 00:26:33 UTC (rev 2556)
+++ branches/locking-changes/fs/ocfs2/inode.c	2005-08-31 18:03:07 UTC (rev 2557)
@@ -297,7 +297,6 @@
 		    inode->i_fop = &ocfs2_fops;
 		    inode->i_op = &ocfs2_file_iops;
 		    i_size_write(inode, le64_to_cpu(fe->i_size));
-		    OCFS2_I(inode)->ip_mmu_private = inode->i_size;
 		    break;
 	    case S_IFDIR:
 		    inode->i_op = &ocfs2_dir_iops;
@@ -1121,9 +1120,6 @@
 
 	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
 	i_size_write(inode, le64_to_cpu(fe->i_size));
-	if (S_ISREG(inode->i_mode)) {
-		OCFS2_I(inode)->ip_mmu_private = i_size_read(inode);
-	}
 	inode->i_nlink = le16_to_cpu(fe->i_links_count);
 	inode->i_uid = le32_to_cpu(fe->i_uid);
 	inode->i_gid = le32_to_cpu(fe->i_gid);

Modified: branches/locking-changes/fs/ocfs2/inode.h
===================================================================
--- branches/locking-changes/fs/ocfs2/inode.h	2005-08-30 00:26:33 UTC (rev 2556)
+++ branches/locking-changes/fs/ocfs2/inode.h	2005-08-31 18:03:07 UTC (rev 2557)
@@ -41,7 +41,6 @@
 	spinlock_t		ip_lock;
 	u32			ip_open_count;
 	u32			ip_clusters;
-	loff_t			ip_mmu_private;
 	struct ocfs2_extent_map	ip_map;
 	struct list_head	ip_io_markers;
 	int			ip_orphaned_slot;

Modified: branches/locking-changes/fs/ocfs2/journal.c
===================================================================
--- branches/locking-changes/fs/ocfs2/journal.c	2005-08-30 00:26:33 UTC (rev 2556)
+++ branches/locking-changes/fs/ocfs2/journal.c	2005-08-31 18:03:07 UTC (rev 2557)
@@ -428,6 +428,18 @@
 	return status;
 }
 
+int ocfs2_journal_dirty_data(handle_t *handle,
+			     struct buffer_head *bh)
+{
+	int err = journal_dirty_data(handle, bh);
+	if (err)
+		mlog_errno(err);
+	/* TODO: When we can handle it, abort the handle and go RO on
+	 * error here. */
+
+	return err;
+}
+
 /* We always assume you're adding a metadata lock at level 'ex' */
 int ocfs2_handle_add_lock(ocfs2_journal_handle *handle,
 			  struct inode *inode)

Modified: branches/locking-changes/fs/ocfs2/journal.h
===================================================================
--- branches/locking-changes/fs/ocfs2/journal.h	2005-08-30 00:26:33 UTC (rev 2556)
+++ branches/locking-changes/fs/ocfs2/journal.h	2005-08-31 18:03:07 UTC (rev 2557)
@@ -249,6 +249,8 @@
  *                          buffer. Will have to call ocfs2_journal_dirty once
  *                          we've actually dirtied it. Type is one of . or .
  *  ocfs2_journal_dirty    - Mark a journalled buffer as having dirty data.
+ *  ocfs2_journal_dirty_data - Indicate that a data buffer should go out before
+ *                             the current handle commits.
  *  ocfs2_handle_add_lock  - Sometimes we need to delay lock release
  *                          until after a transaction has been completed. Use
  *                          ocfs2_handle_add_lock to indicate that a lock needs
@@ -308,6 +310,8 @@
  */
 int                  ocfs2_journal_dirty(ocfs2_journal_handle *handle,
 					 struct buffer_head *bh);
+int                  ocfs2_journal_dirty_data(handle_t *handle,
+					      struct buffer_head *bh);
 int                  ocfs2_handle_add_lock(ocfs2_journal_handle *handle,
 					   struct inode *inode);
 /*

Modified: branches/locking-changes/fs/ocfs2/namei.c
===================================================================
--- branches/locking-changes/fs/ocfs2/namei.c	2005-08-30 00:26:33 UTC (rev 2556)
+++ branches/locking-changes/fs/ocfs2/namei.c	2005-08-31 18:03:07 UTC (rev 2557)
@@ -1677,8 +1677,9 @@
 	newsize = l - 1;
 	if (l > ocfs2_fast_symlink_chars(sb)) {
 		inode->i_op = &ocfs2_symlink_inode_operations;
-		status = ocfs2_extend_allocation(osb, inode, 1, new_fe_bh,
-						 handle, data_ac, NULL, NULL);
+		status = ocfs2_do_extend_allocation(osb, inode, 1, new_fe_bh,
+						    handle, data_ac, NULL,
+						    NULL);
 		if (status < 0) {
 			if (status != -ENOSPC && status != -EINTR) {
 				mlog(ML_ERROR, "Failed to extend file to "

Modified: branches/locking-changes/fs/ocfs2/ocfs2.h
===================================================================
--- branches/locking-changes/fs/ocfs2/ocfs2.h	2005-08-30 00:26:33 UTC (rev 2556)
+++ branches/locking-changes/fs/ocfs2/ocfs2.h	2005-08-31 18:03:07 UTC (rev 2557)
@@ -271,6 +271,14 @@
 	struct work_struct		osb_truncate_log_wq;
 } ocfs2_super;
 
+static inline int ocfs2_should_order_data(struct inode *inode)
+{
+	if (!S_ISREG(inode->i_mode))
+		return 0;
+	/* TODO: this should be a mount option which we check here. */
+	return 1;
+}
+ 
 #define OCFS2_SB(sb)	    ((ocfs2_super *)(sb)->s_fs_info)
 #define OCFS2_MAX_OSB_ID             65536
 
@@ -345,6 +353,13 @@
 	return clusters;
 }
 
+static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb,
+					 u64 bytes)
+{
+	bytes += sb->s_blocksize - 1;
+	return bytes >> sb->s_blocksize_bits;
+}
+
 static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb,
 					  u32 clusters)
 {
@@ -361,11 +376,13 @@
 	return (u64)clusters << cl_bits;
 }
 
-static inline unsigned long ocfs2_align_bytes_to_blocks(struct super_block *sb,
-							u64 bytes)
+static inline u64 ocfs2_align_bytes_to_blocks(struct super_block *sb,
+					      u64 bytes)
 {
-	bytes += sb->s_blocksize - 1;
-	return (unsigned long)(bytes >> sb->s_blocksize_bits);
+	u64 blocks;
+
+	blocks = ocfs2_blocks_for_bytes(sb, bytes);
+	return blocks << sb->s_blocksize_bits;
 }
 
 static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes)
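
As a quick sanity check of the rounding in the new byte/block helpers, here is
a standalone sketch (the stand-in functions below only mirror the ocfs2
inlines, hard-coded to an assumed 4096-byte block size, i.e.
s_blocksize_bits == 12):

#include <assert.h>
#include <stdint.h>

/* Stand-ins for ocfs2_blocks_for_bytes()/ocfs2_align_bytes_to_blocks(). */
static uint64_t blocks_for_bytes(uint64_t bytes)
{
	return (bytes + 4095) >> 12;		/* round up to whole blocks */
}

static uint64_t align_bytes_to_blocks(uint64_t bytes)
{
	return blocks_for_bytes(bytes) << 12;	/* next block boundary, in bytes */
}

int main(void)
{
	assert(blocks_for_bytes(5000) == 2);		/* partial block rounds up */
	assert(align_bytes_to_blocks(5000) == 8192);	/* two blocks' worth of bytes */
	assert(align_bytes_to_blocks(8192) == 8192);	/* exact multiples unchanged */
	return 0;
}

Note also that ocfs2_align_bytes_to_blocks now returns u64 rather than
unsigned long, presumably so the aligned byte offset no longer truncates on
32-bit systems for sizes past 4GB.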

Modified: branches/locking-changes/fs/ocfs2/super.c
===================================================================
--- branches/locking-changes/fs/ocfs2/super.c	2005-08-30 00:26:33 UTC (rev 2556)
+++ branches/locking-changes/fs/ocfs2/super.c	2005-08-31 18:03:07 UTC (rev 2557)
@@ -690,7 +690,6 @@
 
 		oi->ip_blkno = 0ULL;
 		oi->ip_clusters = 0;
-		oi->ip_mmu_private = 0LL;
 
 		ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
 		ocfs2_lock_res_init_once(&oi->ip_data_lockres);


