[Ocfs2-commits] mfasheh commits r1704 - trunk/src

Tue Dec 14 18:08:30 CST 2004

Author: mfasheh
Date: 2004-12-14 18:08:28 -0600 (Tue, 14 Dec 2004)
New Revision: 1704

Added:
   trunk/src/mmap.c
   trunk/src/mmap.h
   trunk/src/ocfs_compat.c
Modified:
   trunk/src/Makefile
   trunk/src/dlmglue.c
   trunk/src/extent_map.c
   trunk/src/file.c
   trunk/src/inode.c
   trunk/src/ocfs.h
   trunk/src/ocfs_compat.h
   trunk/src/ocfs_log.h
Log:
* commit some code to make us safe for clustered mmap. this winds up
  supporting it, at least for shared readable. shared write is
  disabled, but could be put in there without a whole lot of effort i
  think.

* move the rbtree compat stuff into ocfs_compat.c

* fix a broken #ifdef in dlmglue.c



Modified: trunk/src/Makefile
===================================================================

--- trunk/src/Makefile	2004-12-14 23:29:09 UTC (rev 1703)
+++ trunk/src/Makefile	2004-12-15 00:08:28 UTC (rev 1704)
@@ -77,7 +77,9 @@
 	ioctl.c			\
 	journal.c		\
 	localalloc.c		\
+	mmap.c			\
 	namei.c			\
+	ocfs_compat.c		\
 	proc.c			\
 	slot_map.c		\
 	suballoc.c		\
@@ -106,6 +108,7 @@
 	ioctl.h			\
 	journal.h		\
 	localalloc.h		\
+	mmap.h			\
 	namei.h			\
 	proc.h			\
 	slot_map.h		\

Modified: trunk/src/dlmglue.c
===================================================================
--- trunk/src/dlmglue.c	2004-12-14 23:29:09 UTC (rev 1703)
+++ trunk/src/dlmglue.c	2004-12-15 00:08:28 UTC (rev 1704)
@@ -1008,7 +1008,7 @@
 
 #ifdef OCFS2_VERBOSE_LOCKING_TRACE
 	printk("ocfs2: (%u) inode %llu drop %s DATA lock\n",
-	       OCFS_I(inode)->ip_blkno, current->pid,
+	       current->pid, OCFS_I(inode)->ip_blkno, 
 	       write ? "EXMODE" : "PRMODE");
 #endif
 
@@ -1271,9 +1271,9 @@
 
 	LOG_ENTRY();
 
-#ifdef VERBOSE_LOCKING_TRACE
+#ifdef OCFS2_VERBOSE_LOCKING_TRACE
 	printk("ocfs2: (%u) inode %llu, take %s META lock\n",
-	       OCFS_I(inode)->ip_blkno, current->pid,
+	       current->pid, OCFS_I(inode)->ip_blkno,
 	       ex ? "EXMODE" : "PRMODE");
 #endif
 
@@ -1344,7 +1344,7 @@
 
 #ifdef OCFS2_VERBOSE_LOCKING_TRACE
 	printk("ocfs2: (%u) inode %llu drop %s META lock\n",
-	       OCFS_I(inode)->ip_blkno, current->pid,
+	       current->pid, OCFS_I(inode)->ip_blkno,
 	       ex ? "EXMODE" : "PRMODE");
 #endif
 
@@ -1884,8 +1884,13 @@
        	inode = ocfs2_lock_res_inode(lockres);
 
         sync_mapping_buffers(inode->i_mapping);
-        if (blocking == LKM_EXMODE)
+        if (blocking == LKM_EXMODE) {
                 ocfs_truncate_inode_pages(inode, 0);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+		unmap_mapping_range(inode->i_mapping, 0, 0, 0);
+#endif
+	}
+
 	LOG_EXIT();
 }
 

Modified: trunk/src/extent_map.c
===================================================================
--- trunk/src/extent_map.c	2004-12-14 23:29:09 UTC (rev 1703)
+++ trunk/src/extent_map.c	2004-12-15 00:08:28 UTC (rev 1704)
@@ -773,48 +773,6 @@
 	return 0;
 }
 
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9)
-static struct rb_node *rb_last(struct rb_root *root)
-{
-	struct rb_node	*n;
-
-#ifdef rb_node
-#undef rb_node
-	n = root->rb_node;
-#define rb_node rb_node_s
-#else
-	n = root->rb_node;
-#endif
-	if (!n)
-		return NULL;
-	while (n->rb_right)
-		n = n->rb_right;
-	return n;
-}
-#endif
-
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
-static struct rb_node *rb_prev(struct rb_node *node)
-{
-	/* If we have a left-hand child, go down and then right as far
-	   as we can. */
-	if (node->rb_left) {
-		node = node->rb_left; 
-		while (node->rb_right)
-			node=node->rb_right;
-		return node;
-	}
-
-	/* No left-hand children. Go up till we find an ancestor which
-	   is a right-hand child of its parent */
-	while (node->rb_parent && node == node->rb_parent->rb_left)
-		node = node->rb_parent;
-
-	return node->rb_parent;
-}
-#endif  /* LINUX_VERSION_CODE */
-
-
 /* Needs the lock */
 static void __ocfs2_extent_map_drop(struct inode *inode,
 				    u32 new_clusters,

Modified: trunk/src/file.c
===================================================================
--- trunk/src/file.c	2004-12-14 23:29:09 UTC (rev 1703)
+++ trunk/src/file.c	2004-12-15 00:08:28 UTC (rev 1704)
@@ -44,6 +44,7 @@
 #include "sysfile.h"
 #include "inode.h"
 #include "ioctl.h"
+#include "mmap.h"
 #include "suballoc.h"
 #include "util.h"
 
@@ -230,6 +231,7 @@
 	int have_i_sem = 0;
 	int level = filp->f_flags & O_APPEND;
 	loff_t saved_ppos;
+	DECLARE_BUFFER_LOCK_CTXT(ctxt);
 
 	LOG_SET_CONTEXT(WRITE);
 
@@ -237,6 +239,7 @@
 			(unsigned int)count,
 			filp->f_dentry->d_name.len, 
 			filp->f_dentry->d_name.name);
+
 	/* happy write of zero bytes */
 	if (count == 0) {
 		ret = 0;
@@ -255,6 +258,23 @@
 	down(&inode->i_sem);
 	have_i_sem = 1;
 
+	ret = ocfs2_setup_io_locks(inode->i_sb, inode, (char *) buf,
+				   count, &ctxt);
+	if (ret < 0) {
+		LOG_ERROR_STATUS(ret);
+		goto bail;
+	}
+
+	/* This will lock everyone in the context whose order puts
+	 * them before us. */
+	ret = ocfs2_lock_buffer_inodes(&ctxt, inode);
+	if (ret < 0) {
+		if (ret != -EINTR)
+			LOG_ERROR_STATUS(ret);
+		goto bail;
+	}
+
+	ctxt.b_lock_data_write = 1;
 lock:
 	status = ocfs2_meta_lock(inode, NULL, NULL, level);
 	if (status < 0) {
@@ -262,6 +282,11 @@
 		ret = status;
 		goto bail;
 	}
+	/* to handle extending writes, we do a bit of our own locking
+	 * here, but we set up the ctxt to unlock for us (as well as
+	 * handle locking everything else). */
+	if (level)
+		ctxt.b_lock_meta_write = 1;
 
 	/* work on a copy of ppos until we're sure that we won't have
 	 * to recalculate it due to relocking. */
@@ -287,9 +312,11 @@
 		    ((unsigned long)buf & (sector_size - 1))) {
 			do_direct_io = 0;
 			filp->f_flags |= O_SYNC;
-		} else
+		} else {
 			do_direct_io = 1;
+		}
 	}
+	ctxt.b_lock_direct = do_direct_io;
 
 	newsize = count + saved_ppos;
 	if (filp->f_flags & O_APPEND)
@@ -326,7 +353,7 @@
 				ret = status;
 
 			ocfs2_meta_unlock(inode, level);
-			goto bail;
+			goto bail_unlock;
 		}
 	}
 
@@ -342,10 +369,23 @@
 			ret = status;
 
 			ocfs2_meta_unlock(inode, level);
-			goto bail;
+			goto bail_unlock;
 		}
 	}
 
+	/* Alright, fool the io locking stuff into thinking it's
+	 * handled our inode for us. We can now count on it to do the
+	 * unlock for us. */
+	ctxt.b_target->ba_locked = 1;
+
+	/* This will lock everyone whose order puts them *after* our inode. */
+	ret = ocfs2_lock_buffer_inodes(&ctxt, NULL);
+	if (ret < 0) {
+		if (ret != -EINTR)
+			LOG_ERROR_STATUS(ret);
+		goto bail_unlock;
+	}
+
 	down_read(&OCFS_I(inode)->ip_alloc_sem);
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
 	/* 
@@ -373,8 +413,6 @@
 		ret = generic_file_write_nolock (filp, buf, count, ppos);
 #endif
 	up_read(&OCFS_I(inode)->ip_alloc_sem);
-	if (!do_direct_io)
-		ocfs2_data_unlock(inode, 1);
 
 	if (extended) {
 		LOG_TRACE_STR
@@ -397,8 +435,10 @@
 				LOG_ERROR_ARGS("Unable to pre-zero extension of inode (%d)", status);
 		}
 	}
-	ocfs2_meta_unlock(inode, level);
 
+bail_unlock:
+	ocfs2_unlock_buffer_inodes(&ctxt);
+
 bail:
 	if (have_i_sem)
 		up(&inode->i_sem);
@@ -423,20 +463,13 @@
 	int status = 0;
 	int do_direct_io = 0;
 	int sector_size;
+	DECLARE_BUFFER_LOCK_CTXT(ctxt);
 
 	LOG_SET_CONTEXT(READ);
 
 	LOG_ENTRY_ARGS ("(0x%p, 0x%p, %u, '%*s')\n", filp, buf,
-			(unsigned int)count,
-			filp->f_dentry->d_name.len, filp->f_dentry->d_name.name);
-
-#ifdef PURE_EVIL
-	if (evil_filename_check(EVIL_DENTRY, dentry)) {
-		LOG_ERROR_ARGS("EVIL FILE_READ: count=%u, ppos=%llu, flags=%d\n", (unsigned int)count, *ppos, filp->f_flags);
-	}
-#endif
-
-
+			(unsigned int)count, filp->f_dentry->d_name.len,
+			filp->f_dentry->d_name.name);
 	if (!inode) {
 		LOG_ERROR_STR ("Bad inode or inode has no oin");
 		ret = -EINVAL;
@@ -453,28 +486,23 @@
 		    ((unsigned long)buf & (sector_size - 1)) || (inode->i_size & (sector_size -1))) {
 			do_direct_io = 0;
 			filp->f_flags &= ~O_DIRECT;
-		} else
+		} else {
 			do_direct_io = 1;
+		}
 	}
+	ctxt.b_lock_direct = do_direct_io;
 
-	/* yay, PR (shared) locks all 'round :) */
-	status = ocfs2_meta_lock(inode, NULL, NULL, 0);
-	if (status < 0) {
+	ret = ocfs2_setup_io_locks(inode->i_sb, inode, buf, count, &ctxt);
+	if (ret < 0) {
 		LOG_ERROR_STATUS(status);
-		/* is this ret code correct? */
-		ret = status;
 		goto bail;
 	}
 
-	if (!do_direct_io) {
-		status = ocfs2_data_lock(inode, 0);
-		if (status < 0) {
-			if (status != -EINTR)
-				LOG_ERROR_STATUS(status);
-			/* is this ret code correct? */
-			ret = status;
-			goto bail;
-		}
+	ret = ocfs2_lock_buffer_inodes(&ctxt, NULL);
+	if (ret < 0) {
+		if (ret != -EINTR)
+			LOG_ERROR_STATUS(status);
+		goto bail_unlock;
 	}
 
 	down_read(&OCFS_I(inode)->ip_alloc_sem);
@@ -505,9 +533,9 @@
 	if (ret == -EINVAL)
 		LOG_ERROR_STR ("Generic_file_read returned -EINVAL");
 
-	if (!do_direct_io)
-		ocfs2_data_unlock(inode, 0);
-	ocfs2_meta_unlock(inode, 0);
+bail_unlock:
+	ocfs2_unlock_buffer_inodes(&ctxt);
+
 bail:
 	LOG_EXIT_INT (ret);
 
@@ -518,7 +546,7 @@
 struct file_operations ocfs_fops = {
 	.read = ocfs_file_read,
 	.write = ocfs_file_write,
-	.mmap = generic_file_mmap,
+	.mmap = ocfs2_mmap,
 	.fsync = ocfs_sync_file,
 	.release = ocfs_file_release,
 	.open = ocfs_file_open,

Modified: trunk/src/inode.c
===================================================================
--- trunk/src/inode.c	2004-12-14 23:29:09 UTC (rev 1703)
+++ trunk/src/inode.c	2004-12-15 00:08:28 UTC (rev 1704)
@@ -309,6 +309,7 @@
 	spin_lock_init(&i->ip_lock);
 	ocfs2_extent_map_init(inode);
 	INIT_LIST_HEAD(&i->ip_handle_list);
+	INIT_LIST_HEAD(&i->ip_io_markers);
 	i->ip_handle = NULL;
 	i->ip_next_orphan = NULL;
 
@@ -823,7 +824,9 @@
 
 	if (OCFS_I(inode)->ip_blkno == -1)
 		BUG();
+	OCFS_ASSERT(list_empty(&OCFS_I(inode)->ip_io_markers));
 
+
 	/* blkno == 0 if this inode is newly created and hasn't been
 	 * filled in yet. */
 	if (OCFS_I(inode)->ip_blkno == 0) {

Added: trunk/src/mmap.c
===================================================================
--- trunk/src/mmap.c	2004-12-14 23:29:09 UTC (rev 1703)
+++ trunk/src/mmap.c	2004-12-15 00:08:28 UTC (rev 1704)
@@ -0,0 +1,432 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * mmap.c
+ *
+ * Code to deal with the mess that is clustered mmap.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include "ocfs_compat.h"
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/uio.h>
+#include <linux/rbtree.h>
+
+#include "ocfs_log.h"
+#include "ocfs.h"
+#include "ocfs2.h"
+
+#include "dlmglue.h"
+#include "file.h"
+#include "inode.h"
+#include "mmap.h"
+#include "util.h"
+
+#define OCFS_DEBUG_CONTEXT    OCFS_DEBUG_CONTEXT_MMAP
+
+static inline u64 ocfs2_binode_blkno(ocfs2_backing_inode *binode);
+static inline struct rb_node * __ocfs2_buffer_lock_ctxt_root(
+	ocfs2_buffer_lock_ctxt *ctxt);
+static int ocfs2_buffer_lock_ctxt_insert(ocfs2_buffer_lock_ctxt *ctxt,
+					 struct inode *inode);
+static int ocfs2_fill_ctxt_from_buf(struct super_block *sb,
+				    struct inode *target_inode,
+				    char *buf,
+				    size_t size,
+				    ocfs2_buffer_lock_ctxt *ctxt);
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+static struct page *ocfs2_nopage(struct vm_area_struct * area,
+				 unsigned long address,
+				 int *type)
+#else
+/* 'type' is unused in 2.4. */
+static struct page *ocfs2_nopage(struct vm_area_struct * area,
+				 unsigned long address,
+				 int type)
+#endif
+{
+	int status;
+	int locked;
+	struct inode *inode = area->vm_file->f_dentry->d_inode;
+	struct page *page = NULL;
+	DECLARE_IO_MARKER(io_marker);
+
+	LOG_ENTRY_ARGS("(inode %lu, address %lu)\n", inode->i_ino,
+		       address);
+
+	locked = ocfs2_is_in_io_marker_list(inode, current);
+
+	if (!locked) {
+		/* Since we don't allow shared writable, we need only
+		 * worry about read locking here. */
+		status = ocfs2_meta_lock(inode, NULL, NULL, 0);
+		if (status < 0) {
+			if (status != -EINTR)
+				LOG_ERROR_STATUS(status);
+			goto bail;
+		}
+
+		status = ocfs2_data_lock(inode, 0);
+		if (status < 0) {
+			if (status != -EINTR)
+				LOG_ERROR_STATUS(status);
+			ocfs2_meta_unlock(inode, 0);
+			goto bail;
+		}
+		/* I'm not sure if we can somehow recurse back into
+		 * nopage or not, but this doesn't cost us anything,
+		 * so let's do it for now. */
+		ocfs2_add_io_marker(inode, &io_marker);
+	}
+
+	page = filemap_nopage(area, address, type);
+
+	if (!locked) {
+		ocfs2_del_io_marker(inode, &io_marker);
+		ocfs2_data_unlock(inode, 0);
+		ocfs2_meta_unlock(inode, 0);
+	}
+bail:
+	LOG_EXIT_PTR(page);
+	return page;
+}
+
+static struct vm_operations_struct ocfs2_file_vm_ops = {
+	nopage:         ocfs2_nopage,
+};
+
+int ocfs2_mmap(struct file *file,
+	       struct vm_area_struct *vma)
+{
+	struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
+	struct inode *inode = mapping->host;
+
+	/* We don't want to support shared writable mappings yet. */
+	if (((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE))
+	    && ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) {
+		printk("ocfs2: (%u) disallow shared writable %lx\n",
+		       current->pid, vma->vm_flags);
+		/* This is -EINVAL because generic_file_readonly_mmap
+		 * returns it in a similar situation. */
+		return -EINVAL;
+	}
+
+	UPDATE_ATIME(inode);
+	vma->vm_ops = &ocfs2_file_vm_ops;
+	return 0;
+}
+
+static inline u64 ocfs2_binode_blkno(ocfs2_backing_inode *binode)
+{
+	struct inode *inode = binode->ba_inode;
+
+	OCFS_ASSERT(inode);
+
+	return OCFS_I(inode)->ip_blkno;
+}
+
+static inline struct rb_node * __ocfs2_buffer_lock_ctxt_root(
+	ocfs2_buffer_lock_ctxt *ctxt)
+{
+	struct rb_node *root = 
+#ifdef rb_node
+#undef rb_node
+		ctxt->b_inodes.rb_node;
+#define rb_node rb_node_s  /* I HATE YOU 2.4 */
+#else
+		ctxt->b_inodes.rb_node;
+#endif
+
+		return root;
+}
+
+static int ocfs2_buffer_lock_ctxt_insert(ocfs2_buffer_lock_ctxt *ctxt,
+					 struct inode *inode)
+{
+	u64 blkno;
+	ocfs2_backing_inode *tmp, *binode;
+	struct rb_node * parent = NULL;
+	struct rb_node ** p =
+#ifdef rb_node
+#undef rb_node
+		&ctxt->b_inodes.rb_node;
+#define rb_node rb_node_s  /* I HATE YOU 2.4 */
+#else
+		&ctxt->b_inodes.rb_node;
+#endif
+
+	OCFS_ASSERT(ctxt);
+	OCFS_ASSERT(inode);
+
+	blkno = OCFS_I(inode)->ip_blkno;
+
+	while(*p) {
+		parent = *p;
+		tmp = rb_entry(parent, ocfs2_backing_inode, ba_node);
+
+		if (blkno < ocfs2_binode_blkno(tmp))
+			p = &(*p)->rb_left;
+		else if (blkno > ocfs2_binode_blkno(tmp))
+			p = &(*p)->rb_right;
+		else
+			return 0; /* Don't insert duplicates */
+	}
+
+	binode = kmalloc(sizeof(ocfs2_backing_inode), GFP_KERNEL);
+	if (!binode)
+		return -ENOMEM;
+	memset(binode, 0, sizeof(ocfs2_backing_inode));
+	binode->ba_inode = inode;
+	binode->ba_locked = 0;
+	ocfs2_init_io_marker(&binode->ba_task);
+
+	rb_link_node(&binode->ba_node, parent, p);
+	rb_insert_color(&binode->ba_node, &ctxt->b_inodes);
+
+	return 0;
+}
+
+static int ocfs2_fill_ctxt_from_buf(struct super_block *sb,
+				    struct inode *target_inode,
+				    char *buf,
+				    size_t size,
+				    ocfs2_buffer_lock_ctxt *ctxt)
+{
+	int status;
+	unsigned long start = (unsigned long)buf;
+	unsigned long end = start + size;
+	struct inode *inode;
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+
+	for (vma = find_vma(mm, start); vma; vma = vma->vm_next) {
+		if (end <= vma->vm_start)
+			break;
+		if (vma->vm_ops == &ocfs2_file_vm_ops) {
+			if (!vma->vm_file)
+				continue;
+			inode = vma->vm_file->f_dentry->d_inode;
+			if (inode->i_sb == sb &&
+			    inode != target_inode) {
+				status = ocfs2_buffer_lock_ctxt_insert(ctxt,
+								       inode);
+				if (status < 0)
+					goto bail;
+			}
+		}
+	}
+	status = 0;
+bail:
+	return status;
+}
+
+int ocfs2_setup_io_locks(struct super_block *sb,
+			 struct inode *target_inode,
+			 char *buf,
+			 size_t size,
+			 ocfs2_buffer_lock_ctxt *ctxt)
+{
+	int skip_sem = current->flags & PF_DUMPCORE;
+	int status;
+	struct mm_struct *mm = current->mm;
+	struct rb_node *first;
+
+	OCFS_ASSERT(mm);
+
+	if (!skip_sem)
+		down_read(&mm->mmap_sem);
+
+	OCFS_ASSERT(!__ocfs2_buffer_lock_ctxt_root(ctxt));
+
+	/* We always insert target because it might not be backing
+	   part of the buffer - but it needs to be in there so that
+	   its lock gets ordered with everything else */
+	status = ocfs2_buffer_lock_ctxt_insert(ctxt, target_inode);
+	if (!status) {
+		/* The assert above guarantees that this will work. */
+		ctxt->b_target = rb_entry(__ocfs2_buffer_lock_ctxt_root(ctxt),
+					  ocfs2_backing_inode, ba_node);
+
+		/* Now fill the tree with any inodes that back this
+		 * buffer. If target inode is in there, it will be
+		 * skipped over. */
+		status = ocfs2_fill_ctxt_from_buf(sb, target_inode, buf, size,
+						  ctxt);
+	}
+
+	if (!skip_sem)
+		up_read(&mm->mmap_sem);
+
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		ocfs2_unlock_buffer_inodes(ctxt);
+		goto bail;
+	}
+
+	first = rb_first(&ctxt->b_inodes);
+	ctxt->b_head = rb_entry(first, ocfs2_backing_inode, ba_node);
+
+	status = 0;
+bail:
+	return status;
+}
+
+/* Will take locks on all inodes in the ctxt up until 'last_inode'. If
+ * last_inode is NULL, then we take locks on everything. We mark lock
+ * status on the context so we skip any that have already been
+ * locked. On error we will completely abort the context. */
+/* WARNING: If you get a failure case here, you *must* call
+ * "ocfs2_unlock_buffer_inodes" as we may have left a few inodes under
+ * cluster lock. */
+int ocfs2_lock_buffer_inodes(ocfs2_buffer_lock_ctxt *ctxt,
+			     struct inode *last_inode)
+{
+	int status, meta_level, data_level;
+	ocfs2_backing_inode *binode;
+	struct inode *inode;
+	struct rb_node *node;
+
+	binode = ctxt->b_head;
+
+	while(binode) {
+		inode = binode->ba_inode;
+		if (inode == last_inode)
+			break;
+
+		if (binode->ba_locked)
+			goto skip_locking;
+
+		meta_level = 0;
+		if (ocfs2_buffer_lock_is_target(ctxt, inode))
+			meta_level = ctxt->b_lock_meta_write;
+
+		status = ocfs2_meta_lock(inode, NULL, NULL, meta_level);
+		if (status < 0) {
+			if (status != -EINTR)
+				LOG_ERROR_STATUS(status);
+			goto bail;
+		}
+
+		/* If we're doing direct IO, then skip data locking on
+		 * the target. */
+		if (!ocfs2_buffer_lock_is_target(ctxt, inode) ||
+		    !ctxt->b_lock_direct) {
+			data_level = 0;
+			if (ocfs2_buffer_lock_is_target(ctxt, inode))
+				data_level = ctxt->b_lock_data_write;
+
+			status = ocfs2_data_lock(inode, data_level);
+			if (status < 0) {
+				ocfs2_meta_unlock(inode, meta_level);
+
+				if (status != -EINTR)
+					LOG_ERROR_STATUS(status);
+				goto bail;
+			}
+		}
+		ocfs2_add_io_marker(inode, &binode->ba_task);
+
+		binode->ba_locked = 1;
+skip_locking:
+		node = rb_next(&binode->ba_node);
+		binode = NULL;
+		if (node)
+			binode = rb_entry(node, ocfs2_backing_inode, ba_node);
+	}
+
+	ctxt->b_head = binode;
+
+	status = 0;
+bail:
+	return status;
+}
+
+void ocfs2_unlock_buffer_inodes(ocfs2_buffer_lock_ctxt *ctxt)
+{
+	int level;
+	ocfs2_backing_inode *binode;
+	struct inode *inode;
+	struct rb_node *node, *tmp;
+
+	node = rb_first(&ctxt->b_inodes);
+	while(node) {
+		binode = rb_entry(node, ocfs2_backing_inode, ba_node);
+		if (!binode->ba_locked)
+			goto skip_unlock;
+		inode = binode->ba_inode;
+
+		ocfs2_del_io_marker(inode, &binode->ba_task);
+
+		if (!ocfs2_buffer_lock_is_target(ctxt, inode) ||
+		    !ctxt->b_lock_direct) {
+			level = 0;
+			if (ocfs2_buffer_lock_is_target(ctxt, inode))
+				level = ctxt->b_lock_data_write;
+
+			ocfs2_data_unlock(inode, level);
+		}
+
+		level = 0;
+		if (ocfs2_buffer_lock_is_target(ctxt, inode))
+			level = ctxt->b_lock_meta_write;
+
+		ocfs2_meta_unlock(inode, level);
+
+skip_unlock:
+		tmp = node;
+		node = rb_next(node);
+
+		rb_erase(tmp, &ctxt->b_inodes);
+		kfree(binode);
+	}
+
+	ctxt->b_target = ctxt->b_head = NULL;
+}
+
+#if 0
+static void ocfs2_buffer_ctxt_debug(ocfs2_buffer_lock_ctxt *ctxt)
+{
+	ocfs2_backing_inode *binode;
+	struct inode *inode;
+	struct rb_node *node;
+
+	printk("(%u) ocfs2: buffer lock ctxt: direct io = %d\n",
+	       current->pid, ctxt->b_lock_direct);
+
+	node = rb_first(&ctxt->b_inodes);
+	while (node) {
+		binode = rb_entry(node, ocfs2_backing_inode, ba_node);
+		inode = binode->ba_inode;
+
+		printk("(%u) ocfs2: inode %llu, locked %d, is target? %s\n",
+		       current->pid, OCFS_I(inode)->ip_blkno,
+		       binode->ba_locked,
+		       ocfs2_buffer_lock_is_target(ctxt, inode) ? "yes" : 
+		       "no");
+
+		node = rb_next(node);
+	}
+}
+#endif

Added: trunk/src/mmap.h
===================================================================
--- trunk/src/mmap.h	2004-12-14 23:29:09 UTC (rev 1703)
+++ trunk/src/mmap.h	2004-12-15 00:08:28 UTC (rev 1704)
@@ -0,0 +1,115 @@
+#ifndef OCFS2_MMAP_H
+#define OCFS2_MMAP_H
+
+int ocfs2_mmap(struct file *file,
+	       struct vm_area_struct *vma);
+
+/* used by file_read/file_write and nopage to coordinate file
+ * locking. I keep this out of the dlmglue code, because quite frankly
+ * I don't like that we have to do this stuff. */
+typedef struct _ocfs2_io_marker {
+	struct list_head io_list;
+	struct task_struct *io_task;
+} ocfs2_io_marker;
+
+#define __IOMARKER_INITIALIZER(name) {					\
+	.io_list      = { &(name).io_list, &(name).io_list },		\
+	.io_task      = NULL }
+
+#define DECLARE_IO_MARKER(name)						\
+	ocfs2_io_marker name = __IOMARKER_INITIALIZER(name)
+
+static inline void ocfs2_init_io_marker(ocfs2_io_marker *task)
+{
+	INIT_LIST_HEAD(&task->io_list);
+	task->io_task = NULL;
+}
+
+static inline void ocfs2_add_io_marker(struct inode *inode,
+				       ocfs2_io_marker *task)
+{
+	ocfs_inode_private *oip = OCFS_I(inode);
+
+	task->io_task = current;
+	spin_lock(&oip->ip_lock);
+	list_add(&task->io_list, &oip->ip_io_markers);
+	spin_unlock(&oip->ip_lock);
+}
+
+static inline void ocfs2_del_io_marker(struct inode *inode,
+				       ocfs2_io_marker *task)
+{
+	ocfs_inode_private *oip = OCFS_I(inode);
+
+	spin_lock(&oip->ip_lock);
+	list_del_init(&task->io_list);
+	spin_unlock(&oip->ip_lock);
+}
+
+static inline int ocfs2_is_in_io_marker_list(struct inode *inode,
+					   struct task_struct *task)
+{
+	int ret = 0;
+	ocfs_inode_private *oip = OCFS_I(inode);
+	struct list_head *p;
+	ocfs2_io_marker *tmp;
+
+	spin_lock(&oip->ip_lock);
+	list_for_each(p, &oip->ip_io_markers) {
+		tmp = list_entry(p, ocfs2_io_marker, io_list);
+		if (tmp->io_task == task) {
+			ret = 1;
+			break;
+		}
+	}
+	spin_unlock(&oip->ip_lock);
+
+	return ret;
+}
+
+typedef struct _ocfs2_backing_inode {
+	struct rb_node           ba_node;
+	struct inode            *ba_inode;
+	int                      ba_locked;
+	struct _ocfs2_io_marker  ba_task;
+} ocfs2_backing_inode;
+
+/* Used to manage the locks taken during I/O. */
+typedef struct _ocfs2_buffer_lock_ctxt {
+	/* target flags */
+	unsigned                b_lock_direct:1,
+				b_lock_meta_write:1,
+				b_lock_data_write:1;
+	struct rb_root          b_inodes;
+	ocfs2_backing_inode    *b_target;
+	ocfs2_backing_inode    *b_head;
+} ocfs2_buffer_lock_ctxt;
+
+#define __BUFFERLOCK_INITIALIZER(name) {				\
+	.b_inodes               = RB_ROOT,				\
+	.b_target               = NULL,					\
+	.b_head			= NULL }
+
+#define DECLARE_BUFFER_LOCK_CTXT(name)					\
+	ocfs2_buffer_lock_ctxt name = __BUFFERLOCK_INITIALIZER(name)
+
+int ocfs2_setup_io_locks(struct super_block *sb,
+			 struct inode *target_inode,
+			 char *buf,
+			 size_t size,
+			 ocfs2_buffer_lock_ctxt *ctxt);
+
+int ocfs2_lock_buffer_inodes(ocfs2_buffer_lock_ctxt *ctxt,
+			     struct inode *last_inode);
+
+void ocfs2_unlock_buffer_inodes(struct _ocfs2_buffer_lock_ctxt *ctxt);
+
+static inline int ocfs2_buffer_lock_is_target(ocfs2_buffer_lock_ctxt *ctxt,
+					      struct inode *inode)
+{
+	if (!ctxt->b_target)
+		return 0;
+	return inode == ctxt->b_target->ba_inode;
+}
+
+#endif  /* OCFS2_MMAP_H */

Modified: trunk/src/ocfs.h
===================================================================
--- trunk/src/ocfs.h	2004-12-14 23:29:09 UTC (rev 1703)
+++ trunk/src/ocfs.h	2004-12-15 00:08:28 UTC (rev 1704)
@@ -208,6 +208,7 @@
 	u32		ip_clusters;
 	u64		ip_mmu_private;
 	struct ocfs2_extent_map ip_map;
+	struct list_head ip_io_markers;
 
 	struct semaphore  ip_io_sem;
 

Added: trunk/src/ocfs_compat.c
===================================================================
--- trunk/src/ocfs_compat.c	2004-12-14 23:29:09 UTC (rev 1703)
+++ trunk/src/ocfs_compat.c	2004-12-15 00:08:28 UTC (rev 1704)
@@ -0,0 +1,123 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs_compat.c
+ *
+ * Compatibility stuff for 2.4
+ *
+ * Supplies the rbtree traversal helpers that older kernels lack.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include "ocfs_compat.h"
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/uio.h>
+#include <linux/rbtree.h>
+
+#include "ocfs_log.h"
+#include "ocfs.h"
+#include "ocfs2.h"
+
+#define OCFS_DEBUG_CONTEXT OCFS_DEBUG_CONTEXT_COMPAT
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+/*
+ * This function returns the first node (in sort order) of the tree.
+ */
+struct rb_node *rb_first(struct rb_root *root)
+{
+	struct rb_node	*n;
+
+#undef rb_node
+	n = root->rb_node;
+#define rb_node rb_node_s  /* I HATE YOU 2.4 */
+	if (!n)
+		return 0;
+	while (n->rb_left)
+		n = n->rb_left;
+	return n;
+}
+
+struct rb_node *rb_next(struct rb_node *node)
+{
+	/* If we have a right-hand child, go down and then left as far
+	   as we can. */
+	if (node->rb_right) {
+		node = node->rb_right; 
+		while (node->rb_left)
+			node=node->rb_left;
+		return node;
+	}
+
+	/* No right-hand children.  Everything down and left is
+	   smaller than us, so any 'next' node must be in the general
+	   direction of our parent. Go up the tree; any time the
+	   ancestor is a right-hand child of its parent, keep going
+	   up. First time it's a left-hand child of its parent, said
+	   parent is our 'next' node. */
+	while (node->rb_parent && node == node->rb_parent->rb_right)
+		node = node->rb_parent;
+
+	return node->rb_parent;
+}
+
+struct rb_node *rb_prev(struct rb_node *node)
+{
+	/* If we have a left-hand child, go down and then right as far
+	   as we can. */
+	if (node->rb_left) {
+		node = node->rb_left; 
+		while (node->rb_right)
+			node=node->rb_right;
+		return node;
+	}
+
+	/* No left-hand children. Go up till we find an ancestor which
+	   is a right-hand child of its parent */
+	while (node->rb_parent && node == node->rb_parent->rb_left)
+		node = node->rb_parent;
+
+	return node->rb_parent;
+}
+#endif  /* LINUX_VERSION_CODE */
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9)
+struct rb_node *rb_last(struct rb_root *root)
+{
+	struct rb_node	*n;
+
+#ifdef rb_node
+#undef rb_node
+	n = root->rb_node;
+#define rb_node rb_node_s
+#else
+	n = root->rb_node;
+#endif
+	if (!n)
+		return NULL;
+	while (n->rb_right)
+		n = n->rb_right;
+	return n;
+}
+#endif  /* LINUX_VERSION_CODE */

Modified: trunk/src/ocfs_compat.h
===================================================================
--- trunk/src/ocfs_compat.h	2004-12-14 23:29:09 UTC (rev 1703)
+++ trunk/src/ocfs_compat.h	2004-12-15 00:08:28 UTC (rev 1704)
@@ -62,6 +62,9 @@
 
 #define rb_root rb_root_s
 #define rb_node rb_node_s
+struct rb_node *rb_first(struct rb_root *root);
+struct rb_node *rb_next(struct rb_node *node);
+struct rb_node *rb_prev(struct rb_node *node);
 
 typedef long sector_t;
 
@@ -190,5 +193,8 @@
         (type *)( (char *)__mptr - offsetof(type,member) );})
 #endif
 
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9)
+struct rb_node *rb_last(struct rb_root *root);
+#endif
 #endif  /* OCFS_COMPAT_H */
 

Modified: trunk/src/ocfs_log.h
===================================================================
--- trunk/src/ocfs_log.h	2004-12-14 23:29:09 UTC (rev 1703)
+++ trunk/src/ocfs_log.h	2004-12-15 00:08:28 UTC (rev 1704)
@@ -115,14 +115,14 @@
 #define OCFS_DEBUG_CONTEXT_VOLCFG      0x00008000	/* volcfg.c   */
 #define OCFS_DEBUG_CONTEXT_DCACHE      0x00010000	/* dcache.c   */
 #define OCFS_DEBUG_CONTEXT_DLMGLUE     0x00020000	/* dlmglue.c  */
-#define OCFS_DEBUG_CONTEXT_HASH        0x00040000	/* hash.c     */
+#define OCFS_DEBUG_CONTEXT_COMPAT      0x00040000	/*            */
 #define OCFS_DEBUG_CONTEXT_IO          0x00080000	/* io.c       */
 #define OCFS_DEBUG_CONTEXT_NAMEI       0x00100000	/* namei.c    */
 #define OCFS_DEBUG_CONTEXT_OSB         0x00200000	/* osb.c      */
 #define OCFS_DEBUG_CONTEXT_SUPER       0x00400000	/* super.c    */
 #define OCFS_DEBUG_CONTEXT_UTIL        0x00800000	/* util.c     */
-#define OCFS_DEBUG_CONTEXT_UNUSED3     0x01000000	/*            */
-#define OCFS_DEBUG_CONTEXT_UNUSED4    0x02000000	/*            */
+#define OCFS_DEBUG_CONTEXT_MMAP        0x01000000	/* mmap.c     */
+#define OCFS_DEBUG_CONTEXT_UNUSED4     0x02000000	/*            */
 
 
 #ifdef OCFS_DBG_TIMING