[Ocfs2-commits] mfasheh commits r2425 - in trunk/fs/ocfs2: . cluster

Tue Jun 28 16:07:40 CDT 2005

Author: mfasheh
Signed-off-by: jlbec
Date: 2005-06-23 21:30:36 -0500 (Thu, 23 Jun 2005)
New Revision: 2425

Added:
   trunk/fs/ocfs2/uptodate.c
   trunk/fs/ocfs2/uptodate.h
Removed:
   trunk/fs/ocfs2/seqnum.c
   trunk/fs/ocfs2/seqnum.h
Modified:
   trunk/fs/ocfs2/Makefile
   trunk/fs/ocfs2/alloc.c
   trunk/fs/ocfs2/buffer_head_io.c
   trunk/fs/ocfs2/cluster/masklog.c
   trunk/fs/ocfs2/cluster/masklog.h
   trunk/fs/ocfs2/dir.c
   trunk/fs/ocfs2/dlmglue.c
   trunk/fs/ocfs2/inode.c
   trunk/fs/ocfs2/inode.h
   trunk/fs/ocfs2/namei.c
   trunk/fs/ocfs2/ocfs2.h
   trunk/fs/ocfs2/suballoc.c
   trunk/fs/ocfs2/super.c
Log:
* New metadata caching scheme. We were running out of bits on b_state with
  which to hold any useful information so this will take its place.

Signed-off-by: jlbec



Modified: trunk/fs/ocfs2/Makefile
===================================================================

--- trunk/fs/ocfs2/Makefile	2005-06-24 02:15:41 UTC (rev 2424)
+++ trunk/fs/ocfs2/Makefile	2005-06-24 02:30:36 UTC (rev 2425)
@@ -53,12 +53,12 @@
 	mmap.c 			\
 	namei.c 		\
 	proc.c 			\
-	seqnum.c 		\
 	slot_map.c 		\
 	suballoc.c 		\
 	super.c 		\
 	symlink.c 		\
 	sysfile.c 		\
+	uptodate.c		\
 	ver.c 			\
 	vote.c
 
@@ -82,12 +82,12 @@
 	mmap.h			\
 	namei.h			\
 	proc.h			\
-	seqnum.h 		\
 	slot_map.h		\
 	suballoc.h		\
 	super.h			\
 	symlink.h		\
 	sysfile.h		\
+	uptodate.h		\
 	ver.h			\
 	vote.h
 

Modified: trunk/fs/ocfs2/alloc.c
===================================================================
--- trunk/fs/ocfs2/alloc.c	2005-06-24 02:15:41 UTC (rev 2424)
+++ trunk/fs/ocfs2/alloc.c	2005-06-24 02:30:36 UTC (rev 2425)
@@ -39,11 +39,11 @@
 #include "inode.h"
 #include "journal.h"
 #include "localalloc.h"
-#include "seqnum.h"
 #include "suballoc.h"
 #include "sysfile.h"
 #include "file.h"
 #include "super.h"
+#include "uptodate.h"
 
 #include "buffer_head_io.h"
 
@@ -181,8 +181,7 @@
 				mlog_errno(status);
 				goto bail;
 			}
-			set_buffer_uptodate(bhs[i]);
-			ocfs2_set_bh_sequence(inode, bhs[i]);
+			ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
 
 			status = ocfs2_journal_access(handle, inode, bhs[i],
 						      OCFS2_JOURNAL_ACCESS_CREATE);
@@ -1612,6 +1611,9 @@
 
 		if (!el->l_next_free_rec) {
 			mlog(0, "deleting this extent block.\n");
+
+			ocfs2_remove_from_cache(inode, eb_bh);
+
 			OCFS2_ASSERT(!eb->h_suballoc_slot);
 			OCFS2_ASSERT(!el->l_recs[0].e_clusters);
 			OCFS2_ASSERT(!el->l_recs[0].e_cpos);

Modified: trunk/fs/ocfs2/buffer_head_io.c
===================================================================
--- trunk/fs/ocfs2/buffer_head_io.c	2005-06-24 02:15:41 UTC (rev 2424)
+++ trunk/fs/ocfs2/buffer_head_io.c	2005-06-24 02:30:36 UTC (rev 2425)
@@ -35,7 +35,7 @@
 #include "alloc.h"
 #include "inode.h"
 #include "journal.h"
-#include "seqnum.h"
+#include "uptodate.h"
 
 #include "buffer_head_io.h"
 
@@ -112,12 +112,10 @@
 	for (i = (nr - 1) ; i >= 0; i--) {
 		bh = bhs[i];
 
-		wait_on_buffer(bh);
-
 		if (inode)
-			ocfs2_set_bh_sequence(inode, bh);
-		else
-			ocfs2_clear_bh_sequence(bh);
+			ocfs2_set_buffer_uptodate(inode, bh);
+
+		wait_on_buffer(bh);
 	}
 	if (inode)
 		up(&OCFS2_I(inode)->ip_io_sem);
@@ -161,6 +159,9 @@
 
 	sb = osb->sb;
 
+	if (flags & OCFS2_BH_CACHED && !inode)
+		flags &= ~OCFS2_BH_CACHED;
+
 	if (inode)
 		down(&OCFS2_I(inode)->ip_io_sem);
 	for (i = 0 ; i < nr ; i++) {
@@ -177,18 +178,17 @@
 		bh = bhs[i];
 		ignore_cache = 0;
 
-		if (flags & OCFS2_BH_CACHED && inode && 
-		    !ocfs2_test_bh_sequence(inode, bh)) {
-			mlog(ML_SEQNUM, "bh (%llu) seqnum does not match "
-			     "inode %"MLFu64"\n",
+		if (flags & OCFS2_BH_CACHED &&
+		    !ocfs2_buffer_uptodate(inode, bh)) {
+			mlog(ML_UPTODATE,
+			     "bh (%llu), inode %"MLFu64" not uptodate\n",
 			     (unsigned long long)bh->b_blocknr,
 			     OCFS2_I(inode)->ip_blkno);
 			ignore_cache = 1;
 		}
 
-		if ((flags & OCFS2_BH_CACHED) && (!buffer_uptodate(bh)))
-			ignore_cache = 1;
-
+		/* XXX: Can we ever get this and *not* have the cached
+		 * flag set? */
 		if (buffer_jbd(bh)) {
 			if (!(flags & OCFS2_BH_CACHED) || ignore_cache)
 				mlog(ML_BH_IO, "trying to sync read a jbd "
@@ -234,6 +234,9 @@
 	for (i = (nr - 1); i >= 0; i--) {
 		bh = bhs[i];
 
+		if (inode)
+			ocfs2_set_buffer_uptodate(inode, bh);
+
 		/* We know this can't have changed as we hold the
 		 * inode sem. Avoid doing any work on the bh as the
 		 * journal has it now. */
@@ -241,11 +244,6 @@
 			continue;
 
 		wait_on_buffer(bh);
-
-		if (inode)
-			ocfs2_set_bh_sequence(inode, bh);
-		else
-			ocfs2_clear_bh_sequence(bh);
 	}
 	if (inode)
 		up(&OCFS2_I(inode)->ip_io_sem);

Modified: trunk/fs/ocfs2/cluster/masklog.c
===================================================================
--- trunk/fs/ocfs2/cluster/masklog.c	2005-06-24 02:15:41 UTC (rev 2424)
+++ trunk/fs/ocfs2/cluster/masklog.c	2005-06-24 02:30:36 UTC (rev 2425)
@@ -204,7 +204,7 @@
 	set_a_string(EXTENT_MAP);
 	set_a_string(DLM_GLUE);
 	set_a_string(BH_IO);
-	set_a_string(SEQNUM);
+	set_a_string(UPTODATE);
 	set_a_string(NAMEI);
 	set_a_string(INODE);
 	set_a_string(VOTE);

Modified: trunk/fs/ocfs2/cluster/masklog.h
===================================================================
--- trunk/fs/ocfs2/cluster/masklog.h	2005-06-24 02:15:41 UTC (rev 2424)
+++ trunk/fs/ocfs2/cluster/masklog.h	2005-06-24 02:30:36 UTC (rev 2425)
@@ -101,7 +101,7 @@
 #define ML_EXTENT_MAP	0x0000000000040000ULL /* ocfs2 extent map caching */
 #define ML_DLM_GLUE	0x0000000000080000ULL /* ocfs2 dlm glue layer */
 #define ML_BH_IO	0x0000000000100000ULL /* ocfs2 buffer I/O */
-#define ML_SEQNUM	0x0000000000200000ULL /* ocfs2 caching sequence #'s */
+#define ML_UPTODATE	0x0000000000200000ULL /* ocfs2 caching sequence #'s */
 #define ML_NAMEI	0x0000000000400000ULL /* ocfs2 directory / namespace */
 #define ML_INODE	0x0000000000800000ULL /* ocfs2 inode manipulation */
 #define ML_VOTE		0x0000000001000000ULL /* ocfs2 node messaging  */

Modified: trunk/fs/ocfs2/dir.c
===================================================================
--- trunk/fs/ocfs2/dir.c	2005-06-24 02:15:41 UTC (rev 2424)
+++ trunk/fs/ocfs2/dir.c	2005-06-24 02:30:36 UTC (rev 2425)
@@ -54,8 +54,8 @@
 #include "inode.h"
 #include "journal.h"
 #include "namei.h"
-#include "seqnum.h"
 #include "suballoc.h"
+#include "uptodate.h"
 
 #include "buffer_head_io.h"
 
@@ -464,8 +464,8 @@
 		goto bail;
 	}
 
-	set_buffer_uptodate(new_bh);
-	ocfs2_set_bh_sequence(dir, new_bh);
+	ocfs2_set_new_buffer_uptodate(dir, new_bh);
+
 	status = ocfs2_journal_access(handle, dir, new_bh, 
 				      OCFS2_JOURNAL_ACCESS_CREATE);
 	if (status < 0) {

Modified: trunk/fs/ocfs2/dlmglue.c
===================================================================
--- trunk/fs/ocfs2/dlmglue.c	2005-06-24 02:15:41 UTC (rev 2424)
+++ trunk/fs/ocfs2/dlmglue.c	2005-06-24 02:30:36 UTC (rev 2425)
@@ -49,8 +49,8 @@
 #include "heartbeat.h"
 #include "inode.h"
 #include "journal.h"
-#include "seqnum.h"
 #include "slot_map.h"
+#include "uptodate.h"
 #include "vote.h"
 
 #include "buffer_head_io.h"
@@ -171,8 +171,6 @@
 static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres);
 static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres);
 static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, int level);
-static inline void ocfs2_handle_meta_convert_action(struct inode *inode,
-						    struct ocfs2_lock_res *lockres);
 static void ocfs2_schedule_blocked_lock(ocfs2_super *osb,
 					struct ocfs2_lock_res *lockres);
 static void ocfs2_schedule_blocked_inode_lock(struct inode *inode,
@@ -519,19 +517,7 @@
 
 	lockres->l_level = lockres->l_requested;
 	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
-	mlog_exit_void();
-}
 
-static inline void ocfs2_handle_meta_convert_action(struct inode *inode,
-						    struct ocfs2_lock_res *lockres)
-{
-	mlog_entry_void();
-
-	/* generic_handle_convert_action will set the refresh flag for us. */
-	if (lockres->l_level == LKM_NLMODE)
-		ocfs2_inc_inode_sequence(inode);
-	ocfs2_generic_handle_convert_action(lockres);
-
 	mlog_exit_void();
 }
 
@@ -584,19 +570,11 @@
 
 	switch(lockres->l_action) {
 	case OCFS2_AST_ATTACH:
-		if (lockres->l_type == OCFS2_LOCK_TYPE_META &&
-		    lockres->l_requested > LKM_NLMODE &&
-		    !(lockres->l_flags & OCFS2_LOCK_LOCAL))
-			ocfs2_inc_inode_sequence(inode);
-
 		ocfs2_generic_handle_attach_action(lockres);
 		lockres_clear_flags(lockres, OCFS2_LOCK_LOCAL);
 		break;
 	case OCFS2_AST_CONVERT:
-		if (lockres->l_type == OCFS2_LOCK_TYPE_META)
-			ocfs2_handle_meta_convert_action(inode, lockres);
-		else
-			ocfs2_generic_handle_convert_action(lockres);
+		ocfs2_generic_handle_convert_action(lockres);
 		break;
 	case OCFS2_AST_DOWNCONVERT:
 		ocfs2_generic_handle_downconvert_action(lockres);
@@ -1480,6 +1458,10 @@
 	if (!ocfs2_should_refresh_lock_res(lockres))
 		goto bail;
 
+	/* This will discard any caching information we might have had
+	 * for the inode metadata. */
+	ocfs2_metadata_cache_purge(inode);
+
 	if (ocfs2_lvb_is_trustable(lockres)) {
 		/* yay, fastpath! */
 		ocfs2_meta_lvb_get_trunc_clusters(lockres,

Modified: trunk/fs/ocfs2/inode.c
===================================================================
--- trunk/fs/ocfs2/inode.c	2005-06-24 02:15:41 UTC (rev 2424)
+++ trunk/fs/ocfs2/inode.c	2005-06-24 02:30:36 UTC (rev 2425)
@@ -44,11 +44,11 @@
 #include "inode.h"
 #include "journal.h"
 #include "namei.h"
-#include "seqnum.h"
 #include "suballoc.h"
 #include "super.h"
 #include "symlink.h"
 #include "sysfile.h"
+#include "uptodate.h"
 #include "vote.h"
 
 #include "buffer_head_io.h"
@@ -232,7 +232,7 @@
 	inode->i_mode = fe->i_mode;
 	inode->i_uid = fe->i_uid;
 	inode->i_gid = fe->i_gid;
-	inode->i_blksize = (u32)osb->s_clustersize;	// sb->s_blocksize;
+	inode->i_blksize = (u32)osb->s_clustersize;
 
 	/* Fast symlinks will have i_size but no allocated clusters. */
 	if (S_ISLNK(inode->i_mode) && !fe->i_clusters)
@@ -664,6 +664,8 @@
 		goto bail_unblock;
 	}
 
+	ocfs2_remove_from_cache(inode, fe_bh);
+
 	status = ocfs2_free_dinode(handle, inode_alloc_inode,
 				   inode_alloc_bh, fe);
 	if (status < 0) {
@@ -737,6 +739,16 @@
 	ocfs2_lock_res_free(&oi->ip_meta_lockres);
 	ocfs2_lock_res_free(&oi->ip_data_lockres);
 
+	ocfs2_metadata_cache_purge(inode);
+
+	mlog_bug_on_msg(oi->ip_metadata_cache.ci_num_cached,
+			"Clear inode of %"MLFu64", inode has %u cache items\n",
+			oi->ip_blkno, oi->ip_metadata_cache.ci_num_cached);
+
+	mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE),
+			"Clear inode of %"MLFu64", inode has a bad flag\n",
+			oi->ip_blkno);
+
 	mlog_bug_on_msg(spin_is_locked(&oi->ip_lock),
 			"Clear inode of %"MLFu64", inode is locked\n",
 			oi->ip_blkno);
@@ -765,8 +777,8 @@
 			"Clear inode of %"MLFu64" has non empty handle pointer\n",
 			oi->ip_blkno);
 
-	oi->ip_clean_buffer_seq = OCFS2_CLEAN_SEQ_CLEAR;
-	oi->ip_flags = 0;
+	/* Clear all other flags. */
+	oi->ip_flags = OCFS2_INODE_CACHE_INLINE;
 	oi->ip_created_trans = 0;
 	oi->ip_last_trans = 0;
 	oi->ip_dir_start_lookup = 0;

Modified: trunk/fs/ocfs2/inode.h
===================================================================
--- trunk/fs/ocfs2/inode.h	2005-06-24 02:15:41 UTC (rev 2424)
+++ trunk/fs/ocfs2/inode.h	2005-06-24 02:30:36 UTC (rev 2425)
@@ -55,7 +55,6 @@
 	struct list_head	ip_handle_list;
 	ocfs2_journal_handle	*ip_handle;
 
-	unsigned long		ip_clean_buffer_seq;
 	u32			ip_flags; /* see below */
 
 	/* protected by recovery_lock. */
@@ -69,6 +68,8 @@
 	/* last transaction we were a part of. */
 	unsigned long		ip_last_trans;
 
+	struct ocfs2_caching_info	ip_metadata_cache;
+
 	struct inode		vfs_inode;
 };
 
@@ -83,8 +84,7 @@
 #define OCFS2_INODE_DELETED		0x00000008
 /* Another node is deleting, so our delete is a nop */
 #define OCFS2_INODE_SKIP_DELETE		0x00000010
-/* 
- * Has the inode been orphaned on another node? 
+/* Has the inode been orphaned on another node? 
  *
  * This hints to ocfs2_drop_inode that it should clear i_nlink before
  * continuing.
@@ -101,6 +101,8 @@
 #define OCFS2_INODE_MAYBE_ORPHANED	0x00000020
 /* Does someone have the file open O_DIRECT */
 #define OCFS2_INODE_OPEN_DIRECT		0x00000040
+/* Indicates that the metadata cache should be used as an array. */
+#define OCFS2_INODE_CACHE_INLINE	0x00000080
 
 static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
 {

Modified: trunk/fs/ocfs2/namei.c
===================================================================
--- trunk/fs/ocfs2/namei.c	2005-06-24 02:15:41 UTC (rev 2424)
+++ trunk/fs/ocfs2/namei.c	2005-06-24 02:30:36 UTC (rev 2425)
@@ -55,10 +55,10 @@
 #include "inode.h"
 #include "journal.h"
 #include "namei.h"
-#include "seqnum.h"
 #include "suballoc.h"
 #include "symlink.h"
 #include "sysfile.h"
+#include "uptodate.h"
 #include "vote.h"
 
 #include "buffer_head_io.h"
@@ -280,8 +280,8 @@
 		goto bail;
 	}
 
-	set_buffer_uptodate(new_bh);
-	ocfs2_set_bh_sequence(inode, new_bh);
+	ocfs2_set_new_buffer_uptodate(inode, new_bh);
+
 	status = ocfs2_journal_access(handle, inode, new_bh, 
 				      OCFS2_JOURNAL_ACCESS_CREATE);
 	if (status < 0) {
@@ -552,11 +552,8 @@
 		mlog_errno(status);
 		goto leave;
 	}
-	set_buffer_uptodate(*new_fe_bh);
+	ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh);
 
-	ocfs2_new_inode_sequence(inode);
-	ocfs2_set_bh_sequence(inode, *new_fe_bh);
-
 	status = ocfs2_journal_access(handle, inode, *new_fe_bh,
 				      OCFS2_JOURNAL_ACCESS_CREATE);
 	if (status < 0) {
@@ -1526,9 +1523,8 @@
 			mlog_errno(status);
 			goto bail;
 		}
+		ocfs2_set_new_buffer_uptodate(inode, bhs[virtual]);
 
-		set_buffer_uptodate(bhs[virtual]);
-		ocfs2_set_bh_sequence(inode, bhs[virtual]);
 		status = ocfs2_journal_access(handle, inode, bhs[virtual], 
 					      OCFS2_JOURNAL_ACCESS_CREATE);
 		if (status < 0) {

Modified: trunk/fs/ocfs2/ocfs2.h
===================================================================
--- trunk/fs/ocfs2/ocfs2.h	2005-06-24 02:15:41 UTC (rev 2424)
+++ trunk/fs/ocfs2/ocfs2.h	2005-06-24 02:30:36 UTC (rev 2425)
@@ -47,6 +47,21 @@
 	struct rb_root	em_extents;
 };
 
+/* Most user visible OCFS2 inodes will have very few pieces of
+ * metadata, but larger files (including bitmaps, etc) must be taken
+ * into account when designing an access scheme. We allow a small
+ * amount of inlined blocks to be stored on an array and grow the
+ * structure into a rb tree when necessary. */
+#define OCFS2_INODE_MAX_CACHE_ARRAY 2
+
+struct ocfs2_caching_info {
+	unsigned int		ci_num_cached;
+	union {
+		sector_t	ci_array[OCFS2_INODE_MAX_CACHE_ARRAY];
+		struct rb_root	ci_tree;
+	} ci_cache;
+};
+
 /* this limits us to 256 nodes
  * if we need more, we can do a kmalloc for the map */
 #define OCFS2_NODE_MAP_MAX_NODES    256
@@ -211,7 +226,6 @@
 	wait_queue_head_t checkpoint_event;
 	atomic_t needs_checkpoint;
 	struct _ocfs2_journal *journal;
-	unsigned long osb_clean_buffer_seq;
 
 	enum ocfs2_local_alloc_state local_alloc_state;
 	struct buffer_head *local_alloc_bh;

Deleted: trunk/fs/ocfs2/seqnum.c
===================================================================
--- trunk/fs/ocfs2/seqnum.c	2005-06-24 02:15:41 UTC (rev 2424)
+++ trunk/fs/ocfs2/seqnum.c	2005-06-24 02:30:36 UTC (rev 2425)
@@ -1,186 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * seqnum.c
- *
- * Caching sequence number handling
- *
- * Standard buffer head caching flags (uptodate, etc) are insufficient
- * in a clustered environment - a buffer may be marked up to date on
- * our local node but could have been modified by another cluster
- * member. As a result an additional (and performant) caching scheme
- * is required. OCFS2 uses sequence numbers, stored on the inode and
- * buffer heads to test whether a buffer needs to be read from disk -
- * when a new cluster lock is aquired on an inode, it's sequence
- * number is incremented. Additionally, a buffer undergoing
- * modification is automatically marked by jbd and we can make liberal
- * use of buffer_jbd to shorcut many checks -- if it's in the journal,
- * then it *must* be up to date as we do not allow a metadata block to
- * be modified by multiple nodes at a time.
- *
- * Copyright (C) 2002, 2004, 2005 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- * 
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- * 
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/fs.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/highmem.h>
-#include <linux/buffer_head.h>
-
-#define MLOG_MASK_PREFIX ML_SEQNUM
-
-#include <cluster/masklog.h>
-
-#include "ocfs2.h"
-
-#include "inode.h"
-#include "seqnum.h"
-
-/* Sequence numbers are stored in 3 places: 
- *
- * bh->b_state - Set from inode sequence. If it differs from the inode
- * sequence, then the buffer is considered out of date.
- *
- * ip_clean_buffer_seq - The "master" number, used to compare against
- * buffer heads.
- *
- * osb_clean_buffer_seq - A globally incrementing value, we set inode
- * sequence numbers to it. 
- */
-
-/* Number of BH bits used up through JBD. The number of unused bits
- * determine the maximum size of our sequence numbers. */
-#define OCFS2_USED_BH_BITS	22
-#define OCFS2_MAX_BH_BITS	(8 * sizeof(((struct buffer_head *) 0)->b_state))
-#define OCFS2_STATE_BH_BITS	(OCFS2_MAX_BH_BITS - OCFS2_USED_BH_BITS)
-
-#define OCFS2_MAX_SEQUENCE	(1UL << OCFS2_STATE_BH_BITS)
-#define OCFS2_MAX_SEQUENCE_MASK	((1UL << OCFS2_STATE_BH_BITS) - 1)
-#define OCFS2_SEQUENCE_MASK	((~0UL) << OCFS2_USED_BH_BITS)
-
-static spinlock_t ocfs2_clean_buffer_lock = SPIN_LOCK_UNLOCKED;
-
-static void __ocfs2_inc_inode_sequence(struct inode *inode,
-				       int inc_global)
-{
-	ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct ocfs2_inode_info *oi = OCFS2_I(inode);
-
-	assert_spin_locked(&ocfs2_clean_buffer_lock);
-
-	/* Though it's not currently likely, we want to handle the
-	 * case where the first inode to come through here is a newly
-	 * created one - hence the check for OCFS2_CLEAN_SEQ_CLEAR */
-	if (inc_global ||
-	    osb->osb_clean_buffer_seq == OCFS2_CLEAN_SEQ_CLEAR) {
-		osb->osb_clean_buffer_seq++;
-
-		/* We are careful not to wrap the osb sequence */
-		if (osb->osb_clean_buffer_seq >= OCFS2_MAX_SEQUENCE)
-			osb->osb_clean_buffer_seq = OCFS2_CLEAN_SEQ_START;
-	}
-
-	/* Set this from the current osb sequence number. This helps
-	 * prevent inode sequence numbers from repeating themselves
-	 * too soon. */
-	oi->ip_clean_buffer_seq = osb->osb_clean_buffer_seq;
-
-	mlog(0, "(%u) Inode %"MLFu64", seq: %lu\n", current->pid,
-	     OCFS2_I(inode)->ip_blkno, oi->ip_clean_buffer_seq);
-}
-
-void ocfs2_inc_inode_sequence(struct inode *inode)
-{
-	spin_lock(&ocfs2_clean_buffer_lock);
-	__ocfs2_inc_inode_sequence(inode, 1);
-	spin_unlock(&ocfs2_clean_buffer_lock);
-}
-
-/* Called only on newly created inodes. */
-void ocfs2_new_inode_sequence(struct inode *inode)
-{
-	BUG_ON(OCFS2_I(inode)->ip_clean_buffer_seq);
-
-	spin_lock(&ocfs2_clean_buffer_lock);
-	/* We don't increment the global sequence number on brand new
-	 * inodes -- they won't have any existing metadata buffers
-	 * which might be old. */
-	__ocfs2_inc_inode_sequence(inode, 0);
-	spin_unlock(&ocfs2_clean_buffer_lock);
-}
-
-/* Completely point in time, which is fine for buffer_head comparsions
- * as they should be done under cluster lock in which case the
- * sequence won't be incrememnted. */
-static inline unsigned int ocfs2_get_inode_sequence(struct inode *inode)
-{
-	unsigned int ret;
-	struct ocfs2_inode_info *oi = OCFS2_I(inode);
-
-	spin_lock(&ocfs2_clean_buffer_lock);
-	if (oi->ip_clean_buffer_seq >= OCFS2_MAX_SEQUENCE)
-		mlog(ML_ERROR,
-		     "Inode %"MLFu64" has bad sequence: %lu\n",
-		     oi->ip_blkno, oi->ip_clean_buffer_seq);
-
-	ret = oi->ip_clean_buffer_seq & OCFS2_MAX_SEQUENCE_MASK;
-	spin_unlock(&ocfs2_clean_buffer_lock);
-
-	return ret;
-}
-
-/* The sequence numbers on buffer_head are protected by inodes
- * ip_io_sem - this also guards access to that buffer by the journal
- * (via ocfs2_journal_access), so we don't change the buffer state
- * bits at the same time as JBD, which will cause many hard to find
- * problems. Once the buffer has been passed to the journal, we can
- * test that condition via buffer_jbd and avoid changing state
- * bits.
- */
-
-void ocfs2_clear_bh_sequence(struct buffer_head *bh)
-{
-	unsigned int prev = bh->b_state & OCFS2_SEQUENCE_MASK;
-	bh->b_state &= ~prev;
-}
-
-/* For a newly created inode (e.g., one that's in read_locked_inode)
- * it's sequence number will be zero (uninitialized), until a cluster
- * lock is later acquired. That's ok though because we never want to
- * trust buffers for unlocked inodes. 
- *
- * This function is also called against newly allocated inode
- * metadata */
-void ocfs2_set_bh_sequence(struct inode *inode,
-			   struct buffer_head *bh)
-{
-	unsigned int seq =
-		ocfs2_get_inode_sequence(inode) << OCFS2_USED_BH_BITS;
-
-	ocfs2_clear_bh_sequence(bh);
-	bh->b_state |= seq;
-}
-
-int ocfs2_test_bh_sequence(struct inode *inode,
-			   struct buffer_head *bh)
-{
-	unsigned int seq =
-		(bh->b_state & OCFS2_SEQUENCE_MASK) >> OCFS2_USED_BH_BITS;
-
-	return seq == ocfs2_get_inode_sequence(inode);
-}

Deleted: trunk/fs/ocfs2/seqnum.h
===================================================================
--- trunk/fs/ocfs2/seqnum.h	2005-06-24 02:15:41 UTC (rev 2424)
+++ trunk/fs/ocfs2/seqnum.h	2005-06-24 02:30:36 UTC (rev 2425)
@@ -1,41 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * seqnum.h
- *
- * Caching sequence number handling
- *
- * Copyright (C) 2002, 2004, 2005 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- * 
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- * 
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef OCFS2_SEQNUM_H
-#define OCFS2_SEQNUM_H
-
-#define OCFS2_CLEAN_SEQ_CLEAR 0
-#define OCFS2_CLEAN_SEQ_START 1
-
-void ocfs2_inc_inode_sequence(struct inode *inode);
-void ocfs2_new_inode_sequence(struct inode *inode);
-
-void ocfs2_clear_bh_sequence(struct buffer_head *bh);
-void ocfs2_set_bh_sequence(struct inode *inode,
-			   struct buffer_head *bh);
-int ocfs2_test_bh_sequence(struct inode *inode,
-			   struct buffer_head *bh);
-
-#endif /* OCFS2_SEQNUM_H */

Modified: trunk/fs/ocfs2/suballoc.c
===================================================================
--- trunk/fs/ocfs2/suballoc.c	2005-06-24 02:15:41 UTC (rev 2424)
+++ trunk/fs/ocfs2/suballoc.c	2005-06-24 02:30:36 UTC (rev 2425)
@@ -39,9 +39,9 @@
 #include "inode.h"
 #include "journal.h"
 #include "localalloc.h"
-#include "seqnum.h"
 #include "suballoc.h"
 #include "sysfile.h"
+#include "uptodate.h"
 
 #include "buffer_head_io.h"
 
@@ -274,8 +274,7 @@
 		mlog_errno(status);
 		goto bail;
 	}
-	set_buffer_uptodate(bg_bh);
-	ocfs2_set_bh_sequence(alloc_inode, bg_bh);
+	ocfs2_set_new_buffer_uptodate(alloc_inode, bg_bh);
 
 	status = ocfs2_block_group_fill(handle, 
 					alloc_inode, 

Modified: trunk/fs/ocfs2/super.c
===================================================================
--- trunk/fs/ocfs2/super.c	2005-06-24 02:15:41 UTC (rev 2424)
+++ trunk/fs/ocfs2/super.c	2005-06-24 02:30:36 UTC (rev 2425)
@@ -57,10 +57,10 @@
 #include "journal.h"
 #include "localalloc.h"
 #include "proc.h"
-#include "seqnum.h"
 #include "slot_map.h"
 #include "super.h"
 #include "sysfile.h"
+#include "uptodate.h"
 #include "ver.h"
 #include "vote.h"
 
@@ -509,7 +509,12 @@
 	if (init_ocfs2_extent_maps())
 		return -ENOMEM;
 
-	/* Initialize the memory slabs for oin and file entry */
+	status = init_ocfs2_uptodate_cache();
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
 	status = ocfs2_initialize_mem_caches();
 	if (status < 0) {
 		mlog_errno(status);
@@ -532,6 +537,7 @@
 leave:
 	if (status < 0) {
 		ocfs2_free_mem_caches();
+		exit_ocfs2_uptodate_cache();
 		exit_ocfs2_extent_maps();
 	}
 
@@ -561,6 +567,8 @@
 
 	exit_ocfs2_extent_maps();
 
+	exit_ocfs2_uptodate_cache();
+
 	mlog_exit_void();
 }
 
@@ -640,7 +648,6 @@
 	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
 	    SLAB_CTOR_CONSTRUCTOR) {
 		oi->ip_flags = 0;
-		oi->ip_clean_buffer_seq = OCFS2_CLEAN_SEQ_CLEAR;
 		oi->ip_open_count = 0;
 		spin_lock_init(&oi->ip_lock);
 		ocfs2_extent_map_init(&oi->vfs_inode);
@@ -661,6 +668,8 @@
 		ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
 		ocfs2_lock_res_init_once(&oi->ip_data_lockres);
 
+		ocfs2_metadata_cache_init(&oi->vfs_inode);
+
 		inode_init_once(&oi->vfs_inode);
 	}
 }
@@ -1108,7 +1117,6 @@
 
 	init_waitqueue_head(&osb->checkpoint_event);
 	atomic_set(&osb->needs_checkpoint, 0);
-	osb->osb_clean_buffer_seq = OCFS2_CLEAN_SEQ_CLEAR;
 
 	osb->node_num = O2NM_INVALID_NODE_NUM;
 	osb->slot_num = OCFS2_INVALID_SLOT;

Added: trunk/fs/ocfs2/uptodate.c
===================================================================
--- trunk/fs/ocfs2/uptodate.c	2005-06-24 02:15:41 UTC (rev 2424)
+++ trunk/fs/ocfs2/uptodate.c	2005-06-24 02:30:36 UTC (rev 2425)
@@ -0,0 +1,543 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * uptodate.c
+ *
+ * Tracking the up-to-date-ness of a local buffer_head with respect to
+ * the cluster.
+ * 
+ * Copyright (C) 2002, 2004, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Standard buffer head caching flags (uptodate, etc) are insufficient
+ * in a clustered environment - a buffer may be marked up to date on
+ * our local node but could have been modified by another cluster
+ * member. As a result an additional (and performant) caching scheme
+ * is required. A further requirement is that we consume as little
+ * memory as possible - we never pin buffer_head structures in order
+ * to cache them.
+ *
+ * We track the existence of up to date buffers on the inodes which
+ * are associated with them. Because we don't want to pin
+ * buffer_heads, this is only a (strong) hint and several other checks
+ * are made in the I/O path to ensure that we don't use a stale or
+ * invalid buffer without going to disk:
+ *	- buffer_jbd is used liberally - if a bh is in the journal on
+ *	  this node then it *must* be up to date.
+ *	- the standard buffer_uptodate() macro is used to detect buffers
+ *	  which may be invalid (even if we have an up to date tracking 
+ * 	  item for them)
+ * 
+ * For a full understanding of how this code works together, one
+ * should read the callers in dlmglue.c, the I/O functions in
+ * buffer_head_io.c and ocfs2_journal_access in journal.c
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/buffer_head.h>
+#include <linux/rbtree.h>
+
+#define MLOG_MASK_PREFIX ML_UPTODATE
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "inode.h"
+#include "uptodate.h"
+
+struct ocfs2_meta_cache_item {
+	struct rb_node	c_node;
+	sector_t	c_block;
+};
+
+static kmem_cache_t *ocfs2_uptodate_cachep = NULL;
+
+void ocfs2_metadata_cache_init(struct inode *inode)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
+
+	oi->ip_flags |= OCFS2_INODE_CACHE_INLINE;
+	ci->ci_num_cached = 0;
+}
+
+/* No lock taken here as 'root' is not expected to be visible to other
+ * processes. */
+static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root)
+{
+	unsigned int purged = 0;
+	struct rb_node *node;
+	struct ocfs2_meta_cache_item *item;
+
+	while ((node = rb_last(root)) != NULL) {
+		item = rb_entry(node, struct ocfs2_meta_cache_item, c_node);
+
+		mlog(0, "Purge item %llu\n",
+		     (unsigned long long) item->c_block);
+
+		rb_erase(&item->c_node, root);
+		kmem_cache_free(ocfs2_uptodate_cachep, item);
+
+		purged++;
+	}
+	return purged;
+}
+
+/* Called from locking and called from ocfs2_clear_inode. Dump the
+ * cache for a given inode.
+ *
+ * This function is a few more lines longer than necessary due to some
+ * accounting done here, but I think it's worth tracking down those
+ * bugs sooner -- Mark */
+void ocfs2_metadata_cache_purge(struct inode *inode)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	unsigned int tree, to_purge, purged;
+	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
+	struct rb_root root = RB_ROOT;
+
+	spin_lock(&oi->ip_lock);
+	tree = !(oi->ip_flags & OCFS2_INODE_CACHE_INLINE);
+	to_purge = ci->ci_num_cached;
+
+	mlog(0, "Purge %u %s items from Inode %"MLFu64"\n", to_purge,
+	     tree ? "tree" : "array", oi->ip_blkno);
+
+	/* If we're a tree, save off the root so that we can safely
+	 * initialize the cache. We do the work to free tree members
+	 * without the spinlock. */
+	if (tree)
+		root = ci->ci_cache.ci_tree;
+
+	ocfs2_metadata_cache_init(inode);
+	spin_unlock(&oi->ip_lock);
+
+	purged = ocfs2_purge_copied_metadata_tree(&root);
+	/* If possible, track the number wiped so that we can more
+	 * easily detect counting errors. Unfortunately, this is only
+	 * meaningful for trees. */
+	if (tree && purged != to_purge)
+		mlog(ML_ERROR, "Inode %"MLFu64", count = %u, purged = %u\n",
+		     oi->ip_blkno, to_purge, purged);
+}
+
+/* Returns the index in the cache array, -1 if not found. 
+ * Requires ip_lock. */
+static int ocfs2_search_cache_array(struct ocfs2_caching_info *ci,
+				    sector_t item)
+{
+	int i;
+
+	for (i = 0; i < ci->ci_num_cached; i++) {
+		if (item == ci->ci_cache.ci_array[i])
+			return i;
+	}
+
+	return -1;
+}
+
+/* Returns the cache item if found, otherwise NULL.
+ * Requires ip_lock. */
+static struct ocfs2_meta_cache_item *
+ocfs2_search_cache_tree(struct ocfs2_caching_info *ci,
+			sector_t block)
+{
+	struct rb_node * n = ci->ci_cache.ci_tree.rb_node;
+	struct ocfs2_meta_cache_item *item = NULL;
+
+	while (n) {
+		item = rb_entry(n, struct ocfs2_meta_cache_item, c_node);
+
+		if (block < item->c_block)
+			n = n->rb_left;
+		else if (block > item->c_block)
+			n = n->rb_right;
+		else
+			return item;
+	}
+
+	return NULL;
+}
+
+static int ocfs2_buffer_cached(struct ocfs2_inode_info *oi,
+			       struct buffer_head *bh)
+{
+	int index = -1;
+	struct ocfs2_meta_cache_item *item = NULL;
+
+	spin_lock(&oi->ip_lock);
+
+	mlog(0, "Inode %"MLFu64", query block %llu (inline = %u)\n",
+	     oi->ip_blkno, (unsigned long long) bh->b_blocknr,
+	     !!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE));
+
+	if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE)
+		index = ocfs2_search_cache_array(&oi->ip_metadata_cache,
+						 bh->b_blocknr);
+	else
+		item = ocfs2_search_cache_tree(&oi->ip_metadata_cache,
+					       bh->b_blocknr);
+
+	spin_unlock(&oi->ip_lock);
+
+	mlog(0, "index = %d, item = %p\n", index, item);
+
+	return (index != -1) || (item != NULL);
+}
+
+/* Warning: even if it returns true, this does *not* guarantee that
+ * the block is stored in our inode metadata cache. */
+int ocfs2_buffer_uptodate(struct inode *inode,
+			  struct buffer_head *bh)
+{
+	/* Doesn't matter if the bh is in our cache or not -- if it's
+	 * not marked uptodate then we know it can't have correct
+	 * data. */
+	if (!buffer_uptodate(bh))
+		return 0;
+
+	/* OCFS2 does not allow multiple nodes to be changing the same
+	 * block at the same time. */
+	if (buffer_jbd(bh))
+		return 1;
+
+	/* Ok, locally the buffer is marked as up to date, now search
+	 * our cache to see if we can trust that. */
+	return ocfs2_buffer_cached(OCFS2_I(inode), bh);
+}
+
+/* Requires ip_lock */
+static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci,
+				     sector_t block)
+{
+	BUG_ON(ci->ci_num_cached >= OCFS2_INODE_MAX_CACHE_ARRAY);
+
+	mlog(0, "block %llu takes position %u\n", (unsigned long long) block,
+	     ci->ci_num_cached);
+
+	ci->ci_cache.ci_array[ci->ci_num_cached] = block;
+	ci->ci_num_cached++;
+}
+
+/* By now the caller should have checked that the item does *not*
+ * exist in the tree.
+ * Requires ip_lock. */
+static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci,
+				      struct ocfs2_meta_cache_item *new)
+{
+	sector_t block = new->c_block;
+	struct rb_node *parent = NULL;
+	struct rb_node **p = &ci->ci_cache.ci_tree.rb_node;
+	struct ocfs2_meta_cache_item *tmp;
+
+	mlog(0, "Insert block %llu num = %u\n", (unsigned long long) block,
+	     ci->ci_num_cached);
+
+	while(*p) {
+		parent = *p;
+
+		tmp = rb_entry(parent, struct ocfs2_meta_cache_item, c_node);
+
+		if (block < tmp->c_block)
+			p = &(*p)->rb_left;
+		else if (block > tmp->c_block)
+			p = &(*p)->rb_right;
+		else {
+			/* This should never happen! */
+			mlog(ML_ERROR, "Duplicate block %llu cached!\n",
+			     (unsigned long long) block);
+			BUG();
+		}
+	}
+
+	rb_link_node(&new->c_node, parent, p);
+	rb_insert_color(&new->c_node, &ci->ci_cache.ci_tree);
+	ci->ci_num_cached++;
+}
+
+static inline int ocfs2_insert_can_use_array(struct ocfs2_inode_info *oi,
+					     struct ocfs2_caching_info *ci)
+{
+	assert_spin_locked(&oi->ip_lock);
+
+	return (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) &&
+		(ci->ci_num_cached < OCFS2_INODE_MAX_CACHE_ARRAY);
+}
+
+/* tree should be exactly OCFS2_INODE_MAX_CACHE_ARRAY wide. NULL the
+ * pointers in tree after we use them - this allows caller to detect
+ * when to free in case of error. */
+static void ocfs2_expand_cache(struct ocfs2_inode_info *oi,
+			       struct ocfs2_meta_cache_item **tree)
+{
+	int i;
+	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
+
+	mlog_bug_on_msg(ci->ci_num_cached != OCFS2_INODE_MAX_CACHE_ARRAY,
+			"Inode %"MLFu64", num cached = %u, should be %u\n",
+			oi->ip_blkno, ci->ci_num_cached,
+			OCFS2_INODE_MAX_CACHE_ARRAY);
+	mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE),
+			"Inode %"MLFu64" not marked as inline anymore!\n",
+			oi->ip_blkno);
+	assert_spin_locked(&oi->ip_lock);
+
+	/* Be careful to initialize the tree members *first* because
+	 * once the ci_tree is used, the array is junk... */
+	for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++)
+		tree[i]->c_block = ci->ci_cache.ci_array[i];
+
+	oi->ip_flags &= ~OCFS2_INODE_CACHE_INLINE;
+	ci->ci_cache.ci_tree = RB_ROOT;
+	/* this will be set again by __ocfs2_insert_cache_tree */
+	ci->ci_num_cached = 0;
+
+	for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) {
+		__ocfs2_insert_cache_tree(ci, tree[i]);
+		tree[i] = NULL;
+	}
+
+	mlog(0, "Expanded %"MLFu64" to a tree cache: flags 0x%x, num = %u\n",
+	     oi->ip_blkno, oi->ip_flags, ci->ci_num_cached);
+}
+
+/* Slow path - must allocate (GFP_NOFS: insertion runs under ip_io_sem).
+ * See the comment above ocfs2_set_buffer_uptodate for more information. */
+static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi,
+					sector_t block,
+					int expand_tree)
+{
+	int i;
+	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
+	struct ocfs2_meta_cache_item *new = NULL;
+	struct ocfs2_meta_cache_item *tree[OCFS2_INODE_MAX_CACHE_ARRAY] =
+		{ NULL, };
+
+	mlog(0, "Inode %"MLFu64", block %llu, expand = %d\n",
+	     oi->ip_blkno, (unsigned long long) block, expand_tree);
+
+	new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_NOFS);
+	if (!new) {
+		mlog_errno(-ENOMEM);
+		return;
+	}
+	new->c_block = block;
+
+	if (expand_tree) {
+		/* Do *not* allocate an array here - the removal code
+		 * has no way of tracking that. */
+		for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) {
+			tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep,
+						   GFP_NOFS);
+			if (!tree[i]) {
+				mlog_errno(-ENOMEM);
+				goto out_free;
+			}
+
+			/* These are initialized in ocfs2_expand_cache! */
+		}
+	}
+
+	spin_lock(&oi->ip_lock);
+	if (ocfs2_insert_can_use_array(oi, ci)) {
+		mlog(0, "Someone cleared the tree underneath us\n");
+		/* Ok, items were removed from the cache in between
+		 * locks. Detect this and revert back to the fast path */
+		ocfs2_append_cache_array(ci, block);
+		spin_unlock(&oi->ip_lock);
+		goto out_free;
+	}
+
+	if (expand_tree)
+		ocfs2_expand_cache(oi, tree);
+
+	__ocfs2_insert_cache_tree(ci, new);
+	spin_unlock(&oi->ip_lock);
+
+	new = NULL;
+out_free:
+	if (new)
+		kmem_cache_free(ocfs2_uptodate_cachep, new);
+
+	/* If these were used, then ocfs2_expand_cache re-set them to
+	 * NULL for us. */
+	if (tree[0]) {
+		for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++)
+			if (tree[i])
+				kmem_cache_free(ocfs2_uptodate_cachep,
+						tree[i]);
+	}
+}
+
+/* Item insertion is guarded by ip_io_sem, so the insertion path takes
+ * advantage of this by not rechecking for a duplicate insert during
+ * the slow case. Additionally, if the cache needs to be bumped up to
+ * a tree, the code will not recheck after acquiring the lock --
+ * multiple paths cannot be expanding to a tree at the same time.
+ * 
+ * The slow path takes into account that items can be removed
+ * (including the whole tree wiped and reset) when this process is out
+ * allocating memory. In those cases, it reverts back to the fast
+ * path.
+ *
+ * Note that this function may actually fail to insert the block if
+ * memory cannot be allocated. This is not fatal however (but may
+ * result in a performance penalty) */
+void ocfs2_set_buffer_uptodate(struct inode *inode,
+			       struct buffer_head *bh)
+{
+	int expand;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
+
+	/* The block may very well exist in our cache already, so avoid
+	 * doing any more work in that case. */
+	if (ocfs2_buffer_cached(oi, bh))
+		return;
+
+	mlog(0, "Inode %"MLFu64", inserting block %llu\n", oi->ip_blkno,
+	     (unsigned long long) bh->b_blocknr);
+
+	/* No need to recheck under spinlock - insertion is guarded by
+	 * ip_io_sem */
+	spin_lock(&oi->ip_lock);
+	if (ocfs2_insert_can_use_array(oi, ci)) {
+		/* Fast case - it's an array and there's a free
+		 * spot. */
+		ocfs2_append_cache_array(ci, bh->b_blocknr);
+		spin_unlock(&oi->ip_lock);
+		return;
+	}
+
+	expand = 0;
+	if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) {
+		/* We need to bump things up to a tree. */
+		expand = 1;
+	}
+	spin_unlock(&oi->ip_lock);
+
+	__ocfs2_set_buffer_uptodate(oi, bh->b_blocknr, expand);
+}
+
+/* Called against a newly allocated buffer. Most likely nobody should
+ * be able to read this sort of metadata while it's still being
+ * allocated, but this is careful to take ip_io_sem anyway. */
+void ocfs2_set_new_buffer_uptodate(struct inode *inode,
+				   struct buffer_head *bh)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	/* This should definitely *not* exist in our cache */
+	BUG_ON(ocfs2_buffer_cached(oi, bh));
+
+	set_buffer_uptodate(bh);
+
+	down(&oi->ip_io_sem);
+	ocfs2_set_buffer_uptodate(inode, bh);
+	up(&oi->ip_io_sem);
+}
+
+/* Requires ip_lock. */
+static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci,
+					int index)
+{
+	sector_t *array = ci->ci_cache.ci_array;
+	int bytes;
+
+	BUG_ON(index < 0 || index >= OCFS2_INODE_MAX_CACHE_ARRAY);
+	BUG_ON(index >= ci->ci_num_cached);
+	BUG_ON(!ci->ci_num_cached);
+
+	mlog(0, "remove index %d (num_cached = %u)\n", index,
+	     ci->ci_num_cached);
+
+	ci->ci_num_cached--;
+
+	/* don't need to copy if the array is now empty, or if we
+	 * removed at the tail */
+	if (ci->ci_num_cached && index < ci->ci_num_cached) {
+		bytes = sizeof(sector_t) * (ci->ci_num_cached - index);
+		memmove(&array[index], &array[index + 1], bytes);
+	}
+}
+
+/* Requires ip_lock. */
+static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci,
+				       struct ocfs2_meta_cache_item *item)
+{
+	mlog(0, "remove block %llu from tree\n",
+	     (unsigned long long) item->c_block);
+
+	rb_erase(&item->c_node, &ci->ci_cache.ci_tree);
+	ci->ci_num_cached--;
+}
+
+/* Called when we remove a chunk of metadata from an inode. We don't
+ * bother reverting things to an inlined array in the case of a remove
+ * which moves us back under the limit. */
+void ocfs2_remove_from_cache(struct inode *inode,
+			     struct buffer_head *bh)
+{
+	int index;
+	sector_t block = bh->b_blocknr;
+	struct ocfs2_meta_cache_item *item = NULL;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
+
+	spin_lock(&oi->ip_lock);
+	mlog(0, "Inode %"MLFu64", remove %llu, items = %u, array = %u\n",
+	     oi->ip_blkno, (unsigned long long) block, ci->ci_num_cached,
+	     !!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE));
+
+	if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) {
+		index = ocfs2_search_cache_array(ci, block);
+		if (index != -1)
+			ocfs2_remove_metadata_array(ci, index);
+	} else {
+		item = ocfs2_search_cache_tree(ci, block);
+		if (item)
+			ocfs2_remove_metadata_tree(ci, item);
+	}
+	spin_unlock(&oi->ip_lock);
+
+	if (item)
+		kmem_cache_free(ocfs2_uptodate_cachep, item);
+}
+
+int __init init_ocfs2_uptodate_cache(void)
+{
+	ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate",
+				  sizeof(struct ocfs2_meta_cache_item),
+				  0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (!ocfs2_uptodate_cachep)
+		return -ENOMEM;
+
+	mlog(0, "%u inlined cache items per inode.\n",
+	     OCFS2_INODE_MAX_CACHE_ARRAY);
+
+	return 0;
+}
+
+void __exit exit_ocfs2_uptodate_cache(void)
+{
+	if (ocfs2_uptodate_cachep)
+		kmem_cache_destroy(ocfs2_uptodate_cachep);
+}

Added: trunk/fs/ocfs2/uptodate.h
===================================================================
--- trunk/fs/ocfs2/uptodate.h	2005-06-24 02:15:41 UTC (rev 2424)
+++ trunk/fs/ocfs2/uptodate.h	2005-06-24 02:30:36 UTC (rev 2425)
@@ -0,0 +1,44 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * uptodate.h
+ *
+ * Cluster uptodate tracking
+ *
+ * Copyright (C) 2002, 2004, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef OCFS2_UPTODATE_H
+#define OCFS2_UPTODATE_H
+
+int __init init_ocfs2_uptodate_cache(void);
+void __exit exit_ocfs2_uptodate_cache(void);
+
+void ocfs2_metadata_cache_init(struct inode *inode);
+void ocfs2_metadata_cache_purge(struct inode *inode);
+
+int ocfs2_buffer_uptodate(struct inode *inode,
+			  struct buffer_head *bh);
+void ocfs2_set_buffer_uptodate(struct inode *inode,
+			       struct buffer_head *bh);
+void ocfs2_set_new_buffer_uptodate(struct inode *inode,
+				   struct buffer_head *bh);
+void ocfs2_remove_from_cache(struct inode *inode,
+			     struct buffer_head *bh);
+
+#endif /* OCFS2_UPTODATE_H */