[Ocfs2-commits] jlbec commits r1584 - trunk/src

Thu Oct 21 18:20:18 CDT 2004

Author: jlbec
Date: 2004-10-21 18:20:17 -0500 (Thu, 21 Oct 2004)
New Revision: 1584

Added:
   trunk/src/extent_map.c
   trunk/src/extent_map.h
Modified:
   trunk/src/24io.c
   trunk/src/Makefile
   trunk/src/alloc.c
   trunk/src/aops.c
   trunk/src/dir.c
   trunk/src/file.c
   trunk/src/inode.c
   trunk/src/namei.c
   trunk/src/nm.c
   trunk/src/ocfs.h
   trunk/src/ocfs2.h
   trunk/src/ocfs_compat.h
   trunk/src/super.c
Log:

o Add new rbtree-based extent map.



Modified: trunk/src/24io.c
===================================================================

--- trunk/src/24io.c	2004-10-19 22:45:10 UTC (rev 1583)
+++ trunk/src/24io.c	2004-10-21 23:20:17 UTC (rev 1584)
@@ -14,6 +14,7 @@
 #include "ocfs.h"
 
 #include "alloc.h"
+#include "extent_map.h"
 #include "inode.h"
 
 
@@ -29,6 +30,10 @@
 	ocfs_super *osb;
 	__s64 vbo = 0;
 	__s64 lbo = 0;
+        u64 p_blkno;
+        u64 p_sector;
+        u64 v_blkno;
+        int s_to_b_bits, offset;
 
 	LOG_ENTRY_ARGS ("(0x%p, %ld)\n", inode, iblock);
 
@@ -59,6 +64,24 @@
 				OCFS_I(inode)->ip_blkno);
 	}
 
+        s_to_b_bits = osb->sb->s_blocksize_bits - osb->s_sectsize_bits;
+        v_blkno = (u64)iblock >> s_to_b_bits;
+        offset = (int)((u64)iblock & ((1ULL << s_to_b_bits) - 1));
+        err = ocfs2_extent_map_get_blocks(inode, v_blkno, 1, &p_blkno,
+                                          NULL);
+        if (err) {
+            LOG_ERROR_STATUS(err);
+            goto bail;
+        }
+
+        p_sector = p_blkno << s_to_b_bits;
+        p_sector += offset;
+        if (p_sector != *oblock) {
+            err = -EIO;
+            LOG_ERROR_ARGS("p_sector = %llu, *oblock = %llu\n",
+                           p_sector, (unsigned long long)*oblock);
+        }
+ 
 bail:
 	if (err < 0)
 		err = -EIO;

Modified: trunk/src/Makefile
===================================================================
--- trunk/src/Makefile	2004-10-19 22:45:10 UTC (rev 1583)
+++ trunk/src/Makefile	2004-10-21 23:20:17 UTC (rev 1584)
@@ -72,6 +72,7 @@
 	dir.c			\
 	dlm.c			\
 	extmap.c		\
+	extent_map.c		\
 	file.c			\
 	heartbeat.c		\
 	inode.c			\
@@ -106,6 +107,7 @@
 	dir.h			\
 	dlm.h			\
 	extmap.h		\
+	extent_map.h		\
 	file.h			\
 	heartbeat.h		\
 	inode.h			\

Modified: trunk/src/alloc.c
===================================================================
--- trunk/src/alloc.c	2004-10-19 22:45:10 UTC (rev 1583)
+++ trunk/src/alloc.c	2004-10-21 23:20:17 UTC (rev 1584)
@@ -38,6 +38,7 @@
 #include "bitmap.h"
 #include "dlm.h"
 #include "extmap.h"
+#include "extent_map.h"
 #include "inode.h"
 #include "localalloc.h"
 #include "util.h"
@@ -716,6 +717,17 @@
 		el->l_recs[i].e_cpos = fe->i_clusters;
 		el->l_next_free_rec++;
 	}
+	
+	/*
+	 * extent_map errors are not fatal, so they are ignored outside
+	 * of flushing the thing.
+	 */
+	status = ocfs2_extent_map_append(inode, &el->l_recs[i],
+					 new_clusters);
+	if (status) {
+		LOG_ERROR_STATUS(status);
+		ocfs2_extent_map_drop(inode, fe->i_clusters);
+	}
 
 	status = ocfs_journal_dirty(handle, fe_bh);
 	if (status < 0)
@@ -1766,6 +1778,10 @@
 	}
 	status = 0;
 bail:
+	if (!status)
+		ocfs2_extent_map_trunc(inode, fe->i_clusters);
+	else
+		ocfs2_extent_map_drop(inode, 0);
 	LOG_EXIT_STATUS(status);
 	return(status);
 }

Modified: trunk/src/aops.c
===================================================================
--- trunk/src/aops.c	2004-10-19 22:45:10 UTC (rev 1583)
+++ trunk/src/aops.c	2004-10-21 23:20:17 UTC (rev 1584)
@@ -32,6 +32,7 @@
 
 #include "alloc.h"
 #include "buffer_head_io.h"
+#include "extent_map.h"
 #include "file.h"
 #include "inode.h"
 #include "ocfs_journal.h"
@@ -127,8 +128,10 @@
 		struct buffer_head *bh_result, int create)
 {
 	int err = -EIO;
+	int err2 = -EIO;
 	__s64 vbo = 0;
 	__s64 lbo = 0;
+	u64 p_blkno;
 	__u32 len;
 	int open_direct;
 
@@ -166,6 +169,9 @@
 	err = ocfs_lookup_file_allocation(OCFS2_SB(inode->i_sb),
 					  vbo, &lbo, len, NULL, 
 					  inode);
+
+	err2 = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
+					   NULL);
 	if (!open_direct)
 		up_read(&OCFS_I(inode)->ip_node_extend_sem);
 
@@ -174,6 +180,20 @@
 		goto bail;
 	}
 
+	if (err2 < 0) {
+		err = err2;
+		LOG_ERROR_ARGS("Error %d from get_blocks(0x%p, %llu, 1, %llu, NULL)\n",
+			       err2,
+			       inode, (unsigned long long)iblock,
+			       p_blkno);
+		goto bail;
+	}
+
+	if (p_blkno != (lbo >> inode->i_sb->s_blocksize_bits)) {
+		LOG_ERROR_ARGS("get_blocks() returned %llu, expected %lld\n",
+			       p_blkno, lbo >> inode->i_sb->s_blocksize_bits);
+	}
+
 	map_bh(bh_result, inode->i_sb, lbo >> inode->i_sb->s_blocksize_bits);
 
 	err = 0;
@@ -204,6 +224,7 @@
 	OCFS_I(inode)->ip_mmu_private += inode->i_sb->s_blocksize;
 
 bail:
+#warning Any error will crash kunmap_high() from cont_prepare_write()
 	if (err < 0)
 		err = -EIO;
 
@@ -434,12 +455,20 @@
 static int ocfs_bmap(struct address_space *mapping, long block) 
 #endif
 {
+	/* Why do two #ifs?  Because Mark is an EMACS user. */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+	sector_t disk_block = 0;
+	sector_t status;
+#else
 	int disk_block = 0;
+	int status;
+#endif
 	ocfs_super *osb = OCFS_SB(mapping->host->i_sb);
 	__s64 vbo = 0;
 	__s64 lbo = 0;
 	__u32 len;
-	int err = 0, status;
+	u64 p_blkno;
+	int err = 0;
 	struct inode *inode = mapping->host;
 
 	LOG_SET_CONTEXT(BMAP);
@@ -463,12 +492,33 @@
 		goto bail;
 	}
 
+	err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno,
+					  NULL);
+	if (err) {
+		LOG_ERROR_ARGS("get_blocks() failed, block = %llu\n",
+			       (unsigned long long)block);
+		LOG_ERROR_STATUS(err);
+		goto bail;
+	}
+
 	disk_block = lbo >> inode->i_sb->s_blocksize_bits;
+	
+	/* FIXME size of items */
+	if (p_blkno != disk_block) {
+		LOG_ERROR_ARGS("get_blocks() returned %llu, expected %llu\n",
+			       p_blkno,
+			       (unsigned long long)disk_block);
+	}
 
 bail:
+	/* "Gross" - MarkF (While Joel was inserting the #if) */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+	status = err ? 0 : disk_block;
+#else
 	status = err ? err : disk_block;
+#endif
 
-	LOG_EXIT_STATUS(status);
+	LOG_EXIT_STATUS((int)status);
 	LOG_CLEAR_CONTEXT();
 	return status;
 }
@@ -499,6 +549,8 @@
 	__s64 vbo; /* file offset */
 	__s64 lbo; /* logical (disk) offset */
 	__s64 vbo_max; /* file offset, max_blocks from iblock */
+	u64 p_blkno;
+	int contig_blocks;
 	int set_new = 0; /* flag */
 	__u64 new_size; /* In bytes, the size of the contiguous block */
 	unsigned char blocksize_bits;
@@ -544,9 +596,34 @@
 	/* This figure out the size of the next contiguous block, and
 	 * our logical offset */	
 	/* TODO: Try our damndest to give sizes in multiples of PAGE_SIZE */
+	/* FIXME: nice bug, fail to check status.  Which will fail if
+	 * max_blocks > the contiguousness. */
 	status = ocfs_lookup_file_allocation(osb, vbo, &lbo, max_blocks << blocksize_bits, 
 					     &new_size, inode);
 
+	status = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
+					     &contig_blocks);
+	if (status) {
+		LOG_ERROR_ARGS("get_blocks() failed iblock=%llu\n",
+			       (unsigned long long)iblock);
+		status = -EIO;
+		goto bail;
+	}
+
+	if (p_blkno != (lbo >> blocksize_bits)) {
+		LOG_ERROR_ARGS("get_blocks() returned %llu, expected %lld\n",
+			       p_blkno, lbo >> blocksize_bits);
+	}
+
+	if (contig_blocks != (new_size >> blocksize_bits)) {
+		LOG_ERROR_ARGS("get_blocks() returned contig = %u, expected %lld\n",
+			       contig_blocks, new_size >> blocksize_bits);
+	}
+
+	if (max_blocks < contig_blocks)
+		contig_blocks = max_blocks;
+
+
 	/* Do whatever we need to the buffer_head */
 	if (set_new) {
 		set_buffer_new(bh_result);

Modified: trunk/src/dir.c
===================================================================
--- trunk/src/dir.c	2004-10-19 22:45:10 UTC (rev 1583)
+++ trunk/src/dir.c	2004-10-21 23:20:17 UTC (rev 1584)
@@ -50,6 +50,7 @@
 #include "alloc.h"
 #include "dir.h"
 #include "dlm.h"
+#include "extent_map.h"
 #include "file.h"
 #include "inode.h"
 #include "ocfs_journal.h"
@@ -349,6 +350,7 @@
 	int status;
 	s64 vbo, lbo;
 	int extend;
+	u64 p_blkno;
 
 	spin_lock(&OCFS_I(dir)->ip_lock);
 	extend = (dir->i_size == OCFS_I(dir)->ip_alloc_size);
@@ -375,6 +377,17 @@
 		goto bail;
 	}
 
+	status = ocfs2_extent_map_get_blocks(dir, dir->i_blocks, 1,
+					     &p_blkno, NULL);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	if (p_blkno != (lbo >> sb->s_blocksize_bits)) {
+		LOG_ERROR_ARGS("Bad get_blocks(), got %llu, expected %llu\n",
+			       p_blkno, (lbo >> sb->s_blocksize_bits));
+	}
+
 	*new_bh = sb_getblk(sb, lbo >> sb->s_blocksize_bits);
 	if (!*new_bh) {
 		status = -EIO;

Added: trunk/src/extent_map.c
===================================================================
--- trunk/src/extent_map.c	2004-10-19 22:45:10 UTC (rev 1583)
+++ trunk/src/extent_map.c	2004-10-21 23:20:17 UTC (rev 1584)
@@ -0,0 +1,962 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * extent_map.c
+ *
+ * In-memory extent map for OCFS2.  Man, this code was prettier in
+ * the library.
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2,  as published by the Free Software Foundation.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#define _XOPEN_SOURCE 600 /* Triggers magic in features.h */
+#define _LARGEFILE64_SOURCE
+
+#include "ocfs_compat.h"
+
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/rbtree.h>
+
+#include "ocfs_log.h"
+#include "ocfs.h"
+#include "ocfs2.h"
+
+#include "extent_map.h"
+
+#include "buffer_head_io.h"
+
+
+/*
+ * SUCK SUCK SUCK
+ * Our headers are so bad that struct ocfs2_extent_map is in ocfs.h
+ */
+
+struct ocfs2_extent_map_entry {
+	struct rb_node e_node;	/* linkage into the per-inode em_extents rbtree */
+	int e_tree_depth;	/* tree depth the record was inserted with; 0 is treated as a leaf */
+	ocfs2_extent_rec e_rec;	/* private copy of the extent record */
+};
+
+struct ocfs2_em_insert_context {
+	int need_left;	/* displaced entry starts before the new record */
+	int need_right;	/* displaced entry ends after the new record */
+	struct ocfs2_extent_map_entry *new_ent;	/* entry being inserted; NULLed once linked into the tree */
+	struct ocfs2_extent_map_entry *old_ent;	/* entry erased from the tree, waiting to be freed */
+	struct ocfs2_extent_map_entry *left_ent;	/* preallocated left remainder; NULLed once linked */
+	struct ocfs2_extent_map_entry *right_ent;	/* preallocated right remainder; NULLed once linked */
+};
+
+static kmem_cache_t *ocfs2_em_ent_cachep = NULL;	/* slab cache of struct ocfs2_extent_map_entry */
+
+
+static struct ocfs2_extent_map_entry *
+ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
+			u32 cpos, u32 clusters,
+			struct rb_node ***ret_p,
+			struct rb_node **ret_parent);
+static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
+				 struct ocfs2_extent_map_entry *ent);
+static int ocfs2_extent_map_find_leaf(struct inode *inode,
+				      u32 cpos, u32 clusters,
+				      ocfs2_extent_list *el);
+static int ocfs2_extent_map_lookup_read(struct inode *inode,
+			u32 cpos, u32 clusters,
+			struct ocfs2_extent_map_entry **ret_ent);
+static int ocfs2_extent_map_try_insert(struct inode *inode,
+			       ocfs2_extent_rec *rec,
+			       int tree_depth,
+			       struct ocfs2_em_insert_context *ctxt);
+
+
+
+/*
+ * Search the map for an entry overlapping [cpos, cpos + clusters).
+ * Any overlap counts, partial ones included, so callers have to apply
+ * their own containment checks.
+ *
+ * ip_lock must be held by the caller: the match may be an entry with a
+ * non-zero tree_depth, which a concurrent insert is allowed to replace.
+ *
+ * When ret_p / ret_parent are non-NULL they receive the link slot and
+ * the parent node at which the walk stopped, so an insert can reuse
+ * the search.  Callers that only want the entry pass NULL for both.
+ */
+static struct ocfs2_extent_map_entry *
+ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
+			u32 cpos, u32 clusters,
+			struct rb_node ***ret_p,
+			struct rb_node **ret_parent)
+{
+	struct rb_node **link =
+#ifdef rb_node
+#undef rb_node
+		&em->em_extents.rb_node;
+#define rb_node rb_node_s  /* I HATE YOU 2.4 */
+#else
+		&em->em_extents.rb_node;
+#endif
+	struct rb_node *last = NULL;
+	struct ocfs2_extent_map_entry *found = NULL;
+	struct ocfs2_extent_map_entry *cur;
+
+	while (*link != NULL) {
+		last = *link;
+		cur = rb_entry(last, struct ocfs2_extent_map_entry, e_node);
+
+		if (cur->e_rec.e_cpos >= (cpos + clusters)) {
+			/* Wanted range ends at or before this entry starts */
+			link = &last->rb_left;
+			continue;
+		}
+		if ((cur->e_rec.e_cpos + cur->e_rec.e_clusters) <= cpos) {
+			/* Wanted range starts at or after this entry ends */
+			link = &last->rb_right;
+			continue;
+		}
+
+		/* Overlap: stop with link/last describing the match */
+		found = cur;
+		break;
+	}
+
+	if (ret_p)
+		*ret_p = link;
+	if (ret_parent)
+		*ret_parent = last;
+
+	return found;
+}
+
+/*
+ * Find the leaf containing the interval we want.  While we're on our
+ * way down the tree, fill in every record we see at any depth, because
+ * we might want it later.
+ *
+ * Note that this code is run without ip_lock.  That's because it
+ * sleeps while reading.  If someone is also filling the extent list at
+ * the same time we are, we might have to restart.
+ *
+ * el is the extent list to start from; the walk follows e_blkno of
+ * the record covering [cpos, cpos + clusters) until it reaches a list
+ * whose l_tree_depth is 0, then inserts every record of that list.
+ *
+ * Returns 0 on success.  Returns -EBADR when a record ends beyond
+ * ip_alloc_size (in clusters), when two records of one list cover the
+ * interval, or when no record covers it; -ESRCH when the interval
+ * straddles a record boundary; otherwise the error reported by
+ * ocfs2_extent_map_insert() or ocfs_read_block().
+ */
+static int ocfs2_extent_map_find_leaf(struct inode *inode,
+				      u32 cpos, u32 clusters,
+				      ocfs2_extent_list *el)
+{
+	int i, ret;
+	struct buffer_head *eb_bh = NULL;
+	u64 blkno;
+	ocfs2_extent_block *eb;
+	ocfs2_extent_rec *rec;
+
+	/*
+	 * The bh data containing the el cannot change here, because
+	 * we hold alloc_sem.  So we can do this without other
+	 * locks.
+	 */
+	while (el->l_tree_depth)
+	{
+		blkno = 0;	/* 0 doubles as "no covering record found yet" */
+		for (i = 0; i < el->l_next_free_rec; i++) {
+			rec = &el->l_recs[i];
+
+			ret = -EBADR;
+			if ((rec->e_cpos + rec->e_clusters) >
+			    (OCFS_I(inode)->ip_alloc_size >> OCFS_SB(inode->i_sb)->s_clustersize_bits))
+				goto out_free;
+
+			if ((rec->e_cpos + rec->e_clusters) <= cpos) {	/* record ends before the interval */
+				ret = ocfs2_extent_map_insert(inode,
+							      rec,
+							      el->l_tree_depth);
+				if (ret && (ret != -EEXIST))	/* -EEXIST is tolerated here */
+					goto out_free;
+				continue;
+			}
+			if ((cpos + clusters) <= rec->e_cpos) {	/* record starts after the interval */
+				ret = ocfs2_extent_map_insert(inode,
+							      rec,
+							      el->l_tree_depth);
+				if (ret && (ret != -EEXIST))	/* -EEXIST is tolerated here */
+					goto out_free;
+				continue;
+			}
+
+			/*
+			 * We've found a record that matches our
+			 * interval.  We don't insert it because we're
+			 * about to traverse it.
+			 */
+		
+			/* Check to see if we're straddling */
+			ret = -ESRCH;
+			if ((rec->e_cpos > cpos) ||
+			    ((cpos + clusters) >
+			     (rec->e_cpos + rec->e_clusters)))
+				goto out_free;
+
+			/*
+			 * If we've already found a record, the el has
+			 * two records covering the same interval.
+			 * EEEK!
+			 */
+			ret = -EBADR;
+			if (blkno)
+				goto out_free;
+
+			blkno = rec->e_blkno;
+		}
+
+		/*
+		 * We don't support holes, and we're still up
+		 * in the branches, so we'd better have found someone
+		 */
+		ret = -EBADR;
+		if (!blkno)
+			goto out_free;
+
+		if (eb_bh) {	/* release the previous level before reading the next */
+			brelse(eb_bh);
+			eb_bh = NULL;
+		}
+		ret = ocfs_read_block(OCFS_SB(inode->i_sb),
+				      blkno, &eb_bh, OCFS_BH_CACHED,
+				      inode);
+		if (ret)
+			goto out_free;
+		eb = (ocfs2_extent_block *)eb_bh->b_data;
+		OCFS_ASSERT_RO(IS_VALID_EXTENT_BLOCK(eb));
+		el = &eb->h_list;	/* descend: continue with the child block's list */
+	}
+
+	if (el->l_tree_depth)
+		BUG();
+
+	for (i = 0; i < el->l_next_free_rec; i++) {
+		rec = &el->l_recs[i];
+		ret = ocfs2_extent_map_insert(inode, rec,
+					      el->l_tree_depth);
+		if (ret)	/* unlike the branch levels, any error (even -EEXIST) aborts */
+			goto out_free;
+	}
+
+	ret = 0;
+
+out_free:
+	if (eb_bh)
+		brelse(eb_bh);
+
+	return ret;
+}
+
+/*
+ * Look up the tree_depth 0 entry covering [cpos, cpos + clusters),
+ * reading extent blocks from disk when the map does not hold it yet.
+ *
+ * Invariant: blocks are never re-traversed.  If the map already holds
+ * a non-leaf entry for the range, the walk resumes from that entry's
+ * extent block; otherwise it starts at the inode's own extent list.
+ * This means that all inserts should be new regions or more granular
+ * regions (both allowed by insert).
+ *
+ * Returns 0 and stores the entry in *ret_ent on success.  On failure
+ * returns the error from ocfs_read_block() or
+ * ocfs2_extent_map_find_leaf(), or -ESRCH if no entry covers the range
+ * once the walk is done; *ret_ent is not written in that case.
+ *
+ * ip_lock is taken and dropped internally and the block reads happen
+ * outside of it, so the caller must not hold ip_lock.
+ */
+static int ocfs2_extent_map_lookup_read(struct inode *inode,
+				u32 cpos, u32 clusters,
+				struct ocfs2_extent_map_entry **ret_ent)
+{
+	int ret;
+	int depth;
+	int from_branch;
+	u64 blkno;
+	struct ocfs2_extent_map *em = &OCFS_I(inode)->ip_map;
+	struct ocfs2_extent_map_entry *ent;
+	struct buffer_head *bh = NULL;
+	ocfs2_extent_block *eb;
+	ocfs2_dinode *di;
+	ocfs2_extent_list *el;
+
+	spin_lock(&OCFS_I(inode)->ip_lock);
+	ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
+	if (ent && !ent->e_tree_depth) {
+		/* Already cached at leaf granularity, nothing to read */
+		spin_unlock(&OCFS_I(inode)->ip_lock);
+		*ret_ent = ent;
+		return 0;
+	}
+
+	/*
+	 * Resume from the cached branch entry if there is one, otherwise
+	 * start from the inode block.
+	 */
+	from_branch = (ent != NULL);
+	if (from_branch)
+		blkno = ent->e_rec.e_blkno;
+	else
+		blkno = OCFS_I(inode)->ip_blkno;
+	spin_unlock(&OCFS_I(inode)->ip_lock);
+
+	ret = ocfs_read_block(OCFS_SB(inode->i_sb), blkno, &bh,
+			      OCFS_BH_CACHED, inode);
+	if (ret) {
+		if (bh)
+			brelse(bh);
+		return ret;
+	}
+
+	if (from_branch) {
+		eb = (ocfs2_extent_block *)bh->b_data;
+		OCFS_ASSERT_RO(IS_VALID_EXTENT_BLOCK(eb));
+		el = &eb->h_list;
+	} else {
+		di = (ocfs2_dinode *)bh->b_data;
+		OCFS_ASSERT_RO(IS_VALID_FILE_ENTRY(di));
+		el = &di->id2.i_list;
+	}
+
+	ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el);
+	brelse(bh);
+	if (ret)
+		return ret;
+
+	/*
+	 * ocfs2_extent_map_lookup() requires ip_lock.  find_leaf ran
+	 * without it, so take it again for the final search and sample
+	 * the depth while the entry cannot be replaced under us.
+	 */
+	spin_lock(&OCFS_I(inode)->ip_lock);
+	ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
+	depth = ent ? ent->e_tree_depth : 0;
+	spin_unlock(&OCFS_I(inode)->ip_lock);
+
+	if (!ent)
+		return -ESRCH;
+
+	if (depth)
+		BUG();  /* FIXME: Make sure this isn't a corruption */
+
+	*ret_ent = ent;
+
+	return 0;
+}
+
+/*
+ * Link ent into the map unless an existing entry overlaps its range,
+ * in which case nothing is changed and -EEXIST is returned.
+ *
+ * Callers must hold ip_lock: this modifies the tree, and would race
+ * lookups if the lock weren't held.
+ */
+static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
+					 struct ocfs2_extent_map_entry *ent)
+{
+	struct rb_node **link;
+	struct rb_node *parent;
+
+	/* The lookup doubles as the search for the insertion slot */
+	if (ocfs2_extent_map_lookup(em, ent->e_rec.e_cpos,
+				    ent->e_rec.e_clusters,
+				    &link, &parent) != NULL)
+		return -EEXIST;
+
+	rb_link_node(&ent->e_node, parent, link);
+	rb_insert_color(&ent->e_node, &em->em_extents);
+
+	return 0;
+}
+
+
+/*
+ * Simple rule: on any return code other than -EAGAIN, anything left
+ * in the insert_context will be freed.
+ *
+ * One locked attempt at inserting ctxt->new_ent.  If the range is
+ * free the entry is simply linked.  If an entry with a greater
+ * tree_depth overlaps it, that entry is erased and replaced by
+ * new_ent plus up to two remainder entries (left_ent / right_ent)
+ * covering whatever part of the old range rec does not.
+ *
+ * Returns 0 on success (also when an identical entry of the same
+ * depth already exists), -EEXIST when the overlapping entry has a
+ * smaller depth or the same depth but a different record, and -EAGAIN
+ * when a remainder entry is needed but ctxt does not supply it yet;
+ * need_left / need_right then tell the caller what to allocate.
+ * Entries that were linked into the tree are NULLed in ctxt.
+ */
+static int ocfs2_extent_map_try_insert(struct inode *inode,
+				       ocfs2_extent_rec *rec,
+				       int tree_depth,
+				       struct ocfs2_em_insert_context *ctxt)
+{
+	int ret;
+	struct ocfs2_extent_map *em = &OCFS_I(inode)->ip_map;
+	struct ocfs2_extent_map_entry *old_ent;
+
+	ctxt->need_left = 0;
+	ctxt->need_right = 0;
+	ctxt->old_ent = NULL;
+
+	spin_lock(&OCFS_I(inode)->ip_lock);
+	ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
+	if (!ret) {
+		ctxt->new_ent = NULL;	/* now owned by the tree */
+		goto out_unlock;
+	}
+
+	old_ent = ocfs2_extent_map_lookup(em, rec->e_cpos,
+					  rec->e_clusters, NULL, NULL);
+
+	if (!old_ent)	/* the insert just failed on an overlap, so one must exist */
+		BUG();
+
+	ret = -EEXIST;
+	if (old_ent->e_tree_depth < tree_depth)
+		goto out_unlock;
+
+	if (old_ent->e_tree_depth == tree_depth) {
+		if (!memcmp(rec, &old_ent->e_rec,
+			    sizeof(ocfs2_extent_rec)))
+			ret = 0;
+
+		/* FIXME: Should this be ESRCH/EBADR??? */
+		goto out_unlock;
+	}
+
+	/*
+	 * We do it in this order specifically so that no actual tree
+	 * changes occur until we have all the pieces we need.  We
+	 * don't want malloc failures to leave an inconsistent tree.
+	 * Whenever we drop the lock, another process could be
+	 * inserting.  Also note that, if another process just beat us
+	 * to an insert, we might not need the same pieces we needed
+	 * the first go round.  In the end, the pieces we need will
+	 * be used, and the pieces we don't will be freed.
+	 */
+	ctxt->need_left = !!(rec->e_cpos > old_ent->e_rec.e_cpos);
+	ctxt->need_right = !!((old_ent->e_rec.e_cpos +
+			       old_ent->e_rec.e_clusters) > 
+			      (rec->e_cpos + rec->e_clusters));
+	ret = -EAGAIN;
+	if (ctxt->need_left) {
+		if (!ctxt->left_ent)
+			goto out_unlock;
+		*(ctxt->left_ent) = *old_ent;	/* copy, then trim to end at rec->e_cpos */
+		ctxt->left_ent->e_rec.e_clusters =
+			rec->e_cpos - ctxt->left_ent->e_rec.e_cpos;
+	}
+	if (ctxt->need_right) {
+		if (!ctxt->right_ent)
+			goto out_unlock;
+		*(ctxt->right_ent) = *old_ent;	/* copy, then trim to start where rec ends */
+		ctxt->right_ent->e_rec.e_cpos =
+			rec->e_cpos + rec->e_clusters;
+		ctxt->right_ent->e_rec.e_clusters =
+			(old_ent->e_rec.e_cpos +
+			 old_ent->e_rec.e_clusters) -
+			ctxt->right_ent->e_rec.e_cpos;
+	}
+
+	rb_erase(&old_ent->e_node, &em->em_extents);
+	/* Now that he's erased, set him up for deletion */
+	ctxt->old_ent = old_ent;
+
+	if (ctxt->need_left) {
+		ret = ocfs2_extent_map_insert_entry(em,
+						    ctxt->left_ent);
+		if (ret)
+			goto out_unlock;
+		ctxt->left_ent = NULL;	/* now owned by the tree */
+	}
+
+	if (ctxt->need_right) {
+		ret = ocfs2_extent_map_insert_entry(em,
+						    ctxt->right_ent);
+		if (ret)
+			goto out_unlock;
+		ctxt->right_ent = NULL;	/* now owned by the tree */
+	}
+
+	ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
+	
+	if (!ret)
+		ctxt->new_ent = NULL;
+
+out_unlock:
+	spin_unlock(&OCFS_I(inode)->ip_lock);
+
+	return ret;
+}
+
+
+/*
+ * Insert a copy of rec into the inode's extent map at tree_depth.
+ *
+ * A record ending beyond em_clusters is rejected with -EBADR.  A
+ * zero-length record is accepted, and ignored, only when it sits
+ * exactly at em_clusters.  All entries are allocated here with
+ * GFP_KERNEL, outside of ip_lock; ocfs2_extent_map_try_insert() asks
+ * for another round with -EAGAIN whenever it needs a left and/or right
+ * remainder entry that has not been allocated yet.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or whatever
+ * ocfs2_extent_map_try_insert() finally reports.
+ */
+int ocfs2_extent_map_insert(struct inode *inode, ocfs2_extent_rec *rec,
+			    int tree_depth)
+{
+	int status;
+	struct ocfs2_em_insert_context ctxt = {0, };
+
+	if ((rec->e_cpos + rec->e_clusters) >
+	    OCFS_I(inode)->ip_map.em_clusters)
+		return -EBADR;
+
+	if (rec->e_clusters == 0) {
+		/*
+		 * Zero e_clusters means a truncated tail record.  It is
+		 * only legal at EOF, where it is silently ignored.
+		 */
+		if ((rec->e_cpos + rec->e_clusters) ==
+		    OCFS_I(inode)->ip_map.em_clusters)
+			return 0;
+
+		return -EBADR;
+	}
+
+	ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep, GFP_KERNEL);
+	if (ctxt.new_ent == NULL)
+		return -ENOMEM;
+
+	ctxt.new_ent->e_tree_depth = tree_depth;
+	ctxt.new_ent->e_rec = *rec;
+
+	for (;;) {
+		/* Top up whichever remainder entries the last try asked for */
+		if (ctxt.need_left && ctxt.left_ent == NULL) {
+			ctxt.left_ent = kmem_cache_alloc(ocfs2_em_ent_cachep,
+							 GFP_KERNEL);
+			if (ctxt.left_ent == NULL) {
+				status = -ENOMEM;
+				break;
+			}
+		}
+		if (ctxt.need_right && ctxt.right_ent == NULL) {
+			ctxt.right_ent = kmem_cache_alloc(ocfs2_em_ent_cachep,
+							  GFP_KERNEL);
+			if (ctxt.right_ent == NULL) {
+				status = -ENOMEM;
+				break;
+			}
+		}
+
+		status = ocfs2_extent_map_try_insert(inode, rec,
+						     tree_depth, &ctxt);
+		if (status != -EAGAIN)
+			break;
+	}
+
+	/* Whatever is still referenced by the context was not consumed */
+	if (ctxt.left_ent != NULL)
+		kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent);
+	if (ctxt.right_ent != NULL)
+		kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent);
+	if (ctxt.old_ent != NULL)
+		kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent);
+	if (ctxt.new_ent != NULL)
+		kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent);
+
+	return status;
+}
+
+/*
+ * Append this record to the tail of the extent map.  It must be
+ * tree_depth 0.  The record might be an extension of an existing
+ * record, and as such that needs to be handled.  eg:
+ *
+ * Existing record in the extent map:
+ *
+ *	cpos = 10, len = 10
+ * 	|---------| 
+ *
+ * New Record:
+ *
+ *	cpos = 10, len = 20
+ * 	|------------------|
+ *
+ * The passed record is the new on-disk record.  The new_clusters value
+ * is how many clusters were added to the file.  If the append is a
+ * contiguous append, the new_clusters has been added to
+ * rec->e_clusters.  If the append is an entirely new extent, then
+ * rec->e_clusters is == new_clusters.
+ *
+ * Returns 0 when an existing tree_depth 0 entry was extended in place,
+ * otherwise the result of ocfs2_extent_map_insert() for the record.
+ * The caller must not hold ip_lock: it is taken here around the tree
+ * lookup, and again by the drop/insert helpers this calls.
+ */
+int ocfs2_extent_map_append(struct inode *inode, ocfs2_extent_rec *rec,
+			    u32 new_clusters)
+{
+	int ret;
+	struct ocfs2_extent_map *em = &OCFS_I(inode)->ip_map;
+	struct ocfs2_extent_map_entry *ent;
+	ocfs2_extent_rec *old;
+
+	OCFS_ASSERT(new_clusters);
+	OCFS_ASSERT(rec->e_clusters >= new_clusters);
+
+	if (em->em_clusters <
+	    (OCFS_I(inode)->ip_alloc_size >> OCFS_SB(inode->i_sb)->s_clustersize_bits)) {
+		/*
+		 * Size changed underneath us on disk.  Drop any
+		 * straddling records and update our idea of
+		 * i_clusters
+		 */
+		ocfs2_extent_map_drop(inode, em->em_clusters - 1);
+		em->em_clusters = OCFS_I(inode)->ip_alloc_size >>  OCFS_SB(inode->i_sb)->s_clustersize_bits;
+	}
+
+	OCFS_ASSERT((rec->e_cpos + rec->e_clusters) ==
+		    (em->em_clusters + new_clusters));
+
+	em->em_clusters += new_clusters;
+
+	ret = -ENOENT;
+	if (rec->e_clusters > new_clusters) {
+		/*
+		 * This is a contiguous append.  The lookup and the
+		 * in-place growth of the cached record both touch the
+		 * tree, so do them under ip_lock as
+		 * ocfs2_extent_map_lookup() requires.
+		 */
+		spin_lock(&OCFS_I(inode)->ip_lock);
+		ent = ocfs2_extent_map_lookup(em, rec->e_cpos, 1,
+					      NULL, NULL);
+		if (ent) {
+			old = &ent->e_rec;
+			OCFS_ASSERT((rec->e_cpos + rec->e_clusters) ==
+				    (old->e_cpos + old->e_clusters +
+				     new_clusters));
+			if (!ent->e_tree_depth) {
+				OCFS_ASSERT(old->e_cpos == rec->e_cpos);
+				OCFS_ASSERT(old->e_blkno ==
+					    rec->e_blkno);
+				ret = 0;
+			}
+			/*
+			 * Let non-leafs fall through as -ENOENT to
+			 * force insertion of the new leaf.
+			 */
+			old->e_clusters += new_clusters;
+		}
+		spin_unlock(&OCFS_I(inode)->ip_lock);
+	}
+
+	/* ocfs2_extent_map_insert() takes ip_lock itself */
+	if (ret == -ENOENT)
+		ret = ocfs2_extent_map_insert(inode, rec, 0);
+
+	return ret;
+}
+
+/*
+ * Look up the record containing this cluster offset.  This record is
+ * part of the extent map.  Do not free it.  Any changes you make to
+ * it will reflect in the extent map.  So, if your last extent
+ * is (cpos = 10, clusters = 10) and you truncate the file by 5
+ * clusters, you can do:
+ *
+ * ret = ocfs2_extent_map_get_rec(inode, orig_size - 5, &rec, NULL);
+ * rec->e_clusters -= 5;
+ *
+ * The lookup does not read from disk.  If the map isn't filled in for
+ * an entry, you won't find it.
+ *
+ * Also note that the returned record is valid until alloc_sem is
+ * dropped.  After that, truncate and extend can happen.  Caveat Emptor.
+ *
+ * On success returns 0, points *rec at the cached record and, if
+ * tree_depth is non-NULL, stores the entry's depth there.  Returns
+ * -EINVAL if cpos lies at or beyond ip_alloc_size (in clusters) and
+ * -ENOENT if the map holds no entry for cpos; *rec is NULL then.
+ * The caller must not hold ip_lock, it is taken here.
+ */
+int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos,
+			     ocfs2_extent_rec **rec,
+			     int *tree_depth)
+{
+	int ret = -ENOENT;
+	struct ocfs2_extent_map *em = &OCFS_I(inode)->ip_map;
+	struct ocfs2_extent_map_entry *ent;
+
+	*rec = NULL;
+
+	if (cpos >=
+	    (OCFS_I(inode)->ip_alloc_size >> OCFS_SB(inode->i_sb)->s_clustersize_bits))
+		return -EINVAL;
+
+	if (cpos >= em->em_clusters) {
+		/*
+		 * Size changed underneath us on disk.  Drop any
+		 * straddling records and update our idea of
+		 * i_clusters
+		 */
+		ocfs2_extent_map_drop(inode, em->em_clusters - 1);
+		em->em_clusters = OCFS_I(inode)->ip_alloc_size >>  OCFS_SB(inode->i_sb)->s_clustersize_bits;
+	}
+
+	/*
+	 * ocfs2_extent_map_lookup() must run under ip_lock; read the
+	 * entry's fields before letting go of it.
+	 */
+	spin_lock(&OCFS_I(inode)->ip_lock);
+	ent = ocfs2_extent_map_lookup(em, cpos, 1, NULL, NULL);
+	if (ent) {
+		*rec = &ent->e_rec;
+		if (tree_depth)
+			*tree_depth = ent->e_tree_depth;
+		ret = 0;
+	}
+	spin_unlock(&OCFS_I(inode)->ip_lock);
+
+	return ret;
+}
+
+/*
+ * Translate the virtual cluster range [v_cpos, v_cpos + count) into a
+ * physical cluster offset, reading the extent tree if necessary.
+ *
+ * On success returns 0, stores the physical cluster of v_cpos in
+ * *p_cpos and, if ret_count is non-NULL, the number of clusters from
+ * v_cpos to the end of the containing extent in *ret_count.
+ * Returns -EINVAL if the range ends beyond ip_alloc_size, -ESRCH if it
+ * is not contained in a single extent, -ENOENT if no entry came back,
+ * or the error from ocfs2_extent_map_lookup_read().  *p_cpos is 0 on
+ * every failure.
+ */
+int ocfs2_extent_map_get_clusters(struct inode *inode,
+				  u32 v_cpos, int count,
+				  u32 *p_cpos, int *ret_count)
+{
+	int status;
+	u32 skip;
+	struct ocfs2_extent_map *em = &OCFS_I(inode)->ip_map;
+	struct ocfs2_extent_map_entry *hit = NULL;
+
+	*p_cpos = 0;
+
+	if ((v_cpos + count) >
+	    (OCFS_I(inode)->ip_alloc_size >> OCFS_SB(inode->i_sb)->s_clustersize_bits))
+		return -EINVAL;
+
+	if ((v_cpos + count) > em->em_clusters) {
+		/*
+		 * The on-disk size moved under us: forget anything
+		 * around the old tail and resync our idea of
+		 * i_clusters.
+		 */
+		ocfs2_extent_map_drop(inode, em->em_clusters - 1);
+		em->em_clusters = OCFS_I(inode)->ip_alloc_size >>  OCFS_SB(inode->i_sb)->s_clustersize_bits;
+	}
+
+	status = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &hit);
+	if (status)
+		return status;
+
+	if (hit == NULL)
+		return -ENOENT;
+
+	/* The request must sit entirely inside the extent we found */
+	if ((hit->e_rec.e_cpos > v_cpos) ||
+	    ((v_cpos + count) >
+	     (hit->e_rec.e_cpos + hit->e_rec.e_clusters)))
+		return -ESRCH;
+
+	skip = v_cpos - hit->e_rec.e_cpos;
+	*p_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
+					   hit->e_rec.e_blkno) +
+		skip;
+
+	if (ret_count != NULL)
+		*ret_count = hit->e_rec.e_clusters - skip;
+
+	return 0;
+}
+
+/*
+ * Translate the virtual block v_blkno into a physical block number,
+ * reading the extent tree if necessary.
+ *
+ * On success returns 0, stores the physical block in *p_blkno and, if
+ * ret_count is non-NULL, the number of blocks from v_blkno to the end
+ * of the containing extent in *ret_count.  Returns -EINVAL if the
+ * cluster range ends beyond ip_alloc_size, -ESRCH if it is not
+ * contained in a single extent, -ENOENT if no entry came back, or the
+ * error from ocfs2_extent_map_lookup_read().  *p_blkno is 0 on every
+ * failure.
+ *
+ * NOTE(review): the cluster count is derived from count alone and does
+ * not account for where v_blkno sits inside its cluster.  Every caller
+ * visible in this change passes count == 1, for which that is exact;
+ * confirm before relying on larger counts.
+ */
+int ocfs2_extent_map_get_blocks(struct inode *inode,
+				u64 v_blkno, int count,
+				u64 *p_blkno, int *ret_count)
+{
+	int status;
+	u64 skip;
+	u32 cpos, clusters;
+	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+	struct ocfs2_extent_map_entry *hit = NULL;
+	struct ocfs2_extent_map *em = &OCFS_I(inode)->ip_map;
+	ocfs2_extent_rec *rec;
+
+	*p_blkno = 0;
+
+	/* Work in clusters: first cluster, and count rounded up */
+	cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
+	clusters = ocfs2_blocks_to_clusters(inode->i_sb,
+					    (u64)count + bpc - 1);
+	if ((cpos + clusters) >
+	    (OCFS_I(inode)->ip_alloc_size >> OCFS_SB(inode->i_sb)->s_clustersize_bits))
+		return -EINVAL;
+
+	if ((cpos + clusters) > em->em_clusters) {
+		/*
+		 * The on-disk size moved under us: forget anything
+		 * around the old tail and resync our idea of
+		 * i_clusters.
+		 */
+		ocfs2_extent_map_drop(inode, em->em_clusters - 1);
+		em->em_clusters = OCFS_I(inode)->ip_alloc_size >>  OCFS_SB(inode->i_sb)->s_clustersize_bits;
+	}
+
+	status = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &hit);
+	if (status)
+		return status;
+
+	if (hit == NULL)
+		return -ENOENT;
+
+	rec = &hit->e_rec;
+
+	/* The request must sit entirely inside the extent we found */
+	if ((rec->e_cpos > cpos) ||
+	    ((cpos + clusters) >
+	     (rec->e_cpos + rec->e_clusters)))
+		return -ESRCH;
+
+	/* Whole clusters into the extent, plus blocks into the cluster */
+	skip = ocfs2_clusters_to_blocks(inode->i_sb,
+					cpos - rec->e_cpos);
+	skip += (v_blkno & (u64)(bpc - 1));
+	*p_blkno = rec->e_blkno + skip;
+
+	if (ret_count != NULL)
+		*ret_count = ocfs2_clusters_to_blocks(inode->i_sb,
+						      rec->e_clusters) - skip;
+
+	return 0;
+}
+
+/*
+ * Reset the inode's extent map to an empty tree covering zero
+ * clusters.  Always returns 0.
+ */
+int ocfs2_extent_map_init(struct inode *inode)
+{
+	OCFS_I(inode)->ip_map.em_clusters = 0;
+	OCFS_I(inode)->ip_map.em_extents = RB_ROOT;
+
+	return 0;
+}
+
+/*
+ * Rightmost node of the tree, or NULL if the tree is empty.
+ * Provided locally because it is not in mainline at all.
+ */
+static struct rb_node *rb_last(struct rb_root *root)
+{
+	struct rb_node *cur;
+
+#ifdef rb_node
+#undef rb_node
+	cur = root->rb_node;
+#define rb_node rb_node_s
+#else
+	cur = root->rb_node;
+#endif
+	if (cur != NULL) {
+		while (cur->rb_right != NULL)
+			cur = cur->rb_right;
+	}
+
+	return cur;
+}
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+/*
+ * In-order predecessor of node, or NULL when node has none.
+ * Only built for kernels older than 2.6.0.
+ */
+static struct rb_node *rb_prev(struct rb_node *node)
+{
+	struct rb_node *up;
+
+	if (node->rb_left) {
+		/* Predecessor is the rightmost node of the left subtree */
+		for (node = node->rb_left; node->rb_right; node = node->rb_right)
+			;
+		return node;
+	}
+
+	/* Otherwise climb for as long as we arrive from a left child */
+	for (up = node->rb_parent; up && node == up->rb_left;
+	     up = node->rb_parent)
+		node = up;
+
+	return up;
+}
+#endif  /* LINUX_VERSION_CODE */
+
+
+/*
+ * Needs the lock (ip_lock).
+ *
+ * Walk the map from its last entry backwards and erase every entry
+ * whose e_cpos is >= new_clusters.  Erased nodes are chained through
+ * their rb_right pointers and handed back in *free_head (NULL if
+ * nothing was erased) so the caller can free them after unlocking.
+ *
+ * If tail_ent is non-NULL it receives the entry at which the walk
+ * stopped when that entry extends past new_clusters, else NULL.  That
+ * entry is left in the tree.
+ */
+static void __ocfs2_extent_map_drop(struct inode *inode,
+				    u32 new_clusters,
+				    struct rb_node **free_head,
+				    struct ocfs2_extent_map_entry **tail_ent)
+{
+	struct rb_node *cur, *prev;
+	struct ocfs2_extent_map *em = &OCFS_I(inode)->ip_map;
+	struct ocfs2_extent_map_entry *ent;
+	struct ocfs2_extent_map_entry *stopped_at = NULL;
+
+	*free_head = NULL;
+
+	for (cur = rb_last(&em->em_extents); cur != NULL; cur = prev) {
+		/* Fetch the predecessor before cur leaves the tree */
+		prev = rb_prev(cur);
+
+		ent = rb_entry(cur, struct ocfs2_extent_map_entry,
+			       e_node);
+		if (ent->e_rec.e_cpos < new_clusters) {
+			stopped_at = ent;
+			break;
+		}
+
+		rb_erase(&ent->e_node, &em->em_extents);
+
+		/* Push onto the free list, reusing rb_right as the link */
+		cur->rb_right = *free_head;
+		*free_head = cur;
+	}
+
+	if (tail_ent == NULL)
+		return;
+
+	/* Do we have an entry straddling new_clusters? */
+	if (stopped_at != NULL &&
+	    ((stopped_at->e_rec.e_cpos + stopped_at->e_rec.e_clusters) >
+	     new_clusters))
+		*tail_ent = stopped_at;
+	else
+		*tail_ent = NULL;
+}
+
+/*
+ * Free every entry on a list of erased nodes chained through their
+ * rb_right pointers.
+ */
+static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head)
+{
+	struct rb_node *next;
+
+	for (; free_head != NULL; free_head = next) {
+		/* Grab the link before the entry holding it is freed */
+		next = free_head->rb_right;
+
+		kmem_cache_free(ocfs2_em_ent_cachep,
+				rb_entry(free_head,
+					 struct ocfs2_extent_map_entry,
+					 e_node));
+	}
+}
+
+
+/*
+ * Forget every cached entry from new_clusters onwards, including an
+ * entry that merely straddles new_clusters.  This is effectively a
+ * cache forget; em_clusters is left alone.
+ *
+ * To clip the straddling extent instead of discarding it, call
+ * ocfs2_extent_map_trunc().  This code does not check or modify
+ * ip_alloc_size.  Takes ip_lock itself and always returns 0.
+ */
+int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters)
+{
+	struct ocfs2_extent_map_entry *straddler = NULL;
+	struct rb_node *doomed = NULL;
+
+	spin_lock(&OCFS_I(inode)->ip_lock);
+
+	__ocfs2_extent_map_drop(inode, new_clusters, &doomed, &straddler);
+
+	if (straddler != NULL) {
+		/* The straddling entry goes onto the free list as well */
+		rb_erase(&straddler->e_node,
+			 &OCFS_I(inode)->ip_map.em_extents);
+		straddler->e_node.rb_right = doomed;
+		doomed = &straddler->e_node;
+	}
+
+	spin_unlock(&OCFS_I(inode)->ip_lock);
+
+	/* The actual freeing happens outside of the spinlock */
+	if (doomed != NULL)
+		__ocfs2_extent_map_drop_cleanup(doomed);
+
+	return 0;
+}
+
+/*
+ * Shrink the map to new_clusters: entries starting at or beyond
+ * new_clusters are removed, an extent straddling new_clusters is
+ * clipped to end there, and em_clusters is set to new_clusters.
+ * This does not check or modify ip_alloc_size.  Takes ip_lock itself
+ * and always returns 0.
+ */
+int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters)
+{
+	struct ocfs2_extent_map_entry *straddler = NULL;
+	struct rb_node *doomed = NULL;
+
+	spin_lock(&OCFS_I(inode)->ip_lock);
+
+	__ocfs2_extent_map_drop(inode, new_clusters, &doomed, &straddler);
+
+	if (straddler != NULL) {
+		/* Keep the entry, but make it end at new_clusters */
+		straddler->e_rec.e_clusters =
+			new_clusters - straddler->e_rec.e_cpos;
+	}
+
+	OCFS_I(inode)->ip_map.em_clusters = new_clusters;
+
+	spin_unlock(&OCFS_I(inode)->ip_lock);
+
+	/* The actual freeing happens outside of the spinlock */
+	if (doomed != NULL)
+		__ocfs2_extent_map_drop_cleanup(doomed);
+
+	return 0;
+}
+
+
+/*
+ * Create the slab cache that backs extent map entries.
+ * Returns 0 on success, -ENOMEM if the cache could not be created.
+ */
+int __init init_ocfs2_extent_maps(void)
+{
+	ocfs2_em_ent_cachep =
+		kmem_cache_create("ocfs2_em_ent",
+				  sizeof(struct ocfs2_extent_map_entry),
+				  0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+
+	return (ocfs2_em_ent_cachep != NULL) ? 0 : -ENOMEM;
+}
+
+/* Destroy the slab cache created by init_ocfs2_extent_maps(). */
+void __exit exit_ocfs2_extent_maps(void)
+{
+	kmem_cache_destroy(ocfs2_em_ent_cachep);
+}

Added: trunk/src/extent_map.h
===================================================================
--- trunk/src/extent_map.h	2004-10-19 22:45:10 UTC (rev 1583)
+++ trunk/src/extent_map.h	2004-10-21 23:20:17 UTC (rev 1584)
@@ -0,0 +1,55 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * extent_map.h
+ *
+ * In-memory file extent mappings for OCFS2.
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2,  as published by the Free Software Foundation.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Authors: Joel Becker
+ */
+
+#ifndef _EXTENT_MAP_H
+#define _EXTENT_MAP_H
+
+int init_ocfs2_extent_maps(void);
+void exit_ocfs2_extent_maps(void);
+
+/*
+ * EVERY CALL here except _init, _trunc, and _drop expects alloc_sem
+ * to be held.  The allocation cannot change at all while the map is
+ * in the process of being updated.  _drop and _trunc take ip_lock.
+ */
+int ocfs2_extent_map_init(struct inode *inode);
+int ocfs2_extent_map_insert(struct inode *inode, ocfs2_extent_rec *rec,
+			    int tree_depth);
+int ocfs2_extent_map_append(struct inode *inode, ocfs2_extent_rec *rec,
+			    u32 new_clusters);
+int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos,
+			     ocfs2_extent_rec **rec,
+			     int *tree_depth);
+int ocfs2_extent_map_get_clusters(struct inode *inode,
+				  u32 v_cpos, int count,
+			  	  u32 *p_cpos, int *ret_count);
+int ocfs2_extent_map_get_blocks(struct inode *inode,
+				u64 v_blkno, int count,
+				u64 *p_blkno, int *ret_count);
+int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters); /* no clip */
+int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters); /* clips */
+
+#endif  /* _EXTENT_MAP_H */

Modified: trunk/src/file.c
===================================================================
--- trunk/src/file.c	2004-10-19 22:45:10 UTC (rev 1583)
+++ trunk/src/file.c	2004-10-21 23:20:17 UTC (rev 1584)
@@ -40,6 +40,7 @@
 #include "dir.h"
 #include "dlm.h"
 #include "extmap.h"
+#include "extent_map.h"
 #include "file.h"
 #include "sysfile.h"
 #include "inode.h"
@@ -821,12 +822,13 @@
 			struct buffer_head *fe_bh,
 			u64 new_i_size)
 {
-	int status;
+	int status, grow;
 	struct super_block *sb = inode->i_sb;
 	ocfs_inode_private *oip = OCFS_I(inode);
 
 	LOG_ENTRY();
 
+	grow = new_i_size > inode->i_size;
 	inode->i_size = new_i_size;
 	OCFS_SET_INODE_TIME(inode, i_mtime, OCFS_CURRENT_TIME);
 	inode->i_blocks = (new_i_size + sb->s_blocksize - 1) 
@@ -837,11 +839,16 @@
 		goto bail;
 	}
 
+	/* FIXME: I think this should all be in the caller */
 	spin_lock(&oip->ip_lock);
-	oip->ip_mmu_private = inode->i_size;
+	if (!grow)
+		oip->ip_mmu_private = inode->i_size;
 	/* do we really need to do an extent_map_trunc here? */
 	ocfs_extent_map_trunc(&oip->ip_ext_map);
 	spin_unlock(&oip->ip_lock);
+
+	ocfs2_extent_map_drop(inode,
+			      ocfs2_clusters_for_bytes(sb, new_i_size));
 bail:
 	LOG_EXIT_STATUS(status);
 	return status;

Modified: trunk/src/inode.c
===================================================================
--- trunk/src/inode.c	2004-10-19 22:45:10 UTC (rev 1583)
+++ trunk/src/inode.c	2004-10-21 23:20:17 UTC (rev 1584)
@@ -41,6 +41,7 @@
 #include "alloc.h"
 #include "dlm.h"
 #include "extmap.h"
+#include "extent_map.h"
 #include "file.h"
 #include "inode.h"
 #include "lockres.h"
@@ -311,6 +312,7 @@
 	i->ip_open_cnt = 0;
 	spin_lock_init(&i->ip_lock);
 	ocfs_extent_map_init (&i->ip_ext_map);
+	ocfs2_extent_map_init(inode);
 	INIT_LIST_HEAD(&i->ip_recovery_list);
 	INIT_LIST_HEAD(&i->ip_handle_list);
 	i->ip_handle = NULL;
@@ -819,6 +821,7 @@
 	}
 
 	ocfs_extent_map_trunc(&OCFS_I(inode)->ip_ext_map);
+	ocfs2_extent_map_drop(inode, 0);
 
 	down(&recovery_list_sem);
 	list_del(&OCFS_I(inode)->ip_recovery_list);
@@ -854,6 +857,7 @@
 	int tmperr;
 	ocfs_super *osb;
 	__s64 vbo, lbo;
+	u64 p_blkno;
 	int readflags = OCFS_BH_CACHED;
 
 	osb = OCFS_SB(inode->i_sb);
@@ -870,17 +874,22 @@
 		return(NULL);
 
 	/* do we need extend sem?  no extend dlm message for dirs */
-	/*
-	 * UGLY: last argument to lookup_file_allocation() (locked) is
-	 * forced to '1' here, even though we don't have the lock.  This
-	 * is to force fast, unlocked operation.  Get A Real DLM.
-	 */
 	tmperr = ocfs_lookup_file_allocation(osb, vbo, &lbo, 
 					     osb->sb->s_blocksize, NULL,
 					     inode);
 	if (tmperr < 0)
 		goto fail;
 
+	tmperr = ocfs2_extent_map_get_blocks(inode, block, 1,
+					     &p_blkno, NULL);
+	if (tmperr < 0)
+		goto fail;
+
+	if (p_blkno != (lbo >> osb->sb->s_blocksize_bits)) {
+		LOG_ERROR_ARGS("get_blocks() expected %llu, got %lld\n",
+			       p_blkno, lbo >> osb->sb->s_blocksize_bits);
+	}
+
 	tmperr = ocfs_read_block(osb, lbo >> osb->sb->s_blocksize_bits, 
 				 &bh, readflags, inode);
 	if (tmperr < 0)
@@ -1037,6 +1046,7 @@
 		       ocfs2_dinode *fe)
 {
 	int status = 0;
+	int drop_map = 0;
 	ocfs_super *osb = OCFS2_SB(inode->i_sb);
 
 	spin_lock(&OCFS_I(inode)->ip_lock);
@@ -1086,6 +1096,7 @@
 				       OCFS_I(inode)->ip_alloc_size, 
 				       fe->i_clusters);
 			ocfs_extent_map_trunc (&OCFS_I(inode)->ip_ext_map);
+			drop_map = 1; /* Because we have the lock here */
 		}
 
 		if (le32_to_cpu(fe->i_flags) & OCFS2_BITMAP_FL) {
@@ -1133,6 +1144,15 @@
 
 	spin_unlock(&OCFS_I(inode)->ip_lock);
 
+	if (drop_map) {
+		/*
+		 * If we could trust the ordering of truncate
+		 * notification, we could some day do:
+		 * ocfs2_extent_map_trunc(inode, fe->i_clusters)
+		 */
+		ocfs2_extent_map_trunc(inode, fe->i_clusters);
+	}
+
 	return(status);
 }				/* ocfs_refresh_inode */
 

Modified: trunk/src/namei.c
===================================================================
--- trunk/src/namei.c	2004-10-19 22:45:10 UTC (rev 1583)
+++ trunk/src/namei.c	2004-10-21 23:20:17 UTC (rev 1584)
@@ -51,6 +51,7 @@
 #include "dcache.h"
 #include "dir.h"
 #include "dlm.h"
+#include "extent_map.h"
 #include "file.h"
 #include "sysfile.h"
 #include "inode.h"
@@ -1415,6 +1416,8 @@
 	const char *c;
 	struct super_block *sb = osb->sb;
 	s64 logical, contig;
+	u64 p_blkno;
+	int p_blocks;
 	int virtual, blocks, status, i, bytes_left;
 
 	bytes_left = inode->i_size + 1;
@@ -1461,6 +1464,23 @@
 	/* right now lookup_file_allocation returns bytes, but that
 	 * changes soon so shift back to blocks. */
 	logical = logical >> sb->s_blocksize_bits;
+
+	status = ocfs2_extent_map_get_blocks(inode, 0, 1, &p_blkno, 
+					     &p_blocks);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	if (logical != p_blkno) {
+		LOG_ERROR_ARGS("Bad get_blocks(): expected %llu, got %llu\n",
+			       p_blkno, logical);
+	}
+	if (p_blocks != (contig >> sb->s_blocksize_bits)) {
+		LOG_ERROR_ARGS("Bad get_blocks() length: expected %u, got %lld\n",
+			       p_blocks, (contig >> sb->s_blocksize_bits));
+	}
+
 	virtual = 0;
 	while(bytes_left > 0) {
 		c = &symname[virtual * sb->s_blocksize];

Modified: trunk/src/nm.c
===================================================================
--- trunk/src/nm.c	2004-10-19 22:45:10 UTC (rev 1583)
+++ trunk/src/nm.c	2004-10-21 23:20:17 UTC (rev 1584)
@@ -38,14 +38,15 @@
 
 #include "alloc.h"
 #include "dlm.h"
+#include "extmap.h"
+#include "extent_map.h"
+#include "file.h"
 #include "heartbeat.h"
 #include "inode.h"
 #include "lockres.h"
 #include "nm.h"
 #include "util.h"
 #include "vote.h"
-#include "extmap.h"
-#include "file.h"
 
 #include "ocfs_journal.h"
 #include "buffer_head_io.h"
@@ -444,6 +445,7 @@
 	spin_lock(&OCFS_I(inode)->ip_lock);
 	ocfs_extent_map_trunc(&OCFS_I(inode)->ip_ext_map);
 	spin_unlock(&OCFS_I(inode)->ip_lock);
+	ocfs2_extent_map_drop(inode, 0);
 }
 
 
@@ -695,11 +697,14 @@
 			ocfs_truncate_inode_pages(inode, 0);
 			spin_lock(&OCFS_I(inode)->ip_lock);
 			ocfs_extent_map_trunc(&OCFS_I(inode)->ip_ext_map);
-			
+
 			/* truncate may send this */
 			if (flags & FLAG_FILE_UPDATE_OIN)
 				atomic_set(&OCFS_I(inode)->ip_needs_verification, 1);
 			spin_unlock(&OCFS_I(inode)->ip_lock);
+
+			/* Do we need this? */
+			ocfs2_extent_map_drop(inode, 0);
 		}
 		return 0;
 	}

Modified: trunk/src/ocfs.h
===================================================================
--- trunk/src/ocfs.h	2004-10-19 22:45:10 UTC (rev 1583)
+++ trunk/src/ocfs.h	2004-10-21 23:20:17 UTC (rev 1584)
@@ -35,6 +35,7 @@
 #include <linux/sched.h>
 #include <linux/wait.h>
 #include <linux/list.h>
+#include <linux/rbtree.h>
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
 # include <linux/workqueue.h>
 #else
@@ -367,6 +368,12 @@
 
 struct _ocfs_journal_handle;
 
+/* Per-inode extent map (ip_map).  I hate our includes */
+struct ocfs2_extent_map {
+	u32		em_clusters;	/* set to new_clusters on trunc */
+	struct rb_root	em_extents;	/* rbtree of extent map entries */
+};
+
 /* OCFS2 Inode Private Data */
 typedef struct _ocfs_inode_private
 {
@@ -386,6 +393,7 @@
 	__s64		  ip_mmu_private;
 	__u32             ip_open_flags;
 	ocfs_extent_map   ip_ext_map;
+	struct ocfs2_extent_map ip_map;
 
 	atomic_t          ip_needs_verification;
 

Modified: trunk/src/ocfs2.h
===================================================================
--- trunk/src/ocfs2.h	2004-10-19 22:45:10 UTC (rev 1583)
+++ trunk/src/ocfs2.h	2004-10-21 23:20:17 UTC (rev 1584)
@@ -45,7 +45,7 @@
 }
 
 static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb,
-						    __u64 bytes)
+						    u64 bytes)
 {
 	int cl_bits = OCFS_SB(sb)->s_clustersize_bits;
 	unsigned int clusters;

Modified: trunk/src/ocfs_compat.h
===================================================================
--- trunk/src/ocfs_compat.h	2004-10-19 22:45:10 UTC (rev 1583)
+++ trunk/src/ocfs_compat.h	2004-10-21 23:20:17 UTC (rev 1584)
@@ -60,6 +60,9 @@
 #define generic_file_write_nolock do_generic_file_write
 #endif
 
+#define rb_root rb_root_s
+#define rb_node rb_node_s
+
 typedef long sector_t;
 
 #define map_bh(bh, sb, blk) \

Modified: trunk/src/super.c
===================================================================
--- trunk/src/super.c	2004-10-19 22:45:10 UTC (rev 1583)
+++ trunk/src/super.c	2004-10-21 23:20:17 UTC (rev 1584)
@@ -51,6 +51,7 @@
 
 #include "alloc.h"
 #include "bitmap.h"
+#include "extent_map.h"
 #include "heartbeat.h"
 #include "inode.h"
 #include "journal.h"
@@ -516,10 +517,15 @@
 	LOG_ENTRY ();
 
 	ocfs_version_print();
+
+	if (init_ocfs2_extent_maps())
+		return -ENOMEM;
 	
 	ocfs_hostname = kmalloc(strlen(system_utsname.nodename) + 1, GFP_KERNEL);
-	if (ocfs_hostname == NULL)
-		return -EINVAL;
+	if (ocfs_hostname == NULL) {
+		status = -EINVAL;
+		goto leave;
+	}
 
 	strcpy(ocfs_hostname, system_utsname.nodename);
 	printk("ocfs2: hostname is %s\n", ocfs_hostname);
@@ -583,6 +589,8 @@
 
 		if (ocfs_table_header)
 			unregister_sysctl_table(ocfs_table_header);
+
+		exit_ocfs2_extent_maps();
 	}
 
 	LOG_EXIT_STATUS (status);
@@ -693,6 +701,8 @@
 
 	unregister_filesystem (&ocfs_fs_type);
 
+	exit_ocfs2_extent_maps();
+
 	printk("Unloaded OCFS Driver module\n");
 	LOG_EXIT ();
 	return;
@@ -1330,6 +1340,7 @@
 	int status = 0;
 	ocfs_publish *publish = NULL;
 	__u64 ret;
+	u64 p_blkno;
 	struct buffer_head *publish_bh = NULL;  /* our own publish sector */
 	struct buffer_head **publish_bhs = NULL; /* all the publish sectors */
 	int i;
@@ -1544,13 +1555,26 @@
 		LOG_ERROR_STATUS(status = -EINVAL);
 		goto bail;
 	}
-	/* We're in the mount path, pretend locked=1 */
 	status = ocfs_lookup_file_allocation(osb, 0ULL, &ret, osb->sb->s_blocksize, NULL,
 					     inode);
 	if (status < 0) {
 		LOG_ERROR_STATUS(status);
 		goto bail;
 	}
+
+	status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &p_blkno,
+					     NULL);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	if (p_blkno != (ret >> osb->sb->s_blocksize_bits)) {
+		LOG_ERROR_ARGS("get_blocks() returned %llu, expected %lld\n",
+			       p_blkno,
+			       ret >> osb->sb->s_blocksize_bits);
+	}
+
 	// i_size must be at least
 	// (2 + osb->max_nodes + 4) + osb->max_nodes + osb->max_nodes
 	if (inode->i_size >> osb->sb->s_blocksize_bits < 
@@ -1617,6 +1641,20 @@
 		LOG_ERROR_STATUS(status);
 		goto bail;
 	}
+
+	status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &p_blkno,
+					     NULL);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	if (p_blkno != (ret >> osb->sb->s_blocksize_bits)) {
+		LOG_ERROR_ARGS("get_blocks() returned %llu, expected %lld\n",
+			       p_blkno,
+			       ret >> osb->sb->s_blocksize_bits);
+	}
+
 	/* for now, just one extent... but in the future... */
 	osb->bitmap_blkno = ret >> osb->sb->s_blocksize_bits;
 	osb->bitmap_blocks = OCFS_I(inode)->ip_alloc_size >> osb->sb->s_blocksize_bits;