[Ocfs2-commits] mfasheh commits r1543 - trunk/src

svn-commits at oss.oracle.com svn-commits at oss.oracle.com
Fri Oct 1 20:08:09 CDT 2004


Author: mfasheh
Date: 2004-10-01 20:08:08 -0500 (Fri, 01 Oct 2004)
New Revision: 1543

Added:
   trunk/src/localalloc.c
   trunk/src/localalloc.h
   trunk/src/suballoc.c
   trunk/src/suballoc.h
Modified:
   trunk/src/Makefile
   trunk/src/alloc.c
   trunk/src/alloc.h
   trunk/src/aops.c
   trunk/src/bitmap.c
   trunk/src/bitmap.h
   trunk/src/dcache.c
   trunk/src/dir.c
   trunk/src/dir.h
   trunk/src/dlm.c
   trunk/src/file.c
   trunk/src/file.h
   trunk/src/inode.c
   trunk/src/inode.h
   trunk/src/journal.c
   trunk/src/namei.c
   trunk/src/namei.h
   trunk/src/ocfs.h
   trunk/src/ocfs2_fs.h
   trunk/src/ocfs_journal.h
   trunk/src/ocfs_log.h
   trunk/src/super.c
   trunk/src/symlink.c
   trunk/src/sysfile.c
   trunk/src/sysfile.h
   trunk/src/vote.c
Log:
* merge the dlm_changes branch back into trunk.



Modified: trunk/src/Makefile
===================================================================
--- trunk/src/Makefile	2004-10-02 01:01:24 UTC (rev 1542)
+++ trunk/src/Makefile	2004-10-02 01:08:08 UTC (rev 1543)
@@ -77,10 +77,12 @@
 	inode.c			\
 	ioctl.c			\
 	journal.c		\
+	localalloc.c		\
 	lockres.c		\
 	namei.c			\
 	nm.c			\
 	proc.c			\
+	suballoc.c		\
 	super.c			\
 	symlink.c		\
 	sysfile.c		\
@@ -109,10 +111,12 @@
 	inode.h			\
 	ioctl.h			\
 	journal.h		\
+	localalloc.h		\
 	lockres.h		\
 	namei.h			\
 	nm.h			\
 	proc.h			\
+	suballoc.h		\
 	super.h			\
 	symlink.h		\
 	sysfile.h		\

Modified: trunk/src/alloc.c
===================================================================
--- trunk/src/alloc.c	2004-10-02 01:01:24 UTC (rev 1542)
+++ trunk/src/alloc.c	2004-10-02 01:08:08 UTC (rev 1543)
@@ -41,7 +41,9 @@
 #include "dlm.h"
 #include "extmap.h"
 #include "inode.h"
+#include "localalloc.h"
 #include "util.h"
+#include "suballoc.h"
 #include "sysfile.h"
 #include "file.h"
 
@@ -55,6 +57,14 @@
 			       struct buffer_head *extent_grp_bh, 
 			       ocfs_journal_handle *handle,
 			       struct inode *inode);
+
+static int ocfs_create_new_meta_bhs(ocfs_super *osb, 
+				    ocfs_journal_handle *handle,
+				    struct inode *inode,
+				    int wanted, 
+				    ocfs2_alloc_context *meta_ac,
+				    struct buffer_head *bhs[]);
+
 static int ocfs_allocate_new_data_node(ocfs_super *osb, 
 				       ocfs2_dinode *fe,
 				       u64 new_blkno,
@@ -62,13 +72,16 @@
 				       struct buffer_head *eb_bh, 
 				       u64 *new_eb_blkno, 
       				       ocfs_journal_handle *handle,
-				       struct inode *inode);
+				       struct inode *inode,
+				       ocfs2_alloc_context *meta_ac);
 
 static int ocfs_grow_extent_tree(ocfs_super *osb,
 				 struct buffer_head *fe_bh,
 				 ocfs_journal_handle *handle,
 				 u64 blkno,
-			       	 u32 new_clusters, struct inode *inode);
+			       	 u32 new_clusters, 
+				 struct inode *inode,
+				 ocfs2_alloc_context *meta_ac);
 
 static int _squish_extent_entries(ocfs_super *osb,
 				  ocfs2_extent_rec *extarr, 
@@ -90,34 +103,24 @@
 static int ocfs_update_last_eb_blk(ocfs_super *osb, ocfs2_dinode *fe,
 				   struct inode *inode);
 
-static int ocfs_free_vol_block (ocfs_super * osb, ocfs_journal_handle *handle,
-				ocfs_free_rec * FreeLog, __u32 NodeNum, 
-				__u32 Type);
-
 static int ocfs_free_disk_bitmap (ocfs_super * osb, ocfs_free_rec *free_log);
 
 static inline int ocfs_free_main_bitmap(ocfs_super *osb, 
 					ocfs_journal_handle *handle, 
-					ocfs_free_rec *freelog);
+					struct inode *bitmap_inode,
+					struct buffer_head *bh,
+					ocfs_free_rec *freelog); 
 
-static int ocfs_alloc_new_window(ocfs_super *osb, struct buffer_head *lock_bh,
-				 struct inode *bm_inode,
-				 ocfs_journal_handle *handle);
-static int ocfs_sync_local_from_shutdown(ocfs_super *osb, 
-					 ocfs_bitmap_free_head **f, 
-					 struct buffer_head *local_alloc_bh, 
-					 int in_recovery);
-static __u32 ocfs_alloc_count_bits(ocfs2_dinode *alloc);
-static void ocfs_clear_local_alloc(ocfs2_dinode *alloc);
-static int ocfs_find_space_from_local(ocfs_super *osb, __u32 bitswanted,
-				      u32 *bitoff, u32 *bitcount, 
-				      ocfs_journal_handle *handle);
-static int ocfs_local_find_clear_bits(ocfs_super *osb,
-				      ocfs2_dinode *alloc,
-				      __u32 numbits);
 static int ocfs_extent_contig(struct inode *inode, ocfs2_extent_rec *ext,
 			      u64 blkno);
 
+static int ocfs_claim_main_bitmap_bits(ocfs_super *osb,
+				       ocfs_journal_handle *handle,
+				       ocfs2_alloc_context *ac,
+				       u32 min_bits,
+				       u32 *bit_off,
+				       u32 *num_bits);
+
 static int ocfs_extent_contig(struct inode *inode, ocfs2_extent_rec *ext,
 			      u64 blkno)
 {
@@ -154,29 +157,32 @@
 
 int ocfs_add_to_bitmap_free_head(ocfs_super *osb,
 				 ocfs_bitmap_free_head *f, 
-				 __u32 len, __u32 fileoff,
-				 __u32 nodenum, __u32 type)
+				 u32 len, u32 fileoff,
+				 u32 nodenum, u64 blkno, u32 type)
 {
 	int status = 0, n;
 	ocfs_free_rec *log;
 	ocfs_bitmap_update *fb;
 
-	LOG_ENTRY_ARGS("(len = %u, fileoff = %u, nodenum = %u, " 
-		       "type=%d (\"%s\")\n", len, fileoff, nodenum, type, 
+	LOG_ENTRY_ARGS("(len = %u, fileoff = %u, nodenum = %u, blk = %llu" 
+		       "type=%d (\"%s\")\n", len, fileoff, nodenum,
+		       blkno, type, 
 		       (type == DISK_ALLOC_VOLUME) ? "DISK_ALLOC_VOLUME" : 
-		       ( (type == DISK_ALLOC_EXTENT_NODE) ? 
-			 "DISK_ALLOC_EXTENT_NODE" : "DISK_ALLOC_INODE" ));
+		       "DISK_ALLOC_EXTENT_NODE");
 
 	if (len == 0) {
 		printk("ocfs2: Zero length delete!\n");
 		printk("(len = %u, fileoff = %u, nodenum = %u, "
 		       "type=%d (\"%s\")\n", len, fileoff, nodenum, type, 
 		       (type == DISK_ALLOC_VOLUME) ? "DISK_ALLOC_VOLUME" : 
-		       ( (type == DISK_ALLOC_EXTENT_NODE) ? 
-			 "DISK_ALLOC_EXTENT_NODE" : "DISK_ALLOC_INODE" ));
+		       "DISK_ALLOC_EXTENT_NODE");
 		BUG();
 	}
 
+	/* right now extent node frees are only supported for node 0. */
+	if ((type == DISK_ALLOC_EXTENT_NODE) && (nodenum != 0))
+		BUG();
+
 	log = f->tail;
 
 	/* need a new one? */
@@ -202,6 +208,7 @@
 	fb->file_off = fileoff; 
 	fb->type     = type; 
 	fb->node_num = nodenum; 
+	fb->blkno    = blkno; 
 
 	log->num_updates++;
 done:
@@ -209,6 +216,21 @@
 	return(status);
 }
 
+static inline void ocfs_copy_update(ocfs_free_rec *rec,
+				    ocfs_bitmap_update *fb2)
+{
+	int idx = rec->num_updates;
+	ocfs_bitmap_update *fb1 = &(rec->update[idx]);
+
+	fb1->length   = fb2->length;
+	fb1->file_off = fb2->file_off;
+	fb1->type     = fb2->type;
+	fb1->node_num = fb2->node_num;
+	fb1->blkno = fb2->blkno;
+	rec->num_updates++;
+	return;
+}
+
 /*
  * ocfs_free_disk_bitmap()
  *
@@ -216,336 +238,191 @@
 static int ocfs_free_disk_bitmap (ocfs_super * osb, ocfs_free_rec *free_log)
 {
 	int status = 0;
-	__u32 num_upd;
-	__u32 i;
-	__u32 node_num;
-	ocfs_free_rec **ext_alloc_free = NULL;
-	ocfs_free_rec **inode_alloc_free = NULL;
-	ocfs_free_rec *free_vol_bits = NULL;
-	ocfs_free_rec *tmp_log;
-	struct inode **ext_alloc_inode = NULL;
-	struct inode **inode_alloc_inode = NULL;
+	int i;
+	ocfs_free_rec *ext_alloc_free = NULL;
+	ocfs_free_rec *vol_alloc_free = NULL;
+	struct inode *ext_alloc_inode = NULL;
 	struct inode *vol_inode = NULL;
-	__u32 tmp_indx;
-	struct buffer_head *globalbh = NULL;
-	struct buffer_head *tmpbh = NULL;
+	struct buffer_head *vol_alloc_bh = NULL;
+	struct buffer_head *ext_alloc_bh = NULL;
 	ocfs_journal_handle *handle = NULL;
-	int credits = 33; /* one for each potential sysfile fe. This
-			   * goes away when ocfs_ugly_hack goes
-			   * away. */
+	int credits = 0; /* sysfile fe's. */
+	ocfs_bitmap_update *tmp;
 
 	LOG_ENTRY_ARGS ("(0x%p, 0x%p)\n", osb, free_log);
 
-#define ALLOC_BLOCK(ptr, len, err)				\
-	do {							\
-		(ptr) = ocfs_malloc (len);			\
-		if (!(ptr)) {					\
-			LOG_ERROR_STATUS ((err) = -ENOMEM);	\
-			goto finally;				\
-		}						\
-	} while (0)
+	for (i = 0; i < free_log->num_updates; i++) {
+//		OCFS_ASSERT(
+//			(free_log->update[i].type == DISK_ALLOC_EXTENT_NODE)
+//			|| (free_log->update[i].type == DISK_ALLOC_VOLUME));
+		if ((free_log->update[i].type != DISK_ALLOC_EXTENT_NODE)
+		    && (free_log->update[i].type != DISK_ALLOC_VOLUME)) {
+			printk("num_updates = %u, i=%d\n", 
+			       free_log->num_updates, i);
+			printk("length:   %llu\n", free_log->update[i].length);
+			printk("file_off: %llu\n",
+			       free_log->update[i].file_off);
+			printk("type:     %u\n", free_log->update[i].type);
+			printk("node_num: %d\n", free_log->update[i].node_num);
+			printk("blkno:    %llu\n", free_log->update[i].blkno);
+			BUG();
+		}
 
-	ALLOC_BLOCK(inode_alloc_free,
-		    osb->max_nodes * sizeof (ocfs_free_rec *), status);
-	ALLOC_BLOCK(inode_alloc_inode,
-		    osb->max_nodes * sizeof (struct inode *), status);
-	ALLOC_BLOCK(ext_alloc_free,
-		    osb->max_nodes * sizeof (ocfs_free_rec *), status);
-	ALLOC_BLOCK(ext_alloc_inode,
-		    osb->max_nodes * sizeof (struct inode *), status);
 
-	/* init */
-	for (i = 0; i < osb->max_nodes; i++) {
-		ext_alloc_free[i] = NULL;
-		ext_alloc_inode[i] = NULL;
-		inode_alloc_free[i] = NULL;
-		inode_alloc_inode[i] = NULL;
-	}
-
-	num_upd = free_log->num_updates;
-	for (i = 0; i < num_upd; i++) {
-		switch (free_log->update[i].type) {
-		    case DISK_ALLOC_INODE:
-			    node_num = free_log->update[i].node_num;
-			    if (inode_alloc_free[node_num] == NULL) {
-				    inode_alloc_free[node_num] =
+		if (free_log->update[i].type == DISK_ALLOC_EXTENT_NODE) {
+			if (!ext_alloc_free) {
+				ext_alloc_free = 
+					ocfs_malloc(sizeof(ocfs_free_rec));
+				if (!ext_alloc_free) {
+					LOG_ERROR_STATUS(status = -ENOMEM);
+					goto finally;
+				}
+				ext_alloc_free->num_updates = 0;
+				credits++; /* for the fe updates */
+			}
+			credits++;
+			ocfs_copy_update(ext_alloc_free, 
+					 &(free_log->update[i]));
+		} else {
+			if (vol_alloc_free == NULL) {
+				vol_alloc_free =
 					ocfs_malloc (sizeof (ocfs_free_rec));
-				    if (inode_alloc_free[node_num] == NULL) {
-					    LOG_ERROR_STATUS (status = -ENOMEM);
-					    goto finally;
-				    }
-				    inode_alloc_free[node_num]->num_updates = 0;
-			    }
-			    tmp_log = inode_alloc_free[node_num];
-
-			    credits++;
-			    break;
-
-		    case DISK_ALLOC_EXTENT_NODE:
-			    node_num = free_log->update[i].node_num;
-			    if (ext_alloc_free[node_num] == NULL) {
-				    ext_alloc_free[node_num] =
-					ocfs_malloc (sizeof (ocfs_free_rec));
-				    if (ext_alloc_free[node_num] == NULL) {
-					    LOG_ERROR_STATUS (status = -ENOMEM);
-					    goto finally;
-				    }
-				    ext_alloc_free[node_num]->num_updates = 0;
-			    }
-			    tmp_log = ext_alloc_free[node_num];
-
-			    credits++;
-			    break;
-
-		    case DISK_ALLOC_VOLUME:
-			    if (free_vol_bits == NULL) {
-				    free_vol_bits =
-					ocfs_malloc (sizeof (ocfs_free_rec));
-				    if (free_vol_bits == NULL) {
-					    LOG_ERROR_STATUS (status = -ENOMEM);
-					    goto finally;
-				    }
-				    free_vol_bits->num_updates = 0;
-			    }
-			    tmp_log = free_vol_bits;
-
-			    credits += ocfs_blocks_for_bits(osb->sb,
-							    free_log->update[i].length);
-			    break;
-
-		    default:
-			    tmp_log = NULL;
-			    break;
+				if (vol_alloc_free == NULL) {
+					LOG_ERROR_STATUS (status = -ENOMEM);
+					goto finally;
+				}
+				vol_alloc_free->num_updates = 0;
+				credits++; /* for the fe updates */
+			}
+			credits += ocfs_blocks_for_bits(osb->sb,
+							free_log->update[i].length);
+			ocfs_copy_update(vol_alloc_free, 
+					 &(free_log->update[i]));
 		}
-
-
-		if (tmp_log) {
-			ocfs_bitmap_update *fb1, *fb2;
-
-			tmp_indx = tmp_log->num_updates;
-
-			fb1 = &(tmp_log->update[tmp_indx]);
-			fb2 = &(free_log->update[i]);
-
-			fb1->length = fb2->length;
-			fb1->file_off = fb2->file_off;
-			fb1->type = fb2->type;
-			fb1->node_num = fb2->node_num;
-
-			tmp_log->num_updates++;
-		}
 	}
 
-	/* start the transaction here to preserve ordering with the
-	 * bitmap io_sems... */
-	handle = ocfs_start_trans(osb, NULL, credits);
+	handle = ocfs_alloc_handle(osb);
 	if (!handle) {
 		status = -ENOMEM;
 		LOG_ERROR_STATUS(status);
 		goto finally;
 	}
 
-	/* Get all the locks we need. do global bitmap last to
-	 * preserve lock ordering with extend/create */
-	for (i = 0; i < osb->max_nodes; i++) {
-		if (inode_alloc_free[i] != NULL) {
-			inode_alloc_inode[i] = 
-				ocfs_get_system_file_inode(osb, INODE_ALLOC_BITMAP_SYSTEM_INODE, i);
-			if (!inode_alloc_inode[i]) {
-				status = -EINVAL;
-				LOG_ERROR_STATUS (status);
-				goto abort;
-			}
-			ocfs_handle_add_inode(handle, inode_alloc_inode[i]);
-
-			status = ocfs_acquire_lock (osb, 
-						    OCFS_LKM_EXMODE,
-						    0,
-						    &tmpbh,
-						    inode_alloc_inode[i]);
-			if (tmpbh) {
-				brelse(tmpbh);
-				tmpbh = NULL;
-			}
-			if (status < 0) {
-				iput(inode_alloc_inode[i]);
-				inode_alloc_inode[i] = NULL;
-				if (status != -EINTR)
-					LOG_ERROR_STATUS (status);
-				goto abort;
-			}
-			ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 
-					     0, 
-					     inode_alloc_inode[i]);
+	if (ext_alloc_free) {
+		ext_alloc_inode = 
+			ocfs_get_system_file_inode(osb, 
+						   EXTENT_ALLOC_SYSTEM_INODE, 
+						   0);
+		if (!ext_alloc_inode) {
+			status = -EINVAL;
+			LOG_ERROR_STATUS (status);
+			goto finally;
 		}
-	}
 
-	for (i = 0; i < osb->max_nodes; i++) {
-		if (ext_alloc_free[i] != NULL) {
-			ext_alloc_inode[i] = 
-				ocfs_get_system_file_inode(osb, EXTENT_ALLOC_BITMAP_SYSTEM_INODE, i);
-			if (!ext_alloc_inode[i]) {
-				status = -EINVAL;
+		status = ocfs_acquire_lock (osb, OCFS_LKM_EXMODE, 0,
+					    &ext_alloc_bh, ext_alloc_inode);
+		if (status < 0) {
+			if (status != -EINTR)
 				LOG_ERROR_STATUS (status);
-				goto abort;
-			}
-			ocfs_handle_add_inode(handle, ext_alloc_inode[i]);
-
-			status = ocfs_acquire_lock (osb, 
-						    OCFS_LKM_EXMODE,
-						    0,
-						    &tmpbh,
-						    ext_alloc_inode[i]);
-			if (tmpbh) {
-				brelse(tmpbh);
-				tmpbh = NULL;
-			}
-			if (status < 0) {
-				iput(ext_alloc_inode[i]);
-				ext_alloc_inode[i] = NULL;
-				if (status != -EINTR)
-					LOG_ERROR_STATUS (status);
-				goto abort;
-			}
-			ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 
-					     0,  
-					     ext_alloc_inode[i]);
+			goto finally;
 		}
+		ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 0, 
+				     ext_alloc_inode);
+		ocfs_handle_add_inode(handle, ext_alloc_inode);
 	}
 
-	if (free_vol_bits != NULL) {
-		vol_inode = ocfs_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, -1);
+	if (vol_alloc_free) {
+		vol_inode = 
+			ocfs_get_system_file_inode(osb, 
+						   GLOBAL_BITMAP_SYSTEM_INODE, 
+						   -1);
 		if (!vol_inode) {
 			status = -EINVAL;
 			LOG_ERROR_STATUS (status);
-			goto abort;
+			goto finally;
 		}
-		ocfs_handle_add_inode(handle, vol_inode);
 
-		status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE,
-					   0,
-					   &globalbh, vol_inode);
+		status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE, 0,
+					   &vol_alloc_bh, vol_inode);
 		if (status < 0) {
 			iput(vol_inode);
 			vol_inode = NULL;
 
 			if (status != -EINTR)
 				LOG_ERROR_STATUS (status);
-			goto abort;
+			goto finally;
 		}
 		ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 
 				     0, vol_inode);
+		ocfs_handle_add_inode(handle, vol_inode);
 	}
 
-	/* free vol block */
-	if (free_vol_bits != NULL)
-		ocfs_free_vol_block(osb, handle, free_vol_bits, -1, 
-				    DISK_ALLOC_VOLUME);
-
-	for (i = 0; i < osb->max_nodes; i++) {
-		if (inode_alloc_free[i] != NULL)
-			ocfs_free_vol_block(osb, handle,
-					    inode_alloc_free[i], i,
-					    DISK_ALLOC_INODE);
-		if (ext_alloc_free[i] != NULL)
-			ocfs_free_vol_block(osb, handle,
-					    ext_alloc_free[i], i,
-					    DISK_ALLOC_EXTENT_NODE);
+	handle = ocfs_start_trans(osb, handle, credits);
+	if (!handle) {
+		status = -ENOMEM;
+		LOG_ERROR_STATUS(status);
+		goto finally;
 	}
+	ocfs_handle_set_always_commits(handle, 1);
 
-	if (free_vol_bits) {
-		ocfs2_dinode *bm_lock;
+	if (vol_alloc_free)
+		ocfs_free_main_bitmap(osb, handle, vol_inode, 
+				      vol_alloc_bh, vol_alloc_free);
 
-		status = ocfs_journal_access(handle, globalbh, 
-					     OCFS_JOURNAL_ACCESS_WRITE);
-		if (status < 0) {
-			LOG_ERROR_STATUS(status);
-			goto abort;
+	if (ext_alloc_free)
+		for (i = 0; i < ext_alloc_free->num_updates; i++) {
+			tmp = &(ext_alloc_free->update[i]);
+			status = ocfs_free_suballoc_bits(osb,
+							 handle,
+							 ext_alloc_inode,
+							 ext_alloc_bh,
+							 (unsigned int) 
+							 tmp->file_off,
+							 tmp->blkno,
+							 1);
+			if (status < 0) {
+				LOG_ERROR_STATUS (status);
+				goto finally;
+			}
 		}
 
-		bm_lock = (ocfs2_dinode *) globalbh->b_data;
-		bm_lock->id1.bitmap1.i_used =
-			ocfs_count_bits(osb->sb, &osb->cluster_bitmap);
-
-		status = ocfs_journal_dirty(handle, globalbh);
-		if (status < 0) {
-			LOG_ERROR_STATUS (status);
-			goto abort;
-		}
-	}
-
-	ocfs_commit_trans(handle);
-
-	handle = NULL;
-
-abort:
+finally:
 	if (handle)
-		ocfs_abort_trans(handle);
+		ocfs_commit_trans(handle);
 
-finally:
-	if (inode_alloc_inode) {
-		for (i = 0; i < osb->max_nodes; i++) {
-			if (inode_alloc_inode[i])
-				iput(inode_alloc_inode[i]);
-		}
-		kfree(inode_alloc_inode);
-	}
-
-	if (ext_alloc_inode) {
-		for (i = 0; i < osb->max_nodes; i++) {
-			if (ext_alloc_inode[i])
-				iput(ext_alloc_inode[i]);
-		}
-		kfree(ext_alloc_inode);
-	}
-
+	if (ext_alloc_inode)
+		iput(ext_alloc_inode);
 	if (vol_inode)
 		iput(vol_inode);
 
-	if (globalbh)
-		brelse(globalbh);
+	if (vol_alloc_bh)
+		brelse(vol_alloc_bh);
+	if (ext_alloc_bh)
+		brelse(ext_alloc_bh);
 
-	if (ext_alloc_free) {
-		for (i = 0; i < osb->max_nodes; i++) {
-			if (ext_alloc_free[i])
-				kfree(ext_alloc_free[i]);
-		}
+	if (ext_alloc_free)
 		kfree(ext_alloc_free);
-	}
+	if (vol_alloc_free)
+		kfree(vol_alloc_free);
 
-	if (inode_alloc_free) {
-		for (i = 0; i < osb->max_nodes; i++) {
-			if (inode_alloc_free[i])
-				kfree(inode_alloc_free[i]);
-		}
-		kfree(inode_alloc_free);
-	}
-
-	if (free_vol_bits)
-		kfree(free_vol_bits);
-
 	LOG_EXIT_STATUS (status);
 	return status;
 }				/* ocfs_free_disk_bitmap */
 
 static inline int ocfs_free_main_bitmap(ocfs_super *osb, 
 					ocfs_journal_handle *handle, 
+					struct inode *bitmap_inode,
+					struct buffer_head *bh,
 					ocfs_free_rec *freelog) 
 {
 	int i;
 	ocfs_alloc_bm *bitmap;
 	int status;
 	__u32 bitmapblocks; /* we only care about the valid blocks */
-	struct inode *bitmap_inode = NULL;
+	ocfs2_dinode *bm_lock;
 
 	LOG_ENTRY();
 
-	bitmap_inode = ocfs_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, -1);
-	if (!bitmap_inode) {
-		LOG_ERROR_STATUS (status = -EINVAL);
-		goto bail;
-	}
-
 	bitmap = &osb->cluster_bitmap;
 
 	bitmapblocks = ocfs_blocks_for_bits(osb->sb, bitmap->validbits);
@@ -558,120 +435,72 @@
 		goto bail;
 	}
 
-	for (i = 0; i < freelog->num_updates; i++)
+	status = ocfs_journal_access(handle, bh, 
+				     OCFS_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	bm_lock = (ocfs2_dinode *) bh->b_data;
+
+	for (i = 0; i < freelog->num_updates; i++) {
 		ocfs_clear_bits(osb->sb, handle, bitmap,
 				freelog->update[i].file_off,
 				freelog->update[i].length);
+		bm_lock->id1.bitmap1.i_used -= freelog->update[i].length;
+	}
 
+	status = ocfs_journal_dirty(handle, bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS (status);
+		goto bail;
+	}
+
 	status = 0;
 bail:
-	if (bitmap_inode)
-		iput(bitmap_inode);
 	LOG_EXIT_STATUS(status);
 	return(0);
 }
 
 /*
- * ocfs_free_vol_block()
- *
+ * How many free extents have we got before we need more meta data?
  */
-static int ocfs_free_vol_block(ocfs_super *osb,
-			       ocfs_journal_handle *handle,
-			       ocfs_free_rec *FreeLog, __u32 NodeNum,
-			       __u32 Type)
+int ocfs_num_free_extents(ocfs_super *osb, 
+			  struct inode *inode,
+			  ocfs2_dinode *fe)
 {
-	int status = 0;
-	__u64 fileSize = 0;
-	__u64 allocSize = 0;
-	__u32 foundBit = -1;
-	__u32 blockSize = 0, blockSizeBits = 0;
-	int file_type;
-	__u32 bitmapblocks = 0;
-	ocfs_alloc_bm AllocBitmap;
-	ocfs_alloc_bm *tmpbitmap = NULL;
-	__u32 i;
-	struct inode *inode = NULL;
+	int retval;
+	ocfs2_extent_list *el;
+	ocfs2_extent_block *eb;
+	struct buffer_head *eb_bh = NULL;
 
-	LOG_ENTRY ();
+	LOG_ENTRY();
 
-	LOG_TRACE_ARGS("Free Log Details (type = %d):\n", Type);
-	LOG_TRACE_ARGS("num_updates = %u\n", FreeLog->num_updates);
-	for(i = 0; i < FreeLog->num_updates; i++)
-		LOG_TRACE_ARGS("(upd=%u, length=%llu, file_off=%llu, type=%d, node_num=%d)\n", 
-			       i, FreeLog->update[i].length, FreeLog->update[i].file_off, 
-			       FreeLog->update[i].type, FreeLog->update[i].node_num);
+	OCFS_ASSERT(IS_VALID_FILE_ENTRY(fe));
 
-	switch (Type) {
-	    case DISK_ALLOC_EXTENT_NODE:
-		    file_type = EXTENT_ALLOC_BITMAP_SYSTEM_INODE;
-		    blockSize = osb->sb->s_blocksize;
-		    blockSizeBits = osb->sb->s_blocksize_bits;
-		    break;
+	if (fe->i_last_eb_blk) {
+		retval = ocfs_read_bh(osb, fe->i_last_eb_blk << osb->sb->s_blocksize_bits, &eb_bh, OCFS_BH_CACHED, inode);
+		if (retval < 0) {
+			LOG_ERROR_STATUS(retval);
+			goto bail;
+		}
+		eb = (ocfs2_extent_block *) eb_bh->b_data;
+		el = &(eb->h_list);
+	} else
+		el = &(fe->id2.i_list);
 
-	    case DISK_ALLOC_INODE:
-		    file_type = INODE_ALLOC_BITMAP_SYSTEM_INODE;
-		    blockSize = osb->sb->s_blocksize;
-		    blockSizeBits = osb->sb->s_blocksize_bits;
-		    break;
+	OCFS_ASSERT(el->l_tree_depth == 0);
 
-	    case DISK_ALLOC_VOLUME:
-		    status = ocfs_free_main_bitmap(osb, handle, FreeLog);
-		    if (status < 0)
-			    LOG_ERROR_STATUS (status);
-		    goto leave;
-	    default:
-		    goto leave;
-	}
+	retval = el->l_count - el->l_next_free_rec;
 
-	if (NodeNum >= osb->max_nodes) {
-		LOG_ERROR_STATUS(status = -EINVAL);
-		goto leave;
-	}
+bail:
+	if (eb_bh)
+		brelse(eb_bh);
 
-	inode = ocfs_get_system_file_inode(osb, file_type, NodeNum);
-	if (!inode) {
-		LOG_ERROR_STATUS (status = -EINVAL);
-		goto leave;
-	}
-	fileSize = inode->i_size;
-	allocSize = OCFS_I(inode)->ip_alloc_size;
+	LOG_EXIT_STATUS(retval);
+	return(retval);
+}
 
-	ocfs_initialize_bitmap(osb->sb, &AllocBitmap, fileSize * 8,
-			       allocSize * 8);
-
-	tmpbitmap = &AllocBitmap;
-	bitmapblocks = ocfs_blocks_for_bits(osb->sb,
-					    tmpbitmap->validbits);
-
-	status = ocfs_read_system_file(osb, file_type, NodeNum,
-				       AllocBitmap.chunk, 
-				       bitmapblocks << osb->sb->s_blocksize_bits);
-	if (status < 0) {
-		LOG_ERROR_STATUS (status);
-		goto leave;
-	}
-
-	for (i = 0; i < FreeLog->num_updates; i++) {
-		if (FreeLog->update[i].file_off == 0 && Type == 0) {
-			LOG_ERROR_ARGS ("offset=0, type=%x, blksz=%d", Type,
-					blockSize);
-		}
-		
-		foundBit = (__u32) (FreeLog->update[i].file_off >> blockSizeBits);
-		ocfs_clear_bits(osb->sb, handle, tmpbitmap, foundBit,
-				(__u32) FreeLog->update[i].length);
-	}
-
-leave:
-	if (tmpbitmap)
-		ocfs_uninitialize_bitmap(tmpbitmap);
-	if (inode)
-		iput(inode);
-	LOG_EXIT_STATUS (status);
-	return status;
-}			/* ocfs_free_vol_block */
-
-
 /* ocfs_allocate_new_data_node()
  * 
  */
@@ -682,26 +511,24 @@
 				       struct buffer_head *eb_bh, 
 				       u64 *new_eb_blkno,
 				       ocfs_journal_handle *handle,
-				       struct inode *inode)
+				       struct inode *inode,
+				       ocfs2_alloc_context *meta_ac)
 {
 	int status = 0;
 	__u32 k, i;
 	__u32 depth;
-	int allocSize;
 	u64 parent_blk;
-	__u64 physicalOffset;
-	u64 phys_blkno;
-	__u64 fileOffset = 0;
 	int new_blocks = 0;
 	ocfs2_extent_block *eb = NULL;
 	ocfs2_extent_list *el1, *el2 = NULL;
 	struct buffer_head **eb_bhs = NULL;
 	struct buffer_head *bh = NULL;
-	int bh_locked = 0;
 	int size;
 
 	LOG_ENTRY ();
-	
+
+	OCFS_ASSERT(meta_ac);
+
 	if (eb_bh) {
 		status = ocfs_journal_access(handle, eb_bh, 
 					     OCFS_JOURNAL_ACCESS_WRITE);
@@ -713,7 +540,6 @@
 
 		eb = (ocfs2_extent_block *) eb_bh->b_data;
 		el1 = &eb->h_list;
-		bh_locked = 1;
 	}
 	else
 		el1 = &fe->id2.i_list;
@@ -726,18 +552,7 @@
 		parent_blk = fe->i_blkno;
 
 	new_blocks = depth;
-	allocSize = new_blocks << osb->sb->s_blocksize_bits;
 
-	/* allocate contiguous blocks on disk */
-	status = ocfs_alloc_node_block(osb, allocSize, &physicalOffset, 
-				       &fileOffset, osb->node_num, 
-				       DISK_ALLOC_EXTENT_NODE, handle);
-	if (status < 0) {
-		LOG_ERROR_STATUS (status);
-		goto finally;
-	}
-	phys_blkno = physicalOffset >> osb->sb->s_blocksize_bits;
-
 	size = sizeof(struct buffer_head *) * new_blocks;
 	eb_bhs = kmalloc(size, GFP_KERNEL);
 	if (eb_bhs == NULL) {
@@ -747,57 +562,39 @@
 	}
 	memset(eb_bhs, 0, size);
 
-	status = ocfs_read_bhs(osb,
-			       phys_blkno << osb->sb->s_blocksize_bits,
-			       (u64)new_blocks << osb->sb->s_blocksize_bits,
-			       eb_bhs, OCFS_BH_CACHED, inode);
+	status = ocfs_create_new_meta_bhs(osb, handle, inode, new_blocks, meta_ac, eb_bhs);
 	if (status < 0) {
 		LOG_ERROR_STATUS (status);
 		goto finally;
 	}
 
-	/* zero them all out */
-	for(i = 0; i < new_blocks; i++) {
-		status = ocfs_journal_access(handle, eb_bhs[i], 
-					     OCFS_JOURNAL_ACCESS_CREATE);
-		if (status < 0) {
-			LOG_ERROR_STATUS(status);
-			goto finally;
-		}
-
-		memset(eb_bhs[i]->b_data, 0, osb->sb->s_blocksize);
-		set_buffer_uptodate(eb_bhs[i]);
-	}
-	
 	k = el1->l_next_free_rec;
 	el1->l_recs[k].e_cpos = fe->i_clusters;
 	el1->l_recs[k].e_clusters = new_clusters;
-	el1->l_recs[k].e_blkno = phys_blkno;
+	el1->l_recs[k].e_blkno = 
+		((ocfs2_extent_block *) eb_bhs[0]->b_data)->h_blkno;
 	el1->l_next_free_rec++;
 
+	OCFS_ASSERT(el1->l_next_free_rec <= el1->l_count);
+
 	/* Fill in all the headers and the leaf */
 	for (i = 0; i < depth; i++) {
-		ocfs2_extent_block *eb;
+		ocfs2_extent_block *eb, *tmpeb;
 
 		eb = (ocfs2_extent_block *) eb_bhs[i]->b_data;
 
 		eb->h_parent_blk = parent_blk;
-		eb->h_suballoc_blkno =
-			(fileOffset >> osb->sb->s_blocksize_bits) + i;
-		eb->h_suballoc_node = osb->node_num;
-		eb->h_blkno = phys_blkno + i;
-		strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
 
 		el2 = &eb->h_list;
-		el2->l_count = ocfs2_extent_recs_per_eb(osb->sb);
 		el2->l_next_free_rec = 1;
 		el2->l_recs[0].e_cpos = fe->i_clusters;
 		el2->l_recs[0].e_clusters = new_clusters;
 		el2->l_tree_depth = (depth - (i + 1));
 
 		if (el2->l_tree_depth) {
+			tmpeb = (ocfs2_extent_block *) eb_bhs[i+1]->b_data;
 			/* fill in each header */
-			el2->l_recs[0].e_blkno = phys_blkno + (i + 1);
+			el2->l_recs[0].e_blkno = tmpeb->h_blkno;
 		} else {
 			/* fill in the leaf */
 			el2->l_recs[0].e_blkno = new_blkno;
@@ -821,7 +618,6 @@
 		u64 tmp_blk = eb->h_parent_blk;
 		int tree_depth = el1->l_tree_depth;
 
-		bh_locked = 0;
 		eb = NULL;
 
 	       	el1 = &fe->id2.i_list;
@@ -894,6 +690,100 @@
 	return status;
 }				/* ocfs_allocate_new_data_node */
 
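+/* Expects bhs[] to be allocated and NULL-filled by the caller; on error
+ * every non-NULL entry below 'wanted' is released and reset to NULL.
+ *
+ * Sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_node, and
+ * l_count for you. */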
+static int ocfs_create_new_meta_bhs(ocfs_super *osb, 
+				    ocfs_journal_handle *handle,
+				    struct inode *inode,
+				    int wanted, 
+				    ocfs2_alloc_context *meta_ac,
+				    struct buffer_head *bhs[])
+{
+	int count, status, i;
+	u16 suballoc_bit_start;
+	u32 num_got;
+	u64 first_blkno;
+	ocfs2_extent_block *eb;
+
+	LOG_ENTRY();
+
+	count = 0;
+	while (count < wanted) {
+		status = ocfs_claim_metadata(osb, 
+					     handle, 
+					     meta_ac,
+					     wanted - count, 
+					     &suballoc_bit_start, 
+					     &num_got,
+					     &first_blkno);
+		if (status < 0) {
+			LOG_ERROR_STATUS (status);
+			goto bail;
+		}
+
+		for(i = count;  i < (num_got + count); i++) {
+			bhs[i] = sb_getblk(osb->sb, first_blkno);
+			if (bhs[i] == NULL) {
+				status = -EIO;
+				LOG_ERROR_STATUS(status);
+				goto bail;
+			}
+			set_buffer_uptodate(bhs[i]);
+			SET_BH_SEQNUM(inode, bhs[i]);
+
+			status = ocfs_journal_access(handle, bhs[i],
+						     OCFS_JOURNAL_ACCESS_CREATE);
+			if (status < 0) {
+				LOG_ERROR_STATUS(status);
+				goto bail;
+			}
+
+			memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
+			eb = (ocfs2_extent_block *) bhs[i]->b_data;
+			/* Ok, setup the minimal stuff here. */
+			strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
+			eb->h_blkno = first_blkno;
+
+#ifndef OCFS_USE_ALL_METADATA_SUBALLOCATORS
+			/* we always use node zeros suballocator */
+			eb->h_suballoc_node = 0;
+#else
+			eb->h_suballoc_node = osb->node_num;
+#endif
+			eb->h_suballoc_bit = suballoc_bit_start;
+			eb->h_list.l_count = ocfs2_extent_recs_per_eb(osb->sb);
+
+			suballoc_bit_start++;
+			first_blkno++;
+
+			/* We'll also be dirtied by the caller, so
+			 * this isn't absolutely necessary. */
+			status = ocfs_journal_dirty(handle, bhs[i]);
+			if (status < 0) {
+				LOG_ERROR_STATUS(status);
+				goto bail;
+			}
+		}
+
+		count += num_got;
+	}
+
+	status = 0;
+bail:
+	if (status < 0) {
+		for(i = 0; i < wanted; i++) {
+			if (bhs[i])
+				brelse(bhs[i]);
+			bhs[i] = NULL;
+		}
+	}
+	LOG_EXIT_STATUS(status);
+	return(status);
+}
+
 /* ocfs_grow_extent_tree()
  *
  */
@@ -901,16 +791,15 @@
 				 struct buffer_head *fe_bh,
 				 ocfs_journal_handle *handle,
 				 u64 blkno, u32 new_clusters,
-				 struct inode *inode)
+				 struct inode *inode,
+				 ocfs2_alloc_context *meta_ac)
 {
 	int status = 0;
 	__s32 k, i;
 	ocfs2_extent_block *eb1 = NULL;
 	ocfs2_extent_block *eb2 = NULL;
 	ocfs2_extent_list *ebl, *fel;
-	__u64 physicalOffset;
-	__u64 fileOffset = 0;
-	u64 phys_blkno, parent_blk, last_eb_blkno;
+	u64 parent_blk, last_eb_blkno;
 	u64 new_parent_blk = 0;
 	struct buffer_head **bhs = NULL;
 	int numbhs = 0;
@@ -919,6 +808,8 @@
 	LOG_ENTRY_ARGS("(0x%p, 0x%p, %llu, %u\n", osb, fe, blkno,
 		       new_clusters);
 
+	OCFS_ASSERT(meta_ac);
+
 	fe = (ocfs2_dinode *) fe_bh->b_data;
 
 	fel = &fe->id2.i_list;
@@ -932,35 +823,12 @@
 	}
 	memset(bhs, 0, numbhs * sizeof(*bhs));
 
-	/* Allocate the space from the Extent file. This function should */
-	/* return contigous disk blocks requested. */
-	status = ocfs_alloc_node_block(osb,
-				       numbhs << osb->sb->s_blocksize_bits,
-				       &physicalOffset, &fileOffset,
-				       osb->node_num, 
-			       	       DISK_ALLOC_EXTENT_NODE, handle);
+	status = ocfs_create_new_meta_bhs(osb, handle, inode, numbhs, meta_ac, bhs);
 	if (status < 0) {
 		LOG_ERROR_STATUS (status);
 		goto finally;
 	}
-	phys_blkno = physicalOffset >> osb->sb->s_blocksize_bits;
 
-	for (i = 0; i < numbhs; i++) {
-		bhs[i] = sb_getblk(osb->sb, phys_blkno + i);
-		if (bhs[i] == NULL) {
-			status = -EIO;
-			LOG_ERROR_STATUS(status);
-			goto finally;
-		}
-		memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
-		set_buffer_uptodate(bhs[i]);
-	}
-
-	if (phys_blkno == 0) {
-		LOG_ERROR_STATUS(status = -ENOMEM);
-		goto finally;
-	}
-
 	eb1 = (ocfs2_extent_block *) bhs[0]->b_data;
 	/* Copy the File Entry information in to the newly allocated sector */
 	ebl = &eb1->h_list;
@@ -971,56 +839,47 @@
 	}
 
 	last_eb_blkno = fe->i_last_eb_blk;
-	eb1->h_blkno =
-		physicalOffset >> osb->sb->s_blocksize_bits;
 	new_parent_blk = eb1->h_blkno;
-	eb1->h_suballoc_blkno =
-		fileOffset >> osb->sb->s_blocksize_bits;
-	eb1->h_suballoc_node = osb->node_num;
 	eb1->h_next_leaf_blk = 0;
 	fel->l_tree_depth++;
 
 	LOG_TRACE_ARGS ("Tree depth is: %d\n", fel->l_tree_depth);
 
-	/* If tree_depth is one now, the for loop will not execute. */
-	/* First time a file is created, tree_depth = 0 */
-
 	parent_blk = fe->i_blkno;
 
+	/* If tree_depth is now one, the for loop will not execute.
+	 * The first time a file is created, tree_depth = 0. */
 	for (i = 0; i < (fel->l_tree_depth - 1); i++) {
+		ocfs2_extent_block *tmpeb;
+
 		eb2 = (ocfs2_extent_block *) bhs[i]->b_data;
 		ebl = &eb2->h_list;
 
 		ebl->l_tree_depth = (fel->l_tree_depth - 1) - i;
 		ebl->l_count = ocfs2_extent_recs_per_eb(osb->sb);
 
-		strcpy(eb2->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
-
 		if (i == 0) {
-			ebl->l_recs[fel->l_count].e_blkno =
-				phys_blkno + 1;
+			tmpeb = (ocfs2_extent_block *) bhs[1]->b_data;
+			ebl->l_recs[fel->l_count].e_blkno = 
+				tmpeb->h_blkno;
 			ebl->l_recs[fel->l_count].e_cpos =
 				fe->i_clusters;
 			ebl->l_recs[fel->l_count].e_clusters =
 				new_clusters;
 			ebl->l_next_free_rec = fel->l_count + 1;
 
-			eb2->h_blkno =
-				physicalOffset >> osb->sb->s_blocksize_bits;
+			OCFS_ASSERT(ebl->l_next_free_rec <= ebl->l_count);
+
 			eb2->h_parent_blk = parent_blk;
 
 			parent_blk = last_eb_blkno = eb2->h_blkno;
 		} else {
-			ebl->l_recs[0].e_blkno = phys_blkno + (i + 1);
+			tmpeb = (ocfs2_extent_block *) bhs[i + 1]->b_data;
+			ebl->l_recs[0].e_blkno = tmpeb->h_blkno;
 			ebl->l_recs[0].e_cpos = fe->i_clusters;
 			ebl->l_recs[0].e_clusters = new_clusters;
 			ebl->l_next_free_rec = 1;
 
-			eb2->h_suballoc_blkno =
-				(fileOffset >> osb->sb->s_blocksize_bits) + i;
-			eb2->h_suballoc_node = osb->node_num;
-			eb2->h_blkno = 
-				(physicalOffset >> osb->sb->s_blocksize_bits) + i;
 			eb2->h_parent_blk = parent_blk;
 
 			parent_blk = last_eb_blkno = eb2->h_blkno;
@@ -1038,7 +897,6 @@
 	/* For the time being we are assuming that the newly allocated Extent */
 	/* will have one more entry to accomodate the latest allocation */
 
-	strcpy(eb1->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
 	ebl->l_tree_depth = 0;
 	ebl->l_count = ocfs2_extent_recs_per_eb(osb->sb);
 
@@ -1046,24 +904,19 @@
 	ebl->l_recs[i].e_clusters = new_clusters;
 	ebl->l_recs[i].e_blkno = blkno;
 	ebl->l_next_free_rec = i + 1;
+	OCFS_ASSERT(ebl->l_next_free_rec <= ebl->l_count);
 
-	eb1->h_suballoc_blkno =
-		(fileOffset >> osb->sb->s_blocksize_bits) +
-		numbhs - 1;
-	eb1->h_suballoc_node = osb->node_num;
-	eb1->h_blkno =
-		(physicalOffset >> osb->sb->s_blocksize_bits) +
-		numbhs - 1;
 	eb1->h_parent_blk = parent_blk;
 	eb1->h_next_leaf_blk = 0;
 
-	parent_blk = last_eb_blkno = eb1->h_blkno;
+	last_eb_blkno = eb1->h_blkno;
 
-	/* This needs to be a sync write OR journalled to be safe. */
-	status = ocfs_write_bhs(osb, bhs, numbhs, inode);
-	if (status < 0) {
-		LOG_ERROR_STATUS (status);
-		goto finally;
+	for(i = 0; i < numbhs; i++) {
+		status = ocfs_journal_dirty(handle, bhs[i]);
+		if (status < 0) {
+			LOG_ERROR_STATUS (status);
+			goto finally;
+		}
 	}
 
 	/* Update the Previous Last Data Extent with this new Data
@@ -1151,7 +1004,8 @@
 	LOG_TRACE_ARGS("fe->i_clusters = %u\n", fe->i_clusters);
 	fel->l_recs[0].e_cpos = 0;  /* FIXME: not needed */
 	fel->l_recs[0].e_clusters = fe->i_clusters + new_clusters;
-	fel->l_recs[0].e_blkno = phys_blkno;
+	eb1 = (ocfs2_extent_block *) bhs[0]->b_data;
+	fel->l_recs[0].e_blkno = eb1->h_blkno;
 	fel->l_next_free_rec = 1;
 	fe->i_last_eb_blk = last_eb_blkno;
 
@@ -1169,7 +1023,8 @@
 int ocfs_allocate_extent(ocfs_super *osb, struct buffer_head *fe_bh,
 			 ocfs_journal_handle *handle,
 			 u64 blkno, u32 new_clusters,
-			 struct inode *inode)
+			 struct inode *inode,
+			 ocfs2_alloc_context *meta_ac)
 {
 	int status = 0;
 	int IncreaseTreeDepth = 0;
@@ -1295,6 +1150,7 @@
 		el1->l_recs[k].e_clusters = new_clusters;
 		el1->l_recs[k].e_blkno = blkno;
 		el1->l_next_free_rec++;
+		OCFS_ASSERT(el1->l_next_free_rec <= el1->l_count);
 		UpdateParent = 1;
 	} else {
 		/* Read the last extent and keep traversing
@@ -1307,13 +1163,6 @@
 			parent_blk = 0;
 
 		for (i = 1; i < fel->l_tree_depth; i++) {
-			/* if we loop back around */
-			if (eb2) {
-				brelse(eb2_bh);
-				eb2 = NULL;
-				el2 = NULL;
-				eb2_bh =NULL;
-			}
 			status = ocfs_read_bh(osb,
 					      parent_blk << osb->sb->s_blocksize_bits,
 					      &eb2_bh,
@@ -1341,8 +1190,12 @@
 				break;
 			
 			parent_blk = eb2->h_parent_blk;
+			brelse(eb2_bh);
+			eb2 = NULL;
+			el2 = NULL;
+			eb2_bh = NULL;
 		} /* for (i = 1; i < fe->i_tree_depth; i++) */
-		
+
 		if (eb2) {
 			eb2 = NULL;
 			el2 = NULL;
@@ -1357,6 +1210,7 @@
 			IncreaseTreeDepth = 1;
 			goto increase_depth;
 		}
+
 		/* ok, we need to add a branch. pass in NULL
 		 * if we need a whole branch, otherwise the
 		 * extent which needs the new leaf */
@@ -1364,7 +1218,7 @@
 						     new_clusters,
 						     eb2_bh,
 						     &new_eb_blkno,
-						     handle, inode);
+						     handle, inode, meta_ac);
 		if (status < 0) {
 			LOG_ERROR_STATUS(status);
 			goto finally;
@@ -1478,7 +1332,7 @@
 
 		status = ocfs_grow_extent_tree(osb, fe_bh, handle,
 					       blkno, new_clusters,
-					       inode);
+					       inode, meta_ac);
 		if (status < 0) {
 			LOG_ERROR_STATUS(status);
 			goto finally;
@@ -1588,7 +1442,7 @@
 			} 
 			status = ocfs_handle_add_commit_bits(handle, 
 							   numBitsAllocated, 
-							   bitmapOffset, -1, 
+							   bitmapOffset, -1, 0, 
 							   DISK_ALLOC_VOLUME); 
 			if (status < 0) {
 				LOG_ERROR_STATUS (status);
@@ -1663,7 +1517,7 @@
 				num_clusters = ext->e_clusters;
 				bitmap_offset =
 					(u32)((ext->e_blkno << osb->sb->s_blocksize_bits) >> osb->s_clustersize_bits);
-				status = ocfs_handle_add_commit_bits(handle, num_clusters, bitmap_offset, -1, DISK_ALLOC_VOLUME);
+				status = ocfs_handle_add_commit_bits(handle, num_clusters, bitmap_offset, -1, 0, DISK_ALLOC_VOLUME);
 				if (status < 0) {
 					LOG_ERROR_STATUS (status);
 					goto bail;
@@ -1719,8 +1573,9 @@
 free_meta:
 		/* Free the metadata associated with this extent group */
 		status = ocfs_handle_add_commit_bits(handle, 1,
-						     cur_eb->h_suballoc_blkno << osb->sb->s_blocksize_bits,
+						     cur_eb->h_suballoc_bit,
 						     cur_eb->h_suballoc_node,
+						     cur_eb->h_blkno,
 						     DISK_ALLOC_EXTENT_NODE);
 		if (status < 0) {
 			LOG_ERROR_STATUS (status);
@@ -2084,7 +1939,7 @@
 						done = 1;
 						goto skip_bitmap_add;
 					}
-					status = ocfs_handle_add_commit_bits(handle, num_clusters, bitmap_offset, -1, DISK_ALLOC_VOLUME);
+					status = ocfs_handle_add_commit_bits(handle, num_clusters, bitmap_offset, -1, 0, DISK_ALLOC_VOLUME);
 					if (status < 0) {
 						LOG_ERROR_STATUS (status);
 						goto bail;
@@ -2159,7 +2014,7 @@
 				LOG_TRACE_ARGS("Popping this header (%llu)\n",
 					       alloc_eb->h_blkno);
 
-				status = ocfs_handle_add_commit_bits(handle, 1, alloc_eb->h_suballoc_blkno << osb->sb->s_blocksize_bits, alloc_eb->h_suballoc_node, DISK_ALLOC_EXTENT_NODE);
+				status = ocfs_handle_add_commit_bits(handle, 1, alloc_eb->h_suballoc_bit, alloc_eb->h_suballoc_node, alloc_eb->h_blkno, DISK_ALLOC_EXTENT_NODE);
 				if (status < 0) {
 					LOG_ERROR_STATUS (status);
 					goto bail;
@@ -2862,466 +2717,283 @@
 	return (status);
 }				/* ocfs_get_leaf_extent */
 
+void ocfs_free_alloc_context(ocfs2_alloc_context *ac)
+{
+	if (ac->ac_inode)
+		iput(ac->ac_inode);
+	if (ac->ac_bh)
+		brelse(ac->ac_bh);
+	kfree(ac);
+}
+
 /*
- * ocfs_find_contiguous_space_from_bitmap()
- *
- * This function looks for free space in the volume based on the bitmap.
- * It looks for contiguous space only and if it finds the space available
- * it returns a cluster bitmap offset. Each bit in Cluster bitmap represents
- * memory equal to cluster size (specified during format).
- *
- * TODO: The Bitmap stuff needs to be changed for handling more than 32 bits...
- * Although we can go upto 4k(clustersize) * 8 * 4M(max 32 bits for now...)
- *
- * Returns 0 on success, < 0 on error.
- *
- * Pass in 'lock_bh' and bitmap_inode only if you've already taken the 
- * vol_alloc semaphore, and you've done the acquire_lock on the bitmap.
+ * min_bits - minimum contiguous chunk of this total allocation that we
+ * can handle. Set it to the originally requested size for a fully
+ * contiguous allocation, or to '1' to indicate that we can deal with
+ * extents of any size.
  */
-static int ocfs_find_contiguous_space_from_bitmap(ocfs_super *osb,
-					   ocfs_journal_handle *handle,
-					   __u64 file_size,
-					   u32 *cluster_off,
-					   u32 *cluster_count,
-					   int sysfile,
-					   struct buffer_head *lock_bh,
-					   struct inode *bitmap_inode)
+int ocfs_claim_bits(ocfs_super *osb, 
+		    ocfs_journal_handle *handle, 
+		    ocfs2_alloc_context *ac,
+		    u32 min_bits,
+		    u32 *bit_off,
+		    u32 *num_bits)
 {
-	int status = 0, startbh, numblocks;
-	u32 bitoffset = 0, ClusterCount = 0;
-	__u64 ByteCount = 0;
-	__u32 LargeAlloc = 0;
-	static __u32 LargeAllocOffset = 0;
-	static __u32 SmallAllocOffset = 0;
-	struct buffer_head *bh = NULL;
-	ocfs2_dinode *bm_lock;
-	__u32 bitmapblocks; /* we only care about the valid blocks */
-	int local_lock = 0;
-	int local_inode = 0;
-	__u32 five_percent, free_bits;
+	int status;
 
-	LOG_ENTRY ();
+	LOG_ENTRY();
 
-	OCFS_ASSERT (osb);
+	OCFS_ASSERT(ac);
+	OCFS_ASSERT(ac->ac_bits_given < ac->ac_bits_wanted);
 
-	if ((bitmap_inode && !lock_bh) || (lock_bh && !bitmap_inode))
-		BUG();
+	OCFS_ASSERT(ac->ac_which == OCFS_AC_USE_LOCAL 
+		    || ac->ac_which == OCFS_AC_USE_MAIN);
+	OCFS_ASSERT(ac->ac_handle == handle);
 
-	if (!bitmap_inode) {
-		bitmap_inode = ocfs_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, -1);
-		if (!bitmap_inode) {
-			status = -EINVAL;
-			LOG_ERROR_STR("Could not get bitmap inode!");
-			goto leave;
-		}
-		local_inode = 1;
+	if (ac->ac_which == OCFS_AC_USE_LOCAL) {
+		status = ocfs_claim_local_alloc_bits(osb, 
+						     handle, 
+						     ac, 
+						     min_bits,
+						     bit_off, 
+						     num_bits);
+	} else {
+		status = ocfs_claim_main_bitmap_bits(osb, 
+						     handle, 
+						     ac,
+						     min_bits,
+						     bit_off,
+						     num_bits);
 	}
-
-	if (lock_bh) {
-		bh = lock_bh;
-	} else { /* local lock */
-		local_lock = 1;
-
-		ocfs_handle_add_inode(handle, bitmap_inode);
-
-		/* Get the allocation lock here */
-		status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE, 
-					   0, &bh, bitmap_inode);
-		if (status < 0) {
-			if (status != -EINTR)
-				LOG_ERROR_STATUS (status);
-			goto leave;
-		}
-		ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 
-				      0, bitmap_inode);
-	}
-
-	status = ocfs_journal_access(handle, bh, OCFS_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
-		LOG_ERROR_STATUS(status);
-		goto leave;
+		/* If you're not asking for contiguous blocks, then
+		 * the reserve function should've given you enough,
+		 * and ENOSPC is a real error. */
+		if ((status != -ENOSPC) || (min_bits == 1))
+			LOG_ERROR_STATUS(status);
+		goto bail;
 	}
 
-	bm_lock = (ocfs2_dinode *) bh->b_data;
+	ac->ac_bits_given += *num_bits;
 
-	ClusterCount = (u32) ((u64) (file_size + (osb->s_clustersize-1)) >> 
-				osb->s_clustersize_bits);
-	if (ClusterCount == 0) {
-		LOG_ERROR_STR ("DISK_FULL?: ClusterCount==0");
-		status = 0;
-		goto leave;
-	}
-	ByteCount = (u64)ClusterCount << osb->s_clustersize_bits;
-	if (ByteCount == 0) {
-		LOG_ERROR_STR ("DISK_FULL?: Bytecount==0");
-		status = 0;
-		goto leave;
-	}
+bail:
+	LOG_EXIT_STATUS(status);
+	return(status);
+}
 
-	free_bits = le32_to_cpu(bm_lock->id1.bitmap1.i_total) - 
-		le32_to_cpu(bm_lock->id1.bitmap1.i_used);
-	if ( (sysfile && ClusterCount > free_bits) || 
-	     (!sysfile && ClusterCount > 
-	         (free_bits - ((8 * ONE_MEGA_BYTE) >> osb->s_clustersize_bits))) ){
-		LOG_ERROR_ARGS("Disk Full: ClusterCount=%u, free_bits=%u, sysfile=%s\n",
-			       ClusterCount, free_bits, sysfile?"yes":"no");
-		status = -ENOSPC;
-		goto leave;
-	}
-	
-	/* This function will check for clear bits in the Bitmap for
-	 * consecutive */
-	/* clear bits equal to ClusterCount */
+static int ocfs_claim_main_bitmap_bits(ocfs_super *osb,
+				       ocfs_journal_handle *handle,
+				       ocfs2_alloc_context *ac,
+				       u32 min_bits,
+				       u32 *bit_off,
+				       u32 *num_bits)
+{
+	int status, num_blocks;
+	u32 bits_wanted;
+	u32 best_fit_bits;
+	ocfs2_dinode *fe;
+	struct inode *bitmap_inode;
+	struct buffer_head *bh;
 
-	/* If we create a chunk that is larger than 5% of the
-	 * disksize, then start */
-	/* allocation at 5%, so that small files stay in the beginning
-	 * as much as possible */
+	LOG_ENTRY();
+	OCFS_ASSERT(ac->ac_which == OCFS_AC_USE_MAIN);
 
-	five_percent = le32_to_cpu(bm_lock->id1.bitmap1.i_total) / 20;
-	if (ClusterCount > five_percent) {
-		LargeAlloc = 1;
-		LargeAllocOffset = five_percent;
-	}
+	bitmap_inode = ac->ac_inode;
+	bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
+	bh = ac->ac_bh;
+	fe = (ocfs2_dinode *) bh->b_data;
 
-	bitmapblocks =
-		ocfs_blocks_for_bits(osb->sb,
- 				     osb->cluster_bitmap.validbits);
-	
+	num_blocks = ocfs_blocks_for_bits(osb->sb,
+					  osb->cluster_bitmap.validbits);
 	/* Ok, somewhat lame, but we submit the whole bitmap for reading here*/
-	if (ocfs_read_bhs(osb, osb->bitmap_blkno << osb->sb->s_blocksize_bits, 
-			  bitmapblocks << osb->sb->s_blocksize_bits,
-			  osb->cluster_bitmap.chunk, OCFS_BH_CACHED, bitmap_inode)) {
-		LOG_ERROR_STATUS(-EIO);
-		goto leave;
+	status = ocfs_read_bhs(osb, osb->bitmap_blkno << osb->sb->s_blocksize_bits, num_blocks << osb->sb->s_blocksize_bits, osb->cluster_bitmap.chunk, OCFS_BH_CACHED, bitmap_inode);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
 	}
-	
-	bitoffset = ocfs_find_clear_bits (osb, &osb->cluster_bitmap, 
-					  ClusterCount,
-					  LargeAlloc ? LargeAllocOffset :
-					  SmallAllocOffset, sysfile ? 0 :
-					  ((8 * ONE_MEGA_BYTE) >>
-					   osb->s_clustersize_bits));
 
-	/* if fails we should try again from the beginning of the disk. */
-	/* in the end we pass # of bits we want to keep for system
-	 * file extention only.  */
-	/* Right now if we run out of diskspace, we still have 8mb
-	 * free for a systemfile */
-
-	if (bitoffset == -1 && LargeAlloc) {
-		LOG_TRACE_STR("Running low on diskspace.");
+	best_fit_bits = bits_wanted;
+	status = ocfs_find_clear_bits (osb, &osb->cluster_bitmap,
+				       bits_wanted, bit_off, 
+				       &best_fit_bits);
+	if (status < 0) {
 		osb->cluster_bitmap.failed++;
-		bitoffset = ocfs_find_clear_bits (osb, &osb->cluster_bitmap,
-						  ClusterCount, 0,
-						  sysfile ? 0 :
-						  ((8 * ONE_MEGA_BYTE) >>
-						   osb->s_clustersize_bits));
+		LOG_TRACE_STR("Running out of space!");
+		goto bail;
 	}
 
-	/* It returns -1 on failure, otherwise bitoffset points at the */
-	/* location inb bitmap from where there are ClusterCount no of bits */
-	/* are free.  */
-
-	if (bitoffset == -1) {
-		if (sysfile)
-			LOG_ERROR_ARGS ("Cannot allocate %u contiguous clusters for system file\n",
-					ClusterCount);
+	if (best_fit_bits < min_bits) {
+		LOG_TRACE_ARGS ("Cannot allocate %u contiguous clusters for "
+				"system file\n", min_bits);
 		status = -ENOSPC;
-		goto leave;
+		goto bail;
 	}
 
-	LOG_TRACE_ARGS ("setting %u bits at bit offset=%u\n", ClusterCount, bitoffset);
+	if (best_fit_bits != bits_wanted)
+		LOG_TRACE_ARGS("discontiguous allocation done: wanted = %u, "
+			       "best_fit = %u, bit_off = %u!\n", 
+			       bits_wanted, best_fit_bits, *bit_off);
 
-	ocfs_set_bits(osb->sb, handle, &osb->cluster_bitmap, bitoffset,
-		      ClusterCount);
+	/* when we do discontig. just change this line. */
+	*num_bits = best_fit_bits;
 
-	/* Ok, write out the bitmap now. We optimize only by writing
-	 * out the bitmap blocks which have changed, and not all of
-	 * them like before. */
-	numblocks = ocfs_bitmap_blocks_affected(osb->sb,
-						bitoffset,
-						ClusterCount,
-						&startbh);
+	/* cool, we've got some. set them now. */
+	status = ocfs_journal_access(handle, bh, OCFS_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
 
-	LOG_TRACE_ARGS("bitoffset = %u, ClusterCount = %u, startbh = %u, numblocks = %u\n", bitoffset, ClusterCount, startbh, numblocks);
+	fe->id1.bitmap1.i_used += *num_bits;
 
-	/* write the bitmap size info to the lock sector */
-	bm_lock->id1.bitmap1.i_used =
-		ocfs_count_bits(osb->sb, &osb->cluster_bitmap);
-
 	status = ocfs_journal_dirty(handle, bh);
 	if (status < 0) {
 		LOG_ERROR_STATUS (status);
-		goto leave;
+		goto bail;
 	}
 
-	*cluster_off = bitoffset;
-	*cluster_count = ClusterCount;
+	ocfs_set_bits(osb->sb, handle, &osb->cluster_bitmap, *bit_off,
+		      *num_bits);
+	atomic_inc(&osb->alloc_stats.bitmap_data);
+
+#warning "implement this"
+	/* At this point, we should see if local alloc was switched
+	 * off, and turn it back on if we have enough free bits in the
+	 * main bitmap.*/
 	status = 0;
+bail:
+	LOG_EXIT_STATUS(status);
+	return(status);
+}
 
-leave:
-	if (local_lock && bh)
-		brelse(bh);
-
-	if (local_inode)
-		iput(bitmap_inode);
-
-	LOG_EXIT_STATUS (status);
-	return status;
-}				/* ocfs_find_contiguous_space_from_bitmap */
-
-/*
- * ocfs_alloc_node_block()
- *
- * You need to be holding node_alloc_sem!
- */
-int ocfs_alloc_node_block(ocfs_super *osb, __u64 bytes_wanted,
-			  __u64 *DiskOffset, __u64 *file_off,
-			  __u32 NodeNum, __u32 Type,
-			  ocfs_journal_handle *handle)
+/* On success, the caller is responsible for freeing '*ac' when done with it. */
+int ocfs_reserve_bits(ocfs_super *osb, 
+		      ocfs_journal_handle *handle,
+		      u32 bits_wanted,
+		      ocfs2_alloc_context **ac)
 {
-	int status = 0;
-	int startbh, numblocks;
-	__u64 fileSize = 0;
-	__u64 numBytes = 0;
-	__u64 allocSize = 0;
-	__u64 prevFileSize = 0;
-	__u64 extent;
-	__u64 newFileSize;
-	__u64 bitMapSize;
-	ocfs_alloc_bm bitmap;
-	__u32 numBits = 0;
-	__u32 foundBit = -1;
-	__u32 blockSize = 0, blockSizeBits = 0;
-	int bm_file = 0;
-	int alloc_file = 0;
-	struct buffer_head *bh = NULL;
-	struct buffer_head *alloc_bh = NULL;
-	ocfs2_dinode *fe = NULL;
-	ocfs2_dinode *alloc_fe = NULL;
-	int needs_uninit = 0;
-	int delay_lockrel = 0;
-	struct inode *inode = NULL; /* alloc bitmap file inode */
-	struct inode *alloc_inode = NULL; /* alloc file inode */
+	int status;
 
-	LOG_ENTRY_ARGS("(bytes_wanted = (%llu), Type=%d)\n", bytes_wanted,Type);
+	LOG_ENTRY();
+	OCFS_ASSERT(handle);
 
-	switch (Type) {
-		case DISK_ALLOC_EXTENT_NODE:
-			bm_file = EXTENT_ALLOC_BITMAP_SYSTEM_INODE;
-			alloc_file = EXTENT_ALLOC_SYSTEM_INODE;
-			blockSize = osb->sb->s_blocksize;
-			blockSizeBits = osb->sb->s_blocksize_bits;
-			atomic_inc(&osb->alloc_stats.ext_allocs);
-			break;
-		case DISK_ALLOC_INODE:
-			bm_file = INODE_ALLOC_BITMAP_SYSTEM_INODE;
-			alloc_file = INODE_ALLOC_SYSTEM_INODE;
-			blockSize = osb->sb->s_blocksize;
-			blockSizeBits = osb->sb->s_blocksize_bits;
-			atomic_inc(&osb->alloc_stats.inode_allocs);
-			break;
-		default:
-			status = -EINVAL;
-			LOG_ERROR_STATUS(status);
-			goto leave;
+	*ac = kmalloc(sizeof(ocfs2_alloc_context), GFP_KERNEL);
+	if (!(*ac)) {
+		status = -ENOMEM;
+		LOG_ERROR_STATUS(status);
+		goto bail;
 	}
+	memset(*ac, 0, sizeof(ocfs2_alloc_context));
+	(*ac)->ac_bits_wanted = bits_wanted;
+	(*ac)->ac_handle = handle;
 
-	inode = ocfs_get_system_file_inode(osb, bm_file, NodeNum);
-	if (!inode) {
-		LOG_ERROR_STATUS(status=-EINVAL);
-		goto leave;
+	status = -ENOSPC;
+	if (ocfs_alloc_should_use_local(osb, bits_wanted)) {
+		status = ocfs_reserve_local_alloc_bits(osb, 
+						       handle, 
+						       bits_wanted, 
+						       *ac);
+		if ((status < 0) && (status != -ENOSPC)) {
+			LOG_ERROR_STATUS(status);
+			goto bail;
+		} else if (status == -ENOSPC) {
+#warning "need to deal with disabling local alloc better"
+			/* ocfs_reserve_local_alloc_bits returns -ENOSPC
+			 * with the local alloc inode still locked, so we
+			 * can change this safely here. */
+			LOG_ERROR_STR("Disabling local alloc");
+			osb->have_local_alloc = 0;
+//			ocfs_shutdown_local_alloc(osb);
+		} else 
+			(*ac)->ac_which = OCFS_AC_USE_LOCAL;
 	}
-	alloc_inode = ocfs_get_system_file_inode(osb, alloc_file, NodeNum);
-	if (!alloc_inode) {
-		LOG_ERROR_STATUS(status=-EINVAL);
-		goto leave;
-	}
 
-	/* Allocate a block of size blocksize from the relevant file/bitmap */
-	OCFS_ASSERT (blockSize);
-
-	status = ocfs_acquire_lock (osb, OCFS_LKM_EXMODE,
-			     0, &bh, inode);
-	if (status < 0) {
-		LOG_ERROR_STATUS (status);
-		goto leave;
-	}
-
-	ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 
-			     0, inode);
-
-	numBits = ((bytes_wanted + (blockSize-1)) >> blockSizeBits);
-	numBytes = (u64)numBits << blockSizeBits;
-
-	/* Read in the bitmap file for the alloc and look for the
-	 * required space, if found */
-	fe = (ocfs2_dinode *) bh->b_data;
-	prevFileSize = fileSize = fe->i_size;
-	allocSize = (u64)fe->i_clusters << osb->s_clustersize_bits;
-
-	if ((fileSize != 0) && (allocSize != 0)) {
-		ocfs_initialize_bitmap(osb->sb, &bitmap,
-				       (__u32)fileSize * 8,
-				       (__u32)allocSize * 8);
-		needs_uninit = 1;
-
-		status = ocfs_read_system_file(osb, bm_file, NodeNum,
-					       bitmap.chunk, allocSize);
+	if (status == -ENOSPC) {
+		status = ocfs_reserve_main_bitmap_bits(osb, 
+						       handle, 
+						       bits_wanted,
+						       *ac);
 		if (status < 0) {
-			LOG_ERROR_STATUS (status);
-			goto leave;
+			LOG_ERROR_STATUS(status);
+			goto bail;
 		}
+		(*ac)->ac_which = OCFS_AC_USE_MAIN;
+	}
 
-		foundBit = ocfs_find_clear_bits(osb, &bitmap, numBits, 0, 0);
+	status = 0;
+bail:
+	if ((status < 0) && *ac) {
+		ocfs_free_alloc_context(*ac);
+		*ac = NULL;
 	}
 
-	/* It returns -1 on failure , otherwise ByteOffset points at the */
-	/* location in bitmap from where there are ClusterCount no of bits */
-	/* are free. */
+	LOG_EXIT_STATUS(status);
+	return(status);
+}
 
-	if (foundBit == -1) {
-		/* if not found add more allocation to the file and try again. */
-		//extent = ONE_MEGA_BYTE;
-		extent = ( ((numBits * blockSize) + (ONE_MEGA_BYTE-1)) >> 20 ) << 20;
+int ocfs_reserve_main_bitmap_bits(ocfs_super *osb, 
+				  ocfs_journal_handle *handle,
+				  u32 bits_wanted,
+				  ocfs2_alloc_context *ac)
+{
+	int status = 0;
+	struct inode *bitmap_inode;
+	struct buffer_head *bh = NULL;
+	ocfs2_dinode *fe;
+	u32 free_bits;
 
-#warning maybe take this out and put a verifyupdateinode in here
-		status = ocfs_read_bh(osb,
-				      OCFS_I(alloc_inode)->ip_blkno << alloc_inode->i_sb->s_blocksize_bits, 
-			&alloc_bh, OCFS_BH_CACHED, alloc_inode);
-		if (status < 0) {
-			LOG_ERROR_STATUS (status);
-			goto leave;
-		}
+	LOG_ENTRY();
 
-		alloc_fe = (ocfs2_dinode *) alloc_bh->b_data;
-		newFileSize = alloc_fe->i_size;
-		allocSize = (u64)alloc_fe->i_clusters << osb->s_clustersize_bits;
-		if (newFileSize != alloc_inode->i_size ||
-		    allocSize != OCFS_I(alloc_inode)->ip_alloc_size) {
-			LOG_ERROR_ARGS("aha! alloc inode was out of date! "
-			       		"newFileSize=%llu, i_size=%llu, "
-			       		"allocSize=%llu, ip_alloc_size=%llu\n",
-			       		newFileSize, alloc_inode->i_size,
-			       		allocSize, OCFS_I(alloc_inode)->ip_alloc_size);
-		}
+	OCFS_ASSERT(!(handle->flags & OCFS_HANDLE_STARTED));
 
-		//newFileSize = alloc_inode->i_size;
-		//allocSize = OCFS_I(alloc_inode)->alloc_size;
+	bitmap_inode = ocfs_get_system_file_inode(osb, 
+						  GLOBAL_BITMAP_SYSTEM_INODE, 
+						  -1);
+	if (!bitmap_inode) {
+		status = -EINVAL;
+		LOG_ERROR_STR("Could not get bitmap inode!");
+		goto bail;
+	}
 
-		/* This is for OUI optimzation to allocate more disk
-		 * space for directory allocations */
-		
-		if (allocSize > 0)
-			extent *= 2;
-		
-		LOG_TRACE_ARGS("extending the alloc file to %llu\n",
-		       newFileSize + extent);
-		status = ocfs_extend_file(osb, newFileSize + extent, 
-					  handle, alloc_inode, NULL, 1,
-					  NULL);
-		if (status < 0) {
+	status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE, 
+				   0, &bh, bitmap_inode);
+	if (status < 0) {
+		if (status != -EINTR)
 			LOG_ERROR_STATUS (status);
-			goto leave;
-		}
-		
-		newFileSize += extent;
-		bitMapSize = newFileSize >> (blockSizeBits+3);
-
-		/* Does this need the buffer_head? if so, we need to
-		 * do a put_data first! */
-		/* Calculate the new bitmap size */
-		
-		LOG_TRACE_ARGS("extending the bitmap file to %llu\n", bitMapSize);
-		status = ocfs_extend_file(osb, bitMapSize,
-					  handle, inode, NULL, 1, bh);
-		if (status < 0) {
-			LOG_ERROR_STATUS (status);
-			goto leave;
-		}
-		fe = (ocfs2_dinode *) bh->b_data;
-		/* we wrote it back out in ocfs_extend_system_file so
-		 * we can trust the sizes here */
-		fileSize = fe->i_size;
-		allocSize = (u64)fe->i_clusters << osb->s_clustersize_bits;
-		LOG_TRACE_ARGS("fileSize=%llu, allocSize=%llu\n",
-		       fileSize, allocSize);
-
-		if (needs_uninit)
-			ocfs_reinitialize_bitmap(osb->sb, &bitmap,
-						 fileSize * 8, 
-						 allocSize * 8);
-		else
-			ocfs_initialize_bitmap(osb->sb, &bitmap,
-					       fileSize * 8, 
-					       allocSize * 8);
-		needs_uninit = 1;
-
-		status = ocfs_read_system_file(osb, bm_file, NodeNum, 
-					       bitmap.chunk, allocSize);
-		if (status < 0) {
-			LOG_ERROR_STATUS (status);
-			goto leave;
-		}
-		
-		foundBit = ocfs_find_clear_bits(osb, &bitmap, numBits, 0, 0);
-
-		delay_lockrel = 1;
-		if (Type == DISK_ALLOC_EXTENT_NODE)
-			atomic_inc(&osb->alloc_stats.ext_extends);
-		else if (Type == DISK_ALLOC_INODE)
-			atomic_inc(&osb->alloc_stats.inode_extends);
+		goto bail;
 	}
+	ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 
+			     0, bitmap_inode);
+	ocfs_handle_add_inode(handle, bitmap_inode);
 
-	LOG_TRACE_ARGS ("bit offset=%d, num=%d\n", foundBit, numBits);
+	fe = (ocfs2_dinode *) bh->b_data;
+	free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) - 
+		le32_to_cpu(fe->id1.bitmap1.i_used);
 
-	ocfs_set_bits(osb->sb, handle, &bitmap, foundBit, numBits);
-
-	/* only write out what has changed... */
-	numblocks = ocfs_bitmap_blocks_affected(osb->sb,
-						foundBit,
-						numBits,
-						&startbh);
-
-	LOG_TRACE_ARGS ("offset=%u, type=%x, blksz=%u, foundbit=%u, fileid=%u\n",
-			foundBit * blockSize, Type, blockSize, foundBit, alloc_file);
-
-	status = ocfs_lookup_file_allocation(osb, foundBit * blockSize, 
-					     DiskOffset, osb->sb->s_blocksize, NULL,
-					     alloc_inode, 1);
-	if (status < 0 || *DiskOffset == 0) {
-		if (!status)
-			status = -EINVAL;
-		LOG_ERROR_STATUS(status);
-		goto leave;
+	if (bits_wanted > free_bits) {
+		LOG_ERROR_ARGS("Disk Full: wanted=%u, free_bits=%u\n",
+			       bits_wanted, free_bits);
+		status = -ENOSPC;
+		goto bail;
 	}
 
-	*file_off = (__u64) ((__u64) foundBit * (__u64) blockSize);
+	/* Ok, done - we've determined that there's enough space in
+	 * the bitmap. Actually finding it is the job of the
+	 * allocation function now. We keep things locked so that the
+	 * bitmap can't change underneath us. */
+	ac->ac_inode = igrab(bitmap_inode);
+	get_bh(bh);
+	ac->ac_bh = bh;
 
-	/* this can just fall through */
-	if (*file_off == 0) {
-		LOG_TRACE_ARGS ("offset=%llu, type=%x, blksz=%u, foundbit=%u\n",
-			*file_off, Type, blockSize, foundBit);
-	}
-
-leave:
-	if (needs_uninit)
-		ocfs_uninitialize_bitmap(&bitmap);
-
-	if (inode)
-		iput(inode);
-	if (alloc_inode)
-		iput(alloc_inode);
-
-	if (bh != NULL)
+bail:
+	if (bitmap_inode)
+		iput(bitmap_inode);
+	if (bh)
 		brelse(bh);
-	if (alloc_bh != NULL)
-		brelse(alloc_bh);
 
-	LOG_EXIT_STATUS (status);
-	return status;
-}				/* ocfs_alloc_node_block */
+	LOG_EXIT_STATUS(status);
+	return(status);
+}
 
 /*
  * ocfs_free_file_extents()
@@ -3334,7 +3006,7 @@
 	int status = 0;
 	__u32 i;
 	__u32 numBitsAllocated = 0, bitmapOffset = 0;
-	ocfs2_extent_block *extent = NULL;
+	ocfs2_extent_block *extent;
 	struct buffer_head *extent_bh = NULL;
 	ocfs2_extent_list *fel;
 	ocfs2_dinode *fe;
@@ -3342,12 +3014,7 @@
 	LOG_ENTRY ();
 
 	fe = (ocfs2_dinode *) fe_bh->b_data;
-	if (OCFS_I(inode)->ip_blkno != fe->i_blkno)
-		BUG();
 
-	if (inode)
-		SET_BH_SEQNUM(inode, fe_bh);
-
 	fe = (ocfs2_dinode *) fe_bh->b_data;
 	fel = &fe->id2.i_list;
 
@@ -3360,7 +3027,7 @@
 				   osb->s_clustersize_bits);
 
 			ocfs_handle_add_commit_bits(handle, numBitsAllocated, 
-						    bitmapOffset, -1, 
+						    bitmapOffset, -1, 0, 
 						    DISK_ALLOC_VOLUME);
 		}
 	} else {
@@ -3380,7 +3047,7 @@
 				LOG_ERROR_STATUS(status);
 				goto leave;
 			}
-			extent = NULL;
+
 			status = ocfs_kill_this_tree(osb, extent_bh, handle, inode);
 			if (status < 0) {
 				LOG_ERROR_STATUS(status);
@@ -3399,803 +3066,3 @@
 	return status;
 }				/* ocfs_free_file_extents */
 
-/* Some constants and functions that control how we allocate and use
- * local alloc bitmaps. These are intended to be easily
- * tunable. Possibly even remove them once we've found a good mix. */
-
-/* The largest cluster size where we even consider using local alloc. */
-#define OCFS_LOCAL_ALLOC_MAX_CSIZE    (128 * 1024)
-/* The largest allocation to use the local bitmap for. */
-#define OCFS_LOCAL_ALLOC_MAX_ALLOC    (2 * 1024 * 1024)
-
-/* 
- * ocfs_local_alloc_window_bits
- * 
- * Determine how large our local alloc window should be, in bits. This
- * is entirely changeable -- just replace this function. Right now as
- * a *testing* default, we have a function that takes cluster size
- * into account in the following manner: 
- *
- * 4k -> 1024 bits, 8k -> 512 bits, 16k -> 256 bits, 
- * 32/64/128k -> 64 bits
- */
-static inline int ocfs_local_alloc_window_bits(ocfs_super *osb)
-{
-	int numbits;
-
-	switch (osb->s_clustersize) {
-	case (4*1024):
-		numbits = 1024;
-		break;
-
-	case (8*1024):
-		numbits = 512;
-		break;
-
-	case (16*1024):
-		numbits = 256;
-		break;
-
-	default:
-		numbits = 64;
-		break;
-	}
-	return(numbits);
-} /* ocfs_local_alloc_window_bits */
-
-/*
- * ocfs_alloc_count_bits
- */
-static __u32 ocfs_alloc_count_bits(ocfs2_dinode *alloc)
-{
-	int i;
-	__u8 tmp;
-	__u8 *buffer;
-	__u32 count = 0;
-
-	LOG_ENTRY();
-
-	buffer = LOCAL_ALLOC(alloc)->la_bitmap;
-	for (i = 0; i < LOCAL_ALLOC(alloc)->la_size; i++) {
-		memcpy(&tmp, buffer, 1);
-		count+= BITCOUNT(tmp);
-		buffer++;
-	}
-
-	LOG_EXIT_ULONG ((unsigned long)count);
-	return(count);
-} /* ocfs_alloc_count_bits */
-
-/*
- * ocfs_clear_local_alloc
- */
-static void ocfs_clear_local_alloc(ocfs2_dinode *alloc) 
-{
-	int i;
-	LOG_ENTRY();
-
-	LOCAL_ALLOC(alloc)->la_bm_bits = 0;
-	LOCAL_ALLOC(alloc)->la_bits_set = 0;
-	LOCAL_ALLOC(alloc)->la_bm_off = 0;
-	for(i = 0; i < LOCAL_ALLOC(alloc)->la_size; i++)
-		LOCAL_ALLOC(alloc)->la_bitmap[i] = 0;
-
-	LOG_EXIT();
-	return;
-} /* ocfs_clear_local_alloc */
-
-/* 
- * ocfs_sync_local_to_main
- *
- * sync the local alloc to main bitmap. 
- *
- * assumes you've already locked the main bitmap -- the bitmap inode
- * passed is used for caching.
- */
-static int ocfs_sync_local_to_main(ocfs_super *osb, 
-				   ocfs_journal_handle *handle, 
-				   ocfs2_dinode *alloc,
-				   struct inode *main_bm_inode)
-{
-	int status = 0;
-	int bit_off, left;
-	void *bitmap;
-	unsigned int start, numblocks, bitmapblocks;
-
-	LOG_ENTRY_ARGS("alloc->la_bm_bits = %u, COUNT = %u, la_bits_set = %u\n", 
-		       LOCAL_ALLOC(alloc)->la_bm_bits,
-		       ocfs_alloc_count_bits(alloc), 
-		       LOCAL_ALLOC(alloc)->la_bits_set);
-
-	if (LOCAL_ALLOC(alloc)->la_bm_bits == 0) {
-		LOG_TRACE_STR("nothing to sync!");
-		goto bail;
-	}
-
-	bitmapblocks =
-		ocfs_blocks_for_bits(osb->sb,
- 				     osb->cluster_bitmap.validbits);
-
-	/* figure out which block in the bitmap to start on and the
-	 * maximum number of blocks we can span over -- we don't need
-	 * to read any more as that's the most we'll be touching... */
-	numblocks = ocfs_bitmap_blocks_affected(osb->sb,
-						LOCAL_ALLOC(alloc)->la_bm_off,
-						LOCAL_ALLOC(alloc)->la_bits_set,
-						&start);
-
-	if ((start + numblocks) > bitmapblocks) {
-		printk("uhoh, bitmap calculation is bad!\n");
-		printk("alloc->la_bm_bits = %u, COUNT = %u, alloc->la_bits_set = %u"
-		       "start=%u, alloc->la_bm_off = %u, numblocks=%u, "
-		       "bitmapblocks = %u\n",
-		       LOCAL_ALLOC(alloc)->la_bm_bits, ocfs_alloc_count_bits(alloc), 
-		       LOCAL_ALLOC(alloc)->la_bits_set, start, LOCAL_ALLOC(alloc)->la_bm_off, numblocks,
-		       bitmapblocks);
-
-		BUG();
-	}
-
-	LOG_TRACE_ARGS("start=%u, alloc->la_bm_off = %u, numblocks=%u\n", start, 
-		       LOCAL_ALLOC(alloc)->la_bm_off, numblocks);
-	status = ocfs_read_bhs(osb,
-			       (osb->bitmap_blkno + start) << osb->sb->s_blocksize_bits,
-			       numblocks << osb->sb->s_blocksize_bits,
-			       &osb->cluster_bitmap.chunk[start], OCFS_BH_CACHED, 
-			       main_bm_inode);
-	if (status < 0) {
-		LOG_ERROR_STATUS(status);
-		goto bail;
-	}
-
-	bitmap = LOCAL_ALLOC(alloc)->la_bitmap;
-	/* any unset bits in local alloc need to be unset in bitmap. */
-	bit_off = 0;
-	left = LOCAL_ALLOC(alloc)->la_bm_bits;
-	while ((bit_off = find_next_zero_bit(bitmap, left, bit_off)) 
-	       != -1) {
-		if (bit_off >= left) {
-			/*LOG_TRACE_ARGS("bit_off (%d) >= left\n", bit_off);*/
-			break;
-		}
-
-		LOG_TRACE_ARGS("Clearing bit %u in main bitmap\n", 
-			       bit_off + LOCAL_ALLOC(alloc)->la_bm_off);
-		ocfs_clear_bits(osb->sb, handle, &osb->cluster_bitmap, 
-				bit_off + LOCAL_ALLOC(alloc)->la_bm_off,
-				1);
-		bit_off++;
-	}
-
-bail:
-
-	LOG_EXIT_STATUS(status);
-	return(status);
-} /* ocfs_sync_local_to_main */
-
-/*
- * This essentially does the same thing as sync_local_to_main, but
- * without a journal handle -- used during shutdown and recovery.
- */
-static int ocfs_sync_local_from_shutdown(ocfs_super *osb, 
-					 ocfs_bitmap_free_head **f, 
-					 struct buffer_head *local_alloc_bh, 
-					 int in_recovery)
-{
-	int status = 0;
-	int bit_off, left;
-	ocfs2_dinode *alloc = NULL;
-	void *bitmap;
-
-	LOG_ENTRY();
-
-	if (!local_alloc_bh)
-		BUG();
-
-	alloc = (ocfs2_dinode *) local_alloc_bh->b_data;
-	if (LOCAL_ALLOC(alloc)->la_bm_bits == 0) {
-		LOG_TRACE_STR("nothing to sync!");
-		goto bail;
-	}
-
-	if (!(*f)) {
-		*f = ocfs_alloc_bitmap_free_head();
-		if (*f == NULL) {
-			LOG_ERROR_STATUS(-ENOMEM);
-			goto bail;
-		}
-	}
-
-	alloc = (ocfs2_dinode *) local_alloc_bh->b_data;
-
-	LOG_TRACE_ARGS("alloc->la_bm_bits = %u, COUNT = %u, la_bits_set = %u\n", 
-		       LOCAL_ALLOC(alloc)->la_bm_bits,
-		       ocfs_alloc_count_bits(alloc), 
-		       LOCAL_ALLOC(alloc)->la_bits_set);
-
-	bitmap = LOCAL_ALLOC(alloc)->la_bitmap;
-
-	/* any unset bits in local alloc need to be unset in bitmap. */
-	bit_off = 0;
-	left = LOCAL_ALLOC(alloc)->la_bm_bits;
-	while ((bit_off = find_next_zero_bit(bitmap, left, bit_off)) 
-	       != -1) {
-		if (bit_off >= left) {
-			/*LOG_TRACE_ARGS("bit_off (%d) >= left\n", bit_off);*/
-			break;
-		}
-	     /* LOG_TRACE_ARGS("Clearing bit %u in main bitmap\n", bit_off);*/
-		status = ocfs_add_to_bitmap_free_head(osb, 
-						 *f, 1, 
-						 bit_off + LOCAL_ALLOC(alloc)->la_bm_off,
-						 -1, DISK_ALLOC_VOLUME);
-		if (status < 0) {
-			ocfs_free_bitmap_free_head(*f);
-			*f = NULL;
-		}
-		bit_off++;
-	}
-
-bail:
-	LOG_EXIT_STATUS(status);
-	return(status);
-} /* ocfs_sync_local_from_shutdown */
-
-/*
- * ocfs_alloc_new_window
- *
- * pass it the bitmap lock in lock_bh if you have it. 
- */
-static int ocfs_alloc_new_window(ocfs_super *osb, struct buffer_head *lock_bh, 
-				 struct inode *bm_inode, 
-				 ocfs_journal_handle *handle)
-{
-	int status = 0;
-	__u64 alloc_bytes;
-	u32 cluster_off, cluster_count;
-	ocfs2_dinode *alloc = NULL;
-
-	LOG_ENTRY();
-
-	alloc = (ocfs2_dinode *) osb->local_alloc_bh->b_data;
-	if (LOCAL_ALLOC(alloc)->la_bm_bits != 0)
-		LOG_TRACE_STR("asking me to alloc a new window over a"
-			      " non-empty one");
-
-	/* we try to use find_contig_space_from_bitmap here for now. */
-	alloc_bytes = (u64)ocfs_local_alloc_window_bits(osb) << osb->s_clustersize_bits;
-	LOG_TRACE_ARGS("Allocating %llu bytes (%u clusters) for a "
-		       "new window.\n", alloc_bytes, 
-		       ocfs_local_alloc_window_bits(osb));
-
-	status = ocfs_find_contiguous_space_from_bitmap(osb, handle, 
-							alloc_bytes, 
-							&cluster_off, 
-							&cluster_count, 0, 
-							lock_bh, bm_inode);
-	if (status < 0) {
-		LOG_ERROR_STATUS(status);
-		goto bail;
-	}
-	atomic_inc(&osb->alloc_stats.bitmap_data);
-
-	alloc = (ocfs2_dinode *) osb->local_alloc_bh->b_data;
-
-	LOCAL_ALLOC(alloc)->la_bm_off = cluster_off;
-	LOCAL_ALLOC(alloc)->la_bm_bits = cluster_count;
-	/* just in case... In the future when we find space ourselves,
-	 * we don't have to get all contiguous -- but we'll have to
-	 * set all previously used bits in bitmap and update
-	 * la_bits_set before setting the bits in the main bitmap. */
-	LOCAL_ALLOC(alloc)->la_bits_set = 0;
-	memset(LOCAL_ALLOC(alloc)->la_bitmap, 0,
-	       LOCAL_ALLOC(alloc)->la_size);
-
-	LOG_TRACE_STR("New window allocated:");
-	LOG_TRACE_ARGS("window la_bm_off = %u\n",
-		       LOCAL_ALLOC(alloc)->la_bm_off);
-	LOG_TRACE_ARGS("window la_bm_bits = %u\n",
-		       LOCAL_ALLOC(alloc)->la_bm_bits);
-
-bail:
-	LOG_EXIT_STATUS(status);
-	return(status);
-} /* ocfs_alloc_new_window */
-
-/*
- * ocfs_local_find_clear_bits
- */
-static int ocfs_local_find_clear_bits(ocfs_super *osb,
-				      ocfs2_dinode *alloc,
-				      __u32 numbits)
-{
-	int numfound, bitoff, left, startoff, lastzero;
-	void *bitmap = NULL;
-
-	LOG_ENTRY_ARGS("(numbits wanted = %u)\n", numbits);
-
-	bitmap = LOCAL_ALLOC(alloc)->la_bitmap;
-
-	numfound = bitoff = startoff = 0;
-	lastzero = -1;
-	left = LOCAL_ALLOC(alloc)->la_bm_bits;
-	while ((bitoff = find_next_zero_bit(bitmap, left, startoff)) != -1) {
-		if (bitoff == left) {
-			/* LOG_TRACE_ARGS("bitoff (%d) == left", bitoff); */
-			break;
-		}
-		/* LOG_TRACE_ARGS("Found a zero: bitoff = %d, startoff = %d, "
-		   "numfound = %d\n", bitoff, startoff, numfound);*/
-
-		/* Ok, we found a zero bit... is it contig. or do we
-		 * start over?*/
-		if (bitoff == startoff) {
-			/* we found a zero */
-			numfound++;
-			startoff++;
-		} else {
-			/* got a zero after some ones */
-			numfound = 1;
-			startoff = bitoff+1;
-		}
-		/* we got everything we needed */
-		if (numfound == numbits) {
-			/* LOG_TRACE_STR("Found it all!"); */
-			break;
-		}
-	}
-
-	LOG_TRACE_ARGS("Exiting loop, bitoff = %d, numfound = %d\n", bitoff, 
-		       numfound);
-
-	if (numfound == numbits)
-		bitoff = startoff - numfound;
-	else
-		bitoff = -1;
-
-	LOG_EXIT_STATUS(bitoff);
-	return(bitoff);
-} /* ocfs_local_find_clear_bits */
-
-/*
- * ocfs_find_space_from_local
- */
-static int ocfs_find_space_from_local(ocfs_super *osb, u32 bitswanted, 
-				      u32 *bitoff, u32 *bitcount, 
-				      ocfs_journal_handle *handle)
-{
-	ocfs2_dinode *alloc;
-	int status = 0, tmpstat;
-	int startoff, tmpoff;
-	__u32 tmpwanted;
-	/* main bitmap variables. */
-	struct buffer_head *main_bm_bh = NULL;
-	struct inode *main_bm_inode = NULL;
-	void *bitmap;
-
-	LOG_ENTRY_ARGS("(bitswanted = %u)\n", bitswanted);
-
-	if (!osb->have_local_alloc) {
-		status = -ENOSPC;
-		goto bail;
-	}
-
-	if (bitswanted > ocfs_local_alloc_window_bits(osb)) {
-		LOG_TRACE_STR("Asking for more than my max window size!\n");
-		status = -ENOSPC;
-		goto bail;
-	}
-
-	status = ocfs_journal_access(handle, osb->local_alloc_bh, 
-				     OCFS_JOURNAL_ACCESS_WRITE);
-	if (status < 0) {
-		LOG_ERROR_STATUS(status);
-		goto bail;
-	}
-
-	alloc = (ocfs2_dinode *) osb->local_alloc_bh->b_data;
-tryagain:
-	/* If we need to initialize a new window, do so now. */
-	if (LOCAL_ALLOC(alloc)->la_bm_bits == 0) {
-		LOG_TRACE_STR("Allocating a new window...");
-
-		status = ocfs_alloc_new_window(osb, main_bm_bh, main_bm_inode, 
-					       handle);
-		if (status < 0) {
-			if (status != -ENOSPC)
-				LOG_ERROR_STATUS(status);
-
-			/* it may not have been dirtied yet... */
-			tmpstat = ocfs_journal_dirty(handle, 
-						     osb->local_alloc_bh);
-			if (tmpstat < 0)
-				LOG_ERROR_STATUS(tmpstat);
-
-			goto bail;
-		}
-		atomic_inc(&osb->alloc_stats.moves);
-	}
-
-	/* Alright, try to satisfy the request. */
-	startoff = ocfs_local_find_clear_bits(osb, alloc, bitswanted);
-	if (startoff == -1) {
-		/* we couldn't get enough bits from the local
-		 * alloc. Lets sync what we've got to the main bitmap,
-		 * clear the local out and try again. */
-		LOG_TRACE_STR("Could not find enough contiguous bits in local "
-			      "alloc bitmap, trying to move my window.");
-
-		if (!main_bm_inode)
-			main_bm_inode = ocfs_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, -1);
-
-		if (!main_bm_inode) {
-			status = -EINVAL;
-			LOG_ERROR_STATUS (status);
-			goto bail;
-		}
-
-		/* lock bitmap here */
-		ocfs_handle_add_inode(handle, main_bm_inode);
-
-		/* Get the allocation lock here */
-		status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE, 
-					   0, &main_bm_bh, main_bm_inode);
-		if (status < 0) {
-			main_bm_bh = NULL;
-			if (status != -EINTR)
-				LOG_ERROR_STATUS (status);
-			goto bail;
-		}
-
-		ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 
-				     0, main_bm_inode);
-
-		status = ocfs_sync_local_to_main(osb, handle, alloc,
-						 main_bm_inode);
-		if (status < 0) {
-			LOG_ERROR_STATUS(status);
-			goto bail;
-		}
-
-		ocfs_clear_local_alloc(alloc);
-
-		goto tryagain;
-	}
-
-	LOG_TRACE_ARGS("Found %u bits, starting at local alloc offset %d\n",
-		       bitswanted, startoff);
-
-	/* Ok, if we've got this far then the search suceeded and we
-	 * can mark the bitmap. */
-	bitmap = LOCAL_ALLOC(alloc)->la_bitmap;
-	tmpoff = startoff;
-	tmpwanted = bitswanted;
-	while(tmpwanted--) {
-		/* LOG_TRACE_ARGS("setting bit %d\n", tmpoff); */
-		set_bit(tmpoff++, bitmap);
-	}
-	LOCAL_ALLOC(alloc)->la_bits_set += bitswanted;
-
-	*bitoff = LOCAL_ALLOC(alloc)->la_bm_off + startoff;
-	*bitcount = bitswanted;
-
-	status = ocfs_journal_dirty(handle, osb->local_alloc_bh);
-	if (status < 0) {
-		LOG_ERROR_STATUS(status);
-		goto bail;
-	}
-bail:
-	/* if we locked the main bitmap, cleanup after ourselves. */
-	if (main_bm_bh)
-		brelse(main_bm_bh);
-
-	if (main_bm_inode)
-		iput(main_bm_inode);
-
-	LOG_EXIT_STATUS(status);
-	return(status);
-} /* ocfs_find_space_from_local */
-
-
-/*
- * ocfs_find_space
- *
- * A drop-in replacement for
- * ocfs_find_contiguous_space_from_bitmap. We will in fact, call
- * ocfs_find_contiguous_space_from_bitmap if you don't give us a
- * journal handle, or if the local bitmap isn't loaded, or if the
- * allocation is simply to big to fit in the local one. otherwise,
- * we'll try to use our local alloc instead.
- *
- */
-int ocfs_find_space(ocfs_super *osb, __u64 file_size,
-		    u32 *cluster_off, u32 *cluster_count, int sysfile,
-		    ocfs_journal_handle *handle)
-{
-	int status = 0;
-	u32 bitswanted;
-	int use_global = 1;
-	struct inode *local_alloc_inode = NULL;
-
-	LOG_ENTRY_ARGS("(file_size = (%llu), handle = 0x%p, sysfile = %s)\n", 
-		       file_size, handle, sysfile ? "true" : "false");
-
-	if (file_size == 0) {
-		LOG_ERROR_STR ("asking for an allocation of zero bytes...");
-		status = 0;
-		goto bail;
-	}
-
-	/* need to calculate a couple of things for below... */
-	bitswanted = ocfs_clusters_for_bytes(osb->sb, file_size);
-
-	/* Ok, now decide if we can use local alloc bitmap.
-	 * We *always* use global bitmap for clustersize > 128k,
-	 * file_size > 2mb, so force it under these conditions. */
-	if (handle 
-	    && osb->have_local_alloc
-	    && file_size <= OCFS_LOCAL_ALLOC_MAX_ALLOC) {
-		use_global = 0;
-
-		local_alloc_inode = 
-			ocfs_get_system_file_inode(osb, 
-						   LOCAL_ALLOC_SYSTEM_INODE,
-						   osb->node_num);
-
-		if (!local_alloc_inode) {
-			status = -EFAIL;
-			LOG_ERROR_STATUS(status);
-			goto bail;
-		}
-		ocfs_handle_add_inode(handle, local_alloc_inode);
-
-		/* If the local alloc has been disabled while we were
-		 * waiting for another process to finish with it, then
-		 * find_space_from_local will return -ENOSPC and we'll
-		 * continue with the global. */
-		status = ocfs_find_space_from_local(osb, bitswanted, 
-						    cluster_off, cluster_count,
-						    handle);
-		/* If we've run out of space for our local alloc, lets
-		 * try the global one just in case... */
-		if (status == -ENOSPC)
-			use_global = 1;
-		else if (status < 0) {
-			LOG_ERROR_STATUS(status);
-			goto bail;
-		}
-		atomic_inc(&osb->alloc_stats.local_data);
-	}
-
-	if (use_global) {
-		status = ocfs_find_contiguous_space_from_bitmap(osb, handle, 
-								file_size,
-								cluster_off, 
-								cluster_count, 
-								sysfile, NULL,
-								NULL);
-		if (!status)
-			atomic_inc(&osb->alloc_stats.bitmap_data);
-
-	}
-
-	if (status < 0)
-		LOG_ERROR_STATUS(status);
-
-	LOG_TRACE_ARGS("Returning *cluster_off = %u, *cluster_count"
-		       "= %u\n", *cluster_off, *cluster_count);
-bail:
-	if (local_alloc_inode)
-		iput(local_alloc_inode);
-
-	LOG_EXIT_STATUS(status);
-	return(status);
-} /* ocfs_find_space */
-
-/* 
- * ocfs_load_local_alloc 
- */
-int ocfs_load_local_alloc(ocfs_super *osb)
-{
-	int status = 0;
-	ocfs2_dinode *alloc = NULL;
-	struct buffer_head *alloc_bh = NULL;
-	__u32 num_used;
-	struct inode *inode = NULL;
-
-	LOG_ENTRY();
-
-	/* we don't enable local alloc on cluster sizes >= 128k */
-	if (osb->s_clustersize > OCFS_LOCAL_ALLOC_MAX_CSIZE)
-		goto bail;
-
-	/* read the alloc off disk */
-	inode = ocfs_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE, osb->node_num);
-	if (!inode) {
-		LOG_ERROR_STATUS(status=-EINVAL);
-		goto bail;
-	}
-	status = ocfs_read_bh(osb,
-			      OCFS_I(inode)->ip_blkno << inode->i_sb->s_blocksize_bits,
-			      &alloc_bh, 0, inode);
-	if (status < 0) {
-		LOG_ERROR_STATUS(status);
-		goto bail;
-	}
-
-	alloc = (ocfs2_dinode *) alloc_bh->b_data;
-
-	/* do a little verification. */
-	num_used = ocfs_alloc_count_bits(alloc);
-
-	/* nowadays the local alloc has always been recovered before
-	 * we load it so there should be no bits used from the main
-	 * bitmap. */
-	if (num_used
-	    || LOCAL_ALLOC(alloc)->la_bits_set
-	    || LOCAL_ALLOC(alloc)->la_bm_bits 
-	    || LOCAL_ALLOC(alloc)->la_bm_off) {
-		LOG_ERROR_ARGS("Local alloc hasn't been recovered!\n"
-			       "found = %u, set = %u, taken = %u, off = %u\n",
-			       num_used,
-			       LOCAL_ALLOC(alloc)->la_bits_set, 
-			       LOCAL_ALLOC(alloc)->la_bm_bits,
-			       LOCAL_ALLOC(alloc)->la_bm_off);
-		status = -EFAIL;
-		goto bail;
-	}
-
-	if (!LOCAL_ALLOC(alloc)->la_size || 
-	    (LOCAL_ALLOC(alloc)->la_size > ocfs2_local_alloc_size(inode->i_sb))) {
-		LOG_ERROR_ARGS("Local alloc size is invalid (la_size = %u)\n",
-			      LOCAL_ALLOC(alloc)->la_size);
-		status = -EINVAL;
-		goto bail;
-	}
-
-	osb->local_alloc_bh = alloc_bh;
-	osb->have_local_alloc = 1;
-
-bail:
-	if (status < 0)
-		if (alloc_bh)
-			brelse(alloc_bh);
-	if (inode)
-		iput(inode);
-
-	LOG_EXIT_STATUS(status);
-	return(status);
-} /* ocfs_load_local_alloc */
-
-/* 
- * ocfs_shutdown_local_alloc
- *
- * return any unused bits to the bitmap and write out a clean
- * local_alloc. 
- *
- * local_alloc_bh is optional. If not passed, we will simply use the
- * one off osb. If you do pass it however, be warned that it *will* be
- * returned brelse'd and NULL'd out.*/
-void ocfs_shutdown_local_alloc(ocfs_super *osb)
-{
-	int status;
-	ocfs2_dinode *alloc = NULL;
-	ocfs_bitmap_free_head *f = NULL;
-	struct buffer_head *bh = NULL;
-	ocfs_journal_handle *handle = NULL;
-
-	LOG_ENTRY();
-
-	if (!osb->have_local_alloc)
-		goto bail;
-
-	bh = osb->local_alloc_bh;
-
-	status = ocfs_sync_local_from_shutdown(osb, &f, bh, 0);
-	if (status < 0)
-		LOG_ERROR_STATUS(status);
-
-	handle = ocfs_start_trans(osb, NULL, 1);
-	if (!handle) {
-		LOG_ERROR_STATUS(-ENOMEM);
-		goto bail;
-	}
-	ocfs_handle_set_always_commits(handle, 1);
-
-	status = ocfs_journal_access(handle, bh, OCFS_JOURNAL_ACCESS_WRITE);
-	if (status < 0) {
-		LOG_ERROR_STATUS(status);
-		goto bail;
-	}
-
-	alloc = (ocfs2_dinode *) bh->b_data;
-	ocfs_clear_local_alloc(alloc);
-
-	status = ocfs_journal_dirty(handle, bh);
-	if (status < 0) {
-		LOG_ERROR_STATUS(status);
-		goto bail;
-	}
-
-	brelse(bh);
-
-	ocfs_commit_trans(handle);
-	handle = NULL;
-
-	osb->local_alloc_bh = NULL;
-	osb->have_local_alloc = 0;
-
-	if (f)
-		ocfs_process_bitmap_free_head(osb, f);
-
-bail:
-	if (handle)
-		ocfs_commit_trans(handle);
-
-	if (f)
-		ocfs_free_bitmap_free_head(f);
-
-	LOG_EXIT();
-	return;
-} /* ocfs_shutdown_local_alloc */
-
-/*
- * ocfs_recover_local_alloc
- *
- * We want to free the bitmap bits outside of any recovery context, so
- * it's allocated and passed back for you.
- */
-int ocfs_recover_local_alloc(ocfs_super *osb, 
-			     int node_num, 
-			     ocfs_bitmap_free_head **bits_to_free)
-{
-	int status = 0;
-	struct buffer_head *alloc_bh = NULL;
-	struct inode *inode = NULL;
-	ocfs2_dinode *alloc;
-
-	LOG_ENTRY_ARGS("(node_num = %d)\n", node_num);
-
-	inode = ocfs_get_system_file_inode(osb, 
-					   LOCAL_ALLOC_SYSTEM_INODE, 
-					   node_num);
-	if (!inode) {
-		LOG_ERROR_STATUS(status=-EINVAL);
-		goto bail;
-	}
-
-	status = ocfs_read_bh(osb,
-			      OCFS_I(inode)->ip_blkno << inode->i_sb->s_blocksize_bits,
-			      &alloc_bh, 
-			      0, inode);
-	if (status < 0) {
-		LOG_ERROR_STATUS(status);
-		goto bail;
-	}
-
-	status = ocfs_sync_local_from_shutdown(osb, 
-					       bits_to_free, 
-					       alloc_bh, 
-					       1);
-	if (status < 0) {
-		LOG_ERROR_STATUS(status);
-		goto bail;
-	}
-
-	alloc = (ocfs2_dinode *) alloc_bh->b_data;
-	ocfs_clear_local_alloc(alloc);
-
-	status = ocfs_write_bh(osb, alloc_bh, inode);
-	if (status < 0)
-		LOG_ERROR_STATUS(status);
-
-bail:
-	if (alloc_bh)
-		brelse(alloc_bh);
-
-	if (inode)
-		iput(inode);
-
-	LOG_EXIT_STATUS(status);
-	return(status);
-} /* ocfs_recover_local_alloc */

Modified: trunk/src/alloc.h
===================================================================
--- trunk/src/alloc.h	2004-10-02 01:01:24 UTC (rev 1542)
+++ trunk/src/alloc.h	2004-10-02 01:08:08 UTC (rev 1543)
@@ -30,20 +30,15 @@
 #define OCFS2_ALLOC_H
 
 int ocfs_add_to_bitmap_free_head(ocfs_super *osb,
-				 ocfs_bitmap_free_head *f,
-				 __u32 len, __u32 fileoff,
-				 __u32 nodenum, __u32 type);
+				 ocfs_bitmap_free_head *f, 
+				 u32 len, u32 fileoff,
+				 u32 nodenum, u64 blkno, u32 type);
+struct _ocfs2_alloc_context;
 int ocfs_allocate_extent(ocfs_super *osb, struct buffer_head *fe_bh,
 			 ocfs_journal_handle *handle,
 			 u64 blkno, u32 new_clusters,
-			 struct inode *inode);
-int ocfs_alloc_node_block(ocfs_super *osb, __u64 FileSize,
-			  __u64 *DiskOffset, __u64 *file_off,
-			  __u32 NodeNum, __u32 Type,
-			  ocfs_journal_handle *handle);
-int ocfs_find_space(ocfs_super *osb, __u64 file_size,
-		    u32 *cluster_off, u32 *cluster_count, int sysfile,
-		    ocfs_journal_handle *handle);
+			 struct inode *inode, 
+			 struct _ocfs2_alloc_context *meta_ac);
 int ocfs_free_extents_for_truncate(ocfs_super *osb,
 				   ocfs2_dinode *fe,
 				   ocfs_journal_handle *handle,
@@ -67,4 +62,56 @@
 			     ocfs_bitmap_free_head **bits_to_free);
 void ocfs_shutdown_local_alloc(ocfs_super *osb);
 
+typedef struct _ocfs2_alloc_context {
+	struct inode *ac_inode;    /* which bitmap are we allocating from? */
+	struct buffer_head *ac_bh; /* file entry bh */
+	u32    ac_bits_wanted;
+	u32    ac_bits_given;
+#define OCFS_AC_USE_LOCAL 1
+#define OCFS_AC_USE_MAIN  2
+#define OCFS_AC_USE_INODE 3
+#define OCFS_AC_USE_META  4
+	u32    ac_which;
+	ocfs_journal_handle *ac_handle; /* debugging mostly. */
+} ocfs2_alloc_context;
+
+void ocfs_free_alloc_context(ocfs2_alloc_context *ac);
+static inline int ocfs_alloc_context_bits_left(ocfs2_alloc_context *ac)
+{
+	return(ac->ac_bits_wanted - ac->ac_bits_given);
+}
+int ocfs_reserve_bits(ocfs_super *osb, 
+		      ocfs_journal_handle *handle,
+		      u32 bits_wanted,
+		      ocfs2_alloc_context **ac);
+int ocfs_claim_bits(ocfs_super *osb, 
+		    ocfs_journal_handle *handle, 
+		    ocfs2_alloc_context *ac,
+		    u32 min_bits,
+		    u32 *bit_off,
+		    u32 *num_bits);
+int ocfs_num_free_extents(ocfs_super *osb, 
+			  struct inode *inode,
+			  ocfs2_dinode *fe);
+/* how many new metadata chunks would an allocation need at maximum? */
+static inline int ocfs2_extend_meta_needed(ocfs2_dinode *fe)
+{
+	/*
+	 * Rather than do all the work of determining how much we need
+	 * (involves a ton of reads and locks), just ask for the
+	 * maximal limit.  That's a tree depth shift.  So, one block for
+	 * each level of the tree (current l_tree_depth), one block for the
+	 * new tree_depth==0 extent_block, and one block at the new
+	 * top of the tree.
+	 */
+	return(fe->id2.i_list.l_tree_depth + 2);
+}
+
+/* This is for local alloc ONLY. Others should use the generic apis
+ * above. */
+int ocfs_reserve_main_bitmap_bits(ocfs_super *osb, 
+				  ocfs_journal_handle *handle,
+				  u32 bits_wanted,
+				  ocfs2_alloc_context *ac);
+
 #endif /* OCFS2_ALLOC_H */

Modified: trunk/src/aops.c
===================================================================
--- trunk/src/aops.c	2004-10-02 01:01:24 UTC (rev 1542)
+++ trunk/src/aops.c	2004-10-02 01:08:08 UTC (rev 1543)
@@ -539,8 +539,7 @@
 	   blocks for the file. */
 	if (create && vbo_max > OCFS_I(inode)->ip_alloc_size) {
 		/* WARNING: How much do we really want to extend the file? */
-		status = ocfs_extend_file(osb, vbo_max,
-					  NULL, inode, NULL, 0, NULL);
+		status = ocfs_extend_file(osb, inode, vbo_max);
 		if (status < 0) {
 			status = -ENOSPC;
 			LOG_ERROR_STR("ocfs_direct_IO_get_blocks: failed to extend the file!");

Modified: trunk/src/bitmap.c
===================================================================
--- trunk/src/bitmap.c	2004-10-02 01:01:24 UTC (rev 1542)
+++ trunk/src/bitmap.c	2004-10-02 01:08:08 UTC (rev 1543)
@@ -165,27 +165,42 @@
 /*
  * ocfs_find_clear_bits()
  *
- * sysonly is passed # bits in bitmap that are rserved for system file space
- * in case we have a disk full.
+ * if you don't pass in best_fit_bits, we only look for a contiguous run
+ * of bits_wanted bits and give you -ENOSPC if there isn't one.
  *
+ * if you pass in best_fit_bits we return -ENOSPC only if we're
+ * completely full, otherwise we'll always set best_fit_bits to at max
+ * bits_wanted, and of course *bitoff to your starting offset.
  */
 int ocfs_find_clear_bits(ocfs_super *osb, ocfs_alloc_bm * bitmap,
-			 __u32 numBits, __u32 offset, __u32 sysonly)
+			 u32 bits_wanted, u32 *bitoff,
+			 u32 *best_fit_bits)
 {
+	int status = 0;
 	__u32 globalsize, globaloff, localstart, lastbh;
 	__u32 size = OCFS_BITS_IN_CHUNK(osb->sb);
-	__u32 bitoff = 0, count = 0;
+	__u32 count = 0;
+	u32 largest_start, largest_size;
 	void *buffer;
 	int c;
 	struct buffer_head *currbh = NULL;
 
-	LOG_ENTRY_ARGS ("(0x%p, 0x%p, %u, %u, %u)\n", osb, bitmap, numBits,
-			offset, sysonly);
+	LOG_ENTRY_ARGS ("(0x%p, 0x%p, %u)\n", osb, bitmap, bits_wanted);
 
-	globalsize = bitmap->validbits - sysonly;
+	/* We keep track of the biggest contiguous chunk we've seen so
+	 * far in these two variables. If we never completely fulfill
+	 * the request, we can pass them back for a best try. */
+	largest_start = 0;
+	largest_size = 0;
+
+	if (best_fit_bits)
+		*best_fit_bits = 0;
+
+	*bitoff = 0;
+	globalsize = bitmap->validbits;
 	lastbh = ocfs_blocks_for_bits(osb->sb, globalsize) - 1;
 
-	globaloff = offset;
+	globaloff = 0;
 	ocfs_bitmap_block_for_off(osb->sb, globaloff, &c, &localstart);
 
 	if (lastbh == 0)
@@ -199,11 +214,11 @@
 	  "localstart=%u\n", c, lastbh, size, localstart);*/
 	buffer = currbh->b_data;
 
-	while ((bitoff = find_next_zero_bit(buffer,
+	while ((*bitoff = find_next_zero_bit(buffer,
 					    OCFS_BITS_IN_CHUNK(osb->sb),
 					    localstart)) != -1) {
-		/*LOG_TRACE_ARGS("c=%u, globaloff=%u, bitoff=%u, "
-			       "localstart=%u\n", c, globaloff, bitoff, 
+		/*LOG_TRACE_ARGS("c=%u, globaloff=%u, *bitoff=%u, "
+			       "localstart=%u\n", c, globaloff, *bitoff, 
 			       localstart);*/
 
 		/* find_next_zero_bit returns:
@@ -211,10 +226,10 @@
 		   some number < size: at the next zero bit
 		   localstart: if the current one is a zero
 		*/
-		if (bitoff >= size) {
+		if (*bitoff >= size) {
 nextbh:
 			/* we've hit the end of our bh. */
-			/*LOG_TRACE_ARGS("bitoff >= size (%u)\n", bitoff,c);*/
+			/*LOG_TRACE_ARGS("*bitoff >= size (%u)\n", *bitoff,c);*/
 
 			/* if it's the last bh, then quit the loop */
 			if (c == lastbh) {
@@ -224,7 +239,7 @@
 			}
 			/* otherwise, reset localstart and switch bhs
 			 * and continue */
-			localstart = bitoff = 0;
+			localstart = *bitoff = 0;
 			c++;
 			currbh = bitmap->chunk[c];
 			buffer = currbh->b_data;
@@ -236,37 +251,42 @@
 			continue;
 		}
 
-		if (!ocfs_test_allocatable(bitoff, currbh)) {
+		if (!ocfs_test_allocatable(*bitoff, currbh)) {
 			/* We found a zero, but we can't use it as it
 			 * hasn't been put to disk yet! */
 			count = 0;
-			localstart = bitoff + 1;
+			localstart = *bitoff + 1;
 			/* In doing this, we might go over our current bh. */
 			if (localstart >= size)
 				goto nextbh;
 
 			globaloff =
 				ocfs_bitmap_off_for_block(osb->sb, c,
-					 		  bitoff) + 1;
-		} else if (bitoff == localstart) {
-			/*LOG_TRACE_ARGS("bitoff == localstart (%u)\n", 
-			  bitoff);*/
+					 		  *bitoff) + 1;
+		} else if (*bitoff == localstart) {
+			/*LOG_TRACE_ARGS("*bitoff == localstart (%u)\n", 
+			  *bitoff);*/
 			/* cool, we have another zero! */
 			count++;
 			localstart++;
 			globaloff++;
 		} else {
-			/*LOG_TRACE_ARGS("bitoff (%u) != localstart (%u)\n", 
-			  bitoff, localstart);*/
+			/*LOG_TRACE_ARGS("*bitoff (%u) != localstart (%u)\n", 
+			  *bitoff, localstart);*/
 			/* we had to skip over some ones */
 			count = 1;
 			globaloff =
 				ocfs_bitmap_off_for_block(osb->sb, c,
-				 			  bitoff) + 1;
-			localstart = bitoff + 1;
+				 			  *bitoff) + 1;
+			localstart = *bitoff + 1;
 		}
 
-		if (count == numBits) {
+		if (count > largest_size) {
+			largest_size = count;
+			largest_start = globaloff - count;
+		}
+
+		if (count == bits_wanted) {
 			/* we've found everything we wanted. */
 			LOG_TRACE_ARGS("Found it all! (count=%u)\n", count);
 			buffer = NULL;
@@ -274,13 +294,19 @@
 		}
 	}
 
-	if (count == numBits)
-		bitoff = globaloff - count;
-	else
-		bitoff = -1;
+	if (count == bits_wanted) {
+		*bitoff = globaloff - count;
+		if (best_fit_bits)
+			*best_fit_bits = bits_wanted;
+	} else if (best_fit_bits && largest_size) {
+#warning "can we go off the end of the bitmap here?"
+			*best_fit_bits = largest_size;
+			*bitoff = largest_start;
+	} else
+		status = -ENOSPC;
 
-	LOG_EXIT_ULONG ((unsigned long)bitoff);
-	return bitoff;
+	LOG_EXIT_STATUS(status);
+	return(status);
 }				/* ocfs_find_clear_bits */
 
 /*

Modified: trunk/src/bitmap.h
===================================================================
--- trunk/src/bitmap.h	2004-10-02 01:01:24 UTC (rev 1542)
+++ trunk/src/bitmap.h	2004-10-02 01:08:08 UTC (rev 1543)
@@ -33,8 +33,8 @@
 void ocfs_clear_bits(struct super_block *sb,
 		     ocfs_journal_handle *handle, ocfs_alloc_bm *bitmap,
 		     __u32 start, __u32 num);
-int ocfs_find_clear_bits(ocfs_super *osb, ocfs_alloc_bm *bitmap,
-			 __u32 numBits, __u32 offset, __u32 sysonly);
+int ocfs_find_clear_bits(ocfs_super *osb, ocfs_alloc_bm * bitmap,
+			 u32 numBits, u32 *bitoff, u32 *best_fit_bits);
 void ocfs_initialize_bitmap(struct super_block *sb,
 			    ocfs_alloc_bm *bitmap, __u32 validbits,
 			    __u32 allocbits);

Modified: trunk/src/dcache.c
===================================================================
--- trunk/src/dcache.c	2004-10-02 01:01:24 UTC (rev 1542)
+++ trunk/src/dcache.c	2004-10-02 01:08:08 UTC (rev 1543)
@@ -38,9 +38,10 @@
 #include "ocfs_log.h"
 #include "ocfs.h"
 
+#include "alloc.h"
 #include "dcache.h"
+#include "file.h"
 #include "vote.h"
-#include "file.h"
 
 #define OCFS_DEBUG_CONTEXT    OCFS_DEBUG_CONTEXT_DCACHE
 

Modified: trunk/src/dir.c
===================================================================
--- trunk/src/dir.c	2004-10-02 01:01:24 UTC (rev 1542)
+++ trunk/src/dir.c	2004-10-02 01:08:08 UTC (rev 1543)
@@ -49,10 +49,14 @@
 #include "ocfs_log.h"
 #include "ocfs.h"
 
+#include "alloc.h"
 #include "dir.h"
 #include "dlm.h"
+#include "file.h"
 #include "inode.h"
+#include "ocfs_journal.h"
 #include "namei.h"
+#include "suballoc.h"
 #include "util.h"
 
 #include "buffer_head_io.h"
@@ -63,6 +67,10 @@
 	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
 };
 
+static int ocfs_extend_dir(ocfs_super *osb, 
+			   struct inode *dir, 
+			   struct buffer_head *parent_fe_bh,
+			   struct buffer_head **new_de_bh);
 /*
  * ocfs_readdir()
  *
@@ -79,6 +87,7 @@
 	struct super_block * sb = inode->i_sb;
 	int have_disk_lock = 0;
 	ocfs_super *osb = OCFS_SB(sb);
+	int have_sem = 0;
 
 	LOG_SET_CONTEXT(READDIR);
 
@@ -87,11 +96,6 @@
 	stored = 0;
 	bh = NULL;
 
-	/* NOTE: We only take a write lock here because of the
-	 * acquire_lock call. We should investigate whether taking a
-	 * read lock is such a bad idea in this case... */
-	down_write(&OCFS_I(inode)->ip_io_sem);
-	
 	error = ocfs_acquire_lock_ro(osb, inode);
 	if (error < 0) {
 		if (error != -EINTR)
@@ -101,12 +105,14 @@
 		goto bail;
 	}
 	have_disk_lock = 1;
+	down_read(&OCFS_I(inode)->ip_io_sem);
+	have_sem = 1;
 
 	offset = filp->f_pos & (sb->s_blocksize - 1);
 
 	while (!error && !stored && filp->f_pos < inode->i_size) {
 		blk = (filp->f_pos) >> sb->s_blocksize_bits;
-		bh = ocfs_bread (NULL, inode, blk, 0, &err, 0);
+		bh = ocfs_bread (inode, blk, &err, 0);
 		if (!bh) {
 			LOG_ERROR_ARGS ("directory #%llu contains a hole at offset %lu\n",
 					OCFS_I(inode)->ip_blkno,
@@ -121,8 +127,9 @@
 		if (!offset) {
 			for (i = 16 >> (sb->s_blocksize_bits - 9), num = 0;
 			     i > 0; i--) {
-				tmp = ocfs_bread (NULL, inode, ++blk, 0, &err, 1);
-				brelse (tmp);
+				tmp = ocfs_bread (inode, ++blk, &err, 1);
+				if (tmp)
+					brelse (tmp);
 			}
 		}
 
@@ -198,6 +205,8 @@
 
 	stored = 0;
 bail:
+	if (have_sem)
+		up_read(&OCFS_I(inode)->ip_io_sem);
 
 	if (have_disk_lock) {
 		error = ocfs_release_lock_ro (osb, inode);
@@ -205,8 +214,6 @@
 			LOG_ERROR_STATUS (error);
 	}
 
-	up_write(&OCFS_I(inode)->ip_io_sem);
-
 	LOG_EXIT_STATUS(stored);
 	LOG_CLEAR_CONTEXT();
 	return stored;
@@ -239,6 +246,7 @@
 				LOG_ERROR_STATUS (status);
 			goto leave;
 		}
+		down_read(&OCFS_I(inode)->ip_io_sem);
 		lock_acq = 1;
 	}
 
@@ -255,6 +263,7 @@
 
 	if (take_lock && lock_acq)
 	{
+		up_read(&OCFS_I(inode)->ip_io_sem);
 		tmpstat = ocfs_release_lock_ro (osb, inode);
 		if (tmpstat < 0) {
 			LOG_ERROR_STATUS (tmpstat);
@@ -278,7 +287,7 @@
 /*
  * routine to check that the specified directory is empty (for rmdir)
  */
-int empty_dir(struct inode *inode)
+int ocfs_empty_dir(struct inode *inode)
 {
 	unsigned long offset;
 	struct buffer_head * bh;
@@ -289,7 +298,7 @@
 	sb = inode->i_sb;
 	if ((inode->i_size <
 	     (OCFS2_DIR_REC_LEN(1) + OCFS2_DIR_REC_LEN(2))) ||
-	    !(bh = ocfs_bread (NULL, inode, 0, 0, &err, 0))) {
+	    !(bh = ocfs_bread (inode, 0, &err, 0))) {
 	    	LOG_ERROR_ARGS ("bad directory (dir #%llu) - no data block\n", 
 				OCFS_I(inode)->ip_blkno);
 		return 1;
@@ -312,8 +321,8 @@
 	while (offset < inode->i_size ) {
 		if (!bh || (void *) de >= (void *) (bh->b_data + sb->s_blocksize)) {
 			brelse (bh);
-			bh = ocfs_bread(NULL, inode,
-				      	offset >> sb->s_blocksize_bits, 0, &err, 0);
+			bh = ocfs_bread(inode,
+				      	offset >> sb->s_blocksize_bits, &err, 0);
 			if (!bh) {
 				LOG_ERROR_ARGS ("directory #%llu contains a hole at offset %lu\n",
 					OCFS_I(inode)->ip_blkno, offset);
@@ -337,3 +346,279 @@
 	brelse (bh);
 	return 1;
 }
+
+/* Returns in *new_bh a bh for the dir block at offset i_size, allocating first if needed. */
+int ocfs_do_extend_dir(struct super_block *sb,
+		       ocfs_journal_handle *handle,
+		       struct inode *dir,
+		       struct buffer_head *parent_fe_bh,
+		       ocfs2_alloc_context *data_ac,
+		       ocfs2_alloc_context *meta_ac,
+		       struct buffer_head **new_bh)
+{
+	int status;
+	s64 vbo, lbo;
+	int extend;	/* set when i_size has reached ip_alloc_size */
+
+	down(&OCFS_I(dir)->ip_sem);
+	extend = (dir->i_size == OCFS_I(dir)->ip_alloc_size);
+	up(&OCFS_I(dir)->ip_sem);
+
+	if (extend) {
+		status = ocfs_extend_allocation(OCFS_SB(sb), dir, 1,	/* 1 cluster */
+						parent_fe_bh, handle,
+						data_ac, meta_ac, NULL);
+		OCFS_ASSERT(status != -EAGAIN);	/* no restart handling in this path */
+		if (status < 0) {
+			LOG_ERROR_STATUS(status);
+			goto bail;
+		}
+	}
+
+	vbo = (s64) dir->i_size;	/* look up the block that starts at the current end */
+	lbo = 0;
+
+	status = ocfs_lookup_file_allocation(OCFS_SB(sb), vbo, &lbo, 
+					     sb->s_blocksize, NULL, dir, 1);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	*new_bh = sb_getblk(sb, lbo >> sb->s_blocksize_bits);	/* lbo shifted down to a block number */
+	if (!*new_bh) {
+		status = -EIO;
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	status = 0;
+bail:
+	LOG_EXIT_STATUS(status);
+	return(status);
+}
+
+/* Append one block (a single empty dirent) to dir; on success *new_de_bh holds
+ * a reference on it. Assumes you already have a cluster lock on the directory. */
+static int ocfs_extend_dir(ocfs_super *osb, struct inode *dir,
+			   struct buffer_head *parent_fe_bh,
+			   struct buffer_head **new_de_bh)
+{
+	int status = 0;
+	ocfs2_dinode *fe = (ocfs2_dinode *) parent_fe_bh->b_data;
+	int credits, num_free_extents;
+	ocfs2_alloc_context *data_ac = NULL;
+	ocfs2_alloc_context *meta_ac = NULL;
+	ocfs_journal_handle *handle = NULL;
+	struct buffer_head *new_bh = NULL;
+	struct ocfs2_dir_entry * de;
+	struct super_block *sb = osb->sb;
+
+	LOG_ENTRY();
+
+	LOG_TRACE_ARGS("extending dir %llu (i_size = %llu)\n",
+		       OCFS_I(dir)->ip_blkno, dir->i_size);
+
+	handle = ocfs_alloc_handle(osb);
+	if (handle == NULL) {
+		LOG_ERROR_STATUS (status = -ENOMEM);
+		goto bail;
+	}
+
+	/* dir->i_size is always block aligned. */
+	down(&OCFS_I(dir)->ip_sem);
+	if (dir->i_size == OCFS_I(dir)->ip_alloc_size) {
+		up(&OCFS_I(dir)->ip_sem);
+		num_free_extents = ocfs_num_free_extents(osb, dir, fe);
+		if (num_free_extents < 0) {
+			status = num_free_extents;
+			LOG_ERROR_STATUS(status);
+			goto bail;
+		}
+
+		if (!num_free_extents) {	/* no free extent slot: reserve metadata too */
+			status = ocfs_reserve_new_metadata(osb, handle,
+							   fe, &meta_ac);
+			if (status < 0) {
+				LOG_ERROR_STATUS (status);
+				goto bail;
+			}
+		}
+
+		status = ocfs_reserve_bits(osb, handle, 1, &data_ac);
+		if (status < 0) {
+			LOG_ERROR_STATUS(status);
+			goto bail;
+		}
+
+		credits = ocfs_calc_extend_credits(sb, 1);
+	} else {
+		up(&OCFS_I(dir)->ip_sem);
+		/* one for the dinode, one for the new block. */
+		credits = 2;
+	}
+
+	handle = ocfs_start_trans(osb, handle, credits);
+	if (handle == NULL) {
+		LOG_ERROR_STATUS(status = -ENOMEM);
+		goto bail;
+	}
+
+	status = ocfs_do_extend_dir(osb->sb, handle, dir, parent_fe_bh,
+				    data_ac, meta_ac, &new_bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	set_buffer_uptodate(new_bh);
+	SET_BH_SEQNUM(dir, new_bh);
+	status = ocfs_journal_access(handle, new_bh,
+				     OCFS_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	memset(new_bh->b_data, 0, sb->s_blocksize);
+	de = (struct ocfs2_dir_entry *) new_bh->b_data;
+	de->inode = 0;	/* one unused entry spanning the whole block */
+	de->rec_len = cpu_to_le16(sb->s_blocksize);	/* CPU -> on-disk (LE) order */
+	status = ocfs_journal_dirty(handle, new_bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	dir->i_size += dir->i_sb->s_blocksize;
+	dir->i_blocks += 1;
+	status = ocfs_mark_inode_dirty(handle, dir, parent_fe_bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	*new_de_bh = new_bh;
+	get_bh(*new_de_bh);	/* the reference handed to the caller */
+bail:
+	if (handle) {
+		if (status < 0)
+			ocfs_abort_trans(handle);
+		else
+			ocfs_commit_trans(handle);
+	}
+	if (data_ac)
+		ocfs_free_alloc_context(data_ac);
+	if (meta_ac)
+		ocfs_free_alloc_context(meta_ac);
+
+	if (new_bh)	/* always drop the sb_getblk ref, else it leaks on success */
+		brelse(new_bh);
+
+	LOG_EXIT_STATUS (status);
+	return status;
+}				/* ocfs_extend_dir */
+
+/*
+ * Search the dir for a slot that can hold a namelen-byte name, extending it
+ * if none fits. The block containing the slot is returned, with a reference
+ * held, in ret_de_bh; -EEXIST if name is already present. */
+int ocfs_prepare_dir_for_insert(ocfs_super *osb, 
+				struct inode *dir,
+				struct buffer_head *parent_fe_bh,
+				const char *name, 
+				int namelen,
+				struct buffer_head **ret_de_bh)
+{
+	unsigned long offset;
+	struct buffer_head * bh = NULL;
+	unsigned short rec_len;
+	ocfs2_dinode *fe;
+	struct ocfs2_dir_entry * de;
+	struct super_block * sb;
+	int status;
+
+	LOG_ENTRY();
+
+	LOG_TRACE_ARGS("getting ready to insert namelen %d into dir %llu\n", 
+		       namelen, OCFS_I(dir)->ip_blkno);
+
+	OCFS_ASSERT(S_ISDIR(dir->i_mode));
+	fe = (ocfs2_dinode *) parent_fe_bh->b_data;
+	OCFS_ASSERT(fe->i_size == dir->i_size);
+
+	sb = dir->i_sb;
+
+	if (!namelen) {
+		status = -EINVAL;
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	bh = ocfs_bread (dir, 0, &status, 0);
+	if (!bh) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	rec_len = OCFS2_DIR_REC_LEN(namelen);	/* space the new entry needs */
+	offset = 0;
+	de = (struct ocfs2_dir_entry *) bh->b_data;
+	while (1) {
+		if ((char *)de >= sb->s_blocksize + bh->b_data) {	/* walked past this block */
+			brelse (bh);
+			bh = NULL;
+
+			if (dir->i_size <= offset) {	/* no blocks left: grow the dir */
+				status = ocfs_extend_dir(osb, 
+							 dir, 
+							 parent_fe_bh,
+							 &bh);
+				if (status < 0) {
+					LOG_ERROR_STATUS(status);
+					goto bail;
+				}
+				OCFS_ASSERT(bh);
+				*ret_de_bh = bh;
+				get_bh(*ret_de_bh);	/* caller's ref; ours is dropped at bail */
+				goto bail;
+			}
+			bh = ocfs_bread (dir, 
+					 offset >> sb->s_blocksize_bits, 
+					 &status, 
+					 0);
+			if (!bh) {
+				LOG_ERROR_STATUS(status);
+				goto bail;
+			}
+			/* move to next block */
+			de = (struct ocfs2_dir_entry *) bh->b_data;
+		}
+		if (!ocfs_check_dir_entry (dir, de, bh, offset)) {
+			status = -ENOENT;
+			goto bail;
+		}
+		if (ocfs_match (namelen, name, de)) {
+			status = -EEXIST;
+			goto bail;
+		}
+		if (((le64_to_cpu(de->inode) == 0) &&
+		     (le16_to_cpu(de->rec_len) >= rec_len)) ||	/* entry with inode 0 that is big enough */
+		    (le16_to_cpu(de->rec_len) >=
+		     (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) {	/* or enough slack after this entry's name */
+			/* Ok, we found a spot. Return this bh and let
+			 * the caller actually fill it in. */
+			*ret_de_bh = bh;
+			get_bh(*ret_de_bh);	/* caller's ref; ours is dropped at bail */
+			status = 0;
+			goto bail;
+		}
+		offset += le16_to_cpu(de->rec_len);
+		de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
+	}
+
+	status = 0;	/* not reached: the loop above only exits via goto bail */
+bail:
+	if (bh)
+		brelse(bh);
+	LOG_EXIT_STATUS(status);
+	return(status);
+}

Modified: trunk/src/dir.h
===================================================================
--- trunk/src/dir.h	2004-10-02 01:01:24 UTC (rev 1542)
+++ trunk/src/dir.h	2004-10-02 01:08:08 UTC (rev 1543)
@@ -29,12 +29,24 @@
 #ifndef OCFS2_DIR_H
 #define OCFS2_DIR_H
 
-int empty_dir(struct inode *inode);  /* FIXME: to namei.c */
+int ocfs_empty_dir(struct inode *inode);  /* FIXME: to namei.c */
 int ocfs_find_files_on_disk(ocfs_super *osb, const char *name,
 			    int namelen, u64 *blkno,
 			    struct inode *inode, int take_lock,
 			    struct buffer_head **dirent_bh,
 			    struct ocfs2_dir_entry **dirent);
 int ocfs_readdir(struct file *filp, void *dirent, filldir_t filldir);
-
+int ocfs_prepare_dir_for_insert(ocfs_super *osb, 
+				struct inode *dir,
+				struct buffer_head *parent_fe_bh,
+				const char *name, 
+				int namelen,
+				struct buffer_head **ret_de_bh);
+int ocfs_do_extend_dir(struct super_block *sb,
+		       ocfs_journal_handle *handle,
+		       struct inode *dir,
+		       struct buffer_head *parent_fe_bh,
+		       ocfs2_alloc_context *data_ac,
+		       ocfs2_alloc_context *meta_ac,
+		       struct buffer_head **new_bh);
 #endif /* OCFS2_DIR_H */

Modified: trunk/src/dlm.c
===================================================================
--- trunk/src/dlm.c	2004-10-02 01:01:24 UTC (rev 1542)
+++ trunk/src/dlm.c	2004-10-02 01:08:08 UTC (rev 1543)
@@ -218,6 +218,7 @@
 	OCFS_ASSERT(lock_type != OCFS_LKM_NLMODE);
 	OCFS_ASSERT(inode);
 	OCFS_ASSERT(bh);
+	OCFS_ASSERT(!journal_current_handle());
 
 	lock_id = OCFS_I(inode)->ip_blkno;
 	LOG_TRACE_ARGS("lock_id = %llu\n", lock_id);
@@ -232,23 +233,31 @@
 	}
 
 	updated = 0;
+again:
+	/* yay, lock ordering. at least we don't hold io sem across
+	 * the whole thing now. */
+	down_read(&OCFS_I(inode)->ip_io_sem);
 	ocfs_acquire_lockres_write (inode);
 
-again:
 	LOG_TRACE_ARGS("attempting to get lock, pass: %d\n", ++k);
 
+	/* if updated = 1 then we've read a valid bh so skip the
+	 * update_lockres if we can trust it. */
+	if (updated && (lockres->master_node_num != osb->node_num))
+		updated = 0;
+
 	if (!updated) {
 		status = ocfs_update_lockres(osb, *bh, inode, 1);
 		if (status < 0) {
+			up_read(&OCFS_I(inode)->ip_io_sem);
 			ocfs_release_lockres_write (inode);
 			LOG_ERROR_STATUS (status);
 			goto finally;
 		}
+		updated = 1;
 	}
+	up_read(&OCFS_I(inode)->ip_io_sem);
 
-	/* alright, if we own it then no more updates are necessary. */
-	if (lockres->master_node_num == osb->node_num)
-		updated = 1;
 reevaluate:
 	no_owner = (lockres->master_node_num == OCFS_INVALID_NODE_NUM);
 
@@ -388,11 +397,6 @@
 				LOG_ERROR_ARGS("Timed out acquiring lock for inode "
 					       "%llu, retrying...\n", OCFS_I(inode)->ip_blkno);
 			ocfs_sleep (50);
-			ocfs_acquire_lockres_write(inode);
-			/* if we're going to jump back up, we want to update
-			 * if we're not the master... */
-			if (lockres->master_node_num != osb->node_num)
-				updated = 0;
 			goto again;
 		}
 		goto finally;

Modified: trunk/src/file.c
===================================================================
--- trunk/src/file.c	2004-10-02 01:01:24 UTC (rev 1542)
+++ trunk/src/file.c	2004-10-02 01:08:08 UTC (rev 1543)
@@ -46,6 +46,7 @@
 #include "sysfile.h"
 #include "inode.h"
 #include "ioctl.h"
+#include "suballoc.h"
 #include "util.h"
 
 #include "ocfs_journal.h"
@@ -55,6 +56,11 @@
 
 static int ocfs2_zero_extend(struct inode *inode);
 
+static unsigned int ocfs_calc_overalloc_bits(ocfs_super *osb, 
+					     struct file *filp,
+					     ocfs2_dinode *fe,
+					     u64 new_size);
+
 static void ocfs_fe_set_attributes(ocfs2_dinode *fe, struct iattr *attr)
 {
 	if (attr->ia_valid & ATTR_SIZE)
@@ -403,7 +409,7 @@
 				   struct inode *inode)
 {
 	int status = 0;
-	ocfs2_dinode *fileEntry = NULL;
+	ocfs2_dinode *fe = NULL;
 	struct buffer_head *bh = NULL;
 	ocfs_journal_handle *handle = NULL;
 
@@ -427,9 +433,9 @@
 			LOG_ERROR_STATUS (status);
 		goto leave;
 	}
-
 	ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, FLAG_FILE_UPDATE_OIN, 
 			     inode);
+	ocfs_handle_add_inode(handle, inode);
 
 	/* Start a transaction - need a minimal amount of block credits (1) */
 	handle = ocfs_start_trans(osb, handle, 1);
@@ -437,10 +443,11 @@
 		LOG_ERROR_STATUS(status);
 		goto leave;
 	}
+	ocfs_handle_set_always_commits(handle, 1);
 
-	fileEntry = (ocfs2_dinode *) bh->b_data;
+	fe = (ocfs2_dinode *) bh->b_data;
 
-	if (!IS_VALID_FILE_ENTRY(fileEntry)) {
+	if (!IS_VALID_FILE_ENTRY(fe)) {
 		LOG_ERROR_ARGS("Invalid fe at blkno %llu",
 			       OCFS_I(inode)->ip_blkno);
 		status = -EFAIL;
@@ -453,11 +460,11 @@
 		goto leave;
 	}
 
-	fileEntry = (ocfs2_dinode *) bh->b_data;
+	fe = (ocfs2_dinode *) bh->b_data;
 
-	fileEntry->i_mtime = OCFS_CURRENT_TIME;
+	fe->i_mtime = OCFS_CURRENT_TIME;
 
-	ocfs_fe_set_attributes(fileEntry, attr);
+	ocfs_fe_set_attributes(fe, attr);
 
 	status = ocfs_journal_dirty(handle, bh);
 	if (status < 0) {
@@ -466,14 +473,9 @@
 	}
 
 leave:
+	if (handle)
+		ocfs_commit_trans(handle);
 
-	if (handle) {
-		if (status < 0)
-			ocfs_abort_trans(handle);
-		else
-			ocfs_commit_trans(handle);
-	}
-
 	if (bh != NULL)
 		brelse(bh);
 
@@ -626,10 +628,7 @@
 		LOG_TRACE_ARGS
 		    ("Writing at EOF, will need more allocation: have=%llu, "
 		     "need=%llu\n", OCFS_I(inode)->ip_alloc_size, newsize);
-		down_write(&OCFS_I(inode)->ip_io_sem);
-		status = ocfs_extend_file(osb, newsize,
-					  NULL, inode, NULL, 0, NULL);
-		up_write(&OCFS_I(inode)->ip_io_sem);
+		status = ocfs_extend_file(osb, inode, newsize);
 		if (status < 0) {
 			if (status != -EINTR && status != -ENOSPC) {
 				LOG_ERROR_STATUS (status);
@@ -922,6 +921,8 @@
 	/* if we updated correctly then we can update the alloc_size */
 	OCFS_I(inode)->ip_alloc_size = new_alloc_size;
 	OCFS_I(inode)->ip_mmu_private = fe->i_size;
+	ocfs_extent_map_destroy(&OCFS_I(inode)->ip_ext_map);
+	ocfs_extent_map_init (&OCFS_I(inode)->ip_ext_map);
 	up (&(OCFS_I(inode)->ip_sem));
 
 leave:
@@ -980,264 +981,348 @@
 	return res;
 }
 
-
-/* ocfs_extend_file()
- *
+/*
+ * extend allocation only here.
+ * we'll update all the disk stuff, and oip->alloc_size
+ * 
+ * expect stuff to be locked, a transaction started and enough data /
+ * metadata reservations in the contexts. I'll return -EAGAIN, if we
+ * run out of transaction credits, so the caller can restart us.
  */
-int ocfs_extend_file(ocfs_super *osb, __u64 file_size,
-		     ocfs_journal_handle *passed_handle,
-		     struct inode *inode, struct iattr *attr,
-		     int system_file, struct buffer_head *fe_bh)
+int ocfs_extend_allocation(ocfs_super *osb, 
+			   struct inode *inode, 
+			   u32 clusters_to_add, 
+			   struct buffer_head *fe_bh,
+			   ocfs_journal_handle *handle, 
+			   ocfs2_alloc_context *data_ac,
+			   ocfs2_alloc_context *meta_ac,
+			   enum ocfs2_alloc_restarted *reason)
 {
 	int status = 0;
-	ocfs2_dinode *fe;
-	__u64 tempOffset = 0;
-	__u64 current_alloc;
-	__u64 alloc_size;
-	u32 bitmapOffset = 0;
-	u32 numClustersAlloc = 0;
-	u64 block_off;
-	u64 num_blocks;
-	struct buffer_head *bh = NULL;
-	ocfs_journal_handle *handle = NULL;
-	int credits;
-	struct inode *ext_alloc_inode = NULL;
+	int credits_needed, free_extents, multi_pass;
+	ocfs2_dinode *fe = (ocfs2_dinode *) fe_bh->b_data;
+	u32 bit_off, num_bits;
+	u64 block;
 
-	LOG_ENTRY_ARGS("(file_size=%llu, system=%s)\n",
-		       file_size, system_file?"yes":"no");
+	OCFS_ASSERT(clusters_to_add);
 
-	if (!inode)
-		BUG();
-
-#ifdef PURE_EVIL
-	if (evil_filename_check(EVIL_INODE, inode)) {
-		LOG_ERROR_ARGS("EVIL EXTEND: file_size=%llu, oldsize=%llu\n",
-			       file_size, inode->i_size);
+	multi_pass = 0;
+again:
+	free_extents = ocfs_num_free_extents(osb, inode, fe);
+	if (free_extents < 0) {
+		status = free_extents;
+		LOG_ERROR_STATUS(status);
+		goto leave;
 	}
-#endif
 
-	if (file_size == 0)
+	/* there are two cases which could cause us to EAGAIN in the
+	 * we-need-more-metadata case: 
+	 * 1) we haven't reserved *any*
+	 * 2) we are so fragmented, we've needed to add metadata too 
+	 *    many times. */
+	if (!free_extents && !meta_ac) {
+		LOG_TRACE_STR("we haven't reserved any metadata!");
+		status = -EAGAIN;
+		if (reason)
+			*reason = RESTART_META;
 		goto leave;
+	} else if ((!free_extents)
+		   && (ocfs_alloc_context_bits_left(meta_ac)
+		       < ocfs2_extend_meta_needed(fe))) {
+		LOG_TRACE_STR("filesystem is really fragmented...");
+		status = -EAGAIN;
+		if (reason)
+			*reason = RESTART_META;
+		goto leave;
+	}
 
-	if (passed_handle == NULL) {
-		handle = ocfs_alloc_handle(osb);
-		if (handle == NULL) {
-			LOG_ERROR_STATUS(status = -ENOMEM);
-			goto leave;
-		}
+	/* do we have enough credits for another single extend, of
+	 * what's left? */
+	/* fe + main bitmap fe + main bitmap bits */
+	if (!multi_pass)
+		credits_needed = 1 + 1 + 
+			ocfs_blocks_for_bits(osb->sb, clusters_to_add);
+	else /* if we've already extended once, then we've already reserved. */
+		credits_needed = ocfs_blocks_for_bits(osb->sb, clusters_to_add);
+	if (!free_extents) {
+		/* will need to extend the file: 
+		 * metadata suballoc fe + metadata suballoc bitmap 
+		 * + actual metadata blocks. */
+		credits_needed += 1 + 1 + ocfs2_extend_meta_needed(fe);
+	}
+	multi_pass = 1;
 
-		status = ocfs_acquire_lock (osb, OCFS_LKM_EXMODE, 
-					    FLAG_FILE_EXTEND, &bh, inode);
-		if (status < 0) {
-			if (status != -EINTR)
-				LOG_ERROR_STATUS (status);
-			goto leave;
-		}
-		ocfs_handle_add_lock(handle, 
-				     OCFS_LKM_EXMODE,
-				     FLAG_FILE_EXTEND|FLAG_FILE_UPDATE_OIN,
-				     inode);
-	} else {
-		handle = passed_handle;
-		/* fe_bh is optional if you already have a transaction open. */
-		if (fe_bh)
-			bh = fe_bh;
-		else {
-			status = ocfs_read_bh(osb,
-					      OCFS_I(inode)->ip_blkno << inode->i_sb->s_blocksize_bits,
-					      &bh, OCFS_BH_CACHED, inode);
-			if (status < 0) {
-				LOG_ERROR_STATUS (status);
-				goto leave;
-			}
-		}
+	if (credits_needed > (handle->max_buffs - handle->num_buffs)) {
+		LOG_TRACE_ARGS("Not enough credits for this extend: need %u, "
+			       "have %u\n", credits_needed, 
+			       (handle->max_buffs - handle->num_buffs));
+		status = -EAGAIN;
+		if (reason)
+			*reason = RESTART_TRANS;
+		goto leave;
 	}
 
-	fe = (ocfs2_dinode *) bh->b_data;
-	OCFS_ASSERT(IS_VALID_FILE_ENTRY(fe));
+	status = ocfs_claim_bits(osb, handle, data_ac, 1, &bit_off, &num_bits);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto leave;
+	}
 
-	current_alloc = (u64)fe->i_clusters << osb->s_clustersize_bits;
-	alloc_size = file_size - current_alloc;
-	LOG_TRACE_ARGS("current_alloc=%llu, alloc_size=%llu\n",
-		       current_alloc, alloc_size);
+	OCFS_ASSERT(num_bits <= clusters_to_add);
 
-	if (passed_handle == NULL) {
-		credits = ocfs_calc_extend_credits(osb->sb, 
-						   (__u32) alloc_size); 
+	/* reserve our write early -- allocate_extent may update the inode */
+	status = ocfs_journal_access(handle, fe_bh, OCFS_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto leave;
+	}
 
-		handle = ocfs_start_trans(osb, handle, credits);
-		if (handle == NULL) {
-			LOG_ERROR_STATUS(status = -ENOMEM);
-			goto leave;
-		}
+	block = ocfs_clusters_to_blocks(osb->sb, bit_off);
+	LOG_TRACE_ARGS("Allocating %u clusters at block %u for inode %llu\n",
+		       num_bits, bit_off, OCFS_I(inode)->ip_blkno);
+	status = ocfs_allocate_extent(osb, fe_bh, handle, block, num_bits, 
+				      inode, meta_ac);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto leave;
 	}
 
-	status = ocfs_journal_access(handle, bh, OCFS_JOURNAL_ACCESS_WRITE);
+	fe->i_clusters += num_bits;
+	down (&(OCFS_I(inode)->ip_sem));
+	OCFS_I(inode)->ip_alloc_size =
+		(u64)fe->i_clusters << osb->s_clustersize_bits;
+	up (&(OCFS_I(inode)->ip_sem));
+
+	status = ocfs_journal_dirty(handle, fe_bh);
 	if (status < 0) {
 		LOG_ERROR_STATUS(status);
 		goto leave;
 	}
 
-	if (file_size <= (__s64)current_alloc)
-		goto no_alloc;
+	clusters_to_add -= num_bits;
 
-	alloc_size = file_size - current_alloc;
+	if (clusters_to_add) {
+		LOG_TRACE_ARGS("need to alloc once more, clusters = %u, "
+			       "wanted = %u\n", fe->i_clusters, 
+			       clusters_to_add);
+		goto again;
+	}
 
-	/* TODO: We can add something here so that after 2-3 allocations, 
-	 * we give a lot more disk space to the file than the alloc_size so 
-	 * in order to try to use the Extents of File Entry only and ofcourse 
-	 * the file will have more contigous disk space. */
+leave:
+	LOG_EXIT_STATUS(status);
+	return(status);
+}
 
-	if (!system_file) {
-		int one_percentish_bits = 7;
-		__u64 tempSize = current_alloc;
+static unsigned int ocfs_calc_overalloc_bits(ocfs_super *osb, 
+					     struct file *filp,
+					     ocfs2_dinode *fe,
+					     u64 new_size)
+{
+#warning "finish this"
+	/* Stub: no parameter is used yet and 0 extra bits are returned.
+	 * TODO: keep a small history of allocs on the filp and calculate
+	 * a reasonable overalloc based on that data here. */
+	return(0);
+}
 
-		if (tempSize > ONE_MEGA_BYTE)
-			tempSize = ONE_MEGA_BYTE;
-		alloc_size += (tempSize * 2);
+/* ocfs_extend_file()
+ *
+ * Ok, this function is heavy on the goto's - we need to clean it up a
+ * bit.
+ */
+int ocfs_extend_file(ocfs_super *osb, 
+		     struct inode *inode,
+		     u64 new_i_size)
+{
+	int status = 0;
+	int restart_func = 0;
+	int skip_overalloc = 0;
+	int credits, num_free_extents;
+	unsigned int overalloc_bits = 0;
+	u32 clusters_to_add;
+	struct buffer_head *bh = NULL;
+	ocfs2_dinode *fe;
+	ocfs_journal_handle *handle = NULL;
+	ocfs2_alloc_context *data_ac = NULL;
+	ocfs2_alloc_context *meta_ac = NULL;
+	enum ocfs2_alloc_restarted why;
 
-		if (alloc_size <
-		    (current_alloc >> one_percentish_bits)) {
-			alloc_size = current_alloc >> one_percentish_bits;
-			tempSize = alloc_size;
-			// avoid using 64 bit mod
-			while (tempSize > (10*ONE_MEGA_BYTE))
-				tempSize -= (10*ONE_MEGA_BYTE);
-			tempSize = (10*ONE_MEGA_BYTE) - tempSize;
-			alloc_size += tempSize;
-		}
+	LOG_ENTRY_ARGS("(new_i_size=%llu)\n", new_i_size);
+
+	/* setattr sometimes calls us like this. */
+	if (new_i_size == 0)
+		goto leave;
+
+restart_all:
+	handle = ocfs_alloc_handle(osb);
+	if (handle == NULL) {
+		LOG_ERROR_STATUS(status = -ENOMEM);
+		goto leave;
 	}
 
-	status = ocfs_find_space(osb, alloc_size, &bitmapOffset,
-				 &numClustersAlloc, system_file, handle);
-	LOG_TRACE_ARGS("find_space: alloc_size=%llu, returned off=%u"
-		       ", num=%u\n", alloc_size, bitmapOffset, 
-		       numClustersAlloc);
+	status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE, FLAG_FILE_EXTEND, 
+				    &bh, inode);
 	if (status < 0) {
-		if (status != -ENOSPC && status != -EINTR)
+		if (status != -EINTR)
 			LOG_ERROR_STATUS (status);
 		goto leave;
 	}
+	ocfs_handle_add_lock(handle, 
+			     OCFS_LKM_EXMODE,
+			     FLAG_FILE_EXTEND|FLAG_FILE_UPDATE_OIN,
+			     inode);
+	ocfs_handle_add_inode(handle, inode);
 
-	block_off = ocfs_clusters_to_blocks(osb->sb,
-					    bitmapOffset);
-	num_blocks = ocfs_clusters_to_blocks(osb->sb,
-					     numClustersAlloc);
+	fe = (ocfs2_dinode *) bh->b_data;
+	OCFS_ASSERT(IS_VALID_FILE_ENTRY(fe));
+	OCFS_ASSERT(new_i_size >= fe->i_size);
 
-#define	OCFS_MAX_ZERO_BLOCKS (4096)
+	if (fe->i_size == new_i_size) {
+		OCFS_ASSERT(inode->i_size == new_i_size);
+		goto leave;
+	}
 
-	if (system_file) {
-		struct buffer_head **bhs = NULL;
-		int i;
-		sector_t block;
-		u64 this_last;
+	clusters_to_add = ocfs_clusters_for_bytes(osb->sb, new_i_size) 
+		- fe->i_clusters;
 
-//		printk("Extending system inode %llu (from %u, adding %u)\n", fe->i_blkno, fe->i_clusters, numClustersAlloc);
-		LOG_TRACE_ARGS("zeroing %llu blocks from offset %llu\n", 
-			       num_blocks, block_off);
-		bhs = kmalloc(OCFS_MAX_ZERO_BLOCKS * sizeof(struct buffer_head *),
-			      GFP_KERNEL);
-		if (!bhs) {
-			status = -ENOMEM;
-			LOG_ERROR_STATUS(status);
-			goto leave;
-		}
-		memset(bhs, 0, OCFS_MAX_ZERO_BLOCKS * 
-		       sizeof(struct buffer_head *));
+	LOG_TRACE_ARGS("extend inode %llu, new_i_size = %llu, i_size = %llu, "
+		       "fe->i_clusters = %u, clusters_to_add = %u\n", 
+		       OCFS_I(inode)->ip_blkno, new_i_size, inode->i_size, 
+		       fe->i_clusters, clusters_to_add);
 
-		block = block_off;
-		while (block < (block_off + num_blocks)) {
-			this_last = block + OCFS_MAX_ZERO_BLOCKS;
-			if (this_last > (block_off + num_blocks))
-				this_last = block_off + num_blocks;
+	if (!clusters_to_add) 
+		goto do_start_trans;
 
-			//LOG_TRACE_ARGS("block = %llu, this_last = %llu\n",
-			//	       (unsigned long long) block, this_last);
-			i = 0;
-			while (block < this_last) {
-				bhs[i] = sb_getblk(osb->sb, block);
-				if (!bhs[i]) {
-					status = -ENOMEM;
-					LOG_ERROR_STATUS(status);
-					break;
-				}
-				memset(bhs[i]->b_data, 0, 
-				       osb->sb->s_blocksize);
-				set_buffer_uptodate(bhs[i]);
-				i++;
-				block++;
-			}
-			if (status)
-				break;
+	overalloc_bits = 0;
+	if (!skip_overalloc) {
+		overalloc_bits = ocfs_calc_overalloc_bits(osb, 
+							  NULL, 
+							  fe, 
+							  new_i_size);
+		clusters_to_add += overalloc_bits;
+		skip_overalloc = 1;
+	}
 
-			//LOG_TRACE_ARGS("writing %d blocks\n", i);
-			status = ocfs_write_bhs(osb, bhs, i, inode);
-			if (status) {
-				LOG_ERROR_STATUS(status);
-				break;
-			}
-			for (i = 0; i < OCFS_MAX_ZERO_BLOCKS; i++)
-				if (bhs[i])
-				    brelse(bhs[i]);
-			memset(bhs, 0, OCFS_MAX_ZERO_BLOCKS * 
-			       sizeof(struct buffer_head *));
+	num_free_extents = ocfs_num_free_extents(osb, 
+						 inode, 
+						 fe);
+	if (num_free_extents < 0) {
+		status = num_free_extents;
+		LOG_ERROR_STATUS(status);
+		goto leave;
+	}
+
+	if (!num_free_extents) {
+		status = ocfs_reserve_new_metadata(osb, 
+						   handle, 
+						   fe, 
+						   &meta_ac);
+		if (status < 0) {
+			LOG_ERROR_STATUS (status);
+			goto leave;
 		}
+	}
 
-		for (i = 0; i < OCFS_MAX_ZERO_BLOCKS; i++)
-			if (bhs[i])
-				brelse(bhs[i]);
-		kfree(bhs);
+	status = ocfs_reserve_bits(osb, 
+				   handle, 
+				   clusters_to_add,
+				   &data_ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			LOG_ERROR_STATUS(status);
+		goto leave;
 	}
 
-	ext_alloc_inode = ocfs_get_system_file_inode(osb, EXTENT_ALLOC_BITMAP_SYSTEM_INODE, osb->node_num);
-	if (!ext_alloc_inode) {
-		status = -EFAIL;
-		LOG_ERROR_STATUS(status);
+do_start_trans:
+	credits = ocfs_calc_extend_credits(osb->sb, clusters_to_add);
+	handle = ocfs_start_trans(osb, handle, credits);
+	if (handle == NULL) {
+		LOG_ERROR_STATUS(status = -ENOMEM);
 		goto leave;
 	}
 
-	ocfs_handle_add_inode(handle, ext_alloc_inode);
-	status = ocfs_allocate_extent(osb, bh, handle, block_off,
-				      numClustersAlloc, inode);
+restarted_transaction:
+	/* reserve a write to the file entry early on - that way if we
+	 * run out of credits in the allocation path, we can still
+	 * update i_size. */
+	status = ocfs_journal_access(handle, bh, OCFS_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
-		LOG_ERROR_STATUS (status);
+		LOG_ERROR_STATUS(status);
 		goto leave;
 	}
 
-	/* update the total allocation size here */
-	fe->i_clusters += numClustersAlloc;
+	if (!clusters_to_add)
+		goto no_alloc;
 
-	down (&(OCFS_I(inode)->ip_sem));
-	OCFS_I(inode)->ip_alloc_size =
-		(u64)fe->i_clusters << osb->s_clustersize_bits;
-	up (&(OCFS_I(inode)->ip_sem));
+	status = ocfs_extend_allocation(osb, 
+					inode, 
+					clusters_to_add,
+					bh,
+					handle,
+					data_ac,
+					meta_ac,
+					&why);
+	if ((status < 0) && (status != -EAGAIN)) {
+		LOG_ERROR_STATUS(status);
+		goto leave;
+	}
 
-	/* no need to do OCFS_SECTOR_ALIGN once the allocation size is
-	 * correct. */
+	if (status == -EAGAIN 
+	    && (new_i_size > 
+		(fe->i_clusters << osb->s_clustersize_bits))) {
+
+		if (why == RESTART_META) {
+			LOG_TRACE_ARGS("restarting function.\n");
+			restart_func = 1;
+		} else {
+			OCFS_ASSERT(why == RESTART_TRANS);
+
+			/* update i_size in case we crash after the
+			 * extend_trans */
+			fe->i_size = (u64) (fe->i_clusters << osb->s_clustersize_bits);
+			fe->i_mtime = OCFS_CURRENT_TIME;
+
+			status = ocfs_journal_dirty(handle, bh);
+			if (status < 0) {
+				LOG_ERROR_STATUS (status);
+				goto leave;
+			}
+
+			clusters_to_add = 
+				ocfs_clusters_for_bytes(osb->sb, new_i_size)
+				- fe->i_clusters + overalloc_bits;
+			LOG_TRACE_ARGS("restarting transaction.\n");
+			/* TODO: This can be more intelligent. */
+			credits = ocfs_calc_extend_credits(osb->sb, 
+							   clusters_to_add);
+			status = ocfs_extend_trans(handle, credits);
+			if (status < 0) {
+				/* handle still has to be committed /
+				 * aborted at this point. */
+				LOG_ERROR_STATUS(status = -ENOMEM);
+				goto leave;
+			}
+			goto restarted_transaction;
+		}
+	}
+	status = 0;
+
 no_alloc:
-	/* Update tha file size and add the new one to old one. */
-	fe->i_size = file_size;
+	/* this may not be the end of our allocation so only update
+	 * i_size to what's appropriate. */
+	if (new_i_size > (fe->i_clusters << osb->s_clustersize_bits))
+		fe->i_size = fe->i_clusters << osb->s_clustersize_bits;
+	else
+		fe->i_size = new_i_size;
+#warning "is there a reason why we don't update i_blocks here?"
 	LOG_TRACE_ARGS("fe: i_clusters = %u, i_size=%llu\n", 
 		       fe->i_clusters, fe->i_size);
 
-	/* NOTE: this is a bit of a hack; unlike regular files, 
-	 * system files do not have another opportunity to update
-	 * the inode/i_private fields */
-	if (system_file) {
-		OCFS_I(inode)->ip_alloc_size = (u64)fe->i_clusters << osb->s_clustersize_bits;
-		inode->i_size = fe->i_size;
-		inode->i_blocks = (inode->i_size + osb->sb->s_blocksize - 1) >> osb->sb->s_blocksize_bits;
-	}
 	LOG_TRACE_ARGS("inode: ip_alloc_size=%llu, i_size=%llu\n",
 		       OCFS_I(inode)->ip_alloc_size, inode->i_size);
 
-	if (attr)
-		ocfs_fe_set_attributes(fe, attr);
-
 	fe->i_mtime = OCFS_CURRENT_TIME;
 
-	tempOffset = fe->i_blkno << osb->sb->s_blocksize_bits;
-
 	status = ocfs_journal_dirty(handle, bh);
 	if (status < 0) {
 		LOG_ERROR_STATUS (status);
@@ -1245,22 +1330,29 @@
 	}
 
 leave:
-	if ((passed_handle == NULL) && handle) {
+	if (handle) {
 		if (status < 0)
 			ocfs_abort_trans(handle);
 		else 
 			ocfs_commit_trans(handle);
+		handle = NULL;
 	}
-
-	if (bh != NULL)
-		LOG_TRACE_ARGS("bh->b_count = %d\n", 
-			       atomic_read(&(bh->b_count)));
-	if (bh != NULL && fe_bh == NULL)
+	if (data_ac) {
+		ocfs_free_alloc_context(data_ac);
+		data_ac = NULL;
+	}
+	if (meta_ac) {
+		ocfs_free_alloc_context(meta_ac);
+		meta_ac = NULL;
+	}
+	if (bh) {
 		brelse(bh);
-
-	if (ext_alloc_inode)
-		iput(ext_alloc_inode);
-
+		bh = NULL;
+	}
+	if ((!status) && restart_func) {
+		restart_func = 0;
+		goto restart_all;
+	}
 	LOG_EXIT_STATUS (status);
 	return status;
 }				/* ocfs_extend_file */
@@ -1277,7 +1369,6 @@
 	int status;
 	ocfs_super *osb = NULL;
 	struct super_block *sb = inode->i_sb;
-	int extended = 0;
 
 	LOG_SET_CONTEXT(SETATTR);
 
@@ -1292,8 +1383,6 @@
 	}
 #endif
 
-	down_write(&OCFS_I(inode)->ip_io_sem);
-
 	if (!dentry->d_parent || !dentry->d_parent->d_inode) {
 		LOG_ERROR_STR ("bad inode or root inode");
 		goto bail;
@@ -1327,7 +1416,9 @@
 	if (attr->ia_valid & ATTR_SIZE) {
 		if (atomic_read(&OCFS_I(inode)->ip_needs_verification)) {
 			LOG_TRACE_STR ("OIN_NEEDS_VERIFICATION");
+			down_read(&OCFS_I(inode)->ip_io_sem);
 			status = ocfs_verify_update_inode (osb, inode);
+			up_read(&OCFS_I(inode)->ip_io_sem);
 			if (status < 0) {
 				LOG_ERROR_STATUS (status);
 				LOG_TRACE_STR ("TODO: disable volume");
@@ -1340,12 +1431,9 @@
 			ocfs_truncate_inode_pages(inode, newsize);
 			status = ocfs_truncate_file(osb, newsize, 
 						    inode);
+		} else {
+			status = ocfs_extend_file(osb, inode, newsize);
 		}
-		else {
-			status = ocfs_extend_file(osb, newsize, NULL, 
-						  inode, attr, 0, NULL);
-			extended = 1;
-		}
 		if (status < 0) {
 			if (status != -EINTR)
 				LOG_ERROR_STATUS (status);
@@ -1354,10 +1442,6 @@
 		}
 
 		down (&(OCFS_I(inode)->ip_sem));
-		if (inode->i_size > newsize) {
-                        ocfs_extent_map_destroy(&OCFS_I(inode)->ip_ext_map);
-                        ocfs_extent_map_init (&OCFS_I(inode)->ip_ext_map);
-		}
 		inode->i_size = newsize;
 		inode->i_blocks = (newsize + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
 		if (OCFS_I(inode)->ip_open_flags &
@@ -1366,9 +1450,7 @@
 			OCFS_I(inode)->ip_mmu_private = inode->i_size;
 		}
 		up (&(OCFS_I(inode)->ip_sem));
-		up_write(&OCFS_I(inode)->ip_io_sem);
 		status = ocfs2_zero_extend(inode);
-		down_write(&OCFS_I(inode)->ip_io_sem);
 		if (status < 0) {
 			LOG_ERROR_STATUS(status);
 			goto bail;
@@ -1376,20 +1458,16 @@
 	}
 
 	status = -EFAIL;
-	if (!extended) {
-		status = ocfs_change_file_attrib(osb, attr, inode);
-		if (status < 0) {
-			if (status != -EINTR)
-				LOG_ERROR_STATUS (status);
-			error = -EIO;
-			goto bail;
-		}
+	status = ocfs_change_file_attrib(osb, attr, inode);
+	if (status < 0) {
+		if (status != -EINTR)
+			LOG_ERROR_STATUS (status);
+		error = -EIO;
+		goto bail;
 	}
 	error = inode_setattr (inode, attr);
 
 bail:
-	up_write(&OCFS_I(inode)->ip_io_sem);
-
 	LOG_EXIT_INT (error);
 
 	LOG_CLEAR_CONTEXT();

Modified: trunk/src/file.h
===================================================================
--- trunk/src/file.h	2004-10-02 01:01:24 UTC (rev 1542)
+++ trunk/src/file.h	2004-10-02 01:08:08 UTC (rev 1543)
@@ -33,10 +33,21 @@
 extern struct file_operations ocfs_dops;
 extern struct inode_operations ocfs_file_iops;
 
-int ocfs_extend_file(ocfs_super * osb, __u64 file_size,
-		     ocfs_journal_handle *passed_handle, 
-		     struct inode *inode, struct iattr *attr, 
-		     int system_file, struct buffer_head *fe_bh);
+enum ocfs2_alloc_restarted {
+	RESTART_TRANS = 0,
+	RESTART_META
+};
+int ocfs_extend_allocation(ocfs_super *osb, 
+			   struct inode *inode, 
+			   u32 clusters_to_add, 
+			   struct buffer_head *fe_bh,
+			   ocfs_journal_handle *handle, 
+			   ocfs2_alloc_context *data_ac,
+			   ocfs2_alloc_context *meta_ac,
+			   enum ocfs2_alloc_restarted *reason);
+int ocfs_extend_file(ocfs_super *osb, 
+		     struct inode *inode,
+		     u64 new_i_size);
 int ocfs_inode_fill_ext_map(ocfs_super *osb, struct buffer_head *fe_bh,
 			    struct inode *inode);
 int ocfs_setattr(struct dentry *dentry, struct iattr *attr);

Modified: trunk/src/inode.c
===================================================================
--- trunk/src/inode.c	2004-10-02 01:01:24 UTC (rev 1542)
+++ trunk/src/inode.c	2004-10-02 01:08:08 UTC (rev 1543)
@@ -47,6 +47,7 @@
 #include "inode.h"
 #include "lockres.h"
 #include "namei.h"
+#include "suballoc.h"
 #include "super.h"
 #include "symlink.h"
 #include "sysfile.h"
@@ -537,10 +538,12 @@
 void ocfs_delete_inode(struct inode *inode)
 {
 	struct inode *orphan_dir_inode = NULL;
+	struct inode *inode_alloc_inode = NULL;
 	ocfs_journal_handle *handle = NULL;
 	ocfs_super *osb = OCFS_SB(inode->i_sb);
 	int status = 0;
 	struct buffer_head *orphan_dir_bh = NULL;
+	struct buffer_head *inode_alloc_bh = NULL;
 	struct buffer_head *fe_bh = NULL;
 	ocfs2_dinode *fe;
 
@@ -550,18 +553,18 @@
 
 	if (OCFS_I(inode)->ip_flags & OCFS_INODE_SYSTEM_FILE) {
 		LOG_TRACE_STR("Skipping system file delete.");
-		goto clear_inode;
+		goto bail;
 	}
 
 	if (inode == osb->root_inode) {
 		LOG_TRACE_STR("Skipping root inode delete.");
-		goto clear_inode;
+		goto bail;
 	}
 
 	if (OCFS_I(inode)->ip_flags & OCFS_INODE_SKIP_DELETE) {
 		LOG_TRACE_ARGS("Skipping delete of %lu because another node "
 			       "has done this for us.\n", inode->i_ino);
-		goto clear_inode;
+		goto bail;
 	}
 
 	/* If we're coming from process_vote we can't go into our own
@@ -572,17 +575,9 @@
 	if (osb->voting_ino == inode->i_ino) {
 		LOG_TRACE_ARGS("Skipping delete of %lu because we're currently"
 			       "in process_vote\n", inode->i_ino);
-		goto clear_inode;
+		goto bail;
 	}
 
-	orphan_dir_inode = ocfs_get_system_file_inode(osb, 
-						      ORPHAN_DIR_SYSTEM_INODE, 
-						      -1);
-	if (!orphan_dir_inode) {
-		LOG_ERROR_STATUS(-EFAIL);
-		goto clear_inode;
-	}
-
 	/* acquire_lock and friends will igrab / iput this guy, so we
 	 * take an extra ref. to avoid recursive calls to
 	 * delete_inode. */
@@ -597,7 +592,7 @@
 		 * about deleting it. */
 		if (status != -EBUSY)
 			LOG_ERROR_STATUS(status);
-		goto clear_inode;
+		goto bail;
 	}
 
 	fe = (ocfs2_dinode *) fe_bh->b_data;
@@ -605,51 +600,84 @@
 		/* for lack of a better error? */
 		status = -EEXIST;
 		LOG_ERROR_STATUS(status);
-		goto clear_inode;
+		goto bail;
 	}
 
 	/* has someone already deleted us?! baaad... */
 	if (fe->i_dtime) {
 		status = -EEXIST;
 		LOG_ERROR_STATUS(status);
-		goto clear_inode;
+		goto bail;
 	}
 
 	if (fe->i_links_count) {
 		status = -EBUSY;
 		LOG_ERROR_STATUS(status);
-		goto clear_inode;
+		goto bail;
 	}
 
 	/* Oop, lets be carefull of lock / trans ordering here... */
-	handle = ocfs_start_trans(osb, NULL, OCFS_FILE_DELETE_CREDITS);
+	handle = ocfs_alloc_handle(osb);
 	if (handle == NULL) {
-		unlock_kernel();
-		LOG_ERROR_STATUS(-ENOMEM);
-		goto clear_inode;
+		status = -ENOMEM;
+		LOG_ERROR_STATUS(status);
+		goto bail;
 	}
 
-	ocfs_handle_add_inode(handle, orphan_dir_inode);
-
-	lock_kernel();
-
+	orphan_dir_inode = ocfs_get_system_file_inode(osb, 
+						      ORPHAN_DIR_SYSTEM_INODE, 
+						      -1);
+	if (!orphan_dir_inode) {
+		status = -EEXIST;
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
 	status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE, 0,
 				   &orphan_dir_bh, orphan_dir_inode);
 	if (status < 0) {
 		LOG_ERROR_STATUS(status);
-		goto bail_locked;
+		goto bail;
 	}
 	ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 0,
 			     orphan_dir_inode);
+	ocfs_handle_add_inode(handle, orphan_dir_inode);
 
+	inode_alloc_inode = ocfs_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, fe->i_suballoc_node);
+	if (!inode_alloc_inode) {
+		status = -EEXIST;
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE, 0,
+				   &inode_alloc_bh, inode_alloc_inode);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 0,
+			     inode_alloc_inode);
+	ocfs_handle_add_inode(handle, inode_alloc_inode);
+
+	handle = ocfs_start_trans(osb, handle, OCFS_FILE_DELETE_CREDITS);
+	if (handle == NULL) {
+		status = -ENOMEM;
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	ocfs_handle_set_always_commits(handle, 1);
+
 	status = ocfs_orphan_del(osb, handle, orphan_dir_inode, inode, 
 				 orphan_dir_bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
 
 	/* set the inodes dtime */
 	status = ocfs_journal_access(handle, fe_bh, OCFS_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		LOG_ERROR_STATUS(status);
-		goto bail_locked;
+		goto bail;
 	}
 
 	fe->i_dtime = OCFS_CURRENT_TIME;
@@ -658,38 +686,35 @@
 	status = ocfs_journal_dirty(handle, fe_bh);
 	if (status < 0) {
 		LOG_ERROR_STATUS(status);
-		goto bail_locked;
+		goto bail;
 	}
 
-	/* actually delete the data and the inode */
-	status = ocfs_free_file_extents(osb, fe_bh, handle, inode);
+	status = ocfs_free_suballoc_bits(osb, handle, inode_alloc_inode,
+					 inode_alloc_bh, fe->i_suballoc_bit,
+					 fe->i_blkno, 1);
 	if (status < 0) {
 		LOG_ERROR_STATUS(status);
-		goto bail_locked;
+		goto bail;
 	}
 
-	status = ocfs_handle_add_commit_bits(handle, 1,
-					     fe->i_suballoc_blkno << osb->sb->s_blocksize_bits,
-					     fe->i_suballoc_node,
-					     DISK_ALLOC_INODE);
+	/* actually delete the data and the inode */
+	status = ocfs_free_file_extents(osb, fe_bh, handle, inode);
 	if (status < 0)
 		LOG_ERROR_STATUS(status);
 
-bail_locked:
-	if (handle && (status == 0))
+bail:
+	if (handle)
 		ocfs_commit_trans(handle);
-	else if (handle)
-		ocfs_abort_trans(handle);
-
-	unlock_kernel();
-
-clear_inode:
 	if (orphan_dir_bh)
 		brelse(orphan_dir_bh);
+	if (inode_alloc_bh)
+		brelse(inode_alloc_bh);
 	if (fe_bh)
 		brelse(fe_bh);
 	if (orphan_dir_inode)
 		iput(orphan_dir_inode);
+	if (inode_alloc_inode)
+		iput(inode_alloc_inode);
 
 	/* we must clear inode. */
 	clear_inode(inode);
@@ -733,7 +758,7 @@
 	/* blkno == 0 if this inode is newly created and hasn't been
 	 * filled in yet. */
 	if (OCFS_I(inode)->ip_blkno == 0) {
-		LOG_ERROR_STR("uhm, blkno = 0!");
+		LOG_TRACE_STR("uhm, blkno = 0!");
 		goto bail;
 	}
 
@@ -767,11 +792,11 @@
  * stuff in ocfs_get_block (that is, ocfs_get_block pretty much
  * expects never to extend).
  */
-struct buffer_head *ocfs_bread(ocfs_journal_handle *handle, struct inode * inode, 
-			       int block, int create, int *err, int reada)
+struct buffer_head *ocfs_bread(struct inode * inode, 
+			       int block, int *err, int reada)
 {
 	struct buffer_head * bh = NULL;
-	int fatal = 0, tmperr, new = 0;
+	int tmperr;
 	ocfs_super *osb;
 	__s64 vbo, lbo;
 	int readflags = OCFS_BH_CACHED;
@@ -779,79 +804,32 @@
 	osb = OCFS_SB(inode->i_sb);
 	vbo = (__s64) block << inode->i_sb->s_blocksize_bits;
 
-	OCFS_ASSERT(!create || handle);
-
 #warning only turn this on if we know we can deal with read_bh returning nothing
 #if 0
 	if (reada)
 		readflags |= OCFS_BH_READAHEAD;
 #endif
 
-	if (vbo >= inode->i_size) {
-		if (!create) {
-			*err = -ENOSPC;
-			return NULL;
-		}
-		new = 1;
-	}
+	OCFS_ASSERT((vbo < inode->i_size) || reada);
+	if (vbo >= inode->i_size)
+		return(NULL);
 
-	/* ???: do we need ip_sem?  should have i_sem i think */
-	if (vbo >= OCFS_I(inode)->ip_alloc_size) {
-		int vbo_pad;
-		
-		vbo_pad = inode->i_sb->s_blocksize;
-		vbo_pad -= vbo & (s64)(inode->i_sb->s_blocksize - 1);
-
-		*err = ocfs_extend_file(osb, 
-					vbo + vbo_pad, 
-				        handle, inode, NULL, 0, NULL);
-		if (*err < 0) {
-			*err = -ENOSPC;
-			return NULL;
-		}
-		/*
-		 * fe->i_size will be vbo + padding to blocksize here,
-		 * and i_blocks will be whatever is actually allocated.
-		 * i_size will be changed by caller (ocfs_add_entry) if
-		 * we return !NULL.
-		 */
-	}
-
 	/* do we need extend sem?  no extend dlm message for dirs */
 	/*
 	 * UGLY: last argument to lookup_file_allocation() (locked) is
 	 * forced to '1' here, even though we don't have the lock.  This
 	 * is to force fast, unlocked operation.  Get A Real DLM.
 	 */
-	tmperr = ocfs_lookup_file_allocation(osb, vbo, &lbo, osb->sb->s_blocksize, NULL,
+	tmperr = ocfs_lookup_file_allocation(osb, vbo, &lbo, 
+					     osb->sb->s_blocksize, NULL,
 					     inode, 1);
 	if (tmperr < 0)
 		goto fail;
 
-	if (new) {
-		bh = sb_getblk(osb->sb, lbo >> osb->sb->s_blocksize_bits);
-		if (!bh) {
-			tmperr = -EIO;
-			goto fail;
-		}
-		set_buffer_uptodate(bh);
-		SET_BH_SEQNUM(inode, bh);
+	tmperr = ocfs_read_bh(osb, lbo, &bh, readflags, inode);
+	if (tmperr < 0)
+		goto fail;
 
-		fatal = ocfs_journal_access(handle, bh,
-					   OCFS_JOURNAL_ACCESS_CREATE);
-		if (fatal)
-			goto fail;
-
-		memset(bh->b_data, 0, osb->sb->s_blocksize);
-		fatal = ocfs_journal_dirty(handle, bh);	
-		if (fatal)
-			goto fail;
-	} else {
-		tmperr = ocfs_read_bh(osb, lbo, &bh, readflags, inode);
-		if (tmperr < 0)
-			goto fail;
-	}
-
 	tmperr = 0;
 
 	*err = 0;
@@ -936,6 +914,76 @@
 }				/* ocfs_inode_revalidate */
 
 /*
+ * ocfs_mark_inode_dirty
+ *
+ * Updates a disk inode (the ocfs2_dinode in bh->b_data) from a
+ * struct inode and journals the change on handle.
+ *
+ * Returns 0 on success, or the negative status reported by
+ * ocfs_journal_access() / ocfs_journal_dirty().
+ *
+ * Only takes ip_sem.
+ */
+int ocfs_mark_inode_dirty(ocfs_journal_handle *handle, 
+			  struct inode *inode, 
+			  struct buffer_head *bh)
+{
+	int status;
+	ocfs2_dinode *fe = (ocfs2_dinode *) bh->b_data;
+	ocfs_super *osb = OCFS_SB(inode->i_sb);
+
+#warning "need to check the casts and the endian-ness in this function"
+	LOG_ENTRY_ARGS("(inode %llu)\n", OCFS_I(inode)->ip_blkno);
+
+	status = ocfs_journal_access(handle, bh, OCFS_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto leave;
+	}
+
+	down(&OCFS_I(inode)->ip_sem);
+	/* inodes flagged OCFS2_BITMAP_FL also get their bit counts copied */
+	if (le32_to_cpu(fe->i_flags) & OCFS2_BITMAP_FL) {
+		fe->id1.bitmap1.i_used = 
+			cpu_to_le32(OCFS_I(inode)->u.ip_bitinfo.used_bits);
+		fe->id1.bitmap1.i_total = 
+			cpu_to_le32(OCFS_I(inode)->u.ip_bitinfo.total_bits);
+	}
+
+	/* Shift the full-width allocation size down to clusters *before*
+	 * narrowing to u32: a cast binds tighter than >>, so casting
+	 * first would drop the high bits of ip_alloc_size and store a
+	 * bogus cluster count for allocations of 4GB or more. */
+	fe->i_clusters = 
+		(u32)(OCFS_I(inode)->ip_alloc_size >> osb->s_clustersize_bits);
+	up(&OCFS_I(inode)->ip_sem);
+
+	fe->i_size = (u64)inode->i_size;
+	fe->i_links_count = inode->i_nlink;
+	fe->i_uid = inode->i_uid;
+	fe->i_gid = inode->i_gid;
+	fe->i_mode = inode->i_mode;
+	fe->i_atime = ocfs_get_seconds(inode->i_atime);
+	fe->i_ctime = ocfs_get_seconds(inode->i_ctime);
+	fe->i_mtime = ocfs_get_seconds(inode->i_mtime);
+#warning "do we want to update these here?"
+//	fe->i_dtime = ocfs_get_seconds(inode->i_dtime);
+//	fe->i_generation = inode->i_generation;
+
+	status = ocfs_journal_dirty(handle, bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto leave;
+	}
+
+	status = 0;
+leave:
+
+	LOG_EXIT_STATUS(status);
+	return(status);
+}				/* ocfs_mark_inode_dirty */
+
+/*
  * ocfs_refresh_inode
  * 
  * Updates a struct inode from a disk inode.

Modified: trunk/src/inode.h
===================================================================
--- trunk/src/inode.h	2004-10-02 01:01:24 UTC (rev 1542)
+++ trunk/src/inode.h	2004-10-02 01:08:08 UTC (rev 1543)
@@ -29,9 +29,8 @@
 #ifndef OCFS2_INODE_H
 #define OCFS2_INODE_H
 
-struct buffer_head *ocfs_bread(ocfs_journal_handle *handle,
-			       struct inode * inode, int block,
-			       int create, int *err, int reada);
+struct buffer_head *ocfs_bread(struct inode * inode, int block,
+			       int *err, int reada);
 void ocfs_clear_inode(struct inode *inode);
 void ocfs_delete_inode(struct inode *inode);
 struct inode *ocfs_iget(ocfs_super *osb, __u64 feoff);
@@ -47,7 +46,10 @@
 void ocfs_sync_blockdev(struct super_block *sb);
 int ocfs_verify_update_inode(ocfs_super *osb, struct inode *inode);
 int ocfs_refresh_inode(struct inode *inode, 
-				ocfs2_dinode *fe);
+		       ocfs2_dinode *fe);
+int ocfs_mark_inode_dirty(ocfs_journal_handle *handle, 
+			  struct inode *inode, 
+			  struct buffer_head *bh);
 #ifdef AIO_ENABLED
 int ocfs_kvec_read(struct file *file, kvec_cb_t cb, size_t size, loff_t pos); 
 int ocfs_kvec_write(struct file *file, kvec_cb_t cb, size_t size, loff_t pos); 

Modified: trunk/src/journal.c
===================================================================
--- trunk/src/journal.c	2004-10-02 01:01:24 UTC (rev 1542)
+++ trunk/src/journal.c	2004-10-02 01:08:08 UTC (rev 1543)
@@ -41,6 +41,7 @@
 #include "extmap.h"
 #include "inode.h"
 #include "journal.h"
+#include "localalloc.h"
 #include "lockres.h"
 #include "namei.h"
 #include "nm.h"
@@ -426,6 +427,7 @@
 		BUG();
 
 	OCFS_ASSERT(osb->journal->state != OCFS_JOURNAL_FREE);
+	OCFS_ASSERT(max_buffs > 0);
 
 	/* JBD might support this, but our journalling code doesn't yet. */
 	if (journal_current_handle()) {
@@ -492,29 +494,15 @@
 {
 	OCFS_ASSERT(handle);
 	OCFS_ASSERT(inode);
-	OCFS_ASSERT((handle->flags & OCFS_HANDLE_STARTED));
 
-	if (OCFS_I(inode)->ip_handle == handle) {
-		/* sanity check */
-		if (list_empty(&OCFS_I(inode)->ip_handle_list))
-			BUG();
-
-		/* I think this can happen to the main bitmap inode if
-		 * we extend a regular file and also have to extend a
-		 * system file in the same transaction */
-		LOG_TRACE_ARGS("Inode %lu already added to transaction!\n",
-			       inode->i_ino);
-		return;
-	}
-
 	atomic_inc(&inode->i_count);
 
 	/* we're obviously changing it... */
 	down_write(&OCFS_I(inode)->ip_io_sem);
 
 	/* sanity check */
-	if (OCFS_I(inode)->ip_handle)
-		BUG();
+	OCFS_ASSERT(!OCFS_I(inode)->ip_handle);
+	OCFS_ASSERT(list_empty(&OCFS_I(inode)->ip_handle_list));
 
 	OCFS_I(inode)->ip_handle = handle;
 	list_del(&(OCFS_I(inode)->ip_handle_list));
@@ -534,8 +522,7 @@
 		inode = ip->ip_inode;
 
 		OCFS_I(inode)->ip_handle = NULL;
-		list_del(&OCFS_I(inode)->ip_handle_list);
-		INIT_LIST_HEAD(&OCFS_I(inode)->ip_handle_list);
+		list_del_init(&OCFS_I(inode)->ip_handle_list);
 
 		up_write(&OCFS_I(inode)->ip_io_sem);
 		iput(inode);
@@ -571,9 +558,10 @@
 	OCFS_ASSERT(!handle->num_co);
 	OCFS_ASSERT(!handle->num_buffs);
 
-	osb = handle->osb;
+	ocfs_handle_unlock_inodes(handle);
 	/* You are allowed to add journal locks before the transaction
 	 * has started. */
+	osb = handle->osb;
 	ocfs_handle_move_locks(osb->journal, handle);
 	spin_lock(&osb->journal->cmt_lock);
 	osb->needs_flush = 1;
@@ -639,9 +627,11 @@
 
 	handle->k_handle = NULL; /* it's been free'd in journal_stop */
 
-	for(i = 0; i < handle->num_buffs; i++) {
-		brelse(handle->buffs[i]);
-		handle->buffs[i] = NULL;
+	if (!(handle->flags & OCFS_HANDLE_ALWAYS_COMMITS)) {
+		for(i = 0; i < handle->num_buffs; i++) {
+			brelse(handle->buffs[i]);
+			handle->buffs[i] = NULL;
+		}
 	}
 	handle->num_buffs = 0;
 	if (handle->buffs) {
@@ -801,6 +791,123 @@
 	return;
 } /* ocfs_abort_trans */
 
+/* 
+ * 'nblocks' is what you want to add to the current
+ * transaction. extend_trans will either extend the current handle by
+ * nblocks, or commit it and start a new one with nblocks credits.
+ *
+ * Returns 0 on success and a negative status on failure, including
+ * -ENOMEM when the replacement buffer arrays cannot be allocated.
+ *
+ * WARNING: This will not release any semaphores or disk locks taken
+ * during the transaction, so make sure they were taken *before*
+ * start_trans or we'll have ordering deadlocks. 
+ *
+ * This function would be a lot simpler if we didn't have to worry
+ * about abort. 
+ */
+int ocfs_extend_trans(ocfs_journal_handle *handle, int nblocks)
+{
+	int status, new_max_buffs, new_num_co, new_num_buffs, i;
+	int restarted = 0;
+	struct buffer_head **new_buffs = NULL;
+	ocfs_journal_copyout *new_co_buffs = NULL;
+
+	OCFS_ASSERT(handle);
+	OCFS_ASSERT(handle->flags & OCFS_HANDLE_STARTED);
+	OCFS_ASSERT(nblocks);
+
+	LOG_ENTRY();
+
+	printk("Trying to extend transaction by %d blocks\n", nblocks);
+
+	status = journal_extend(handle->k_handle, nblocks);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	if (status > 0) {
+		printk("journal_extend failed, trying journal_restart\n");
+		status = journal_restart(handle->k_handle, nblocks);
+		if (status < 0) {
+#warning we need to handle this better
+			handle->k_handle = NULL;
+			LOG_ERROR_STATUS(status);
+			goto bail;
+		}
+
+		restarted = 1;
+		new_num_co = 0;
+		new_num_buffs = 0;
+		new_max_buffs = nblocks;
+	} else {
+		new_num_co = handle->num_co;
+		new_num_buffs = handle->num_buffs;
+		new_max_buffs = handle->max_buffs + nblocks;
+	}
+
+	new_buffs = ocfs_malloc(sizeof(struct buffer_head *) * new_max_buffs);
+	if (!new_buffs) {
+		/* status is 0 here (journal_extend/journal_restart
+		 * succeeded); report the failure instead of success. */
+		status = -ENOMEM;
+		LOG_ERROR_STR("Failed to allocate memory for journal buffs!");
+		goto bail;
+	}
+	memset(new_buffs, 0, sizeof(struct buffer_head *) * new_max_buffs);
+
+	new_co_buffs = ocfs_malloc(sizeof(ocfs_journal_copyout)*new_max_buffs);
+	if (!new_co_buffs) {
+		kfree(new_buffs);
+		status = -ENOMEM;
+		LOG_ERROR_STR("Failed to allocate memory for co_buffs!");
+		goto bail;
+	}
+	memset(new_co_buffs, 0, sizeof(ocfs_journal_copyout) * new_max_buffs);
+
+	if (!restarted) {
+		/* same transaction: carry the tracked entries over */
+		if (handle->num_buffs)
+			memcpy(new_buffs, handle->buffs, 
+			       sizeof(*new_buffs) * handle->num_buffs);
+		if (handle->num_co)
+			memcpy(new_co_buffs, handle->co_buffs, 
+			       sizeof(*new_co_buffs) * handle->num_co);
+	}
+
+	if (restarted) {
+		/* only brelse and free copyout buffers if we restarted. */
+		if (!(handle->flags & OCFS_HANDLE_ALWAYS_COMMITS)) {
+			for(i = 0; i < handle->num_buffs; i++) {
+				brelse(handle->buffs[i]);
+				handle->buffs[i] = NULL;
+			}
+		}
+
+		if (handle->buffs) {
+			kfree(handle->buffs);
+			handle->buffs = NULL;
+		}
+
+		ocfs_handle_free_all_copyout(handle);
+	} else {
+		kfree(handle->buffs);
+		kfree(handle->co_buffs);
+	}
+
+	handle->buffs = new_buffs;
+	handle->num_buffs = new_num_buffs;
+	handle->max_buffs = new_max_buffs;
+	handle->co_buffs = new_co_buffs;
+	handle->num_co = new_num_co;
+	status = 0;
+bail:
+
+	LOG_EXIT_STATUS(status);
+	return(status);
+}
+
 /*
  * ocfs_journal_access
  */
@@ -927,21 +1026,16 @@
  */
 int ocfs_journal_dirty(ocfs_journal_handle *handle, struct buffer_head *bh) 
 {
-	int status = -1;
-	int i;
+	int status = -EINVAL;
+	int i = 0;
 
 	OCFS_ASSERT((handle->flags & OCFS_HANDLE_STARTED));
 
 	LOG_ENTRY_ARGS("(bh->b_blocknr=%llu)\n",
 			(unsigned long long)bh->b_blocknr);
 
-	if (handle->num_buffs >= handle->max_buffs) {
-		LOG_ERROR_ARGS("Cannot add buffer to full transaction! "
-			       "num_buffs=%d, max_buffs=%d, block=%llu\n",
-			       handle->num_buffs, handle->max_buffs,
-			       (unsigned long long)bh->b_blocknr);
-		goto done;
-	}
+	if (handle->flags & OCFS_HANDLE_ALWAYS_COMMITS)
+		goto call_jbd;
 
 	/* First, make sure we aren't already in the list. If we've
 	 * already been added, then that's OK as JBD knows how to
@@ -956,6 +1050,8 @@
 		}
 	}
 
+	OCFS_ASSERT(handle->num_buffs < handle->max_buffs);
+
 	i = handle->num_buffs;
 	/* Increase the ref count on this buffer. We
 	 * do this because we still want to keep them
@@ -970,10 +1066,6 @@
 		LOG_ERROR_ARGS("Could not dirty metadata buffer. "
 			       "(bh->b_blocknr=%llu)\n",
 			       (unsigned long long)bh->b_blocknr);
-		LOG_TRACE_ARGS("Setting handle->buffs[%d] = NULL\n", i);
-		brelse(bh);
-		handle->buffs[i] = NULL;
-		handle->num_buffs--;
 		goto done;
 	}
 
@@ -1097,15 +1189,11 @@
 		goto done;
 	}
 
-	down_write(&OCFS_I(inode)->ip_io_sem);
-
 	SET_INODE_JOURNAL(inode);
 
 	status = ocfs_acquire_lock (osb, OCFS_LKM_EXMODE,
 				    0, &bh, inode);
 	if (status < 0) {
-		up_write(&OCFS_I(inode)->ip_io_sem);
-
 		if (status != -EINTR)
 			LOG_ERROR_STR("Could not get lock on journal!");
 		goto done;
@@ -1118,9 +1206,6 @@
 			       "You must run tuneocfs to add a journal for this node.\n",
 			       fe->i_size, OCFS_JOURNAL_DEFAULT_SIZE);
 		status = -EINVAL;
-		fe = NULL;
-		up_write(&OCFS_I(inode)->ip_io_sem);
-
 		goto done;
 	}
 
@@ -1141,8 +1226,6 @@
 
 	OCFS_I(inode)->ip_open_cnt++;
 
-	up_write(&OCFS_I(inode)->ip_io_sem);
-
 	/* call the kernels journal init function now */
 	k_journal = journal_init_inode(inode);
 	if (k_journal == NULL) {
@@ -1525,8 +1608,6 @@
 		goto done;
 	}
 
-	down_write(&OCFS_I(inode)->ip_io_sem);
-
 	SET_INODE_JOURNAL(inode);
 
 	/* Should not ever be called to recover ourselves -- in that
@@ -1537,7 +1618,6 @@
 	status = ocfs_acquire_lock (osb, OCFS_LKM_EXMODE, FLAG_FILE_RECOVERY, 
 				    &bh, inode);
 
-	up_write(&OCFS_I(inode)->ip_io_sem);
 	if (status < 0) {
 		LOG_TRACE_ARGS("status returned from acquire_lock=%d\n", 
 			       status);
@@ -1684,7 +1764,6 @@
 		goto bail;
 	}
 
-	down_write(&OCFS_I(orphan_dir_inode)->ip_io_sem);
 	status = ocfs_acquire_lock_ro(osb, orphan_dir_inode);
 	if (status < 0) {
 		LOG_ERROR_STATUS(status);
@@ -1694,13 +1773,15 @@
 
 	offset = 0;
 	iter = NULL;
+	down_read(&OCFS_I(orphan_dir_inode)->ip_io_sem);
 	while(offset < orphan_dir_inode->i_size) {
 		blk = offset >> sb->s_blocksize_bits;
 
-		bh = ocfs_bread(NULL, orphan_dir_inode, blk, 0, &status, 0);
+		bh = ocfs_bread(orphan_dir_inode, blk, &status, 0);
 		if (!bh)
 			status = -EINVAL;
 		if (status < 0) {
+			up_read(&OCFS_I(orphan_dir_inode)->ip_io_sem);
 			if (bh)
 				brelse(bh);
 			LOG_ERROR_STATUS(status);
@@ -1714,6 +1795,7 @@
 
 			if (!ocfs_check_dir_entry(orphan_dir_inode,
 						  de, bh, local)) {
+				up_read(&OCFS_I(orphan_dir_inode)->ip_io_sem);
 				status = -EINVAL;
 				LOG_ERROR_STATUS(status);
 				brelse (bh);
@@ -1754,6 +1836,7 @@
 		}
 		brelse(bh);
 	}
+	up_read(&OCFS_I(orphan_dir_inode)->ip_io_sem);
 
 	status = ocfs_release_lock_ro(osb, orphan_dir_inode);
 	have_disk_lock = 0;
@@ -1762,7 +1845,6 @@
 		goto bail;
 	}
 
-	up_write(&OCFS_I(orphan_dir_inode)->ip_io_sem);
 	iput(orphan_dir_inode);
 	orphan_dir_inode = NULL;
 
@@ -1781,10 +1863,9 @@
 			LOG_ERROR_STATUS(tmpstat);
 	}
 
-	if (orphan_dir_inode) {
-		up_write(&OCFS_I(orphan_dir_inode)->ip_io_sem);
+	if (orphan_dir_inode) 
 		iput(orphan_dir_inode);
-	}
+
 	return(status);
 }
 

Added: trunk/src/localalloc.c
===================================================================
--- trunk/src/localalloc.c	2004-10-02 01:01:24 UTC (rev 1542)
+++ trunk/src/localalloc.c	2004-10-02 01:08:08 UTC (rev 1543)
@@ -0,0 +1,1013 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * localalloc.c
+ *
+ * Node local data allocation
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Authors: Kurt Hackel, Mark Fasheh, Sunil Mushran, Wim Coekaerts,
+ *	    Manish Singh, Neeraj Goyal, Suchit Kaura
+ */
+
+#define OCFS_DEBUG_CONTEXT    OCFS_DEBUG_CONTEXT_LOCALALLOC
+
+#include "ocfs_compat.h"
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#include "ocfs_log.h"
+#include "ocfs.h"
+
+#include "alloc.h"
+#include "bitmap.h"
+#include "dlm.h"
+#include "extmap.h"
+#include "inode.h"
+#include "localalloc.h"
+#include "util.h"
+#include "suballoc.h"
+#include "sysfile.h"
+#include "file.h"
+
+#include "ocfs_journal.h"
+#include "buffer_head_io.h"
+
+/* The largest cluster size where we even consider using local alloc. */
+#define OCFS_LOCAL_ALLOC_MAX_CSIZE    (128 * 1024)
+
+/* The largest allocation to use the local bitmap for. */
+#define OCFS_LOCAL_ALLOC_MAX_ALLOC    (2 * 1024 * 1024)
+
+static inline int ocfs_local_alloc_window_bits(ocfs_super *osb);
+
+static __u32 ocfs_local_alloc_count_bits(ocfs2_dinode *alloc);
+
+static int ocfs_local_alloc_find_clear_bits(ocfs_super *osb,
+				      ocfs2_dinode *alloc,
+				      __u32 numbits);
+
+static void ocfs_clear_local_alloc(ocfs2_dinode *alloc);
+
+static int ocfs_sync_local_from_shutdown(ocfs_super *osb, 
+					 ocfs_bitmap_free_head **f, 
+					 struct buffer_head *local_alloc_bh, 
+					 int in_recovery);
+
+static int ocfs_sync_local_to_main(ocfs_super *osb, 
+				   ocfs_journal_handle *handle, 
+				   ocfs2_dinode *alloc,
+				   struct inode *main_bm_inode,
+				   struct buffer_head *main_bm_bh);
+
+static int ocfs_local_alloc_reserve_for_window(ocfs_super *osb, 
+					       ocfs_journal_handle *handle,
+					       ocfs2_alloc_context **ac,
+					       struct inode **bitmap_inode,
+					       struct buffer_head **bitmap_bh);
+
+static int ocfs_local_alloc_new_window(ocfs_super *osb, 
+				       ocfs_journal_handle *handle,
+				       ocfs2_alloc_context *ac);
+
+static int ocfs_local_alloc_slide_window(ocfs_super *osb, 
+					 struct inode * local_alloc_inode);
+
+/*
+ * ocfs_local_alloc_window_bits
+ *
+ * Returns the size of the local alloc window in bits, where each bit
+ * stands for one cluster. The size is picked from the volume's
+ * cluster size so that bigger clusters get a smaller window.
+ *
+ * This is entirely changeable -- just replace this function. The
+ * mapping below is only a *testing* default:
+ *
+ *   cluster size      window
+ *   4k                1024 bits
+ *   8k                512 bits
+ *   16k               256 bits
+ *   32k               128 bits
+ *   anything else     64 bits (this includes 64k)
+ */
+static inline int ocfs_local_alloc_window_bits(ocfs_super *osb)
+{
+	switch (osb->s_clustersize) {
+	case (4*1024):
+		return(1024);
+
+	case (8*1024):
+		return(512);
+
+	case (16*1024):
+		return(256);
+
+	case (32*1024):
+		return(128);
+
+	default:
+		/* 64k, plus any cluster size not listed above */
+		break;
+	}
+	return(64);
+} /* ocfs_local_alloc_window_bits */
+
+int ocfs_alloc_should_use_local(ocfs_super *osb, u64 bits)
+{
+	/* 1 only if local alloc is loaded and the request fits both the
+	 * byte cap (OCFS_LOCAL_ALLOC_MAX_ALLOC) and the window size. */
+	return(osb->have_local_alloc
+	       && ((bits<<osb->s_clustersize_bits) <= OCFS_LOCAL_ALLOC_MAX_ALLOC)
+	       && (bits <= ocfs_local_alloc_window_bits(osb)));
+}
+
+/*
+ * ocfs_load_local_alloc - read this node's local alloc inode, check that
+ * it is clean and sanely sized, then keep its buffer on the osb. */
+int ocfs_load_local_alloc(ocfs_super *osb)
+{
+	int status = 0;
+	ocfs2_dinode *alloc = NULL;
+	struct buffer_head *alloc_bh = NULL;
+	__u32 num_used;
+	struct inode *inode = NULL;
+
+	LOG_ENTRY();
+
+	/* no local alloc for cluster sizes above 128k; we still return 0 */
+	if (osb->s_clustersize > OCFS_LOCAL_ALLOC_MAX_CSIZE)
+		goto bail;
+
+	/* read the alloc off disk */
+	inode = ocfs_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE, 
+					   osb->node_num);
+	if (!inode) {
+		LOG_ERROR_STATUS(status=-EINVAL);
+		goto bail;
+	}
+	status = ocfs_read_bh(osb,
+			      OCFS_I(inode)->ip_blkno << inode->i_sb->s_blocksize_bits,
+			      &alloc_bh, 0, inode);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	alloc = (ocfs2_dinode *) alloc_bh->b_data;
+
+	/* do a little verification. */
+	num_used = ocfs_local_alloc_count_bits(alloc);
+
+	/* nowadays the local alloc has always been recovered before
+	 * we load it so there should be no bits used from the main
+	 * bitmap. */
+	if (num_used
+	    || LOCAL_ALLOC(alloc)->la_bits_set
+	    || LOCAL_ALLOC(alloc)->la_bm_bits 
+	    || LOCAL_ALLOC(alloc)->la_bm_off) {
+		LOG_ERROR_ARGS("Local alloc hasn't been recovered!\n"
+			       "found = %u, set = %u, taken = %u, off = %u\n",
+			       num_used,
+			       LOCAL_ALLOC(alloc)->la_bits_set, 
+			       LOCAL_ALLOC(alloc)->la_bm_bits,
+			       LOCAL_ALLOC(alloc)->la_bm_off);
+		status = -EFAIL;
+		goto bail;
+	}
+
+	if (!LOCAL_ALLOC(alloc)->la_size || 
+	    (LOCAL_ALLOC(alloc)->la_size > ocfs2_local_alloc_size(inode->i_sb))) {
+		LOG_ERROR_ARGS("Local alloc size is invalid (la_size = %u)\n",
+			      LOCAL_ALLOC(alloc)->la_size);
+		status = -EINVAL;
+		goto bail;
+	}
+
+	osb->local_alloc_bh = alloc_bh;	/* the buffer reference stays with osb */
+	osb->have_local_alloc = 1;
+
+bail:
+	if (status < 0)	/* only failures drop the buffer here */
+		if (alloc_bh)
+			brelse(alloc_bh);
+	if (inode)
+		iput(inode);
+
+	LOG_EXIT_STATUS(status);
+	return(status);
+} /* ocfs_load_local_alloc */
+
+/*
+ * ocfs_shutdown_local_alloc
+ *
+ * Return any unused bits to the bitmap and write out a clean
+ * local_alloc.
+ *
+ * Always works on osb->local_alloc_bh. On the success path that buffer
+ * is brelse'd and osb->local_alloc_bh is set to NULL; the error paths
+ * taken after the buffer is picked up go to bail without doing either. */
+void ocfs_shutdown_local_alloc(ocfs_super *osb)
+{
+	int status;
+	ocfs2_dinode *alloc = NULL;
+	ocfs_bitmap_free_head *f = NULL;
+	struct buffer_head *bh = NULL;
+	ocfs_journal_handle *handle = NULL;
+	struct inode *local_alloc_inode = NULL;
+	ocfs_inode_private *oip;
+
+	LOG_ENTRY();
+
+	local_alloc_inode = 
+		ocfs_get_system_file_inode(osb, 
+					   LOCAL_ALLOC_SYSTEM_INODE,
+					   osb->node_num);
+	if (!local_alloc_inode) {
+		status = -ENOENT;
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	oip = OCFS_I(local_alloc_inode);
+
+	/* Take io_sem here to turn off local alloc before another guy
+	 * can come in and start using him. */
+	down_write(&oip->ip_io_sem);
+	if (!osb->have_local_alloc) {
+		up_write(&oip->ip_io_sem);
+		goto bail;	/* local alloc is not in use: nothing to do */
+	}
+	osb->have_local_alloc = 0;
+	up_write(&oip->ip_io_sem);
+
+	bh = osb->local_alloc_bh;
+
+	status = ocfs_sync_local_from_shutdown(osb, &f, bh, 0);
+	if (status < 0)
+		LOG_ERROR_STATUS(status);	/* logged only; we carry on */
+
+	handle = ocfs_start_trans(osb, NULL, 1);
+	if (!handle) {
+		LOG_ERROR_STATUS(-ENOMEM);
+		goto bail;
+	}
+	ocfs_handle_set_always_commits(handle, 1);
+
+	status = ocfs_journal_access(handle, bh, OCFS_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	alloc = (ocfs2_dinode *) bh->b_data;
+	ocfs_clear_local_alloc(alloc);
+
+	status = ocfs_journal_dirty(handle, bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	brelse(bh);
+
+	ocfs_commit_trans(handle);
+	handle = NULL;	/* keeps bail from committing a second time */
+
+	osb->local_alloc_bh = NULL;
+	osb->have_local_alloc = 0;
+
+	if (f)	/* f holds the bits queued by the sync above */
+		ocfs_process_bitmap_free_head(osb, f);
+
+bail:
+	if (handle)
+		ocfs_commit_trans(handle);
+
+	if (f)
+		ocfs_free_bitmap_free_head(f);
+
+	if (local_alloc_inode)
+		iput(local_alloc_inode);
+
+	LOG_EXIT();
+	return;
+} /* ocfs_shutdown_local_alloc */
+
+/*
+ * ocfs_recover_local_alloc - clean out node_num's local alloc file.
+ * The bits to give back to the bitmap are collected in *bits_to_free
+ * (allocated for you if needed) rather than freed here, because we want
+ * to free them outside of any recovery context. The cleared local alloc
+ * is written with ocfs_write_bh(). Returns 0 or a negative error. */
+int ocfs_recover_local_alloc(ocfs_super *osb, 
+			     int node_num, 
+			     ocfs_bitmap_free_head **bits_to_free)
+{
+	int status = 0;
+	struct buffer_head *alloc_bh = NULL;
+	struct inode *inode = NULL;
+	ocfs2_dinode *alloc;
+
+	LOG_ENTRY_ARGS("(node_num = %d)\n", node_num);
+
+	inode = ocfs_get_system_file_inode(osb, 
+					   LOCAL_ALLOC_SYSTEM_INODE, 
+					   node_num);
+	if (!inode) {
+		LOG_ERROR_STATUS(status=-EINVAL);
+		goto bail;
+	}
+
+	status = ocfs_read_bh(osb,
+			      OCFS_I(inode)->ip_blkno << inode->i_sb->s_blocksize_bits,
+			      &alloc_bh, 
+			      0, inode);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	status = ocfs_sync_local_from_shutdown(osb, 
+					       bits_to_free, 
+					       alloc_bh, 
+					       1);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;	/* the on-disk local alloc is left as it was */
+	}
+
+	alloc = (ocfs2_dinode *) alloc_bh->b_data;
+	ocfs_clear_local_alloc(alloc);
+
+	status = ocfs_write_bh(osb, alloc_bh, inode);
+	if (status < 0)
+		LOG_ERROR_STATUS(status);
+
+bail:
+	if (alloc_bh)
+		brelse(alloc_bh);
+
+	if (inode)
+		iput(inode);
+
+	LOG_EXIT_STATUS(status);
+	return(status);
+} /* ocfs_recover_local_alloc */
+
+/*
+ * ocfs_reserve_local_alloc_bits
+ *
+ * Make sure we've got at least bits_wanted contiguous bits in the
+ * local alloc. You lose them when you drop ip_io_sem.
+ *
+ * We will add ourselves to the transaction passed in, but may start
+ * our own in order to shift windows. Returns 0 with ac->ac_inode and
+ * ac->ac_bh set (both referenced), or a negative error (-ENOSPC when
+ * local alloc is off or the request is too big for it).
+ *
+ * When we stop being lame and support multiple chunks of discontiguous
+ * space, this turns into a really simple check of ->la_bits_set. */
+int ocfs_reserve_local_alloc_bits(ocfs_super *osb, 
+				  ocfs_journal_handle *passed_handle,
+				  u32 bits_wanted,
+				  ocfs2_alloc_context *ac)
+{
+	int status;
+	struct inode *local_alloc_inode = NULL;
+	int startoff;
+	ocfs2_dinode *alloc;
+
+	LOG_ENTRY();
+
+	OCFS_ASSERT(passed_handle);
+	OCFS_ASSERT(ac);
+	OCFS_ASSERT(!(passed_handle->flags & OCFS_HANDLE_STARTED));
+
+	local_alloc_inode = 
+		ocfs_get_system_file_inode(osb, 
+					   LOCAL_ALLOC_SYSTEM_INODE,
+					   osb->node_num);
+	if (!local_alloc_inode) {
+		status = -ENOENT;
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	ocfs_handle_add_inode(passed_handle, local_alloc_inode);
+
+	if (!osb->have_local_alloc) {
+		status = -ENOSPC;
+		goto bail;
+	}
+
+#warning "isn't it about time we turned this check off?"
+	if (bits_wanted > ocfs_clusters_for_bytes(osb->sb, 
+						 OCFS_LOCAL_ALLOC_MAX_ALLOC)) {
+		LOG_TRACE_STR("Asking for more than max local alloction!\n");
+		status = -ENOSPC;
+		goto bail;
+	}
+
+	if (bits_wanted > ocfs_local_alloc_window_bits(osb)) {
+		LOG_TRACE_STR("Asking for more than my max window size!\n");
+		status = -ENOSPC;
+		goto bail;
+	}
+
+	alloc = (ocfs2_dinode *) osb->local_alloc_bh->b_data;
+	startoff = ocfs_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
+	if (startoff == -1) {
+		/* uhoh, window change time. (The new window is not re-searched.) */
+		status = 
+			ocfs_local_alloc_slide_window(osb, local_alloc_inode);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				LOG_ERROR_STATUS(status);
+			goto bail;
+		}
+	}
+
+	ac->ac_inode = igrab(local_alloc_inode);	/* ac gets its own refs */
+	get_bh(osb->local_alloc_bh);
+	ac->ac_bh = osb->local_alloc_bh;
+
+	status = 0;
+bail:
+	if (local_alloc_inode)
+		iput(local_alloc_inode);	/* drops the lookup ref from above */
+
+	LOG_EXIT_STATUS(status);
+	return(status);
+}
+/* Set the bits an OCFS_AC_USE_LOCAL context still wants in the local alloc. */
+int ocfs_claim_local_alloc_bits(ocfs_super *osb,
+				ocfs_journal_handle *handle,
+				ocfs2_alloc_context *ac,
+				u32 min_bits,	/* not used here */
+				u32 *bit_off,
+				u32 *num_bits)
+{
+	int status, start;
+	struct inode *local_alloc_inode;
+	u32 bits_wanted;
+	void *bitmap;
+	ocfs2_dinode *alloc;
+
+	LOG_ENTRY();
+	OCFS_ASSERT(ac->ac_which == OCFS_AC_USE_LOCAL);
+
+	bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;	/* still owed */
+	local_alloc_inode = ac->ac_inode;	/* assigned but not used below */
+	alloc = (ocfs2_dinode *) osb->local_alloc_bh->b_data;
+
+	start = ocfs_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
+	if (start == -1) {
+		/* TODO: Shouldn't we just BUG here? */
+		status = -ENOSPC;
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	bitmap = LOCAL_ALLOC(alloc)->la_bitmap;
+	*bit_off = LOCAL_ALLOC(alloc)->la_bm_off + start;	/* window offset + bit */
+	/* local alloc is always contiguous by nature -- we never
+	 * delete bits from it! */
+	*num_bits = bits_wanted;
+
+	status = ocfs_journal_access(handle, osb->local_alloc_bh, 
+				     OCFS_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	while(bits_wanted--)	/* la_bits_set is not updated in this function */
+		set_bit(start++, bitmap);
+
+	status = ocfs_journal_dirty(handle, osb->local_alloc_bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	status = 0;
+bail:
+	LOG_EXIT_STATUS(status);
+	return(status);
+}
+
+/*
+ * ocfs_local_alloc_count_bits - add up BITCOUNT() over the first
+ * la_size bytes of the local alloc bitmap and return the total. */
+static __u32 ocfs_local_alloc_count_bits(ocfs2_dinode *alloc)
+{
+	__u32 nset = 0;
+	__u8 *cur;
+	__u8 byte;
+	int idx = 0;
+
+	LOG_ENTRY();
+
+	cur = LOCAL_ALLOC(alloc)->la_bitmap;
+	while (idx < LOCAL_ALLOC(alloc)->la_size) {
+		byte = *cur++;
+		nset += BITCOUNT(byte);
+		idx++;
+	}
+
+	LOG_EXIT_ULONG ((unsigned long)nset);
+	return(nset);
+} /* ocfs_local_alloc_count_bits */
+
+/*
+ * ocfs_local_alloc_find_clear_bits - find the first run of numbits clear
+ * bits inside the window; returns its bit offset, or -1 if none fits. */
+static int ocfs_local_alloc_find_clear_bits(ocfs_super *osb,
+					    ocfs2_dinode *alloc,
+					    __u32 numbits)
+{
+	int numfound, bitoff, left, startoff, lastzero;
+	void *bitmap = NULL;
+
+	LOG_ENTRY_ARGS("(numbits wanted = %u)\n", numbits);
+
+	if (LOCAL_ALLOC(alloc)->la_bm_bits == 0) {
+		LOG_TRACE_STR("No bits in my window!");
+		bitoff = -1;
+		goto bail;
+	}
+
+	bitmap = LOCAL_ALLOC(alloc)->la_bitmap;
+
+	numfound = bitoff = startoff = 0;
+	lastzero = -1;	/* set here but never read */
+	left = LOCAL_ALLOC(alloc)->la_bm_bits;	/* search only the window */
+	while ((bitoff = find_next_zero_bit(bitmap, left, startoff)) != -1) {
+		if (bitoff == left) {
+			/* LOG_TRACE_ARGS("bitoff (%d) == left", bitoff); */
+			break;
+		}
+		/* LOG_TRACE_ARGS("Found a zero: bitoff = %d, startoff = %d, "
+		   "numfound = %d\n", bitoff, startoff, numfound);*/
+
+		/* Ok, we found a zero bit... is it contig. or do we
+		 * start over?*/
+		if (bitoff == startoff) {
+			/* we found a zero */
+			numfound++;
+			startoff++;
+		} else {
+			/* got a zero after some ones */
+			numfound = 1;
+			startoff = bitoff+1;
+		}
+		/* we got everything we needed */
+		if (numfound == numbits) {
+			/* LOG_TRACE_STR("Found it all!"); */
+			break;
+		}
+	}
+
+	LOG_TRACE_ARGS("Exiting loop, bitoff = %d, numfound = %d\n", bitoff, 
+		       numfound);
+
+	if (numfound == numbits)
+		bitoff = startoff - numfound;	/* first bit of the run */
+	else
+		bitoff = -1;
+
+bail:
+	LOG_EXIT_STATUS(bitoff);
+	return(bitoff);
+} /* ocfs_local_alloc_find_clear_bits */
+
+/*
+ * ocfs_clear_local_alloc - reset the window fields (offset, size, bits
+ * set) to zero and wipe the first la_size bytes of the bitmap. */
+static void ocfs_clear_local_alloc(ocfs2_dinode *alloc) 
+{
+	LOG_ENTRY();
+
+	/* no window any more ... */
+	LOCAL_ALLOC(alloc)->la_bm_off = 0;
+	LOCAL_ALLOC(alloc)->la_bm_bits = 0;
+	LOCAL_ALLOC(alloc)->la_bits_set = 0;
+
+	/* ... and nothing marked in use; la_size itself is left alone */
+	memset(LOCAL_ALLOC(alloc)->la_bitmap, 0, LOCAL_ALLOC(alloc)->la_size);
+
+	LOG_EXIT();
+} /* ocfs_clear_local_alloc */
+
+/*
+ * Does essentially the same thing as ocfs_sync_local_to_main, but
+ * without a journal handle -- used during shutdown and recovery. Each
+ * bit still clear in the local alloc window is queued on *f, which is
+ * allocated here if the caller passed *f == NULL.
+ *
+ * Returns 0 (also for an empty window), -ENOMEM if *f can't be
+ * allocated, or the ocfs_add_to_bitmap_free_head() error, in which
+ * case *f has been freed and set to NULL. in_recovery is unused. */
+static int ocfs_sync_local_from_shutdown(ocfs_super *osb, 
+					 ocfs_bitmap_free_head **f, 
+					 struct buffer_head *local_alloc_bh, 
+					 int in_recovery)
+{
+	int status = 0;
+	int bit_off, left;
+	ocfs2_dinode *alloc = NULL;
+	void *bitmap;
+
+	LOG_ENTRY();
+
+	if (!local_alloc_bh)
+		BUG();
+
+	alloc = (ocfs2_dinode *) local_alloc_bh->b_data;
+	if (LOCAL_ALLOC(alloc)->la_bm_bits == 0) {
+		LOG_TRACE_STR("nothing to sync!");
+		goto bail;
+	}
+
+	if (!(*f)) {
+		*f = ocfs_alloc_bitmap_free_head();
+		if (*f == NULL) {
+			status = -ENOMEM;
+			LOG_ERROR_STATUS(status);
+			goto bail;
+		}
+	}
+
+	LOG_TRACE_ARGS("alloc->la_bm_bits = %u, COUNT = %u, la_bits_set = %u\n", 
+		       LOCAL_ALLOC(alloc)->la_bm_bits,
+		       ocfs_local_alloc_count_bits(alloc), 
+		       LOCAL_ALLOC(alloc)->la_bits_set);
+
+	bitmap = LOCAL_ALLOC(alloc)->la_bitmap;
+	/* any unset bits in local alloc need to be unset in bitmap. */
+	bit_off = 0;
+	left = LOCAL_ALLOC(alloc)->la_bm_bits;
+	while ((bit_off = find_next_zero_bit(bitmap, left, bit_off)) != -1) {
+		if (bit_off >= left)
+			break;
+		status = ocfs_add_to_bitmap_free_head(osb, *f, 1, 
+						 bit_off + LOCAL_ALLOC(alloc)->la_bm_off,
+						 -1, 0, DISK_ALLOC_VOLUME);
+		if (status < 0) {
+			/* *f is gone, so stop: nothing more can be queued */
+			ocfs_free_bitmap_free_head(*f);
+			*f = NULL;
+			break;
+		}
+		bit_off++;
+	}
+
+bail:
+	LOG_EXIT_STATUS(status);
+	return(status);
+} /* ocfs_sync_local_from_shutdown */
+
+/*
+ * ocfs_sync_local_to_main
+ *
+ * Sync the local alloc to the main bitmap: every bit still clear in
+ * the window is cleared in osb->cluster_bitmap and i_used on the bitmap
+ * inode (main_bm_bh) is decremented, all under the given handle.
+ * Assumes you've already locked the main bitmap -- the bitmap inode
+ * passed is used for caching. */
+static int ocfs_sync_local_to_main(ocfs_super *osb, 
+				   ocfs_journal_handle *handle, 
+				   ocfs2_dinode *alloc,
+				   struct inode *main_bm_inode,
+				   struct buffer_head *main_bm_bh)
+{
+	int status = 0;
+	int bit_off, left;
+	void *bitmap;
+	unsigned int start, numblocks, bitmapblocks;
+	ocfs2_dinode *bm_fe;
+
+	LOG_ENTRY_ARGS("alloc->la_bm_bits = %u, COUNT = %u, la_bits_set = %u\n", 
+		       LOCAL_ALLOC(alloc)->la_bm_bits,
+		       ocfs_local_alloc_count_bits(alloc), 
+		       LOCAL_ALLOC(alloc)->la_bits_set);
+
+	if (LOCAL_ALLOC(alloc)->la_bm_bits == 0) {
+		LOG_TRACE_STR("nothing to sync!");
+		goto bail;
+	}
+
+	bitmapblocks =
+		ocfs_blocks_for_bits(osb->sb,
+ 				     osb->cluster_bitmap.validbits);
+
+	/* figure out which block in the bitmap to start on and how many
+	 * blocks we can span. NOTE(review): the span is computed from
+	 * la_bits_set while the loop below walks la_bm_bits -- confirm. */
+	numblocks = ocfs_bitmap_blocks_affected(osb->sb,
+						LOCAL_ALLOC(alloc)->la_bm_off,
+						LOCAL_ALLOC(alloc)->la_bits_set,
+						&start);
+
+	if ((start + numblocks) > bitmapblocks) {
+		printk("uhoh, bitmap calculation is bad!\n");
+		printk("alloc->la_bm_bits = %u, COUNT = %u, alloc->la_bits_set = %u"
+		       "start=%u, alloc->la_bm_off = %u, numblocks=%u, "
+		       "bitmapblocks = %u\n",
+		       LOCAL_ALLOC(alloc)->la_bm_bits, ocfs_local_alloc_count_bits(alloc), 
+		       LOCAL_ALLOC(alloc)->la_bits_set, start, LOCAL_ALLOC(alloc)->la_bm_off, numblocks,
+		       bitmapblocks);
+
+		BUG();
+	}
+
+	bm_fe = (ocfs2_dinode *) main_bm_bh->b_data;
+	status = ocfs_journal_access(handle, main_bm_bh, 
+				     OCFS_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	LOG_TRACE_ARGS("start=%u, alloc->la_bm_off = %u, numblocks=%u\n", start, 
+		       LOCAL_ALLOC(alloc)->la_bm_off, numblocks);
+	status = ocfs_read_bhs(osb,
+			       (osb->bitmap_blkno + start) << osb->sb->s_blocksize_bits,
+			       numblocks << osb->sb->s_blocksize_bits,
+			       &osb->cluster_bitmap.chunk[start], OCFS_BH_CACHED, 
+			       main_bm_inode);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	bitmap = LOCAL_ALLOC(alloc)->la_bitmap;
+	/* any unset bits in local alloc need to be unset in bitmap. */
+	bit_off = 0;
+	left = LOCAL_ALLOC(alloc)->la_bm_bits;
+	while ((bit_off = find_next_zero_bit(bitmap, left, bit_off)) 
+	       != -1) {
+		if (bit_off >= left) {
+			/*LOG_TRACE_ARGS("bit_off (%d) >= left\n", bit_off);*/
+			break;
+		}
+
+		LOG_TRACE_ARGS("Clearing bit %u in main bitmap\n", 
+			       bit_off + LOCAL_ALLOC(alloc)->la_bm_off);
+		ocfs_clear_bits(osb->sb, handle, &osb->cluster_bitmap, 
+				bit_off + LOCAL_ALLOC(alloc)->la_bm_off,
+				1);
+		bm_fe->id1.bitmap1.i_used--;	/* one bit returned to the main bitmap */
+		bit_off++;
+	}
+
+	status = ocfs_journal_dirty(handle, main_bm_bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS (status);
+		goto bail;
+	}
+
+bail:
+
+	LOG_EXIT_STATUS(status);
+	return(status);
+} /* ocfs_sync_local_to_main */
+/* Build *ac and reserve a full window's worth of main bitmap bits. */
+static int ocfs_local_alloc_reserve_for_window(ocfs_super *osb, 
+					       ocfs_journal_handle *handle,
+					       ocfs2_alloc_context **ac,
+					       struct inode **bitmap_inode,
+					       struct buffer_head **bitmap_bh)
+{
+	int status;
+
+	*ac = kmalloc(sizeof(ocfs2_alloc_context), GFP_KERNEL);
+	if (!(*ac)) {
+		status = -ENOMEM;
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	memset(*ac, 0, sizeof(ocfs2_alloc_context));
+	(*ac)->ac_handle = handle;
+
+	(*ac)->ac_bits_wanted = ocfs_local_alloc_window_bits(osb);
+	status = ocfs_reserve_main_bitmap_bits(osb,
+					       handle,
+					       (*ac)->ac_bits_wanted,
+					       *ac);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	(*ac)->ac_which = OCFS_AC_USE_MAIN;
+
+	*bitmap_inode = (*ac)->ac_inode;	/* hand back the inode and bh ... */
+	igrab(*bitmap_inode);
+	*bitmap_bh = (*ac)->ac_bh;
+	get_bh(*bitmap_bh);	/* ... each with an extra reference for the caller */
+	status = 0;
+bail:
+	if ((status < 0) && *ac) {	/* on failure the caller gets *ac == NULL */
+		ocfs_free_alloc_context(*ac);
+		*ac = NULL;
+	}
+
+	LOG_EXIT_STATUS(status);
+	return(status);
+}
+
+/*
+ * ocfs_local_alloc_new_window - claim a fresh window from the main
+ * bitmap through ac (the reservation made for us beforehand) and record
+ * its offset and size in the local alloc with an all-clear bitmap.
+ * Returns 0 or a negative error (-ENOSPC is not logged). */
+static int ocfs_local_alloc_new_window(ocfs_super *osb, 
+				       ocfs_journal_handle *handle,
+				       ocfs2_alloc_context *ac)
+{
+	int status = 0;
+	u32 cluster_off, cluster_count;
+	ocfs2_dinode *alloc = NULL;
+
+	LOG_ENTRY();
+
+	alloc = (ocfs2_dinode *) osb->local_alloc_bh->b_data;
+	if (LOCAL_ALLOC(alloc)->la_bm_bits != 0)	/* traced only, not an error */
+		LOG_TRACE_STR("asking me to alloc a new window over a"
+			      " non-empty one");
+
+	LOG_TRACE_ARGS("Allocating %u clusters for a new window.\n", 
+		       ocfs_local_alloc_window_bits(osb));
+	/* we used the main bitmap specific reserve function, but we
+	 * set everything up nicely, so there's no reason why we can't
+	 * use the generic claim. */
+	status = ocfs_claim_bits(osb, 
+				 handle, 
+				 ac, 
+				 ocfs_local_alloc_window_bits(osb),
+				 &cluster_off, 
+				 &cluster_count);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	atomic_inc(&osb->alloc_stats.bitmap_data);
+
+	alloc = (ocfs2_dinode *) osb->local_alloc_bh->b_data;
+
+	LOCAL_ALLOC(alloc)->la_bm_off = cluster_off;
+	LOCAL_ALLOC(alloc)->la_bm_bits = cluster_count;	/* what the claim returned */
+	/* just in case... In the future when we find space ourselves,
+	 * we don't have to get all contiguous -- but we'll have to
+	 * set all previously used bits in bitmap and update
+	 * la_bits_set before setting the bits in the main bitmap. */
+	LOCAL_ALLOC(alloc)->la_bits_set = 0;
+	memset(LOCAL_ALLOC(alloc)->la_bitmap, 0,
+	       LOCAL_ALLOC(alloc)->la_size);
+
+	LOG_TRACE_STR("New window allocated:");
+	LOG_TRACE_ARGS("window la_bm_off = %u\n",
+		       LOCAL_ALLOC(alloc)->la_bm_off);
+	LOG_TRACE_ARGS("window la_bm_bits = %u\n",
+		       LOCAL_ALLOC(alloc)->la_bm_bits);
+
+bail:
+	LOG_EXIT_STATUS(status);
+	return(status);
+} /* ocfs_local_alloc_new_window */
+
+/* Swap the current window for a new one. Note that we do *NOT* lock the
+ * local alloc inode here as it's been locked already for us. */
+static int ocfs_local_alloc_slide_window(ocfs_super *osb, 
+					 struct inode *local_alloc_inode)
+{
+	int status = 0;
+	struct buffer_head *main_bm_bh = NULL;
+	struct inode *main_bm_inode = NULL;
+	ocfs_journal_handle *handle = NULL;
+	ocfs2_dinode *alloc;
+	ocfs2_dinode *alloc_copy = NULL;
+	ocfs2_alloc_context *ac = NULL;
+
+	LOG_ENTRY();
+
+	main_bm_inode = ocfs_get_system_file_inode(osb, 
+						   GLOBAL_BITMAP_SYSTEM_INODE, 
+						   -1);
+	if (!main_bm_inode) {
+		status = -EINVAL;
+		LOG_ERROR_STATUS (status);
+		goto bail;
+	}
+
+	handle = ocfs_alloc_handle(osb);
+	if (!handle) {
+		status = -ENOMEM;
+		LOG_ERROR_STATUS (status);
+		goto bail;
+	}
+	/* This will lock the main bitmap for us. NOTE(review): on success it
+	 * also overwrites main_bm_inode; bail iputs only once -- ref leak? */
+	status = ocfs_local_alloc_reserve_for_window(osb, 
+						     handle, 
+						     &ac,
+						     &main_bm_inode,
+						     &main_bm_bh);
+	if (status < 0) {
+		if (status != -EINTR)
+			LOG_ERROR_STATUS (status);
+		goto bail;
+	}
+
+	handle = ocfs_start_trans(osb, handle, OCFS_WINDOW_MOVE_CREDITS);
+	if (!handle) {
+		status = -ENOMEM;
+		LOG_ERROR_STATUS (status);
+		goto bail;
+	}
+	ocfs_handle_set_always_commits(handle, 1);
+
+	alloc = (ocfs2_dinode *) osb->local_alloc_bh->b_data;
+
+	/* We want to clear the local alloc before doing anything
+	 * else, so that if we error later during this operation,
+	 * local alloc shutdown won't try to double free main bitmap
+	 * bits. Make a copy so the sync function knows which bits to
+	 * free. */
+	alloc_copy = kmalloc(osb->local_alloc_bh->b_size, GFP_KERNEL);
+	if (!alloc_copy) {
+		status = -ENOMEM;	/* unlike the other failures, not logged */
+		goto bail;
+	}
+	memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size);
+
+	status = ocfs_journal_access(handle, osb->local_alloc_bh, 
+				     OCFS_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	ocfs_clear_local_alloc(alloc);
+
+	status = ocfs_journal_dirty(handle, osb->local_alloc_bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	status = ocfs_sync_local_to_main(osb, handle, alloc_copy, 
+					 main_bm_inode, main_bm_bh);	/* sync from the copy */
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	status = ocfs_local_alloc_new_window(osb, handle, ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	atomic_inc(&osb->alloc_stats.moves);
+
+	status = 0;
+bail:
+	if (handle)
+		ocfs_commit_trans(handle);
+
+	if (main_bm_bh)
+		brelse(main_bm_bh);
+
+	if (main_bm_inode)
+		iput(main_bm_inode);
+
+	if (alloc_copy)
+		kfree(alloc_copy);
+
+	if (ac)
+		ocfs_free_alloc_context(ac);
+
+	LOG_EXIT_STATUS(status);
+	return(status);
+}
+

Added: trunk/src/localalloc.h
===================================================================
--- trunk/src/localalloc.h	2004-10-02 01:01:24 UTC (rev 1542)
+++ trunk/src/localalloc.h	2004-10-02 01:08:08 UTC (rev 1543)
@@ -0,0 +1,55 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * localalloc.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Authors: Kurt Hackel, Mark Fasheh, Sunil Mushran, Wim Coekaerts,
+ *	    Manish Singh, Neeraj Goyal, Suchit Kaura
+ */
+
+#ifndef OCFS2_LOCALALLOC_H
+#define OCFS2_LOCALALLOC_H
+
+int ocfs_load_local_alloc(ocfs_super *osb);
+
+void ocfs_shutdown_local_alloc(ocfs_super *osb);
+
+int ocfs_recover_local_alloc(ocfs_super *osb, 
+			     int node_num, 
+			     ocfs_bitmap_free_head **bits_to_free);
+
+int ocfs_alloc_should_use_local(ocfs_super *osb, 
+				u64 bits);
+
+int ocfs_reserve_local_alloc_bits(ocfs_super *osb, 
+				  ocfs_journal_handle *passed_handle,
+				  u32 bits_wanted,
+				  ocfs2_alloc_context *ac);
+
+int ocfs_claim_local_alloc_bits(ocfs_super *osb,
+				ocfs_journal_handle *handle,
+				ocfs2_alloc_context *ac,
+				u32 min_bits,
+				u32 *bit_off,
+				u32 *num_bits);
+
+#endif /* OCFS2_LOCALALLOC_H */

Modified: trunk/src/namei.c
===================================================================
--- trunk/src/namei.c	2004-10-02 01:01:24 UTC (rev 1542)
+++ trunk/src/namei.c	2004-10-02 01:08:08 UTC (rev 1543)
@@ -58,6 +58,7 @@
 #include "inode.h"
 #include "lockres.h"
 #include "namei.h"
+#include "suballoc.h"
 #include "util.h"
 #include "vote.h"
 
@@ -71,26 +72,34 @@
 				  const char *name, int namelen, 
 				  unsigned long offset, 
 				  struct ocfs2_dir_entry **res_dir);
+
 static int ocfs_delete_entry(ocfs_journal_handle *handle,
 			     struct inode *dir, 
 			     struct ocfs2_dir_entry *de_del, 
 			     struct buffer_head *bh);
-static int __ocfs_add_entry(ocfs_journal_handle *handle,
-			    struct inode *dir,
-			    const char *name, int namelen, 
-			    struct inode *inode, u64 blkno, 
-			    struct buffer_head *parent_fe_bh);
-static inline int ocfs_match(int len, const char *const name,
-			     struct ocfs2_dir_entry *de);
 
+static int __ocfs_add_entry (ocfs_journal_handle *handle, struct inode *dir,
+			     const char *name, int namelen, 
+			     struct inode *inode, u64 blkno, 
+			     struct buffer_head *parent_fe_bh, 
+			     struct buffer_head *insert_bh);
+
 static int ocfs_mknod_locked(ocfs_super *osb, struct inode *dir, 
 			     struct dentry *dentry, int mode, 
 			     dev_t dev,
 			     struct buffer_head **new_fe_bh, 
 			     struct buffer_head *parent_fe_bh,
 			     ocfs_journal_handle *handle,
-			     struct inode *inode);
+			     struct inode *inode,
+			     ocfs2_alloc_context *inode_ac);
 
+static int ocfs_fill_new_dir(ocfs_super *osb, 
+			     ocfs_journal_handle *handle,
+			     struct inode *parent, 
+			     struct inode *inode,
+			     struct buffer_head *fe_bh,
+			     ocfs2_alloc_context *data_ac);
+
 static int ocfs_double_lock(ocfs_super *osb,
 			    ocfs_journal_handle *handle,
 			    __u32 type1, __u32 flags1, 
@@ -100,8 +109,15 @@
 			    struct buffer_head **bh2,
 		     	    struct inode *inode2);
 
+static int ocfs_prepare_orphan_dir(ocfs_super *osb, 
+				   ocfs_journal_handle *handle,
+				   struct inode *inode,
+				   char **ret_name,
+				   struct buffer_head **de_bh);
+
 static int ocfs_orphan_add(ocfs_super *osb, ocfs_journal_handle *handle,
-			   struct inode *inode, ocfs2_dinode *fe);
+			   struct inode *inode, ocfs2_dinode *fe, 
+			   char *name, struct buffer_head *de_bh);
 
 static int ocfs_create_symlink_data(ocfs_super *osb, 
 				    ocfs_journal_handle *handle, 
@@ -111,11 +127,12 @@
 static inline int ocfs_add_entry(ocfs_journal_handle *handle, 
 				 struct dentry *dentry, 
 				 struct inode *inode, u64 blkno, 
-				 struct buffer_head *parent_fe_bh) 
+				 struct buffer_head *parent_fe_bh,
+				 struct buffer_head *insert_bh) 
 {
 	return(__ocfs_add_entry(handle, dentry->d_parent->d_inode, 
 				dentry->d_name.name, dentry->d_name.len, 
-				inode, blkno, parent_fe_bh));
+				inode, blkno, parent_fe_bh, insert_bh));
 }
 
 /*
@@ -148,11 +165,9 @@
 	LOG_TRACE_ARGS("about to call find_files_on_disk with inode=%p\n", 
 		       dir);
 
-	down_write(&OCFS_I(dir)->ip_io_sem);
 	status = ocfs_find_files_on_disk(osb, dentry->d_name.name,
 					 dentry->d_name.len, &blkno,
 					 dir, 1, &dirent_bh, &dirent);
-	up_write(&OCFS_I(dir)->ip_io_sem);
 	if (status < 0)
 		goto bail_add;
 	
@@ -171,13 +186,82 @@
 bail:
 	if (dirent_bh)
 		brelse(dirent_bh);
-	
+
 	LOG_EXIT_PTR (ret);
 
 	LOG_CLEAR_CONTEXT();
 	return ret;
 }				/* ocfs_lookup */
 
+static int ocfs_fill_new_dir(ocfs_super *osb, 
+			     ocfs_journal_handle *handle,
+			     struct inode *parent, 
+			     struct inode *inode,
+			     struct buffer_head *fe_bh,
+			     ocfs2_alloc_context *data_ac)
+{
+	int status;
+	struct buffer_head *new_bh = NULL;
+	struct ocfs2_dir_entry *de = NULL;
+	/* Set up new directory 'inode': writes its "." and ".." entries. */
+	LOG_ENTRY();
+
+	status = ocfs_do_extend_dir(osb->sb, handle, inode, fe_bh,
+				    data_ac, NULL, &new_bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	/* Journal new_bh as a create and zero it before building entries. */
+	set_buffer_uptodate(new_bh);
+	SET_BH_SEQNUM(inode, new_bh);
+	status = ocfs_journal_access(handle, new_bh, 
+				     OCFS_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	memset(new_bh->b_data, 0, osb->sb->s_blocksize);
+	/* "." -> inode; ".." -> parent, taking the rest of the block. */
+	de = (struct ocfs2_dir_entry *) new_bh->b_data;
+	de->inode = cpu_to_le64(OCFS_I(inode)->ip_blkno);
+	de->name_len = 1;
+	de->rec_len =
+		cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
+	strcpy (de->name, ".");
+	ocfs_set_de_type(de, S_IFDIR);
+	de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
+	de->inode = cpu_to_le64(OCFS_I(parent)->ip_blkno);
+	de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize -
+				  OCFS2_DIR_REC_LEN(1));
+	de->name_len = 2;
+	strcpy (de->name, "..");
+	ocfs_set_de_type(de, S_IFDIR);
+
+	status = ocfs_journal_dirty(handle, new_bh);	
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	/* Directory is now one block long; i_nlink starts at 2. */
+	inode->i_size = inode->i_sb->s_blocksize;
+	inode->i_nlink = 2;
+	inode->i_blocks = 1;
+	status = ocfs_mark_inode_dirty(handle, inode, fe_bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	/* Every failure above jumped to bail with a negative status. */
+	status = 0;
+bail:
+	if (new_bh)
+		brelse(new_bh);
+
+	LOG_EXIT_STATUS(status);
+	return(status);
+}
+
 static int ocfs_mknod(struct inode *dir, struct dentry *dentry,
 		int mode, dev_t dev)
 {
@@ -189,7 +273,10 @@
 	ocfs2_dinode *fe = NULL;
 	ocfs2_dinode *dirfe;
 	struct buffer_head *new_fe_bh = NULL;
+	struct buffer_head *de_bh = NULL;
 	struct inode *inode = NULL;
+	ocfs2_alloc_context *inode_ac = NULL;
+	ocfs2_alloc_context *data_ac = NULL;
 
 	LOG_SET_CONTEXT(MKNOD);
 
@@ -204,8 +291,6 @@
 	}
 #endif
 
-	down_write(&OCFS_I(dir)->ip_io_sem);
-
 	/* get our super block */
 	osb = OCFS_SB(dir->i_sb);
 	if (osb->osb_flags & OCFS_OSB_FLAGS_SHUTDOWN) {
@@ -245,9 +330,8 @@
 			LOG_ERROR_STATUS (status);
 		goto leave;
 	}
-
-	/* Ok, we got the lock -- we'd better add it to our transaction */
 	ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 0, dir);
+	ocfs_handle_add_inode(handle, dir);
 
 	dirfe = (ocfs2_dinode *) parent_fe_bh->b_data;
 	if (!dirfe->i_links_count) {
@@ -256,15 +340,43 @@
 		goto leave;
 	}
 
+	/* get a spot inside the dir. */
+	status = ocfs_prepare_dir_for_insert(osb, dir, parent_fe_bh, 
+					     dentry->d_name.name, 
+					     dentry->d_name.len, &de_bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS (status);
+		goto leave;
+	}
+
+	/* reserve an inode spot */
+	status = ocfs_reserve_new_inode(osb, handle, &inode_ac);
+	if (status < 0) {
+		LOG_ERROR_STATUS (status);
+		goto leave;
+	}
+
+	/* are we making a directory? If so, reserve a cluster for his
+	 * 1st extent. */
+	if (S_ISDIR(mode)) {
+		status = ocfs_reserve_bits(osb, handle, 1, &data_ac);
+		if (status < 0) {
+			LOG_ERROR_STATUS(status);
+			goto leave;
+		}
+	}
+
 	handle = ocfs_start_trans(osb, handle, OCFS_MKNOD_CREDITS);
 	if (handle == NULL) {
 		LOG_ERROR_STATUS (status = -ENOMEM);
 		goto leave;
 	}
+	ocfs_handle_set_always_commits(handle, 1);
 
 	/* do the real work now. */
 	status = ocfs_mknod_locked(osb, dir, dentry, mode, dev,
-				   &new_fe_bh, parent_fe_bh, handle, inode);
+				   &new_fe_bh, parent_fe_bh, handle, 
+				   inode, inode_ac);
 	if (status < 0) {
 		if (status != -EINTR)
 			LOG_ERROR_STATUS(status);
@@ -288,53 +400,20 @@
 
 	status = ocfs_update_lockres(osb, new_fe_bh, inode, 0);
 	if (S_ISDIR (mode)) {
-		struct buffer_head *newdirbh = NULL;
-		int retval = 0;
-		struct ocfs2_dir_entry *de = NULL;
-
-		newdirbh = ocfs_bread (handle, inode, 0, 1, &retval, 0);
-		if (!newdirbh) {
-			LOG_ERROR_STATUS(status = retval);
-			goto leave;
-		}
-		status = ocfs_journal_access(handle, newdirbh, OCFS_JOURNAL_ACCESS_WRITE);
+		status = ocfs_fill_new_dir(osb, handle, dir, inode, 
+					   new_fe_bh, data_ac);
 		if (status < 0) {
-			brelse(newdirbh);
 			LOG_ERROR_STATUS(status);
 			goto leave;
 		}
-		de = (struct ocfs2_dir_entry *) newdirbh->b_data;
-		de->inode = cpu_to_le64(fe->i_blkno);
-		fe = NULL;
-		de->name_len = 1;
-		de->rec_len =
-			cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
-		strcpy (de->name, ".");
-		ocfs_set_de_type(de, S_IFDIR);
-		de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
-		de->inode = cpu_to_le64(OCFS_I(dir)->ip_blkno);
-		de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize -
-					  OCFS2_DIR_REC_LEN(1));
-		de->name_len = 2;
-		strcpy (de->name, "..");
-		ocfs_set_de_type(de, S_IFDIR);
-		inode->i_nlink = 2;
-		status = ocfs_journal_dirty(handle, newdirbh);
-		brelse (newdirbh);
+
+		status = ocfs_journal_access(handle, parent_fe_bh, 
+					     OCFS_JOURNAL_ACCESS_WRITE);
 		if (status < 0) {
 			LOG_ERROR_STATUS(status);
 			goto leave;
 		}
-		inode->i_size = inode->i_sb->s_blocksize;
-	
-		status = ocfs_journal_access(handle, parent_fe_bh, OCFS_JOURNAL_ACCESS_WRITE);
-		if (status < 0) {
-			LOG_ERROR_STATUS(status);
-			goto leave;
-		}
-		fe = (ocfs2_dinode *) parent_fe_bh->b_data;
-		fe->i_links_count++;
-		fe = NULL;
+		dirfe->i_links_count++;
 		status = ocfs_journal_dirty(handle, parent_fe_bh);
 		if (status < 0) {
 			LOG_ERROR_STATUS(status);
@@ -343,17 +422,21 @@
 		dir->i_nlink++;
 	}
 
+	status = ocfs_add_entry(handle, dentry, inode, fe->i_blkno,
+				parent_fe_bh, de_bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS (status);
+		goto leave;
+	}
+
 	insert_inode_hash (inode);
 	d_instantiate (dentry, inode);
-	ocfs_commit_trans(handle);
 
 	status = 0;
 leave:
-	if ((status < 0) && handle)
-		ocfs_abort_trans(handle);
+	if (handle)
+		ocfs_commit_trans(handle);
 
-	up_write(&OCFS_I(dir)->ip_io_sem);
-
 	if (status == -ENOSPC)
 		LOG_TRACE_STR ("Disk is full");
 	else if (status < 0 && status != -EINTR)
@@ -362,12 +445,21 @@
 	if (new_fe_bh) 
 		brelse(new_fe_bh);
 
+	if (de_bh) 
+		brelse(de_bh);
+
 	if (parent_fe_bh != NULL) 
 		brelse(parent_fe_bh);
 
 	if ((status < 0) && inode)
 		iput(inode);
 
+	if (inode_ac)
+		ocfs_free_alloc_context(inode_ac);
+
+	if (data_ac)
+		ocfs_free_alloc_context(data_ac);
+
 	LOG_EXIT_STATUS(status);
 
 	LOG_CLEAR_CONTEXT();
@@ -384,39 +476,27 @@
 			     struct buffer_head **new_fe_bh, 
 			     struct buffer_head *parent_fe_bh,
 			     ocfs_journal_handle *handle,
-			     struct inode *inode)
+			     struct inode *inode,
+			     ocfs2_alloc_context *inode_ac)
 {
 	int status = 0;
 	ocfs2_dinode *fe = NULL;
 	ocfs2_extent_list *fel;
-	__u64 disk_off = 0;
 	u64 fe_blkno = 0;
-	__u64 fileOffset = 0;
-	struct inode *inode_alloc_inode = NULL;
+	u16 suballoc_bit;
 
 	LOG_ENTRY_ARGS ("(0x%p, 0x%p, %d, %lu, '%*s')\n", dir, dentry, mode,
 			(unsigned long)dev, dentry->d_name.len, dentry->d_name.name);
 
 	OCFS_ASSERT(new_fe_bh);
 	*new_fe_bh = NULL;
-	
-	inode_alloc_inode = ocfs_get_system_file_inode(osb, INODE_ALLOC_BITMAP_SYSTEM_INODE, osb->node_num);
-	if (!inode_alloc_inode) {
-		status = -EFAIL;
-		LOG_ERROR_STATUS(status);
-		goto leave;
-	}
 
-	ocfs_handle_add_inode(handle, inode_alloc_inode);
-	status = ocfs_alloc_node_block(osb, osb->sb->s_blocksize,
-			      	       &disk_off, &fileOffset, 
-			       	       osb->node_num, DISK_ALLOC_INODE, 
-		       		       handle);
+	status = ocfs_claim_new_inode(osb, handle, inode_ac, &suballoc_bit,
+				      &fe_blkno);
 	if (status < 0) {
 		LOG_ERROR_STATUS (status);
 		goto leave;
 	}
-	fe_blkno = disk_off >> osb->sb->s_blocksize_bits;
 
 	*new_fe_bh = sb_getblk(osb->sb, fe_blkno);
 	if (!*new_fe_bh) {
@@ -449,7 +529,7 @@
 	spin_unlock(&osb->s_next_gen_lock);
 	fe->i_generation = cpu_to_le32(inode->i_generation);
 	fe->i_blkno = fe_blkno;
-	fe->i_suballoc_blkno = fileOffset >> osb->sb->s_blocksize_bits;
+	fe->i_suballoc_bit = suballoc_bit;
 	fe->i_suballoc_node = osb->node_num;
 	fe->i_uid = current->fsuid;
 	if (dir->i_mode & S_ISGID) {
@@ -494,22 +574,12 @@
 	/* Inode is not yet fully populated, but we need some fields
 	 * for add_entry. */
 	inode->i_mode = mode;
-
-	status = ocfs_add_entry(handle, dentry, inode, fe_blkno,
-				parent_fe_bh);
-	if (status < 0) {
-		LOG_ERROR_STATUS (status);
-		goto leave;
-	}
-
 	SET_BH_SEQNUM(inode, *new_fe_bh);
 leave:
 	if (status < 0 && *new_fe_bh) {
 		brelse(*new_fe_bh);
 		*new_fe_bh = NULL;
 	}
-	if (inode_alloc_inode)
-		iput(inode_alloc_inode);
 
 	LOG_EXIT_STATUS (status);
 	return status;
@@ -563,9 +633,9 @@
 	ocfs_journal_handle *handle = NULL;
 	struct inode *inode = old_dentry->d_inode;
 	int err;
-	int drop_dir_sem = 0, drop_inode_sem = 0;
 	struct buffer_head *fe_bh = NULL;
 	struct buffer_head *parent_fe_bh = NULL;
+	struct buffer_head *de_bh = NULL;
 	ocfs2_dinode *fe = NULL;
 	ocfs_super *osb = OCFS_SB(dir->i_sb);
 
@@ -592,8 +662,6 @@
 		goto bail;
 	}
 
-	down_write(&OCFS_I(dir)->ip_io_sem);
-	drop_dir_sem = 1;
 	/* lock the parent directory */
 	err = ocfs_acquire_lock (osb, OCFS_LKM_EXMODE, 0, 
 				    &parent_fe_bh, dir);
@@ -603,9 +671,16 @@
 		goto bail;
 	}
 	ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 0, dir);
+	ocfs_handle_add_inode(handle, dir);
 
-	down_write(&OCFS_I(inode)->ip_io_sem);
-	drop_inode_sem = 1;
+	err = ocfs_prepare_dir_for_insert(osb, dir, parent_fe_bh, 
+					     dentry->d_name.name, 
+					     dentry->d_name.len, &de_bh);
+	if (err < 0) {
+		LOG_ERROR_STATUS (err);
+		goto bail;
+	}
+
 	err = ocfs_acquire_lock (osb, OCFS_LKM_EXMODE, 0, &fe_bh, inode);
 	if (err < 0) {
 		if (err != -EINTR)
@@ -614,6 +689,7 @@
 	}
 	ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, FLAG_FILE_UPDATE_OIN, 
 			     inode);
+	ocfs_handle_add_inode(handle, inode);
 
 	fe = (ocfs2_dinode *) fe_bh->b_data;
 	if (fe->i_links_count >= OCFS2_LINK_MAX) {
@@ -648,8 +724,8 @@
 		goto bail;
 	}
 
-	err = ocfs_add_entry(handle, dentry, inode,
-			     OCFS_I(inode)->ip_blkno, parent_fe_bh);
+	err = ocfs_add_entry(handle, dentry, inode, OCFS_I(inode)->ip_blkno, 
+			     parent_fe_bh, de_bh);
 	if (err) {
 		fe->i_links_count--;
 		inode->i_nlink--;
@@ -662,12 +738,8 @@
 bail:
 	if (handle)
 		ocfs_commit_trans(handle);
-
-	if (drop_dir_sem)
-		up_write(&OCFS_I(dir)->ip_io_sem);
-	if (drop_inode_sem)
-		up_write(&OCFS_I(inode)->ip_io_sem);
-
+	if (de_bh) 
+		brelse(de_bh);
 	if (fe_bh)
 		brelse(fe_bh);
 	if (parent_fe_bh)
@@ -690,19 +762,22 @@
 	int retval = -EBUSY;
 	ocfs_super *osb = OCFS_SB(dir->i_sb);
 	u64 blkno;
-	struct inode *parentInode = dentry->d_parent->d_inode;
 	ocfs2_dinode *fe = NULL;
 	struct buffer_head *fe_bh = NULL;
 	struct buffer_head *parent_node_bh = NULL; /* parent locknode */
 	ocfs_journal_handle *handle = NULL;
 	struct ocfs2_dir_entry *dirent = NULL;
 	struct buffer_head *dirent_bh = NULL;
+	char *orphan_name;
+	struct buffer_head *orphan_entry_bh = NULL;
 
 	LOG_SET_CONTEXT(UNLINK);
 
 	LOG_ENTRY_ARGS ("(0x%p, 0x%p, '%*s')\n", dir, dentry,
 			dentry->d_name.len, dentry->d_name.name);
 
+	OCFS_ASSERT((dentry->d_parent->d_inode == dir));
+
 	LOG_TRACE_ARGS("ino = %llu\n", OCFS_I(inode)->ip_blkno);
 
 	status = -EBUSY;
@@ -710,31 +785,29 @@
 	if (inode == osb->root_inode) {
 		LOG_TRACE_STR ("Cannot delete the root directory");
 		status = -EPERM;
-		goto bail;
+		goto leave;
 	}
 
 	handle = ocfs_alloc_handle(osb);
 	if (handle == NULL) {
 		LOG_ERROR_STATUS (status = -ENOMEM);
-		goto bail;
+		goto leave;
 	}
 
-	down_write(&OCFS_I(dir)->ip_io_sem);
-	down_write(&OCFS_I(inode)->ip_io_sem);
-
 	status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE, 0, 
-				   &parent_node_bh, parentInode);
+				   &parent_node_bh, dir);
 	if (status < 0) {
 		LOG_ERROR_STATUS(status);
 		goto leave;
 	}
-	ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 0, parentInode);
+	ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 0, dir);
+	ocfs_handle_add_inode(handle, dir);
 
 	/* this will re-read the directory now with the EXCLUSIVE */
 	/* lock already held; it will also return the blkno to us */
 	status = ocfs_find_files_on_disk(osb, dentry->d_name.name,
 					 dentry->d_name.len, &blkno,
-					 parentInode, 0, &dirent_bh,
+					 dir, 0, &dirent_bh,
 					 &dirent);
 	if (status < 0) {
 		LOG_ERROR_STATUS(status);
@@ -753,9 +826,10 @@
 	}
 	ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, FLAG_RELEASE_DENTRY, 
 			     inode);
+	ocfs_handle_add_inode(handle, inode);
 
 	if (S_ISDIR (inode->i_mode)) {
-	       	if (!empty_dir(inode)) {
+	       	if (!ocfs_empty_dir(inode)) {
 			status = -ENOTEMPTY;
 			goto leave;
 		} else if (inode->i_nlink != 2) {
@@ -764,11 +838,22 @@
 		}
 	}
 
+	if (S_ISDIR(inode->i_mode) || (inode->i_nlink == 1)) {
+		status = ocfs_prepare_orphan_dir(osb, handle, inode, 
+						 &orphan_name, 
+						 &orphan_entry_bh);
+		if (status < 0) {
+			LOG_ERROR_STATUS (status);
+			goto leave;
+		}
+	}
+
 	handle = ocfs_start_trans(osb, handle, OCFS_FILE_DELETE_CREDITS);
 	if (handle == NULL) {
 		LOG_ERROR_STATUS (status = -ENOMEM);
 		goto leave;
 	}
+	ocfs_handle_set_always_commits(handle, 1);
 
 	status = ocfs_journal_access(handle, fe_bh, OCFS_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
@@ -784,26 +869,28 @@
 		inode->i_nlink = fe->i_links_count;
 	}
 
-	if (S_ISDIR (inode->i_mode))
-		fe->i_links_count = 0;
-	else
-		fe->i_links_count--;
-	if (!fe->i_links_count) {
-		status = ocfs_orphan_add(osb, handle, inode, fe);
+	if (S_ISDIR(inode->i_mode) || (fe->i_links_count == 1)) {
+		status = ocfs_orphan_add(osb, handle, inode, fe, orphan_name,
+					 orphan_entry_bh);
 		if (status < 0) {
 			LOG_ERROR_STATUS(status);
 			goto leave;
 		}
 	}
 
-	status = ocfs_journal_dirty(handle, fe_bh);
+	/* delete the name from the parent dir */
+	status = ocfs_delete_entry (handle, dir, dirent, dirent_bh);
 	if (status < 0) {
 		LOG_ERROR_STATUS(status);
 		goto leave;
 	}
 
-	/* delete the name from the parent dir */
-	status = ocfs_delete_entry (handle, parentInode, dirent, dirent_bh);
+	if (S_ISDIR (inode->i_mode))
+		fe->i_links_count = 0;
+	else
+		fe->i_links_count--;
+
+	status = ocfs_journal_dirty(handle, fe_bh);
 	if (status < 0) {
 		LOG_ERROR_STATUS(status);
 		goto leave;
@@ -828,22 +915,15 @@
 
 leave:
 	if (handle) {
-		if (status < 0)
-			ocfs_abort_trans(handle);
-		else {
-			ocfs_commit_trans(handle);
-			// already checked to make sure dir has nlink==2
-			if (S_ISDIR (inode->i_mode)) {
-				inode->i_nlink = 0;
-				dir->i_nlink--;
-			} else
-				inode->i_nlink--;
-		}
+		ocfs_commit_trans(handle);
+		// already checked to make sure dir has nlink==2
+		if (S_ISDIR (inode->i_mode)) {
+			inode->i_nlink = 0;
+			dir->i_nlink--;
+		} else
+			inode->i_nlink--;
 	}
 
-	up_write(&OCFS_I(inode)->ip_io_sem);
-	up_write(&OCFS_I(dir)->ip_io_sem);
-bail:
 	if (status < 0 && status != -ENOTEMPTY && 
 	    status != -EPERM && status != -EBUSY && status != -EINTR) {
 		LOG_ERROR_STATUS(status);
@@ -860,6 +940,12 @@
 	if (parent_node_bh)
 		brelse(parent_node_bh);
 
+	if (orphan_entry_bh)
+		brelse(orphan_entry_bh);
+
+	if (orphan_name)
+		kfree(orphan_name);
+
 	LOG_EXIT_INT (retval);
 
 	LOG_CLEAR_CONTEXT();
@@ -933,6 +1019,7 @@
 			goto bail;
 		}
 		ocfs_handle_add_lock(handle, type2, flags2, inode2);
+		ocfs_handle_add_inode(handle, inode2);
 	}
 	/* lock id1 */
 	status = ocfs_acquire_lock(osb, type1, flags1, 
@@ -942,48 +1029,31 @@
 		goto bail;
 	}
 	ocfs_handle_add_lock(handle, type1, flags1, inode1);
-
+	ocfs_handle_add_inode(handle, inode1);
 bail:
-
 	LOG_EXIT_STATUS(status);
 	return(status);
 } /* ocfs_double_lock */
 
-static inline void double_down_write(struct rw_semaphore *s1, 
-				     struct rw_semaphore *s2)
-{
-	if (s1 != s2) {
-		if ((unsigned long) s1 < (unsigned long) s2) {
-			struct rw_semaphore *tmp = s2;
-			s2 = s1; s1 = tmp;
-		}
-		down_write(s1);
-	}
-	down_write(s2);
-}
-
-static inline void double_up_write(struct rw_semaphore *s1, 
-				   struct rw_semaphore *s2)
-{
-	up_write(s1);
-	if (s1 != s2)
-		up_write(s2);
-}
-
 #define PARENT_INO(buffer) \
 	((struct ocfs2_dir_entry *) ((char *) buffer + \
 	le16_to_cpu(((struct ocfs2_dir_entry *) buffer)->rec_len)))->inode
+
 /*
  * ocfs_rename()
  *
  */
 static int ocfs_rename (struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
 {
+#warning "this needs to be split up into seperate functions."
 	int status = 0;
 	struct inode *old_inode = old_dentry->d_inode;
 	struct inode *new_inode = new_dentry->d_inode;
 	ocfs2_dinode *newfe = NULL;
+	char *orphan_name;
+	struct buffer_head *orphan_entry_bh = NULL;
 	struct buffer_head *newfe_bh = NULL;
+	struct buffer_head *insert_entry_bh = NULL;
 	ocfs_super *osb = NULL;
 	u64 newfe_blkno;
 	ocfs_journal_handle *handle = NULL;
@@ -1006,14 +1076,9 @@
 
 	osb = OCFS_SB(old_dir->i_sb);
 
-	double_down_write(&OCFS_I(old_dir)->ip_io_sem, 
-			  &OCFS_I(new_dir)->ip_io_sem);
-	down_write(&OCFS_I(old_inode)->ip_io_sem);
-
 	if (new_inode) {
 		if (ocfs_inc_icount(new_inode) < 0)
 			BUG();
-		down_write(&OCFS_I(new_inode)->ip_io_sem);
 	}
 
 	if (atomic_read (&old_dentry->d_count) > 2) {
@@ -1024,13 +1089,6 @@
 		}
 	}
 
-	if (new_inode && S_ISDIR (old_inode->i_mode) && 
-	    !empty_dir (new_inode)) {
-		status = -ENOTEMPTY;
-		LOG_TRACE_STR ("New (directory) dentry NOT empty!");
-		goto bail;
-	}
-
 	handle = ocfs_alloc_handle(osb);
 	if (handle == NULL) {
 		LOG_ERROR_STATUS(status = -ENOMEM);
@@ -1047,7 +1105,7 @@
 				  &new_dir_bh, new_dir);
 	if (status < 0) {
 		LOG_ERROR_STATUS(status);
-		goto finally;
+		goto bail;
 	}
 
 	/* make sure both dirs have bhs
@@ -1059,7 +1117,7 @@
 		} else {
 			LOG_ERROR_STR("no old_dir_bh!");
 			status = -EIO;
-			goto finally;
+			goto bail;
 		}
 	}
 
@@ -1076,42 +1134,42 @@
 		}
 		if (status < 0) {
 			LOG_ERROR_STATUS(status);
-			goto finally;
+			goto bail;
 		}
-
 		ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 
 				     FLAG_RELEASE_DENTRY|FLAG_FILE_RENAME,
 				     old_inode);
+		ocfs_handle_add_inode(handle, old_inode);
 
 		status = -EIO;
-		old_inode_de_bh = ocfs_bread (handle, old_inode, 0, 0, &status, 0);
+		old_inode_de_bh = ocfs_bread (old_inode, 0, &status, 0);
 		if (!old_inode_de_bh)
-			goto finally;
+			goto bail;
 
 		status = -EIO;
 		if (le64_to_cpu(PARENT_INO(old_inode_de_bh->b_data)) != 
 		    OCFS_I(old_dir)->ip_blkno)
-			goto finally;
+			goto bail;
 		status = -EMLINK;
 		if (!new_inode && new_dir!=old_dir &&
 		    new_dir->i_nlink >= OCFS2_LINK_MAX)
-			goto finally;
+			goto bail;
 	} else {
 		/* Ah, the simple case - we're a file so just send a
 		 * message. */
 		status = ocfs_notify_on_rename(osb, old_inode);
 		if (status < 0) {
 			LOG_ERROR_STATUS(status);
-			goto finally;
+			goto bail;
 		}
 	}
-	
+
 	status = -ENOENT;
 	old_de_bh = ocfs_find_entry(old_dentry->d_name.name, 
 				    old_dentry->d_name.len,
 				    old_dir, &old_de);
 	if (!old_de_bh)
-		goto finally;
+		goto bail;
 
 	/*
 	 *  Check for inode number is _not_ due to possible IO errors.
@@ -1120,7 +1178,7 @@
 	 *  same name. Goodbye sticky bit ;-<
 	 */
 	if (le64_to_cpu(old_de->inode) != OCFS_I(old_inode)->ip_blkno)
-		goto finally;
+		goto bail;
 
 	/* check if the target already exists (in which case we need
 	 * to delete it */
@@ -1134,7 +1192,7 @@
 		/* If we cannot find the file specified we should just */
 		/* return the error... */
 		LOG_ERROR_STATUS (status);
-		goto finally;
+		goto bail;
 	} 
 
 	if (!new_de && new_inode)
@@ -1155,23 +1213,39 @@
 					   new_inode);
 		if (status < 0) {
 			LOG_ERROR_STATUS(status);
-			goto finally;
+			goto bail;
 		}
-
 		ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 
 				     FLAG_RELEASE_DENTRY, new_inode);
+		ocfs_handle_add_inode(handle, new_inode);
 
 		newfe = (ocfs2_dinode *) newfe_bh->b_data;
 
-		/* if our caching is working right, then after the
-		 * verify_update_inode, newfe->i_nlink ==
-		 * new_inode->i_nlink */
-		status = ocfs_refresh_inode (new_inode, newfe);
-
 		LOG_TRACE_ARGS("aha rename over existing... new_de=%p "
 			       "new_blkno=%llu newfebh=%p bhblocknr=%llu\n",
 			       new_de, newfe_blkno, newfe_bh, newfe_bh ?
 			       (unsigned long long)newfe_bh->b_blocknr : 0ULL);
+
+		if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) {
+			status = ocfs_prepare_orphan_dir(osb, handle, 
+							 new_inode, 
+							 &orphan_name,
+							 &orphan_entry_bh);
+			if (status < 0) {
+				LOG_ERROR_STATUS (status);
+				goto bail;
+			}
+		}
+	} else {
+		OCFS_ASSERT(new_dentry->d_parent->d_inode == new_dir);
+		status = ocfs_prepare_dir_for_insert(osb, new_dir, new_dir_bh,
+						     new_dentry->d_name.name, 
+						     new_dentry->d_name.len,
+						     &insert_entry_bh);
+		if (status < 0) {
+			LOG_ERROR_STATUS (status);
+			goto bail;
+		}
 	}
 
 	handle = ocfs_start_trans(osb, handle, OCFS_FILE_RENAME_CREDITS);
@@ -1179,48 +1253,38 @@
 		LOG_ERROR_STATUS(status = -ENOMEM);
 		goto bail;
 	}
+	ocfs_handle_set_always_commits(handle, 1);
 
 	if (new_de) {
+		if (S_ISDIR (new_inode->i_mode)) {
+			if (!ocfs_empty_dir(new_inode) || new_inode->i_nlink != 2) {
+				status = -ENOTEMPTY;
+				goto bail;
+			}
+		}
 		status = ocfs_journal_access(handle, newfe_bh, 
 					     OCFS_JOURNAL_ACCESS_WRITE);
 		if (status < 0) {
 			LOG_ERROR_STATUS (status);
-			goto finally;
+			goto bail;
 		}
 
-		if (S_ISDIR (new_inode->i_mode)) {
-			if (!empty_dir(new_inode) || new_inode->i_nlink != 2) {
-				status = -ENOTEMPTY;
-				goto finally;
-			}
-		}
-
-		if (S_ISDIR (new_inode->i_mode))
-			newfe->i_links_count = 0;
-		else
-			newfe->i_links_count--;
-
-		if (!newfe->i_links_count) {
+		if (S_ISDIR(new_inode->i_mode) || (newfe->i_links_count == 1)){
 			status = ocfs_orphan_add(osb, handle, new_inode,
-						 newfe);
+						 newfe, orphan_name, 
+						 orphan_entry_bh);
 			if (status < 0) {
 				LOG_ERROR_STATUS(status);
-				goto finally;
+				goto bail;
 			}
 		}
 
-		status = ocfs_journal_dirty(handle, newfe_bh);
-		if (status < 0) {
-			LOG_ERROR_STATUS (status);
-			goto finally;
-		}
-
 		/* change the dirent to point to the correct inode */
 		status = ocfs_journal_access(handle, new_de_bh, 
 					     OCFS_JOURNAL_ACCESS_WRITE);
 		if (status < 0) {
 			LOG_ERROR_STATUS (status);
-			goto finally;
+			goto bail;
 		}
 		new_de->inode =
 			le64_to_cpu(OCFS_I(old_inode)->ip_blkno);
@@ -1229,21 +1293,26 @@
 		status = ocfs_journal_dirty(handle, new_de_bh);
 		if (status < 0) {
 			LOG_ERROR_STATUS(status);
-			goto finally;
+			goto bail;
 		}
+
+		if (S_ISDIR (new_inode->i_mode))
+			newfe->i_links_count = 0;
+		else
+			newfe->i_links_count--;
+
+		status = ocfs_journal_dirty(handle, newfe_bh);
+		if (status < 0) {
+			LOG_ERROR_STATUS (status);
+			goto bail;
+		}
 	} else {
 		/* if the name was not found in new_dir, add it now */
 		status = ocfs_add_entry (handle, new_dentry, old_inode, 
 					 OCFS_I(old_inode)->ip_blkno, 
-					 new_dir_bh);
+					 new_dir_bh, insert_entry_bh);
 	}
 
-finally:
-	if (status < 0) {
-		ocfs_abort_trans(handle);
-		goto bail;
-	}
-
 	old_inode->i_ctime = CURRENT_TIME;
 	mark_inode_dirty(old_inode);
 
@@ -1262,19 +1331,9 @@
 	if (old_inode_de_bh) {
 		status = ocfs_journal_access(handle, old_inode_de_bh, 
 					     OCFS_JOURNAL_ACCESS_WRITE);
-		if (status < 0) {
-			LOG_ERROR_STATUS (status);
-			// BAD
-			goto bail;
-		}
 		PARENT_INO(old_inode_de_bh->b_data) =
 			le64_to_cpu(OCFS_I(new_dir)->ip_blkno);
 		status = ocfs_journal_dirty(handle, old_inode_de_bh);
-		if (status < 0) {
-			LOG_ERROR_STATUS(status);
-			// BAD
-			goto bail;
-		}
 		old_dir->i_nlink--;
 		if (new_inode) {
 			new_inode->i_nlink--;
@@ -1316,25 +1375,16 @@
 		}
 	}
 
-	/* commit_trans */
-	ocfs_commit_trans(handle);
+	status = 0;
+bail:
+	if (handle)
+		ocfs_commit_trans(handle);
 
 	if (new_inode)
 		sync_mapping_buffers(old_inode->i_mapping);
 
-	status = 0;
-
-bail:
-
-	double_up_write(&OCFS_I(old_dir)->ip_io_sem, 
-			&OCFS_I(new_dir)->ip_io_sem);
-	up_write(&OCFS_I(old_inode)->ip_io_sem);
-
-	if (new_inode) {
-		up_write(&OCFS_I(new_inode)->ip_io_sem);
+	if (new_inode)
 		iput(new_inode);
-	}
-
 	if (newfe_bh)
 		brelse(newfe_bh);
 	if (old_dir_bh)
@@ -1347,7 +1397,14 @@
 		brelse(old_de_bh);
 	if (old_inode_de_bh)
 		brelse(old_inode_de_bh);
+	if (orphan_entry_bh)
+		brelse(orphan_entry_bh);
+	if (insert_entry_bh)
+		brelse(insert_entry_bh);
+	if (orphan_name)
+		kfree(orphan_name);
 
+
 	LOG_EXIT_STATUS(status);
 
 	LOG_CLEAR_CONTEXT();
@@ -1479,11 +1536,14 @@
 	struct super_block *sb;
 	int l;
 	struct buffer_head *new_fe_bh = NULL;
+	struct buffer_head *de_bh = NULL;
 	struct buffer_head *parent_fe_bh = NULL;
 	ocfs2_dinode *fe = NULL;
 	ocfs2_dinode *dirfe;
 	ocfs_journal_handle *handle = NULL;
 	int credits;
+	ocfs2_alloc_context *inode_ac = NULL;
+	ocfs2_alloc_context *data_ac = NULL;
 
 	LOG_SET_CONTEXT(SYMLINK);
 
@@ -1494,8 +1554,6 @@
 	sb = dir->i_sb;
 	osb = OCFS_SB(sb);
 
-	down_write(&OCFS_I(dir)->ip_io_sem);
-
 	inode = new_inode (sb);
 	if (IS_ERR (inode)) {
 		status = PTR_ERR(inode);
@@ -1514,7 +1572,7 @@
 	l = strlen (symname) + 1;
 	newsize = l - 1;
 
-	credits = ocfs_calc_symlink_credits(sb, newsize);
+	credits = ocfs_calc_symlink_credits(sb);
 
 	handle = ocfs_alloc_handle(osb);
 	if (handle == NULL) {
@@ -1528,35 +1586,56 @@
 	if (status < 0) {
 		if (status != -EINTR)
 			LOG_ERROR_STATUS (status);
-		goto abort_trans;
+		goto bail;
 	}
-
 	ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 0, dir);
+	ocfs_handle_add_inode(handle, dir);
 
 	dirfe = (ocfs2_dinode *) parent_fe_bh->b_data;
 	if (!dirfe->i_links_count) {
 		/* can't make a file in a deleted directory. */
 		status = -ENOENT;
-		goto abort_trans;
+		goto bail;
 	}
 
+	status = ocfs_prepare_dir_for_insert(osb, dir, parent_fe_bh, 
+					     dentry->d_name.name, 
+					     dentry->d_name.len, &de_bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS (status = -ENOMEM);
+		goto bail;
+	}
+
+	status = ocfs_reserve_new_inode(osb, handle, &inode_ac);
+	if (status < 0) {
+		LOG_ERROR_STATUS (status = -ENOMEM);
+		goto bail;
+	}
+
+	status = ocfs_reserve_bits(osb, handle, 1, &data_ac);
+	if (status < 0) {
+		LOG_ERROR_STATUS (status = -ENOMEM);
+		goto bail;
+	}
+
 	handle = ocfs_start_trans(osb, handle, credits);
 	if (handle == NULL) {
 		LOG_ERROR_STATUS (status = -ENOMEM);
 		goto bail;
 	}
+	ocfs_handle_set_always_commits(handle, 1);
 
 	status = ocfs_mknod_locked(osb, dir, dentry, 
  				   S_IFLNK | S_IRWXUGO, 0,
  				   &new_fe_bh, parent_fe_bh, handle,
-				   inode);
+				   inode, inode_ac);
 	if (status < 0) {
 		LOG_ERROR_STATUS(status);
-		goto abort_trans;
+		goto bail;
 	}
 
 	fe = (ocfs2_dinode *) new_fe_bh->b_data;
-	
+
 	if (ocfs_populate_inode (inode, fe, 1) < 0) {
 		LOG_ERROR_ARGS("populate inode failed! bh->b_blocknr=%llu, "
 			       "i_blkno=%llu, i_ino=%lu\n",
@@ -1571,47 +1650,58 @@
 	if (status < 0)
 		LOG_ERROR_STATUS(status);
 
-	status = ocfs_extend_file(osb, newsize, handle, inode, NULL, 0,
-				  new_fe_bh);
+	status = ocfs_extend_allocation(osb, inode, 1, new_fe_bh, handle, 
+					data_ac, NULL, NULL);
 	if (status < 0) {
 		if (status != -ENOSPC && status != -EINTR) {
 			LOG_ERROR_ARGS ("Failed to extend file to %llu", newsize);
 			LOG_ERROR_STATUS(status);
 			status = -ENOSPC;
 		}
-		goto abort_trans;
+		goto bail;
 	}
 	inode->i_rdev = 0;
 	inode->i_size = newsize;
 	inode->i_blocks = (newsize + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
 
+	status = ocfs_mark_inode_dirty(handle, inode, new_fe_bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
 	status = ocfs_inode_fill_ext_map(osb, new_fe_bh, inode);
 	if (status < 0) {
 		LOG_ERROR_STATUS(status);
-		goto abort_trans;
+		goto bail;
 	}
 
 	status = ocfs_create_symlink_data(osb, handle, inode, symname);
-	if (status < 0)
-		LOG_ERROR_STATUS(status);
+	if (status < 0) {
+		LOG_ERROR_STATUS (status);
+		goto bail;
+	}
 
-abort_trans:
+	status = ocfs_add_entry(handle, dentry, inode, fe->i_blkno,
+				parent_fe_bh, de_bh);
 	if (status < 0) {
-		ocfs_abort_trans(handle);
-	} else {
-		insert_inode_hash (inode);
-		d_instantiate (dentry, inode);
-
-		ocfs_commit_trans(handle);
+		LOG_ERROR_STATUS (status);
+		goto bail;
 	}
 
+	insert_inode_hash (inode);
+	d_instantiate (dentry, inode);
 bail:
-	up_write(&OCFS_I(dir)->ip_io_sem);
-	if (new_fe_bh) {
+	if (handle)
+		ocfs_commit_trans(handle);
+	if (new_fe_bh)
 		brelse(new_fe_bh);
-	}
 	if (parent_fe_bh)
 		brelse(parent_fe_bh);
+	if (de_bh)
+		brelse(de_bh);
+	if (inode_ac)
+		ocfs_free_alloc_context(inode_ac);
 
 	LOG_EXIT_STATUS (status);
 
@@ -1644,75 +1734,41 @@
 	return error_msg == NULL ? 1 : 0;
 }
 
-static inline int ocfs_match (int len, const char * const name, struct ocfs2_dir_entry * de)
-{
-	if (len != de->name_len)
-		return 0;
-	if (!de->inode)
-		return 0;
-	return !memcmp(name, de->name, len);
-}
-
 /* we don't always have a dentry for what we want to add, so people
- * like orphan dir can call this instead. */
+ * like orphan dir can call this instead. 
+ *
+ * If you pass me insert_bh, I'll skip the search of the other dir
+ * blocks and put the record in there. 
+*/
 static int __ocfs_add_entry (ocfs_journal_handle *handle, struct inode *dir,
 			     const char *name, int namelen, 
 			     struct inode *inode, u64 blkno, 
-			     struct buffer_head *parent_fe_bh) 
+			     struct buffer_head *parent_fe_bh, 
+			     struct buffer_head *insert_bh) 
 {
 	unsigned long offset;
 	unsigned short rec_len;
-	struct buffer_head * bh;
 	struct ocfs2_dir_entry * de, * de1;
 	struct super_block * sb;
 	int retval, status;
-	ocfs2_dinode *fe = NULL;
 
 	LOG_ENTRY();
+	OCFS_ASSERT(insert_bh);
 
 	sb = dir->i_sb;
 
 	if (!namelen)
 		return -EINVAL;
-	bh = ocfs_bread (handle, dir, 0, 0, &retval, 0);
-	if (!bh)
-		return retval;
+
 	rec_len = OCFS2_DIR_REC_LEN(namelen);
 	offset = 0;
-	de = (struct ocfs2_dir_entry *) bh->b_data;
+	de = (struct ocfs2_dir_entry *) insert_bh->b_data;
 	while (1) {
-		if ((char *)de >= sb->s_blocksize + bh->b_data) {
-			brelse (bh);
-			bh = NULL;
-			bh = ocfs_bread (handle, dir, offset >> sb->s_blocksize_bits, 1, &retval, 0);
-			if (!bh)
-				goto bail;
-			if (dir->i_size <= offset) {
-				if (dir->i_size == 0) {
-					retval = -ENOENT;
-					goto bail;
-				}
-
-				/* create next block */
-				status = ocfs_journal_access(handle, bh, OCFS_JOURNAL_ACCESS_WRITE);
-				de = (struct ocfs2_dir_entry *) bh->b_data;
-				de->inode = 0;
-				de->rec_len = le16_to_cpu(sb->s_blocksize);
-				dir->i_size = offset + sb->s_blocksize;
-				status = ocfs_journal_dirty(handle, bh);
-
-				/* update the parent file entry file size */
-				status = ocfs_journal_access(handle, parent_fe_bh, OCFS_JOURNAL_ACCESS_WRITE);
-				fe = (ocfs2_dinode *) parent_fe_bh->b_data;
-				fe->i_size = dir->i_size;
-				status = ocfs_journal_dirty(handle, parent_fe_bh);
-			} else {
-				/* move to next block */
-				de = (struct ocfs2_dir_entry *) bh->b_data;
-			}
-			
-		}
-		if (!ocfs_check_dir_entry (dir, de, bh, offset)) {
+		OCFS_ASSERT((char *)de < sb->s_blocksize + insert_bh->b_data);
+		/* These checks should've already been passed by the
+		 * prepare function, but I guess we can leave them
+		 * here anyway. */
+		if (!ocfs_check_dir_entry (dir, de, insert_bh, offset)) {
 			retval = -ENOENT;
 			goto bail;
 		}
@@ -1724,7 +1780,8 @@
 		     (le16_to_cpu(de->rec_len) >= rec_len)) ||
 		    (le16_to_cpu(de->rec_len) >=
 		     (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) {
-			status = ocfs_journal_access(handle, bh, OCFS_JOURNAL_ACCESS_WRITE);
+			status = ocfs_journal_access(handle, insert_bh, 
+						    OCFS_JOURNAL_ACCESS_WRITE);
 			/* By now the buffer is marked for journaling */
 			offset += le16_to_cpu(de->rec_len);
 			if (le64_to_cpu(de->inode)) {
@@ -1747,7 +1804,7 @@
 
 			dir->i_mtime = dir->i_ctime = CURRENT_TIME;
 			dir->i_version++;
-			status = ocfs_journal_dirty(handle, bh);
+			status = ocfs_journal_dirty(handle, insert_bh);
 			retval = 0;
 			goto bail;
 		}
@@ -1755,9 +1812,10 @@
 		de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
 	}
 
+	/* when you think about it, the assert above should prevent us
+	 * from ever getting here. */
 	retval = -ENOSPC;
 bail:
-	brelse (bh);
 
 	LOG_EXIT_STATUS(retval);
 	return retval;
@@ -1904,7 +1962,7 @@
 				num++;
 		
 #warning questionable readahead stuff here	
-				bh = ocfs_bread(NULL, dir, b++, 0, &err, 1);
+				bh = ocfs_bread(dir, b++, &err, 1);
 				bh_use[ra_max] = bh;
 #if 0		// ???
 				if (bh)
@@ -1956,44 +2014,62 @@
 	return ret;
 }
 
-/*
- * ocfs_orphan_add()
- *
- */
-static int ocfs_orphan_add(ocfs_super *osb, ocfs_journal_handle *handle,
-			   struct inode *inode, ocfs2_dinode *fe)
+static int ocfs_blkno_stringify(u64 blkno, char **retval)
 {
-	struct inode *orphan_dir_inode = NULL;
-	struct buffer_head *orphan_dir_bh = NULL;
-	int status = 0;
 	char *name = NULL;
 	int namelen;
-	ocfs2_dinode *orphan_fe;
 
-	LOG_ENTRY_ARGS("(inode->i_ino = %lu)\n", inode->i_ino);
+	LOG_ENTRY();
 
-	/* create a unique name here. */
+	*retval = NULL;
 	name = kmalloc(OCFS2_MAX_FILENAME_LENGTH+1, GFP_KERNEL);
 	if (!name) {
-		status = -EFAIL;
-		LOG_ERROR_STATUS(status);
-		goto leave;
+		namelen = -ENOMEM;
+		LOG_ERROR_STATUS(namelen);
+		goto bail;
 	}
 
 	namelen = snprintf(name, OCFS2_MAX_FILENAME_LENGTH+1, "%llu", 
-			   OCFS_I(inode)->ip_blkno);
+			   blkno);
 	if (namelen <= 0) {
-		if (namelen)
-			status = namelen;
-		else
-			status = -EFAIL;
+		kfree(name);
+		if (!namelen)
+			namelen = -EFAULT;
+		LOG_ERROR_STATUS(namelen);
+		goto bail;
+	}
+
+	LOG_TRACE_ARGS("built filename '%s' for orphan dir (len=%d)\n", name, 
+		       namelen);
+
+	*retval = name;
+bail:
+	LOG_EXIT_STATUS(namelen);
+	return(namelen);
+}
+
+static int ocfs_prepare_orphan_dir(ocfs_super *osb, 
+				   ocfs_journal_handle *handle,
+				   struct inode *inode,
+				   char **ret_name,
+				   struct buffer_head **de_bh)
+{
+	struct inode *orphan_dir_inode = NULL;
+	struct buffer_head *orphan_dir_bh = NULL;
+	int status = 0;
+	char *name = NULL;
+	int namelen;
+
+	*ret_name = NULL;
+
+	/* create a unique name here. */
+	namelen = ocfs_blkno_stringify(OCFS_I(inode)->ip_blkno, &name);
+	if (namelen < 0) {
+		status = namelen;
 		LOG_ERROR_STATUS(status);
 		goto leave;
 	}
 
-	LOG_TRACE_ARGS("adding filename '%s' to orphan dir (len=%d)\n", name, 
-		       namelen);
-
 	orphan_dir_inode = ocfs_get_system_file_inode(osb, 
 						      ORPHAN_DIR_SYSTEM_INODE, 
 						      -1);
@@ -2002,7 +2078,6 @@
 		LOG_ERROR_STATUS(status);
 		goto leave;
 	}
-	ocfs_handle_add_inode(handle, orphan_dir_inode);
 
 	/* disk lock orphan dir here. */
 	status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE, 0,
@@ -2013,15 +2088,64 @@
 	}
 	ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 0, 
 			     orphan_dir_inode);
+	ocfs_handle_add_inode(handle, orphan_dir_inode);
 
-	status = __ocfs_add_entry(handle, orphan_dir_inode, name, namelen, 
-				  inode, OCFS_I(inode)->ip_blkno, 
-				  orphan_dir_bh);
+	status = ocfs_prepare_dir_for_insert(osb, orphan_dir_inode, 
+					     orphan_dir_bh, name, namelen, 
+					     de_bh);
 	if (status < 0) {
 		LOG_ERROR_STATUS(status);
 		goto leave;
 	}
 
+	*ret_name = name;
+leave:
+	if (orphan_dir_inode)
+		iput(orphan_dir_inode);
+
+	if ((status < 0) && name)
+		kfree(name);
+
+	if (orphan_dir_bh)
+		brelse(orphan_dir_bh);
+
+	LOG_EXIT_STATUS(status);
+	return(status);
+}
+
+/*
+ * ocfs_orphan_add()
+ *
+ */
+static int ocfs_orphan_add(ocfs_super *osb, ocfs_journal_handle *handle,
+			   struct inode *inode, ocfs2_dinode *fe, 
+			   char *name, struct buffer_head *de_bh)
+{
+	struct inode *orphan_dir_inode = NULL;
+	struct buffer_head *orphan_dir_bh = NULL;
+	int status = 0;
+	int namelen;
+	ocfs2_dinode *orphan_fe;
+
+	LOG_ENTRY_ARGS("(inode->i_ino = %lu)\n", inode->i_ino);
+
+	namelen = strlen(name);
+
+	orphan_dir_inode = ocfs_get_system_file_inode(osb, 
+						      ORPHAN_DIR_SYSTEM_INODE, 
+						      -1);
+	if (!orphan_dir_inode) {
+		status = -EFAIL;
+		LOG_ERROR_STATUS(status);
+		goto leave;
+	}
+
+	status = ocfs_read_bh(osb, OCFS_I(orphan_dir_inode)->ip_blkno << osb->sb->s_blocksize_bits, &orphan_dir_bh, OCFS_BH_CACHED, orphan_dir_inode);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto leave;
+	}
+
 	status = ocfs_journal_access(handle, orphan_dir_bh, 
 				     OCFS_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
@@ -2042,16 +2166,19 @@
 		goto leave;
 	}
 
+	status = __ocfs_add_entry(handle, orphan_dir_inode, name, namelen, 
+				  inode, OCFS_I(inode)->ip_blkno, 
+				  orphan_dir_bh, de_bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto leave;
+	}
 
 	fe->i_flags |= OCFS2_ORPHANED_FL;
-
 leave:
 	if (orphan_dir_inode)
 		iput(orphan_dir_inode);
 
-	if (name)
-		kfree(name);
-
 	if (orphan_dir_bh)
 		brelse(orphan_dir_bh);
 

Modified: trunk/src/namei.h
===================================================================
--- trunk/src/namei.h	2004-10-02 01:01:24 UTC (rev 1542)
+++ trunk/src/namei.h	2004-10-02 01:08:08 UTC (rev 1543)
@@ -40,5 +40,13 @@
 int ocfs_orphan_del(ocfs_super *osb, ocfs_journal_handle *handle,
 		    struct inode *orphan_dir_inode, struct inode *inode,
 		    struct buffer_head *orphan_dir_bh);
+static inline int ocfs_match (int len, const char * const name, struct ocfs2_dir_entry * de)
+{
+	if (len != de->name_len)
+		return 0;
+	if (!de->inode)
+		return 0;
+	return !memcmp(name, de->name, len);
+}
 
 #endif /* OCFS2_NAMEI_H */

Modified: trunk/src/ocfs.h
===================================================================
--- trunk/src/ocfs.h	2004-10-02 01:01:24 UTC (rev 1542)
+++ trunk/src/ocfs.h	2004-10-02 01:08:08 UTC (rev 1543)
@@ -98,6 +98,13 @@
 /* convenience macro */
 
 #define OCFS_ASSERT(x)             do { if (!(x)) BUG(); } while (0)
+#define OCFS_ASSERT_RO(x)	\
+	do { \
+		if (!(x)) { \
+			printk(KERN_ERR "This should make the filesystem remount RO\n"); \
+			BUG(); \
+		} \
+	} while (0)
 
 
 #define BITCOUNT(x)     (((BX_(x)+(BX_(x)>>4)) & 0x0F0F0F0F) % 255)
@@ -266,6 +273,8 @@
 	(!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE))
 #define  IS_VALID_EXTENT_BLOCK(ptr)  \
 	(!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE))
+#define  IS_VALID_GROUP_DESC(ptr)    \
+	(!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE))
 
 /*
 ** Macros
@@ -360,7 +369,8 @@
 	__u32 uncommitted_holders;
 	__u8 lock_type;
 	struct rw_semaphore lock;
-	unsigned long readonly_state;
+#warning readonly_state is an int, *_bit requires unsigned long
+	int readonly_state;
 	ocfs_node_map readonly_map;
 };
 
@@ -815,15 +825,15 @@
 /* these three used as 'type' in ocfs_bitmap_update */
 #define  DISK_ALLOC_EXTENT_NODE   2
 #define  DISK_ALLOC_VOLUME        3
-#define  DISK_ALLOC_INODE	  4
 
 /* a bitmap update, currently used for freeing bits */
 typedef struct ocfs_bitmap_update
 {
-	__u64 length;
-	__u64 file_off;
-	__u32 type;
-	__s16 node_num;
+	u64 length;
+	u64 file_off;
+	u32 type;
+	s16 node_num;
+	u64 blkno;
 }
 ocfs_bitmap_update;
 

Modified: trunk/src/ocfs2_fs.h
===================================================================
--- trunk/src/ocfs2_fs.h	2004-10-02 01:01:24 UTC (rev 1542)
+++ trunk/src/ocfs2_fs.h	2004-10-02 01:08:08 UTC (rev 1543)
@@ -56,6 +56,7 @@
 #define OCFS2_SUPER_BLOCK_SIGNATURE	"OCFSV2"
 #define OCFS2_INODE_SIGNATURE		"INODE01"
 #define OCFS2_EXTENT_BLOCK_SIGNATURE	"EXBLK01"
+#define OCFS2_GROUP_DESC_SIGNATURE      "GROUP01"
 
 /* Compatibility flags */
 #define OCFS2_HAS_COMPAT_FEATURE(sb,mask)			\
@@ -96,7 +97,7 @@
 #define OCFS2_BITMAP_FL		(0x00000080)	/* Allocation bitmap */
 #define OCFS2_JOURNAL_FL	(0x00000100)	/* Node journal */
 #define OCFS2_DLM_FL		(0x00000200)	/* DLM area */
-	
+#define OCFS2_CHAIN_FL		(0x00000400)	/* Chain allocator */
 
 /* Limit of space in ocfs2_dir_entry */
 #define OCFS2_MAX_FILENAME_LENGTH       255
@@ -117,16 +118,13 @@
 enum {
 	BAD_BLOCK_SYSTEM_INODE = 0,
 	GLOBAL_INODE_ALLOC_SYSTEM_INODE,
-	GLOBAL_INODE_ALLOC_BITMAP_SYSTEM_INODE,
 	DLM_SYSTEM_INODE,
 #define OCFS2_FIRST_ONLINE_SYSTEM_INODE DLM_SYSTEM_INODE
 	GLOBAL_BITMAP_SYSTEM_INODE,
 	ORPHAN_DIR_SYSTEM_INODE,
 #define OCFS2_LAST_GLOBAL_SYSTEM_INODE ORPHAN_DIR_SYSTEM_INODE
 	EXTENT_ALLOC_SYSTEM_INODE,
-	EXTENT_ALLOC_BITMAP_SYSTEM_INODE,
 	INODE_ALLOC_SYSTEM_INODE,
-	INODE_ALLOC_BITMAP_SYSTEM_INODE,
 	JOURNAL_SYSTEM_INODE,
 	LOCAL_ALLOC_SYSTEM_INODE,
 	NUM_SYSTEM_INODES
@@ -134,10 +132,9 @@
 
 static char *ocfs2_system_inode_names[NUM_SYSTEM_INODES] = {
 	/* Global system inodes (single copy) */
-	/* The first three are only used from userspace mfks/tunefs */
+	/* The first two are only used from userspace mkfs/tunefs */
 	[BAD_BLOCK_SYSTEM_INODE]		"bad_blocks",
 	[GLOBAL_INODE_ALLOC_SYSTEM_INODE] 	"global_inode_alloc",
-	[GLOBAL_INODE_ALLOC_BITMAP_SYSTEM_INODE]	"global_inode_alloc_bitmap",
 
 	/* These are used by the running filesystem */
 	[DLM_SYSTEM_INODE]			"dlm",
@@ -146,9 +143,7 @@
 
 	/* Node-specific system inodes (one copy per node) */
 	[EXTENT_ALLOC_SYSTEM_INODE]		"extent_alloc:%04d",
-	[EXTENT_ALLOC_BITMAP_SYSTEM_INODE]	"extent_alloc_bitmap:%04d",
 	[INODE_ALLOC_SYSTEM_INODE]		"inode_alloc:%04d",
-	[INODE_ALLOC_BITMAP_SYSTEM_INODE]	"inode_alloc_bitmap:%04d",
 	[JOURNAL_SYSTEM_INODE]			"journal:%04d",
 	[LOCAL_ALLOC_SYSTEM_INODE]		"local_alloc:%04d"
 };
@@ -225,6 +220,12 @@
 /*10*/
 } ocfs2_extent_rec;	
 
+typedef struct _ocfs2_chain_rec {
+	__u32 c_free;	/* Number of free bits in this chain. */
+	__u32 c_total;	/* Number of total bits in this chain */
+	__u64 c_blkno;	/* Physical disk offset (blocks) of 1st group */
+} ocfs2_chain_rec;
+
 /*
  * On disk extent list for OCFS2 (node in the tree).  Note that this
  * is contained inside ocfs2_dinode or ocfs2_extent_block, so the
@@ -245,16 +246,30 @@
 } ocfs2_extent_list;
 
 /*
+ * On disk allocation chain list for OCFS2.  Note that this is
+ * contained inside ocfs2_dinode, so the offsets are relative to
+ * ocfs2_dinode.id2.i_chain.
+ */
+typedef struct _ocfs2_chain_list {
+/*00*/	__u16 cl_cpg;			/* Clusters per Block Group */
+	__u16 cl_bpc;			/* Bits per cluster */
+	__u16 cl_count;			/* Total chains in this list */
+	__u16 cl_next_free_rec;		/* Next unused chain slot */
+	__u64 cl_reserved1;
+/*10*/	ocfs2_chain_rec cl_recs[0];	/* Chain records */
+} ocfs2_chain_list;
+
+/*
  * On disk extent block (indirect block) for OCFS2
  */
 typedef struct _ocfs2_extent_block
 {
 /*00*/	__u8 h_signature[8];		/* Signature for verification */
-	__u64 h_suballoc_blkno;		/* Node suballocator offset,
-					   in blocks */
+	__u64 h_reserved1;
 /*10*/	__s16 h_suballoc_node;		/* Node suballocator this
 					   extent_header belongs to */
-	__u16 h_reserved1;
+	__u16 h_suballoc_bit;		/* Bit offset in suballocator
+					   block group */
 	__u32 h_reserved2;
 	__u64 h_blkno;			/* Offset on disk, in blocks */
 /*20*/	__u64 h_parent_blk;		/* Offset on disk, in blocks,
@@ -273,12 +288,9 @@
 typedef struct _ocfs2_disk_lock
 {
 /*00*/	__s16 dl_master;	/* Node number of current master */
-	__u16 dl_reserved1;
 	__u8 dl_level;		/* Lock level */
-	__u8 dl_reserved2[3];	/* Pad to u64 */
-	__u64 dl_reserved3;	/* was dl_seq_num */
-/*10*/	__u32 dl_reserved4[8];  /* was dl_node_map */
-/*30*/
+	__u8 dl_reserved1;
+/*04*/
 } ocfs2_disk_lock;
 
 /*
@@ -338,45 +350,43 @@
 typedef struct _ocfs2_dinode {
 /*00*/	__u8 i_signature[8];		/* Signature for validation */
 	__u32 i_generation;		/* Generation number */
-	__u16 i_reserved1;
 	__s16 i_suballoc_node;		/* Node suballocater this inode
 					   belongs to */
-/*10*/	__u64 i_suballoc_blkno;		/* Node suballocator offset,
-       					   in blocks */
-/*18*/	ocfs2_disk_lock i_disk_lock;	/* Lock structure */
-/*48*/	__u32 i_uid;			/* Owner UID */
+	__u16 i_suballoc_bit;		/* Bit offset in suballocator
+					   block group */
+/*10*/	ocfs2_disk_lock i_disk_lock;	/* Lock structure */
+/*14*/	__u32 i_clusters;		/* Cluster count */
+/*18*/	__u32 i_uid;			/* Owner UID */
 	__u32 i_gid;			/* Owning GID */
-/*50*/	__u64 i_size;			/* Size in bytes */
+/*20*/	__u64 i_size;			/* Size in bytes */
 	__u16 i_mode;			/* File mode */
 	__u16 i_links_count;		/* Links count */
 	__u32 i_flags;			/* File flags */
-/*60*/	__u64 i_atime;			/* Access time */
+/*30*/	__u64 i_atime;			/* Access time */
 	__u64 i_ctime;			/* Creation time */
-/*70*/	__u64 i_mtime;			/* Modification time */
+/*40*/	__u64 i_mtime;			/* Modification time */
 	__u64 i_dtime;			/* Deletion time */
-/*80*/	__u64 i_blkno;			/* Offset on disk, in blocks */
-	__u32 i_clusters;		/* Cluster count */
-	__u32 i_reserved2;
-/*90*/	__u64 i_last_eb_blk;		/* Pointer to last extent
+/*50*/	__u64 i_blkno;			/* Offset on disk, in blocks */
+	__u64 i_last_eb_blk;		/* Pointer to last extent
 					   block */
-	__u64 i_reserved3;
-/*A0*/	__u64 i_reserved4;
-	__u64 i_reserved5;
-/*B0*/	__u64 i_reserved6;
-	union {
-		__u64 i_pad1;		/* Generic way to refer to this 64bit
-					   union */
+/*60*/	__u64 i_reserved1[11];
+/*B8*/	union {
+		__u64 i_pad1;		/* Generic way to refer to this
+					   64bit union */
 		struct {
 			__u64 i_rdev;	/* Device number */
 		} dev1;
-		struct {		/* Info for bitmap system inodes */
+		struct {		/* Info for bitmap system
+					   inodes */
 			__u32 i_used;	/* Bits (ie, clusters) used  */
-			__u32 i_total;	/* Total bits (clusters) available */
+			__u32 i_total;	/* Total bits (clusters)
+					   available */
 		} bitmap1;
 	} id1;				/* Inode type dependant 1 */
 /*C0*/	union {
 		ocfs2_super_block i_super;
-                ocfs2_local_alloc i_lab;
+		ocfs2_local_alloc i_lab;
+		ocfs2_chain_list  i_chain;
 		ocfs2_extent_list i_list;
 	} id2;
 /* Actual on-disk size is one block */
@@ -394,8 +404,29 @@
 /* Actual on-disk length specified by rec_len */
 };
 
+/*
+ * On disk allocator group structure for OCFS2
+ */
+typedef struct _ocfs2_group_desc
+{
+/*00*/	__u8    bg_signature[8];        /* Signature for validation */
+	__u16   bg_size;                /* Size of included bitmap in
+					   bytes. */
+	__u16   bg_bits;                /* Bits represented by this
+					   group. */
+	__u16	bg_free_bits_count;     /* Free bits count */
+	__u16   bg_chain;               /* What chain I am in. */
+/*10*/	__u32   bg_generation;
+	__u32	bg_reserved1;
+	__u64   bg_next_group;          /* Next group in my list, in
+					   blocks */
+/*20*/	__u64   bg_parent_dinode;       /* dinode which owns me, in
+					   blocks */
+	__u64   bg_blkno;               /* Offset on disk, in blocks */
+/*30*/	__u64   bg_reserved2[2];
+/*40*/	__u8    bg_bitmap[0];
+} ocfs2_group_desc;
 
-
 #ifdef __KERNEL__
 static inline int ocfs2_extent_recs_per_inode(struct super_block *sb)
 {
@@ -407,6 +438,16 @@
 	return size / sizeof(struct _ocfs2_extent_rec);
 }
 
+static inline int ocfs2_chain_recs_per_inode(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct _ocfs2_dinode, id2.i_chain.cl_recs);
+
+	return size / sizeof(struct _ocfs2_chain_rec);
+}
+
 static inline int ocfs2_extent_recs_per_eb(struct super_block *sb)
 {
 	int size;
@@ -426,6 +467,16 @@
 
 	return size;
 }
+
+static inline int ocfs2_group_bitmap_size(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct _ocfs2_group_desc, bg_bitmap);
+
+	return size;
+}
 #else
 static inline int ocfs2_extent_recs_per_inode(int blocksize)
 {
@@ -437,6 +488,16 @@
 	return size / sizeof(struct _ocfs2_extent_rec);
 }
 
+static inline int ocfs2_chain_recs_per_inode(int blocksize)
+{
+	int size;
+
+	size = blocksize -
+		offsetof(struct _ocfs2_dinode, id2.i_chain.cl_recs);
+
+	return size / sizeof(struct _ocfs2_chain_rec);
+}
+
 static inline int ocfs2_extent_recs_per_eb(int blocksize)
 {
 	int size;
@@ -456,6 +517,16 @@
 
 	return size;
 }
+
+static inline int ocfs2_group_bitmap_size(int blocksize)
+{
+	int size;
+
+	size = blocksize -
+		offsetof(struct _ocfs2_group_desc, bg_bitmap);
+
+	return size;
+}
 #endif  /* __KERNEL__ */
 
 

Modified: trunk/src/ocfs_journal.h
===================================================================
--- trunk/src/ocfs_journal.h	2004-10-02 01:01:24 UTC (rev 1542)
+++ trunk/src/ocfs_journal.h	2004-10-02 01:08:08 UTC (rev 1543)
@@ -185,7 +185,10 @@
 
 	/* We know how many buffers (max) we'll have for this
 	 * transaction so we can just allocate an array of pointers at
-	 * the same time as the creation of this handle. */
+	 * the same time as the creation of this handle.
+	 *
+	 * NOTE: 'num_buffs' will always be zero if your transaction
+	 * is roll-forward only. */
 	int                 num_buffs;
 	struct buffer_head  **buffs;
 
@@ -216,8 +219,8 @@
 #define OCFS_HANDLE_SYNC			2
 /* This is really the right way to do things, but until we fix all the
  * code, it's a performance improvement for a handle which never
- * aborts. Should be set before passing any buffers to
- * journal_access! */
+ * aborts (always roll-forward). Should be set before passing any
+ * buffers to journal_access! */
 #define OCFS_HANDLE_ALWAYS_COMMITS		4
 
 static inline void ocfs_handle_free_all_copyout(ocfs_journal_handle *handle)
@@ -249,8 +252,9 @@
 }
 
 static inline int ocfs_handle_add_commit_bits(ocfs_journal_handle *handle,
-					      __u32 len, __u32 fileoff,
-					      __u32 nodenum, __u32 type)
+					      u32 len, u32 fileoff,
+					      u32 nodenum, u64 blkno,
+					      u32 type)
 {
 	int ret = 0;
 	if (!handle->commit_bits)
@@ -262,7 +266,7 @@
 		ret = ocfs_add_to_bitmap_free_head(handle->osb,
 						   handle->commit_bits,
 						   len, fileoff,
-						   nodenum, type);
+						   nodenum, blkno, type);
 	return ret;
 }
 
@@ -301,6 +305,9 @@
  *                          this handle.
  *  ocfs_commit_trans     - Complete a handle.
  *  ocfs_abort_trans      - Abort a handle.
+ *  ocfs_extend_trans     - Extend a handle by nblocks credits. This may 
+ *                          commit the handle to disk in the process, but will
+ *                          not release any locks taken during the transaction.
  *  ocfs_journal_access   - Notify the handle that we want to journal this 
  *                          buffer. Will have to call ocfs_journal_dirty once
  *                          we've actually dirtied it. Type is one of . or .
@@ -321,6 +328,9 @@
 				      int max_buffs);
 void                 ocfs_commit_trans(ocfs_journal_handle *handle);
 void                 ocfs_abort_trans(ocfs_journal_handle *handle);
+int                  ocfs_extend_trans(ocfs_journal_handle *handle, 
+				       int nblocks);
+
 /*
  * Create access is for when we get a newly created buffer and we're
  * not gonna read it off disk, but rather fill it ourselves. If it's
@@ -395,6 +405,9 @@
 			    (OCFS_SINGLE_FILE_EXTEND_CREDITS * 4) +	      \
 			    OCFS_JOURNAL_FUZZ_CREDITS)
 
+/* local alloc metadata change + main bitmap updates */
+#define OCFS_WINDOW_MOVE_CREDITS (1 + 8 + OCFS_JOURNAL_FUZZ_CREDITS)
+
 /* single file metadata updates * 3 because we might have to extend
  * the file alloc and file alloc bitmap files + possible update to
  * local bitmap. + 2 blocks for bits to set in the metadata alloc
@@ -402,18 +415,15 @@
 #define OCFS_FILE_EXTEND_CREDITS (OCFS_SINGLE_FILE_EXTEND_CREDITS * 3         \
 				  + 1 + 2 + 8 + OCFS_JOURNAL_FUZZ_CREDITS)
 
-
 /* Now that we journal bitmap writes, this might get a bit more
  * complicated, use this function to determine how many credits are
  * needed for an extend. Unfortunately, we're in bytes because the
  * rest of the file system is. 
  */
 static inline int ocfs_calc_extend_credits(struct super_block *sb,
-					   __u32 bytes_wanted)
+					   __u32 bits_wanted)
 {
 	int bitmap_blocks, sysfile_bitmap_blocks;
-	unsigned int bits_wanted;
-	bits_wanted = ocfs_clusters_for_bytes(sb, bytes_wanted);
 	/* take advantage of the fact that we always allocate in one 
 	 * large chunk. */
 	bitmap_blocks = ocfs_blocks_for_bits(sb, bits_wanted) + 1;
@@ -434,16 +444,25 @@
 	return (bitmap_blocks + sysfile_bitmap_blocks + OCFS_FILE_EXTEND_CREDITS);
 }
 
-static inline int ocfs_calc_symlink_credits(struct super_block *sb,
-					    int size)
+static inline int ocfs_calc_symlink_credits(struct super_block *sb)
 {
 	/* get our fuzz from mknod and extend credits. */
 	int blocks = OCFS_MKNOD_CREDITS + 1;
 	
-	blocks += ocfs_calc_extend_credits(sb, size);
+	blocks += ocfs_calc_extend_credits(sb, 1);
+	blocks += ocfs_clusters_to_blocks(sb, 1);
 
-	blocks += (size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+	return(blocks);
+}
 
+static inline int ocfs_calc_group_alloc_credits(struct super_block *sb,
+						unsigned int cpg)
+{
+	int blocks;
+	int bitmap_blocks = ocfs_blocks_for_bits(sb, cpg) + 1;
+	/* parent inode update + new block group header + bitmap inode update 
+	   + bitmap blocks affected */
+	blocks = 1 + 1 + 1 + bitmap_blocks + OCFS_JOURNAL_FUZZ_CREDITS;
 	return(blocks);
 }
 
@@ -452,7 +471,7 @@
 
 /* the file entry + the locknode + possibily the parent dirnode + fuzz */
 /* ok, these credits are messed up and need to be re calculated. */
-#define OCFS_FILE_DELETE_CREDITS  (1 + 1 + 1 + OCFS_JOURNAL_FUZZ_CREDITS)
+#define OCFS_FILE_DELETE_CREDITS  (2 + 1 + 1 + 1 + OCFS_JOURNAL_FUZZ_CREDITS)
 
 /* fe change, locknode change, dirnode head, times two plus a possible
  * delete, plus a possible dirnode addition in insert_file, and fuzz */

Modified: trunk/src/ocfs_log.h
===================================================================
--- trunk/src/ocfs_log.h	2004-10-02 01:01:24 UTC (rev 1542)
+++ trunk/src/ocfs_log.h	2004-10-02 01:08:08 UTC (rev 1543)
@@ -112,8 +112,8 @@
 #define OCFS_DEBUG_CONTEXT_FILE        0x00000200	/* file.c     */
 #define OCFS_DEBUG_CONTEXT_INODE       0x00000400	/* inode.c    */
 #define OCFS_DEBUG_CONTEXT_JOURNAL     0x00000800	/* journal.c  */
-#define OCFS_DEBUG_CONTEXT_UNUSED1     0x00001000	/*            */
-#define OCFS_DEBUG_CONTEXT_UNUSED2     0x00002000	/*            */
+#define OCFS_DEBUG_CONTEXT_CHAINALLOC  0x00001000	/*            */
+#define OCFS_DEBUG_CONTEXT_LOCALALLOC  0x00002000	/*            */
 #define OCFS_DEBUG_CONTEXT_SYSFILE     0x00004000	/* sysfile.c  */
 #define OCFS_DEBUG_CONTEXT_VOLCFG      0x00008000	/* volcfg.c   */
 #define OCFS_DEBUG_CONTEXT_DCACHE      0x00010000	/* dcache.c   */

Added: trunk/src/suballoc.c
===================================================================
--- trunk/src/suballoc.c	2004-10-02 01:01:24 UTC (rev 1542)
+++ trunk/src/suballoc.c	2004-10-02 01:08:08 UTC (rev 1543)
@@ -0,0 +1,1091 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * suballoc.c
+ *
+ * metadata alloc and free
+ * Inspired by ext3 block groups.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Authors: Mark Fasheh, Kurt Hackel, Joel Becker, Sunil Mushran, 
+ *          Wim Coekaerts, Manish Singh
+ */
+
+#include "ocfs_compat.h"
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#include "ocfs_log.h"
+#include "ocfs.h"
+
+#include "alloc.h"
+#include "dlm.h"
+#include "util.h"
+#include "suballoc.h"
+#include "sysfile.h"
+
+#include "ocfs_journal.h"
+#include "buffer_head_io.h"
+
+#define OCFS_DEBUG_CONTEXT    OCFS_DEBUG_CONTEXT_CHAINALLOC
+
+static inline void debug_bg(ocfs2_group_desc *bg);
+static inline void debug_suballoc_inode(ocfs2_dinode *fe);
+static inline u16 ocfs2_find_victim_chain(ocfs2_chain_list *cl);
+static int ocfs2_block_group_fill(ocfs_journal_handle *handle, 
+				  struct inode *alloc_inode,
+				  struct buffer_head *bg_bh,
+				  u64 group_blkno,
+				  u16 my_chain,
+				  ocfs2_chain_list *cl);
+static int ocfs_block_group_alloc(ocfs_super *osb, 
+				  struct inode *alloc_inode,
+				  struct buffer_head *bh);
+static int ocfs_reserve_suballoc_bits(ocfs_super *osb, 
+				      ocfs_journal_handle *handle,
+				      ocfs2_alloc_context *ac);
+static int ocfs_claim_suballoc_bits(ocfs_super *osb,
+				    ocfs_journal_handle *handle,
+				    ocfs2_alloc_context *ac,
+				    u32 bits_wanted,
+				    u16 *bit_off,
+				    unsigned int *num_bits,
+				    u64 *bg_blkno);
+static int ocfs_block_group_find_clear_bits(ocfs_super *osb, 
+					    ocfs2_group_desc *bg,
+					    unsigned int bits_wanted, 
+					    u16 *bit_off,
+					    u16 *bits_found);
+static inline int ocfs_block_group_set_bits(ocfs_journal_handle *handle,
+					    ocfs2_group_desc *bg, 
+					    struct buffer_head *group_bh,
+					    unsigned int bit_off, 
+					    unsigned int num_bits);
+static int ocfs_relink_block_group(ocfs_journal_handle *handle,
+				   struct buffer_head *fe_bh,
+				   struct buffer_head *bg_bh,
+				   struct buffer_head *prev_bg_bh,
+				   u16 chain);
+static inline int ocfs_block_group_reasonably_empty(ocfs2_group_desc *bg);
+static inline u64 ocfs_which_suballoc_group(u64 block, unsigned int bit);
+
+static int ocfs2_block_group_fill(ocfs_journal_handle *handle, 
+				  struct inode *alloc_inode,
+				  struct buffer_head *bg_bh,
+				  u64 group_blkno,
+				  u16 my_chain,
+				  ocfs2_chain_list *cl)
+{
+	int status = 0;
+	ocfs2_group_desc *bg = (ocfs2_group_desc *) bg_bh->b_data;
+	struct super_block * sb = alloc_inode->i_sb;
+
+	LOG_ENTRY();
+
+	OCFS_ASSERT(((unsigned long long) bg_bh->b_blocknr) == group_blkno);
+
+	set_buffer_uptodate(bg_bh);
+	SET_BH_SEQNUM(alloc_inode, bg_bh);
+	status = ocfs_journal_access(handle, 
+				     bg_bh, 
+				     OCFS_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	memset(bg, 0, sb->s_blocksize);
+	strcpy (bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
+	bg->bg_generation = cpu_to_le32(alloc_inode->i_generation);
+	bg->bg_size = ocfs2_group_bitmap_size(sb);
+	bg->bg_bits = (u32) cl->cl_cpg * (u32) cl->cl_bpc;
+	bg->bg_chain = my_chain;
+	bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
+	bg->bg_parent_dinode = OCFS_I(alloc_inode)->ip_blkno;
+	bg->bg_blkno = group_blkno;
+	/* set the 1st bit in the bitmap to account for the descriptor block */
+	set_bit(0, bg->bg_bitmap);
+	bg->bg_free_bits_count = bg->bg_bits - 1;
+
+	status = ocfs_journal_dirty(handle, bg_bh);
+	if (status < 0)
+		LOG_ERROR_STATUS(status);
+#warning "we need to zero out the other blocks in the group! (only inode alloc?)"
+bail:
+	LOG_EXIT_STATUS(status);
+	return status;
+}
+
+static inline u16 ocfs2_find_smallest_chain(ocfs2_chain_list *cl)
+{
+	u16 curr, best;
+
+	best = curr = 0;
+	while (curr < cl->cl_count) {
+		if (cl->cl_recs[best].c_total > cl->cl_recs[curr].c_total)
+			best = curr;
+		curr++;
+	}
+	return best;
+}
+#ifdef OCFS_BG_ZERO
+static struct buffer_head **ocfs_block_group_zero_start(ocfs_super *osb,
+						       u32 bit_off,
+						       u16 clusters)
+{
+	struct buffer_head **bhs = NULL;
+	unsigned int blocks;
+	u64 blkno;
+	int i;
+
+	LOG_ENTRY();
+
+	blocks = ocfs_clusters_to_blocks(osb->sb, (u32) clusters) - (u64) 1;
+	bhs = kmalloc(blocks * sizeof(struct buffer_head *), GFP_KERNEL);
+	if (!bhs) {
+		LOG_ERROR_STATUS(-ENOMEM);
+		goto bail;
+	}
+	memset(bhs, 0, blocks * sizeof(struct buffer_head *));
+
+	blkno = ocfs_clusters_to_blocks(osb->sb, bit_off) + (u64) 1;
+	for(i = 0; i < blocks; i++) {
+		bhs[i] = sb_getblk(osb->sb, ((u64) i + blkno));
+		if (!bhs[i]) {
+			kfree(bhs);
+			bhs = NULL;
+			LOG_ERROR_STATUS(-EIO);
+			goto bail;
+		}
+		lock_buffer(bhs[i]);
+		OCFS_ASSERT(!buffer_jbd(bhs[i]));
+
+		memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
+		set_buffer_uptodate(bhs[i]);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)		
+		clear_buffer_dirty(bhs[i]);
+#else
+		mark_buffer_clean(bhs[i]);
+#endif
+
+		bhs[i]->b_end_io = ocfs_end_buffer_io_sync;
+		submit_bh(WRITE, bhs[i]);
+	}
+bail:
+	LOG_EXIT();
+	return bhs;
+}
+
+static void ocfs_block_group_zero_wait(ocfs_super *osb, 
+				       struct buffer_head **bhs,
+				       u16 clusters)
+{
+	unsigned int blocks = 
+		ocfs_clusters_to_blocks(osb->sb, (u32) clusters) - (u64) 1;
+
+	do {
+		blocks--;
+		wait_on_buffer(bhs[i]);
+		brelse(bhs[i]);
+	} while (blocks);
+
+	kfree(bhs);
+	return;
+}
+#endif
+/*
+ * We expect the block group allocator to already be locked.
+ */
+static int ocfs_block_group_alloc(ocfs_super *osb, 
+				  struct inode *alloc_inode,
+				  struct buffer_head *bh)
+{
+	int status, credits;
+	ocfs2_dinode *fe = (ocfs2_dinode *) bh->b_data;
+	ocfs2_chain_list *cl;
+	ocfs2_alloc_context *ac = NULL;
+	ocfs_journal_handle *handle = NULL;
+	u32 bit_off, num_bits;
+	u16 alloc_rec;
+	u64 bg_blkno;
+	struct buffer_head *bg_bh = NULL;
+	ocfs2_group_desc *bg;
+#ifdef OCFS_BG_ZERO
+	struct buffer_head **zero_bhs = NULL;
+#endif
+
+	LOG_ENTRY();
+
+	handle = ocfs_alloc_handle(osb);
+	if (!handle) {
+		status = -ENOMEM;
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	cl = &fe->id2.i_chain;
+	status = ocfs_reserve_bits(osb, 
+				   handle, 
+				   cl->cl_cpg, 
+				   &ac);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	credits = ocfs_calc_group_alloc_credits(osb->sb, cl->cl_cpg);
+	handle = ocfs_start_trans(osb, handle, credits);
+	if (!handle) {
+		status = -ENOMEM;
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	ocfs_handle_set_always_commits(handle, 1);
+
+	status = ocfs_claim_bits(osb, 
+				 handle, 
+				 ac, 
+				 cl->cl_cpg, 
+				 &bit_off, 
+				 &num_bits);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+#ifdef OCFS_BG_ZERO
+	status = ocfs_block_group_zero_start(osb, bit_off, cl->cpg, &zero_bhs);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+#endif
+	alloc_rec = ocfs2_find_smallest_chain(cl);
+
+	/* setup the group */
+	bg_blkno = ocfs_clusters_to_blocks(osb->sb, bit_off);
+	LOG_TRACE_ARGS("new descriptor, record %u, at block %llu\n", 
+		       alloc_rec, bg_blkno);
+
+	bg_bh = sb_getblk(osb->sb, bg_blkno);
+	if (!bg_bh) {
+		status = -EIO;
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	status = ocfs2_block_group_fill(handle, 
+					alloc_inode, 
+					bg_bh, 
+					bg_blkno,
+					alloc_rec, 
+					cl);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	bg = (ocfs2_group_desc *) bg_bh->b_data;
+
+	status = ocfs_journal_access(handle, bh, OCFS_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	cl->cl_recs[alloc_rec].c_free  += bg->bg_free_bits_count;
+	cl->cl_recs[alloc_rec].c_total += bg->bg_bits;
+	cl->cl_recs[alloc_rec].c_blkno  = bg_blkno;
+	if (cl->cl_next_free_rec < cl->cl_count)
+		cl->cl_next_free_rec++;
+
+	fe->id1.bitmap1.i_used  += (bg->bg_bits - bg->bg_free_bits_count);
+	fe->id1.bitmap1.i_total += bg->bg_bits;
+	fe->i_clusters += cl->cl_cpg;
+
+	status = ocfs_journal_dirty(handle, bh);
+		if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	down (&(OCFS_I(alloc_inode)->ip_sem));
+	OCFS_I(alloc_inode)->ip_alloc_size = 
+		(u64)fe->i_clusters << osb->s_clustersize_bits;
+	fe->i_size = OCFS_I(alloc_inode)->ip_alloc_size;
+	OCFS_I(alloc_inode)->u.ip_bitinfo.used_bits = fe->id1.bitmap1.i_used;
+	OCFS_I(alloc_inode)->u.ip_bitinfo.total_bits = fe->id1.bitmap1.i_total;
+	up (&(OCFS_I(alloc_inode)->ip_sem));
+	alloc_inode->i_size = fe->i_size;
+	alloc_inode->i_blocks = (alloc_inode->i_size + osb->sb->s_blocksize - 1) >> osb->sb->s_blocksize_bits;
+
+	status = 0;
+bail:
+#ifdef OCFS_BG_ZERO
+	if (zero_bhs)
+		ocfs_block_group_zero_wait(osb, zero_bhs, cl->cl_cpg);
+#endif
+	if (handle)
+		ocfs_commit_trans(handle);
+
+	if (ac)
+		ocfs_free_alloc_context(ac);
+
+	if (bg_bh)
+		brelse(bg_bh);
+
+	LOG_EXIT_STATUS(status);
+	return status;
+}
+
+static int ocfs_reserve_suballoc_bits(ocfs_super *osb, 
+				      ocfs_journal_handle *handle,
+				      ocfs2_alloc_context *ac)
+{	/* Lock ac->ac_inode and make sure it has ac->ac_bits_wanted free bits. */
+	int status;
+	u32 bits_wanted = ac->ac_bits_wanted;
+	struct inode *alloc_inode = ac->ac_inode;
+	struct buffer_head *bh = NULL;
+	ocfs2_dinode *fe;
+
+	LOG_ENTRY();
+
+	OCFS_ASSERT(!(handle->flags & OCFS_HANDLE_STARTED)); /* handle must not be started yet */
+
+	status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE, 
+				   0, &bh, alloc_inode);
+	if (status < 0) {
+		if (status != -EINTR)	/* -EINTR is passed up without being logged */
+			LOG_ERROR_STATUS (status);
+		goto bail;
+	}
+	ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 
+			     0, alloc_inode);
+	ocfs_handle_add_inode(handle, alloc_inode);
+
+	fe = (ocfs2_dinode *) bh->b_data;
+	OCFS_ASSERT_RO(IS_VALID_FILE_ENTRY(fe));
+	OCFS_ASSERT_RO(fe->i_flags & OCFS2_CHAIN_FL);
+
+	if (bits_wanted > (le32_to_cpu(fe->id1.bitmap1.i_total) - 
+			   le32_to_cpu(fe->id1.bitmap1.i_used))) {
+		status = ocfs_block_group_alloc(osb, alloc_inode, bh); /* too few free bits: add a group */
+		if (status < 0) {
+			LOG_ERROR_STATUS(status);
+			goto bail;
+		}
+		/* You should never ask for this much metadata */
+		OCFS_ASSERT(bits_wanted <= 
+			    (le32_to_cpu(fe->id1.bitmap1.i_total) 
+			     - le32_to_cpu(fe->id1.bitmap1.i_used)));
+	}
+
+	get_bh(bh);	/* extra reference kept in ac->ac_bh; the brelse() at bail drops the local one */
+	ac->ac_bh = bh;
+	status = 0;
+bail:
+	if (bh)
+		brelse(bh);
+
+	LOG_EXIT_STATUS(status);
+	return status;	/* 0 on success, negative error code otherwise */
+}
+
+int ocfs_reserve_new_metadata(ocfs_super *osb, 
+			      ocfs_journal_handle *handle,
+			      ocfs2_dinode *fe,
+			      ocfs2_alloc_context **ac)
+{	/* Build *ac and reserve metadata bits for extending fe; *ac is NULL on failure. */
+	int status;
+	struct inode *alloc_inode = NULL;
+
+	*ac = kmalloc(sizeof(ocfs2_alloc_context), GFP_KERNEL);
+	if (!(*ac)) {
+		status = -ENOMEM;
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	memset(*ac, 0, sizeof(ocfs2_alloc_context));
+	/* Our file data alloc path is such a mess that I really feel
+	 * comfortable just always over-reserving here. */
+	(*ac)->ac_bits_wanted = 2 * ocfs2_extend_meta_needed(fe);
+	(*ac)->ac_handle = handle;
+	(*ac)->ac_which = OCFS_AC_USE_META;
+
+#ifndef OCFS_USE_ALL_METADATA_SUBALLOCATORS
+	alloc_inode = ocfs_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0);
+#else	/* select the extent allocator by this node's number */
+	alloc_inode = ocfs_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, osb->node_num);
+#endif
+	if (!alloc_inode) {
+		status = -ENOMEM;
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	(*ac)->ac_inode = igrab(alloc_inode);	/* the context gets its own inode reference */
+
+	status = ocfs_reserve_suballoc_bits(osb, handle, (*ac));
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	status = 0;
+bail:
+	if ((status < 0) && *ac) {	/* never hand a half-built context back to the caller */
+		ocfs_free_alloc_context(*ac);
+		*ac = NULL;
+	}
+
+	if (alloc_inode)
+		iput(alloc_inode);	/* drop the reference from the system-file lookup */
+
+	LOG_EXIT_STATUS(status);
+	return status;	/* 0 on success, negative error code otherwise */
+}
+
+int ocfs_reserve_new_inode(ocfs_super *osb, 
+			   ocfs_journal_handle *handle,
+			   ocfs2_alloc_context **ac)
+{	/* Build *ac and reserve one bit in this node's inode allocator; *ac is NULL on failure. */
+	int status;
+	struct inode *alloc_inode = NULL;
+
+	*ac = kmalloc(sizeof(ocfs2_alloc_context), GFP_KERNEL);
+	if (!(*ac)) {
+		status = -ENOMEM;
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	memset(*ac, 0, sizeof(ocfs2_alloc_context));
+	(*ac)->ac_bits_wanted = 1;	/* one bit for the single new inode */
+	(*ac)->ac_handle = handle;
+	(*ac)->ac_which = OCFS_AC_USE_INODE;
+
+	alloc_inode = ocfs_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, osb->node_num);
+	if (!alloc_inode) {
+		status = -ENOMEM;
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	(*ac)->ac_inode = igrab(alloc_inode);	/* the context gets its own inode reference */
+
+	status = ocfs_reserve_suballoc_bits(osb, handle, *ac);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	status = 0;
+bail:
+	if ((status < 0) && *ac) {	/* never hand a half-built context back to the caller */
+		ocfs_free_alloc_context(*ac);
+		*ac = NULL;
+	}
+
+	if (alloc_inode)
+		iput(alloc_inode);	/* drop the reference from the system-file lookup */
+
+	LOG_EXIT_STATUS(status);
+	return status;	/* 0 on success, negative error code otherwise */
+}
+
+static int ocfs_block_group_find_clear_bits(ocfs_super *osb, 
+					    ocfs2_group_desc *bg,
+					    unsigned int bits_wanted, 
+					    u16 *bit_off,
+					    u16 *bits_found)
+{	/* Find up to bits_wanted contiguous clear bits in bg; -ENOSPC if bg has none. */
+	void *bitmap;
+	u16 best_offset, best_size;	/* longest run of clear bits seen so far */
+	int offset, start, found, status = 0;
+
+	OCFS_ASSERT_RO(IS_VALID_GROUP_DESC(bg));
+
+	found = start = best_offset = best_size = 0;
+	bitmap = bg->bg_bitmap;
+
+	while((offset = find_next_zero_bit(bitmap, 
+					   bg->bg_bits, 
+					   start)) != -1) {
+		if (offset == bg->bg_bits)	/* no clear bit at or after start */
+			break;
+
+		if (offset == start) {
+			/* we found a zero */
+			found++;
+			start++;
+		} else {
+			/* got a zero after some ones */
+			found = 1;
+			start = offset + 1;
+		}
+		if (found > best_size) {	/* current run is the longest so far */
+			best_size = found;
+			best_offset = start - found;
+		}
+		/* we got everything we needed */
+		if (found == bits_wanted) {
+			/* LOG_TRACE_STR("Found it all!"); */
+			break;
+		}
+	}
+
+	if (found == bits_wanted) {
+		*bit_off = start - found;
+		*bits_found = found;
+	} else if (best_size) {	/* test the run length: the best run may start at bit 0 */
+		*bit_off = best_offset;
+		*bits_found = best_size;
+	} else {
+		status = -ENOSPC;
+		LOG_ERROR_STATUS(status);
+	}
+
+	return status;	/* 0 with *bit_off / *bits_found set, or -ENOSPC */
+}
+
+static inline int ocfs_block_group_set_bits(ocfs_journal_handle *handle,
+					    ocfs2_group_desc *bg, 
+					    struct buffer_head *group_bh,
+					    unsigned int bit_off, 
+					    unsigned int num_bits)
+{	/* Mark num_bits bits starting at bit_off as used in bg, journalling group_bh. */
+	int status;
+	void *bitmap = bg->bg_bitmap;
+
+	LOG_ENTRY();
+
+	OCFS_ASSERT_RO(IS_VALID_GROUP_DESC(bg));
+	OCFS_ASSERT(bg->bg_free_bits_count >= num_bits); /* cannot claim more than is free */
+
+	LOG_TRACE_ARGS("block_group_set_bits: off = %u, num = %u\n", bit_off, 
+		       num_bits);
+
+	status = ocfs_journal_access(handle, 
+				     group_bh, 
+				     OCFS_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	bg->bg_free_bits_count -= num_bits;	/* must precede the loop, which consumes num_bits */
+
+	while(num_bits--)
+		set_bit(bit_off++, bitmap);
+
+	status = ocfs_journal_dirty(handle, 
+				    group_bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+bail:
+	LOG_EXIT_STATUS(status);
+	return status;	/* result of the last journal call; negative on failure */
+}
+
+/* Return the index of the in-use chain record with the largest c_free (lowest index wins ties). */
+static inline u16 ocfs2_find_victim_chain(ocfs2_chain_list *cl)
+{
+	u16 curr, best;
+
+	OCFS_ASSERT(cl->cl_next_free_rec);	/* at least one record must be in use */
+
+	best = curr = 0;
+	while (curr < cl->cl_next_free_rec) {
+		if (cl->cl_recs[curr].c_free > cl->cl_recs[best].c_free)
+			best = curr;
+		curr++;
+	}
+
+	OCFS_ASSERT(best < cl->cl_next_free_rec);
+	return best;
+}
+
+static int ocfs_relink_block_group(ocfs_journal_handle *handle,
+				   struct buffer_head *fe_bh,
+				   struct buffer_head *bg_bh,
+				   struct buffer_head *prev_bg_bh,
+				   u16 chain)
+{	/* Move group bg to the head of chain: unlink it after prev_bg, point the chain record at it. */
+	int status;
+	/* there is a really tiny chance the journal calls could fail,
+	 * but we wouldn't want inconsistent blocks in *any* case. */
+	u64 fe_ptr, bg_ptr, prev_bg_ptr;	/* original link values, restored on failure */
+	ocfs2_dinode *fe = (ocfs2_dinode *) fe_bh->b_data;
+	ocfs2_group_desc *bg = (ocfs2_group_desc *) bg_bh->b_data;
+	ocfs2_group_desc *prev_bg = (ocfs2_group_desc *) prev_bg_bh->b_data;
+
+	OCFS_ASSERT_RO(IS_VALID_FILE_ENTRY(fe));
+	OCFS_ASSERT_RO(IS_VALID_GROUP_DESC(bg));
+	OCFS_ASSERT_RO(IS_VALID_GROUP_DESC(prev_bg));
+
+	printk("In suballoc %llu, chain %u, move group %llu to top, "
+	       "prev = %llu\n", fe->i_blkno, chain, bg->bg_blkno, 
+	       prev_bg->bg_blkno);
+
+	fe_ptr = fe->id2.i_chain.cl_recs[chain].c_blkno;
+	bg_ptr = bg->bg_next_group;
+	prev_bg_ptr = prev_bg->bg_next_group;
+
+	status = ocfs_journal_access(handle, prev_bg_bh, 
+				     OCFS_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	prev_bg->bg_next_group = bg->bg_next_group;	/* step 1: predecessor skips over bg */
+
+	status = ocfs_journal_dirty(handle, prev_bg_bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	status = ocfs_journal_access(handle, bg_bh, OCFS_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno; /* step 2: bg points at old head */
+
+	status = ocfs_journal_dirty(handle, bg_bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	status = ocfs_journal_access(handle, fe_bh, OCFS_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno; /* step 3: bg becomes the chain head */
+
+	status = ocfs_journal_dirty(handle, fe_bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	status = 0;
+bail:
+	if (status < 0) {	/* put all three in-memory links back as they were on entry */
+		fe->id2.i_chain.cl_recs[chain].c_blkno = fe_ptr;
+		bg->bg_next_group = bg_ptr;
+		prev_bg->bg_next_group = prev_bg_ptr;
+	}
+
+	LOG_EXIT_STATUS(status);
+	return status;	/* 0 on success, negative error code otherwise */
+}
+
+static inline int ocfs_block_group_reasonably_empty(ocfs2_group_desc *bg)
+{	/* Nonzero when at least half (rounded down) of the group's bits are free. */
+	return bg->bg_free_bits_count >= (bg->bg_bits / 2);
+}
+
+/* Claims up to bits_wanted contiguous bits via ac; sets *bit_off, *num_bits and *bg_blkno. */
+static int ocfs_claim_suballoc_bits(ocfs_super *osb,
+				    ocfs_journal_handle *handle,
+				    ocfs2_alloc_context *ac,
+				    u32 bits_wanted,
+				    u16 *bit_off,
+				    unsigned int *num_bits,
+				    u64 *bg_blkno)
+{
+	int status, groups_read;
+	struct inode *alloc_inode = ac->ac_inode;
+	struct buffer_head *group_bh = NULL;
+	struct buffer_head *prev_group_bh = NULL;
+	ocfs2_chain_list *cl;
+	ocfs2_dinode *fe;
+	ocfs2_group_desc *bg;
+	u16 chain, tmp_bits;
+	u64 next_group;
+
+	LOG_ENTRY();
+
+	OCFS_ASSERT(ac->ac_bits_given < ac->ac_bits_wanted);
+	OCFS_ASSERT(ac->ac_handle == handle);
+	OCFS_ASSERT(bits_wanted <= (ac->ac_bits_wanted - ac->ac_bits_given));
+	OCFS_ASSERT(ac->ac_bh);
+
+	fe = (ocfs2_dinode *) ac->ac_bh->b_data;
+	OCFS_ASSERT_RO(IS_VALID_FILE_ENTRY(fe));
+	OCFS_ASSERT_RO(fe->id1.bitmap1.i_used < fe->id1.bitmap1.i_total);
+
+	cl = (ocfs2_chain_list *) &fe->id2.i_chain;
+
+	chain = ocfs2_find_victim_chain(cl);	/* the chain with the most free bits */
+
+	LOG_TRACE_ARGS("trying to alloc %u bits from chain %u, inode %llu\n",
+		       bits_wanted, chain, OCFS_I(alloc_inode)->ip_blkno);
+
+	status = ocfs_read_bh(osb, 
+			      cl->cl_recs[chain].c_blkno << osb->sb->s_blocksize_bits, 
+			      &group_bh, 
+			      OCFS_BH_CACHED, 
+			      alloc_inode);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	bg = (ocfs2_group_desc *) group_bh->b_data;
+	OCFS_ASSERT_RO(IS_VALID_GROUP_DESC(bg));
+
+	/* for now, the chain search is a bit simplistic. We just use
+	 * the 1st group with any empty bits. */
+	groups_read = 1;
+	while (!bg->bg_free_bits_count) {
+		/*
+		 * This means we've walked off the end of a chain that
+		 * we thought had bits, but didn't.  While this
+		 * _could_ be a code error, it is more likely to be
+		 * corruption on disk.
+		 */
+		OCFS_ASSERT_RO(bg->bg_next_group);
+
+		if (prev_group_bh) {
+			brelse(prev_group_bh);
+			prev_group_bh = NULL;
+		}
+		next_group = bg->bg_next_group;
+		prev_group_bh = group_bh;	/* keep the predecessor for a possible relink */
+		group_bh = NULL;
+		status = ocfs_read_bh(osb, 
+				      next_group << osb->sb->s_blocksize_bits, 
+				      &group_bh, 
+				      OCFS_BH_CACHED, 
+				      alloc_inode);
+		if (status < 0) {
+			LOG_ERROR_STATUS(status);
+			goto bail;
+		}
+		groups_read++;
+		bg = (ocfs2_group_desc *) group_bh->b_data;
+		OCFS_ASSERT_RO(IS_VALID_GROUP_DESC(bg));
+	}
+
+#define OCFS2_BG_RELINK_TRIGGER 1
+	/*
+	 * Keep track of previous block descriptor read. When
+	 * we find a target, if we have read more than X
+	 * number of descriptors, and the target is reasonably
+	 * empty, relink him to top of his chain.
+	 *
+	 * prev_bg->bg_next_group = bg->bg_next_group;
+	 * bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
+	 * fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno; 
+	 *
+	 * We've read 0 extra blocks and only send one more to
+	 * the transaction, yet the next guy to search has a
+	 * much easier time.
+	 */
+	if ((prev_group_bh) 
+	    && (groups_read > OCFS2_BG_RELINK_TRIGGER) 
+	    && (ocfs_block_group_reasonably_empty(bg))) {
+		status = ocfs_relink_block_group(handle, ac->ac_bh, group_bh, 
+						 prev_group_bh, chain);
+		if (status < 0) {
+			LOG_ERROR_STATUS(status);
+			goto bail;
+		}
+	}
+
+	status = ocfs_block_group_find_clear_bits(osb, 
+						  bg, 
+						  bits_wanted, 
+						  bit_off,
+						  &tmp_bits);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	*num_bits = tmp_bits;	/* widen the u16 result into the caller's unsigned int */
+
+	OCFS_ASSERT(*num_bits);
+
+	/* we found some. set the info on dinode, chainlist and then
+	 * the group */
+	status = ocfs_journal_access(handle, 
+				     ac->ac_bh, 
+				     OCFS_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	fe->id1.bitmap1.i_used += *num_bits;
+	cl->cl_recs[chain].c_free -= *num_bits;
+
+	status = ocfs_journal_dirty(handle, 
+				    ac->ac_bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	status = ocfs_block_group_set_bits(handle, 
+					   bg, 
+					   group_bh, 
+					   *bit_off, 
+					   *num_bits);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	LOG_TRACE_ARGS("Allocated %u bits from suballocator %llu\n", 
+		       *num_bits, fe->i_blkno);
+
+	down (&(OCFS_I(alloc_inode)->ip_sem));	/* ip_sem is held while the in-memory count is updated */
+	OCFS_I(alloc_inode)->u.ip_bitinfo.used_bits = fe->id1.bitmap1.i_used;
+	up (&(OCFS_I(alloc_inode)->ip_sem));
+	*bg_blkno = bg->bg_blkno;
+bail:
+	if (group_bh)
+		brelse(group_bh);
+	if (prev_group_bh)
+		brelse(prev_group_bh);
+
+	LOG_EXIT_STATUS(status);
+	return status;	/* result of the last call made; negative on failure */
+}
+
+int ocfs_claim_metadata(ocfs_super *osb,
+			ocfs_journal_handle *handle,
+			ocfs2_alloc_context *ac,
+			u32 bits_wanted,
+			u16 *suballoc_bit_start,
+			unsigned int *num_bits,
+			u64 *blkno_start)
+{	/* Claim up to bits_wanted bits from a metadata reservation; report first bit, count, block. */
+	int status;
+	u64 bg_blkno;
+
+	OCFS_ASSERT(ac);
+	OCFS_ASSERT(ac->ac_bits_wanted >= (ac->ac_bits_given + bits_wanted)); /* stay within reservation */
+	OCFS_ASSERT(ac->ac_which == OCFS_AC_USE_META);
+	OCFS_ASSERT(ac->ac_handle == handle);
+
+	status = ocfs_claim_suballoc_bits(osb,
+					  handle,
+					  ac,
+					  bits_wanted,
+					  suballoc_bit_start,
+					  num_bits,
+					  &bg_blkno);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	*blkno_start = bg_blkno + (u64) *suballoc_bit_start; /* bit N is block bg_blkno + N */
+	ac->ac_bits_given += (*num_bits);
+	status = 0;
+bail:
+	LOG_EXIT_STATUS(status);
+	return status;	/* 0 on success, negative error code otherwise */
+}
+
+int ocfs_claim_new_inode(ocfs_super *osb, 
+			 ocfs_journal_handle *handle,
+			 ocfs2_alloc_context *ac,
+			 u16 *suballoc_bit,
+			 u64 *fe_blkno)
+{	/* Claim the single bit reserved in ac for a new inode; report its bit and block number. */
+	int status;
+	unsigned int num_bits;
+	u64 bg_blkno;
+
+	LOG_ENTRY();
+
+	OCFS_ASSERT(ac);
+	OCFS_ASSERT(ac->ac_bits_given == 0);	/* an inode context yields exactly one bit, once */
+	OCFS_ASSERT(ac->ac_bits_wanted == 1);
+	OCFS_ASSERT(ac->ac_which == OCFS_AC_USE_INODE);
+	OCFS_ASSERT(ac->ac_handle == handle);
+
+	status = ocfs_claim_suballoc_bits(osb, 
+					  handle, 
+					  ac, 
+					  1, 
+					  suballoc_bit, 
+					  &num_bits,
+					  &bg_blkno);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	OCFS_ASSERT(num_bits == 1);
+
+	*fe_blkno = bg_blkno + (u64) (*suballoc_bit);	/* bit N is block bg_blkno + N */
+	ac->ac_bits_given++;
+	status = 0;
+bail:
+	LOG_EXIT_STATUS(status);
+	return status;	/* 0 on success, negative error code otherwise */
+}
+
+static inline u64 ocfs_which_suballoc_group(u64 block, unsigned int bit)
+{	/* Inverse of "block = bg_blkno + bit" used by the claim functions: yields bg_blkno. */
+	return(block - (u64) bit);
+}
+
+/*
+ * Frees count bits from start_bit on; expects the suballoc inode to already be locked.
+ */
+int ocfs_free_suballoc_bits(ocfs_super *osb, 
+			    ocfs_journal_handle *handle, 
+			    struct inode *alloc_inode,
+			    struct buffer_head *alloc_bh,
+			    unsigned int start_bit,
+			    u64 start_block,
+			    unsigned int count) 
+{
+	int status = 0;
+	ocfs2_dinode *fe = (ocfs2_dinode *) alloc_bh->b_data;
+	ocfs2_chain_list *cl = &fe->id2.i_chain;
+	u64 bg_blkno;
+	struct buffer_head *group_bh = NULL;
+	ocfs2_group_desc *group;
+	unsigned int tmp;
+
+	LOG_ENTRY();
+
+	OCFS_ASSERT_RO(IS_VALID_FILE_ENTRY(fe));
+	OCFS_ASSERT((count + start_bit) 
+		    <= ((u32) cl->cl_cpg * (u32) cl->cl_bpc));
+
+	bg_blkno = ocfs_which_suballoc_group(start_block, start_bit); /* start_block - start_bit */
+	LOG_TRACE_ARGS("freeing %u bits from group %llu, starting at %u "
+		       "(1st block = %llu, suballocator %llu)\n",
+		       count, bg_blkno, start_bit, start_block, 
+		       OCFS_I(alloc_inode)->ip_blkno);
+
+	status = ocfs_read_bh(osb, 
+			      bg_blkno << osb->sb->s_blocksize_bits, 
+			      &group_bh, 
+			      OCFS_BH_CACHED, 
+			      alloc_inode);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	group = (ocfs2_group_desc *) group_bh->b_data;
+	OCFS_ASSERT_RO(IS_VALID_GROUP_DESC(group));
+
+	status = ocfs_journal_access(handle, group_bh, 
+				     OCFS_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	tmp = count;	/* loop on a copy: count is still needed for the counters below */
+	while(tmp--)
+		clear_bit((start_bit + tmp), group->bg_bitmap);
+	group->bg_free_bits_count += count;
+
+	status = ocfs_journal_dirty(handle, group_bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	status = ocfs_journal_access(handle, alloc_bh, 
+				     OCFS_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	cl->cl_recs[group->bg_chain].c_free += count;	/* credit the chain this group records as its own */
+	fe->id1.bitmap1.i_used -= count;
+
+	status = ocfs_journal_dirty(handle, alloc_bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	down (&(OCFS_I(alloc_inode)->ip_sem));	/* ip_sem is held while the in-memory count is updated */
+	OCFS_I(alloc_inode)->u.ip_bitinfo.used_bits = fe->id1.bitmap1.i_used;
+	up (&(OCFS_I(alloc_inode)->ip_sem));
+bail:
+	if (group_bh)
+		brelse(group_bh);
+
+	LOG_EXIT_STATUS(status);
+	return status;	/* result of the last call made; negative on failure */
+}
+
+static inline void debug_bg(ocfs2_group_desc *bg) 
+{	/* Debug helper: printk every field of a group descriptor, one per line. */
+	printk("Block Group:\n");
+	printk("bg_signature:       %s\n", bg->bg_signature); /* NOTE(review): assumes NUL-terminated - confirm */
+	printk("bg_size:            %u\n", bg->bg_size);
+	printk("bg_bits:            %u\n", bg->bg_bits);
+	printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count);
+	printk("bg_chain:           %u\n", bg->bg_chain);
+	printk("bg_generation:      %u\n", bg->bg_generation);
+	printk("bg_next_group:      %llu\n", bg->bg_next_group);
+	printk("bg_parent_dinode:   %llu\n", bg->bg_parent_dinode);
+	printk("bg_blkno:           %llu\n", bg->bg_blkno);
+	return;
+}
+
+static inline void debug_suballoc_inode(ocfs2_dinode *fe)
+{	/* Debug helper: printk the suballocator dinode's counters and each in-use chain record. */
+	int i;
+
+	printk("Suballoc Inode %llu:\n", fe->i_blkno);
+	printk("i_signature:                  %s\n", fe->i_signature); /* NOTE(review): assumes NUL-terminated - confirm */
+	printk("i_size:                       %llu\n", fe->i_size);
+	printk("i_clusters:                   %u\n", fe->i_clusters);
+	printk("i_generation:                 %u\n", fe->i_generation);
+	printk("id1.bitmap1.i_used:           %u\n", fe->id1.bitmap1.i_used);
+	printk("id1.bitmap1.i_total:          %u\n", fe->id1.bitmap1.i_total);
+	printk("id2.i_chain.cl_cpg:           %u\n", fe->id2.i_chain.cl_cpg);
+	printk("id2.i_chain.cl_bpc:           %u\n", fe->id2.i_chain.cl_bpc);
+	printk("id2.i_chain.cl_count:         %u\n", fe->id2.i_chain.cl_count);
+	printk("id2.i_chain.cl_next_free_rec: %u\n", 
+	       fe->id2.i_chain.cl_next_free_rec);
+	for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) { /* only records below cl_next_free_rec */
+		printk("fe->id2.i_chain.cl_recs[%d].c_free:  %u\n", i, fe->id2.i_chain.cl_recs[i].c_free);
+		printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i, fe->id2.i_chain.cl_recs[i].c_total);
+		printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i, fe->id2.i_chain.cl_recs[i].c_blkno);
+	}
+	return;
+}

Added: trunk/src/suballoc.h
===================================================================
--- trunk/src/suballoc.h	2004-10-02 01:01:24 UTC (rev 1542)
+++ trunk/src/suballoc.h	2004-10-02 01:08:08 UTC (rev 1543)
@@ -0,0 +1,58 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * suballoc.h
+ *
+ * Defines sub allocator api
+ *
+ * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Authors: Mark Fasheh, Kurt Hackel, Joel Becker, Sunil Mushran, 
+ *	    Manish Singh, Wim Coekaerts
+ */
+
+#ifndef _CHAINALLOC_H_
+#define _CHAINALLOC_H_
+
+int ocfs_reserve_new_metadata(ocfs_super *osb, 
+			      ocfs_journal_handle *handle,
+			      ocfs2_dinode *fe,
+			      ocfs2_alloc_context **ac);	/* allocates *ac; NULL on failure */
+int ocfs_reserve_new_inode(ocfs_super *osb, 
+			   ocfs_journal_handle *handle,
+			   ocfs2_alloc_context **ac);	/* allocates *ac; NULL on failure */
+int ocfs_claim_new_inode(ocfs_super *osb, 
+			 ocfs_journal_handle *handle,
+			 ocfs2_alloc_context *ac,
+			 u16 *suballoc_bit,
+			 u64 *fe_blkno);	/* claims the one bit reserved in ac */
+int ocfs_claim_metadata(ocfs_super *osb,
+			ocfs_journal_handle *handle,
+			ocfs2_alloc_context *ac,
+			u32 bits_wanted,
+			u16 *suballoc_bit_start,
+			unsigned int *num_bits,	/* same spelling as the definition in suballoc.c */
+			u64 *blkno_start);
+int ocfs_free_suballoc_bits(ocfs_super *osb, 
+			    ocfs_journal_handle *handle, 
+			    struct inode *alloc_inode,
+			    struct buffer_head *alloc_bh,
+			    unsigned int start_bit,
+			    u64 start_block,
+			    unsigned int count);	/* suballoc inode must already be locked */
+#endif /* _CHAINALLOC_H_ */

Modified: trunk/src/super.c
===================================================================
--- trunk/src/super.c	2004-10-02 01:01:24 UTC (rev 1542)
+++ trunk/src/super.c	2004-10-02 01:08:08 UTC (rev 1543)
@@ -56,6 +56,7 @@
 #include "heartbeat.h"
 #include "inode.h"
 #include "journal.h"
+#include "localalloc.h"
 #include "nm.h"
 #include "proc.h"
 #include "super.h"

Modified: trunk/src/symlink.c
===================================================================
--- trunk/src/symlink.c	2004-10-02 01:01:24 UTC (rev 1542)
+++ trunk/src/symlink.c	2004-10-02 01:08:08 UTC (rev 1543)
@@ -43,6 +43,7 @@
 #include "ocfs_log.h"
 #include "ocfs.h"
 
+#include "alloc.h"
 #include "file.h"
 #include "inode.h"
 #include "symlink.h"

Modified: trunk/src/sysfile.c
===================================================================
--- trunk/src/sysfile.c	2004-10-02 01:01:24 UTC (rev 1542)
+++ trunk/src/sysfile.c	2004-10-02 01:08:08 UTC (rev 1543)
@@ -126,78 +126,3 @@
 	return inode;
 }
 
-
-/*
- * ocfs_read_system_file()
- *
- * make sure that the bhs array is either all NULL'd out or you really
- * know what you're doing! Also, those bh's will have to be brelse'd
- * after you're done with them!
- *
- */
-int ocfs_read_system_file(ocfs_super *osb, int type, __u32 node,
-			  struct buffer_head *bhs[], __u64 Length)
-{
-	int status = 0, i;
-	__u64 ret = 0;
-	struct inode *inode = NULL;
-	__u32 blocks;
-	__u64 off, contig_blocks, contig_bytes;
-
-	LOG_ENTRY_ARGS ("type=%d, node=%u, Length = %llu\n", type, node, Length);
-
-	OCFS_ASSERT(!(Length & (osb->sb->s_blocksize - 1)));
-
-	inode = ocfs_get_system_file_inode(osb, type, node);
-	if (!inode) {
-		LOG_ERROR_STATUS(status=-EINVAL);
-		goto leave;
-	}
-
-	off = 0;
-	i = 0;
-	blocks = Length >> osb->sb->s_blocksize_bits;
-	while (blocks > 0) {
-		status = ocfs_lookup_file_allocation(osb, off, &ret,
-						     (Length - off), 
-						     &contig_bytes,
-						     inode, 1);
-		contig_blocks =
-			contig_bytes >> osb->sb->s_blocksize_bits;
-		if (contig_bytes !=
-		    (contig_blocks << osb->sb->s_blocksize_bits)) {
-			LOG_ERROR_ARGS("unaligned system file read! off=%llu, "
-				       "contig_bytes=%llu\n", off, contig_bytes);
-			LOG_ERROR_STATUS(status=-EINVAL);
-			goto leave;
-		}
-		if (contig_blocks > blocks) {
-			contig_blocks = blocks;
-			contig_bytes = blocks << osb->sb->s_blocksize_bits;
-		}
-		if (status == 0) {
-			// found all remaining
-		} else if (status == -EFAIL && contig_blocks > 0) {
-			// found some
-		} else {
-			// failed
-			LOG_ERROR_STATUS(status = -EIO);
-			goto leave;
-		}
-		status = ocfs_read_bhs(osb, ret, contig_blocks << osb->sb->s_blocksize_bits, &(bhs[i]), OCFS_BH_CACHED, inode);
-		if (status < 0) {
-			LOG_ERROR_STATUS (status);
-			goto leave;
-		}
-		i += contig_blocks;
-		blocks -= contig_blocks;
-		off += contig_bytes;
-	}
-
-leave:
-	if (inode)
-		iput(inode);
-
-	LOG_EXIT_STATUS (status);
-	return status;
-}				/* ocfs_read_system_file */

Modified: trunk/src/sysfile.h
===================================================================
--- trunk/src/sysfile.h	2004-10-02 01:01:24 UTC (rev 1542)
+++ trunk/src/sysfile.h	2004-10-02 01:08:08 UTC (rev 1543)
@@ -30,7 +30,5 @@
 #define OCFS2_SYSFILE_H
 
 struct inode * ocfs_get_system_file_inode(ocfs_super *osb, int type, __u32 node);
-int ocfs_read_system_file (ocfs_super *osb, int type, __u32 node,
-			   struct buffer_head *bhs[], __u64 Length);
 
 #endif /* OCFS2_SYSFILE_H */

Modified: trunk/src/vote.c
===================================================================
--- trunk/src/vote.c	2004-10-02 01:01:24 UTC (rev 1542)
+++ trunk/src/vote.c	2004-10-02 01:08:08 UTC (rev 1543)
@@ -885,7 +885,7 @@
 #warning "should we even be erroring here at all!"
 		LOG_ERROR_ARGS("inode %llu, vote_status=%d, vote_state=%d, "
 			       "lockid=%llu, flags = 0x%x, asked type = %u "
-			       "master = %d, state = 0x%lx, type = %u\n",
+			       "master = %d, state = 0x%x, type = %u\n",
 			       OCFS_I(inode)->ip_blkno, obj->vote_status, 
 			       obj->vote_state, lock_id, flags, lock_type, 
 			       GET_INODE_LOCKRES(inode)->master_node_num, 



More information about the Ocfs2-commits mailing list