[Ocfs2-commits] mfasheh commits r1563 - trunk/src

svn-commits at oss.oracle.com svn-commits at oss.oracle.com
Mon Oct 11 21:01:11 CDT 2004


Author: mfasheh
Date: 2004-10-11 21:01:09 -0500 (Mon, 11 Oct 2004)
New Revision: 1563

Modified:
   trunk/src/alloc.c
   trunk/src/alloc.h
   trunk/src/dir.c
   trunk/src/file.c
   trunk/src/ocfs2_fs.h
Log:
* remove ocfs_allocate_extent and replace it with something relatively
  sane.
  - extend is now completely roll-forward (I will be removing
    ocfs_abort_trans and friends shortly)
  - the code has shrunk by about 500 lines
  - most importantly, the code is completely known, including all
    possible error cases, so we should be able to easily and
    accurately predict what a tree would look like at any given moment
    during an extend operation.



Modified: trunk/src/alloc.c
===================================================================
--- trunk/src/alloc.c	2004-10-12 01:56:54 UTC (rev 1562)
+++ trunk/src/alloc.c	2004-10-12 02:01:09 UTC (rev 1563)
@@ -54,6 +54,16 @@
 /* Tracing */
 #define OCFS_DEBUG_CONTEXT    OCFS_DEBUG_CONTEXT_ALLOC
 
+static int ocfs_extent_contig(struct inode *inode, ocfs2_extent_rec *ext,
+			      u64 blkno);
+
+static int ocfs_free_clusters(ocfs_super *osb,
+			      ocfs_journal_handle *handle,
+			      struct inode *bitmap_inode,
+			      struct buffer_head *bitmap_bh,
+			      u64 start_blk,
+			      unsigned int num_clusters);
+
 static int ocfs_create_new_meta_bhs(ocfs_super *osb, 
 				    ocfs_journal_handle *handle,
 				    struct inode *inode,
@@ -61,27 +71,33 @@
 				    ocfs2_alloc_context *meta_ac,
 				    struct buffer_head *bhs[]);
 
-static int ocfs_allocate_new_data_node(ocfs_super *osb, 
-				       ocfs2_dinode *fe,
-				       u64 new_blkno,
-				       u32 new_clusters, 
-				       struct buffer_head *eb_bh, 
-				       u64 *new_eb_blkno, 
-      				       ocfs_journal_handle *handle,
-				       struct inode *inode,
-				       ocfs2_alloc_context *meta_ac);
+static int ocfs2_add_branch(ocfs_super *osb,
+			    ocfs_journal_handle *handle,
+			    struct inode *inode,
+			    struct buffer_head *fe_bh,
+			    struct buffer_head *eb_bh,
+			    struct buffer_head *last_eb_bh,
+			    ocfs2_alloc_context *meta_ac);
 
-static int ocfs_grow_extent_tree(ocfs_super *osb,
-				 struct buffer_head *fe_bh,
-				 ocfs_journal_handle *handle,
-				 u64 blkno,
-			       	 u32 new_clusters, 
-				 struct inode *inode,
-				 ocfs2_alloc_context *meta_ac);
+static int ocfs2_shift_tree_depth(ocfs_super *osb,
+				  ocfs_journal_handle *handle,
+				  struct inode *inode,
+				  struct buffer_head *fe_bh,
+				  ocfs2_alloc_context *meta_ac,
+				  struct buffer_head **ret_new_eb_bh);
 
-static int ocfs_extent_contig(struct inode *inode, ocfs2_extent_rec *ext,
-			      u64 blkno);
+static int ocfs2_do_insert_extent(ocfs_super *osb,
+				  ocfs_journal_handle *handle,
+				  struct inode *inode,
+				  struct buffer_head *fe_bh,
+				  u64 blkno,
+				  u32 new_clusters);
 
+static int ocfs2_find_branch_target(ocfs_super *osb,
+				    struct inode *inode,
+				    struct buffer_head *fe_bh,
+				    struct buffer_head **target_bh);
+
 static int ocfs_claim_main_bitmap_bits(ocfs_super *osb,
 				       ocfs_journal_handle *handle,
 				       ocfs2_alloc_context *ac,
@@ -89,13 +105,6 @@
 				       u32 *bit_off,
 				       u32 *num_bits);
 
-static int ocfs_free_clusters(ocfs_super *osb,
-			      ocfs_journal_handle *handle,
-			      struct inode *bitmap_inode,
-			      struct buffer_head *bitmap_bh,
-			      u64 start_blk,
-			      unsigned int num_clusters);
-
 static int ocfs_find_new_last_ext_blk(ocfs_super *osb,
 				      struct inode *inode,
 				      ocfs2_dinode *fe,
@@ -115,6 +124,13 @@
 {
 	u64 start_blkno = ext->e_blkno;
 
+	if (!ext->e_clusters) {
+		/* an empty extent is always contig. */
+		OCFS_ASSERT(!ext->e_blkno);
+		OCFS_ASSERT(ext->e_cpos);
+		return 1;
+	}
+		
 	start_blkno += ocfs2_clusters_to_blocks(inode->i_sb,
 						ext->e_clusters);
 	return (start_blkno == blkno);
@@ -218,195 +234,6 @@
 	return(retval);
 }
 
-/* ocfs_allocate_new_data_node()
- * 
- */
-static int ocfs_allocate_new_data_node(ocfs_super *osb, 
-				       ocfs2_dinode *fe,
-				       u64 new_blkno,
-				       u32 new_clusters, 
-				       struct buffer_head *eb_bh, 
-				       u64 *new_eb_blkno,
-				       ocfs_journal_handle *handle,
-				       struct inode *inode,
-				       ocfs2_alloc_context *meta_ac)
-{
-	int status = 0;
-	__u32 k, i;
-	__u32 depth;
-	u64 parent_blk;
-	int new_blocks = 0;
-	ocfs2_extent_block *eb = NULL;
-	ocfs2_extent_list *el1, *el2 = NULL;
-	struct buffer_head **eb_bhs = NULL;
-	struct buffer_head *bh = NULL;
-	int size;
-
-	LOG_ENTRY ();
-
-	OCFS_ASSERT(meta_ac);
-
-	if (eb_bh) {
-		status = ocfs_journal_access(handle, eb_bh, 
-					     OCFS_JOURNAL_ACCESS_WRITE);
-
-		if (status < 0) {
-			LOG_ERROR_STATUS(status);
-			goto finally;
-		}
-
-		eb = (ocfs2_extent_block *) eb_bh->b_data;
-		el1 = &eb->h_list;
-	}
-	else
-		el1 = &fe->id2.i_list;
-
-	depth = el1->l_tree_depth;
-
-	if (eb != NULL)
-		parent_blk = eb->h_blkno;
-	else
-		parent_blk = fe->i_blkno;
-
-	new_blocks = depth;
-
-	size = sizeof(struct buffer_head *) * new_blocks;
-	eb_bhs = kmalloc(size, GFP_KERNEL);
-	if (eb_bhs == NULL) {
-		status = -ENOMEM;
-		LOG_ERROR_STATUS(status);
-		goto finally;
-	}
-	memset(eb_bhs, 0, size);
-
-	status = ocfs_create_new_meta_bhs(osb, handle, inode, new_blocks, meta_ac, eb_bhs);
-	if (status < 0) {
-		LOG_ERROR_STATUS (status);
-		goto finally;
-	}
-
-	k = el1->l_next_free_rec;
-	el1->l_recs[k].e_cpos = fe->i_clusters;
-	el1->l_recs[k].e_clusters = new_clusters;
-	el1->l_recs[k].e_blkno = 
-		((ocfs2_extent_block *) eb_bhs[0]->b_data)->h_blkno;
-	el1->l_next_free_rec++;
-
-	OCFS_ASSERT(el1->l_next_free_rec <= el1->l_count);
-
-	/* Fill in all the headers and the leaf */
-	for (i = 0; i < depth; i++) {
-		ocfs2_extent_block *eb, *tmpeb;
-
-		eb = (ocfs2_extent_block *) eb_bhs[i]->b_data;
-
-		eb->h_parent_blk = parent_blk;
-
-		el2 = &eb->h_list;
-		el2->l_next_free_rec = 1;
-		el2->l_recs[0].e_cpos = fe->i_clusters;
-		el2->l_recs[0].e_clusters = new_clusters;
-		el2->l_tree_depth = (depth - (i + 1));
-
-		if (el2->l_tree_depth) {
-			tmpeb = (ocfs2_extent_block *) eb_bhs[i+1]->b_data;
-			/* fill in each header */
-			el2->l_recs[0].e_blkno = tmpeb->h_blkno;
-		} else {
-			/* fill in the leaf */
-			el2->l_recs[0].e_blkno = new_blkno;
-			*new_eb_blkno = fe->i_last_eb_blk =
-				eb->h_blkno;
-		}
-
-		parent_blk = eb->h_blkno;
-	}
-
-	for(i = 0; i < new_blocks; i++) {
-		status = ocfs_journal_dirty(handle, eb_bhs[i]);
-		if (status < 0) {
-			LOG_ERROR_STATUS (status);
-			goto finally;
-		}
-	}
-
-	if (eb) {
-		/* both needed below in for loop */
-		u64 tmp_blk = eb->h_parent_blk;
-		int tree_depth = el1->l_tree_depth;
-
-		eb = NULL;
-
-	       	el1 = &fe->id2.i_list;
-
-		status = ocfs_journal_dirty(handle, eb_bh);
-		if (status < 0) {
-			LOG_ERROR_STATUS (status);
-			goto finally;
-		}
-
-		/* gotta fix up his parent extents now. We totally
-		 * reuse the eb and el2 variables now as they're no
-		 * longer needed for their original purpose. */
-		for (i = tree_depth + 1; i < el1->l_tree_depth; i++) {
-			bh = NULL;
-			status = ocfs_read_block(osb,
-						 tmp_blk,
-						 &bh, OCFS_BH_CACHED,
-						 inode);
-			if (status < 0) {
-				LOG_ERROR_STATUS (status);
-				goto finally;
-			}
-			status = ocfs_journal_access(handle, bh, 
-						     OCFS_JOURNAL_ACCESS_WRITE);
-			if (status < 0) {
-				LOG_ERROR_STATUS(status);
-				goto finally;
-			}
-
-			eb = (ocfs2_extent_block *) bh->b_data;
-			if (!IS_VALID_EXTENT_BLOCK(eb)) {
-				brelse(bh);
-				LOG_ERROR_STATUS (status = -EINVAL);
-				goto finally;
-			}
-			el2 = &eb->h_list;
-
-			if (el2->l_next_free_rec == 0) {
-				brelse(bh);
-				LOG_ERROR_STATUS (status = -EINVAL);
-				goto finally;
-			}
-
-			k = el2->l_next_free_rec - 1;
-			el2->l_recs[k].e_clusters += new_clusters;
-
-			tmp_blk = eb->h_parent_blk;
-
-			status = ocfs_journal_dirty(handle, bh);
-			if (status < 0) {
-				brelse(bh);
-				LOG_ERROR_STATUS (status);
-				goto finally;
-			}
-
-			brelse(bh);
-		}
-		k = el1->l_next_free_rec - 1;
-		el1->l_recs[k].e_clusters += new_clusters;
-	}
-finally:
-	if (eb_bhs) {
-		for (i = 0; i < new_blocks; i++)
-			if (eb_bhs[i])
-				brelse(eb_bhs[i]);
-		kfree(eb_bhs);
-	}
-	LOG_EXIT_STATUS (status);
-	return status;
-}				/* ocfs_allocate_new_data_node */
-
 /* expects array to already be malloced 
  *
  * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_node, and
@@ -501,597 +328,613 @@
 	return(status);
 }
 
-/* ocfs_grow_extent_tree()
+/*
+ * ocfs2_add_branch()
+ * 
+ * Add an entire tree branch to our inode. eb_bh is the extent block
+ * to start at, if we don't want to start the branch at the dinode
+ * structure.
+ * 
+ * last_eb_bh is required as we have to update its next_leaf pointer
+ * for the new last extent block.
  *
+ * the new branch will be 'empty' in the sense that every block will
+ * contain a single record with e_clusters == 0.
  */
-static int ocfs_grow_extent_tree(ocfs_super *osb,
-				 struct buffer_head *fe_bh,
-				 ocfs_journal_handle *handle,
-				 u64 blkno, u32 new_clusters,
-				 struct inode *inode,
-				 ocfs2_alloc_context *meta_ac)
+static int ocfs2_add_branch(ocfs_super *osb,
+			    ocfs_journal_handle *handle,
+			    struct inode *inode,
+			    struct buffer_head *fe_bh,
+			    struct buffer_head *eb_bh,
+			    struct buffer_head *last_eb_bh,
+			    ocfs2_alloc_context *meta_ac)
 {
-	int status = 0;
-	__s32 k, i;
-	ocfs2_extent_block *eb1 = NULL;
-	ocfs2_extent_block *eb2 = NULL;
-	ocfs2_extent_list *ebl, *fel;
-	u64 parent_blk, last_eb_blkno;
-	u64 new_parent_blk = 0;
-	struct buffer_head **bhs = NULL;
-	int numbhs = 0;
-	ocfs2_dinode *fe = NULL;
+	int status, new_blocks, size, i;
+	u64 next_blkno, new_last_eb_blk;
+	struct buffer_head *bh;
+	struct buffer_head **new_eb_bhs = NULL;
+	ocfs2_dinode *fe;
+	ocfs2_extent_block *eb;
+	ocfs2_extent_list  *eb_el;
+	ocfs2_extent_list  *el;
 
-	LOG_ENTRY_ARGS("(0x%p, 0x%p, %llu, %u\n", osb, fe, blkno,
-		       new_clusters);
+	LOG_ENTRY();
 
-	OCFS_ASSERT(meta_ac);
+	OCFS_ASSERT(last_eb_bh);
 
 	fe = (ocfs2_dinode *) fe_bh->b_data;
 
-	fel = &fe->id2.i_list;
-	numbhs = fel->l_tree_depth + 1;
+	if (eb_bh) {
+		eb = (ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;
+	} else 
+		el = &fe->id2.i_list;
 
-	bhs = kmalloc(numbhs * sizeof(*bhs), GFP_KERNEL);
-	if (bhs == NULL) {
+	/* we never add a branch to a leaf. */
+	OCFS_ASSERT(el->l_tree_depth);
+
+	new_blocks = el->l_tree_depth;
+
+	/* allocate the number of new eb blocks we need */
+	size = sizeof(struct buffer_head *) * new_blocks;
+	new_eb_bhs = kmalloc(size, GFP_KERNEL);
+	if (!new_eb_bhs) {
 		status = -ENOMEM;
-		LOG_ERROR_STATUS(status = -ENOMEM);
-		goto finally;
+		LOG_ERROR_STATUS(status);
+		goto bail;
 	}
-	memset(bhs, 0, numbhs * sizeof(*bhs));
+	memset(new_eb_bhs, 0, size);
 
-	status = ocfs_create_new_meta_bhs(osb, handle, inode, numbhs, meta_ac, bhs);
+	status = ocfs_create_new_meta_bhs(osb, handle, inode, new_blocks, 
+					  meta_ac, new_eb_bhs);
 	if (status < 0) {
 		LOG_ERROR_STATUS (status);
-		goto finally;
+		goto bail;
 	}
 
-	eb1 = (ocfs2_extent_block *) bhs[0]->b_data;
-	/* Copy the File Entry information in to the newly allocated sector */
-	ebl = &eb1->h_list;
-	for (k = 0; k < fel->l_count; k++) {
-		ebl->l_recs[k].e_cpos = fel->l_recs[k].e_cpos;
-		ebl->l_recs[k].e_clusters = fel->l_recs[k].e_clusters;
-		ebl->l_recs[k].e_blkno = fel->l_recs[k].e_blkno;
-	}
+	/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
+	 * linked with the rest of the tree. 
+	 * conversly, new_eb_bhs[0] is the new bottommost leaf. 
+	 * 
+	 * when we leave the loop, new_last_eb_blk will point to the
+	 * newest leaf, and next_blkno will point to the topmost extent
+	 * block. */
+	next_blkno = new_last_eb_blk = 0;
+	for(i = 0; i < new_blocks; i++) {
+		bh = new_eb_bhs[i];
+		eb = (ocfs2_extent_block *) bh->b_data;
+		OCFS_ASSERT(IS_VALID_EXTENT_BLOCK(eb));
+		eb_el = &eb->h_list;
 
-	last_eb_blkno = fe->i_last_eb_blk;
-	new_parent_blk = eb1->h_blkno;
-	eb1->h_next_leaf_blk = 0;
-	fel->l_tree_depth++;
+		status = ocfs_journal_access(handle, bh, 
+					     OCFS_JOURNAL_ACCESS_CREATE);
+		if (status < 0) {
+			LOG_ERROR_STATUS(status);
+			goto bail;
+		}
 
-	LOG_TRACE_ARGS ("Tree depth is: %d\n", fel->l_tree_depth);
+		eb->h_next_leaf_blk = 0;
+		eb_el->l_tree_depth = i;
+		eb_el->l_next_free_rec = 1;
+		eb_el->l_recs[0].e_cpos = fe->i_clusters;
+		eb_el->l_recs[0].e_blkno = next_blkno;
+		eb_el->l_recs[0].e_clusters = 0;
+		if (!eb_el->l_tree_depth)
+			new_last_eb_blk = eb->h_blkno;
 
-	parent_blk = fe->i_blkno;
+		status = ocfs_journal_dirty(handle, bh);
+		if (status < 0) {
+			LOG_ERROR_STATUS(status);
+			goto bail;
+		}
 
-	/* If tree_depth is one now, the for loop will not execute. *
-	 * First time a file is created, tree_depth = 0 */
-	for (i = 0; i < (fel->l_tree_depth - 1); i++) {
-		ocfs2_extent_block *tmpeb;
+		next_blkno = eb->h_blkno;
+	}
 
-		eb2 = (ocfs2_extent_block *) bhs[i]->b_data;
-		ebl = &eb2->h_list;
+	/* This is a bit hairy. We want to update up to three blocks
+	 * here without leaving any of them in an inconsistent state
+	 * in case of error. We don't have to worry about
+	 * journal_dirty erroring as it won't unless we've aborted the
+	 * handle (in which case we would never be here) so reserving
+	 * the write with journal_access is all we need to do. */
+	status = ocfs_journal_access(handle, last_eb_bh, 
+				     OCFS_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	status = ocfs_journal_access(handle, fe_bh, 
+				     OCFS_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	if (eb_bh) {
+		status = ocfs_journal_access(handle, eb_bh, 
+					     OCFS_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			LOG_ERROR_STATUS(status);
+			goto bail;
+		}
+	}
 
-		ebl->l_tree_depth = (fel->l_tree_depth - 1) - i;
-		ebl->l_count = ocfs2_extent_recs_per_eb(osb->sb);
+	/* Link the new branch into the rest of the tree (el will
+	 * either be on the fe, or the extent block passed in. */
+	i = el->l_next_free_rec;
+	el->l_recs[i].e_blkno = next_blkno;
+	el->l_recs[i].e_cpos = fe->i_clusters;
+	el->l_recs[i].e_clusters = 0;
+	el->l_next_free_rec++;
 
-		if (i == 0) {
-			tmpeb = (ocfs2_extent_block *) bhs[1]->b_data;
-			ebl->l_recs[fel->l_count].e_blkno = 
-				tmpeb->h_blkno;
-			ebl->l_recs[fel->l_count].e_cpos =
-				fe->i_clusters;
-			ebl->l_recs[fel->l_count].e_clusters =
-				new_clusters;
-			ebl->l_next_free_rec = fel->l_count + 1;
+	/* fe needs a new last extent block pointer, as does the
+	 * next_leaf on the previously last-extent-block. */
+	fe->i_last_eb_blk = new_last_eb_blk;
 
-			OCFS_ASSERT(ebl->l_next_free_rec <= ebl->l_count);
+	eb = (ocfs2_extent_block *) last_eb_bh->b_data;
+	eb->h_next_leaf_blk = new_last_eb_blk;
 
-			eb2->h_parent_blk = parent_blk;
+	status = ocfs_journal_dirty(handle, last_eb_bh);
+	if (status < 0)
+		LOG_ERROR_STATUS(status);
+	status = ocfs_journal_dirty(handle, fe_bh);
+	if (status < 0)
+		LOG_ERROR_STATUS(status);
+	if (eb_bh) {
+		status = ocfs_journal_dirty(handle, eb_bh);
+		if (status < 0)
+			LOG_ERROR_STATUS(status);
+	}
 
-			parent_blk = last_eb_blkno = eb2->h_blkno;
-		} else {
-			tmpeb = (ocfs2_extent_block *) bhs[i + 1]->b_data;
-			ebl->l_recs[0].e_blkno = tmpeb->h_blkno;
-			ebl->l_recs[0].e_cpos = fe->i_clusters;
-			ebl->l_recs[0].e_clusters = new_clusters;
-			ebl->l_next_free_rec = 1;
+	status = 0;
+bail:
+	if (new_eb_bhs) {
+		for (i = 0; i < new_blocks; i++)
+			if (new_eb_bhs[i])
+				brelse(new_eb_bhs[i]);
+		kfree(new_eb_bhs);
+	}
 
-			eb2->h_parent_blk = parent_blk;
+	LOG_EXIT_STATUS(status);
+	return status;
+}
 
-			parent_blk = last_eb_blkno = eb2->h_blkno;
-		}
-	}
+/*
+ * ocfs2_shift_tree_depth()
+ *
+ * adds another level to the allocation tree.
+ * returns back the new extent block so you can add a branch to it
+ * after this call. 
+ */
+static int ocfs2_shift_tree_depth(ocfs_super *osb,
+				  ocfs_journal_handle *handle,
+				  struct inode *inode,
+				  struct buffer_head *fe_bh,
+				  ocfs2_alloc_context *meta_ac,
+				  struct buffer_head **ret_new_eb_bh)
+{
+	int status, i;
+	struct buffer_head *new_eb_bh = NULL;
+	ocfs2_dinode *fe;
+	ocfs2_extent_block *eb;
+	ocfs2_extent_list  *fe_el;
+	ocfs2_extent_list  *eb_el;
 
-	/* Update the Data Segment, which is the last one in our array */
-	eb1 = (ocfs2_extent_block *) bhs[numbhs - 1]->b_data;
-	ebl = &eb1->h_list;
+	LOG_ENTRY();
 
-	i = (fel->l_tree_depth > 1) ? 0 : fel->l_count;
+	status = ocfs_create_new_meta_bhs(osb, handle, inode, 1, meta_ac, 
+					  &new_eb_bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
 
-	LOG_TRACE_ARGS ("EntryAvailable is: %d\n", ebl->l_next_free_rec);
+	eb = (ocfs2_extent_block *) new_eb_bh->b_data;
+	OCFS_ASSERT(IS_VALID_EXTENT_BLOCK(eb));
+	eb_el = &eb->h_list;
+	fe = (ocfs2_dinode *) fe_bh->b_data;
+	fe_el = &fe->id2.i_list;
 
-	/* For the time being we are assuming that the newly allocated Extent */
-	/* will have one more entry to accomodate the latest allocation */
+	status = ocfs_journal_access(handle, new_eb_bh, 
+				     OCFS_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
 
-	ebl->l_tree_depth = 0;
-	ebl->l_count = ocfs2_extent_recs_per_eb(osb->sb);
+	/* copy the fe data into the new extent block */
+	eb_el->l_tree_depth = fe_el->l_tree_depth;
+	eb_el->l_next_free_rec = fe_el->l_next_free_rec;
+	for(i = 0; i < fe_el->l_next_free_rec; i++) {
+		eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos;
+		eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters;
+		eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno;
+	}
 
-	ebl->l_recs[i].e_cpos = fe->i_clusters;
-	ebl->l_recs[i].e_clusters = new_clusters;
-	ebl->l_recs[i].e_blkno = blkno;
-	ebl->l_next_free_rec = i + 1;
-	OCFS_ASSERT(ebl->l_next_free_rec <= ebl->l_count);
+	status = ocfs_journal_dirty(handle, new_eb_bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
 
-	eb1->h_parent_blk = parent_blk;
-	eb1->h_next_leaf_blk = 0;
+	status = ocfs_journal_access(handle, fe_bh, 
+				     OCFS_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
 
-	last_eb_blkno = eb1->h_blkno;
+	/* update fe now */
+	fe_el->l_tree_depth++;
+	fe_el->l_recs[0].e_cpos = 0;
+	fe_el->l_recs[0].e_blkno = eb->h_blkno;
+	fe_el->l_recs[0].e_clusters = fe->i_clusters;
+	for(i = 1; i < fe_el->l_next_free_rec; i++) {
+		fe_el->l_recs[i].e_cpos = 0;
+		fe_el->l_recs[i].e_clusters = 0;
+		fe_el->l_recs[i].e_blkno = 0;
+	}
+	fe_el->l_next_free_rec = 1;
 
-	for(i = 0; i < numbhs; i++) {
-		status = ocfs_journal_dirty(handle, bhs[i]);
-		if (status < 0) {
-			LOG_ERROR_STATUS (status);
-			goto finally;
-		}
+	/* If this is our 1st tree depth shift, then last_eb_blk
+	 * becomes the allocated extent block */
+	if (fe_el->l_tree_depth == 1)
+		fe->i_last_eb_blk = eb->h_blkno;
+
+	status = ocfs_journal_dirty(handle, fe_bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
 	}
 
-	/* Update the Previous Last Data Extent with this new Data
-	 * Extent Pointer */
-	if (fe->i_last_eb_blk != 0) {
-		struct buffer_head *bh = NULL;
+	*ret_new_eb_bh = new_eb_bh;
+	new_eb_bh = NULL;
+	status = 0;
+bail:
+	if (new_eb_bh)
+		brelse(new_eb_bh);
 
-		status = ocfs_read_block(osb, fe->i_last_eb_blk,
-					 &bh, OCFS_BH_CACHED, inode);
-		if (status < 0) {
-			LOG_ERROR_STATUS (status);
-			goto finally;
-		}
+	LOG_EXIT_STATUS(status);
+	return status;
+}
 
-		status = ocfs_journal_access(handle, bh, 
-					     OCFS_JOURNAL_ACCESS_WRITE);
-		if (status < 0) {
-			LOG_ERROR_STATUS (status);
-			goto finally;
-		}
+/*
+ * ocfs2_do_insert_extent()
+ *
+ * Expects the tree to already have room in the rightmost leaf for the
+ * extent.  Updates all the extent blocks (and the dinode) on the way
+ * down.
+ */
+static int ocfs2_do_insert_extent(ocfs_super *osb,
+				  ocfs_journal_handle *handle,
+				  struct inode *inode,
+				  struct buffer_head *fe_bh,
+				  u64 start_blk,
+				  u32 new_clusters)
+{
+	int status, i, num_bhs = 0;
+	u64 next_blkno;
+	struct buffer_head **eb_bhs = NULL;
+	ocfs2_dinode *fe;
+	ocfs2_extent_block *eb;
+	ocfs2_extent_list  *el;
 
-		eb1 = (ocfs2_extent_block *) bh->b_data;
-		if (!IS_VALID_EXTENT_BLOCK(eb1) ||
-		    eb1->h_list.l_tree_depth) {
-			brelse(bh);
-			LOG_ERROR_STATUS (status = -EINVAL);
-			goto finally;
-		}
+	LOG_ENTRY();
 
-		eb1->h_next_leaf_blk = last_eb_blkno;
-		
-		status = ocfs_journal_dirty(handle, bh);
-		brelse(bh);
-		if (status < 0) {
-			LOG_ERROR_STATUS (status);
-			goto finally;
-		}
+	status = ocfs_journal_access(handle, fe_bh, 
+				     OCFS_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
 	}
 
-	/* Update the uphdrptr of the extents pointed to by fe */
-	if (fel->l_tree_depth > 1) {
-		int i;
-		struct buffer_head *bh = NULL;
+	fe = (ocfs2_dinode *) fe_bh->b_data;
+	el = &fe->id2.i_list;
+	if (el->l_tree_depth) {
+		/* This is another operation where we want to be
+		 * careful about our tree updates. An error here means
+		 * none of the previous changes we made should roll
+		 * forward. As a result, we have to record the buffers
+		 * for this part of the tree in an array and reserve a
+		 * journal write to them before making any changes. */
+		num_bhs = fe->id2.i_list.l_tree_depth;
+		eb_bhs = kmalloc(sizeof(struct buffer_head *) * num_bhs, 
+			      GFP_KERNEL);
+		if (!eb_bhs) {
+			status = -ENOMEM;
+			LOG_ERROR_STATUS(status);
+			goto bail;
+		}
+		memset(eb_bhs, 0, sizeof(struct buffer_head *) * num_bhs);
 
-		status = 0;
-		for (i = 0; i < fel->l_count; ++i) {
-			status = ocfs_read_block(osb,
-						 fel->l_recs[i].e_blkno,
-						 &bh, OCFS_BH_CACHED,
-						 inode);
+		i = 0;
+		while(el->l_tree_depth) {
+			OCFS_ASSERT_RO(el->l_next_free_rec);
+			next_blkno = el->l_recs[el->l_next_free_rec-1].e_blkno;
+
+			OCFS_ASSERT(i < num_bhs);
+			status = ocfs_read_block(osb, next_blkno, &eb_bhs[i], 
+						 OCFS_BH_CACHED, inode);
 			if (status < 0) {
 				LOG_ERROR_STATUS(status);
-				brelse(bh);
-				break;
+				goto bail;
 			}
+			eb = (ocfs2_extent_block *) eb_bhs[i]->b_data;
+			OCFS_ASSERT_RO(IS_VALID_EXTENT_BLOCK(eb));
 
-			status = ocfs_journal_access(handle, bh, 
+			status = ocfs_journal_access(handle, eb_bhs[i], 
 						    OCFS_JOURNAL_ACCESS_WRITE);
 			if (status < 0) {
-				LOG_ERROR_STATUS (status);
-				goto finally;
+				LOG_ERROR_STATUS(status);
+				goto bail;
 			}
 
-			eb1 = (ocfs2_extent_block *) bh->b_data;
-			eb1->h_parent_blk = new_parent_blk;
+			el = &eb->h_list;
+			i++;
+			/* When we leave this loop, eb_bhs[num_bhs - 1] will
+			 * hold the bottom-most leaf extent block. */
+		}
+		OCFS_ASSERT(!el->l_tree_depth);
 
-			status = ocfs_journal_dirty(handle, bh);
-			brelse(bh);
-			bh = NULL;
+		el = &fe->id2.i_list;
+		/* If we have tree depth, then the fe update is
+		 * trivial, and we want to switch el out for the
+		 * bottom-most leaf in order to update it with the
+		 * actual extent data below. */
+		OCFS_ASSERT_RO(el->l_next_free_rec);
+		el->l_recs[el->l_next_free_rec - 1].e_clusters += new_clusters;
+		for(i = 0; i < (num_bhs - 1); i++) {
+			eb = (ocfs2_extent_block *) eb_bhs[i]->b_data;
+			el = &eb->h_list;
+
+			/* finally, make our actual change to the
+			 * intermediate extent blocks. */
+			el->l_recs[el->l_next_free_rec - 1].e_clusters
+					+= new_clusters;
+
+			status = ocfs_journal_dirty(handle, eb_bhs[i]);
 			if (status < 0)
 				LOG_ERROR_STATUS(status);
 		}
+		OCFS_ASSERT(i == (num_bhs - 1));
+		/* note that the leaf block wasn't touched in
+		 * the loop above */
+		eb = (ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data;
+		el = &eb->h_list;
+		OCFS_ASSERT(!el->l_tree_depth);
 	}
 
-	/* Clear all the extent information from File Entry */
-	for (i = 0; i < fel->l_count; i++) {
-		fel->l_recs[i].e_cpos = 0;
-		fel->l_recs[i].e_clusters = 0;
-		fel->l_recs[i].e_blkno = 0;
+	/* yay, we can finally add the actual extent now! */
+	i = el->l_next_free_rec - 1;
+	if (el->l_next_free_rec && ocfs_extent_contig(inode, 
+						      &el->l_recs[i], 
+						      start_blk)) {
+		/* having an empty extent at eof is legal. */
+		if (!el->l_recs[i].e_clusters)
+			el->l_recs[i].e_blkno = start_blk;
+		el->l_recs[i].e_clusters += new_clusters;
+	} else {
+		/* No contiguous record, or no empty record at eof, so
+		 * we add a new one. */
+		OCFS_ASSERT(el->l_next_free_rec < el->l_count);
+		i = el->l_next_free_rec;
+		el->l_recs[i].e_blkno = start_blk;
+		el->l_recs[i].e_clusters = new_clusters;
+		el->l_recs[i].e_cpos = fe->i_clusters;
+		el->l_next_free_rec++;
 	}
 
-	/* Update the File Entry Extent */
+	status = ocfs_journal_dirty(handle, fe_bh);
+	if (status < 0)
+		LOG_ERROR_STATUS(status);
+	if (fe->id2.i_list.l_tree_depth) {
+		status = ocfs_journal_dirty(handle, eb_bhs[num_bhs - 1]);
+		if (status < 0)
+			LOG_ERROR_STATUS(status);
+	}
 
-	LOG_TRACE_ARGS("fe->i_clusters = %u\n", fe->i_clusters);
-	fel->l_recs[0].e_cpos = 0;  /* FIXME: not needed */
-	fel->l_recs[0].e_clusters = fe->i_clusters + new_clusters;
-	eb1 = (ocfs2_extent_block *) bhs[0]->b_data;
-	fel->l_recs[0].e_blkno = eb1->h_blkno;
-	fel->l_next_free_rec = 1;
-	fe->i_last_eb_blk = last_eb_blkno;
+	status = 0;
+bail:
+	if (eb_bhs) {
+		for (i = 0; i < num_bhs; i++)
+			if (eb_bhs[i])
+				brelse(eb_bhs[i]);
+		kfree(eb_bhs);
+	}
 
-finally:
-#warning Leaking bhs here
-	LOG_EXIT_STATUS (status);
-	return (status);
-}				/* ocfs_grow_extent_tree */
+	LOG_EXIT_STATUS(status);
+	return status;
+}
 
 /*
- * ocfs_allocate_extent()
+ * ocfs2_find_branch_target()
  *
- * You need to be holding node_alloc_sem!
+ * Should only be called when there is no space left in any of the
+ * leaf nodes. What we want to do is find the lowest tree depth
+ * non-leaf extent block with room for new records. There are three
+ * valid results of this search:
+ *
+ * 1) a lowest extent block is found, then we pass it back in
+ *    *target_bh and return '0'
+ *
+ * 2) the search fails to find anything, but the dinode has room. We
+ *    pass NULL back in *target_bh, but still return '0'
+ *
+ * 3) the search fails to find anything AND the dinode is full, in
+ *    which case we return > 0
+ *
+ * return status < 0 indicates an error.
  */
-int ocfs_allocate_extent(ocfs_super *osb, struct buffer_head *fe_bh,
-			 ocfs_journal_handle *handle,
-			 u64 blkno, u32 new_clusters,
-			 struct inode *inode,
-			 ocfs2_alloc_context *meta_ac)
+static int ocfs2_find_branch_target(ocfs_super *osb,
+				    struct inode *inode,
+				    struct buffer_head *fe_bh,
+				    struct buffer_head **target_bh)
 {
-	int status = 0;
-	int IncreaseTreeDepth = 0;
-	int k = 0, i;
-	ocfs2_extent_block *eb1 = NULL, *eb2 = NULL;
-	ocfs2_extent_list *fel, *el1 = NULL, *el2 = NULL;
-	struct buffer_head *eb1_bh = NULL, *eb2_bh = NULL;
-	int UpdateParent = 0;
-	u64 parent_blk, new_eb_blkno;
-	ocfs2_dinode *fe = NULL;
+	int status = 0, i;
+	u64 blkno;
+	ocfs2_dinode *fe;
+	ocfs2_extent_block *eb;
+	ocfs2_extent_list  *el;
+	struct buffer_head *bh = NULL;
+	struct buffer_head *lowest_bh = NULL;
 
-	LOG_ENTRY_ARGS("(blkno=%llu, new_clusters=%u, inode=%llu)\n",
-		       blkno, new_clusters, OCFS_I(inode)->ip_blkno);
+	LOG_ENTRY();
 
+	*target_bh = NULL;
+
 	fe = (ocfs2_dinode *) fe_bh->b_data;
-	OCFS_ASSERT(fe);
+	el = &fe->id2.i_list;
 
-	if (!IS_VALID_FILE_ENTRY (fe)) {
-		LOG_ERROR_STATUS(status = -EINVAL);
-		goto finally;
-	}
-	fel = &fe->id2.i_list;
+	while(el->l_tree_depth > 1) {
+		OCFS_ASSERT_RO(el->l_next_free_rec);
+		i = el->l_next_free_rec - 1;
+		blkno = el->l_recs[i].e_blkno;
+		OCFS_ASSERT_RO(blkno);
 
-	if (!fel->l_tree_depth) {
-		LOG_TRACE_ARGS("Using local extents: depth=%d, next_free=%u, l_count=%u\n", 
-			       fel->l_tree_depth, fel->l_next_free_rec, fel->l_count);
-		/* We are still using the local extents of File Entry */
-		if (fel->l_next_free_rec > fel->l_count) {
-			LOG_ERROR_STATUS(status = -EINVAL);
-			goto finally;
+		if (bh) {
+			brelse(bh);
+			bh = NULL;
 		}
 
-		k = fel->l_next_free_rec - 1;
-		if (k >= 0 &&
-		    ocfs_extent_contig(inode, &fel->l_recs[k], blkno)) {
-			/* See if we can merge the extents and just increase the length */
-			LOG_TRACE_ARGS ("Using local extent for extent Entry = %u\n", k);
-			fel->l_recs[k].e_clusters += new_clusters;
-			goto finally;
+		status = ocfs_read_block(osb, blkno, &bh, OCFS_BH_CACHED, 
+					 inode);
+		if (status < 0) {
+			LOG_ERROR_STATUS(status);
+			goto bail;
 		}
 
-		/* We cannot merge try to give him the next extent */
-		k = fel->l_next_free_rec;
-		if (k < fel->l_count) {
-			/* file_off for the new extent will be equal
-			 * to the previous allocation size of file */
-			fel->l_recs[k].e_cpos = fe->i_clusters;
-			fel->l_recs[k].e_clusters = new_clusters;
-			fel->l_recs[k].e_blkno = blkno;
-			fel->l_next_free_rec++;
-			goto finally;
-		}
-		/* We have no more room in the fe, must increase
-		 * tree_depth */
-		IncreaseTreeDepth = 1;
-		goto increase_depth;
-	}
-	LOG_TRACE_STR("Using NON-local extents");
+		eb = (ocfs2_extent_block *) bh->b_data;
+		OCFS_ASSERT_RO(IS_VALID_EXTENT_BLOCK(eb));
+		el = &eb->h_list;
 
-	/*** Nonlocal Extents ***/
-	/* This is now less likely with OCFS2 extent lists */
-	if (fel->l_tree_depth > 4)
-		LOG_ERROR_ARGS ("inode %llu, tree_depth=%u", 
-				OCFS_I(inode)->ip_blkno, fel->l_tree_depth);
-	
-	/* This File is no longer using Local Extents */
-	IncreaseTreeDepth = 0;
-	
-	status = ocfs_read_block(osb, fe->i_last_eb_blk,
-				 &eb1_bh, OCFS_BH_CACHED, inode);
-	if (status < 0) {
-		LOG_ERROR_STATUS (status = -EINVAL);
-		goto finally;
-	}
-	eb1 = (ocfs2_extent_block *) eb1_bh->b_data;
-	el1 = &eb1->h_list;
-	if (!IS_VALID_EXTENT_BLOCK(eb1) ||
-	    el1->l_tree_depth) {
-		LOG_ERROR_STATUS (status = -EINVAL);
-		goto finally;
-	}
-	
-	k = el1->l_next_free_rec - 1;
-	LOG_TRACE_ARGS ("Using local extent for extent Entry = %u\n", k);
-	if (el1->l_next_free_rec < 1)
-		LOG_ERROR_ARGS ("l_next_free_rec=%d",
-				el1->l_next_free_rec);
-	
-	/* See if we can merge the extents and just increase
-	 * the length */
-	/* FIXME: If k < 0, shouldn't we ERROR_RO_FS? */
-	if (k >= 0 && ocfs_extent_contig(inode, &(el1->l_recs[k]), blkno)) {
-		status = ocfs_journal_access(handle, eb1_bh, 
-					     OCFS_JOURNAL_ACCESS_WRITE);
-		if (status < 0) {
-			LOG_ERROR_STATUS (status);
-			goto finally;
+		if (el->l_next_free_rec < el->l_count) {
+			if (lowest_bh)
+				brelse(lowest_bh);
+			lowest_bh = bh;
+			get_bh(lowest_bh);
 		}
-
-		el1->l_recs[k].e_clusters += new_clusters;
-		status = 0;
-		UpdateParent = 1;
-		goto do_update_parent;
 	}
 
-	/* We cannot merge, give him the next extent */
-	k = el1->l_next_free_rec;
-	
-	if (k < el1->l_count) {
-		/* we can just add next extent */
-		status = ocfs_journal_access(handle, eb1_bh, 
-					     OCFS_JOURNAL_ACCESS_WRITE);
-		if (status < 0) {
-			LOG_ERROR_STATUS (status);
-			goto finally;
-		}
+	/* If we didn't find one and the fe doesn't have any room,
+	 * then return '1' */
+	if (!lowest_bh 
+	    && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count))
+		status = 1;
 
-		eb1 = (ocfs2_extent_block *) eb1_bh->b_data;
-		el1 = &eb1->h_list;
+	*target_bh = lowest_bh;
+bail:
+	if (bh)
+		brelse(bh);
 
-		el1->l_recs[k].e_cpos = fe->i_clusters;
-		el1->l_recs[k].e_clusters = new_clusters;
-		el1->l_recs[k].e_blkno = blkno;
-		el1->l_next_free_rec++;
-		OCFS_ASSERT(el1->l_next_free_rec <= el1->l_count);
-		UpdateParent = 1;
-	} else {
-		/* Read the last extent and keep traversing
-		 * upward till we find a free extent or we are
-		 * at the top and need to create another
-		 * level. */
-		if (fel->l_tree_depth > 1)
-			parent_blk = eb1->h_parent_blk;
-		else
-			parent_blk = 0;
+	LOG_EXIT_STATUS(status);
+	return status;
+}
 
-		for (i = 1; i < fel->l_tree_depth; i++) {
-			status = ocfs_read_block(osb,
-						 parent_blk,
-						 &eb2_bh,
-						 OCFS_BH_CACHED,
-						 inode);
-			if (status < 0) {
-				LOG_ERROR_STATUS (status);
-				goto finally;
-			}
-			eb2 = (ocfs2_extent_block *) eb2_bh->b_data;
-			el2 = &eb2->h_list;
-			if (!IS_VALID_EXTENT_BLOCK(eb2) ||
-			    !el2->l_tree_depth) {
-				LOG_ERROR_STATUS (status = -EINVAL);
-				goto finally;
-			}
-			
-			if ((el2->l_tree_depth != i) ||
-			    (el2->l_next_free_rec > el2->l_count)) {
-				LOG_ERROR_STATUS(status = -EINVAL);
-				goto finally;
-			}
-			
-			if (el2->l_next_free_rec != el2->l_count)
-				break;
-			
-			parent_blk = eb2->h_parent_blk;
-			brelse(eb2_bh);
-			eb2 = NULL;
-			el2 = NULL;
-			eb2_bh = NULL;
-		} /* for (i = 1; i < fe->i_tree_depth; i++) */
+/* the caller needs to update fe->i_clusters */
+int ocfs2_insert_extent(ocfs_super *osb, 
+			ocfs_journal_handle *handle,
+			struct inode *inode,
+			struct buffer_head *fe_bh,
+			u64 start_blk, 
+			u32 new_clusters,
+			ocfs2_alloc_context *meta_ac)
+{
+	int status, i, shift;
+	struct buffer_head *last_eb_bh = NULL;
+	struct buffer_head *bh = NULL;
+	ocfs2_dinode *fe;
+	ocfs2_extent_block *eb;
+	ocfs2_extent_list  *el;
 
-		if (eb2) {
-			eb2 = NULL;
-			el2 = NULL;
-			/* we may still need the bh so don't brelse */
-		}
+	LOG_ENTRY();
 
-		/* if we got to the top, then we're at the FE. Check
-		 * if the FE is full -- if so, then we need to
-		 * increase the tree_depth. */
-		if ((i == fel->l_tree_depth) &&
-		    (fel->l_next_free_rec == fel->l_count)) {
-			IncreaseTreeDepth = 1;
-			goto increase_depth;
-		}
+	LOG_TRACE_ARGS("add %u clusters starting at block %llu to inode "
+		       "%llu\n",new_clusters, start_blk, 
+		       OCFS_I(inode)->ip_blkno);
 
-		/* ok, we need to add a branch. pass in NULL
-		 * if we need a whole branch, otherwise the
-		 * extent which needs the new leaf */
-		status = ocfs_allocate_new_data_node(osb, fe, blkno,
-						     new_clusters,
-						     eb2_bh,
-						     &new_eb_blkno,
-						     handle, inode, meta_ac);
-		if (status < 0) {
-			LOG_ERROR_STATUS(status);
-			goto finally;
-		}
+	fe = (ocfs2_dinode *) fe_bh->b_data;
+	el = &fe->id2.i_list;
 
-		status = ocfs_journal_access(handle, eb1_bh, 
-					     OCFS_JOURNAL_ACCESS_WRITE);
+	if (el->l_tree_depth) {
+		/* jump to end of tree */
+		status = ocfs_read_block(osb, fe->i_last_eb_blk, &last_eb_bh,
+					 OCFS_BH_CACHED, inode);
 		if (status < 0) {
-			LOG_ERROR_STATUS (status);
-			goto finally;
+			LOG_EXIT_STATUS(status);
+			goto bail;
 		}
-
-		eb1 = (ocfs2_extent_block *) eb1_bh->b_data;
-
-		eb1->h_next_leaf_blk = fe->i_last_eb_blk = new_eb_blkno;
+		eb = (ocfs2_extent_block *) last_eb_bh->b_data;
+		el = &(eb->h_list);
 	}
-	
-do_update_parent:
-	/* before we put the variable away, save off parent_blk as
-	 * we may need it if we update parent */
-	parent_blk = eb1->h_parent_blk;
-	
-	/* gotta put it away to write it ;) */
-	eb1 = NULL;
-	status = ocfs_journal_dirty(handle, eb1_bh);
-	if (status < 0) {
-		LOG_ERROR_STATUS(status);
-		goto finally;
-	}
-	
-	if (!IncreaseTreeDepth && UpdateParent) {
-		for (i = 1; i < fel->l_tree_depth; i++) {
-			
-			/* next two if's are for loop around */
-			if (eb2_bh) {
-				if (eb2) {
-					eb2 = NULL;
-					el2 = NULL;
-				}
-				brelse(eb2_bh);
-				eb2_bh = NULL;
-			}
-			/* TODO: Can we do a cached read here? */
-			status = ocfs_read_block(osb,
-						 parent_blk,
-						 &eb2_bh, 
-						 OCFS_BH_CACHED,
-						 inode);
-			if (status < 0) {
-				LOG_ERROR_STATUS (status);
-				goto finally;
-			}
 
-			status = ocfs_journal_access(handle, eb2_bh,
-						     OCFS_JOURNAL_ACCESS_WRITE);
-			if (status < 0) {
-				LOG_ERROR_STATUS(status);
-				goto finally;
-			}
+	/* Can we allocate without adding/shifting tree bits? */
+	i = el->l_next_free_rec - 1;
+	if (!el->l_next_free_rec
+	    || (el->l_next_free_rec < el->l_count)
+	    || ocfs_extent_contig(inode, &el->l_recs[i], start_blk))
+		goto out_add;
 
-			eb2 = (ocfs2_extent_block *) eb2_bh->b_data;
-			el2 = &eb2->h_list;
-			if (!IS_VALID_EXTENT_BLOCK(eb2) ||
-			    !el2->l_tree_depth) {
-				LOG_ERROR_STATUS (status = -EINVAL);
-				goto finally;
-			}
-			
-			if (el2->l_next_free_rec == 0) {
-				LOG_ERROR_STATUS (status = -EINVAL);
-				goto finally;
-			}
-			
-			k = el2->l_next_free_rec - 1;
-			
-			el2->l_recs[k].e_clusters += new_clusters;
-			
-			/* gonna need it if we loop around */
-			parent_blk = eb2->h_parent_blk;
-			
-			eb2 = NULL;
-			el2 = NULL;
+	LOG_TRACE_STR("ocfs2_allocate_extent: couldn't do a simple add, "
+		      "traversing tree now.\n");
 
-			status = ocfs_journal_dirty(handle, eb2_bh);
-			if (status < 0) {
-				goto finally;
-			}
-		}
-		
-		k = fel->l_next_free_rec - 1;
-		
-		fel->l_recs[k].e_clusters += new_clusters;
-	}
-	
-	if (status < 0) {
+	shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
+	if (shift < 0) {
+		status = shift;
 		LOG_ERROR_STATUS(status);
-		goto finally;
+		goto bail;
 	}
 
-increase_depth:
-	if (IncreaseTreeDepth) {
-		fe = NULL;
-		if (eb1_bh && eb1) {
-			eb1 = NULL;
-			el1 = NULL;
-		}
-		if (eb2_bh && eb2) {
-			eb2 = NULL;
-			el2 = NULL;
-		}	
+	/* We traveled all the way to the bottom and found nothing. */
+	if (shift) {
+		/* if we hit a leaf, it had better be full :) */
+		OCFS_ASSERT(el->l_next_free_rec == el->l_count);
+		OCFS_ASSERT(!bh);
+		LOG_TRACE_ARGS("ocfs2_allocate_extent: need to shift tree "
+			       "depth (current = %u)\n", 
+			       fe->id2.i_list.l_tree_depth);
 
-		status = ocfs_grow_extent_tree(osb, fe_bh, handle,
-					       blkno, new_clusters,
-					       inode, meta_ac);
+		/* ocfs2_shift_tree_depth will return us a buffer with
+		 * the new extent block (so we can pass that to
+		 * ocfs2_add_branch). */
+		status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh, 
+						meta_ac, &bh);
 		if (status < 0) {
 			LOG_ERROR_STATUS(status);
-			goto finally;
+			goto bail;
 		}
+		/* Special case: we have room now if we shifted from
+		 * tree_depth 0 */
+		if (fe->id2.i_list.l_tree_depth == 1)
+			goto out_add;
 	}
 
-finally:
-	if ((status == 0) && (inode != NULL)) {
-		__s64 Vbo = 0;
-		__s64 Lbo = 0;
-
-		/* Add this Entry in to extent map. If a new mapping
-		 * run to be added overlaps an existing mapping run,
-		 * ocfs_add_extent_map_entry merges them into a single
-		 * mapping run.So just adding this entry will be
-		 * fine. */
-		if (fe == NULL)
-			fe = (ocfs2_dinode *) fe_bh->b_data;
-
-		Vbo = (u64)fe->i_clusters << osb->s_clustersize_bits;
-		Lbo = blkno << osb->sb->s_blocksize_bits;
-
-		/* Add the Entry to the extent map list */
-		if (!ocfs_add_extent_map_entry(osb, &OCFS_I(inode)->ip_ext_map,
-					       Vbo, Lbo,
-					       (u64)new_clusters << osb->s_clustersize_bits))
-			LOG_ERROR_STATUS (status = -EINVAL);
+	/* call ocfs2_add_branch to add the final part of the tree with
+	 * the new data. */
+	LOG_TRACE_ARGS("ocfs2_allocate_extent: add branch. bh = %p\n", bh);
+	status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh, 
+				  meta_ac);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
 	}
 
-	/* Buffers are always null if they haven't been mapped and
-	 * non-null if they have. 
-	 * Buffer heads are non-NULL if they need to be brelsed */
-	if (eb1_bh) {
-		brelse(eb1_bh);
-	}
+out_add:
+	/* Finally, we can add clusters. */
+	status = ocfs2_do_insert_extent(osb, handle, inode, fe_bh, 
+					start_blk, new_clusters);
+	if (status < 0)
+		LOG_ERROR_STATUS(status);
 
-	if (eb2_bh) {
-		brelse(eb2_bh);
-	}
+bail:
+	if (bh)
+		brelse(bh);
 
-	LOG_EXIT_STATUS (status);
-	return (status);
-}				/* ocfs_allocate_extent */
+	if (last_eb_bh)
+		brelse(last_eb_bh);
 
+	LOG_EXIT_STATUS(status);
+	return status;
+}
+
 /*
  * ocfs_lookup_file_allocation()
  *

Modified: trunk/src/alloc.h
===================================================================
--- trunk/src/alloc.h	2004-10-12 01:56:54 UTC (rev 1562)
+++ trunk/src/alloc.h	2004-10-12 02:01:09 UTC (rev 1563)
@@ -30,15 +30,14 @@
 #define OCFS2_ALLOC_H
 
 struct _ocfs2_alloc_context;
-int ocfs_allocate_extent(ocfs_super *osb, struct buffer_head *fe_bh,
-			 ocfs_journal_handle *handle,
-			 u64 blkno, u32 new_clusters,
-			 struct inode *inode, 
-			 struct _ocfs2_alloc_context *meta_ac);
-int ocfs_free_extents_for_truncate(ocfs_super *osb,
-				   ocfs2_dinode *fe,
-				   ocfs_journal_handle *handle,
-				   struct inode *inode);
+int ocfs2_insert_extent(ocfs_super *osb, 
+			ocfs_journal_handle *handle,
+			struct inode *inode,
+			struct buffer_head *fe_bh,
+			u64 blkno, 
+			u32 new_clusters,
+			struct _ocfs2_alloc_context *meta_ac);
+
 int ocfs_get_leaf_extent(ocfs_super *osb, ocfs2_dinode *fe,
 			 __s64 Vbo, struct buffer_head **data_extent_bh,
 			 struct inode *inode);

Modified: trunk/src/dir.c
===================================================================
--- trunk/src/dir.c	2004-10-12 01:56:54 UTC (rev 1562)
+++ trunk/src/dir.c	2004-10-12 02:01:09 UTC (rev 1563)
@@ -463,7 +463,6 @@
 		LOG_ERROR_STATUS(status = -ENOMEM);
 		goto bail;
 	}
-	ocfs_handle_set_may_abort(handle, 1);
 
 	status = ocfs_do_extend_dir(osb->sb, handle, dir, parent_fe_bh, 
 				    data_ac, meta_ac, &new_bh);
@@ -501,12 +500,9 @@
 	*new_de_bh = new_bh;
 	get_bh(*new_de_bh);
 bail:
-	if (handle) {
-		if (status < 0)
-			ocfs_abort_trans(handle);
-		else 
-			ocfs_commit_trans(handle);
-	}
+	if (handle)
+		ocfs_commit_trans(handle);
+
 	if (data_ac)
 		ocfs_free_alloc_context(data_ac);
 	if (meta_ac)

Modified: trunk/src/file.c
===================================================================
--- trunk/src/file.c	2004-10-12 01:56:54 UTC (rev 1562)
+++ trunk/src/file.c	2004-10-12 02:01:09 UTC (rev 1563)
@@ -1116,7 +1116,7 @@
 
 	OCFS_ASSERT(num_bits <= clusters_to_add);
 
-	/* reserve our write early -- allocate_extent may update the inode */
+	/* reserve our write early -- insert_extent may update the inode */
 	status = ocfs_journal_access(handle, fe_bh, OCFS_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		LOG_ERROR_STATUS(status);
@@ -1126,8 +1126,8 @@
 	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
 	LOG_TRACE_ARGS("Allocating %u clusters at block %u for inode %llu\n",
 		       num_bits, bit_off, OCFS_I(inode)->ip_blkno);
-	status = ocfs_allocate_extent(osb, fe_bh, handle, block, num_bits, 
-				      inode, meta_ac);
+	status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block, 
+				       num_bits, meta_ac);
 	if (status < 0) {
 		LOG_ERROR_STATUS(status);
 		goto leave;
@@ -1286,7 +1286,6 @@
 		LOG_ERROR_STATUS(status = -ENOMEM);
 		goto leave;
 	}
-	ocfs_handle_set_may_abort(handle, 1);
 
 restarted_transaction:
 	/* reserve a write to the file entry early on - that we if we
@@ -1345,8 +1344,8 @@
 							   clusters_to_add);
 			status = ocfs_extend_trans(handle, credits);
 			if (status < 0) {
-				/* handle still has to be committed /
-				 * aborted at this point. */
+				/* handle still has to be committed at
+				 * this point. */
 				LOG_ERROR_STATUS(status = -ENOMEM);
 				goto leave;
 			}
@@ -1379,10 +1378,7 @@
 
 leave:
 	if (handle) {
-		if (status < 0)
-			ocfs_abort_trans(handle);
-		else 
-			ocfs_commit_trans(handle);
+		ocfs_commit_trans(handle);
 		handle = NULL;
 	}
 	if (data_ac) {

Modified: trunk/src/ocfs2_fs.h
===================================================================
--- trunk/src/ocfs2_fs.h	2004-10-12 01:56:54 UTC (rev 1562)
+++ trunk/src/ocfs2_fs.h	2004-10-12 02:01:09 UTC (rev 1563)
@@ -272,9 +272,7 @@
 					   block group */
 	__u32 h_reserved2;
 	__u64 h_blkno;			/* Offset on disk, in blocks */
-/*20*/	__u64 h_parent_blk;		/* Offset on disk, in blocks,
-					   of this block's parent in the
-					   tree */
+/*20*/	__u64 h_reserved3;
 	__u64 h_next_leaf_blk;		/* Offset on disk, in blocks,
 					   of next leaf header pointing
 					   to data */



More information about the Ocfs2-commits mailing list