[Ocfs2-commits] mfasheh commits r1549 - trunk/src

svn-commits at oss.oracle.com svn-commits at oss.oracle.com
Wed Oct 6 20:38:02 CDT 2004


Author: mfasheh
Date: 2004-10-06 20:38:01 -0500 (Wed, 06 Oct 2004)
New Revision: 1549

Modified:
   trunk/src/alloc.c
   trunk/src/alloc.h
   trunk/src/file.c
   trunk/src/file.h
   trunk/src/inode.c
   trunk/src/ocfs_journal.h
Log:
* redo our truncate code. It's about a million times less embarrassing now,
  though ocfs_do_truncate could probably be cleaned up a bit.
  - we don't handle putting / removing truncating files in the orphan dir
    yet, but the placeholders are all left there.
  - truncate is now restartable, rolls forward, and can deal with a tree
    of any depth :)



Modified: trunk/src/alloc.c
===================================================================
--- trunk/src/alloc.c	2004-10-05 22:33:26 UTC (rev 1548)
+++ trunk/src/alloc.c	2004-10-07 01:38:01 UTC (rev 1549)
@@ -53,11 +53,6 @@
 /* Tracing */
 #define OCFS_DEBUG_CONTEXT    OCFS_DEBUG_CONTEXT_ALLOC
 
-static int ocfs_kill_this_tree(ocfs_super *osb,
-			       struct buffer_head *extent_grp_bh, 
-			       ocfs_journal_handle *handle,
-			       struct inode *inode);
-
 static int ocfs_create_new_meta_bhs(ocfs_super *osb, 
 				    ocfs_journal_handle *handle,
 				    struct inode *inode,
@@ -83,26 +78,6 @@
 				 struct inode *inode,
 				 ocfs2_alloc_context *meta_ac);
 
-static int _squish_extent_entries(ocfs_super *osb,
-				  ocfs2_extent_rec *extarr, 
-				  __u16 *freeExtent, 
-				  ocfs_journal_handle *handle,
-				  u32 num_clusters, int flag, 
-				  struct inode *inode);
-
-static int ocfs_fix_extent_block(ocfs_super *osb,
-				 struct buffer_head *eb_bh,
-				 struct inode *inode);
-
-static int ocfs_split_this_tree(ocfs_super * osb, 
-				struct buffer_head *eb_bh, 
-				ocfs_journal_handle *handle,
-				ocfs2_dinode *fe, 
-				struct inode *inode);
-
-static int ocfs_update_last_eb_blk(ocfs_super *osb, ocfs2_dinode *fe,
-				   struct inode *inode);
-
 static int ocfs_free_disk_bitmap (ocfs_super * osb, ocfs_free_rec *free_log);
 
 static int ocfs_extent_contig(struct inode *inode, ocfs2_extent_rec *ext,
@@ -114,7 +89,27 @@
 				       u32 min_bits,
 				       u32 *bit_off,
 				       u32 *num_bits);
+static int ocfs_free_clusters(ocfs_super *osb,
+			      ocfs_journal_handle *handle,
+			      struct inode *bitmap_inode,
+			      struct buffer_head *bitmap_bh,
+			      u64 start_blk,
+			      unsigned int num_clusters);
 
+static int ocfs_find_new_last_ext_blk(ocfs_super *osb,
+				      struct inode *inode,
+				      ocfs2_dinode *fe,
+				      unsigned int new_i_clusters,
+				      struct buffer_head *old_last_eb,
+				      struct buffer_head **new_last_eb);
+
+static int ocfs_do_truncate(ocfs_super *osb, 
+			    unsigned int clusters_to_del,
+			    struct inode *inode,
+			    struct buffer_head *fe_bh,
+			    struct buffer_head *old_last_eb_bh,
+			    ocfs2_truncate_context *tc);
+
 static int ocfs_extent_contig(struct inode *inode, ocfs2_extent_rec *ext,
 			      u64 blkno)
 {
@@ -455,6 +450,65 @@
 	return status;
 }				/* ocfs_free_disk_bitmap */
 
+/*
+ * Free num_clusters clusters starting at start_blk: clear their bits in
+ * the global cluster bitmap and subtract them from the i_used count of
+ * the dinode in bitmap_bh. Returns 0 on success, negative status on error.
+ */
+static int ocfs_free_clusters(ocfs_super *osb,
+			      ocfs_journal_handle *handle,
+			      struct inode *bitmap_inode,
+			      struct buffer_head *bitmap_bh,
+			      u64 start_blk,
+			      unsigned int num_clusters)
+{
+	ocfs_alloc_bm *bitmap = &osb->cluster_bitmap;
+	int status;
+	unsigned int start_cluster, bitmap_blocks, bitmap_start;
+	ocfs2_dinode *fe;
+
+	LOG_ENTRY();
+
+	start_cluster = ocfs_blocks_to_clusters(osb->sb, start_blk);
+	bitmap_blocks = ocfs_bitmap_blocks_affected(osb->sb,
+						    start_cluster,
+						    num_clusters,
+						    &bitmap_start);
+
+	LOG_TRACE_ARGS("want to free %u clusters starting at block %llu\n",
+		       num_clusters, start_blk);
+	LOG_TRACE_ARGS("bitmap_start = %u, bitmap_blkno = %llu, bitmap_blocks "
+		       "= %u\n", bitmap_start, osb->bitmap_blkno,
+		       bitmap_blocks);
+	status = ocfs_read_blocks(osb, (osb->bitmap_blkno + bitmap_start),
+				  bitmap_blocks, &bitmap->chunk[bitmap_start],
+				  OCFS_BH_CACHED, bitmap_inode);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	status = ocfs_journal_access(handle, bitmap_bh,
+				     OCFS_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	fe = (ocfs2_dinode *) bitmap_bh->b_data;
+	ocfs_clear_bits(osb->sb, handle, bitmap, start_cluster, num_clusters);
+	fe->id1.bitmap1.i_used -= num_clusters;
+
+	status = ocfs_journal_dirty(handle, bitmap_bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	status = 0;
+bail:
+	LOG_EXIT_STATUS(status);
+	return status;	/* callers must see read/journal failures */
+}
+
 /*
  * How many free extents have we got before we need more meta data?
  */
@@ -486,7 +540,6 @@
 	OCFS_ASSERT(el->l_tree_depth == 0);
 
 	retval = el->l_count - el->l_next_free_rec;
-
 bail:
 	if (eb_bh)
 		brelse(eb_bh);
@@ -1370,1014 +1423,6 @@
 }				/* ocfs_allocate_extent */
 
 /*
- * _squish_extent_entries()
- * FileSize is the allocated size of the file after the truncate.
- * 'flag' seems to be an indicator that (if true) tells us that we already know
- *	we're gonna have to clear out all of extarr.
- */
-static int _squish_extent_entries(ocfs_super *osb,
-				  ocfs2_extent_rec *extarr,
-				  __u16 *freeExtent,
-				  ocfs_journal_handle *handle,
-				  u32 num_clusters, int flag,
-				  struct inode *inode) 
-{
-	int status = 0;
-	int FirstTime = 1;
-	__u64 FileSize = (u64)num_clusters << osb->s_clustersize_bits; /* FIXME get rid of this and use num_clusters!! */
-	ocfs2_extent_rec *ext; 
-	__u32 i, csize = osb->s_clustersize_bits,
-	    numBitsAllocated = 0, bitmapOffset = 0, 
-	    firstfree = *freeExtent;
-	__u64 bytes, foff, doff, 
-	    diskOffsetTobeFreed, lengthTobeFreed = 0, 
-	    actualSize = 0, origLength = 0;
-
-	LOG_ENTRY_ARGS("(*freeExtent = %u, FileSize = %llu, flag = %d)\n", 
-		       *freeExtent, FileSize, flag);
-
-	firstfree = *freeExtent;
-
-	/* loop through the used ocfs2_extent_recs */
-	for (i = 0; i < firstfree; i++) { 
-		ext = &(extarr[i]); 
-		bytes = (u64)ext->e_clusters << osb->s_clustersize_bits;
-		foff = (u64)ext->e_cpos << osb->s_clustersize_bits; 
-		doff = ext->e_blkno << osb->sb->s_blocksize_bits; 
-		actualSize = (bytes + foff);
-		if (flag || actualSize > FileSize) { 
-			if (flag || foff >= FileSize) { 
-				if (!flag && FirstTime) { 
-					*freeExtent = i; 
-					FirstTime = 0; 
-				} 
-				numBitsAllocated = ext->e_clusters; 
-				bitmapOffset = (__u32) (doff >> csize); 
-				ext->e_clusters = ext->e_blkno =
-					ext->e_cpos = 0; 
-			} else { 
-				if (FirstTime) { 
-					*freeExtent = i + 1; 
-					FirstTime = 0; 
-				} 
-				origLength = bytes; 
-				bytes = FileSize - foff; 
-				ext->e_clusters =
-					(u32)(bytes >> osb->s_clustersize_bits);
-				lengthTobeFreed = origLength - bytes; 
-				if (lengthTobeFreed == 0) { 
-					continue; 
-				} 
-				numBitsAllocated = (__u32) (lengthTobeFreed >> csize); 
-				diskOffsetTobeFreed = doff + bytes; 
-				bitmapOffset = (__u32) (diskOffsetTobeFreed >> csize); 
-			} 
-			status = ocfs_handle_add_commit_bits(handle, 
-							   numBitsAllocated, 
-							   bitmapOffset, -1, 0, 
-							   DISK_ALLOC_VOLUME); 
-			if (status < 0) {
-				LOG_ERROR_STATUS (status);
-				break;
-			}
-		}
-	}
-
-	LOG_EXIT_STATUS (status);
-	return status; 
-}				/* _squish_extent_entries */
-
-/* used by ocfs_kill_this_tree and ocfs_split_this_tree */
-/* This value needs to be removed in a future version and set to
- * tree_depth + 1, dynamically */
-#define OCFS_TREE_STACK_SIZE 8
-
-/*
- * ocfs_kill_this_tree
- *
- * Given an extent_group (can be a DAT or header), delete everything,
- * including itself, it's children, and any data blocks they point to.
- * Works fine with any tree_depth (up to 4, in which case we'd need
- * more stack space)
- *
- * extent_grp_bh will be unchanged, though it will be marked for
- * deletion in free_head.
- */
-
-/*
- * We can't recurse, so we keep a simple stack of ocfs2_extent_blocks.
- */
-static int ocfs_kill_this_tree(ocfs_super *osb,
-			       struct buffer_head *extent_grp_bh,
-			       ocfs_journal_handle *handle,
-			       struct inode *inode) 
-{
-	int status = -EFAIL;
-	int i;
-	__u32 victim;
-	u32 num_clusters = 0;
-	__u32 bitmap_offset = 0;
-	ocfs2_extent_rec *ext;
-	struct buffer_head *tmp_bh = NULL;
-	char * stack[OCFS_TREE_STACK_SIZE];
-	ocfs2_extent_block *cur_eb; /* convenience, points to TOS */
-	ocfs2_extent_list *cur_el;
-	int tos = 0;
-
-	LOG_ENTRY();
-
-	for (i =0; i < OCFS_TREE_STACK_SIZE; i++)
-		stack[i] = NULL;
-
-	stack[tos] = kmalloc(osb->sb->s_blocksize, GFP_KERNEL);
-	memcpy(stack[tos], extent_grp_bh->b_data, osb->sb->s_blocksize);
-
-	do {
-		cur_eb = (ocfs2_extent_block *) stack[tos];
-		if (!IS_VALID_EXTENT_BLOCK(cur_eb)) {
-			LOG_ERROR_STR("Invalid extent block!");
-			goto bail;
-		}
-
-		cur_el = &cur_eb->h_list;
-
-		if (!cur_el->l_tree_depth) {
-			LOG_TRACE_ARGS("found some data to free (%llu)\n", cur_eb->h_blkno);
-			for(i = 0; i < cur_el->l_next_free_rec; i++) {
-				/* Free the data associated with each header */
-				ext = &cur_el->l_recs[i];
-				num_clusters = ext->e_clusters;
-				bitmap_offset =
-					(u32)((ext->e_blkno << osb->sb->s_blocksize_bits) >> osb->s_clustersize_bits);
-				status = ocfs_handle_add_commit_bits(handle, num_clusters, bitmap_offset, -1, 0, DISK_ALLOC_VOLUME);
-				if (status < 0) {
-					LOG_ERROR_STATUS (status);
-					goto bail;
-				}
-			}
-		} else {
-			/* Ok, we're a header. */
-
-			/* Did we already kill all his children, or
-			 * are they already dead? */
-			if (cur_el->l_next_free_rec == 0) {
-				LOG_TRACE_ARGS("Popping this header (%llu)\n", cur_eb->h_blkno);
-				goto free_meta;
-			}
-
-			/* We're gonna read in our last used extent
-			 * and put him at the top of the stack. We
-			 * also update our l_next_free_rec so that next
-			 * time we read in the next to last one and so
-			 * on until we've finished all of them
-			 */
-
-			victim = cur_el->l_next_free_rec - 1;
-
-			/* should already be null, but we can do this
-			 * just in case. */
-			stack[tos] = kmalloc(osb->sb->s_blocksize,
-					     GFP_KERNEL);
-
-			status = ocfs_read_block(osb,
-						 cur_el->l_recs[victim].e_blkno,
-						 &tmp_bh, 
-						 OCFS_BH_CACHED,
-						 inode);
-			if (status < 0) {
-				LOG_ERROR_STATUS (status);
-				goto bail;
-			}
-
-			cur_el->l_next_free_rec--;
-			cur_eb = NULL;
-			cur_el = NULL;
-			tos++;
-
-			memcpy(stack[tos], tmp_bh->b_data,
-			       osb->sb->s_blocksize);
-			brelse(tmp_bh);
-			tmp_bh = NULL;
-			/* We only want to free on our way back up the tree */
-			continue;
-		}
-
-free_meta:
-		/* Free the metadata associated with this extent group */
-		status = ocfs_handle_add_commit_bits(handle, 1,
-						     cur_eb->h_suballoc_bit,
-						     cur_eb->h_suballoc_node,
-						     cur_eb->h_blkno,
-						     DISK_ALLOC_EXTENT_NODE);
-		if (status < 0) {
-			LOG_ERROR_STATUS (status);
-			goto bail;
-		}
-		/* Pop one off the stack */
-		kfree(stack[tos]);
-		stack[tos] = NULL;
-		cur_eb = NULL;
-		cur_el = NULL;
-		tos--;
-	} while (tos >= 0);
-
-	status = 0;
-bail:
-	for(i = 0; i < OCFS_TREE_STACK_SIZE; i++)
-		if (stack[i])
-			kfree(stack[i]);
-
-	LOG_EXIT_STATUS (status);
-	return(status);
-} /* ocfs_kill_this_tree */
-
-
-static int ocfs_fix_extent_block(ocfs_super *osb,
-				 struct buffer_head *eb_bh,
-				 struct inode *inode) 
-{
-	ocfs2_extent_block *eb;
-	ocfs2_extent_list *el;
-	int status = -EFAIL;
-	int i;
-
-	LOG_ENTRY();
-
-	if (!eb_bh) {
-		LOG_ERROR_STR("Invalid extent block bh (NULL)!");
-		goto bail;
-	}
-
-	eb = (ocfs2_extent_block *) eb_bh->b_data;
-
-	if (!IS_VALID_EXTENT_BLOCK(eb)) {
-		LOG_ERROR_STR("Invalid extent block!");
-		goto bail;
-	}
-	
-	el = &eb->h_list;
-	
-	for(i = el->l_next_free_rec; i < el->l_count; i++) {
-		el->l_recs[i].e_clusters = 0;
-		el->l_recs[i].e_blkno = 0;
-		el->l_recs[i].e_cpos = 0;
-	}
-
-	status = 0;
-bail:
-	LOG_EXIT_STATUS (status);
-	return(status);
-}
-
-
-/*
- * ocfs_split_this_tree 
- *
- * Given an extent_group (DAT or HDR) takes the new alloc_size from fe
- * and splits this tree into two parts, one of which is deleted.
- *
- * TODO: This function can likely be combined with the above, we will try to
- * write it so that it can.
- * TODO: This function should be split up into a couple smaller ones.
- */
-static int ocfs_split_this_tree(ocfs_super *osb,
-				struct buffer_head *eb_bh,
-				ocfs_journal_handle *handle,
-				ocfs2_dinode *fe,
-				struct inode *inode) 
-{
-	int status = -EFAIL;
-	ocfs2_extent_rec *rec;
-	struct buffer_head * bh_stack[OCFS_TREE_STACK_SIZE];
-	ocfs2_extent_block *alloc_eb = NULL;  /* convenience, points to TOS */
-	ocfs2_extent_block *eb;
-	ocfs2_extent_list *el;
-	struct buffer_head *tmp_bh = NULL, *tmp_bh2 = NULL;
-	int tos = 0;
-	int i, victim;
-	__u64 bytes, doff, orig_bytes;
-	__u64 total_bytes;  /* FIXME needs to be clusters!!! */
-	__u32 num_clusters, bitmap_offset;
-	int done = 0;
-	int depth = fe->id2.i_list.l_tree_depth;
-	int needs_brelse = 0;
-
-	LOG_ENTRY();
-
-	/* This is a similar hack to the one below, untested for depth
-	   = 4 files because I can't recreate one. */
-	if (depth == 4) {
-		LOG_ERROR_STR("Truncating file with tree_depth 4, this is not tested and may be unsafe!");
-		LOG_TRACE_STR("Found a tree_depth 4 tree, trimming it.\n");
-
-		status = ocfs_journal_access(handle, eb_bh, 
-					     OCFS_JOURNAL_ACCESS_WRITE);
-		if (status < 0) {
-			LOG_ERROR_STATUS(status);
-			goto bail;
-		}
-		eb = (ocfs2_extent_block *) eb_bh->b_data;
-		el = &eb->h_list;
-		for (i = (el->l_next_free_rec - 1); i >= 0; i--) {
-			rec = &el->l_recs[i];
-
-			if (tmp_bh2)
-				brelse(tmp_bh2);
-			tmp_bh2 = NULL;
-			status = ocfs_read_block(osb,
-						 rec->e_blkno,
-						 &tmp_bh2, 
-						 OCFS_BH_CACHED, inode);
-			if (status < 0) {
-				eb = NULL;
-				brelse(tmp_bh2);
-				LOG_ERROR_STATUS (status);
-				goto bail;
-			}
-			
-			if (rec->e_cpos >= fe->i_clusters) {
-				/* Trim this whole subtree */
-				status = ocfs_kill_this_tree(osb,
-							     tmp_bh2, 
-							     handle,
-							     inode);
-				if (status < 0) {
-					eb = NULL;
-					el = NULL;
-					brelse(tmp_bh2);
-					LOG_ERROR_STATUS (status);
-					goto bail;
-				}
-				rec->e_cpos = 0;
-				rec->e_blkno = 0;
-				rec->e_clusters = 0;
-				el->l_next_free_rec = i;
-			} else  { /* This is the one we want to split. */
-				rec->e_clusters =
-					fe->i_clusters - rec->e_cpos;
-				break;
-			}
-		}
-
-		/* Write out our new top of the tree duder */
-		eb = NULL;
-		el = NULL;
-
-		status = ocfs_journal_dirty(handle, eb_bh);
-		if (status < 0) {
-			LOG_ERROR_STATUS(status);
-			goto bail;
-		}
-
-		/* Make our new TOS the header we want to split. */
-		if (tmp_bh2 == NULL) {
-			LOG_ERROR_STATUS(-EFAIL);
-			goto bail;
-		}
-		eb_bh = tmp_bh2;
-
-		/* We want to do the next bit of stuff too */
-		depth = 3;
-		needs_brelse = 1;
-	}
-
-	/* This is a hack, but i have little time to make this function right*/
-	/* get rid of everything from the top level HDR that we can, then
-	   proceeed as if we're tree_depth 2 (which we know works) */
-	if (depth == 3) {
-		LOG_TRACE_STR("Found a tree_depth 3 tree, trimming it.\n");
-
-		status = ocfs_journal_access(handle, eb_bh, 
-					     OCFS_JOURNAL_ACCESS_WRITE);
-		if (status < 0) {
-			LOG_ERROR_STATUS(status);
-			goto bail;
-		}
-
-		eb = (ocfs2_extent_block *) eb_bh->b_data;
-		el = &eb->h_list;
-		for(i = (el->l_next_free_rec - 1); i >= 0; i--) {
-			rec = &el->l_recs[i];
-
-			if (tmp_bh)
-				brelse(tmp_bh);
-			tmp_bh = NULL;
-
-			status = ocfs_read_block(osb,
-						 rec->e_blkno,
-						 &tmp_bh, 
-						 OCFS_BH_CACHED,
-						 inode);
-			if (status < 0) {
-				eb = NULL;
-				el = NULL;
-				brelse(tmp_bh);
-				LOG_ERROR_STATUS (status);
-				goto bail;
-			}
-			
-			if (rec->e_cpos >= fe->i_clusters) {
-				/* Trim this whole subtree */
-				status = ocfs_kill_this_tree(osb,
-							     tmp_bh, 
-							     handle,
-							     inode);
-				if (status < 0) {
-					eb = NULL;
-					el = NULL;
-					brelse(tmp_bh);
-					LOG_ERROR_STATUS (status);
-					goto bail;
-				}
-				rec->e_cpos = 0;
-				rec->e_blkno = 0;
-				rec->e_clusters = 0;
-				el->l_next_free_rec = i;
-			} else  { /* This is the one we want to split. */
-				rec->e_clusters = 
-					fe->i_clusters - rec->e_cpos;
-				break;
-			}
-		}
-		/* Write out our new top of the tree duder */
-		eb = NULL;
-		el = NULL;
-
-		status = ocfs_journal_dirty(handle, eb_bh);
-		if (status < 0) {
-			LOG_ERROR_STATUS(status);
-			goto bail;
-		}
-
-		/* Make our new TOS the header we want to split. */
-		if (tmp_bh == NULL) {
-			LOG_ERROR_STATUS(-EFAIL);
-			goto bail;
-		}
-
-		eb_bh = tmp_bh;
-
-		/* Right now, we don't use 'depth' below here, but just
-		 * in case */
-		depth = 2;
-		if (needs_brelse)
-			brelse(tmp_bh2);
-		needs_brelse = 1;
-	}
-
-	for (i =0; i < OCFS_TREE_STACK_SIZE; i++)
-		bh_stack[i] = NULL;
-
-	bh_stack[tos] = eb_bh;
-
-	/* Ok, find the splitting point (can be a DAT or HDR) */
-	do {
-		/* it's perfectly legal to get_access a block but
-		 * never dirty it, so lets just pre-emptively do it
-		 * now. */
-		status = ocfs_journal_access(handle, bh_stack[tos], 
-					     OCFS_JOURNAL_ACCESS_WRITE);
-		if (status < 0) {
-			LOG_ERROR_STATUS(status);
-			goto bail;
-		}
-
-		alloc_eb = (ocfs2_extent_block *) bh_stack[tos]->b_data;
-		if (!IS_VALID_EXTENT_BLOCK(alloc_eb)) {
-			LOG_ERROR_STR("Invalid extent block!");
-			goto bail;
-		}
-		
-		el = &alloc_eb->h_list;
-
-		if (!el->l_tree_depth) {
-			/* shall we just do away with him? */
-			if (el->l_recs[0].e_cpos >= fe->i_clusters) {
-				LOG_TRACE_ARGS("Killing this data extent (%llu)\n", alloc_eb->h_blkno);
-				/* Boundary case - what if this guy is
-				 * the last DAT we should delete
-				 * (i.e., split no more ;) */
-				alloc_eb = NULL;
-				el = NULL;
-				status = ocfs_kill_this_tree(osb,
-							     bh_stack[tos],
-							     handle,
-							     inode);
-				if (status < 0) {
-					LOG_ERROR_STATUS (status);
-					goto bail;
-				}
-				/* silly, but what to do? */
-				alloc_eb = (ocfs2_extent_block *) bh_stack[tos]->b_data;
-			} else {
-				/* Alright, we know for sure that
-				 * we're splitting in this guy. */
-				LOG_TRACE_ARGS("Splitting this data extent (%llu)\n", alloc_eb->h_blkno);
-				fe->i_last_eb_blk = alloc_eb->h_blkno;
-				alloc_eb->h_next_leaf_blk = 0;
-				/* total_bytes is used below to know
-				 * how much total we've whacked off
-				 * this extent*/
-				total_bytes = 0;
-
-				/* there is a chance the split is at a
-				 * header boundary. this will catch
-				 * it: */
-				rec = &el->l_recs[el->l_next_free_rec - 1];
-				if ((rec->e_cpos + rec->e_clusters) == fe->i_clusters) {
-					LOG_TRACE_STR("Ok, hit that boundary in the DAT");
-					goto fix_headers;
-				}
-
-				/* Either kill the data or resize it */
-				for(i = (el->l_next_free_rec - 1); i >= 0; i--) {
-					rec = &el->l_recs[i];
-
-					/* changed this from > to >= */
-					/* Do we delete it completely? */
-					if (rec->e_cpos >= fe->i_clusters) {
-						total_bytes += (u64)rec->e_clusters << osb->s_clustersize_bits;
-
-						num_clusters = rec->e_clusters;
-						bitmap_offset = (u32)(((rec->e_blkno << osb->sb->s_blocksize_bits)) >> osb->s_clustersize_bits);
-						rec->e_cpos = 0;
-						rec->e_clusters = 0;
-						rec->e_blkno = 0;
-					} else if ((rec->e_cpos + rec->e_clusters) > fe->i_clusters) {
-						/* Do we shrink it? */
-						/* FIXME
-						 * this is soooo a
-						 * hodge-podge of bytes
-						 * and clusters
-						 */
-						orig_bytes = (u64)rec->e_clusters << osb->s_clustersize_bits;
-						num_clusters = rec->e_clusters;
-						doff = rec->e_blkno << osb->sb->s_blocksize_bits; 
-						rec->e_clusters = fe->i_clusters - rec->e_cpos;
-						bytes = (u64)rec->e_clusters << osb->s_clustersize_bits;
-						num_clusters -= rec->e_clusters;
-						bitmap_offset = (u32)((doff + bytes) >> osb->s_clustersize_bits);
-						/* we want to exit the
-						 * for loop now */
-						total_bytes += (orig_bytes - bytes);
-						done = 1;
-					} else {
-						/* if we get here,
-						 * then we don't want
-						 * to actually delete
-						 * *anything* from
-						 * this extent. */
-						LOG_TRACE_ARGS("Not deleting extent %d, e_blkno = %llu, e_clusters = %u, e_cpos = %u\n", i, rec->e_blkno, rec->e_clusters, rec->e_cpos);
-						done = 1;
-						goto skip_bitmap_add;
-					}
-					status = ocfs_handle_add_commit_bits(handle, num_clusters, bitmap_offset, -1, 0, DISK_ALLOC_VOLUME);
-					if (status < 0) {
-						LOG_ERROR_STATUS (status);
-						goto bail;
-					}
-
-skip_bitmap_add:
-					if (done) {
-						el->l_next_free_rec =
-							i + 1;
-						break;
-					}
-				} /* For loop */
-
-				/* Either way, we need to write this back out*/
-				alloc_eb = NULL;
-				el = NULL;
-
-				status = ocfs_journal_dirty(handle,
-							    bh_stack[tos]);
-				if (status < 0) {
-					LOG_ERROR_STATUS(status);
-					goto bail;
-				}
-
-				LOG_TRACE_ARGS("Fixing the headers above us! (tos=%d)\n", tos);
-fix_headers:
-				/*And here we should fix the headers above us*/
-				tos--;
-				while (tos >= 0) {
-					LOG_TRACE_ARGS("at top of loop, tos=%d\n", tos);
-					status = ocfs_journal_access(handle, bh_stack[tos], OCFS_JOURNAL_ACCESS_WRITE);
-					if (status < 0) {
-						LOG_ERROR_STATUS(status);
-						goto bail;
-					}
-
-					alloc_eb = (ocfs2_extent_block *) bh_stack[tos]->b_data;
-					el = &alloc_eb->h_list;
-					victim = el->l_next_free_rec;
-					el->l_next_free_rec++;
-					/* need to also update
-					 * numbytes on these guys */
-					rec = &el->l_recs[victim];
-					rec->e_clusters -=
-						(u32)(total_bytes >> osb->s_clustersize_bits);
-					alloc_eb = NULL;
-					el = NULL;
-					status = ocfs_fix_extent_block(osb, 
-								bh_stack[tos], inode);
-					if (status < 0) {
-						LOG_ERROR_STATUS(status);
-						goto bail;
-					}
-					status = ocfs_journal_dirty(handle, 
-								    bh_stack[tos]);
-					if (status < 0) {
-						LOG_ERROR_STATUS(status);
-						goto bail;
-					}
-					tos--;
-				}
-				LOG_TRACE_STR("breaking to end function now!");
-				/* Ok, done! */
-				break;
-			}
-		} else {  /* It's a header extent */
-
-			/* Did we already kill all his children, or
-			 * are they already dead? */
-			if (el->l_next_free_rec == 0) {
-				/*Ok, we're done with this guy, pop the stack*/
-				LOG_TRACE_ARGS("Popping this header (%llu)\n",
-					       alloc_eb->h_blkno);
-
-				status = ocfs_handle_add_commit_bits(handle, 1, alloc_eb->h_suballoc_bit, alloc_eb->h_suballoc_node, alloc_eb->h_blkno, DISK_ALLOC_EXTENT_NODE);
-				if (status < 0) {
-					LOG_ERROR_STATUS (status);
-					goto bail;
-				}
-				brelse(bh_stack[tos]);
-				alloc_eb = NULL;
-				el = NULL;
-				bh_stack[tos] = NULL;
-				tos--;
-				continue;
-			}
-			/* changed this from > to >= */
-			/* Do we just delete this whole part of the tree? */
-			if (el->l_recs[0].e_cpos >= fe->i_clusters) {
-				LOG_TRACE_ARGS("whacking this tree: (%llu)\n",
-					       alloc_eb->h_blkno);
-
-				if (el->l_recs[0].e_cpos == fe->i_clusters)
-					done = 1;
-
-				alloc_eb = NULL;
-				ocfs_kill_this_tree(osb, bh_stack[tos], 
-						    handle, inode);
-				brelse(bh_stack[tos]);
-				alloc_eb = NULL;
-				el = NULL;
-				bh_stack[tos] = NULL;
-				tos--;
-				if (tos < 0) {
-					LOG_ERROR_STR("End of stack reached.");
-					goto bail;
-				}
-				/* I just have to fix my parent,
-				 * right? Yes, but only because our
-				 * max tree_depth is 3. if it were
-				 * more, we'd have to fix his
-				 * parents parent. */
-				status = ocfs_journal_access(handle, bh_stack[tos], OCFS_JOURNAL_ACCESS_WRITE);
-				if (status < 0) {
-					LOG_ERROR_STATUS(status);
-					goto bail;
-				}
-
-				alloc_eb = (ocfs2_extent_block *) bh_stack[tos]->b_data;
-				el = &alloc_eb->h_list;
-
-				victim = el->l_next_free_rec;
-				el->l_recs[victim].e_cpos = 0;
-				el->l_recs[victim].e_clusters = 0;
-				el->l_recs[victim].e_blkno = 0;
-				el->l_next_free_rec--;
-				alloc_eb = NULL;
-				el = NULL;
-				/* Here's an interesting boundary
-				 * case. What if we're truncating on a
-				 * boundary between two headers and
-				 * this is the one we just deleted. In
-				 * that case we're done, but need to 
-				 * write the parent out before we leave
-				 * again, this bit of code depends on 
-				 * tree_depth of 3. */
-				if (done) {
-					LOG_TRACE_STR("Found a boundary "     \
-						      "header, almost done "  \
-						      " (gonna quit)");
-					status = ocfs_fix_extent_block(osb, 
-								bh_stack[tos], inode);
-					if (status < 0) {
-						LOG_ERROR_STATUS(status);
-						goto bail;
-					}
-
-					status = ocfs_journal_dirty(handle, 
-								bh_stack[tos]);
-					if (status < 0) {
-						LOG_ERROR_STATUS(status);
-						goto bail;
-					}
-					/* decrement tos so we dont
-					 * trigger an error
-					 * condition */
-					brelse(bh_stack[tos]);
-					bh_stack[tos] = NULL;
-					tos--;
-					break;
-				}
-
-				status = ocfs_journal_dirty(handle,
-							    bh_stack[tos]);
-				if (status < 0) {
-					LOG_ERROR_STATUS(status);
-					goto bail;
-				}
-
-
-				/* Ok, we're not a boundary case, continue */
-				continue;
-			}
-
-			el->l_next_free_rec--;
-			victim = el->l_next_free_rec;
-			rec = &el->l_recs[victim];
-			alloc_eb = NULL;
-			el = NULL;
-
-			status = ocfs_journal_dirty(handle,
-						    bh_stack[tos]);
-			if (status < 0) {
-				LOG_ERROR_STATUS (status);
-				goto bail;
-			}
-
-			/* grow the stack. */
-			tos++;
-			/* should never be true. */
-			if (bh_stack[tos] != NULL)
-				LOG_ERROR_STR("uhoh, not brelsing a buffer " \
-					      "on our stack!\n");
-
-			status = ocfs_read_block(osb,
-						 rec->e_blkno,
-						 &bh_stack[tos], 
-						 OCFS_BH_CACHED,
-						 inode);
-			if (status < 0) {
-				LOG_ERROR_STATUS (status);
-				goto bail;
-			}
-
-			/* We only want to free on our way up the tree */
-			continue;
-		}
-		brelse(bh_stack[tos]);
-		bh_stack[tos] = NULL;
-		alloc_eb = NULL;
-		el = NULL;
-		tos--;
-	} while (tos >= 0);
-	
-	if (tos >= 0)
-		LOG_ERROR_ARGS("Quitting main loop while top of stack >= 0 " \
-			       " (tos=%d)\n", tos);
-
-	status=0;
-bail:
-	/* brelse the stack. We only brelse the bottom of the stack if
-	 * we know for sure that it wasn't passed from the caller */
-	if (needs_brelse)
-		brelse(bh_stack[0]);
-	for(i = 1; i < OCFS_TREE_STACK_SIZE; i++)
-		if (bh_stack[i])
-			brelse(bh_stack[i]);
-
-	LOG_EXIT_STATUS (status);
-	return(status);
-} /* ocfs_split_this_tree */
-
-
-/*
- * ocfs_update_last_eb_blk
- *
- *  Travel all the way to the rightmost DAT and set fe->i_last_eb_blk
- *  to it.  
- *
- *  We do cached reads here because we ought to have already read the
- *  various ext headers and dats off the system previously in the
- *  truncate path.
- */
-static int ocfs_update_last_eb_blk(ocfs_super *osb,
-				   ocfs2_dinode *fe,
-				   struct inode *inode) 
-{
-	int status = -EFAIL;
-	struct buffer_head *eb_bh = NULL;
-	ocfs2_extent_block *eb = NULL;
-	ocfs2_extent_list *el, *fel;
-	u64 next_blk;
-	int victim;
-
-	LOG_ENTRY ();
-
-	fel = &fe->id2.i_list;
-	if (fel->l_next_free_rec == 0) {
-		LOG_TRACE_STR("setting to zero as there isn't any used extents");
-		fe->i_last_eb_blk = 0;
-		status = 0;
-		goto bail;
-	}
-
-	/* Can't be called with local extents */
-	if (!fel->l_tree_depth)
-		BUG();
-
-	/* Ugly magic -1 */
-	victim = fel->l_next_free_rec - 1;
-	status = ocfs_read_block(osb,
-				 fel->l_recs[victim].e_blkno,
-				 &eb_bh, 
-				 OCFS_BH_CACHED, inode);
-	if (status < 0) {
-		LOG_ERROR_STATUS(status);
-		goto bail;
-	}
-
-	eb = (ocfs2_extent_block *) eb_bh->b_data;
-
-	if (!IS_VALID_EXTENT_BLOCK(eb)) { 
-		LOG_ERROR_STR("Invalid extent block!");
-		goto bail;
-	}
-
-	el = &eb->h_list;
-
-	while (el->l_tree_depth) {
-		if (!IS_VALID_EXTENT_BLOCK(eb)) {
-			LOG_ERROR_STR("Invalid extent block!");
-			goto bail;
-		}
-
-		el = &eb->h_list;
-
-		next_blk = el->l_recs[el->l_next_free_rec - 1].e_blkno;
-
-		brelse(eb_bh);
-		eb = NULL;
-		eb_bh = NULL;
-
-		status = ocfs_read_block(osb,
-					 next_blk,
-					 &eb_bh, 
-					 OCFS_BH_CACHED, inode);
-		if (status < 0) {
-			LOG_ERROR_STATUS(status);
-			goto bail;
-		}
-		eb = (ocfs2_extent_block *) eb_bh->b_data;
-	}
-	
-	fe->i_last_eb_blk = eb->h_blkno;
-	status = 0;
-bail:
-	if (eb_bh)
-		brelse(eb_bh);
-
-	LOG_EXIT_STATUS(status);
-	return(status);
-}  /* ocfs_update_last_eb_blk */
-
-/*
- * ocfs_free_extents_for_truncate()
- *
- * You know, it's funny -- you'd expect that we'd flush out the fe
- * before leaving this function, but that's pretty much up to the
- * caller!
- */
-int ocfs_free_extents_for_truncate(ocfs_super *osb,
-				   ocfs2_dinode *fe,
-				   ocfs_journal_handle *handle,
-				   struct inode *inode)
-{
-	int status = 0;
-	struct buffer_head *extent_bh = NULL;
-	int i, j;
-	ocfs2_extent_list *fel;
-	int updated_leb; /* used to mark whether fe->i_last_eb_blk has
-			   * been updated */
-
-	LOG_ENTRY ();
-
-	fel = &fe->id2.i_list;
-
-	/* local extents */
-	if (!fel->l_tree_depth) {
-		status = _squish_extent_entries(osb, fel->l_recs, 
-						&fel->l_next_free_rec, 
-						handle, fe->i_clusters,
-						0, inode);
-		if (status < 0) {
-			LOG_ERROR_STATUS (status);
-			goto finally;
-		}
-		goto finally;
-	}
-
-	LOG_TRACE_ARGS("non-local extents. taking that code path, truncating to i_clusters of (%u)\n", fe->i_clusters);
-	/* non-local extents */
-
-	updated_leb = 0;
-
-	/* Loop backwards through only the used free extent block here */
-	for (i = (fel->l_next_free_rec - 1); i >= 0; i--) {
-		LOG_TRACE_ARGS("at top of loop, i = %d\n", i);
-		/* Go ahead and read that bit of the tree - we'll need it. */
-		status = ocfs_read_block(osb,
-					 fel->l_recs[i].e_blkno,
-					 &extent_bh, OCFS_BH_CACHED,
-					 inode);
-		if (status < 0) {
-			LOG_ERROR_STATUS(status);
-			goto finally;
-		}
-		/* Figure out, do we want to kill this whole tree? */
-		if (fel->l_recs[i].e_cpos >= fe->i_clusters) {
-			LOG_TRACE_ARGS("Found an entire tree to delete!\n");
-			
-			status = ocfs_kill_this_tree(osb, extent_bh,
-						     handle, inode);
-			if (status < 0) {
-				LOG_ERROR_STATUS(status);
-				goto finally;
-			}
-			/* Ok, update the fe */
-			fel->l_recs[i].e_cpos = 0;
-			fel->l_recs[i].e_blkno = 0;
-			fel->l_recs[i].e_clusters = 0;
-			fel->l_next_free_rec = i;
-		} else { /* Ok, we only want part of it. */
-			LOG_TRACE_ARGS("Splitting this tree!\n");
-			status = ocfs_split_this_tree(osb, extent_bh, 
-						      handle, fe, 
-						      inode);
-			if (status < 0) {
-				LOG_ERROR_STATUS(status);
-				goto finally;
-			}
-
-			/* Ok, update the FileEntry */
-			LOG_TRACE_ARGS("Alright. e_clusters = (%u), i_clusters = (%u) e_cpos = (%u)\n",
-				       fel->l_recs[i].e_clusters,
-				       fe->i_clusters,
-				       fel->l_recs[i].e_cpos);
-			fel->l_recs[i].e_clusters = fe->i_clusters;
-			for (j=0; j < i; j++) 
-				fel->l_recs[i].e_clusters +=
-					fel->l_recs[j].e_clusters;
-
-			fel->l_next_free_rec = i + 1;
-			/* We're done - we can't split more than one
-			 * parts of the tree. */
-			updated_leb = 1;
-			break;
-		}
-		brelse(extent_bh);
-		extent_bh = NULL;
-	}
-
-	/* Ok, trunc to zero is a special case, doofus */
-	if (fe->i_clusters == 0) {
-		fe->i_last_eb_blk = 0;
-		fel->l_tree_depth = 0;
-		updated_leb = 1;
-	}
-
-	if (!updated_leb) {
-		status = ocfs_update_last_eb_blk(osb, fe, inode);
-		if (status < 0) {
-			LOG_ERROR_STATUS(status);
-			goto finally;
-		}
-	}
-
-finally:
-	if (extent_bh)
-		brelse(extent_bh);
-
-	LOG_EXIT_INT (status);
-	return status;
-}  /* ocfs_free_extents_for_truncate */
-
-
-/*
  * ocfs_lookup_file_allocation()
  *
  * This routine looks up the existing mapping of VBO to LBO for a  file.
@@ -2989,74 +2034,551 @@
 	return(status);
 }
 
+/* This function will figure out whether the current last extent
+ * block will be deleted, and if it will, what the new last extent
+ * block will be so we can update its h_next_leaf_blk field, as well
+ * as the dinode's i_last_eb_blk */
+static int ocfs_find_new_last_ext_blk(ocfs_super *osb,
+				      struct inode *inode,
+				      ocfs2_dinode *fe,
+				      unsigned int new_i_clusters,
+				      struct buffer_head *old_last_eb,
+				      struct buffer_head **new_last_eb)
+{
+	int i, status = 0;
+	u64 block = 0;
+	ocfs2_extent_block *eb;
+	ocfs2_extent_list *el;
+	struct buffer_head *bh = NULL;
+
+	OCFS_ASSERT(IS_VALID_FILE_ENTRY(fe));
+
+	*new_last_eb = NULL;
+
+	/* we have no tree, so of course, no last_eb. */
+	if (!fe->id2.i_list.l_tree_depth)
+		goto bail;
+
+	/* trunc to zero special case - this makes tree_depth = 0
+	 * regardless of what it is.  */
+	if (!new_i_clusters)
+		goto bail;
+
+	eb = (ocfs2_extent_block *) old_last_eb->b_data;
+	el = &(eb->h_list);
+	OCFS_ASSERT(el->l_next_free_rec);
+
+	/* Make sure that this guy will actually be empty after we
+	 * clear away the data. */
+	if (el->l_recs[0].e_cpos < new_i_clusters)
+		goto bail;
+
+	/* Ok, at this point, we know that last_eb will definitely
+	 * change, so lets traverse the tree and find the second to
+	 * last extent block. */
+	el = &(fe->id2.i_list);
+	/* go down the tree, */
+	do {
+		for(i = (el->l_next_free_rec - 1); i >= 0; i--) {
+			if (el->l_recs[i].e_cpos < new_i_clusters) {
+				block = el->l_recs[i].e_blkno;
+				break;
+			}
+		}
+		OCFS_ASSERT(i >= 0);
+
+		if (bh) {
+			brelse(bh);
+			bh = NULL;
+		}
+
+		status = ocfs_read_block(osb, block, &bh, OCFS_BH_CACHED,
+					 inode);
+		if (status < 0) {
+			LOG_ERROR_STATUS(status);
+			goto bail;
+		}
+		eb = (ocfs2_extent_block *) bh->b_data;
+		el = &(eb->h_list);
+		OCFS_ASSERT(IS_VALID_EXTENT_BLOCK(eb));
+	} while (el->l_tree_depth);
+
+	*new_last_eb = bh;
+	get_bh(*new_last_eb);
+	LOG_TRACE_ARGS("returning block %llu\n", eb->h_blkno);
+bail:
+	if (bh)
+		brelse(bh);
+
+	return status;
+}
+
+static int ocfs_do_truncate(ocfs_super *osb, 
+			    unsigned int clusters_to_del,
+			    struct inode *inode,
+			    struct buffer_head *fe_bh,
+			    struct buffer_head *old_last_eb_bh,
+			    ocfs2_truncate_context *tc)
+{
+	int status, i, depth;
+	ocfs_journal_handle *handle;
+	ocfs2_dinode *fe;
+	ocfs2_extent_block *eb;
+	ocfs2_extent_block *last_eb = NULL;
+	ocfs2_extent_list *el;
+	struct buffer_head *eb_bh = NULL;
+	struct buffer_head *last_eb_bh = NULL;
+	u64 next_eb = 0;
+	u64 delete_blk = 0;
+
+	handle = tc->tc_handle;
+	fe = (ocfs2_dinode *) fe_bh->b_data;
+
+	status = ocfs_find_new_last_ext_blk(osb, 
+					    inode,
+					    fe, 
+					    fe->i_clusters - clusters_to_del,
+					    old_last_eb_bh,
+					    &last_eb_bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	if (last_eb_bh)
+		last_eb = (ocfs2_extent_block *) last_eb_bh->b_data;
+
+	status = ocfs_journal_access(handle, fe_bh, OCFS_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	el = &(fe->id2.i_list);
+
+	down (&(OCFS_I(inode)->ip_sem));
+	OCFS_I(inode)->ip_alloc_size =
+		(u64)(fe->i_clusters - clusters_to_del) << osb->s_clustersize_bits;
+	up (&(OCFS_I(inode)->ip_sem));
+	fe->i_clusters -= clusters_to_del;
+	fe->i_mtime = OCFS_CURRENT_TIME;
+
+	i = el->l_next_free_rec - 1;
+
+	OCFS_ASSERT(el->l_recs[i].e_clusters >= clusters_to_del);
+	el->l_recs[i].e_clusters -= clusters_to_del;
+	/* tree depth zero, we can just delete the clusters, otherwise
+	 * we need to record the offset of the next level extent block
+	 * as we may overwrite it. */
+	if (!el->l_tree_depth)
+		delete_blk = el->l_recs[i].e_blkno + el->l_recs[i].e_clusters;
+	else 
+		next_eb = el->l_recs[i].e_blkno;
+
+	if (!el->l_recs[i].e_clusters) {
+		/* if we deleted the whole extent record, then clear
+		 * out the other fields and update the extent
+		 * list. For depth > 0 trees, we've already recorded
+		 * the extent block in 'next_eb' */
+		el->l_recs[i].e_cpos = 0;
+		el->l_recs[i].e_blkno = 0;
+		OCFS_ASSERT(el->l_next_free_rec);
+		el->l_next_free_rec--;
+	}
+
+	depth = el->l_tree_depth;
+	if (!fe->i_clusters) {
+		/* trunc to zero is a special case. */
+		el->l_tree_depth = 0;
+		fe->i_last_eb_blk = 0;
+	} else if (last_eb)
+		fe->i_last_eb_blk = last_eb->h_blkno;
+
+	status = ocfs_journal_dirty(handle, fe_bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	if (last_eb) {
+		/* If there will be a new last extent block, then by
+		 * definition, there cannot be any leaves to the right of
+		 * him. */
+		status = ocfs_journal_access(handle, last_eb_bh, 
+					     OCFS_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			LOG_ERROR_STATUS(status);
+			goto bail;
+		}
+		last_eb->h_next_leaf_blk = 0;
+		status = ocfs_journal_dirty(handle, last_eb_bh);
+		if (status < 0) {
+			LOG_ERROR_STATUS(status);
+			goto bail;
+		}
+	}
+
+	/* if our tree depth > 0, update all the tree blocks below us. */
+	while(depth) {
+		LOG_TRACE_ARGS("traveling tree (depth = %d, next_eb = %llu)\n",
+			       depth,  next_eb);
+		status = ocfs_read_block(osb, next_eb, &eb_bh, 
+					 OCFS_BH_CACHED, inode);
+		if (status < 0) {
+			LOG_ERROR_STATUS(status);
+			goto bail;
+		}
+		eb = (ocfs2_extent_block *) eb_bh->b_data;
+		OCFS_ASSERT(IS_VALID_EXTENT_BLOCK(eb));
+		el = &(eb->h_list);
+
+		status = ocfs_journal_access(handle, eb_bh,
+					     OCFS_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			LOG_ERROR_STATUS(status);
+			goto bail;
+		}
+
+		OCFS_ASSERT(el->l_next_free_rec);
+		OCFS_ASSERT(depth == (el->l_tree_depth + 1));
+
+		i = el->l_next_free_rec - 1;
+
+		LOG_TRACE_ARGS("extent block %llu, before: record %d: "
+			       "(%u, %u, %llu), next = %u\n", eb->h_blkno, i, 
+			       el->l_recs[i].e_cpos, el->l_recs[i].e_clusters, 
+			       el->l_recs[i].e_blkno, el->l_next_free_rec);
+
+		OCFS_ASSERT(el->l_recs[i].e_clusters >= clusters_to_del);
+		el->l_recs[i].e_clusters -= clusters_to_del;
+
+		next_eb = el->l_recs[i].e_blkno;
+		/* bottom-most block requires us to delete data.*/
+		if (!el->l_tree_depth)
+			delete_blk = el->l_recs[i].e_blkno + 
+				el->l_recs[i].e_clusters;
+		if (!el->l_recs[i].e_clusters) {
+			el->l_recs[i].e_cpos = 0;
+			el->l_recs[i].e_blkno = 0;
+			OCFS_ASSERT(el->l_next_free_rec);
+			el->l_next_free_rec--;
+		}
+		LOG_TRACE_ARGS("extent block %llu, after: record %d: "
+			       "(%u, %u, %llu), next = %u\n", eb->h_blkno, i, 
+			       el->l_recs[i].e_cpos, el->l_recs[i].e_clusters, 
+			       el->l_recs[i].e_blkno, el->l_next_free_rec);
+
+		status = ocfs_journal_dirty(handle, eb_bh);
+		if (status < 0) {
+			LOG_ERROR_STATUS(status);
+			goto bail;
+		}
+
+		if (!el->l_next_free_rec) {
+			LOG_TRACE_ARGS("deleting this extent block.\n");
+			OCFS_ASSERT(!eb->h_suballoc_node);
+			OCFS_ASSERT(!el->l_recs[0].e_clusters);
+			OCFS_ASSERT(!el->l_recs[0].e_cpos);
+			OCFS_ASSERT(!el->l_recs[0].e_blkno);
+			status = ocfs_free_suballoc_bits(osb,
+							 handle,
+							 tc->tc_ext_alloc_inode,
+							 tc->tc_ext_alloc_bh,
+							 eb->h_suballoc_bit,
+							 eb->h_blkno,
+							 1);
+			if (status < 0) {
+				LOG_ERROR_STATUS(status);
+				goto bail;
+			}
+		}
+		brelse(eb_bh);
+		eb_bh = NULL;
+		depth--;
+	}
+
+	OCFS_ASSERT(delete_blk);
+	status = ocfs_free_clusters(osb, handle, tc->tc_bitmap_inode,
+				    tc->tc_bitmap_bh, delete_blk, 
+				    clusters_to_del);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	status = 0;
+bail:
+	LOG_EXIT_STATUS(status);
+	return(status);
+}
+
 /*
- * ocfs_free_file_extents()
+ * It is expected that by the time you call this function,
+ * inode->i_size and fe->i_size have been adjusted.
  *
+ * This will start, restart and commit your handle for you.
+ *
+ * WARNING: This will gobble the context's reference to last_eb_bh.
  */
-int ocfs_free_file_extents(ocfs_super *osb, struct buffer_head *fe_bh,
-			   ocfs_journal_handle *handle,
-			   struct inode *inode)
+int ocfs_commit_truncate(ocfs_super *osb,
+			 struct inode *inode,
+			 struct buffer_head *fe_bh,
+			 ocfs2_truncate_context *tc)
 {
-	int status = 0;
-	__u32 i;
-	__u32 numBitsAllocated = 0, bitmapOffset = 0;
-	ocfs2_extent_block *extent;
-	struct buffer_head *extent_bh = NULL;
-	ocfs2_extent_list *fel;
+	int status, i, credits;
+	unsigned int clusters_to_del, target_i_clusters;
+	u64 last_eb = 0;
 	ocfs2_dinode *fe;
+	ocfs2_extent_block *eb;
+	ocfs2_extent_list *el;
+	struct buffer_head *last_eb_bh;
+	ocfs_journal_handle *handle;
 
-	LOG_ENTRY ();
+	LOG_ENTRY();
 
-	fe = (ocfs2_dinode *) fe_bh->b_data;
+	last_eb_bh = tc->tc_last_eb_bh;
+	tc->tc_last_eb_bh = NULL;
+	handle = tc->tc_handle;
 
+	target_i_clusters = ocfs_clusters_for_bytes(osb->sb, inode->i_size);
 	fe = (ocfs2_dinode *) fe_bh->b_data;
-	fel = &fe->id2.i_list;
 
-	if (!fel->l_tree_depth) {
-		for (i = 0; i < fel->l_next_free_rec; i++) {
-			numBitsAllocated = fel->l_recs[i].e_clusters;
+	if (fe->id2.i_list.l_tree_depth) {
+		eb = (ocfs2_extent_block *) last_eb_bh->b_data;
+		el = &(eb->h_list);
+	} else
+		el = &(fe->id2.i_list);
+	last_eb = fe->i_last_eb_blk;
+start:
+	LOG_TRACE_ARGS("ocfs_commit_truncate: fe->i_clusters = %u, "
+		       "last_eb = %llu, fe->i_last_eb_blk = %llu, "
+		       "fe->id2.i_list.l_tree_depth = %u last_eb_bh = %p\n",
+		       fe->i_clusters, last_eb, fe->i_last_eb_blk,
+		       fe->id2.i_list.l_tree_depth, last_eb_bh);
 
-			bitmapOffset =
-			    (__u32)(((fel->l_recs[i].e_blkno << osb->sb->s_blocksize_bits)) >>
-				   osb->s_clustersize_bits);
+	if (last_eb != fe->i_last_eb_blk) {
+		LOG_TRACE_ARGS("last_eb changed!\n");
+		OCFS_ASSERT(fe->id2.i_list.l_tree_depth);
+		last_eb = fe->i_last_eb_blk;
+		/* i_last_eb_blk may have changed, read it if
+		 * necessary. We don't have to worry about the
+		 * truncate to zero case here (where there becomes no
+		 * last_eb) because we never loop back after our work
+		 * is done. */
+		if (last_eb_bh) {
+			brelse(last_eb_bh);
+			last_eb_bh = NULL;
+		}
 
-			ocfs_handle_add_commit_bits(handle, numBitsAllocated, 
-						    bitmapOffset, -1, 0, 
-						    DISK_ALLOC_VOLUME);
+		status = ocfs_read_block(osb, last_eb, 
+					 &last_eb_bh, OCFS_BH_CACHED, 
+					 inode);
+		if (status < 0) {
+			LOG_ERROR_STATUS(status);
+			goto bail;
 		}
+		eb = (ocfs2_extent_block *) last_eb_bh->b_data;
+		OCFS_ASSERT(IS_VALID_EXTENT_BLOCK(eb));
+		el = &(eb->h_list);
+	}
+
+	/* by now, el will point to the extent list on the bottom most
+	 * portion of this tree. */
+	i = el->l_next_free_rec - 1;
+	if (el->l_recs[i].e_cpos >= target_i_clusters)
+		clusters_to_del = el->l_recs[i].e_clusters;
+	else
+		clusters_to_del = (el->l_recs[i].e_clusters 
+				   + el->l_recs[i].e_cpos) - target_i_clusters;
+
+	LOG_TRACE_ARGS("clusters_to_del = %u in this pass\n", clusters_to_del);
+	credits = ocfs_calc_tree_trunc_credits(osb->sb, clusters_to_del, 
+					       fe, el);
+	if (!ocfs_handle_started(handle)) {
+		handle = ocfs_start_trans(osb, handle, credits);
+		if (!handle) {
+			status = -ENOMEM;
+			LOG_ERROR_STATUS(status);
+			goto bail;
+		}
+		ocfs_handle_set_always_commits(handle, 1);
 	} else {
-		for (i = 0; i < fel->l_next_free_rec; i++) {
-			status = ocfs_read_block(osb,
-						 fel->l_recs[i].e_blkno, 
-						 &extent_bh,
-						 OCFS_BH_CACHED,
-						 inode);
-			if (status < 0) {
-				LOG_ERROR_STATUS (status);
-				goto leave;
-			}
-			extent = (ocfs2_extent_block *) extent_bh->b_data;
-			if (!IS_VALID_EXTENT_BLOCK(extent)) {
-				status = -EINVAL;
-				LOG_ERROR_STATUS(status);
-				goto leave;
-			}
+		status = ocfs_extend_trans(handle, credits);
+		if (status < 0) {
+			LOG_ERROR_STATUS(status);
+			goto bail;
+		}
+	}
 
-			status = ocfs_kill_this_tree(osb, extent_bh, handle, inode);
-			if (status < 0) {
-				LOG_ERROR_STATUS(status);
-				goto leave;
-			}
-			brelse(extent_bh);
-			extent_bh = NULL;
+	status = ocfs_do_truncate(osb, clusters_to_del, inode, fe_bh, 
+				  last_eb_bh, tc);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	OCFS_ASSERT(fe->i_clusters >= target_i_clusters);
+	if (fe->i_clusters > target_i_clusters)
+		goto start;
+bail:
+	ocfs_commit_trans(handle);
+	tc->tc_handle = NULL;
+
+	if (last_eb_bh)
+		brelse(last_eb_bh);
+
+	LOG_EXIT_STATUS(status);
+	return status;
+}
+
+
+/*
+ * Expects the inode to already be locked. This will figure out which
+ * inodes need to be locked and will put them on the returned truncate
+ * context.
+ */
+int ocfs_prepare_truncate(ocfs_super *osb, 
+			  struct inode *inode, 
+			  struct buffer_head *fe_bh, 
+			  ocfs2_truncate_context **tc)
+{
+	int status, metadata_delete;
+	unsigned int new_i_clusters;
+	ocfs_journal_handle *handle = NULL;
+	ocfs2_dinode *fe;
+	ocfs2_extent_block *eb;
+	ocfs2_extent_list *el;
+	struct buffer_head *last_eb_bh = NULL;
+	struct inode *ext_alloc_inode = NULL;
+	struct buffer_head *ext_alloc_bh = NULL;
+	struct inode *data_alloc_inode = NULL;
+	struct buffer_head *data_alloc_bh = NULL;
+
+	LOG_ENTRY();
+
+	*tc = NULL;
+
+	new_i_clusters = ocfs_clusters_for_bytes(osb->sb, inode->i_size);
+	fe = (ocfs2_dinode *) fe_bh->b_data;
+
+	LOG_TRACE_ARGS("fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
+		       "%llu\n", fe->i_clusters, new_i_clusters, fe->i_size);
+
+	OCFS_ASSERT(fe->i_clusters > new_i_clusters);
+
+	*tc = kmalloc(sizeof(ocfs2_truncate_context), GFP_KERNEL);
+	if (!(*tc)) {
+		status = -ENOMEM;
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+	memset(*tc, 0, sizeof(ocfs2_truncate_context));
+
+	handle = ocfs_alloc_handle(osb);
+	if (handle == NULL) {
+		status = -ENOMEM;
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	metadata_delete = 0;
+	if (fe->id2.i_list.l_tree_depth) {
+		/* If we have a tree, then the truncate may result in
+		 * metadata deletes. Figure this out from the
+		 * rightmost leaf block.*/
+		status = ocfs_read_block(osb, fe->i_last_eb_blk,
+					 &last_eb_bh, OCFS_BH_CACHED, inode);
+		if (status < 0) {
+			LOG_ERROR_STATUS(status);
+			goto bail;
 		}
+		eb = (ocfs2_extent_block *) last_eb_bh->b_data;
+		OCFS_ASSERT(IS_VALID_EXTENT_BLOCK(eb));
+		el = &(eb->h_list);
+		if (el->l_recs[0].e_cpos >= new_i_clusters)
+			metadata_delete = 1;
 	}
 
-leave:
-	if (extent_bh)
-		brelse(extent_bh);
-	
-	LOG_EXIT_STATUS (status);
+	if (metadata_delete) {
+		LOG_TRACE_STR("Will have to delete metadata for this trunc. "
+			      "locking allocator.\n");
+		ext_alloc_inode = ocfs_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0);
+		if (!ext_alloc_inode) {
+			status = -ENOMEM;
+			LOG_ERROR_STATUS(status);
+			goto bail;
+		}
+
+		status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE, 0, 
+					   &ext_alloc_bh, ext_alloc_inode);
+		if (status < 0) {
+			if (status != -EINTR)
+				LOG_ERROR_STATUS (status);
+			goto bail;
+		}
+		ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 
+				     0, ext_alloc_inode);
+		ocfs_handle_add_inode(handle, ext_alloc_inode);
+	}
+
+	data_alloc_inode = ocfs_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, -1);
+	if (!data_alloc_inode) {
+		status = -EINVAL;
+		LOG_ERROR_STR("Could not get bitmap inode!");
+		goto bail;
+	}
+
+	status = ocfs_acquire_lock(osb, OCFS_LKM_EXMODE, 
+				   0, &data_alloc_bh, data_alloc_inode);
+	if (status < 0) {
+		if (status != -EINTR)
+			LOG_ERROR_STATUS (status);
+		goto bail;
+	}
+	ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 
+			     0, data_alloc_inode);
+	ocfs_handle_add_inode(handle, data_alloc_inode);
+
+	(*tc)->tc_bitmap_inode    = data_alloc_inode;
+	(*tc)->tc_bitmap_bh       = data_alloc_bh;
+	(*tc)->tc_ext_alloc_inode = ext_alloc_inode;
+	(*tc)->tc_ext_alloc_bh    = ext_alloc_bh;
+	(*tc)->tc_last_eb_bh      = last_eb_bh;
+	(*tc)->tc_handle          = handle;
+bail:
+	if (status < 0) {
+		if (handle)
+			ocfs_commit_trans(handle);
+		if (last_eb_bh)
+			brelse(last_eb_bh);
+		if (ext_alloc_inode)
+			iput(ext_alloc_inode);
+		if (data_alloc_inode)
+			iput(data_alloc_inode);
+		if (ext_alloc_bh)
+			brelse(ext_alloc_bh);
+		if (data_alloc_bh)
+			brelse(data_alloc_bh);
+		if (*tc)
+			ocfs_free_truncate_context(*tc);
+		*tc = NULL;
+	}
+	LOG_EXIT();
 	return status;
-}				/* ocfs_free_file_extents */
+}
 
+void ocfs_free_truncate_context(ocfs2_truncate_context *tc)
+{
+	if (tc->tc_bitmap_inode)
+		iput(tc->tc_bitmap_inode);
+	if (tc->tc_bitmap_bh)
+		brelse(tc->tc_bitmap_bh);
+	if (tc->tc_ext_alloc_inode)
+		iput(tc->tc_ext_alloc_inode);
+	if (tc->tc_ext_alloc_bh)
+		brelse(tc->tc_ext_alloc_bh);
+	if (tc->tc_last_eb_bh)
+		brelse(tc->tc_last_eb_bh);
+	if (tc->tc_handle) {
+		OCFS_ASSERT(!ocfs_handle_started(tc->tc_handle));
+		ocfs_commit_trans(tc->tc_handle);
+	}
+	kfree(tc);
+}

Modified: trunk/src/alloc.h
===================================================================
--- trunk/src/alloc.h	2004-10-05 22:33:26 UTC (rev 1548)
+++ trunk/src/alloc.h	2004-10-07 01:38:01 UTC (rev 1549)
@@ -43,9 +43,6 @@
 				   ocfs2_dinode *fe,
 				   ocfs_journal_handle *handle,
 				   struct inode *inode);
-int ocfs_free_file_extents(ocfs_super *osb, struct buffer_head *fe_bh,
-			   ocfs_journal_handle *handle,
-			   struct inode *inode);
 int ocfs_get_leaf_extent(ocfs_super *osb, ocfs2_dinode *fe,
 			 __s64 Vbo, struct buffer_head **data_extent_bh,
 			 struct inode *inode);
@@ -114,4 +111,25 @@
 				  u32 bits_wanted,
 				  ocfs2_alloc_context *ac);
 
+typedef struct _ocfs2_truncate_context {
+	struct inode *tc_bitmap_inode;
+	struct buffer_head *tc_bitmap_bh;
+	struct inode *tc_ext_alloc_inode;
+	struct buffer_head *tc_ext_alloc_bh;
+	/* these get destroyed once the context is passed to ocfs_commit_truncate. */
+	struct buffer_head *tc_last_eb_bh;
+	ocfs_journal_handle *tc_handle;
+} ocfs2_truncate_context;
+
+void ocfs_free_truncate_context(ocfs2_truncate_context *tc);
+
+int ocfs_prepare_truncate(ocfs_super *osb, 
+			  struct inode *inode, 
+			  struct buffer_head *fe_bh, 
+			  ocfs2_truncate_context **tc);
+int ocfs_commit_truncate(ocfs_super *osb,
+			 struct inode *inode,
+			 struct buffer_head *fe_bh,
+			 ocfs2_truncate_context *tc);
+
 #endif /* OCFS2_ALLOC_H */

Modified: trunk/src/file.c
===================================================================
--- trunk/src/file.c	2004-10-05 22:33:26 UTC (rev 1548)
+++ trunk/src/file.c	2004-10-07 01:38:01 UTC (rev 1549)
@@ -55,6 +55,10 @@
 #define OCFS_DEBUG_CONTEXT    OCFS_DEBUG_CONTEXT_FILE
 
 static int ocfs2_zero_extend(struct inode *inode);
+static int ocfs_orphan_for_truncate(ocfs_super *osb, 
+				    struct inode *inode,
+				    struct buffer_head *fe_bh,
+				    u64 new_i_size);
 
 static unsigned int ocfs_calc_overalloc_bits(ocfs_super *osb, 
 					     struct file *filp,
@@ -816,127 +820,176 @@
 	.ioctl = ocfs_ioctl
 };
 
+int ocfs_set_inode_size(ocfs_journal_handle *handle,
+			struct inode *inode,
+			struct buffer_head *fe_bh,
+			u64 new_i_size)
+{
+	int status;
+	struct super_block *sb = inode->i_sb;
+	ocfs_inode_private *oip = OCFS_I(inode);
+
+	LOG_ENTRY();
+
+	inode->i_size = new_i_size;
+	OCFS_SET_INODE_TIME(inode, i_mtime, OCFS_CURRENT_TIME);
+	inode->i_blocks = (new_i_size + sb->s_blocksize - 1) 
+		>> sb->s_blocksize_bits;
+	status = ocfs_mark_inode_dirty(handle, inode, fe_bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS (status);
+		goto bail;
+	}
+
+	down(&oip->ip_sem);
+	oip->ip_mmu_private = inode->i_size;
+	ocfs_extent_map_destroy(&oip->ip_ext_map);
+	ocfs_extent_map_init (&oip->ip_ext_map);
+	up(&oip->ip_sem);
+bail:
+	LOG_EXIT_STATUS(status);
+	return status;
+}
+
+static int ocfs_orphan_for_truncate(ocfs_super *osb, 
+				    struct inode *inode,
+				    struct buffer_head *fe_bh,
+				    u64 new_i_size)
+{
+	int status;
+	ocfs_journal_handle *handle = NULL;
+
+	LOG_ENTRY();
+
+	/* TODO: This needs to actually orphan the inode in this
+	 * transaction. */
+
+	handle = ocfs_start_trans(osb, handle, 1);
+	if (handle == NULL) {
+		LOG_ERROR_STATUS (status = -ENOMEM);
+		goto bail;
+	}
+	ocfs_handle_set_always_commits(handle, 1);
+
+	status = ocfs_set_inode_size(handle, inode, fe_bh, new_i_size);
+	if (status < 0) {
+		LOG_ERROR_STATUS (status);
+		goto bail;
+	}
+bail:
+	if (handle)
+		ocfs_commit_trans(handle);
+	LOG_EXIT_STATUS(status);
+	return status;
+}
+
 /*
  * ocfs_truncate_file()
  *
  */
-static int ocfs_truncate_file(ocfs_super *osb, __u64 file_size,
+static int ocfs_truncate_file(ocfs_super *osb, 
+			      u64 new_i_size,
 			      struct inode *inode)
 {
 	int status = 0;
 	ocfs2_dinode *fe = NULL;
-	__u64 new_alloc_size;
-	struct buffer_head *bh = NULL;
+	struct buffer_head *fe_bh = NULL;
 	ocfs_journal_handle *handle = NULL;
+	ocfs2_truncate_context *tc = NULL;
 
-	LOG_ENTRY_ARGS("(inode = %llu, file_size = %llu\n", 
-		       OCFS_I(inode)->ip_blkno, file_size);
+	LOG_ENTRY_ARGS("(inode = %llu, new_i_size = %llu\n", 
+		       OCFS_I(inode)->ip_blkno, new_i_size);
 
-	new_alloc_size = ocfs_align_bytes_to_clusters(osb->sb,
-						      file_size);
-
-#ifdef PURE_EVIL
-	if (evil_filename_check(EVIL_INODE, inode)) {
-		LOG_ERROR_ARGS("EVIL TRUNCATE: file_size=%llu, new_alloc=%llu, old=%llu\n",
-			       file_size, new_alloc_size, inode->i_size);
-	}
-#endif
-
-
-	LOG_TRACE_ARGS("new_alloc_size = %llu\n", new_alloc_size);
-
 	handle = ocfs_alloc_handle(osb);
 	if (handle == NULL) {
 		LOG_ERROR_STATUS (status = -ENOMEM);
-		goto leave;
+		goto bail;
 	}
 
 	status = ocfs_acquire_lock (osb, OCFS_LKM_EXMODE, 
 				    FLAG_FILE_TRUNCATE|FLAG_FILE_UPDATE_OIN,
-				    &bh, inode);
+				    &fe_bh, inode);
 	if (status < 0) {
 		if (status != -EINTR)
 			LOG_ERROR_STATUS (status);
-		goto leave;
+		goto bail;
 	}
 	ocfs_handle_add_lock(handle, OCFS_LKM_EXMODE, 
 			     FLAG_FILE_TRUNCATE|FLAG_FILE_UPDATE_OIN, inode);
+	ocfs_handle_add_inode(handle, inode);
 
-	fe = (ocfs2_dinode *) bh->b_data;
-	if (!IS_VALID_FILE_ENTRY(fe)) {
-		LOG_ERROR_ARGS("Invalid fe at blkno %llu",
-			       OCFS_I(inode)->ip_blkno);
-		status = -EFAIL;
-		goto leave;
-	}
+	fe = (ocfs2_dinode *) fe_bh->b_data;
+	OCFS_ASSERT(IS_VALID_FILE_ENTRY(fe));
+	OCFS_ASSERT(fe->i_size == inode->i_size);
 
-	handle = ocfs_start_trans(osb, handle, OCFS_FILE_TRUNCATE_CREDITS);
-	if (handle == NULL) {
-		LOG_ERROR_STATUS (status = -ENOMEM);
-		goto leave;
-	}
-
-	/* add this fe to the journal transaction */
-	status = ocfs_journal_access(handle, bh, OCFS_JOURNAL_ACCESS_WRITE);
-	if (status < 0) {
-		LOG_ERROR_STATUS(status);
-		goto leave;
-	}
-
-	down(&OCFS_I(inode)->ip_sem);
-
-	/* the file entry might have changed underneath us (while
-	 * waiting on the lock). make sure the size is still a valid
-	 * one. This really ought to check for other things too, like
-	 * a valid bit, etc. */
-	if (file_size > fe->i_size) {
+	if (new_i_size > fe->i_size) {
 		LOG_TRACE_ARGS("asked to truncate file with size (%llu) "
 			       "to size (%llu)!\n", fe->i_size, 
-			       file_size);
-		up(&OCFS_I(inode)->ip_sem);
+			       new_i_size);
 		status = -EINVAL;
 		LOG_ERROR_STATUS(status);
-		goto leave;
+		goto bail;
 	}
 
-	fe->i_size = file_size;
-	fe->i_clusters = (u32)(new_alloc_size >> osb->s_clustersize_bits);
+	LOG_TRACE_ARGS("inode %llu, i_size = %llu, new_i_size = %llu\n",
+		       fe->i_blkno, fe->i_size, new_i_size);
 
-	status = ocfs_free_extents_for_truncate (osb, fe, handle, inode);
+	/* let's handle the simple truncate cases before doing any more
+	 * cluster locking. */
+	if (new_i_size == fe->i_size)
+		goto bail;
+
+	if (fe->i_clusters 
+	    == ocfs_clusters_for_bytes(osb->sb, new_i_size)) {
+		LOG_TRACE_ARGS("fe->i_clusters = %u, so we do a simple "
+			       "truncate\n", fe->i_clusters);
+		/* No allocation change is required, so lets fast path
+		 * this truncate. */	
+		handle = ocfs_start_trans(osb, handle, 1);
+		if (handle == NULL) {
+			LOG_ERROR_STATUS (status = -ENOMEM);
+			goto bail;
+		}
+		ocfs_handle_set_always_commits(handle, 1);
+
+		status = ocfs_set_inode_size(handle, inode, fe_bh, new_i_size);
+		if (status < 0)
+			LOG_ERROR_STATUS (status);
+		goto bail;
+	}
+
+	/* alright, we're going to need to do a full blown alloc size
+	 * change. Orphan the inode so that recovery can complete the
+	 * truncate if necessary. This does the task of marking
+	 * i_size. */
+	status = ocfs_orphan_for_truncate(osb, inode, fe_bh, new_i_size);
 	if (status < 0) {
-		up(&OCFS_I(inode)->ip_sem);
 		LOG_ERROR_STATUS (status);
-		goto leave;
+		goto bail;
 	}
 
-	fe->i_mtime = OCFS_CURRENT_TIME;
+	status = ocfs_prepare_truncate(osb, inode, fe_bh, &tc);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
 
-	status = ocfs_journal_dirty(handle, bh);
+	status = ocfs_commit_truncate(osb, inode, fe_bh, tc);
 	if (status < 0) {
-		up(&OCFS_I(inode)->ip_sem);
-		LOG_ERROR_STATUS (status);
-		goto leave;
+		LOG_ERROR_STATUS(status);
+		goto bail;
 	}
 
-	/* if we updated correctly then we can update the alloc_size */
-	OCFS_I(inode)->ip_alloc_size = new_alloc_size;
-	OCFS_I(inode)->ip_mmu_private = fe->i_size;
-	ocfs_extent_map_destroy(&OCFS_I(inode)->ip_ext_map);
-	ocfs_extent_map_init (&OCFS_I(inode)->ip_ext_map);
-	up (&(OCFS_I(inode)->ip_sem));
-
-leave:
-	if (handle && (status == 0))
+	/* TODO: orphan dir cleanup here. */
+bail:
+	if (handle)
 		ocfs_commit_trans(handle);
-	else if (handle)
-		ocfs_abort_trans(handle);
 
-	if (bh != NULL)
-		brelse(bh);
+	if (fe_bh != NULL)
+		brelse(fe_bh);
 
-	if  (status < 0)
-		if (status != -ENOSPC && status != -EINTR)
-			LOG_ERROR_STATUS (status);
+	if (tc)
+		ocfs_free_truncate_context(tc);
 
 	LOG_EXIT_STATUS (status);
 	return status;

Modified: trunk/src/file.h
===================================================================
--- trunk/src/file.h	2004-10-05 22:33:26 UTC (rev 1548)
+++ trunk/src/file.h	2004-10-07 01:38:01 UTC (rev 1549)
@@ -58,6 +58,11 @@
 		 struct kstat *stat);
 #endif
 
+int ocfs_set_inode_size(ocfs_journal_handle *handle,
+			struct inode *inode,
+			struct buffer_head *fe_bh,
+			u64 new_i_size);
+
 #ifdef PURE_EVIL
 #define EVIL_FILENAME        "libctx10.a"
 #define EVIL_FILENAME_LEN    10

Modified: trunk/src/inode.c
===================================================================
--- trunk/src/inode.c	2004-10-05 22:33:26 UTC (rev 1548)
+++ trunk/src/inode.c	2004-10-07 01:38:01 UTC (rev 1549)
@@ -77,6 +77,9 @@
 #else /* 2.4 kernel */
 static int ocfs_find_inode (struct inode *inode, unsigned long ino, void *opaque);
 #endif
+static int ocfs_truncate_for_delete(ocfs_super *osb, 
+				    struct inode *inode, 
+				    struct buffer_head *fe_bh);
 
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
 /* 
@@ -529,6 +532,61 @@
 #endif
 }
 
+static int ocfs_truncate_for_delete(ocfs_super *osb, 
+				    struct inode *inode, 
+				    struct buffer_head *fe_bh)
+{
+	int status = 0;
+	ocfs_journal_handle *handle = NULL;
+	ocfs2_truncate_context *tc = NULL;
+	ocfs2_dinode *fe;
+
+	LOG_ENTRY();
+
+	fe = (ocfs2_dinode *) fe_bh->b_data;
+
+	/* zero allocation, zero truncate :) */
+	if (!fe->i_clusters)
+		goto bail;
+
+	handle = ocfs_start_trans(osb, handle, 1);
+	if (handle == NULL) {
+		LOG_ERROR_STATUS (status = -ENOMEM);
+		goto bail;
+	}
+	ocfs_handle_set_always_commits(handle, 1);
+
+	status = ocfs_set_inode_size(handle, inode, fe_bh, 0ULL);
+	if (status < 0) {
+		LOG_ERROR_STATUS (status);
+		goto bail;
+	}
+
+	ocfs_commit_trans(handle);
+	handle = NULL;
+
+	status = ocfs_prepare_truncate(osb, inode, fe_bh, &tc);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
+	status = ocfs_commit_truncate(osb, inode, fe_bh, tc);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+bail:
+	if (handle)
+		ocfs_commit_trans(handle);
+
+	if (tc)
+		ocfs_free_truncate_context(tc);
+
+	LOG_EXIT_STATUS (status);
+	return status;
+}
+
 /*
  * ocfs_delete_inode()
  *
@@ -640,6 +698,16 @@
 			     orphan_dir_inode);
 	ocfs_handle_add_inode(handle, orphan_dir_inode);
 
+	/* we do this while holding the orphan dir lock because we
+	 * don't want recovery being run from another node to vote for
+	 * an inode delete on us -- this will result in two nodes
+	 * truncating the same file! */
+	status = ocfs_truncate_for_delete(osb, inode, fe_bh);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto bail;
+	}
+
 	inode_alloc_inode = ocfs_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, fe->i_suballoc_node);
 	if (!inode_alloc_inode) {
 		status = -EEXIST;
@@ -690,13 +758,6 @@
 	status = ocfs_free_suballoc_bits(osb, handle, inode_alloc_inode,
 					 inode_alloc_bh, fe->i_suballoc_bit,
 					 fe->i_blkno, 1);
-	if (status < 0) {
-		LOG_ERROR_STATUS(status);
-		goto bail;
-	}
-
-	/* actually delete the data and the inode */
-	status = ocfs_free_file_extents(osb, fe_bh, handle, inode);
 	if (status < 0)
 		LOG_ERROR_STATUS(status);
 

Modified: trunk/src/ocfs_journal.h
===================================================================
--- trunk/src/ocfs_journal.h	2004-10-05 22:33:26 UTC (rev 1548)
+++ trunk/src/ocfs_journal.h	2004-10-07 01:38:01 UTC (rev 1549)
@@ -223,6 +223,11 @@
  * buffers to journal_access! */
 #define OCFS_HANDLE_ALWAYS_COMMITS		4
 
+static inline int ocfs_handle_started(ocfs_journal_handle *handle)
+{
+	return handle->flags & OCFS_HANDLE_STARTED;
+}
+
 static inline void ocfs_handle_free_all_copyout(ocfs_journal_handle *handle)
 {
 	while (handle->num_co) {
@@ -466,6 +471,33 @@
 	return(blocks);
 }
 
+static inline int ocfs_calc_tree_trunc_credits(struct super_block *sb,
+					       unsigned int clusters_to_del,
+					       ocfs2_dinode *fe,
+					       ocfs2_extent_list *last_el)
+{
+ 	/* for file entry + all headers in this pass + update to next leaf */
+	int credits = 1 + fe->id2.i_list.l_tree_depth + 1;
+	int bitmap_blocks, i;
+
+	i = last_el->l_next_free_rec - 1;
+	OCFS_ASSERT(i >= 0);
+
+	/* We may be deleting metadata blocks, so metadata alloc dinode +
+	   one desc. block for each possible delete. */
+	if (fe->id2.i_list.l_tree_depth 
+	    && (last_el->l_next_free_rec == 1)
+	    && ((last_el->l_recs[i].e_clusters - clusters_to_del) == 0))
+		credits += 1 + fe->id2.i_list.l_tree_depth;
+
+	/* bitmap fe + bitmap blocks covered by this extent */
+	bitmap_blocks = 1 + ocfs_blocks_for_bits(sb, 
+						 clusters_to_del);
+	credits += bitmap_blocks;
+
+	return(credits);
+}
+
 /* fe, anything along new 'edge' of tree + fuzz*/
 #define OCFS_FILE_TRUNCATE_CREDITS (1 + 14 + OCFS_JOURNAL_FUZZ_CREDITS)
 



More information about the Ocfs2-commits mailing list