[Ocfs2-tools-commits] taoma commits r1406 - in trunk: debugfs.ocfs2 extras fsck.ocfs2 fswreck libocfs2 libocfs2/include mkfs.ocfs2 sizetest tunefs.ocfs2

svn-commits at oss.oracle.com svn-commits at oss.oracle.com
Fri Aug 24 16:36:29 PDT 2007


Author: taoma
Date: 2007-08-24 16:36:10 -0700 (Fri, 24 Aug 2007)
New Revision: 1406

Modified:
   trunk/debugfs.ocfs2/commands.c
   trunk/debugfs.ocfs2/dump.c
   trunk/debugfs.ocfs2/find_block_inode.c
   trunk/debugfs.ocfs2/journal.c
   trunk/debugfs.ocfs2/utils.c
   trunk/extras/find_dup_extents.c
   trunk/fsck.ocfs2/extent.c
   trunk/fsck.ocfs2/journal.c
   trunk/fsck.ocfs2/pass1.c
   trunk/fswreck/dir.c
   trunk/fswreck/extent.c
   trunk/fswreck/symlink.c
   trunk/libocfs2/cached_inode.c
   trunk/libocfs2/dir_scan.c
   trunk/libocfs2/expanddir.c
   trunk/libocfs2/extend_file.c
   trunk/libocfs2/extent_map.c
   trunk/libocfs2/extents.c
   trunk/libocfs2/fileio.c
   trunk/libocfs2/heartbeat.c
   trunk/libocfs2/include/ocfs2.h
   trunk/libocfs2/include/ocfs2_fs.h
   trunk/libocfs2/mkjournal.c
   trunk/libocfs2/truncate.c
   trunk/mkfs.ocfs2/mkfs.c
   trunk/mkfs.ocfs2/mkfs.h
   trunk/sizetest/sizes.txt
   trunk/sizetest/sizetest.c
   trunk/tunefs.ocfs2/remove_slot.c
Log:
The on-disk layout of ocfs2 volumes has been changed to support sparse files,
so the OCFS2 tools also need to be revised to handle this new feature.

Signed-off-by: mfasheh

Modified: trunk/debugfs.ocfs2/commands.c
===================================================================
--- trunk/debugfs.ocfs2/commands.c	2007-08-24 23:30:25 UTC (rev 1405)
+++ trunk/debugfs.ocfs2/commands.c	2007-08-24 23:36:10 UTC (rev 1406)
@@ -426,21 +426,34 @@
 	errcode_t ret = 0;
 	char *buf = NULL;
 	int i;
-	uint32_t clstoff;
+	uint32_t clstoff, clusters;
 	uint32_t tmp;
 
 	clstoff = ocfs2_blocks_to_clusters(fs, blkoff);
 
 	for (i = 0; i < el->l_next_free_rec; ++i) {
 		rec = &(el->l_recs[i]);
+		clusters = ocfs2_rec_clusters(el->l_tree_depth, rec);
 
-		/* TODO Fix to handle sparse trees */
-		if (clstoff >= (rec->e_cpos + rec->e_clusters))
+		/*
+		 * For a sparse file, we may find an empty record.
+		 * Just skip it.
+		 */
+		if (!clusters)
 			continue;
 
+		if (clstoff >= (rec->e_cpos + clusters))
+			continue;
+
 		if (!el->l_tree_depth) {
-			tmp = blkoff - ocfs2_clusters_to_blocks(fs, clstoff);
-			dump_logical_blkno(out, rec->e_blkno + tmp);
+			if (clstoff < rec->e_cpos) {
+				dump_logical_blkno(out, 0);
+			} else {
+				tmp = blkoff -
+					ocfs2_clusters_to_blocks(fs,
+								 rec->e_cpos);
+				dump_logical_blkno(out, rec->e_blkno + tmp);
+			}
 			goto bail;
 		}
 
@@ -482,11 +495,23 @@
 	errcode_t ret = 0;
 	char *buf = NULL;
 	int i;
+	uint32_t clusters;
 
 	dump_extent_list (out, el);
 
 	for (i = 0; i < el->l_next_free_rec; ++i) {
 		rec = &(el->l_recs[i]);
+		clusters = ocfs2_rec_clusters(el->l_tree_depth, rec);
+
+		/*
+		 * In an unsuccessful insertion, we may shift the tree,
+		 * add a new branch to it, and do no insertion. So we
+		 * may meet an extent record which has
+		 * clusters == 0; this should only happen
+		 * in the last extent rec. */
+		if (!clusters && i == el->l_next_free_rec - 1)
+			break;
+
 		if (el->l_tree_depth) {
 			ret = ocfs2_malloc_block(gbls.fs->fs_io, &buf);
 			if (ret)

Modified: trunk/debugfs.ocfs2/dump.c
===================================================================
--- trunk/debugfs.ocfs2/dump.c	2007-08-24 23:30:25 UTC (rev 1405)
+++ trunk/debugfs.ocfs2/dump.c	2007-08-24 23:36:10 UTC (rev 1406)
@@ -296,6 +296,7 @@
 {
 	struct ocfs2_extent_rec *rec;
 	int i;
+	uint32_t clusters;
 
 	fprintf(out, "\tTree Depth: %u   Count: %u   Next Free Rec: %u\n",
 		ext->l_tree_depth, ext->l_count, ext->l_next_free_rec);
@@ -303,12 +304,25 @@
 	if (!ext->l_next_free_rec)
 		goto bail;
 
-	fprintf(out, "\t## %-11s   %-12s   %-s\n", "Offset", "Clusters", "Block#");
+	if (ext->l_tree_depth)
+		fprintf(out, "\t## %-11s   %-12s   %-s\n", "Offset",
+			"Clusters", "Block#");
+	else
+		fprintf(out, "\t## %-11s   %-12s   %-13s   %s\n", "Offset",
+			"Clusters", "Block#", "Flags");
 
 	for (i = 0; i < ext->l_next_free_rec; ++i) {
 		rec = &(ext->l_recs[i]);
-		fprintf(out, "\t%-2d %-11u   %-12u   %"PRIu64"\n",
-		       	i, rec->e_cpos, rec->e_clusters, rec->e_blkno);
+		clusters = ocfs2_rec_clusters(ext->l_tree_depth, rec);
+
+		if (ext->l_tree_depth)
+			fprintf(out, "\t%-2d %-11u   %-12u   %"PRIu64"\n",
+				i, rec->e_cpos, clusters, rec->e_blkno);
+		else
+			fprintf(out,
+				"\t%-2d %-11u   %-12u   %-13"PRIu64"   0x%x\n",
+				i, rec->e_cpos, clusters, rec->e_blkno,
+				rec->e_flags);
 	}
 
 bail:

Modified: trunk/debugfs.ocfs2/find_block_inode.c
===================================================================
--- trunk/debugfs.ocfs2/find_block_inode.c	2007-08-24 23:30:25 UTC (rev 1405)
+++ trunk/debugfs.ocfs2/find_block_inode.c	2007-08-24 23:36:10 UTC (rev 1406)
@@ -49,6 +49,7 @@
 	int i;
 	int j;
 	uint64_t numblks;
+	uint32_t clusters;
 
 	if (*found >= count)
 		return 0;
@@ -63,6 +64,15 @@
 
 	for (i = 0; i < el->l_next_free_rec; ++i) {
 		rec = &(el->l_recs[i]);
+		clusters = ocfs2_rec_clusters(el->l_tree_depth, rec);
+
+		/*
+		 * For a sparse file, we may find an empty record.
+		 * Just skip it.
+		 */
+		if (!clusters)
+			continue;
+
 		if (el->l_tree_depth) {
 			ret = ocfs2_read_extent_block(fs, rec->e_blkno, buf);
 			if (ret) {
@@ -91,7 +101,7 @@
 			if (ba[j].status != STATUS_UNKNOWN)
 				continue;
 
-			numblks = ocfs2_clusters_to_blocks(fs, rec->e_clusters);
+			numblks = ocfs2_clusters_to_blocks(fs, clusters);
 
 			if (ba[j].blkno >= rec->e_blkno &&
 			    ba[j].blkno < rec->e_blkno + numblks) {

Modified: trunk/debugfs.ocfs2/journal.c
===================================================================
--- trunk/debugfs.ocfs2/journal.c	2007-08-24 23:30:25 UTC (rev 1405)
+++ trunk/debugfs.ocfs2/journal.c	2007-08-24 23:36:10 UTC (rev 1406)
@@ -88,12 +88,6 @@
 		goto bail;
 	}
 
-	ret = ocfs2_extent_map_init(fs, ci);
-	if (ret) {
-		com_err(gbls.cmd, ret, "while initializing extent map");
-		goto bail;
-	}
-
 	buflenbits = buflen >>
 			OCFS2_RAW_SB(gbls.fs->fs_super)->s_blocksize_bits;
 	ret = ocfs2_malloc_blocks(fs->fs_io, buflenbits, &buf);

Modified: trunk/debugfs.ocfs2/utils.c
===================================================================
--- trunk/debugfs.ocfs2/utils.c	2007-08-24 23:30:25 UTC (rev 1405)
+++ trunk/debugfs.ocfs2/utils.c	2007-08-24 23:36:10 UTC (rev 1406)
@@ -443,12 +443,6 @@
 		goto bail;
 	}
 
-	ret = ocfs2_extent_map_init(fs, ci);
-	if (ret) {
-		com_err(gbls.cmd, ret, "while initializing extent map");
-		goto bail;
-	}
-
 	buflen = 1024 * 1024;
 
 	ret = ocfs2_malloc_blocks(fs->fs_io,
@@ -517,12 +511,6 @@
 		goto bail;
 	}
 
-	ret = ocfs2_extent_map_init(fs, ci);
-	if (ret) {
-		com_err(gbls.cmd, ret, "while initializing extent map");
-		goto bail;
-	}
-
 	if (!*buflen) {
 		*buflen = (((ci->ci_inode->i_size + fs->fs_blocksize - 1) >>
 			    OCFS2_RAW_SB(fs->fs_super)->s_blocksize_bits) <<

Modified: trunk/extras/find_dup_extents.c
===================================================================
--- trunk/extras/find_dup_extents.c	2007-08-24 23:30:25 UTC (rev 1405)
+++ trunk/extras/find_dup_extents.c	2007-08-24 23:36:10 UTC (rev 1406)
@@ -68,7 +68,7 @@
 		OCFS2_RAW_SB(fs->fs_super)->s_blocksize_bits;
 
 	cluster = (uint32_t)(rec->e_blkno >> b_to_c_bits);
-	for (i = 0; i < rec->e_clusters; i++) {
+	for (i = 0; i < ocfs2_rec_clusters(tree_depth, rec); i++) {
 		ret = ocfs2_bitmap_set(we->extent_map,
 				       cluster + i,
 				       &oldval);
@@ -112,7 +112,7 @@
 		OCFS2_RAW_SB(fs->fs_super)->s_blocksize_bits;
 
 	cluster = (uint32_t)(rec->e_blkno >> b_to_c_bits);
-	for (i = 0; i < rec->e_clusters; i++) {
+	for (i = 0; i < ocfs2_rec_clusters(tree_depth, rec); i++) {
 		ret = ocfs2_bitmap_test(we->dup_map,
 					cluster + i,
 					&oldval);

Modified: trunk/fsck.ocfs2/extent.c
===================================================================
--- trunk/fsck.ocfs2/extent.c	2007-08-24 23:30:25 UTC (rev 1405)
+++ trunk/fsck.ocfs2/extent.c	2007-08-24 23:36:10 UTC (rev 1406)
@@ -155,10 +155,11 @@
 {
 	errcode_t ret = 0;
 	uint64_t first_block;
-	uint32_t last_cluster;
+	uint32_t last_cluster, clusters;
 
+	clusters = ocfs2_rec_clusters(el->l_tree_depth, er);
 	verbosef("cpos %u clusters %u blkno %"PRIu64"\n", er->e_cpos,
-		 er->e_clusters, er->e_blkno);
+		 clusters, er->e_blkno);
 
 	if (ocfs2_block_out_of_range(ost->ost_fs, er->e_blkno))
 		goto out;
@@ -204,7 +205,7 @@
 	/* imagine blkno 0, 1 er_clusters.  last_cluster is 1 and 
 	 * fs_clusters is 1, which is ok.. */
 	last_cluster = ocfs2_blocks_to_clusters(ost->ost_fs, er->e_blkno) +
-		       er->e_clusters;
+		       clusters;
 
 	if (last_cluster > ost->ost_fs->fs_clusters &&
 	    prompt(ost, PY, PR_EXTENT_CLUSTERS_OVERRUN,
@@ -214,7 +215,8 @@
 		   "clusters to fit it in the volume?", er->e_cpos, 
 		   di->i_blkno, last_cluster - ost->ost_fs->fs_clusters)) {
 
-		er->e_clusters -= last_cluster - ost->ost_fs->fs_clusters;
+		clusters -= last_cluster - ost->ost_fs->fs_clusters;
+		ocfs2_set_rec_clusters(el->l_tree_depth, er, clusters);
 		*changed = 1;
 	}
 	
@@ -236,6 +238,7 @@
 	struct ocfs2_extent_rec *er;
 	uint64_t max_size;
 	uint16_t i;
+	uint32_t clusters;
 	size_t cpy;
 
 	verbosef("depth %u count %u next_free %u\n", el->l_tree_depth,
@@ -286,7 +289,17 @@
 
 	for (i = 0; i < max_recs; i++) {
 		er = &el->l_recs[i];
+		clusters = ocfs2_rec_clusters(el->l_tree_depth, er);
 
+		/*
+		 * For a sparse file, we may find an empty record
+		 * in the left most record. Just skip it.
+		 */
+		if ((OCFS2_RAW_SB(ost->ost_fs->fs_super)->s_feature_incompat
+		     & OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC) &&
+		    el->l_tree_depth && !i && !clusters)
+			continue;
+
 		/* returns immediately if blkno is out of range.
 		 * descends into eb.  checks that data er doesn't
 		 * reference past the volume or anything crazy. */
@@ -328,11 +341,11 @@
 		/* mark the data clusters as used */
 		o2fsck_mark_clusters_allocated(ost,
 			ocfs2_blocks_to_clusters(ost->ost_fs, er->e_blkno),
-			er->e_clusters);
+			clusters);
 
-		ei->ei_clusters += er->e_clusters;
+		ei->ei_clusters += clusters;
 
-		max_size = (er->e_cpos + er->e_clusters) <<
+		max_size = (er->e_cpos + clusters) <<
 			   OCFS2_RAW_SB(ost->ost_fs->fs_super)->s_clustersize_bits;
 		if (max_size > ei->ei_max_size)
 			ei->ei_max_size = max_size;

Modified: trunk/fsck.ocfs2/journal.c
===================================================================
--- trunk/fsck.ocfs2/journal.c	2007-08-24 23:30:25 UTC (rev 1405)
+++ trunk/fsck.ocfs2/journal.c	2007-08-24 23:36:10 UTC (rev 1406)
@@ -238,7 +238,7 @@
 				      int check_dup)
 {
 	errcode_t ret;
-	int contig;
+	uint64_t contig;
 	int was_set;
 
 	ret = ocfs2_extent_map_get_blocks(ji->ji_cinode, blkoff, 1, blkno,
@@ -482,12 +482,6 @@
 	      OCFS2_JOURNAL_DIRTY_FL))
 		goto out;
 
-	err = ocfs2_extent_map_init(fs, ji->ji_cinode);
-	if (err) {
-		com_err(whoami, err, "while initializing extent map");
-		goto out;
-	}
-
 	err = lookup_journal_block(fs, ji, 0, &ji->ji_jsb_block, 1);
 	if (err)
 		goto out;
@@ -529,7 +523,8 @@
 	uint64_t blkno;
 	errcode_t ret;
 	ocfs2_cached_inode *cinode = NULL;
-	int contig, is_dirty;
+	int is_dirty;
+	uint64_t contig;
 	journal_superblock_t *jsb;
 
 	*should = 0;
@@ -564,12 +559,6 @@
 			goto out;
 		}
 
-		ret = ocfs2_extent_map_init(fs, cinode);
-		if (ret) {
-			com_err(whoami, ret, "while initializing extent map");
-			goto out;
-		}
-
 		is_dirty = cinode->ci_inode->id1.journal1.ij_flags &
 			   OCFS2_JOURNAL_DIRTY_FL;
 		verbosef("slot %d JOURNAL_DIRTY_FL: %d\n", i, is_dirty);
@@ -740,7 +729,7 @@
 				     ocfs2_cached_inode *ci)
 {
 	errcode_t ret;
-	int contig;
+	uint64_t contig;
 	uint64_t blkno;
 	char *buf = NULL;
 
@@ -748,10 +737,6 @@
 	if (ret)
 		goto out;
 
-	ret = ocfs2_extent_map_init(fs, ci);
-	if (ret)
-		goto out;
-
 	ret = ocfs2_extent_map_get_blocks(ci, 0, 1, &blkno, &contig);
 	if (ret)
 		goto out;

Modified: trunk/fsck.ocfs2/pass1.c
===================================================================
--- trunk/fsck.ocfs2/pass1.c	2007-08-24 23:30:25 UTC (rev 1405)
+++ trunk/fsck.ocfs2/pass1.c	2007-08-24 23:36:10 UTC (rev 1406)
@@ -739,7 +739,7 @@
 static errcode_t o2fsck_check_blocks(ocfs2_filesys *fs, o2fsck_state *ost,
 				     uint64_t blkno, struct ocfs2_dinode *di)
 {
-	uint64_t expected = 0;
+	uint64_t expected = 0, unexpected = 0;
 	errcode_t ret;
 	struct verifying_blocks vb = {
 		.vb_ost = ost,
@@ -804,29 +804,82 @@
 		goto out;	
 	}
 
-	if (vb.vb_num_blocks > 0)
-		expected = (vb.vb_last_block + 1) * fs->fs_blocksize;
+	/*
+	 * i_size and i_clusters mean quite different things on a non-sparse
+	 * and a sparse file system.
+	 *
+	 * For a non-sparse file system, the file size should be within the
+	 * clusters it is allocated, and the cluster size should be the same
+	 * as the number we calculate from extent iteration.
+	 *
+	 * For a sparse file, the file size can be greater than the last
+	 * block offset recorded in the extent list, but it shouldn't be
+	 * less than that cluster offset since we have already allocated some
+	 * blocks at that offset, so if the size is too small, fix it to the
+	 * end of the last visible cluster. It is also reasonable for a file
+	 * to have no allocated blocks but any size in bytes, so in that
+	 * case we don't need to check its size either.
+	 */
+	if (OCFS2_RAW_SB(fs->fs_super)->s_feature_incompat &
+	    OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC) {
+		if (vb.vb_num_blocks > 0) {
+			expected = ocfs2_blocks_to_clusters(fs,
+							 vb.vb_last_block + 1);
+			expected *=  fs->fs_clustersize;
+			unexpected = expected - fs->fs_clustersize;
 
-	/* i_size is checked for symlinks elsewhere */
-	if (!S_ISLNK(di->i_mode) && di->i_size > expected &&
-	    prompt(ost, PY, PR_INODE_SIZE, "Inode %"PRIu64" has a size of "
-		   "%"PRIu64" but has %"PRIu64" bytes of actual data. "
-		   "Correct the file size?",
-		    di->i_blkno, di->i_size, expected)) {
-		di->i_size = expected;
-		o2fsck_write_inode(ost, blkno, di);
-	}
+			/* i_size is checked for symlinks elsewhere */
+			if (!S_ISLNK(di->i_mode) && di->i_size <= unexpected &&
+			    prompt(ost, PY, PR_INODE_SIZE, "Inode %"PRIu64
+				   " has a size of %"PRIu64" but has %"PRIu64
+				   " blocks of actual data. "
+				   "Correct the file size?",
+				    di->i_blkno, di->i_size,
+				    vb.vb_last_block + 1)) {
+				di->i_size = expected;
+				o2fsck_write_inode(ost, blkno, di);
+			}
+		}
 
-	if (vb.vb_num_blocks > 0)
-		expected = ocfs2_clusters_in_blocks(fs, vb.vb_last_block + 1);
+		if (vb.vb_num_blocks > 0)
+			expected = ocfs2_clusters_in_blocks(fs, vb.vb_num_blocks);
 
-	if (di->i_clusters < expected &&
-	    prompt(ost, PY, PR_INODE_CLUSTERS,
-		   "Inode %"PRIu64" has %"PRIu32" clusters but its "
-		   "blocks fit in %"PRIu64" clusters.  Correct the number of "
-		   "clusters?", di->i_blkno, di->i_clusters, expected)) {
-		di->i_clusters = expected;
-		o2fsck_write_inode(ost, blkno, di);
+		if (di->i_clusters < expected &&
+		    prompt(ost, PY, PR_INODE_CLUSTERS,
+			   "Inode %"PRIu64" has %"PRIu32" clusters but its "
+			   "blocks fit in %"PRIu64" clusters. "
+			   "Correct the number of clusters?",
+			   di->i_blkno, di->i_clusters, expected)) {
+			di->i_clusters = expected;
+			o2fsck_write_inode(ost, blkno, di);
+		}
+	} else {
+		if (vb.vb_num_blocks > 0)
+			expected = (vb.vb_last_block + 1) * fs->fs_blocksize;
+
+		/* i_size is checked for symlinks elsewhere */
+		if (!S_ISLNK(di->i_mode) && di->i_size > expected &&
+		    prompt(ost, PY, PR_INODE_SIZE, "Inode %"PRIu64" has a size of "
+			   "%"PRIu64" but has %"PRIu64" bytes of actual data. "
+			   "Correct the file size?",
+			    di->i_blkno, di->i_size, expected)) {
+			di->i_size = expected;
+			o2fsck_write_inode(ost, blkno, di);
+		}
+
+		if (vb.vb_num_blocks > 0)
+			expected = ocfs2_clusters_in_blocks(fs,
+							vb.vb_last_block + 1);
+
+		if (di->i_clusters < expected &&
+		    prompt(ost, PY, PR_INODE_CLUSTERS,
+			   "Inode %"PRIu64" has %"PRIu32" clusters but its "
+			   "blocks fit in %"PRIu64" clusters.  Correct the "
+			   "number of clusters?",
+			   di->i_blkno, di->i_clusters, expected)) {
+			di->i_clusters = expected;
+			o2fsck_write_inode(ost, blkno, di);
+		}
 	}
 out:
 	return ret;

Modified: trunk/fswreck/dir.c
===================================================================
--- trunk/fswreck/dir.c	2007-08-24 23:30:25 UTC (rev 1405)
+++ trunk/fswreck/dir.c	2007-08-24 23:36:10 UTC (rev 1406)
@@ -134,7 +134,7 @@
 	errcode_t ret;
 	char *buf = NULL;
 	uint64_t blkno, tmp_blkno;
-	int contig;
+	uint64_t contig;
 	ocfs2_cached_inode *cinode = NULL;
 	struct ocfs2_dir_entry *de = NULL, *newent = NULL;
 	char name[OCFS2_MAX_FILENAME_LEN];
@@ -145,10 +145,6 @@
 	if (ret)
 		FSWRK_COM_FATAL(progname, ret);
 
-	ret = ocfs2_extent_map_init(fs, cinode);
-	if (ret)
-		FSWRK_COM_FATAL(progname, ret);
-
 	/* get first blockno */
 	ret = ocfs2_extent_map_get_blocks(cinode, 0, 1, &blkno, &contig);
 	if (ret)
@@ -336,7 +332,7 @@
 void mess_up_dir_parent_dup(ocfs2_filesys *fs, uint64_t blkno)
 {
 	errcode_t ret;
-	int contig;
+	uint64_t contig;
 	uint64_t parent1, parent2, tmp_blkno,extblk;
 	char *buf = NULL;
 	struct ocfs2_dir_entry *de = NULL, *newent = NULL;
@@ -360,10 +356,6 @@
 	if (ret)
 		FSWRK_COM_FATAL(progname, ret);
 
-	ret = ocfs2_extent_map_init(fs, cinode);
-	if (ret)
-		FSWRK_COM_FATAL(progname, ret);
-
 	ret = ocfs2_extent_map_get_blocks(cinode, 0, 1, &extblk, &contig);
 	if (ret)
 		FSWRK_COM_FATAL(progname, ret);

Modified: trunk/fswreck/extent.c
===================================================================
--- trunk/fswreck/extent.c	2007-08-24 23:30:25 UTC (rev 1405)
+++ trunk/fswreck/extent.c	2007-08-24 23:36:10 UTC (rev 1406)
@@ -84,7 +84,7 @@
 {
 	errcode_t ret;
 	uint32_t n_clusters;
-	uint32_t i;
+	uint32_t i, offset = 0;
 	uint64_t blkno;
 	uint64_t tmpblk;
 
@@ -103,7 +103,8 @@
 		 * we insert each cluster in reverse. */
 		for(i = n_clusters; i; --i) {
 			tmpblk = blkno + ocfs2_clusters_to_blocks(fs, i - 1);
-		 	ret = ocfs2_insert_extent(fs, ino, tmpblk, 1);
+		 	ret = ocfs2_insert_extent(fs, ino, offset++,
+						  tmpblk, 1);
 			if (ret) 
 				FSWRK_COM_FATAL(progname, ret);	
 		}
@@ -311,14 +312,14 @@
 				blkno, oldno, er->e_blkno);
 			break;
 	 	case EXTENT_CLUSTERS_OVERRUN:
-			oldno = er->e_clusters;
-			er->e_clusters = fs->fs_clusters + 1;
+			oldno = er->e_leaf_clusters;
+			er->e_leaf_clusters = fs->fs_clusters + 1;
 			er->e_blkno = ocfs2_clusters_to_blocks(fs, 
 							fs->fs_clusters - 1);
 			fprintf(stdout, "EXTENT_CLUSTERS_OVERRUN: "
 				"Corrupt inode#%"PRIu64", "
 				"change cluster from %"PRIu64 " to %d\n",
-				blkno, oldno, er->e_clusters);
+				blkno, oldno, er->e_leaf_clusters);
 			break;
 		case EXTENT_BLKNO_RANGE:
 			er->e_blkno = 1;

Modified: trunk/fswreck/symlink.c
===================================================================
--- trunk/fswreck/symlink.c	2007-08-24 23:30:25 UTC (rev 1405)
+++ trunk/fswreck/symlink.c	2007-08-24 23:36:10 UTC (rev 1406)
@@ -76,17 +76,13 @@
 	errcode_t ret = 0;
 	ocfs2_cached_inode *cinode = NULL;
 	uint64_t new_blk;
-	int contig;
+	uint64_t contig;
 	char *buf = NULL;
 
 	ret = ocfs2_read_cached_inode(fs, blkno, &cinode);
 	if (ret)
 		FSWRK_COM_FATAL(progname, ret);
 
-	ret = ocfs2_extent_map_init(fs, cinode);
-	if (ret)
-		FSWRK_COM_FATAL(progname, ret);
-
 	/* get first block of the file */
 	ret = ocfs2_extent_map_get_blocks(cinode, 0, 1,
 					  &new_blk, &contig);
@@ -214,9 +210,9 @@
 		er = el->l_recs;
 		fprintf(stdout, "LINK_BLOCKS: "
 			"Corrupt inode#%"PRIu64","
-			"change e_clusters from %u to %u\n",
-			blkno, er->e_clusters, (er->e_clusters + 1));
-		er->e_clusters += 1;
+			"change e_leaf_clusters from %u to %u\n",
+			blkno, er->e_leaf_clusters, (er->e_leaf_clusters + 1));
+		er->e_leaf_clusters += 1;
 		break;
 	default:
 		FSWRK_FATAL("Invalid type[%d]\n", type);

Modified: trunk/libocfs2/cached_inode.c
===================================================================
--- trunk/libocfs2/cached_inode.c	2007-08-24 23:30:25 UTC (rev 1405)
+++ trunk/libocfs2/cached_inode.c	2007-08-24 23:36:10 UTC (rev 1406)
@@ -73,9 +73,6 @@
 	if (!cinode)
 		return OCFS2_ET_INVALID_ARGUMENT;
 	
-	if (cinode->ci_map)
-		ocfs2_extent_map_free(cinode);
-	
 	if (cinode->ci_chains)
 		ocfs2_bitmap_free(cinode->ci_chains);
 

Modified: trunk/libocfs2/dir_scan.c
===================================================================
--- trunk/libocfs2/dir_scan.c	2007-08-24 23:30:25 UTC (rev 1405)
+++ trunk/libocfs2/dir_scan.c	2007-08-24 23:36:10 UTC (rev 1406)
@@ -50,7 +50,7 @@
 {
 	errcode_t ret;
 	uint64_t blkno;
-	int cblocks;
+	uint64_t cblocks;
 
 	if (scan->blocks_read == scan->total_blocks)
 		return OCFS2_ET_ITERATION_COMPLETE;
@@ -146,10 +146,6 @@
 	if (ret)
 		goto bail_dir_block;
 
-	ret = ocfs2_extent_map_init(fs, scan->inode);
-	if (ret)
-		goto bail_inode;
-
 	scan->total_blocks = scan->inode->ci_inode->i_size /
 		fs->fs_blocksize;
 	/*
@@ -163,9 +159,6 @@
 
 	return 0;
 
-bail_inode:
-	ocfs2_free_cached_inode(scan->fs, scan->inode);
-
 bail_dir_block:
 	ocfs2_free(&scan->buf);
 

Modified: trunk/libocfs2/expanddir.c
===================================================================
--- trunk/libocfs2/expanddir.c	2007-08-24 23:30:25 UTC (rev 1405)
+++ trunk/libocfs2/expanddir.c	2007-08-24 23:36:10 UTC (rev 1406)
@@ -50,7 +50,7 @@
 	uint64_t used_blks;
 	uint64_t totl_blks;
 	uint64_t new_blk;
-	int contig;
+	uint64_t contig;
 	char *buf = NULL;
 
 	if (!(fs->fs_flags & OCFS2_FLAG_RW))
@@ -86,10 +86,6 @@
 			goto bail;
 	}
 
-	ret = ocfs2_extent_map_init(fs, cinode);
-	if (ret)
-		goto bail;
-
 	/* get the next free block */
 	ret = ocfs2_extent_map_get_blocks(cinode, used_blks, 1,
 					  &new_blk, &contig);

Modified: trunk/libocfs2/extend_file.c
===================================================================
--- trunk/libocfs2/extend_file.c	2007-08-24 23:30:25 UTC (rev 1405)
+++ trunk/libocfs2/extend_file.c	2007-08-24 23:36:10 UTC (rev 1406)
@@ -30,221 +30,1324 @@
 #include <unistd.h>
 #endif
 
+#include <inttypes.h>
+#include <errno.h>
+#include <assert.h>
 #include "ocfs2.h"
 
+/*
+ * Structures which describe a path through a btree, and functions to
+ * manipulate them.
+ *
+ * The idea here is to be as generic as possible with the tree
+ * manipulation code.
+ */
+struct ocfs2_path_item {
+	uint64_t			blkno;
+	char				*buf;
+	struct ocfs2_extent_list	*el;
+};
 
+#define OCFS2_MAX_PATH_DEPTH	5
+
+struct ocfs2_path {
+	int			p_tree_depth;
+	struct ocfs2_path_item	p_node[OCFS2_MAX_PATH_DEPTH];
+};
+
+#define path_root_blkno(_path) ((_path)->p_node[0].blkno)
+#define path_root_buf(_path) ((_path)->p_node[0].buf)
+#define path_root_el(_path) ((_path)->p_node[0].el)
+#define path_leaf_blkno(_path) ((_path)->p_node[(_path)->p_tree_depth].blkno)
+#define path_leaf_buf(_path) ((_path)->p_node[(_path)->p_tree_depth].buf)
+#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
+#define path_num_items(_path) ((_path)->p_tree_depth + 1)
+
 struct insert_ctxt {
 	ocfs2_filesys *fs;
 	struct ocfs2_dinode *di;
 	struct ocfs2_extent_rec rec;
 };
+/*
+ * Reset the actual path elements so that we can re-use the structure
+ * to build another path. Generally, this involves freeing the buffer
+ * heads.
+ */
+static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
+{
+	int i, start = 0, depth = 0;
+	struct ocfs2_path_item *node;
 
-static errcode_t insert_extent_eb(struct insert_ctxt *ctxt,
-				  uint64_t eb_blkno);
+	if (keep_root)
+		start = 1;
 
+	for(i = start; i < path_num_items(path); i++) {
+		node = &path->p_node[i];
+		if (!node->buf)
+			continue;
+
+		ocfs2_free(&node->buf);
+		node->blkno = 0;
+		node->buf = NULL;
+		node->el = NULL;
+	}
+
+	/*
+	 * Tree depth may change during truncate, or insert. If we're
+	 * keeping the root extent list, then make sure that our path
+	 * structure reflects the proper depth.
+	 */
+	if (keep_root)
+		depth = path_root_el(path)->l_tree_depth;
+
+	path->p_tree_depth = depth;
+}
+
+static void ocfs2_free_path(struct ocfs2_path *path)
+{
+	/* We don't free the root because often in libocfs2 the root is a
+	 * shared buffer such as the inode.  Caller must be responsible for
+	 * handling the root of the path.
+	 */
+	if (path) {
+		ocfs2_reinit_path(path, 1);
+		ocfs2_free(&path);
+	}
+}
+
 /*
- * Update the leaf pointer from the previous last_eb_blk to the new
- * last_eb_blk.  Also updates the dinode's ->last_eb_blk.
+ * Make the *dest path the same as src and re-initialize src path to
+ * have a root only.
  */
-static errcode_t update_last_eb_blk(struct insert_ctxt *ctxt,
-				    struct ocfs2_extent_block *eb)
+static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
 {
+	int i;
+
+	assert(path_root_blkno(dest) == path_root_blkno(src));
+
+	for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
+		ocfs2_free(&dest->p_node[i].buf);
+
+		dest->p_node[i].blkno = src->p_node[i].blkno;
+		dest->p_node[i].buf = src->p_node[i].buf;
+		dest->p_node[i].el = src->p_node[i].el;
+
+		src->p_node[i].blkno = 0;
+		src->p_node[i].buf = NULL;
+		src->p_node[i].el = NULL;
+	}
+}
+
+/*
+ * Insert an extent block at given index.
+ *
+ * Note:
+ * This buf will be inserted into the path, so the caller shouldn't free it.
+ */
+static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
+					char *buf)
+{
+	struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *) buf;
+	/*
+	 * Right now, no root buf is an extent block, so this helps
+	 * catch code errors with dinode trees. The assertion can be
+	 * safely removed if we ever need to insert extent block
+	 * structures at the root.
+	 */
+	assert(index);
+
+	path->p_node[index].blkno = eb->h_blkno;
+	path->p_node[index].buf = (char *)buf;
+	path->p_node[index].el = &eb->h_list;
+}
+
+static struct ocfs2_path *ocfs2_new_path(ocfs2_filesys* fs, char *buf,
+					 struct ocfs2_extent_list *root_el)
+{
+	errcode_t ret = 0;
+	struct ocfs2_path *path = NULL;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)buf;
+
+	assert(root_el->l_tree_depth < OCFS2_MAX_PATH_DEPTH);
+
+	ret = ocfs2_malloc0(sizeof(*path), &path);
+	if (path) {
+		path->p_tree_depth = root_el->l_tree_depth;
+		path->p_node[0].blkno = di->i_blkno;
+		path->p_node[0].buf = buf;
+		path->p_node[0].el = root_el;
+	}
+
+	return path;
+}
+
+/*
+ * Allocate and initialize a new path based on a disk inode tree.
+ */
+static struct ocfs2_path *ocfs2_new_inode_path(ocfs2_filesys *fs,
+					       struct ocfs2_dinode *di)
+{
+	struct ocfs2_extent_list *el = &di->id2.i_list;
+
+	return ocfs2_new_path(fs, (char *)di, el);
+}
+
+/* Write all the extent block information to the disk.
+ * We write all path nodes further down than sub_index.
+ * The caller will handle writing the node at sub_index.
+ */
+static errcode_t ocfs2_write_path_eb(ocfs2_filesys *fs,
+				     struct ocfs2_path *path, int sub_index)
+{
 	errcode_t ret;
-	char *buf;
-	struct ocfs2_extent_block *last_eb;
+	int i;
 
-	if (!ctxt->di->i_last_eb_blk)
-		return OCFS2_ET_INTERNAL_FAILURE;
+	for (i = path->p_tree_depth; i > sub_index; i--) {
+		ret = ocfs2_write_extent_block(fs,
+					       path->p_node[i].blkno,
+					       path->p_node[i].buf);
+		if (ret)
+			return ret;
+	}
 
-	ret = ocfs2_malloc_block(ctxt->fs->fs_io, &buf);
-	if (ret)
-		return ret;
+	return 0;
+}
 
-	ret = ocfs2_read_extent_block(ctxt->fs, ctxt->di->i_last_eb_blk,
-				      buf);
-	if (ret)
-		goto out;
+/* Some extent blocks have been modified and we need to synchronize them to
+ * the disk accordingly.
+ *
+ * We will not update the inode if subtree_index is "0" since it should be
+ * updated by the caller.
+ */
+static errcode_t ocfs2_sync_path_to_disk(ocfs2_filesys *fs,
+					 struct ocfs2_path *left_path,
+					 struct ocfs2_path *right_path,
+					 int subtree_index)
+{
+	errcode_t ret;
+	uint64_t blkno = right_path->p_node[subtree_index].blkno;
+	char *sub_root = right_path->p_node[subtree_index].buf;
 
-	last_eb = (struct ocfs2_extent_block *)buf;
-	last_eb->h_next_leaf_blk = eb->h_blkno;
+	assert(right_path);
 
-	ret = ocfs2_write_extent_block(ctxt->fs, last_eb->h_blkno,
-				       buf);
+	if (left_path) {
+		ret = ocfs2_write_path_eb(fs, left_path, subtree_index);
+		if (ret)
+			goto bail;
+	}
+
+	ret = ocfs2_write_path_eb(fs, right_path, subtree_index);
 	if (ret)
-		goto out;
+		goto bail;
 
-	/* This is written at the end by insert_extent() */
-	ctxt->di->i_last_eb_blk = eb->h_blkno;
+	if (subtree_index) {
+		/* subtree_index indicates an extent block. */
+		ret = ocfs2_write_extent_block(fs, blkno, sub_root);
+		if (ret)
+			goto bail;
+	}
+bail:
+	return ret;
+}
 
-out:
-	ocfs2_free(&buf);
+enum ocfs2_contig_type {
+	CONTIG_NONE = 0,
+	CONTIG_LEFT,
+	CONTIG_RIGHT
+};
 
-	return ret;
+/*
+ * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
+ * ocfs2_extent_contig only work properly against leaf nodes!
+ */
+static inline int ocfs2_block_extent_contig(ocfs2_filesys *fs,
+					    struct ocfs2_extent_rec *ext,
+					    uint64_t blkno)
+{
+	uint64_t blk_end = ext->e_blkno;
+
+	blk_end += ocfs2_clusters_to_blocks(fs, ext->e_leaf_clusters);
+
+	return blkno == blk_end;
 }
 
+static inline int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
+					 struct ocfs2_extent_rec *right)
+{
+	uint32_t left_range;
+
+	left_range = left->e_cpos + left->e_leaf_clusters;
+
+	return (left_range == right->e_cpos);
+}
+
+static enum ocfs2_contig_type
+	ocfs2_extent_contig(ocfs2_filesys *fs,
+			    struct ocfs2_extent_rec *ext,
+			    struct ocfs2_extent_rec *insert_rec)
+{
+	uint64_t blkno = insert_rec->e_blkno;
+
+	if (ocfs2_extents_adjacent(ext, insert_rec) &&
+	    ocfs2_block_extent_contig(fs, ext, blkno))
+			return CONTIG_RIGHT;
+
+	blkno = ext->e_blkno;
+	if (ocfs2_extents_adjacent(insert_rec, ext) &&
+	    ocfs2_block_extent_contig(fs, insert_rec, blkno))
+		return CONTIG_LEFT;
+
+	return CONTIG_NONE;
+}
+
 /*
- * Add a child extent_block to a non-leaf extent list.
+ * NOTE: We can have pretty much any combination of contiguousness and
+ * appending.
+ *
+ * The usefulness of APPEND_TAIL is more in that it lets us know that
+ * we'll have to update the path to that leaf.
  */
-static errcode_t append_eb(struct insert_ctxt *ctxt,
-			   struct ocfs2_extent_list *el)
+enum ocfs2_append_type {
+	APPEND_NONE = 0,
+	APPEND_TAIL,
+};
+
+struct ocfs2_insert_type {
+	enum ocfs2_append_type	ins_appending;
+	enum ocfs2_contig_type	ins_contig;
+	int			ins_contig_index;
+	int			ins_free_records;
+	int			ins_tree_depth;
+};
+
+/*
+ * Helper function for ocfs2_add_branch() and shift_tree_depth().
+ *
+ * Returns the sum of the rightmost extent rec logical offset and
+ * cluster count.
+ *
+ * ocfs2_add_branch() uses this to determine what logical cluster
+ * value should be populated into the leftmost new branch records.
+ *
+ * shift_tree_depth() uses this to determine the # clusters
+ * value for the new topmost tree record.
+ */
+static inline uint32_t ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
 {
+	uint16_t i = el->l_next_free_rec - 1;
+
+	return el->l_recs[i].e_cpos +
+		 ocfs2_rec_clusters(el->l_tree_depth, &el->l_recs[i]);
+
+}
+
+/*
+ * Add an entire tree branch to our inode. eb_buf is the extent block
+ * to start at, if we don't want to start the branch at the dinode
+ * structure.
+ *
+ * last_eb_buf is required as we have to update its next_leaf pointer
+ * for the new last extent block.
+ *
+ * the new branch will be 'empty' in the sense that every block will
+ * contain a single record with e_clusters == 0.
+ */
+static int ocfs2_add_branch(ocfs2_filesys *fs,
+			    struct ocfs2_dinode *fe,
+			    char *eb_buf,
+			    char *last_eb_buf)
+{
 	errcode_t ret;
-	char *buf;
-	uint64_t blkno;
+	int new_blocks, i;
+	uint64_t next_blkno, new_last_eb_blk;
 	struct ocfs2_extent_block *eb;
-	struct ocfs2_extent_rec *rec;
+	struct ocfs2_extent_list  *eb_el;
+	struct ocfs2_extent_list  *el;
+	uint32_t new_cpos;
+	uint64_t *new_blknos = NULL;
+	char	**new_eb_bufs = NULL;
+	char *buf = NULL;
 
-	ret = ocfs2_malloc_block(ctxt->fs->fs_io, &buf);
-	if (ret)
-		return ret;
+	assert(last_eb_buf);
 
-	ret = ocfs2_new_extent_block(ctxt->fs, &blkno);
+	if (eb_buf) {
+		eb = (struct ocfs2_extent_block *) eb_buf;
+		el = &eb->h_list;
+	} else
+		el = &fe->id2.i_list;
+
+	/* we never add a branch to a leaf. */
+	assert(el->l_tree_depth);
+
+	new_blocks = el->l_tree_depth;
+
+	/* Allocate bookkeeping for the new extent blocks: one block is
+	 * needed for each tree level below el (new_blocks in total). */
+	ret = ocfs2_malloc0(sizeof(uint64_t) * new_blocks, &new_blknos);
 	if (ret)
-		goto out;
+		goto bail;
+	memset(new_blknos, 0, sizeof(uint64_t) * new_blocks);
 
-	ret = ocfs2_read_extent_block(ctxt->fs, blkno, buf);
+	ret = ocfs2_malloc0(sizeof(char *) * new_blocks, &new_eb_bufs);
 	if (ret)
-		goto out;
+		goto bail;
+	memset(new_eb_bufs, 0, sizeof(char *) * new_blocks);
 
-	eb = (struct ocfs2_extent_block *)buf;
-	eb->h_list.l_tree_depth = el->l_tree_depth - 1;
+	for (i = 0; i < new_blocks; i++) {
+		ret = ocfs2_malloc_block(fs->fs_io, &buf);
+		if (ret)
+			return ret;
+		new_eb_bufs[i] = buf;
 
-	if (!eb->h_list.l_tree_depth) {
-		ret = update_last_eb_blk(ctxt, eb);
+		ret = ocfs2_new_extent_block(fs, &new_blknos[i]);
 		if (ret)
-			goto out;
+			goto bail;
+
+		ret = ocfs2_read_extent_block(fs, new_blknos[i], buf);
+		if (ret)
+			goto bail;
 	}
 
-	if (el->l_next_free_rec) {
-		rec = &el->l_recs[el->l_next_free_rec - 1];
-		if (!rec->e_blkno) {
-			rec->e_blkno = blkno;
-			goto out;
-		}
+	eb = (struct ocfs2_extent_block *)last_eb_buf;
+	new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
+
+	/* Note: new_eb_bufs[new_blocks - 1] is the guy which will be
+	 * linked with the rest of the tree.
+	 * conversely, new_eb_bufs[0] is the new bottommost leaf.
+	 *
+	 * when we leave the loop, new_last_eb_blk will point to the
+	 * newest leaf, and next_blkno will point to the topmost extent
+	 * block.
+	 */
+	next_blkno = new_last_eb_blk = 0;
+	for(i = 0; i < new_blocks; i++) {
+		buf = new_eb_bufs[i];
+		eb = (struct ocfs2_extent_block *) buf;
+		eb_el = &eb->h_list;
+
+		eb->h_next_leaf_blk = 0;
+		eb_el->l_tree_depth = i;
+		eb_el->l_next_free_rec = 1;
+		memset(eb_el->l_recs, 0,
+		       sizeof(struct ocfs2_extent_rec) * eb_el->l_count);
+		/*
+		 * This actually counts as an empty extent as
+		 * its cluster count is 0
+		 */
+		eb_el->l_recs[0].e_cpos = new_cpos;
+		eb_el->l_recs[0].e_blkno = next_blkno;
+		/*
+		 * eb_el isn't always an interior node, but even leaf
+		 * nodes want a zero'd flags and reserved field so
+		 * this gets the whole 32 bits regardless of use.
+		 */
+		eb_el->l_recs[0].e_int_clusters = 0;
+
+		if (!eb_el->l_tree_depth)
+			new_last_eb_blk = eb->h_blkno;
+
+		next_blkno = eb->h_blkno;
 	}
-	rec = &el->l_recs[el->l_next_free_rec];
-	rec->e_blkno = blkno;
-	rec->e_cpos = ctxt->rec.e_cpos;
+
+	/* Link the new branch into the rest of the tree (el will
+	 * either be on the fe, or the extent block passed in.)
+	 */
+	i = el->l_next_free_rec;
+	el->l_recs[i].e_blkno = next_blkno;
+	el->l_recs[i].e_cpos = new_cpos;
+	el->l_recs[i].e_int_clusters = 0;
 	el->l_next_free_rec++;
 
-out:
-	ocfs2_free(&buf);
+	/* fe needs a new last extent block pointer, as does the
+	 * next_leaf on the previously last-extent-block.
+	 */
+	fe->i_last_eb_blk = new_last_eb_blk;
 
+	/* Here all of the new extent blocks are written back to disk.
+	 * The inode itself is not written by this function.
+	 */
+	for(i = 0; i < new_blocks; i++) {
+		buf = new_eb_bufs[i];
+		ret = ocfs2_write_extent_block(fs, new_blknos[i], buf);
+		if (ret)
+			goto bail;
+	}
+
+	/* update last_eb_buf's next_leaf pointer for
+	 * the new last extent block.
+	 */
+	eb = (struct ocfs2_extent_block *) last_eb_buf;
+	eb->h_next_leaf_blk = new_last_eb_blk;
+	ret = ocfs2_write_extent_block(fs, eb->h_blkno, last_eb_buf);
+	if (ret)
+		goto bail;
+
+	if (eb_buf) {
+		eb = (struct ocfs2_extent_block *)eb_buf;
+		ret = ocfs2_write_extent_block(fs, eb->h_blkno, eb_buf);
+		if (ret)
+			goto bail;
+	}
+
+	/* The inode information isn't written back here, since the insertion
+	 * works on duplicated extent blocks and may still fail in a later step.
+	 */
+	ret = 0;
+bail:
+	if (new_eb_bufs) {
+		for (i = 0; i < new_blocks; i++)
+			if (new_eb_bufs[i])
+				ocfs2_free(&new_eb_bufs[i]);
+		ocfs2_free(&new_eb_bufs);
+	}
+
+	if (ret && new_blknos)
+		for (i = 0; i < new_blocks; i++)
+			if (new_blknos[i])
+				ocfs2_delete_extent_block(fs, new_blknos[i]);
+
+	if (new_blknos)
+		ocfs2_free(&new_blknos);
+
 	return ret;
 }
 
 /*
- * Insert a new extent into an extent list.  If this list is a leaf,
- * add it where appropriate.  Otherwise, recurse down the appropriate
- * branch, updating this list on the way back up.
+ * Should only be called when there is no space left in any of the
+ * leaf nodes. What we want to do is find the lowest tree depth
+ * non-leaf extent block with room for new records. There are three
+ * valid results of this search:
+ *
+ * 1) a lowest extent block is found, then we pass it back in
+ *    *target_buf and return '0'
+ *
+ * 2) the search fails to find anything, but the dinode has room. We
+ *    pass NULL back in *target_buf, but still return '0'
+ *
+ * 3) the search fails to find anything AND the dinode is full, in
+ *    which case we return > 0
+ *
+ * return status < 0 indicates an error.
  */
-static errcode_t insert_extent_el(struct insert_ctxt *ctxt,
-			  	  struct ocfs2_extent_list *el)
+static errcode_t ocfs2_find_branch_target(ocfs2_filesys *fs,
+					  struct ocfs2_dinode *fe,
+					  char **target_buf)
 {
-	errcode_t ret;
-	struct ocfs2_extent_rec *rec = NULL;
+	errcode_t ret = 0;
+	int i;
+	uint64_t blkno;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list  *el;
+	char *buf = NULL, *lowest_buf = NULL;
 
-	if (!el->l_tree_depth) {
-		/* A leaf extent_list can do one of three things: */
-		if (el->l_next_free_rec) {
-			/* It has at least one valid entry and... */
-			rec = &el->l_recs[el->l_next_free_rec - 1];
+	*target_buf = NULL;
 
-			/* (1) That entry is contiguous with the new
-			 *     one, so just enlarge the entry. */
-			if ((rec->e_blkno +
-			     ocfs2_clusters_to_blocks(ctxt->fs, rec->e_clusters)) ==
-			    ctxt->rec.e_blkno) {
-				rec->e_clusters += ctxt->rec.e_clusters;
-				return 0;
-			}
+	el = &fe->id2.i_list;
 
-			/* (2) That entry is zero length, so just fill
-			 *     it in with the new one. */
-			if (!rec->e_clusters) {
-				*rec = ctxt->rec;
-				return 0;
-			}
+	ret = ocfs2_malloc_block(fs->fs_io, &buf);
+	if (ret)
+		return ret;
 
-			if (el->l_next_free_rec == el->l_count)
-				return OCFS2_ET_NO_SPACE;
+	while(el->l_tree_depth > 1) {
+		if (el->l_next_free_rec == 0) {
+			ret = OCFS2_ET_CORRUPT_EXTENT_BLOCK;
+			goto bail;
 		}
+		i = el->l_next_free_rec - 1;
+		blkno = el->l_recs[i].e_blkno;
+		if (!blkno) {
+			ret = OCFS2_ET_CORRUPT_EXTENT_BLOCK;
+			goto bail;
+		}
 
-		/* (3) The new entry can't use an existing slot, so
-		 *     put it in a new slot. */
-		rec = &el->l_recs[el->l_next_free_rec];
-		*rec = ctxt->rec;
-		el->l_next_free_rec++;
-		return 0;
+		ret = ocfs2_read_extent_block(fs, blkno, buf);
+		if (ret)
+			goto bail;
+
+		eb = (struct ocfs2_extent_block *) buf;
+		el = &eb->h_list;
+
+		if (el->l_next_free_rec < el->l_count)
+			lowest_buf = buf;
 	}
 
-	/* We're a branch node */
-	ret = OCFS2_ET_NO_SPACE;
-	if (el->l_next_free_rec) {
-		/* If there exists a valid record, and it is not an
-		 * empty record (e_blkno points to a valid child),
-		 * try to fill along that branch. */
-		rec = &el->l_recs[el->l_next_free_rec - 1];
-		if (rec->e_blkno)
-			ret = insert_extent_eb(ctxt, rec->e_blkno);
+	/* If we didn't find one and the fe doesn't have any room,
+	 * then return '1' */
+	if (!lowest_buf
+	    && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count))
+		ret = 1;
+
+	*target_buf = lowest_buf;
+bail:
+	if (buf && !*target_buf)
+		ocfs2_free(&buf);
+
+	return ret;
+}
+
+/*
+ * This is only valid for leaf nodes, which are the only ones that can
+ * have empty extents anyway.
+ */
+static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
+{
+	return !rec->e_leaf_clusters;
+}
+
+/*
+ * This function will discard the rightmost extent record.
+ */
+static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
+{
+	int next_free = el->l_next_free_rec;
+	int count = el->l_count;
+	unsigned int num_bytes;
+
+	assert(next_free);
+	/* This will cause us to go off the end of our extent list. */
+	assert(next_free < count);
+
+	num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
+
+	memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
+}
+
+static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
+			      struct ocfs2_extent_rec *insert_rec)
+{
+	int i, insert_index, next_free, has_empty, num_bytes;
+	uint32_t insert_cpos = insert_rec->e_cpos;
+	struct ocfs2_extent_rec *rec;
+
+	next_free = el->l_next_free_rec;
+	has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
+
+	assert(next_free);
+
+	/* The tree code before us didn't allow enough room in the leaf. */
+	if (el->l_next_free_rec == el->l_count && !has_empty)
+		assert(0);
+
+	/*
+	 * The easiest way to approach this is to just remove the
+	 * empty extent and temporarily decrement next_free.
+	 */
+	if (has_empty) {
+		/*
+		 * If next_free was 1 (only an empty extent), this
+		 * loop won't execute, which is fine. We still want
+		 * the decrement below to happen.
+		 */
+		for(i = 0; i < (next_free - 1); i++)
+			el->l_recs[i] = el->l_recs[i+1];
+
+		next_free--;
 	}
-	if (ret) {
-		if (ret != OCFS2_ET_NO_SPACE)
-			return ret;
-		
-		if ((el->l_next_free_rec == el->l_count) &&
-		    (el->l_recs[el->l_next_free_rec - 1].e_blkno))
-			return OCFS2_ET_NO_SPACE;
 
-		/* If there wasn't an existing child we insert to and
-		 * there are free slots, add a new child. */
-		ret = append_eb(ctxt, el);
+	/* Figure out what the new record index should be. */
+	for(i = 0; i < next_free; i++) {
+		rec = &el->l_recs[i];
+
+		if (insert_cpos < rec->e_cpos)
+			break;
+	}
+	insert_index = i;
+
+	assert(insert_index >= 0);
+	assert(insert_index < el->l_count);
+	assert(insert_index <= next_free);
+
+	/* No need to memmove if we're just adding to the tail. */
+	if (insert_index != next_free) {
+		assert(next_free < el->l_count);
+
+		num_bytes = next_free - insert_index;
+		num_bytes *= sizeof(struct ocfs2_extent_rec);
+		memmove(&el->l_recs[insert_index + 1],
+			&el->l_recs[insert_index],
+			num_bytes);
+	}
+
+	/*
+	 * Either we had an empty extent, and need to re-increment or
+	 * there was no empty extent on a non full rightmost leaf node,
+	 * in which case we still need to increment.
+	 */
+	next_free++;
+	el->l_next_free_rec = next_free;
+	/* Make sure none of the math above just messed up our tree. */
+	assert(el->l_next_free_rec <= el->l_count);
+
+	el->l_recs[insert_index] = *insert_rec;
+}
+
+/*
+ * Create an empty extent record.
+ *
+ * l_next_free_rec may be updated.
+ *
+ * If an empty extent already exists do nothing.
+ */
+static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
+{
+	int next_free = el->l_next_free_rec;
+
+	assert(el->l_tree_depth == 0);
+
+	if (next_free == 0)
+		goto set_and_inc;
+
+	if (ocfs2_is_empty_extent(&el->l_recs[0]))
+		return;
+
+	ocfs2_shift_records_right(el);
+
+set_and_inc:
+	el->l_next_free_rec += 1;
+	memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
+}
+
+/*
+ * For a rotation which involves two leaf nodes, the "root node" is
+ * the lowest level tree node which contains a path to both leaves. This
+ * resulting set of information can be used to form a complete "subtree".
+ *
+ * This function is passed two full paths from the dinode down to a
+ * pair of adjacent leaves. Its task is to figure out which path
+ * index contains the subtree root - this can be the root index itself
+ * in a worst-case rotation.
+ *
+ * The array index of the subtree root is passed back.
+ */
+static int ocfs2_find_subtree_root(struct ocfs2_path *left,
+				   struct ocfs2_path *right)
+{
+	int i = 0;
+
+	/* Check that the caller passed in two paths from the same tree. */
+	assert(path_root_blkno(left) == path_root_blkno(right));
+
+	do {
+		i++;
+
+		/* The caller didn't pass two adjacent paths. */
+ 		if (i > left->p_tree_depth)
+			assert(0);
+	} while (left->p_node[i].blkno == right->p_node[i].blkno);
+
+	return i - 1;
+}
+
+typedef errcode_t (path_insert_t)(void *, char *);
+
+/*
+ * Traverse a btree path in search of cpos, starting at root_el.
+ *
+ * This code can be called with a cpos larger than the tree, in which
+ * case it will return the rightmost path.
+ */
+static errcode_t __ocfs2_find_path(ocfs2_filesys *fs,
+				   struct ocfs2_extent_list *root_el,
+				   uint32_t cpos,
+				   path_insert_t *func,
+				   void *data)
+{
+	int i, ret = 0;
+	uint32_t range;
+	uint64_t blkno;
+	char *buf = NULL;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list *el;
+	struct ocfs2_extent_rec *rec;
+
+	el = root_el;
+	while (el->l_tree_depth) {
+		if (el->l_next_free_rec == 0) {
+			ret = OCFS2_ET_CORRUPT_EXTENT_BLOCK;
+			goto out;
+
+		}
+
+
+		for(i = 0; i < el->l_next_free_rec - 1; i++) {
+			rec = &el->l_recs[i];
+
+			/*
+			 * In the case that cpos is off the allocation
+			 * tree, this should just wind up returning the
+			 * rightmost record.
+			 */
+			range = rec->e_cpos +
+				ocfs2_rec_clusters(el->l_tree_depth, rec);
+			if (cpos >= rec->e_cpos && cpos < range)
+			    break;
+		}
+
+		blkno = el->l_recs[i].e_blkno;
+		if (blkno == 0) {
+			ret = OCFS2_ET_CORRUPT_EXTENT_BLOCK;
+			goto out;
+		}
+
+		ret = ocfs2_malloc_block(fs->fs_io, &buf);
 		if (ret)
 			return ret;
 
-		/* append_eb() put a new record here, insert on it.
-		 * If the new child isn't a leaf, this recursion
-		 * will do the append_eb() again, all the way down to
-		 * the leaf. */
-		rec = &el->l_recs[el->l_next_free_rec - 1];
-		ret = insert_extent_eb(ctxt, rec->e_blkno);
+		ret = ocfs2_read_extent_block(fs, blkno, buf);
 		if (ret)
-			return ret;
+			goto out;
+
+		eb = (struct ocfs2_extent_block *) buf;
+		el = &eb->h_list;
+
+		if (el->l_next_free_rec > el->l_count) {
+			ret = OCFS2_ET_CORRUPT_EXTENT_BLOCK;
+			goto out;
+		}
+
+		/* The user's callback tells us, through its return value,
+		 * how to handle the buf we allocated.
+		 *
+ 		 * 1) return '0':
+		 *    the function succeeds, and it will use the buf and
+		 *    take care of the buffer release.
+		 *
+ 		 * 2) return > 0:
+		 *    the function succeeds, and there is no need for buf,
+		 *    so we will release it.
+		 *
+		 * 3) return < 0:
+		 *    the function fails.
+		 */
+		if (func) {
+			ret = func(data, buf);
+
+			if (ret == 0) {
+				buf = NULL;
+				continue;
+			}
+			else if (ret < 0)
+				goto out;
+		}
+		ocfs2_free(&buf);
+		buf = NULL;
 	}
 
-	/* insert_extent_eb() doesn't update e_clusters so that
-	 * all updates are on the path up, not the path down.  Do the
-	 * update now. */
-	rec->e_clusters += ctxt->rec.e_clusters;
+out:
+	/* Catch any trailing buf that the loop didn't handle. */
+	if (buf)
+		ocfs2_free(&buf);
+
+	return ret;
+}
+
+/*
+ * Given an initialized path (that is, it has a valid root extent
+ * list), this function will traverse the btree in search of the path
+ * which would contain cpos.
+ *
+ * The path traveled is recorded in the path structure.
+ *
+ * Note that this will not do any comparisons on leaf node extent
+ * records, so it will work fine in the case that we just added a tree
+ * branch.
+ */
+struct find_path_data {
+	int index;
+	struct ocfs2_path *path;
+};
+
+static errcode_t find_path_ins(void *data, char *eb)
+{
+	struct find_path_data *fp = data;
+
+	ocfs2_path_insert_eb(fp->path, fp->index, eb);
+	fp->index++;
+
 	return 0;
 }
 
+static int ocfs2_find_path(ocfs2_filesys *fs, struct ocfs2_path *path,
+			   uint32_t cpos)
+{
+	struct find_path_data data;
+
+	data.index = 1;
+	data.path = path;
+	return __ocfs2_find_path(fs, path_root_el(path), cpos,
+				 find_path_ins, &data);
+}
+
 /*
- * Insert a new extent into this extent_block.  That means
- * reading the block, calling insert_extent_el() on the contained
- * extent list, and then writing out the updated block.
+ * Find the leaf block in the tree which would contain cpos. No
+ * checking of the actual leaf is done.
+ *
+ * This function doesn't handle non btree extent lists.
  */
-static errcode_t insert_extent_eb(struct insert_ctxt *ctxt,
-				  uint64_t eb_blkno)
+int ocfs2_find_leaf(ocfs2_filesys *fs, struct ocfs2_dinode *di,
+		    uint32_t cpos, char **leaf_buf)
 {
+	int ret;
+	char *buf = NULL;
+	struct ocfs2_path *path = NULL;
+	struct ocfs2_extent_list *el = &di->id2.i_list;
+
+	assert(el->l_tree_depth > 0);
+
+	path = ocfs2_new_inode_path(fs, di);
+	if (!path) {
+		ret = OCFS2_ET_NO_MEMORY;
+		goto out;
+	}
+
+	ret = ocfs2_find_path(fs, path, cpos);
+	if (ret)
+		goto out;
+
+	ret = ocfs2_malloc_block(fs->fs_io, &buf);
+	if (ret)
+		goto out;
+
+	memcpy(buf, path_leaf_buf(path), fs->fs_blocksize);
+	*leaf_buf = buf;
+out:
+	ocfs2_free_path(path);
+	return ret;
+}
+
+/*
+ * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
+ *
+ * Basically, we've moved stuff around at the bottom of the tree and
+ * we need to fix up the extent records above the changes to reflect
+ * the new changes.
+ *
+ * left_rec: the record on the left.
+ * left_child_el: is the child list pointed to by left_rec
+ * right_rec: the record to the right of left_rec
+ * right_child_el: is the child list pointed to by right_rec
+ *
+ * By definition, this only works on interior nodes.
+ */
+static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
+				    struct ocfs2_extent_list *left_child_el,
+				    struct ocfs2_extent_rec *right_rec,
+				    struct ocfs2_extent_list *right_child_el)
+{
+	uint32_t left_clusters, right_end;
+
+	/*
+	 * Interior nodes never have holes. Their cpos is the cpos of
+	 * the leftmost record in their child list. Their cluster
+	 * count covers the full theoretical range of their child list
+	 * - the range between their cpos and the cpos of the record
+	 * immediately to their right.
+	 */
+	left_clusters = right_child_el->l_recs[0].e_cpos;
+	left_clusters -= left_rec->e_cpos;
+	left_rec->e_int_clusters = left_clusters;
+
+	/*
+	 * Calculate the rightmost cluster count boundary before
+	 * moving cpos - we will need to adjust clusters after
+	 * updating e_cpos to keep the same highest cluster count.
+	 */
+	right_end = right_rec->e_cpos;
+	right_end += right_rec->e_int_clusters;
+
+	right_rec->e_cpos = left_rec->e_cpos;
+	right_rec->e_cpos += left_clusters;
+
+	right_end -= right_rec->e_cpos;
+	right_rec->e_int_clusters = right_end;
+}
+
+/*
+ * Adjust the adjacent root node records involved in a
+ * rotation. left_el_blkno is passed in as a key so that we can easily
+ * find its index in the root list.
+ */
+static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
+				      struct ocfs2_extent_list *left_el,
+				      struct ocfs2_extent_list *right_el,
+				      uint64_t left_el_blkno)
+{
+	int i;
+
+	assert(root_el->l_tree_depth > left_el->l_tree_depth);
+
+	for(i = 0; i < root_el->l_next_free_rec - 1; i++) {
+		if (root_el->l_recs[i].e_blkno == left_el_blkno)
+			break;
+	}
+
+	/*
+	 * The path walking code should have never returned a root and
+	 * two paths which are not adjacent.
+	 */
+	assert(i < (root_el->l_next_free_rec - 1));
+
+	ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
+				      &root_el->l_recs[i + 1], right_el);
+}
+
+/*
+ * We've changed a leaf block (in right_path) and need to reflect that
+ * change back up the subtree.
+ *
+ * This happens in multiple places:
+ *   - When we've moved an extent record from the left path leaf to the right
+ *     path leaf to make room for an empty extent in the left path leaf.
+ *   - When our insert into the right path leaf is at the leftmost edge
+ *     and requires an update of the path immediately to its left. This
+ *     can occur at the end of some types of rotation and appending inserts.
+ */
+static void ocfs2_complete_edge_insert(ocfs2_filesys *fs,
+				       struct ocfs2_path *left_path,
+				       struct ocfs2_path *right_path,
+				       int subtree_index)
+{
+	int i, idx;
+	uint64_t blkno;
+	struct ocfs2_extent_list *el, *left_el, *right_el;
+	struct ocfs2_extent_rec *left_rec, *right_rec;
+
+	/*
+	 * Update the counts and position values within all the
+	 * interior nodes to reflect the leaf rotation we just did.
+	 *
+	 * The root node is handled below the loop.
+	 *
+	 * We begin the loop with right_el and left_el pointing to the
+	 * leaf lists and work our way up.
+	 *
+	 * NOTE: within this loop, left_el and right_el always refer
+	 * to the *child* lists.
+	 */
+	left_el = path_leaf_el(left_path);
+	right_el = path_leaf_el(right_path);
+	for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
+
+		/*
+		 * One nice property of knowing that all of these
+		 * nodes are below the root is that we only deal with
+		 * the leftmost right node record and the rightmost
+		 * left node record.
+		 */
+		el = left_path->p_node[i].el;
+		idx = left_el->l_next_free_rec - 1;
+		left_rec = &el->l_recs[idx];
+
+		el = right_path->p_node[i].el;
+		right_rec = &el->l_recs[0];
+
+		ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
+					      right_el);
+
+		/*
+		 * Setup our list pointers now so that the current
+		 * parents become children in the next iteration.
+		 */
+		left_el = left_path->p_node[i].el;
+		right_el = right_path->p_node[i].el;
+	}
+
+	/*
+	 * At the root node, adjust the two adjacent records which
+	 * begin our path to the leaves.
+	 */
+
+	el = left_path->p_node[subtree_index].el;
+	left_el = left_path->p_node[subtree_index + 1].el;
+	right_el = right_path->p_node[subtree_index + 1].el;
+	blkno = left_path->p_node[subtree_index + 1].blkno;
+
+	ocfs2_adjust_root_records(el, left_el, right_el, blkno);
+
+	/* ocfs2_adjust_root_records only updates the extent block in the left
+	 * path, and right_path->p_node[subtree_index].buf refers to the same
+	 * extent block, so we must keep the two buffers' contents identical.
+	 */
+	memcpy(right_path->p_node[subtree_index].buf,
+	       left_path->p_node[subtree_index].buf, fs->fs_blocksize);
+}
+
+/* Rotate the subtree to right.
+ *
+ * Note: After successful rotation, the extent block will be flushed
+ * to disk accordingly.
+ */
+static errcode_t ocfs2_rotate_subtree_right(ocfs2_filesys *fs,
+					    struct ocfs2_path *left_path,
+					    struct ocfs2_path *right_path,
+					    int subtree_index)
+{
 	errcode_t ret;
-	char *buf;
+	int i;
+	char *right_leaf_eb;
+	char *left_leaf_eb = NULL;
+	struct ocfs2_extent_list *right_el, *left_el;
+	struct ocfs2_extent_rec move_rec;
 	struct ocfs2_extent_block *eb;
 
-	ret = ocfs2_malloc_block(ctxt->fs->fs_io, &buf);
+	left_leaf_eb = path_leaf_buf(left_path);
+	eb = (struct ocfs2_extent_block *)left_leaf_eb;
+	left_el = path_leaf_el(left_path);
+
+	if (left_el->l_next_free_rec != left_el->l_count)
+		return OCFS2_ET_CORRUPT_EXTENT_BLOCK;
+
+	/*
+	 * This extent block may already have an empty record, so we
+	 * return early if so.
+	 */
+	if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
+		return 0;
+
+	assert(left_path->p_node[subtree_index].blkno ==
+	       right_path->p_node[subtree_index].blkno);
+
+	right_leaf_eb = path_leaf_buf(right_path);
+	right_el = path_leaf_el(right_path);
+
+	ocfs2_create_empty_extent(right_el);
+
+	/* Do the copy now. */
+	i = left_el->l_next_free_rec - 1;
+	move_rec = left_el->l_recs[i];
+	right_el->l_recs[0] = move_rec;
+
+	/*
+	 * Clear out the record we just copied and shift everything
+	 * over, leaving an empty extent in the left leaf.
+	 *
+	 * We temporarily subtract from next_free_rec so that the
+	 * shift will lose the tail record (which is now defunct).
+	 */
+	left_el->l_next_free_rec -= 1;
+	ocfs2_shift_records_right(left_el);
+	memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
+	left_el->l_next_free_rec += 1;
+
+	ocfs2_complete_edge_insert(fs, left_path, right_path, subtree_index);
+
+	ret = ocfs2_sync_path_to_disk(fs, left_path, right_path, subtree_index);
+
+	return ret;
+}
+
+/*
+ * Given a full path, determine what cpos value would return us a path
+ * containing the leaf immediately to the left of the current one.
+ *
+ * Will return zero if the path passed in is already the leftmost path.
+ */
+static int ocfs2_find_cpos_for_left_leaf(struct ocfs2_path *path,
+					 uint32_t *cpos)
+{
+	int i, j, ret = 0;
+	uint64_t blkno;
+	struct ocfs2_extent_list *el;
+
+	assert(path->p_tree_depth > 0);
+
+	*cpos = 0;
+
+	blkno = path_leaf_blkno(path);
+
+	/* Start at the tree node just above the leaf and work our way up. */
+	i = path->p_tree_depth - 1;
+	while (i >= 0) {
+		el = path->p_node[i].el;
+
+		/* Find the extent record just before the one in our path. */
+		for(j = 0; j < el->l_next_free_rec; j++) {
+			if (el->l_recs[j].e_blkno == blkno) {
+				if (j == 0) {
+					if (i == 0) {
+						/*
+						 * We've determined that the
+						 * path specified is already
+						 * the leftmost one - return a
+						 * cpos of zero.
+						 */
+						goto out;
+					}
+					/*
+					 * The leftmost record points to our
+					 * leaf - we need to travel up the
+					 * tree one level.
+					 */
+					goto next_node;
+				}
+
+				*cpos = el->l_recs[j - 1].e_cpos;
+				*cpos = *cpos + ocfs2_rec_clusters(
+							el->l_tree_depth,
+							&el->l_recs[j - 1]);
+				*cpos = *cpos - 1;
+				goto out;
+			}
+		}
+
+		/*
+		 * If we got here, we never found a valid node where
+		 * the tree indicated one should be.
+		 */
+		ret = OCFS2_ET_CORRUPT_EXTENT_BLOCK;
+		goto out;
+
+next_node:
+		blkno = path->p_node[i].blkno;
+		i--;
+	}
+
+out:
+	return ret;
+}
+
+/*
+ * Trap the case where we're inserting into the theoretical range past
+ * the _actual_ left leaf range. Otherwise, we'll rotate a record
+ * whose cpos is less than ours into the right leaf.
+ *
+ * It's only necessary to look at the rightmost record of the left
+ * leaf because the logic that calls us should ensure that the
+ * theoretical ranges in the path components above the leaves are
+ * correct.
+ */
+static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
+						 uint32_t insert_cpos)
+{
+	struct ocfs2_extent_list *left_el;
+	struct ocfs2_extent_rec *rec;
+	int next_free;
+
+	left_el = path_leaf_el(left_path);
+	next_free = left_el->l_next_free_rec;
+	rec = &left_el->l_recs[next_free - 1];
+
+	if (insert_cpos > rec->e_cpos)
+		return 1;
+	return 0;
+}
+
+/*
+ * Rotate all the records in a btree right one record, starting at insert_cpos.
+ *
+ * The path to the rightmost leaf should be passed in.
+ *
+ * The array is assumed to be large enough to hold an entire path (tree depth).
+ *
+ * Upon successful return from this function:
+ *
+ * - The 'right_path' array will contain a path to the leaf block
+ *   whose range contains insert_cpos.
+ * - That leaf block will have a single empty extent in list index 0.
+ * - In the case that the rotation requires a post-insert update,
+ *   *ret_left_path will contain a valid path which can be passed to
+ *   ocfs2_insert_path().
+ */
+static int ocfs2_rotate_tree_right(ocfs2_filesys *fs,
+				   uint32_t insert_cpos,
+				   struct ocfs2_path *right_path,
+				   struct ocfs2_path **ret_left_path)
+{
+	int ret, start;
+	uint32_t cpos;
+	struct ocfs2_path *left_path = NULL;
+
+	*ret_left_path = NULL;
+
+	left_path = ocfs2_new_path(fs, path_root_buf(right_path),
+				   path_root_el(right_path));
+	if (!left_path) {
+		ret = OCFS2_ET_NO_MEMORY;
+		goto out;
+	}
+
+	ret = ocfs2_find_cpos_for_left_leaf(right_path, &cpos);
 	if (ret)
-		return ret;
+		goto out;
 
-	ret = ocfs2_read_extent_block(ctxt->fs, eb_blkno, buf);
-	if (!ret) {
-		eb = (struct ocfs2_extent_block *)buf;
-		ret = insert_extent_el(ctxt, &eb->h_list);
+	/*
+	 * What we want to do here is:
+	 *
+	 * 1) Start with the rightmost path.
+	 *
+	 * 2) Determine a path to the leaf block directly to the left
+         *    of that leaf.
+	 *
+	 * 3) Determine the 'subtree root' - the lowest level tree node
+	 *    which contains a path to both leaves.
+	 *
+	 * 4) Rotate the subtree.
+	 *
+	 * 5) Find the next subtree by considering the left path to be
+         *    the new right path.
+	 *
+	 * The check at the top of this while loop also accepts
+	 * insert_cpos == cpos because cpos is only a _theoretical_
+	 * value to get us the left path - insert_cpos might very well
+	 * be filling that hole.
+	 *
+	 * Stop at a cpos of '0' because we either started at the
+	 * leftmost branch (i.e., a tree with one branch and a
+	 * rotation inside of it), or we've gone as far as we can in
+	 * rotating subtrees.
+	 */
+	while (cpos && insert_cpos <= cpos) {
+
+		ret = ocfs2_find_path(fs, left_path, cpos);
+		if (ret)
+			goto out;
+
+		if (path_leaf_blkno(left_path) == path_leaf_blkno(right_path))
+			assert(0);
+
+		if (ocfs2_rotate_requires_path_adjustment(left_path,
+							  insert_cpos)) {
+			/*
+			 * We've rotated the tree as much as we
+			 * should. The rest is up to
+			 * ocfs2_insert_path() to complete, after the
+			 * record insertion. We indicate this
+			 * situation by returning the left path.
+			 *
+			 * The reason we don't adjust the records here
+			 * before the record insert is that an error
+			 * later might break the rule where a parent
+			 * record e_cpos will reflect the actual
+			 * e_cpos of the 1st nonempty record of the
+			 * child list.
+			 */
+			*ret_left_path = left_path;
+			goto out_ret_path;
+		}
+
+		start = ocfs2_find_subtree_root(left_path, right_path);
+
+		ret = ocfs2_rotate_subtree_right(fs, left_path, right_path,
+						 start);
+		if (ret)
+			goto out;
+
+		/*
+		 * There is no need to re-read the next right path
+		 * as we know that it'll be our current left
+		 * path. Optimize by copying values instead.
+		 */
+		ocfs2_mv_path(right_path, left_path);
+
+		ret = ocfs2_find_cpos_for_left_leaf(right_path, &cpos);
+		if (ret)
+			goto out;
 	}
 
-	if (!ret)
-		ret = ocfs2_write_extent_block(ctxt->fs, eb_blkno, buf);
+out:
+	ocfs2_free_path(left_path);
 
-	ocfs2_free(&buf);
+out_ret_path:
 	return ret;
 }
 
@@ -253,13 +1356,14 @@
  * copying all extent records from the dinode into the extent block,
  * and then pointing the dinode to the new extent_block.
  */
-static errcode_t shift_tree_depth(struct insert_ctxt *ctxt)
+static errcode_t shift_tree_depth(struct insert_ctxt *ctxt, char **new_eb)
 {
 	errcode_t ret;
-	char *buf;
+	char *buf = NULL;
 	uint64_t blkno;
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list *el;
+	uint32_t new_clusters;
 
 	el = &ctxt->di->id2.i_list;
 	if (el->l_next_free_rec != el->l_count)
@@ -283,66 +1387,838 @@
 	memcpy(eb->h_list.l_recs, el->l_recs,
 	       sizeof(struct ocfs2_extent_rec) * el->l_count);
 
+	new_clusters = ocfs2_sum_rightmost_rec(&eb->h_list);
+
 	el->l_tree_depth++;
 	memset(el->l_recs, 0,
 	       sizeof(struct ocfs2_extent_rec) * el->l_count);
 	el->l_recs[0].e_cpos = 0;
 	el->l_recs[0].e_blkno = blkno;
-	el->l_recs[0].e_clusters = ctxt->di->i_clusters;
+	el->l_recs[0].e_int_clusters = new_clusters;
 	el->l_next_free_rec = 1;
 
 	if (el->l_tree_depth == 1)
 		ctxt->di->i_last_eb_blk = blkno;
 
 	ret = ocfs2_write_extent_block(ctxt->fs, blkno, buf);
+	if (!ret)
+		*new_eb = buf;
 out:
-	ocfs2_free(&buf);
+	if (buf && !*new_eb)
+		ocfs2_free(&buf);
 
 	return ret;
 }
 
+static void ocfs2_figure_contig_type(ocfs2_filesys *fs,
+				     struct ocfs2_insert_type *insert,
+				     struct ocfs2_extent_list *el,
+				     struct ocfs2_extent_rec *insert_rec)
+{
+	int i;
+	enum ocfs2_contig_type contig_type = CONTIG_NONE;
+
+	assert(el->l_tree_depth == 0);
+
+	for(i = 0; i < el->l_next_free_rec; i++) {
+		contig_type = ocfs2_extent_contig(fs, &el->l_recs[i],
+						  insert_rec);
+		if (contig_type != CONTIG_NONE) {
+			insert->ins_contig_index = i;
+			break;
+		}
+	}
+	insert->ins_contig = contig_type;
+}
+
 /*
- * Takes a new contiguous extend, defined by (blkno, clusters), and
- * inserts it into the tree of dinode ino.  This follows the driver's
- * allocation pattern.  It tries to insert on the existing tree, and
- * if that tree is completely full, then shifts the tree depth.
+ * This should only be called against the rightmost leaf extent list.
+ *
+ * ocfs2_figure_appending_type() will figure out whether we'll have to
+ * insert at the tail of the rightmost leaf.
+ *
+ * This should also work against the dinode list for trees with 0
+ * depth. If we consider the dinode list to be the rightmost leaf node
+ * then the logic here makes sense.
  */
-errcode_t ocfs2_insert_extent(ocfs2_filesys *fs, uint64_t ino,
+static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
+					struct ocfs2_extent_list *el,
+					struct ocfs2_extent_rec *insert_rec)
+{
+	int i;
+	uint32_t cpos = insert_rec->e_cpos;
+	struct ocfs2_extent_rec *rec;
+
+	insert->ins_appending = APPEND_NONE;
+
+	assert(el->l_tree_depth == 0);
+
+	if (!el->l_next_free_rec)
+		goto set_tail_append;
+
+	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
+		/* Were all records empty? */
+		if (el->l_next_free_rec == 1)
+			goto set_tail_append;
+	}
+
+	i = el->l_next_free_rec - 1;
+	rec = &el->l_recs[i];
+
+	if (cpos >= (rec->e_cpos + rec->e_leaf_clusters))
+		goto set_tail_append;
+
+	return;
+
+set_tail_append:
+	insert->ins_appending = APPEND_TAIL;
+}
+
+/*
+ * Helper function called at the beginning of an insert.
+ *
+ * This computes a few things that are commonly used in the process of
+ * inserting into the btree:
+ *   - Whether the new extent is contiguous with an existing one.
+ *   - The current tree depth.
+ *   - Whether the insert is an appending one.
+ *   - The total # of free records in the tree.
+ *
+ * All of the information is stored on the ocfs2_insert_type
+ * structure.
+ */
+static int ocfs2_figure_insert_type(struct insert_ctxt *ctxt,
+				    char **last_eb_buf,
+				    struct ocfs2_insert_type *insert)
+{
+	int ret;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list *el;
+	struct ocfs2_dinode *di = ctxt->di;
+	struct ocfs2_extent_rec *insert_rec = &ctxt->rec;
+	ocfs2_filesys *fs = ctxt->fs;
+	struct ocfs2_path *path = NULL;
+	char *buf = NULL;
+
+	el = &di->id2.i_list;
+	insert->ins_tree_depth = el->l_tree_depth;
+
+	if (el->l_tree_depth) {
+		/*
+		 * If we have tree depth, we read in the
+		 * rightmost extent block ahead of time as
+		 * ocfs2_figure_insert_type() and ocfs2_add_branch()
+		 * may want it later.
+		 */
+		ret = ocfs2_malloc_block(fs->fs_io, &buf);
+		if (ret)
+			return ret;
+
+		ret = ocfs2_read_extent_block(fs, di->i_last_eb_blk, buf);
+		if (ret)
+			goto out;
+
+		eb = (struct ocfs2_extent_block *) buf;
+		el = &eb->h_list;
+	}
+	/*
+	 * Unless we have a contiguous insert, we'll need to know if
+	 * there is room left in our allocation tree for another
+	 * extent record.
+	 *
+	 * XXX: This test is simplistic, we can search for empty
+	 * extent records too.
+	 */
+	insert->ins_free_records = el->l_count - el->l_next_free_rec;
+
+	if (!insert->ins_tree_depth) {
+		insert->ins_free_records = el->l_count - el->l_next_free_rec;
+		ocfs2_figure_contig_type(fs, insert, el, insert_rec);
+		ocfs2_figure_appending_type(insert, el, insert_rec);
+		return 0;
+	}
+
+	path = ocfs2_new_inode_path(fs, di);
+	if (!path) {
+		ret = OCFS2_ET_NO_MEMORY;
+		goto out;
+	}
+	/*
+	 * In the case that we're inserting past what the tree
+	 * currently accounts for, ocfs2_find_path() will return for
+	 * us the rightmost tree path. This is accounted for below in
+	 * the appending code.
+	 */
+	ret = ocfs2_find_path(fs, path, insert_rec->e_cpos);
+	if (ret)
+		goto out;
+
+	el = path_leaf_el(path);
+
+	/*
+	 * Now that we have the path, there's two things we want to determine:
+	 * 1) Contiguousness (also set contig_index if this is so)
+	 *
+	 * 2) Are we doing an append? We can trivially break this up
+         *     into two types of appends: simple record append, or a
+         *     rotate inside the tail leaf.
+	 */
+	ocfs2_figure_contig_type(fs, insert, el, insert_rec);
+
+	/*
+	 * The insert code isn't quite ready to deal with all cases of
+	 * left contiguousness. Specifically, if it's an insert into
+	 * the 1st record in a leaf, it will require the adjustment of
+	 * e_clusters on the last record of the path directly to its
+	 * left. For now, just catch that case and fool the layers
+	 * above us. This works just fine for tree_depth == 0, which
+	 * is why we allow that above.
+	 */
+	if (insert->ins_contig == CONTIG_LEFT &&
+	    insert->ins_contig_index == 0)
+		insert->ins_contig = CONTIG_NONE;
+
+	/*
+	 * Ok, so we can simply compare against last_eb to figure out
+	 * whether the path doesn't exist. This will only happen in
+	 * the case that we're doing a tail append, so maybe we can
+	 * take advantage of that information somehow.
+	 */
+	if (di->i_last_eb_blk == path_leaf_blkno(path)) {
+		/*
+		 * Ok, ocfs2_find_path() returned us the rightmost
+		 * tree path. This might be an appending insert. There are
+		 * two cases:
+		 *    1) We're doing a true append at the tail:
+		 *	-This might even be off the end of the leaf
+		 *    2) We're "appending" by rotating in the tail
+		 */
+		ocfs2_figure_appending_type(insert, el, insert_rec);
+	}
+
+out:
+	ocfs2_free_path(path);
+
+	if (ret == 0)
+		*last_eb_buf = buf;
+	else if (buf)
+		ocfs2_free(&buf);
+	return ret;
+}
+
+/*
+ * Do the final bits of extent record insertion at the target leaf
+ * list. If this leaf is part of an allocation tree, it is assumed
+ * that the tree above has been prepared.
+ */
+static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
+				 struct ocfs2_extent_list *el,
+				 struct ocfs2_insert_type *insert)
+{
+	int i = insert->ins_contig_index;
+	unsigned int range;
+	struct ocfs2_extent_rec *rec;
+
+	assert(el->l_tree_depth == 0);
+
+	/*
+	 * Contiguous insert - either left or right.
+	 */
+	if (insert->ins_contig != CONTIG_NONE) {
+		rec = &el->l_recs[i];
+		if (insert->ins_contig == CONTIG_LEFT) {
+			rec->e_blkno = insert_rec->e_blkno;
+			rec->e_cpos = insert_rec->e_cpos;
+		}
+		rec->e_leaf_clusters += insert_rec->e_leaf_clusters;
+		return;
+	}
+
+	/*
+	 * Handle insert into an empty leaf.
+	 */
+	if (el->l_next_free_rec == 0 ||
+	    (el->l_next_free_rec == 1 &&
+	     ocfs2_is_empty_extent(&el->l_recs[0]))) {
+		el->l_recs[0] = *insert_rec;
+		el->l_next_free_rec = 1;
+		return;
+	}
+
+	/*
+	 * Appending insert.
+	 */
+	if (insert->ins_appending == APPEND_TAIL) {
+		i = el->l_next_free_rec - 1;
+		rec = &el->l_recs[i];
+		range = rec->e_cpos + rec->e_leaf_clusters;
+		assert(insert_rec->e_cpos >= range);
+
+		i++;
+		el->l_recs[i] = *insert_rec;
+		el->l_next_free_rec += 1;
+		return;
+	}
+
+	/*
+	 * Ok, we have to rotate.
+	 *
+	 * At this point, it is safe to assume that inserting into an
+	 * empty leaf and appending to a leaf have both been handled
+	 * above.
+	 *
+	 * This leaf needs to have space, either by the empty 1st
+	 * extent record, or by virtue of an l_next_free_rec < l_count.
+	 */
+	ocfs2_rotate_leaf(el, insert_rec);
+}
+
+static int ocfs2_append_rec_to_path(ocfs2_filesys *fs,
+				    struct ocfs2_extent_rec *insert_rec,
+				    struct ocfs2_path *right_path,
+				    struct ocfs2_path **ret_left_path)
+{
+	int ret, i, next_free;
+	struct ocfs2_extent_list *el;
+	struct ocfs2_path *left_path = NULL;
+
+	*ret_left_path = NULL;
+
+	/*
+	 * This shouldn't happen for non-trees. The extent rec cluster
+	 * count manipulation below only works for interior nodes.
+	 */
+	assert(right_path->p_tree_depth > 0);
+
+	/*
+	 * If our appending insert is at the leftmost edge of a leaf,
+	 * then we might need to update the rightmost records of the
+	 * neighboring path.
+	 */
+
+	el = path_leaf_el(right_path);
+	next_free = el->l_next_free_rec;
+	if (next_free == 0 ||
+	    (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
+		uint32_t left_cpos;
+
+		ret = ocfs2_find_cpos_for_left_leaf(right_path, &left_cpos);
+		if (ret)
+			goto out;
+		/*
+		 * No need to worry if the append is already in the
+		 * leftmost leaf.
+		 */
+		if (left_cpos) {
+			left_path = ocfs2_new_path(fs,
+						   path_root_buf(right_path),
+						   path_root_el(right_path));
+			if (!left_path) {
+				ret = OCFS2_ET_NO_MEMORY;
+				goto out;
+			}
+
+			ret = ocfs2_find_path(fs, left_path, left_cpos);
+			if (ret)
+				goto out;
+		}
+	}
+
+	el = path_root_el(right_path);
+	i = 0;
+	while (1) {
+		struct ocfs2_extent_rec *rec;
+
+		next_free = el->l_next_free_rec;
+		if (next_free == 0) {
+			ret = OCFS2_ET_CORRUPT_EXTENT_BLOCK;
+			goto out;
+		}
+
+		rec = &el->l_recs[next_free - 1];
+		rec->e_int_clusters = insert_rec->e_cpos;
+		rec->e_int_clusters += insert_rec->e_leaf_clusters;
+		rec->e_int_clusters -= rec->e_cpos;
+
+		/*
+		 * Since we have changed the extent block in the right path,
+		 * we have to keep them the same in the left path we found
+		 * above.
+		 */
+		if (left_path && left_path->p_node[i].blkno ==
+					right_path->p_node[i].blkno)
+			memcpy(left_path->p_node[i].buf,
+			       right_path->p_node[i].buf,
+			       fs->fs_blocksize);
+		/* Don't touch the leaf node */
+		if (++i >= right_path->p_tree_depth)
+			break;
+
+		el = right_path->p_node[i].el;
+	}
+
+	*ret_left_path = left_path;
+	ret = 0;
+out:
+	if (ret)
+		ocfs2_free_path(left_path);
+	return ret;
+}
+
+/*
+ * This function only does inserts on an allocation b-tree. For dinode
+ * lists, ocfs2_insert_at_leaf() is called directly.
+ *
+ * right_path is the path we want to do the actual insert
+ * in. left_path should only be passed in if we need to update that
+ * portion of the tree after an edge insert.
+ */
+static int ocfs2_insert_path(struct insert_ctxt* ctxt,
+			     struct ocfs2_path *left_path,
+			     struct ocfs2_path *right_path,
+			     struct ocfs2_extent_rec *insert_rec,
+			     struct ocfs2_insert_type *insert)
+{
+	int ret, subtree_index;
+	struct ocfs2_extent_list *el;
+
+	el = path_leaf_el(right_path);
+
+	ocfs2_insert_at_leaf(insert_rec, el, insert);
+
+	if (left_path) {
+		/*
+		 * The rotate code has indicated that we need to fix
+		 * up portions of the tree after the insert.
+		 */
+		subtree_index = ocfs2_find_subtree_root(left_path, right_path);
+		ocfs2_complete_edge_insert(ctxt->fs, left_path,
+				        right_path, subtree_index);
+	} else
+		subtree_index = 0;
+
+	ret = ocfs2_sync_path_to_disk(ctxt->fs, left_path,
+				      right_path, subtree_index);
+	if (ret)
+		goto out;
+
+	ret = 0;
+out:
+	return ret;
+}
+
+static int ocfs2_do_insert_extent(struct insert_ctxt* ctxt,
+				  struct ocfs2_insert_type *type)
+{
+	int ret, rotate = 0;
+	uint32_t cpos;
+	struct ocfs2_path *right_path = NULL;
+	struct ocfs2_path *left_path = NULL;
+	struct ocfs2_extent_rec *insert_rec = &ctxt->rec;
+	ocfs2_filesys *fs = ctxt->fs;
+	struct ocfs2_dinode *di = ctxt->di;
+	struct ocfs2_extent_list *el = &di->id2.i_list;
+
+	if (el->l_tree_depth == 0) {
+		ocfs2_insert_at_leaf(insert_rec, el, type);
+		goto out_update_clusters;
+	}
+
+	right_path = ocfs2_new_inode_path(fs, di);
+	if (!right_path) {
+		ret = OCFS2_ET_NO_MEMORY;
+		goto out;
+	}
+
+	/*
+	 * Determine the path to start with. Rotations need the
+	 * rightmost path, everything else can go directly to the
+	 * target leaf.
+	 */
+	cpos = insert_rec->e_cpos;
+	if (type->ins_appending == APPEND_NONE &&
+	    type->ins_contig == CONTIG_NONE) {
+		rotate = 1;
+		cpos = UINT_MAX;
+	}
+
+	ret = ocfs2_find_path(fs, right_path, cpos);
+	if (ret)
+		goto out;
+
+	/*
+	 * Rotations and appends need special treatment - they modify
+	 * parts of the tree above them.
+	 *
+	 * Both might pass back a path immediately to the left of the
+	 * one being inserted to. This will cause
+	 * ocfs2_insert_path() to modify the rightmost records of
+	 * left_path to account for an edge insert.
+	 *
+	 * XXX: When modifying this code, keep in mind that an insert
+	 * can wind up skipping both of these two special cases...
+	 */
+
+	if (rotate) {
+		ret = ocfs2_rotate_tree_right(fs, insert_rec->e_cpos,
+					      right_path, &left_path);
+		if (ret)
+			goto out;
+	} else if (type->ins_appending == APPEND_TAIL
+		   && type->ins_contig != CONTIG_LEFT) {
+		ret = ocfs2_append_rec_to_path(fs, insert_rec,
+					       right_path, &left_path);
+		if (ret)
+			goto out;
+ 	}
+
+	ret = ocfs2_insert_path(ctxt, left_path, right_path, insert_rec, type);
+	if (ret)
+		goto out;
+
+out_update_clusters:
+	di->i_clusters += insert_rec->e_leaf_clusters;
+	ret = 0;
+
+out:
+	ocfs2_free_path(left_path);
+	ocfs2_free_path(right_path);
+
+	return ret;
+}
+
+struct duplicate_ctxt {
+	struct ocfs2_dinode *di;
+	uint64_t next_leaf_blk;
+};
+
+static errcode_t duplicate_extent_block(ocfs2_filesys *fs,
+					struct ocfs2_extent_list *old_el,
+					struct ocfs2_extent_list *new_el,
+					struct duplicate_ctxt *ctxt)
+{
+	int i;
+	errcode_t ret;
+	uint64_t blkno, new_blkno;
+	struct ocfs2_extent_rec *rec = NULL;
+	char *eb_buf = NULL, *new_eb_buf = NULL;
+	struct ocfs2_extent_block *eb = NULL;
+	struct ocfs2_extent_list *child_old_el = NULL, *child_new_el = NULL;
+
+	assert (old_el->l_tree_depth > 0);
+
+	/* empty the whole extent list at first. */
+	*new_el = *old_el;
+	new_el->l_next_free_rec = 0;
+	memset(new_el->l_recs, 0,
+	       sizeof(struct ocfs2_extent_rec) * new_el->l_count);
+
+	if (old_el->l_next_free_rec == 0) {
+		/* XXX:
+		 * We have a tree depth > 0 and no extent record in it,
+		 * should it be a corrupted block?
+		 */
+		ret = OCFS2_ET_CORRUPT_EXTENT_BLOCK;
+		goto bail;
+	}
+
+	ret = ocfs2_malloc_block(fs->fs_io, &eb_buf);
+	if (ret)
+		goto bail;
+	ret = ocfs2_malloc_block(fs->fs_io, &new_eb_buf);
+	if (ret)
+		goto bail;
+
+	/* We iterate the extent list from the last record backwards so that
+	 * each new leaf can be recorded as next_leaf_blk of the previous one.
+	 */
+	for (i = old_el->l_next_free_rec - 1; i >= 0; i--) {
+		rec = &old_el->l_recs[i];
+
+		if (!ocfs2_rec_clusters(old_el->l_tree_depth, rec))
+			continue;
+
+		blkno = rec->e_blkno;
+		ret = ocfs2_read_extent_block(fs, blkno, eb_buf);
+		if (ret)
+			goto bail;
+
+		/* First make the new_buf the same as the old buf. */
+		memcpy(new_eb_buf, eb_buf, fs->fs_blocksize);
+
+		eb = (struct ocfs2_extent_block *)eb_buf;
+		child_old_el = &eb->h_list;
+		eb = (struct ocfs2_extent_block *)new_eb_buf;
+		child_new_el = &eb->h_list;
+
+		if (child_old_el->l_tree_depth > 0) {
+			/* The extent record in our list still has child extent
+			 * blocks, so we have to recurse into it.
+			 */
+			ret = duplicate_extent_block(fs,
+						     child_old_el,
+						     child_new_el,
+						     ctxt);
+			if (ret)
+				goto bail;
+		}
+
+		/* now we allocate a new extent block and save it. */
+		ret = ocfs2_new_extent_block(fs, &new_blkno);
+		if (ret)
+			goto bail;
+
+		eb = (struct ocfs2_extent_block *)new_eb_buf;
+		eb->h_blkno = new_blkno;
+		if (child_old_el->l_tree_depth == 0) {
+			/*
+			 * This is the leaf blkno, we have to set its
+			 * h_next_leaf_blk and then record itself for
+			 * future use.
+			 */
+			eb->h_next_leaf_blk = ctxt->next_leaf_blk;
+			ctxt->next_leaf_blk = new_blkno;
+		}
+
+		ret = ocfs2_write_extent_block(fs, new_blkno, new_eb_buf);
+		if (ret)
+			goto bail;
+
+		memcpy(&new_el->l_recs[i], rec, sizeof(struct ocfs2_extent_rec));
+		new_el->l_recs[i].e_blkno = new_blkno;
+
+		eb = (struct ocfs2_extent_block *)new_eb_buf;
+		/* set the new i_last_eb_blk in the new dinode. */
+		if (ctxt->di->i_last_eb_blk == blkno)
+			ctxt->di->i_last_eb_blk = new_blkno;
+	}
+
+	new_el->l_next_free_rec = old_el->l_next_free_rec;
+	ret = 0;
+
+bail:
+	if (eb_buf)
+		ocfs2_free(&eb_buf);
+	if (new_eb_buf)
+		ocfs2_free(&new_eb_buf);
+	/* Free all the extent blocks we allocated. */
+	if (ret) {
+		for (i = 0; i < old_el->l_next_free_rec; i++) {
+			rec = &new_el->l_recs[i];
+			if (rec->e_blkno)
+				ocfs2_delete_extent_block(fs, rec->e_blkno);
+		}
+	}
+
+	return ret;
+}
+
+static errcode_t duplicate_extent_block_dinode(ocfs2_filesys *fs,
+					       char *old_buf, char *new_buf)
+{
+	errcode_t ret = 0;
+	struct ocfs2_dinode *old_di = NULL, *new_di = NULL;
+	struct ocfs2_extent_list *old_el = NULL, *new_el = NULL;
+	struct duplicate_ctxt ctxt;
+
+	old_di = (struct ocfs2_dinode *)old_buf;
+	old_el = &old_di->id2.i_list;
+	new_di = (struct ocfs2_dinode *)new_buf;
+	new_el = &new_di->id2.i_list;
+
+	assert(old_el->l_tree_depth > 0);
+
+	/* empty the whole extent list at first. */
+	*new_el = *old_el;
+	memset(new_el->l_recs, 0,
+	       sizeof(struct ocfs2_extent_rec) * new_el->l_count);
+	new_el->l_next_free_rec = 0;
+
+	memset(&ctxt, 0, sizeof(ctxt));
+	ctxt.di = new_di;
+	ctxt.next_leaf_blk = 0;
+	ret = duplicate_extent_block(fs, old_el, new_el, &ctxt);
+
+	return ret;
+}
+
+static void free_duplicated_extent_block(ocfs2_filesys *fs,
+					struct ocfs2_extent_list *el)
+{
+	int i;
+	errcode_t ret;
+	char *buf = NULL;
+	struct ocfs2_extent_rec *rec;
+	struct ocfs2_extent_list *child_el;
+	struct ocfs2_extent_block *eb;
+
+	assert(el->l_tree_depth > 0);
+
+	ret = ocfs2_malloc_block(fs->fs_io, &buf);
+	if (ret)
+		return;
+
+	for (i = 0; i < el->l_next_free_rec; i ++) {
+		rec = &el->l_recs[i];
+
+		if (!ocfs2_rec_clusters(el->l_tree_depth, rec))
+			continue;
+
+		ret = ocfs2_read_extent_block(fs, rec->e_blkno, buf);
+		if (ret)
+			continue;
+
+		eb = (struct ocfs2_extent_block *)buf;
+		child_el = &eb->h_list;
+		if (child_el->l_tree_depth > 0)
+			free_duplicated_extent_block(fs, child_el);
+
+		ocfs2_delete_extent_block(fs, rec->e_blkno);
+	}
+
+	if(buf)
+		ocfs2_free(&buf);
+}
+
+static void free_duplicated_extent_block_dinode(ocfs2_filesys *fs,
+						char *di_buf)
+{
+	struct ocfs2_dinode *di = NULL;
+	struct ocfs2_extent_list *el = NULL;
+
+	di = (struct ocfs2_dinode *)di_buf;
+	el = &di->id2.i_list;
+
+	assert(el->l_tree_depth > 0);
+
+	free_duplicated_extent_block(fs, el);
+}
+
+/*
+ * Insert an extent into an inode btree.
+ */
+errcode_t ocfs2_insert_extent(ocfs2_filesys *fs, uint64_t ino, uint32_t cpos,
 			      uint64_t c_blkno, uint32_t clusters)
 {
 	errcode_t ret;
+	int shift;
 	struct insert_ctxt ctxt;
-	char *buf;
+	struct ocfs2_insert_type insert = {0, };
+	char *di_buf = NULL, *last_eb = NULL, *eb_buf = NULL;
+	char *backup_buf = NULL;
 
-	ret = ocfs2_malloc_block(fs->fs_io, &buf);
+	ret = ocfs2_malloc_block(fs->fs_io, &di_buf);
 	if (ret)
 		return ret;
 
 	ctxt.fs = fs;
-	ctxt.di = (struct ocfs2_dinode *)buf;
+	ctxt.di = (struct ocfs2_dinode *)di_buf;
 
-	ret = ocfs2_read_inode(fs, ino, buf);
+	ret = ocfs2_read_inode(fs, ino, di_buf);
 	if (ret)
-		goto out_free_buf;
+		goto bail;
 
-	ctxt.rec.e_cpos = ctxt.di->i_clusters;
+	/* In order to keep the block writes ordered and avoid
+	 * corrupting the inode, we duplicate the extent blocks
+	 * here and do the insertion in the duplicated ones.
+	 *
+	 * Note: we only do this when the file has extent blocks.
+	 * If the duplication fails, we fall back to the normal
+	 * insert process.
+	 */
+	if (ctxt.di->id2.i_list.l_tree_depth) {
+		ret = ocfs2_malloc_block(fs->fs_io, &backup_buf);
+		if (ret)
+			goto bail;
+
+		memcpy(backup_buf, di_buf, fs->fs_blocksize);
+
+		/* Duplicate the extent blocks. If this succeeds, di_buf
+		 * will point to the newly allocated extent blocks, and
+		 * the following insertion will happen in the new ones.
+		 */
+		ret = duplicate_extent_block_dinode(fs, backup_buf, di_buf);
+		if (ret) {
+			memcpy(di_buf, backup_buf,fs->fs_blocksize);
+			ocfs2_free(&backup_buf);
+			backup_buf = NULL;
+		}
+	}
+
+	memset(&ctxt.rec, 0, sizeof(struct ocfs2_extent_rec));
+	ctxt.rec.e_cpos = cpos;
 	ctxt.rec.e_blkno = c_blkno;
-	ctxt.rec.e_clusters = clusters;
-	ret = insert_extent_el(&ctxt, &ctxt.di->id2.i_list);
-	if (ret == OCFS2_ET_NO_SPACE) {
-		ret = shift_tree_depth(&ctxt);
-		if (!ret)
-			ret = insert_extent_el(&ctxt,
-					       &ctxt.di->id2.i_list);
+	ctxt.rec.e_leaf_clusters = clusters;
+
+	ret = ocfs2_figure_insert_type(&ctxt,&last_eb, &insert);
+	if (ret)
+		goto bail;
+
+	/*
+	 * Avoid growing the tree unless we're out of records and the
+	 * insert type requires one.
+	 */
+	if (insert.ins_contig != CONTIG_NONE || insert.ins_free_records)
+		goto out_add;
+
+	shift = ocfs2_find_branch_target(fs, ctxt.di, &eb_buf);
+	if (shift < 0) {
+		ret = shift;
+		goto bail;
 	}
-	if (!ret) {
-		ctxt.di->i_clusters += clusters;
-		ret = ocfs2_write_inode(fs, ino, buf);
+
+	/* We traveled all the way to the bottom of the allocation tree
+	 * and didn't find room for any more extents - we need to add
+	 * another tree level */
+	if (shift) {
+
+		/* shift_tree_depth will return us a buffer with
+		 * the new extent block (so we can pass that to
+		 * ocfs2_add_branch). */
+		ret = shift_tree_depth(&ctxt, &eb_buf);
+		if (ret)
+			goto bail;
+
+		insert.ins_tree_depth++;
+
+		if (insert.ins_tree_depth == 1)
+			goto out_add;
 	}
 
-out_free_buf:
-	ocfs2_free(&buf);
+	/* call ocfs2_add_branch to add the final part of the tree with
+	 * the new data. */
+	ret = ocfs2_add_branch(ctxt.fs, ctxt.di, eb_buf, last_eb);
+	if (ret)
+		goto bail;
 
+out_add:
+	/* Finally, we can add clusters. This might rotate the tree for us. */
+	ret = ocfs2_do_insert_extent(&ctxt, &insert);
+	if (ret)
+		goto bail;
+
+	ret = ocfs2_write_inode(fs, ino, di_buf);
+
+bail:
+	if (backup_buf) {
+		/* We duplicated the extent blocks before the insertion,
+		 * so if it succeeded we free the old ones, and if it failed
+		 * we free the duplicated ones.
+		 */
+		if (ret)
+			free_duplicated_extent_block_dinode(fs, di_buf);
+		else
+			free_duplicated_extent_block_dinode(fs, backup_buf);
+		ocfs2_free(&backup_buf);
+	}
+
+	if (eb_buf)
+		ocfs2_free(&eb_buf);
+	if (last_eb)
+		ocfs2_free(&last_eb);
+	if (di_buf)
+		ocfs2_free(&di_buf);
+
 	return ret;
 }
 
@@ -350,34 +2226,84 @@
 				  uint32_t new_clusters)
 {
 	errcode_t ret = 0;
-	uint32_t n_clusters = 0;
-	uint64_t blkno;
+	uint32_t n_clusters = 0, cpos;
+	uint64_t blkno, file_size;
+	char *buf = NULL;
+	struct ocfs2_dinode* di = NULL;
 
 	if (!(fs->fs_flags & OCFS2_FLAG_RW))
 		return OCFS2_ET_RO_FILESYS;
 
+	ret = ocfs2_malloc_block(fs->fs_io, &buf);
+	if (ret)
+		goto out_free_buf;
+
+	ret = ocfs2_read_inode(fs, ino, buf);
+	if (ret)
+		goto out_free_buf;
+
+	di = (struct ocfs2_dinode *)buf;
+
+	file_size = di->i_size;
+	cpos = (file_size + fs->fs_clustersize - 1) / fs->fs_clustersize;
 	while (new_clusters) {
 		n_clusters = 1;
 		ret = ocfs2_new_clusters(fs, 1, new_clusters, &blkno,
-			&n_clusters);
+					 &n_clusters);
 		if (ret)
 			break;
 
-	 	ret = ocfs2_insert_extent(fs, ino, blkno, n_clusters);
+	 	ret = ocfs2_insert_extent(fs, ino, cpos, blkno, n_clusters);
 		if (ret) {
 			/* XXX: We don't wan't to overwrite the error
 			 * from insert_extent().  But we probably need
 			 * to BE LOUDLY UPSET. */
 			ocfs2_free_clusters(fs, n_clusters, blkno);
-			break;
+			goto out_free_buf;
 		}
 
 	 	new_clusters -= n_clusters;
+		cpos += n_clusters;
 	}
 
+out_free_buf:
+	if (buf)
+		ocfs2_free(&buf);
 	return ret;
 }
 
+errcode_t ocfs2_extend_file(ocfs2_filesys *fs, uint64_t ino, uint64_t new_size)
+{
+	errcode_t ret = 0;
+	char *buf = NULL;
+	struct ocfs2_dinode* di = NULL;
+
+	if (!(fs->fs_flags & OCFS2_FLAG_RW))
+		return OCFS2_ET_RO_FILESYS;
+	ret = ocfs2_malloc_block(fs->fs_io, &buf);
+	if (ret)
+		return ret;
+
+	ret = ocfs2_read_inode(fs, ino, buf);
+	if (ret)
+		goto out_free_buf;
+
+	di = (struct ocfs2_dinode *)buf;
+	if (di->i_size >= new_size) {
+		ret = EINVAL;
+		goto out_free_buf;
+	}
+
+	di->i_size = new_size;
+
+	ret = ocfs2_write_inode(fs, ino, buf);
+
+out_free_buf:
+	if (buf)
+		ocfs2_free(&buf);
+	return ret;
+}
+
 #ifdef DEBUG_EXE
 #include <stdio.h>
 #include <stdlib.h>

Modified: trunk/libocfs2/extent_map.c
===================================================================
--- trunk/libocfs2/extent_map.c	2007-08-24 23:30:25 UTC (rev 1405)
+++ trunk/libocfs2/extent_map.c	2007-08-24 23:36:10 UTC (rev 1406)
@@ -27,674 +27,245 @@
 
 #include <string.h>
 #include <inttypes.h>
+#include <assert.h>
 
 #include "ocfs2.h"
 
 #include "extent_map.h"
 
-struct extent_map_context {
-	ocfs2_cached_inode *cinode;
-	errcode_t errcode;
-};
-
 /*
- * Find an entry in the tree that intersects the region passed in.
- * Note that this will find straddled intervals, it is up to the
- * callers to enforce any boundary conditions.
- *
- * The rb_node garbage lets insertion share the search.  Trivial
- * callers pass NULL.
+ * Return the 1st index within el which contains an extent start
+ * larger than v_cluster.
  */
-static ocfs2_extent_map_entry *
-ocfs2_extent_map_lookup(ocfs2_extent_map *em,
-			uint32_t cpos, uint32_t clusters,
-			struct rb_node ***ret_p,
-			struct rb_node **ret_parent)
+static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el,
+				       uint32_t v_cluster)
 {
-	struct rb_node **p = &em->em_extents.rb_node;
-	struct rb_node *parent = NULL;
-	ocfs2_extent_map_entry *ent = NULL;
-
-	while (*p)
-	{
-		parent = *p;
-		ent = rb_entry(parent, ocfs2_extent_map_entry, e_node); if ((cpos + clusters) <= ent->e_rec.e_cpos) {
-			p = &(*p)->rb_left;
-			ent = NULL;
-		} else if (cpos >= (ent->e_rec.e_cpos +
-				    ent->e_rec.e_clusters)) {
-			p = &(*p)->rb_right;
-			ent = NULL;
-		} else
-			break;
-	}
-
-	if (ret_p != NULL)
-		*ret_p = p;
-	if (ret_parent != NULL)
-		*ret_parent = parent;
-	return ent;
-}
-
-static errcode_t ocfs2_extent_map_find_leaf(ocfs2_cached_inode *cinode,
-					    uint32_t cpos,
-					    uint32_t clusters,
-					    struct ocfs2_extent_list *el)
-{
-	errcode_t ret;
 	int i;
-	char *eb_buf = NULL;
-	uint64_t blkno;
-	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_rec *rec;
 
-	if (el->l_tree_depth) {
-		ret = ocfs2_malloc_block(cinode->ci_fs->fs_io, &eb_buf);
-		if (ret)
-			return ret;
-	}
+	for(i = 0; i < el->l_next_free_rec; i++) {
+		rec = &el->l_recs[i];
 
-	while (el->l_tree_depth)
-	{
-		blkno = 0;
-		for (i = 0; i < el->l_next_free_rec; i++) {
-			rec = &el->l_recs[i];
-
-			ret = OCFS2_ET_CORRUPT_EXTENT_BLOCK;
-			if (rec->e_cpos >=
-			    cinode->ci_inode->i_clusters)
-				goto out_free;
-
-			if ((rec->e_cpos + rec->e_clusters) <= cpos) {
-				ret = ocfs2_extent_map_insert(cinode,
-							      rec,
-							      el->l_tree_depth);
-				if (ret)
-					goto out_free;
-				continue;
-			}
-			if ((cpos + clusters) <= rec->e_cpos) {
-				ret = ocfs2_extent_map_insert(cinode,
-							      rec,
-							      el->l_tree_depth);
-				if (ret)
-					goto out_free;
-				continue;
-			}
-			
-			/* Check to see if we're stradling */
-			ret = OCFS2_ET_INVALID_EXTENT_LOOKUP;
-			if ((rec->e_cpos > cpos) ||
-			    ((cpos + clusters) >
-			     (rec->e_cpos + rec->e_clusters)))
-				goto out_free;
-
-			/*
-			 * We don't insert this record because we're
-			 * about to traverse it
-			 */
-
-			ret = OCFS2_ET_CORRUPT_EXTENT_BLOCK;
-			if (blkno)
-				goto out_free;
-			blkno = rec->e_blkno;
-		}
-
-		/*
-		 * We don't support holes, and we're still up
-		 * in the branches, so we'd better have found someone
-		 */
-		ret = OCFS2_ET_CORRUPT_EXTENT_BLOCK;
-		if (!blkno)
-			goto out_free;
-
-		ret = ocfs2_read_extent_block(cinode->ci_fs,
-					      blkno, eb_buf);
-		if (ret)
-			goto out_free;
-
-		eb = (struct ocfs2_extent_block *)eb_buf;
-		el = &eb->h_list;
+		if (v_cluster < rec->e_cpos)
+			break;
 	}
 
-	if (el->l_tree_depth)
-		abort();
-
-	for (i = 0; i < el->l_next_free_rec; i++) {
-		rec = &el->l_recs[i];
-		ret = ocfs2_extent_map_insert(cinode, rec,
-					      el->l_tree_depth);
-		if (ret)
-			goto out_free;
-	}
-
-	ret = 0;
-
-out_free:
-	if (eb_buf)
-		ocfs2_free(&eb_buf);
-
-	return ret;
+	return i;
 }
 
 /*
- * This lookup actually will read from disk.  It has one invariant:
- * It will never re-traverse blocks.  This means that all inserts should
- * be new regions or more granular regions (both allowed by insert).
+ * Figure out the size of a hole which starts at v_cluster within the given
+ * extent list.
+ *
+ * If there is no more allocation past v_cluster, we return the maximum
+ * cluster size minus v_cluster.
+ *
+ * If we have in-inode extents, then el points to the dinode list and
+ * eb_buf is NULL. Otherwise, eb_buf should point to the extent block
+ * containing el.
  */
-static errcode_t ocfs2_extent_map_lookup_read(ocfs2_cached_inode *cinode,
-				      uint32_t cpos,
-				      uint32_t clusters,
-				      ocfs2_extent_map_entry **ret_ent)
+static int ocfs2_figure_hole_clusters(ocfs2_cached_inode *cinode,
+				      struct ocfs2_extent_list *el,
+				      char *eb_buf,
+				      uint32_t v_cluster,
+				      uint32_t *num_clusters)
 {
-	errcode_t ret;
-	ocfs2_extent_map_entry *ent;
-	char *eb_buf = NULL;
-	ocfs2_extent_map *em = cinode->ci_map;
-	struct ocfs2_extent_block *eb;
-	struct ocfs2_extent_list *el;
+	int ret, i;
+	char *next_eb_buf = NULL;
+	struct ocfs2_extent_block *eb, *next_eb;
 
-	ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
-	if (ent) {
-		if (!ent->e_tree_depth) {
-			*ret_ent = ent;
-			return 0;
-		}
+	i = ocfs2_search_for_hole_index(el, v_cluster);
 
-		ret = ocfs2_malloc_block(cinode->ci_fs->fs_io,
-					 &eb_buf);
-		if (ret)
-			return ret;
-
-		ret = ocfs2_read_extent_block(cinode->ci_fs,
-					      ent->e_rec.e_blkno,
-					      eb_buf);
-		if (ret) {
-			ocfs2_free(&eb_buf);
-			return ret;
-		}
-
+	if (i == el->l_next_free_rec && eb_buf) {
 		eb = (struct ocfs2_extent_block *)eb_buf;
-		el = &eb->h_list;
-	} else 
-		el = &cinode->ci_inode->id2.i_list;
 
-	ret = ocfs2_extent_map_find_leaf(cinode, cpos, clusters, el);
-	if (eb_buf)
-		ocfs2_free(&eb_buf);
-	if (ret)
-		return ret;
+		/*
+		 * Check the next leaf for any extents.
+		 */
+		if (eb->h_next_leaf_blk == 0)
+			goto no_more_extents;
 
-	ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
-	if (!ent || ent->e_tree_depth)
-		return OCFS2_ET_CORRUPT_EXTENT_BLOCK;
-
-	*ret_ent = ent;
-
-	return 0;
-}
-
-static errcode_t ocfs2_extent_map_insert_entry(ocfs2_extent_map *em,
-					       ocfs2_extent_map_entry *ent)
-{
-	struct rb_node **p, *parent;
-	ocfs2_extent_map_entry *old_ent;
-	
-	old_ent = ocfs2_extent_map_lookup(em, ent->e_rec.e_cpos,
-					  ent->e_rec.e_clusters,
-					  &p, &parent);
-	if (old_ent)
-		return OCFS2_ET_INVALID_EXTENT_LOOKUP;
-
-	rb_link_node(&ent->e_node, parent, p);
-	rb_insert_color(&ent->e_node, &em->em_extents);
-
-	return 0;
-}
-
-errcode_t ocfs2_extent_map_insert(ocfs2_cached_inode *cinode,
-				  struct ocfs2_extent_rec *rec,
-				  int tree_depth)
-{
-	errcode_t ret;
-	ocfs2_extent_map *em = cinode->ci_map;
-	ocfs2_extent_map_entry *old_ent, *new_ent;
-	ocfs2_extent_map_entry *left_ent = NULL, *right_ent = NULL;
-
-	if (!em)
-		return OCFS2_ET_INVALID_ARGUMENT;
-
-	if ((rec->e_cpos + rec->e_clusters) > em->em_clusters)
-		return OCFS2_ET_INVALID_EXTENT_LOOKUP;
-
-	ret = ocfs2_malloc0(sizeof(struct _ocfs2_extent_map_entry),
-			    &new_ent);
-	if (ret)
-		return ret;
-
-	new_ent->e_rec = *rec;
-	new_ent->e_tree_depth = tree_depth;
-	ret = ocfs2_extent_map_insert_entry(em, new_ent);
-	if (!ret)
-		return 0;
-
-	ret = OCFS2_ET_INTERNAL_FAILURE;
-	old_ent = ocfs2_extent_map_lookup(em, rec->e_cpos,
-					  rec->e_clusters, NULL, NULL);
-
-	if (!old_ent)
-		goto out_free;
-
-	ret = OCFS2_ET_INVALID_EXTENT_LOOKUP;
-	if (old_ent->e_tree_depth < tree_depth)
-		goto out_free;
-	if (old_ent->e_tree_depth == tree_depth) {
-		if (!memcmp(rec, &old_ent->e_rec,
-			    sizeof(struct ocfs2_extent_rec)))
-			ret = 0;  /* Same entry, just skip */
-		goto out_free;
-	}
-
-	/*
-	 * We do it in this order specifically so that malloc failures
-	 * do not leave an inconsistent tree.
-	 */
-	if (rec->e_cpos > old_ent->e_rec.e_cpos) {
-		ret = ocfs2_malloc0(sizeof(struct _ocfs2_extent_map_entry),
-				    &left_ent);
+		ret = ocfs2_malloc_block(cinode->ci_fs->fs_io, &next_eb_buf);
 		if (ret)
-			goto out_free;
-		*left_ent = *old_ent;
-		left_ent->e_rec.e_clusters =
-			rec->e_cpos - left_ent->e_rec.e_cpos;
-	}
-	if ((old_ent->e_rec.e_cpos +
-	     old_ent->e_rec.e_clusters) > 
-	    (rec->e_cpos + rec->e_clusters)) {
-		ret = ocfs2_malloc0(sizeof(struct _ocfs2_extent_map_entry),
-				    &right_ent);
-		if (ret)
-			goto out_free;
-		*right_ent = *old_ent;
-		right_ent->e_rec.e_cpos =
-			rec->e_cpos + rec->e_clusters;
-		right_ent->e_rec.e_clusters =
-			(old_ent->e_rec.e_cpos +
-			 old_ent->e_rec.e_clusters) -
-			right_ent->e_rec.e_cpos;
-	}
+			goto out;
 
-	rb_erase(&old_ent->e_node, &em->em_extents);
-
-	if (left_ent) {
-		ret = ocfs2_extent_map_insert_entry(em,
-						    left_ent);
+		ret = ocfs2_read_extent_block(cinode->ci_fs,
+					      eb->h_next_leaf_blk, next_eb_buf);
 		if (ret)
-			goto out_free;
-		left_ent = NULL;
-	}
+			goto out;
 
-	ret = ocfs2_extent_map_insert_entry(em, new_ent);
-	if (ret)
-		goto out_free;
-	new_ent = NULL;
+		next_eb = (struct ocfs2_extent_block *)next_eb_buf;
 
-	if (right_ent) {
-		ret = ocfs2_extent_map_insert_entry(em,
-						    right_ent);
-		if (ret)
-			goto out_free;
+		el = &next_eb->h_list;
+
+		i = ocfs2_search_for_hole_index(el, v_cluster);
+		if (i > 0) {
+			if ((i > 1) || ocfs2_rec_clusters(el->l_tree_depth,
+							  &el->l_recs[0])) {
+				ret = OCFS2_ET_CORRUPT_EXTENT_BLOCK;
+			goto out;
+			}
+		}
 	}
 
-	ocfs2_free(&old_ent);
+no_more_extents:
+	if (i == el->l_next_free_rec) {
+		/*
+		 * We're at the end of our existing allocation. Just
+		 * return the maximum number of clusters we could
+		 * possibly allocate.
+		 */
+		*num_clusters = UINT32_MAX - v_cluster;
+	} else
+		*num_clusters = el->l_recs[i].e_cpos - v_cluster;
 
-	return 0;
-
-out_free:
-	if (left_ent)
-		ocfs2_free(&left_ent);
-	if (right_ent)
-		ocfs2_free(&right_ent);
-	if (new_ent)
-		ocfs2_free(&new_ent);
-
+	ret = 0;
+out:
+	if (next_eb_buf)
+		ocfs2_free(&next_eb_buf);
 	return ret;
 }
 
-
 /*
- * Look up the record containing this cluster offset.  This record is
- * part of the extent map.  Do not free it.  Any changes you make to
- * it will reflect in the extent map.  So, if your last extent
- * is (cpos = 10, clusters = 10) and you truncate the file by 5
- * clusters, you want to do:
+ * Return the index of the extent record which contains cluster #v_cluster.
+ * -1 is returned if it was not found.
  *
- * ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec);
- * rec->e_clusters -= 5;
+ * Should work fine on interior and exterior nodes.
  */
-errcode_t ocfs2_extent_map_get_rec(ocfs2_cached_inode *cinode,
-				   uint32_t cpos,
-				   struct ocfs2_extent_rec **rec)
+static int ocfs2_search_extent_list(struct ocfs2_extent_list *el,
+				    uint32_t v_cluster)
 {
-	errcode_t ret = OCFS2_ET_EXTENT_NOT_FOUND;
-	ocfs2_extent_map *em = cinode->ci_map;
-	ocfs2_extent_map_entry *ent = NULL;
+	int ret = -1;
+	int i;
+	struct ocfs2_extent_rec *rec;
+	uint32_t rec_end, rec_start, clusters;
 
-	*rec = NULL;
+	for(i = 0; i < el->l_next_free_rec; i++) {
+		rec = &el->l_recs[i];
 
-	if (!em)
-		return OCFS2_ET_INVALID_ARGUMENT;
+		rec_start = rec->e_cpos;
+		clusters = ocfs2_rec_clusters(el->l_tree_depth, rec);
+		rec_end = rec_start + clusters;
 
-	if (cpos >= cinode->ci_inode->i_clusters)
-		return OCFS2_ET_INVALID_EXTENT_LOOKUP;
-
-	ent = ocfs2_extent_map_lookup(em, cpos, 1, NULL, NULL);
-	
-	if (ent) {
-		*rec = &ent->e_rec;
-		ret = 0;
+		if (v_cluster >= rec_start && v_cluster < rec_end) {
+			ret = i;
+			break;
+		}
 	}
 
 	return ret;
 }
 
-errcode_t ocfs2_extent_map_get_clusters(ocfs2_cached_inode *cinode,
-					uint32_t v_cpos, int count,
-					uint32_t *p_cpos,
-					int *ret_count)
+static errcode_t ocfs2_get_clusters(ocfs2_cached_inode *cinode,
+				    uint32_t v_cluster,
+				    uint32_t *p_cluster,
+		 		    uint32_t *num_clusters)
 {
-	errcode_t ret;
-	uint32_t coff, ccount;
-	ocfs2_extent_map_entry *ent = NULL;
+	int i;
+	errcode_t ret =  0;
 	ocfs2_filesys *fs = cinode->ci_fs;
-
-	*p_cpos = ccount = 0;
-
-	if (!cinode->ci_map)
-		return OCFS2_ET_INVALID_ARGUMENT;
-
-	if ((v_cpos + count) > cinode->ci_map->em_clusters)
-		return OCFS2_ET_INVALID_EXTENT_LOOKUP;
-
-	ret = ocfs2_extent_map_lookup_read(cinode, v_cpos, count, &ent);
-	if (ret)
-		return ret;
-
-	if (ent) {
-		/* We should never find ourselves straddling an interval */
-		if ((ent->e_rec.e_cpos > v_cpos) ||
-		    ((v_cpos + count) >
-		     (ent->e_rec.e_cpos + ent->e_rec.e_clusters)))
-			return OCFS2_ET_INVALID_EXTENT_LOOKUP;
-
-		coff = v_cpos - ent->e_rec.e_cpos;
-		*p_cpos = ocfs2_blocks_to_clusters(fs,
-						   ent->e_rec.e_blkno) +
-			coff;
-
-		if (ret_count)
-			*ret_count = ent->e_rec.e_clusters - coff;
-
-		return 0;
-	}
-
-
-	return OCFS2_ET_EXTENT_NOT_FOUND;
-}
-
-errcode_t ocfs2_extent_map_get_blocks(ocfs2_cached_inode *cinode,
-				      uint64_t v_blkno, int count,
-				      uint64_t *p_blkno, int *ret_count)
-{
-	errcode_t ret;
-	uint64_t boff;
-	uint32_t cpos, clusters;
-	ocfs2_filesys *fs = cinode->ci_fs;
-	int bpc = ocfs2_clusters_to_blocks(fs, 1);
-	ocfs2_extent_map_entry *ent = NULL;
+	struct ocfs2_dinode *di;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list *el;
 	struct ocfs2_extent_rec *rec;
+	char *eb_buf = NULL;
+	uint32_t coff;
 
-	*p_blkno = 0;
+	di = cinode->ci_inode;
+	el = &di->id2.i_list;
 
-	if (!cinode->ci_map)
-		return OCFS2_ET_INVALID_ARGUMENT;
+	if (el->l_tree_depth) {
+		ret = ocfs2_find_leaf(fs, di, v_cluster, &eb_buf);
+		if (ret)
+			goto out;
 
-	cpos = ocfs2_blocks_to_clusters(fs, v_blkno);
-	clusters = ocfs2_blocks_to_clusters(fs,
-					    (uint64_t)count + bpc - 1);
-	if ((cpos + clusters) > cinode->ci_map->em_clusters)
-		return OCFS2_ET_INVALID_EXTENT_LOOKUP;
+		eb = (struct ocfs2_extent_block *) eb_buf;
+		el = &eb->h_list;
 
-	ret = ocfs2_extent_map_lookup_read(cinode, cpos, clusters, &ent);
-	if (ret)
-		return ret;
-
-	if (ent)
-	{
-		rec = &ent->e_rec;
-
-		/* We should never find ourselves straddling an interval */
-		if ((rec->e_cpos > cpos) ||
-		    ((cpos + clusters) >
-		     (rec->e_cpos + rec->e_clusters)))
-			return OCFS2_ET_INVALID_EXTENT_LOOKUP;
-
-		boff = ocfs2_clusters_to_blocks(fs, cpos - rec->e_cpos);
-		boff += (v_blkno % bpc);
-		*p_blkno = rec->e_blkno + boff;
-
-		if (ret_count) {
-			*ret_count = ocfs2_clusters_to_blocks(fs,
-							      rec->e_clusters) - boff;
+		if (el->l_tree_depth) {
+			ret = OCFS2_ET_CORRUPT_EXTENT_BLOCK;
+			goto out;
 		}
-
-		return 0;
 	}
 
-	return OCFS2_ET_EXTENT_NOT_FOUND;
-}
+	i = ocfs2_search_extent_list(el, v_cluster);
+	if (i == -1) {
+		/*
+		 * A hole was found. Return some canned values that
+		 * callers can key on. If asked for, num_clusters will
+		 * be populated with the size of the hole.
 
-errcode_t ocfs2_extent_map_init(ocfs2_filesys *fs,
-				ocfs2_cached_inode *cinode)
-{
-	errcode_t ret;
+		 */
+		*p_cluster = 0;
+		if (num_clusters) {
+			ret = ocfs2_figure_hole_clusters(cinode, el, eb_buf,
+							 v_cluster,
+							 num_clusters);
+			if (ret)
+				goto out;
+		}
+	} else {
+		rec = &el->l_recs[i];
 
-	ret = ocfs2_malloc0(sizeof(struct _ocfs2_extent_map),
-			    &cinode->ci_map);
-	if (ret)
-		return ret;
+		assert(v_cluster >= rec->e_cpos);
 
-	cinode->ci_map->em_clusters = cinode->ci_inode->i_clusters;
-	cinode->ci_map->em_extents = RB_ROOT;
+		if (!rec->e_blkno) {
+			ret = OCFS2_ET_BAD_BLKNO;
+			goto out;
+		}
 
-	return 0;
-}
+		coff = v_cluster - rec->e_cpos;
 
-void ocfs2_extent_map_free(ocfs2_cached_inode *cinode)
-{
-	if (!cinode->ci_map)
-		return;
+		*p_cluster = ocfs2_blocks_to_clusters(fs, rec->e_blkno);
+		*p_cluster = *p_cluster + coff;
 
-	ocfs2_extent_map_drop(cinode, 0);
-	ocfs2_free(&cinode->ci_map);
-}
-
-
-static int extent_map_func(ocfs2_filesys *fs,
-			   struct ocfs2_extent_rec *rec,
-		  	   int tree_depth,
-			   uint32_t ccount,
-			   uint64_t ref_blkno,
-			   int ref_recno,
-			   void *priv_data)
-{
-	errcode_t ret;
-	int iret = 0;
-	struct extent_map_context *ctxt = priv_data;
-
-	if (rec->e_cpos >= ctxt->cinode->ci_inode->i_clusters) {
-		ctxt->errcode = OCFS2_ET_CORRUPT_EXTENT_BLOCK;
-		iret |= OCFS2_EXTENT_ABORT;
-	} else {
-		ret = ocfs2_extent_map_insert(ctxt->cinode, rec,
-					      tree_depth);
-		if (ret) {
-			ctxt->errcode = ret;
-			iret |= OCFS2_EXTENT_ABORT;
-		}
+		if (num_clusters)
+			*num_clusters = ocfs2_rec_clusters(el->l_tree_depth,
+							   rec) - coff;
 	}
 
-	return iret;
+out:
+	if (eb_buf)
+		ocfs2_free(&eb_buf);
+	return ret;
 }
 
-errcode_t ocfs2_load_extent_map(ocfs2_filesys *fs,
-				ocfs2_cached_inode *cinode)
+errcode_t ocfs2_extent_map_get_blocks(ocfs2_cached_inode *cinode,
+				      uint64_t v_blkno, int count,
+				      uint64_t *p_blkno, uint64_t *ret_count)
 {
 	errcode_t ret;
-	struct extent_map_context ctxt;
+	int bpc;
+	uint32_t cpos, num_clusters, p_cluster;
+	uint64_t boff = 0;
+	ocfs2_filesys *fs = cinode->ci_fs;
 
-	if (!cinode)
-		return OCFS2_ET_INVALID_ARGUMENT;
+	bpc = ocfs2_clusters_to_blocks(fs, 1);
+	cpos = ocfs2_blocks_to_clusters(fs, v_blkno);
 
-	ret = ocfs2_extent_map_init(fs, cinode);
+	ret = ocfs2_get_clusters(cinode, cpos, &p_cluster, &num_clusters);
 	if (ret)
-		return ret;
+		goto out;
 
-	ctxt.cinode = cinode;
-	ctxt.errcode = 0;
+	/*
+	 * p_cluster == 0 indicates a hole.
+	 */
+	if (p_cluster) {
+		boff = ocfs2_clusters_to_blocks(fs, p_cluster);
+		boff += (v_blkno & (uint64_t)(bpc - 1));
+	}
 
-	ret = ocfs2_extent_iterate(fs, cinode->ci_blkno, 0, NULL,
-				   extent_map_func, &ctxt);
-	if (ret)
-		goto cleanup;
+	*p_blkno = boff;
 
-	if (ctxt.errcode) {
-		ret = ctxt.errcode;
-		goto cleanup;
+	if (ret_count) {
+		*ret_count = ocfs2_clusters_to_blocks(fs, num_clusters);
+		*ret_count -= v_blkno & (uint64_t)(bpc - 1);
 	}
 
-	return 0;
-
-cleanup:
-	ocfs2_extent_map_free(cinode);
-
+out:
 	return ret;
 }
 
-static void __ocfs2_extent_map_drop(ocfs2_cached_inode  *cinode,
-				    uint32_t new_clusters,
-				    struct rb_node **free_head,
-				    ocfs2_extent_map_entry **tail_ent)
-{
-	struct rb_node *node, *next;
-	ocfs2_extent_map *em = cinode->ci_map;
-	ocfs2_extent_map_entry *ent;
-
-	*free_head = NULL;
-
-	ent = NULL;
-	node = rb_last(&em->em_extents);
-	while (node)
-	{
-		next = rb_prev(node);
-
-		ent = rb_entry(node, ocfs2_extent_map_entry,
-			       e_node);
-		if (ent->e_rec.e_cpos < new_clusters)
-			break;
-
-		rb_erase(&ent->e_node, &em->em_extents);
-
-		node->rb_right = *free_head;
-		*free_head = node;
-
-		ent = NULL;
-		node = next;
-	}
-
-	/* Do we have an entry straddling new_clusters? */
-	if (tail_ent) {
-		if (ent &&
-		    ((ent->e_rec.e_cpos + ent->e_rec.e_clusters) >
-		     new_clusters))
-			*tail_ent = ent;
-		else
-			*tail_ent = NULL;
-	}
-
-	return;
-}
-
-static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head)
-{
-	struct rb_node *node;
-	ocfs2_extent_map_entry *ent;
-
-	while (free_head) {
-		node = free_head;
-		free_head = node->rb_right;
-
-		ent = rb_entry(node, ocfs2_extent_map_entry,
-			       e_node);
-		ocfs2_free(&ent);
-	}
-}
-
-
-/*
- * Remove all entries past new_clusters, inclusive of an entry that
- * contains new_clusters.  This is effectively a cache forget.
- *
- * If you want to also clip the last extent by some number of clusters,
- * you need to call ocfs2_extent_map_trunc().
- */
-errcode_t ocfs2_extent_map_drop(ocfs2_cached_inode *cinode,
-				uint32_t new_clusters)
-{
-	struct rb_node *free_head = NULL;
-	ocfs2_extent_map *em = cinode->ci_map;
-	ocfs2_extent_map_entry *ent;
-
-	if (!em)
-		return OCFS2_ET_INVALID_ARGUMENT;
-
-	__ocfs2_extent_map_drop(cinode, new_clusters, &free_head, &ent);
-
-	if (ent) {
-		rb_erase(&ent->e_node, &em->em_extents);
-		ent->e_node.rb_right = free_head;
-		free_head = &ent->e_node;
-	}
-
-	if (free_head)
-		__ocfs2_extent_map_drop_cleanup(free_head);
-
-	return 0;
-}
-
-/*
- * Remove all entries past new_clusters and also clip any extent
- * straddling new_clusters, if there is one.
- */
-errcode_t ocfs2_extent_map_trunc(ocfs2_cached_inode *cinode,
-				 uint32_t new_clusters)
-{
-	struct rb_node *free_head = NULL;
-	ocfs2_extent_map_entry *ent = NULL;
-
-	__ocfs2_extent_map_drop(cinode, new_clusters, &free_head, &ent);
-
-	if (ent)
-		ent->e_rec.e_clusters =
-			new_clusters - ent->e_rec.e_cpos;
-
-	if (free_head)
-		__ocfs2_extent_map_drop_cleanup(free_head);
-
-	return 0;
-}
-
-
 #ifdef DEBUG_EXE
 #include <stdlib.h>
 #include <getopt.h>
@@ -702,10 +273,7 @@
 
 enum debug_op {
 	OP_NONE = 0,
-	OP_WALK,
-	OP_LOOKUP_CLUSTER,
 	OP_LOOKUP_BLOCK,
-	OP_LOOKUP_REC,
 };
 
 static uint64_t read_number(const char *num)
@@ -773,65 +341,9 @@
 static void print_usage(void)
 {
 	fprintf(stderr,
-		"Usage: extent_map -i <inode_blkno> -w <filename>\n"
-		"       extent_map -i <inode_blkno> -b <blkno>:<blocks> <filename>\n"
-		"       extent_map -i <inode_blkno> -c <cpos>:<clusters> <filename>\n"
-		"       extent_map -i <inode_blkno> -r <cpos> <filename>\n");
+		"Usage: extent_map -i <inode_blkno> -b <blkno>:<blocks> <filename>\n");
 }
 
-static int walk_extents_func(ocfs2_filesys *fs,
-			     ocfs2_cached_inode *cinode, int op)
-{
-	ocfs2_extent_map *em;
-	struct rb_node *node;
-	uint32_t ccount;
-	ocfs2_extent_map_entry *ent;
-	int i;
-
-	em = cinode->ci_map;
-
-	fprintf(stdout, "EXTENTS:\n");
-
-	ccount = 0;
-
-	for (node = rb_first(&em->em_extents); node; node = rb_next(node)) {
-		ent = rb_entry(node, ocfs2_extent_map_entry, e_node);
-
-		if (op == OP_WALK) {
-			fprintf(stdout,
-				"(%08"PRIu32", %08"PRIu32", %08"PRIu64") |"
-				" + %08"PRIu32" = %08"PRIu32" / %08"PRIu32"\n",
-				ent->e_rec.e_cpos,
-				ent->e_rec.e_clusters,
-				ent->e_rec.e_blkno, ccount,
-				ccount + ent->e_rec.e_clusters,
-				cinode->ci_inode->i_clusters);
-
-			ccount += ent->e_rec.e_clusters;
-		} else {
-			fprintf(stdout, "@%d: ",
-				ent->e_tree_depth);
-
-			for (i = cinode->ci_inode->id2.i_list.l_tree_depth;
-			     i > ent->e_tree_depth; i--)
-				fprintf(stdout, "  ");
-
-			fprintf(stdout,
-				"(%08"PRIu32", %08"PRIu32", %09"PRIu64")\n",
-				ent->e_rec.e_cpos,
-				ent->e_rec.e_clusters,
-				ent->e_rec.e_blkno);
-		}
-	}
-
-	if (op == OP_WALK)
-		fprintf(stdout, "TOTAL: %"PRIu32"\n",
-			cinode->ci_inode->i_clusters);
-
-	return 0;
-}
-
-
 extern int opterr, optind;
 extern char *optarg;
 
@@ -851,7 +363,7 @@
 
 	initialize_ocfs_error_table();
 
-	while ((c = getopt(argc, argv, "i:b:c:r:w")) != EOF) {
+	while ((c = getopt(argc, argv, "i:b:")) != EOF) {
 		switch (c) {
 			case 'i':
 				blkno = read_number(optarg);
@@ -864,15 +376,6 @@
 				}
 				break;
 
-			case 'w':
-				if (op) {
-					fprintf(stderr, "Cannot specify more than one operation\n");
-					print_usage();
-					return 1;
-				}
-				op = OP_WALK;
-				break;
-
 			case 'b':
 				if (op) {
 					fprintf(stderr, "Cannot specify more than one operation\n");
@@ -888,31 +391,6 @@
 				op = OP_LOOKUP_BLOCK;
 				break;
 
-			case 'c':
-				if (op) {
-					fprintf(stderr, "Cannot specify more than one operation\n");
-					print_usage();
-					return 1;
-				}
-				if (read_c_numbers(optarg,
-						   &cpos, &count)) {
-					fprintf(stderr, "Invalid cluster range: %s\n", optarg);
-					print_usage();
-					return 1;
-				}
-				op = OP_LOOKUP_CLUSTER;
-				break;
-
-			case 'r':
-				if (op) {
-					fprintf(stderr, "Cannot specify more than one operation\n");
-					print_usage();
-					return 1;
-				}
-				cpos = read_number(optarg);
-				op = OP_LOOKUP_REC;
-				break;
-
 			default:
 				print_usage();
 				return 1;
@@ -950,76 +428,19 @@
 		blkno, filename,
 		cinode->ci_inode->id2.i_list.l_tree_depth);
 
-	if (op == OP_WALK) {
-		ret = ocfs2_load_extent_map(fs, cinode);
-		if (ret) {
-			com_err(argv[0], ret,
-				"while loading extents");
-			goto out_free;
-		}
-	} else {
-		ret = ocfs2_extent_map_init(fs, cinode);
-		if (ret) {
-			com_err(argv[0], ret,
-				"while initializing extent map");
-			goto out_free;
-		}
-
-		switch (op) {
-			case OP_LOOKUP_BLOCK:
-				ret = ocfs2_extent_map_get_blocks(cinode,
-								  blkoff,
-								  count,
-								  &blkno,
-								  &contig);
-				if (ret) {
-					com_err(argv[0], ret, 
-						"looking up block range %"PRIu64":%d", blkoff, count);
-					goto out_free;
-				}
-				fprintf(stdout, "Lookup of block range %"PRIu64":%d returned %"PRIu64":%d\n",
-					blkoff, count, blkno, contig);
-				break;
-
-			case OP_LOOKUP_CLUSTER:
-				ret = ocfs2_extent_map_get_clusters(cinode,
-								  cpos,
-								  count,
-								  &coff,
-								  &contig);
-				if (ret) {
-					com_err(argv[0], ret, 
-						"looking up cluster range %"PRIu32":%d", cpos, count);
-					goto out_free;
-				}
-				fprintf(stdout, "Lookup of cluster range %"PRIu32":%d returned %"PRIu32":%d\n",
-					cpos, count, coff, contig);
-				break;
-				
-			case OP_LOOKUP_REC:
-				ret = ocfs2_extent_map_get_rec(cinode,
-							       cpos,
-							       &rec);
-				if (ret) {
-					com_err(argv[0], ret, 
-						"looking up cluster %"PRIu32"", cpos);
-					goto out_free;
-				}
-				fprintf(stdout, "Lookup of cluster %"PRIu32" returned (%"PRIu32", %"PRIu32", %"PRIu64")\n",
-					cpos, rec->e_cpos,
-					rec->e_clusters, rec->e_blkno);
-				break;
-
-			default:
-				ret = OCFS2_ET_INTERNAL_FAILURE;
-				com_err(argv[0], ret,
-					"Invalid op can't happen!\n");
-				goto out_free;
-		}
+	ret = ocfs2_extent_map_get_blocks(cinode,
+					  blkoff,
+					  count,
+					  &blkno,
+					  &contig);
+	if (ret) {
+		com_err(argv[0], ret,
+			"looking up block range %"PRIu64":%d", blkoff, count);
+		goto out_free;
 	}
+	fprintf(stdout, "Lookup of block range %"PRIu64":%d returned %"PRIu64":%d\n",
+		blkoff, count, blkno, contig);
 
-	walk_extents_func(fs, cinode, op);
-
 out_free:
 	ocfs2_free_cached_inode(fs, cinode);
 

Modified: trunk/libocfs2/extents.c
===================================================================
--- trunk/libocfs2/extents.c	2007-08-24 23:30:25 UTC (rev 1405)
+++ trunk/libocfs2/extents.c	2007-08-24 23:36:10 UTC (rev 1406)
@@ -49,7 +49,10 @@
 		struct ocfs2_extent_rec *rec = &el->l_recs[i];
 
 		rec->e_cpos = bswap_32(rec->e_cpos);
-		rec->e_clusters = bswap_32(rec->e_clusters);
+		if (el->l_tree_depth)
+			rec->e_int_clusters = bswap_32(rec->e_int_clusters);
+		else
+			rec->e_leaf_clusters = bswap_16(rec->e_leaf_clusters);
 		rec->e_blkno = bswap_64(rec->e_blkno);
 	}
 }
@@ -248,7 +251,7 @@
 				iret |= update_eb_rec(ctxt, &before,
 						      &el->l_recs[i]);
 
-			if (el->l_recs[i].e_clusters &&
+			if (el->l_recs[i].e_int_clusters &&
 			   (el->l_recs[i].e_cpos >= ctxt->last_eb_cpos)) {
 				/*
 				 * Only set last_eb_blkno if current extent
@@ -261,6 +264,12 @@
 			}
 
 		} else {
+			/*
+			 * For a sparse file, we may find an empty record
+			 * in the left most record. Just skip it.
+			 */
+			if (!i && !el->l_recs[i].e_leaf_clusters)
+				continue;
 			iret |= (*ctxt->func)(ctxt->fs, &el->l_recs[i],
 					      el->l_tree_depth,
 					      ctxt->ccount, ref_blkno,
@@ -268,7 +277,8 @@
 			if (iret & OCFS2_EXTENT_CHANGED)
 				iret |= update_leaf_rec(ctxt, &before,
 							&el->l_recs[i]);
-			ctxt->ccount += el->l_recs[i].e_clusters;
+			ctxt->ccount += ocfs2_rec_clusters(el->l_tree_depth,
+							   &el->l_recs[i]);
 		}
 		if (iret & (OCFS2_EXTENT_ABORT | OCFS2_EXTENT_ERROR))
 			break;
@@ -276,7 +286,8 @@
 
 	if (iret & OCFS2_EXTENT_CHANGED) {
 		for (i = 0; i < el->l_count; i++) {
-			if (el->l_recs[i].e_clusters)
+			if (ocfs2_rec_clusters(el->l_tree_depth,
+					       &el->l_recs[i]))
 				continue;
 			el->l_next_free_rec = i;
 			break;
@@ -334,9 +345,12 @@
 	if (flags & (OCFS2_EXTENT_ABORT | OCFS2_EXTENT_ERROR))
 		iret |= flags & (OCFS2_EXTENT_ABORT | OCFS2_EXTENT_ERROR);
 
-	/* if the list was changed and we still have recs then we need
-	 * to write the changes to disk */
-	if (changed & OCFS2_EXTENT_CHANGED && el->l_next_free_rec) {
+	/*
+	 * If the list was changed, we should write the changes to disk.
+	 * Note:
+	 * For a sparse file, we may have an empty extent block.
+	 */
+	if (changed & OCFS2_EXTENT_CHANGED) {
 		ctxt->errcode = ocfs2_write_extent_block(ctxt->fs,
 							 eb_rec->e_blkno,
 						ctxt->eb_bufs[tree_depth]);
@@ -508,8 +522,9 @@
 	uint64_t blkno, bcount, bend;
 	int iret = 0;
 
-	bcount = ocfs2_clusters_to_blocks(fs, ccount);
-	bend = bcount + ocfs2_clusters_to_blocks(fs, rec->e_clusters);
+	bcount = ocfs2_clusters_to_blocks(fs, rec->e_cpos);
+	bend = bcount + ocfs2_clusters_to_blocks(fs,
+					ocfs2_rec_clusters(tree_depth, rec));
 
 	for (blkno = rec->e_blkno; bcount < bend; blkno++, bcount++) {
 		if (((bcount * fs->fs_blocksize) >= ctxt->inode->i_size) &&
@@ -627,12 +642,14 @@
 		fprintf(stdout, " ");
 	fprintf(stdout, "(%08"PRIu32", %08"PRIu32", %08"PRIu64") |"
 			" + %08"PRIu32" = %08"PRIu32" / %08"PRIu32"\n",
-		rec->e_cpos, rec->e_clusters,
-		rec->e_blkno, ccount, ccount + rec->e_clusters,
+		rec->e_cpos, ocfs2_rec_clustes(tree_depth, rec),
+		rec->e_blkno, ccount,
+		ccount + ocfs2_rec_clusters(tree_depth, rec),
 		wi->di->i_clusters);
 
 	if (!tree_depth &&
-	    ((ccount + rec->e_clusters) == wi->di->i_clusters))
+	    ((ccount + ocfs2_rec_clusters(tree_depth, rec)) ==
+							 wi->di->i_clusters))
 		fprintf(stdout, "TOTAL: %u\n", wi->di->i_clusters);
 
 	return 0;

Modified: trunk/libocfs2/fileio.c
===================================================================
--- trunk/libocfs2/fileio.c	2007-08-24 23:30:25 UTC (rev 1405)
+++ trunk/libocfs2/fileio.c	2007-08-24 23:36:10 UTC (rev 1406)
@@ -128,7 +128,7 @@
 	errcode_t	ret = 0;
 	char		*ptr = (char *) buf;
 	uint32_t	wanted_blocks;
-	uint32_t	contig_blocks;
+	uint64_t	contig_blocks;
 	uint64_t	v_blkno;
 	uint64_t	p_blkno;
 	uint32_t	tmp;
@@ -162,9 +162,15 @@
 		if (contig_blocks > wanted_blocks)
 			contig_blocks = wanted_blocks;
 
-		ret = io_read_block(fs->fs_io, p_blkno, contig_blocks, ptr);
-		if (ret)
-			return ret;
+		if (!p_blkno) {
+			/* We hit a hole; just zero-fill the buffer. */
+			memset(ptr, 0, contig_blocks * fs->fs_blocksize);
+		} else {
+			ret = io_read_block(fs->fs_io, p_blkno,
+					    contig_blocks, ptr);
+			if (ret)
+				return ret;
+		}
 
 		*got += (contig_blocks <<
 			 OCFS2_RAW_SB(fs->fs_super)->s_blocksize_bits);
@@ -184,6 +190,38 @@
 	return ret;
 }
 
+/*
+ * Empty (zero-fill) the blocks on the disk.
+ */
+static errcode_t empty_blocks(ocfs2_filesys *fs,
+			      uint64_t start_blk,
+			      uint64_t num_blocks)
+{
+	errcode_t ret;
+	char *buf = NULL;
+
+	ret = ocfs2_malloc_block(fs->fs_io, &buf);
+	if (ret)
+		goto bail;
+
+	memset(buf, 0, fs->fs_blocksize);
+
+	while (num_blocks) {
+		ret = io_write_block(fs->fs_io, start_blk, 1, buf);
+		if (ret)
+			goto bail;
+
+		num_blocks--;
+		start_blk++;
+	}
+
+bail:
+	if (buf)
+		ocfs2_free(&buf);
+
+	return ret;
+}
+
 errcode_t ocfs2_file_write(ocfs2_cached_inode *ci, void *buf, uint32_t count,
 			   uint64_t offset, uint32_t *wrote)
 {
@@ -191,12 +229,16 @@
 	errcode_t	ret = 0;
 	char		*ptr = (char *) buf;
 	uint32_t	wanted_blocks;
-	uint32_t	contig_blocks;
+	uint64_t	contig_blocks;
 	uint64_t	v_blkno;
-	uint64_t	p_blkno;
+	uint64_t	p_blkno, p_alloc, p_offset = 0;
 	uint32_t	tmp;
 	uint64_t	num_blocks;
 	int		bs_bits = OCFS2_RAW_SB(fs->fs_super)->s_blocksize_bits;
+	uint64_t	ino = ci->ci_blkno;
+	uint32_t	n_clusters, cluster_begin, cluster_end;
+	uint64_t	bpc = fs->fs_clustersize/fs->fs_blocksize;
+	int		insert = 0;
 
 	/* o_direct requires aligned io */
 	tmp = fs->fs_blocksize - 1;
@@ -225,10 +267,89 @@
 		if (contig_blocks > wanted_blocks)
 			contig_blocks = wanted_blocks;
 
+	 	if (!p_blkno) {
+			/*
+			 * We hit a hole here, so we allocate clusters and
+			 * zero both ends of the allocation where needed.
+			 *
+			 * We postpone the extent insertion until after we
+			 * have successfully written the extent's blocks, so
+			 * that any problem that happens in block writing
+			 * does not affect the file.
+			 */
+			cluster_begin = ocfs2_blocks_to_clusters(fs, v_blkno);
+			cluster_end = ocfs2_blocks_to_clusters(fs,
+						v_blkno + contig_blocks -1);
+			n_clusters = cluster_end - cluster_begin + 1;
+			ret = ocfs2_new_clusters(fs, 1, n_clusters, &p_alloc,
+						 &n_clusters);
+			if (ret || n_clusters == 0)
+				return ret;
+
+			p_offset = v_blkno & (bpc - 1);
+			p_blkno = p_alloc + p_offset;
+			if (p_offset) {
+				/*
+				 * The user doesn't write the first blocks,
+				 * so we have to clear them.
+				 */
+				ret = empty_blocks(fs, p_alloc, p_offset);
+				if (ret)
+					return ret;
+			}
+
+			contig_blocks = n_clusters * bpc - p_offset;
+			if (contig_blocks > wanted_blocks) {
+				/*
+				 * we don't need to write that many blocks,
+				 * so empty the blocks at the bottom.
+				 */
+				ret = empty_blocks(fs, p_blkno + wanted_blocks,
+						contig_blocks - wanted_blocks);
+				if (ret)
+					return ret;
+				contig_blocks = wanted_blocks;
+			}
+
+			insert = 1;
+		}
+
 		ret = io_write_block(fs->fs_io, p_blkno, contig_blocks, ptr);
 		if (ret)
 			return ret;
 
+		if (insert) {
+	 		ret = ocfs2_insert_extent(fs, ci->ci_blkno,
+					ocfs2_blocks_to_clusters(fs,v_blkno),
+					p_alloc, n_clusters);
+			if (ret) {
+				/*
+				 * XXX: We don't want to overwrite the error
+				 * from insert_extent().  But we probably need
+				 * to BE LOUDLY UPSET.
+				 */
+				ocfs2_free_clusters(fs, n_clusters, p_alloc);
+				return ret;
+			}
+
+			/*
+			 * since the inode information has been changed, we
+			 * may need to reinitialize it and test whether we can
+			 * really find the inserted extents.
+			 */
+			ocfs2_free_cached_inode(fs, ci);
+			ret = ocfs2_read_cached_inode(fs,ino, &ci);
+			ret = ocfs2_extent_map_get_blocks(ci, v_blkno, 1,
+						&p_blkno, NULL);
+			/* now we shouldn't find a hole. */
+			if (!p_blkno || p_blkno != p_alloc + p_offset)
+				ret = OCFS2_ET_INTERNAL_FAILURE;
+			if (ret)
+				return ret;
+
+			insert = 0;
+		}
+
 		*wrote += (contig_blocks << bs_bits);
 		wanted_blocks -= contig_blocks;
 
@@ -240,6 +361,7 @@
 				*wrote = (uint32_t) (ci->ci_inode->i_size - offset);
 			/* break */
 		}
+
 	}
 
 	return ret;
@@ -432,4 +554,3 @@
 	return 0;
 }
 #endif  /* DEBUG_EXE */
-

Modified: trunk/libocfs2/heartbeat.c
===================================================================
--- trunk/libocfs2/heartbeat.c	2007-08-24 23:30:25 UTC (rev 1405)
+++ trunk/libocfs2/heartbeat.c	2007-08-24 23:36:10 UTC (rev 1406)
@@ -89,7 +89,7 @@
 		goto leave;
 	}
 
-	blocks = rec->e_clusters << cluster_bits;
+	blocks = ocfs2_rec_clusters(0, rec) << cluster_bits;
 	blocks >>= block_bits;
 
 	if (blocks > O2NM_MAX_NODES)

Modified: trunk/libocfs2/include/ocfs2.h
===================================================================
--- trunk/libocfs2/include/ocfs2.h	2007-08-24 23:30:25 UTC (rev 1405)
+++ trunk/libocfs2/include/ocfs2.h	2007-08-24 23:36:10 UTC (rev 1406)
@@ -190,7 +190,6 @@
 typedef struct _ocfs2_filesys ocfs2_filesys;
 typedef struct _ocfs2_cached_inode ocfs2_cached_inode;
 typedef struct _io_channel io_channel;
-typedef struct _ocfs2_extent_map ocfs2_extent_map;
 typedef struct _ocfs2_inode_scan ocfs2_inode_scan;
 typedef struct _ocfs2_dir_scan ocfs2_dir_scan;
 typedef struct _ocfs2_bitmap ocfs2_bitmap;
@@ -229,7 +228,6 @@
 	struct _ocfs2_filesys *ci_fs;
 	uint64_t ci_blkno;
 	struct ocfs2_dinode *ci_inode;
-	ocfs2_extent_map *ci_map;
 	ocfs2_bitmap *ci_chains;
 };
 
@@ -295,30 +293,12 @@
 
 void ocfs2_swap_extent_list_from_cpu(struct ocfs2_extent_list *el);
 void ocfs2_swap_extent_list_to_cpu(struct ocfs2_extent_list *el);
-errcode_t ocfs2_extent_map_init(ocfs2_filesys *fs,
-				ocfs2_cached_inode *cinode);
-void ocfs2_extent_map_free(ocfs2_cached_inode *cinode);
-errcode_t ocfs2_extent_map_insert(ocfs2_cached_inode *cinode,
-				  struct ocfs2_extent_rec *rec,
-				  int tree_depth);
-errcode_t ocfs2_extent_map_drop(ocfs2_cached_inode *cinode,
-				 uint32_t new_clusters);
-errcode_t ocfs2_extent_map_trunc(ocfs2_cached_inode *cinode,
-				 uint32_t new_clusters);
-errcode_t ocfs2_extent_map_get_rec(ocfs2_cached_inode *cinode,
-				   uint32_t cpos,
-				   struct ocfs2_extent_rec **rec);
-errcode_t ocfs2_extent_map_get_clusters(ocfs2_cached_inode *cinode,
-					uint32_t v_cpos, int count,
-					uint32_t *p_cpos,
-					int *ret_count);
 errcode_t ocfs2_extent_map_get_blocks(ocfs2_cached_inode *cinode,
 				      uint64_t v_blkno, int count,
 				      uint64_t *p_blkno,
-				      int *ret_count);
-errcode_t ocfs2_load_extent_map(ocfs2_filesys *fs,
-				ocfs2_cached_inode *cinode);
-
+				      uint64_t *ret_count);
+int ocfs2_find_leaf(ocfs2_filesys *fs, struct ocfs2_dinode *di,
+		    uint32_t cpos, char **leaf_buf);
 void ocfs2_swap_journal_superblock(journal_superblock_t *jsb);
 errcode_t ocfs2_init_journal_superblock(ocfs2_filesys *fs, char *buf,
 					int buflen, uint32_t jrnl_size);
@@ -541,7 +521,7 @@
 errcode_t ocfs2_new_dir_block(ocfs2_filesys *fs, uint64_t dir_ino,
 			      uint64_t parent_ino, char **block);
 
-errcode_t ocfs2_insert_extent(ocfs2_filesys *fs, uint64_t ino,
+errcode_t ocfs2_insert_extent(ocfs2_filesys *fs, uint64_t ino, uint32_t cpos,
 			      uint64_t c_blkno, uint32_t clusters);
 
 errcode_t ocfs2_new_inode(ocfs2_filesys *fs, uint64_t *ino, int mode);
@@ -549,8 +529,14 @@
 errcode_t ocfs2_delete_inode(ocfs2_filesys *fs, uint64_t ino);
 errcode_t ocfs2_new_extent_block(ocfs2_filesys *fs, uint64_t *blkno);
 errcode_t ocfs2_delete_extent_block(ocfs2_filesys *fs, uint64_t blkno);
+/*
+ * Allocate the blocks and insert them into the file.
+ * Only i_clusters of the dinode is updated accordingly; i_size is unchanged.
+ */
 errcode_t ocfs2_extend_allocation(ocfs2_filesys *fs, uint64_t ino,
 				  uint32_t new_clusters);
+/* Extend the file to the new size. No clusters will be allocated. */
+errcode_t ocfs2_extend_file(ocfs2_filesys *fs, uint64_t ino, uint64_t new_size);
 errcode_t ocfs2_truncate(ocfs2_filesys *fs, uint64_t ino, uint64_t new_i_size);
 errcode_t ocfs2_new_clusters(ocfs2_filesys *fs,
 			     uint32_t min,
@@ -737,6 +723,35 @@
 }
 
 /*
+ * Helper function to look at the # of clusters in an extent record.
+ */
+static inline uint32_t ocfs2_rec_clusters(uint16_t tree_depth,
+					  struct ocfs2_extent_rec *rec)
+{
+	/*
+	 * Cluster count in extent records is slightly different
+	 * between interior nodes and leaf nodes. This is to support
+	 * unwritten extents which need a flags field in leaf node
+	 * records, thus shrinking the available space for a clusters
+	 * field.
+	 */
+	if (tree_depth)
+		return rec->e_int_clusters;
+	else
+		return rec->e_leaf_clusters;
+}
+
+static inline void ocfs2_set_rec_clusters(uint16_t tree_depth,
+					  struct ocfs2_extent_rec *rec,
+					  uint32_t clusters)
+{
+	if (tree_depth)
+		rec->e_int_clusters = clusters;
+	else
+		rec->e_leaf_clusters = clusters;
+}
+
+/*
  * shamelessly lifted from the kernel
  *
  * min()/max() macros that also do

Modified: trunk/libocfs2/include/ocfs2_fs.h
===================================================================
--- trunk/libocfs2/include/ocfs2_fs.h	2007-08-24 23:30:25 UTC (rev 1405)
+++ trunk/libocfs2/include/ocfs2_fs.h	2007-08-24 23:36:10 UTC (rev 1406)
@@ -86,7 +86,8 @@
 	OCFS2_SB(sb)->s_feature_incompat &= ~(mask)
 
 #define OCFS2_FEATURE_COMPAT_SUPP	OCFS2_FEATURE_COMPAT_BACKUP_SB
-#define OCFS2_FEATURE_INCOMPAT_SUPP	OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT
+#define OCFS2_FEATURE_INCOMPAT_SUPP	(OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \
+					 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC)
 #define OCFS2_FEATURE_RO_COMPAT_SUPP	0
 
 /*
@@ -124,6 +125,11 @@
  */
 #define OCFS2_FEATURE_COMPAT_BACKUP_SB		0x0001
 
+/*
+ * Unwritten extents support.
+ */
+#define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN	0x0001
+
 /* The byte offset of the first backup block will be 1G.
  * The following will be 4G, 16G, 64G, 256G and 1T.
  */
@@ -164,12 +170,46 @@
 #define OCFS2_FL_MODIFIABLE	(0x000100FF)	/* User modifiable flags */
 
 /*
+ * Extent record flags (ocfs2_extent_rec.e_flags, leaf records only)
+ */
+#define OCFS2_EXT_UNWRITTEN	(0x01)	/* Extent is allocated but
+					 * unwritten */
+
+/*
  * ioctl commands
  */
 #define OCFS2_IOC_GETFLAGS	_IOR('f', 1, long)
 #define OCFS2_IOC_SETFLAGS	_IOW('f', 2, long)
+#define OCFS2_IOC32_GETFLAGS	_IOR('f', 1, int)
+#define OCFS2_IOC32_SETFLAGS	_IOW('f', 2, int)
 
 /*
+ * Space reservation / allocation / free ioctls and argument structure
+ * are designed to be compatible with XFS.
+ *
+ * ALLOCSP* and FREESP* are not and will never be supported, but are
+ * included here for completeness.
+ */
+struct ocfs2_space_resv {
+	__s16		l_type;
+	__s16		l_whence;
+	__s64		l_start;
+	__s64		l_len;		/* len == 0 means until end of file */
+	__s32		l_sysid;
+	__u32		l_pid;
+	__s32		l_pad[4];	/* reserve area			    */
+};
+
+#define OCFS2_IOC_ALLOCSP		_IOW ('X', 10, struct ocfs2_space_resv)
+#define OCFS2_IOC_FREESP		_IOW ('X', 11, struct ocfs2_space_resv)
+#define OCFS2_IOC_RESVSP		_IOW ('X', 40, struct ocfs2_space_resv)
+#define OCFS2_IOC_UNRESVSP	_IOW ('X', 41, struct ocfs2_space_resv)
+#define OCFS2_IOC_ALLOCSP64	_IOW ('X', 36, struct ocfs2_space_resv)
+#define OCFS2_IOC_FREESP64	_IOW ('X', 37, struct ocfs2_space_resv)
+#define OCFS2_IOC_RESVSP64	_IOW ('X', 42, struct ocfs2_space_resv)
+#define OCFS2_IOC_UNRESVSP64	_IOW ('X', 43, struct ocfs2_space_resv)
+
+/*
  * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
  */
 #define OCFS2_JOURNAL_DIRTY_FL	(0x00000001)	/* Journal needs recovery */
@@ -291,10 +331,21 @@
 /*
  * On disk extent record for OCFS2
  * It describes a range of clusters on disk.
+ *
+ * Length fields are divided into interior and leaf node versions.
+ * This leaves room for a flags field (OCFS2_EXT_*) in the leaf nodes.
  */
 struct ocfs2_extent_rec {
 /*00*/	__le32 e_cpos;		/* Offset into the file, in clusters */
-	__le32 e_clusters;	/* Clusters covered by this extent */
+	union {
+		__le32 e_int_clusters; /* Clusters covered by all children */
+		struct {
+			__le16 e_leaf_clusters; /* Clusters covered by this
+						   extent */
+			__u8 e_reserved1;
+			__u8 e_flags; /* Extent flags */
+		};
+	};
 	__le64 e_blkno;		/* Physical disk offset, in blocks */
 /*10*/
 };
@@ -320,7 +371,10 @@
 /*00*/	__le16 l_tree_depth;		/* Extent tree depth from this
 					   point.  0 means data extents
 					   hang directly off this
-					   header (a leaf) */
+					   header (a leaf)
+					   NOTE: The high 8 bits cannot be
+					   used - tree_depth is never that big.
+					*/
 	__le16 l_count;			/* Number of extent records */
 	__le16 l_next_free_rec;		/* Next unused extent slot */
 	__le16 l_reserved1;
@@ -455,7 +509,9 @@
 	__le32 i_ctime_nsec;
 	__le32 i_mtime_nsec;
 	__le32 i_attr;
-	__le32 i_reserved1;
+	__le16 i_orphaned_slot;		/* Only valid when OCFS2_ORPHANED_FL
+					   was set in i_flags */
+	__le16 i_reserved1;
 /*70*/	__le64 i_reserved2[8];
 /*B8*/	union {
 		__le64 i_pad1;		/* Generic way to refer to this
@@ -596,7 +652,7 @@
 
 	if (index >= 0 && index < OCFS2_MAX_BACKUP_SUPERBLOCKS) {
 		offset <<= (2 * index);
-		offset /= sb->s_blocksize;
+		offset >>= sb->s_blocksize_bits;
 		return offset;
 	}
 

Modified: trunk/libocfs2/mkjournal.c
===================================================================
--- trunk/libocfs2/mkjournal.c	2007-08-24 23:30:25 UTC (rev 1405)
+++ trunk/libocfs2/mkjournal.c	2007-08-24 23:36:10 UTC (rev 1406)
@@ -217,10 +217,6 @@
 	uint64_t offset = 0;
 	uint32_t wrote, count, jrnl_blocks;
 
-	ret = ocfs2_extent_map_init(fs, ci);
-	if (ret)
-		goto out;
-
 #define BUFLEN	1048576
 	ret = ocfs2_malloc_blocks(fs->fs_io, (BUFLEN >> bs_bits), &buf);
 	if (ret)

Modified: trunk/libocfs2/truncate.c
===================================================================
--- trunk/libocfs2/truncate.c	2007-08-24 23:30:25 UTC (rev 1405)
+++ trunk/libocfs2/truncate.c	2007-08-24 23:36:10 UTC (rev 1406)
@@ -32,6 +32,11 @@
 
 #include "ocfs2.h"
 
+struct truncate_ctxt {
+	uint64_t new_size_in_clusters;
+	uint32_t new_i_clusters;
+};
+
 /*
  * Delete and free clusters if needed.  This only works with DEPTH_TRAVERSE.
  */
@@ -41,18 +46,23 @@
 			    uint64_t ref_blkno, int ref_recno,
 			    void *priv_data)
 {
-	uint32_t len, new_i_clusters = *(uint32_t *)priv_data;
+	struct truncate_ctxt *ctxt = (struct truncate_ctxt *)priv_data;
+	uint32_t len, new_size_in_clusters = ctxt->new_size_in_clusters;
 	uint64_t start = 0;
 	errcode_t ret;
+	int func_ret = OCFS2_EXTENT_ERROR;
+	char *buf = NULL;
+	struct ocfs2_extent_list *el = NULL;
 
-	if ((rec->e_cpos + rec->e_clusters) <= new_i_clusters)
+	if ((rec->e_cpos + ocfs2_rec_clusters(tree_depth, rec)) <=
+							new_size_in_clusters)
 		return 0;
 
-	if (rec->e_cpos >= new_i_clusters) {
+	if (rec->e_cpos >= new_size_in_clusters) {
 		/* the rec is entirely outside the new size, free it */
 		if (!tree_depth) {
 			start = rec->e_blkno;
-			len = rec->e_clusters;
+			len = ocfs2_rec_clusters(tree_depth, rec);
 		} else {
 			/* here we meet with a full empty extent block, delete
 			 * it. The extent list it contains should already be
@@ -63,91 +73,169 @@
 				goto bail;
 		}
 
-		rec->e_blkno = 0;
-		rec->e_clusters = 0;
-		rec->e_cpos = 0;
+		memset(rec, 0, sizeof(struct ocfs2_extent_rec));
 	} else {
 		/* we're truncating into the middle of the rec */
-		len = rec->e_cpos + rec->e_clusters;
-		len -= new_i_clusters;
-		rec->e_clusters = new_i_clusters - rec->e_cpos;
-		if (!tree_depth)
+		len = rec->e_cpos +
+			 ocfs2_rec_clusters(tree_depth, rec);
+		len -= new_size_in_clusters;
+		if (!tree_depth) {
+			ocfs2_set_rec_clusters(tree_depth, rec,
+				 	new_size_in_clusters - rec->e_cpos);
 			start = rec->e_blkno +
-				ocfs2_clusters_to_blocks(fs, rec->e_clusters);
+				ocfs2_clusters_to_blocks(fs,
+						ocfs2_rec_clusters(tree_depth,
+								   rec));
+		} else {
+			ocfs2_set_rec_clusters(tree_depth, rec,
+					new_size_in_clusters - rec->e_cpos);
+			/*
+			 * For a sparse file, we may meet with another
+			 * situation here:
+			 * The start of the left most extent rec is greater
+			 * than the new size we truncate the file to, but the
+			 * start of the extent block is less than that size.
+			 * In this case, actually all the extent records in
+			 * this extent block have been removed. So we have
+			 * to remove the extent block also.
+			 * In this function, we have to reread the extent list
+			 * to see whether the extent block is empty or not.
+			 */
+			ret = ocfs2_malloc_block(fs->fs_io, &buf);
+			if (ret)
+				goto bail;
+
+			ret = ocfs2_read_extent_block(fs, rec->e_blkno, buf);
+			if (ret)
+				goto bail;
+
+			el = &((struct ocfs2_extent_block *)buf)->h_list;
+			if (el->l_next_free_rec == 0) {
+				ret = ocfs2_delete_extent_block(fs, rec->e_blkno);
+				if (ret)
+					goto bail;
+				memset(rec, 0, sizeof(struct ocfs2_extent_rec));
+			}
+		}
 	}
 
 	if (start) {
 		ret = ocfs2_free_clusters(fs, len, start);
 		if (ret)
 			goto bail;
+		ctxt->new_i_clusters -= len;
 	}
 
-	return OCFS2_EXTENT_CHANGED;
+	func_ret =  OCFS2_EXTENT_CHANGED;
 bail:
-	return OCFS2_EXTENT_ERROR;
+	if (buf)
+		ocfs2_free(&buf);
+	return func_ret;
 }
 
+/*
+ * Zero the area past i_size but still within an allocated
+ * cluster. This avoids exposing nonzero data on subsequent file
+ * extends.
+ */
+static errcode_t ocfs2_zero_tail_for_truncate(ocfs2_cached_inode *ci,
+					      uint64_t new_size)
+{
+	errcode_t ret;
+	char *buf = NULL;
+	ocfs2_filesys *fs = ci->ci_fs;
+	uint64_t start_blk, p_blkno, contig_blocks, start_off;
+	int count, byte_counts, bpc = fs->fs_clustersize /fs->fs_blocksize;
+
+	if (new_size >= ci->ci_inode->i_size || new_size == 0)
+		return 0;
+
+	start_blk = new_size / fs->fs_blocksize;
+
+	ret = ocfs2_extent_map_get_blocks(ci, start_blk, 1,
+					  &p_blkno, &contig_blocks);
+	if (ret)
+		goto out;
+
+	/* Tail is a hole. */
+	if (!p_blkno)
+		goto out;
+
+	/* calculate the total blocks we need to empty. */
+	count = bpc - (p_blkno & (bpc - 1));
+	ret = ocfs2_malloc_blocks(fs->fs_io, count, &buf);
+	if (ret)
+		goto out;
+
+	ret = io_read_block(fs->fs_io, p_blkno, count, buf);
+	if (ret)
+		goto out;
+
+	/* empty the content after the new_size and within the same cluster. */
+	start_off = new_size % fs->fs_blocksize;
+	byte_counts = count * fs->fs_blocksize - start_off;
+	memset(buf + start_off, 0, byte_counts);
+
+	ret = io_write_block(fs->fs_io, p_blkno, count, buf);
+
+out:
+	if (buf)
+		ocfs2_free(&buf);
+	return ret;
+}
+
 /* XXX care about zeroing new clusters and final partially truncated 
  * clusters */
 errcode_t ocfs2_truncate(ocfs2_filesys *fs, uint64_t ino, uint64_t new_i_size)
 {
 	errcode_t ret;
-	char *buf;
-	struct ocfs2_dinode *di;
-	uint32_t new_i_clusters;
-	uint64_t new_i_blocks;
+	uint32_t new_size_in_clusters;
+	uint64_t new_size_in_blocks;
+	ocfs2_cached_inode *ci = NULL;
 
-	ret = ocfs2_malloc_block(fs->fs_io, &buf);
+	ret = ocfs2_read_cached_inode(fs, ino, &ci);
 	if (ret)
-		return ret;
-
-	ret = ocfs2_read_inode(fs, ino, buf);
-	if (ret)
 		goto out;
-	di = (struct ocfs2_dinode *)buf;
 
-	if (di->i_size == new_i_size)
+	if (ci->ci_inode->i_size == new_i_size)
 		goto out;
 
-	new_i_blocks = ocfs2_blocks_in_bytes(fs, new_i_size);
-	new_i_clusters = ocfs2_clusters_in_blocks(fs, new_i_blocks);
+	new_size_in_blocks = ocfs2_blocks_in_bytes(fs, new_i_size);
+	new_size_in_clusters = ocfs2_clusters_in_blocks(fs, new_size_in_blocks);
 
-	if (di->i_clusters < new_i_clusters) {
-		ret = ocfs2_extend_allocation(fs, ino,
-					new_i_clusters - di->i_clusters);
-		if (ret)
-			goto out;
+	if (ci->ci_inode->i_size < new_i_size)
+		ret = ocfs2_extend_file(fs, ino, new_i_size);
+	else {
+		struct truncate_ctxt ctxt = {
+			.new_i_clusters = ci->ci_inode->i_clusters,
+			.new_size_in_clusters = new_size_in_clusters,
+		};
 
-		/* the information of dinode has been changed, and we need to
-		 * read it again.
-		 */
-		ret = ocfs2_read_inode(fs, ino, buf);
-		if (ret)
-			goto out;
-	} else {
-		ret = ocfs2_extent_iterate_inode(fs, di,
+		ret = ocfs2_extent_iterate_inode(fs, ci->ci_inode,
 						 OCFS2_EXTENT_FLAG_DEPTH_TRAVERSE,
 						 NULL, truncate_iterate,
-						 &new_i_clusters);
+						 &ctxt);
 		if (ret)
 			goto out;
 
+		ci->ci_inode->i_clusters = ctxt.new_i_clusters;
 		/* now all the clusters and extent blocks are freed.
 		 * only when the file's content is empty, should the tree depth
 		 * change.
 		 */
-		if (new_i_clusters == 0)
-			di->id2.i_list.l_tree_depth = 0;
+		if (ctxt.new_i_clusters == 0)
+			ci->ci_inode->id2.i_list.l_tree_depth = 0;
 
+		ret = ocfs2_zero_tail_for_truncate(ci, new_i_size);
+		if (ret)
+			goto out;
+
+		ci->ci_inode->i_size = new_i_size;
+		ret = ocfs2_write_cached_inode(fs, ci);
 	}
-
-	di->i_clusters = new_i_clusters;
-	di->i_size = new_i_size;
-	ret = ocfs2_write_inode(fs, ino, buf);
-
 out:
-	ocfs2_free(&buf);
-
+	if (ci)
+		ocfs2_free_cached_inode(fs, ci);
 	return ret;
 }
 

Modified: trunk/mkfs.ocfs2/mkfs.c
===================================================================
--- trunk/mkfs.ocfs2/mkfs.c	2007-08-24 23:30:25 UTC (rev 1405)
+++ trunk/mkfs.ocfs2/mkfs.c	2007-08-24 23:36:10 UTC (rev 1406)
@@ -116,8 +116,82 @@
 
 enum {
 	BACKUP_SUPER_OPTION = CHAR_MAX + 1,
+	FEATURE_LEVEL,
+	FEATURES_OPTION,
 };
 
+struct fs_feature_flags {
+	const char *ff_str;
+	/* this flag is the feature's own flag. */
+	fs_options ff_own_flags;
+	/*
+	 * this flag includes the feature's own flag and
+	 * all the other features' flag it depends on.
+	 */
+	fs_options ff_flags;
+};
+
+static struct fs_feature_flags ocfs2_supported_features[] = {
+	{
+		"local",
+		{0, OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT, 0},
+		{0, OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT, 0},
+	},
+	{
+		"sparse",
+		{0, OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC, 0},
+		{0, OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC, 0},
+	},
+	{
+		"backup-super",
+		{OCFS2_FEATURE_COMPAT_BACKUP_SB, 0, 0},
+		{OCFS2_FEATURE_COMPAT_BACKUP_SB, 0, 0},
+	},
+	{
+		"unwritten",
+		{0, 0, OCFS2_FEATURE_RO_COMPAT_UNWRITTEN},
+		{0, OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC,
+		 OCFS2_FEATURE_RO_COMPAT_UNWRITTEN},
+	},
+	{
+		NULL,
+		{0, 0, 0},
+		{0, 0, 0}
+	},
+};
+
+enum feature_level_indexes {
+	FEATURE_LEVEL_DEFAULT = 0,
+	FEATURE_LEVEL_MAX_COMPAT,
+	FEATURE_LEVEL_MAX_FEATURES,
+};
+
+struct feature_level_translation {
+	const char *fl_str;
+	enum feature_level_indexes fl_type;
+};
+
+static struct feature_level_translation ocfs2_feature_levels_table[] = {
+	{"default", FEATURE_LEVEL_DEFAULT},
+	{"max-compat", FEATURE_LEVEL_MAX_COMPAT},
+	{"max-features", FEATURE_LEVEL_MAX_FEATURES},
+	{NULL, FEATURE_LEVEL_DEFAULT},
+};
+
+static fs_options feature_level_defaults[] = {
+	{OCFS2_FEATURE_COMPAT_BACKUP_SB,
+	 OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC,
+	 0},  /* FEATURE_LEVEL_DEFAULT */
+
+	{OCFS2_FEATURE_COMPAT_BACKUP_SB,
+	 0,
+	 0}, /* FEATURE_LEVEL_MAX_COMPAT */
+
+	{OCFS2_FEATURE_COMPAT_BACKUP_SB,
+	 OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC,
+	 OCFS2_FEATURE_RO_COMPAT_UNWRITTEN}, /* FEATURE_LEVEL_MAX_FEATURES */
+};
+
 static uint64_t align_bytes_to_clusters_ceil(State *s,
 					     uint64_t bytes)
 {
@@ -505,6 +579,159 @@
 	}
 }
 
+/* Get the feature level according to the value set by "--fs-feature-level". */
+static void parse_feature_level_opts(char *progname, const char *typestr,
+				     enum feature_level_indexes *index)
+{
+	int i;
+
+	for(i = 0; ocfs2_feature_levels_table[i].fl_str; i++) {
+		if (strcmp(typestr,
+			  ocfs2_feature_levels_table[i].fl_str) == 0) {
+			*index = ocfs2_feature_levels_table[i].fl_type;
+			break;
+		}
+	}
+
+	if (!ocfs2_feature_levels_table[i].fl_str) {
+		com_err(progname, 0,
+			"unrecognized fs-feature-level:%s", typestr);
+		exit(1);
+	}
+}
+
+static void inline merge_features(fs_options *features,
+				  fs_options new_features)
+{
+	features->compat |= new_features.compat;
+	features->incompat |= new_features.incompat;
+	features->ro_compat |= new_features.ro_compat;
+}
+
+/*
+ * Parse the feature string given by the user in "--fs-features".
+ * Every feature the user wants to set is added to "feature_flags"
+ * (together with the features it depends on). Every feature the user
+ * wants to clear (prefixed with "no") is stored in "reverse_flags".
+ */
+static void parse_feature_opts(char *progname, const char *opts,
+			       fs_options *feature_flags,
+			       fs_options *reverse_flags)
+{
+	char *options, *token, *next;
+	int i, reverse;
+
+	memset(feature_flags, 0, sizeof(fs_options));
+	memset(reverse_flags, 0, sizeof(fs_options));
+
+	/* Work on a private copy: the string is split in place below. */
+	options = strdup(opts);
+	if (!options) {
+		com_err(progname, 0, "no memory to parse fs-features");
+		exit(1);
+	}
+	for (token = options; token && *token; token = next) {
+		/* Cut the current token off at the next comma, if any. */
+		next = strchr(token, ',');
+		if (next)
+			*next++ = '\0';
+
+		/* A leading "no" asks for the feature to be cleared. */
+		reverse = (strncmp(token, "no", 2) == 0);
+		if (reverse)
+			token += 2;
+
+		for(i = 0; ocfs2_supported_features[i].ff_str; i++) {
+			if (strcmp(token,
+				   ocfs2_supported_features[i].ff_str) == 0) {
+				if (!reverse)
+					merge_features(feature_flags,
+					ocfs2_supported_features[i].ff_flags);
+				else
+					merge_features(reverse_flags,
+					ocfs2_supported_features[i].ff_own_flags);
+				break;
+			}
+		}
+		if (!ocfs2_supported_features[i].ff_str) {
+			com_err(progname, 0,
+				"unrecognized fs-feature-string:%s", token);
+			exit(1);
+		}
+	}
+
+	free(options);
+}
+
+static int check_feature_flags(fs_options *fs_flags,
+			       fs_options *fs_r_flags)
+{
+	int ret = 1;
+
+	if (fs_r_flags->compat &&
+	    fs_flags->compat & fs_r_flags->compat)
+		ret = 0;
+	else if (fs_r_flags->incompat &&
+		 fs_flags->incompat & fs_r_flags->incompat)
+		ret = 0;
+	else if (fs_r_flags->ro_compat &&
+		 fs_flags->ro_compat & fs_r_flags->ro_compat)
+		ret = 0;
+
+	return ret;
+}
+
+/*
+ * Check and merge all the different features set by the user into
+ * s->feature_flags. Returns 1 on success, 0 if a feature is both set and
+ * cleared. level_set: features implied by the chosen feature level.
+ * feature_set: features the user set with "--fs-features".
+ * reverse_set: features the user wants to clear with "--fs-features".
+ */
+static int merge_feature_flags_with_level(State *s,
+					  fs_options *level_set,
+					  fs_options *feature_set,
+					  fs_options *reverse_set)
+{
+	int i;
+
+	/*
+	 * Check whether the user asked for a flag to be both set and cleared,
+	 * which is illegal. The feature_set and reverse_set both come from
+	 * "--fs-features", so they must not collide with each other.
+	 */
+	if (!check_feature_flags(feature_set, reverse_set))
+		return 0;
+
+	/* Start from the level's features and add the ones the user set. */
+	s->feature_flags = *level_set;
+	merge_features(&s->feature_flags, *feature_set);
+
+	/*
+	 * Remove every feature in the reverse set, plus any feature whose
+	 * ff_flags (own flag and dependencies) overlaps the reverse set.
+	 */
+	for(i = 0; ocfs2_supported_features[i].ff_str; i++) {
+		if ((reverse_set->compat &
+			ocfs2_supported_features[i].ff_flags.compat) ||
+		    (reverse_set->incompat &
+			ocfs2_supported_features[i].ff_flags.incompat) ||
+		    (reverse_set->ro_compat &
+			ocfs2_supported_features[i].ff_flags.ro_compat)) {
+			s->feature_flags.compat &=
+	    		~ocfs2_supported_features[i].ff_own_flags.compat;
+
+			s->feature_flags.incompat &=
+	    		~ocfs2_supported_features[i].ff_own_flags.incompat;
+
+			s->feature_flags.ro_compat &=
+	    		~ocfs2_supported_features[i].ff_own_flags.ro_compat;
+		}
+	}
+
+	return 1;
+}
+
 static State *
 get_state(int argc, char **argv)
 {
@@ -523,8 +750,10 @@
 	uint64_t val;
 	uint64_t journal_size_in_bytes = 0;
 	enum ocfs2_fs_types fs_type = FS_DEFAULT;
-	int mount = 0;
-	int no_backup_super = 0;
+	int mount = -1;
+	int no_backup_super = -1;
+	enum feature_level_indexes index = FEATURE_LEVEL_DEFAULT;
+	fs_options feature_flags ={0,0,0}, reverse_flags = {0,0,0};
 
 	static struct option long_options[] = {
 		{ "block-size", 1, 0, 'b' },
@@ -539,6 +768,8 @@
 		{ "force", 0, 0, 'F'},
 		{ "mount", 1, 0, 'M'},
 		{ "no-backup-super", 0, 0, BACKUP_SUPER_OPTION },
+		{ "fs-feature-level=", 1, 0, FEATURE_LEVEL },
+		{ "fs-features=", 1, 0, FEATURES_OPTION },
 		{ 0, 0, 0, 0}
 	};
 
@@ -673,6 +904,17 @@
 			no_backup_super = 1;
 			break;
 
+		case FEATURE_LEVEL:
+			parse_feature_level_opts(progname, optarg,
+						 &index);
+			break;
+
+		case FEATURES_OPTION:
+			parse_feature_opts(progname, optarg,
+					      &feature_flags,
+					      &reverse_flags);
+			break;
+
 		default:
 			usage(progname);
 			break;
@@ -735,10 +977,33 @@
 
 	s->fs_type = fs_type;
 
-	s->mount = mount;
+	if(!merge_feature_flags_with_level(s, &feature_level_defaults[index],
+					   &feature_flags, &reverse_flags)) {
+		com_err(s->progname, 0,
+			"Incompatible feature flags"
+			" were specified\n");
+		exit(1);
+	}
 
-	s->no_backup_super = no_backup_super;
+	if (s->feature_flags.incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT)
+		s->mount = MOUNT_LOCAL;
+	else
+		s->mount = MOUNT_CLUSTER;
+	if (s->feature_flags.compat & OCFS2_FEATURE_COMPAT_BACKUP_SB)
+		s->no_backup_super = 0;
+	else
+		s->no_backup_super = 1;
 
+
+	/* Here if the user set these flags explicitly, we will use them and
+	 * discard the setting in the features set.
+	 */
+	if (mount != -1)
+		s->mount = mount;
+
+	if (no_backup_super != -1)
+		s->no_backup_super = no_backup_super;
+
 	return s;
 }
 
@@ -851,6 +1116,8 @@
 	fprintf(stderr, "usage: %s [-b block-size] [-C cluster-size] "
 		"[-J journal-options]\n\t\t[-L volume-label] [-M mount-type] "
 		"[-N number-of-node-slots]\n\t\t[-T filesystem-type] [-HFqvV] "
+		"\n\t\t[--fs-feature-level=[default|max-compat|max-features]] "
+		"\n\t\t[--fs-features=[[no]sparse,...]] "
 		"[--no-backup-super] device [blocks-count]\n", progname);
 	exit(0);
 }
@@ -1716,7 +1983,6 @@
 		  SystemFileDiskRecord *root_rec, SystemFileDiskRecord *sys_rec)
 {
 	struct ocfs2_dinode *di;
-	uint32_t incompat;
 	uint64_t super_off = rec->fe_off;
 
 	di = do_malloc(s, s->blocksize);
@@ -1750,14 +2016,27 @@
 	di->id2.i_super.s_max_slots = s->initial_slots;
 	di->id2.i_super.s_first_cluster_group = s->first_cluster_group_blkno;
 
-	incompat = 0;
-	if (s->hb_dev)
-		incompat |= OCFS2_FEATURE_INCOMPAT_HEARTBEAT_DEV;
+	if (s->hb_dev) {
+		s->feature_flags.incompat =
+				 	OCFS2_FEATURE_INCOMPAT_HEARTBEAT_DEV;
+		s->feature_flags.compat = 0;
+		s->feature_flags.ro_compat = 0;
+	}
 
 	if (s->mount == MOUNT_LOCAL)
-		incompat |= OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT;
+		s->feature_flags.incompat |=
+					 OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT;
+	/*
+	 * Clear only the "backup_sb" bit here since it should be written
+	 * by format_backup_super, not by us. We have already set
+	 * "s->no_backup_super" according to the features in get_state,
+	 * so it is safe to drop the flag; all other compat bits are kept.
+	 */
+	s->feature_flags.compat &= ~OCFS2_FEATURE_COMPAT_BACKUP_SB;
 
-	di->id2.i_super.s_feature_incompat = incompat;
+	di->id2.i_super.s_feature_incompat = s->feature_flags.incompat;
+	di->id2.i_super.s_feature_compat = s->feature_flags.compat;
+	di->id2.i_super.s_feature_ro_compat = s->feature_flags.ro_compat;
 
 	strcpy(di->id2.i_super.s_label, s->vol_label);
 	memcpy(di->id2.i_super.s_uuid, s->uuid, 16);
@@ -1899,7 +2178,7 @@
 	if (rec->extent_len) {
 		di->id2.i_list.l_next_free_rec = 1;
 		di->id2.i_list.l_recs[0].e_cpos = 0;
-		di->id2.i_list.l_recs[0].e_clusters = clusters;
+		ocfs2_set_rec_clusters(0, &di->id2.i_list.l_recs[0], clusters);
 		di->id2.i_list.l_recs[0].e_blkno =
 			rec->extent_off >> s->blocksize_bits;
 	}

Modified: trunk/mkfs.ocfs2/mkfs.h
===================================================================
--- trunk/mkfs.ocfs2/mkfs.h	2007-08-24 23:30:25 UTC (rev 1405)
+++ trunk/mkfs.ocfs2/mkfs.h	2007-08-24 23:36:10 UTC (rev 1406)
@@ -192,6 +192,14 @@
 	SystemFileDiskRecord *record;
 };
 
+typedef struct _fs_options fs_options;
+
+struct _fs_options {
+	uint32_t compat;
+	uint32_t incompat;
+	uint32_t ro_compat;
+};
+
 typedef struct _State State;
 
 struct _State {
@@ -241,6 +249,8 @@
 	uint32_t first_cluster_group;
 	uint64_t first_cluster_group_blkno;
 
+	fs_options feature_flags;
+
 	enum ocfs2_fs_types fs_type;
 };
 

Modified: trunk/sizetest/sizes.txt
===================================================================
--- trunk/sizetest/sizes.txt	2007-08-24 23:30:25 UTC (rev 1405)
+++ trunk/sizetest/sizes.txt	2007-08-24 23:36:10 UTC (rev 1406)
@@ -1,6 +1,6 @@
 [off]	ocfs2_extent_rec    	[size]
 0x000	e_cpos              	+0x04
-0x004	e_clusters          	+0x04
+0x004	e_int_clusters         	+0x04
 0x008	e_blkno             	+0x08
 	Total               	0x010
 

Modified: trunk/sizetest/sizetest.c
===================================================================
--- trunk/sizetest/sizetest.c	2007-08-24 23:30:25 UTC (rev 1405)
+++ trunk/sizetest/sizetest.c	2007-08-24 23:36:10 UTC (rev 1406)
@@ -49,7 +49,7 @@
 	START_TYPE(ocfs2_extent_rec);
 
 	SHOW_OFFSET(struct ocfs2_extent_rec, e_cpos);
-	SHOW_OFFSET(struct ocfs2_extent_rec, e_clusters);
+	SHOW_OFFSET(struct ocfs2_extent_rec, e_int_clusters);
 	SHOW_OFFSET(struct ocfs2_extent_rec, e_blkno);
 
 	END_TYPE(struct ocfs2_extent_rec);

Modified: trunk/tunefs.ocfs2/remove_slot.c
===================================================================
--- trunk/tunefs.ocfs2/remove_slot.c	2007-08-24 23:30:25 UTC (rev 1405)
+++ trunk/tunefs.ocfs2/remove_slot.c	2007-08-24 23:36:10 UTC (rev 1406)
@@ -378,10 +378,6 @@
 	uint64_t offset = 0;
 	uint32_t wrote, count;
 
-	ret = ocfs2_extent_map_init(fs, ci);
-	if (ret)
-		goto out;
-
 #define BUFLEN	1048576
 	ret = ocfs2_malloc_blocks(fs->fs_io, (BUFLEN >> bs_bits), &buf);
 	if (ret)




More information about the Ocfs2-tools-commits mailing list