[Ocfs2-commits] mfasheh commits r1024 - in trunk/src: . inc

Tue Jun 8 18:52:30 CDT 2004

Author: mfasheh
Date: 2004-06-08 17:52:28 -0500 (Tue, 08 Jun 2004)
New Revision: 1024

Modified:
   trunk/src/alloc.c
   trunk/src/dir.c
   trunk/src/dlm.c
   trunk/src/file.c
   trunk/src/hash.c
   trunk/src/inc/io.h
   trunk/src/inc/ocfs.h
   trunk/src/inc/ocfs_journal.h
   trunk/src/inc/proto.h
   trunk/src/inode.c
   trunk/src/journal.c
   trunk/src/lockres.c
   trunk/src/namei.c
   trunk/src/nm.c
   trunk/src/super.c
   trunk/src/sysfile.c
Log:
* Fix a couple bugs in my 1st commit of the add-inode-to-handle stuff.

* Clean up find_files_on_disk to not do the read of the FE; all paths
  that use it now do the read *after* the find_files_on_disk call, when
  we can make sure to lock ourselves properly :)

* As a result, we no longer need to pass fe_bh to ocfs_iget in
  ocfs_lookup. Remove that parameter from the prototype and clean up the
  callers and callees of ocfs_iget to reflect that change.

* Remove trans_lock! yay, we can now do multiple parallel transactions.

* Revamp our locking scheme around I/O and abort changes to use a new
  semaphore on the inode private (ip_io_sem) instead of i_sem. This is
  far more flexible for our needs.

* Do the final bits of locking needed to ensure we lock around I/O and
  other buffer changes. Put some BUG()s in the bh_sem stuff (not in the
  replacement functions used if BH_SEM_DEBUG is enabled though!). This
  will help us catch any remaining cases where we trample on each other's
  buffers.

* Also, put a BUG in the acquire_lockres function for when we don't
  have ip_io_sem as it should now always be taken when we do that.

* Add a fix for a JBD race in journal_start and journal_stop with
  transaction variables like t_updates.

* Put in a few of the flags we'll eventually be using with the
  orphaned inode directories.

* Ext alloc file changes weren't being properly locked out, fix that.

* Local alloc moves were marked as sync transactions (rare) due to the
  old code requiring some changes to hit the journal sooner. We now 
  journal bitmap changes, so it's safe to turn that off. Should provide 
  a slight increase in performance.

* Various cleanups



Modified: trunk/src/alloc.c
===================================================================

--- trunk/src/alloc.c	2004-06-08 22:18:55 UTC (rev 1023)
+++ trunk/src/alloc.c	2004-06-08 22:52:28 UTC (rev 1024)
@@ -301,7 +301,7 @@
 	}
 
 	/* start the transaction here to preserve ordering with the
-	 * bitmap i_sems... */
+	 * bitmap io_sems... */
 	handle = ocfs_start_trans(osb, credits);
 	if (!handle) {
 		status = -ENOMEM;
@@ -315,13 +315,13 @@
 		  osb->vol_layout.root_int_off;
 	for (i = 0; i < OCFS_MAXIMUM_NODES; i++, lock_id += osb->sect_size) {
 		if (free_ext_node[i] != NULL) {
-			extnode_inode[i] = ocfs_iget(osb, lock_id, NULL);
+			extnode_inode[i] = ocfs_iget(osb, lock_id);
 			if (!extnode_inode[i]) {
 				status = -EINVAL;
 				LOG_ERROR_STATUS (status);
 				goto abort;
 			}
-			down(&extnode_inode[i]->i_sem);
+			down(&OCFS_I(extnode_inode[i])->ip_io_sem);
 
 			status = ocfs_acquire_lock (osb, lock_id,
 				 		    OCFS_DLM_EXCLUSIVE_LOCK,
@@ -329,7 +329,7 @@
 						    &ugly_hack_bh, 
 						    extnode_inode[i]);
 			if (status < 0) {
-				up(&extnode_inode[i]->i_sem);
+				up(&OCFS_I(extnode_inode[i])->ip_io_sem);
 				iput(extnode_inode[i]);
 				extnode_inode[i] = NULL;
 				if (status != -EINTR)
@@ -350,14 +350,14 @@
 			LOG_ERROR_STATUS (status);
 			goto abort;
 		}
-		down(&vol_inode->i_sem);
+		down(&OCFS_I(vol_inode)->ip_io_sem);
 
 		status = ocfs_acquire_lock (osb, OCFS_BITMAP_LOCK_OFFSET,
 					    OCFS_DLM_EXCLUSIVE_LOCK,
 					    FLAG_FILE_CREATE,
 					    &globalbh, vol_inode);
 		if (status < 0) {
-			up(&vol_inode->i_sem);
+			up(&OCFS_I(vol_inode)->ip_io_sem);
 			iput(vol_inode);
 			vol_inode = NULL;
 
@@ -444,12 +444,12 @@
 
 	for (i = 0; i < OCFS_MAXIMUM_NODES; i++) {
 		if (extnode_inode[i]) {
-			up(&extnode_inode[i]->i_sem);
+			up(&OCFS_I(extnode_inode[i])->ip_io_sem);
 			iput(extnode_inode[i]);
 		}
 	}
 	if (vol_inode) {
-		up(&vol_inode->i_sem);
+		up(&OCFS_I(vol_inode)->ip_io_sem);
 		iput(vol_inode);
 	}
 
@@ -2688,6 +2688,9 @@
 
 	OCFS_ASSERT (osb);
 
+	if ((bitmap_inode && !lock_bh) || (lock_bh && !bitmap_inode))
+		BUG();
+
 	if (!bitmap_inode) {
 		bitmap_inode = igrab(osb->system_inodes[GLOBAL_BITMAP_SYSTEM_INODE]);
 		if (!bitmap_inode) {
@@ -2714,8 +2717,8 @@
 				LOG_ERROR_STATUS (status);
 			goto leave;
 		}
-		ocfs_journal_add_lock(handle, OCFS_DLM_EXCLUSIVE_LOCK, 
-				      0, bh, bitmap_inode);
+		ocfs_handle_add_lock(handle, OCFS_DLM_EXCLUSIVE_LOCK, 
+				      0, bh, bitmap_inode, 1);
 	}
 
 	status = ocfs_journal_access(handle, bh, OCFS_JOURNAL_ACCESS_WRITE);
@@ -2914,8 +2917,18 @@
 	/* Allocate a block of size blocksize from the relevant file/bitmap */
 	OCFS_ASSERT (blockSize);
 
+	if (down_trylock(&OCFS_I(inode)->ip_io_sem) == 0) {
+		LOG_TRACE_ARGS("Uhoh, asking me to allocate on an unlocked system file! (type = %u, i_ino = %lu)\n", Type, inode->i_ino);
+		BUG();
+	}
+
 	/* Get a lock on the file */
 	lockId = (bm_file * osb->sect_size) + osb->vol_layout.root_int_off;
+	if (lockId != GET_INODE_FEOFF(inode)) {
+		LOG_TRACE_ARGS("lockId = %llu, offset = %llu\n", lockId, 
+			       GET_INODE_FEOFF(inode));
+		BUG();
+	}
 	status = ocfs_acquire_lock (osb, lockId, OCFS_DLM_EXCLUSIVE_LOCK,
 			     FLAG_FILE_CREATE, &bh, inode);
 	if (status < 0) {
@@ -2923,9 +2936,9 @@
 		goto leave;
 	}
 
-	ocfs_journal_add_lock(handle, OCFS_DLM_EXCLUSIVE_LOCK, 
+	ocfs_handle_add_lock(handle, OCFS_DLM_EXCLUSIVE_LOCK, 
 			      FLAG_FILE_CREATE, 
-			      bh, inode);
+			      bh, inode, 1);
 
 	status = ocfs_ugly_hack(handle, bh);
 	if (status < 0) {
@@ -3493,10 +3506,6 @@
 				ocfs_shutdown_local_alloc(osb, NULL, 0, 
 							  0);
 
-				/* we want to make sure an empty alloc
-				 * hits disk. */
-				ocfs_handle_set_sync(handle, 1);
-
 				/* the bh might not have been dirtied to
 				 * the journal yet. */
 				tmpstat = ocfs_journal_dirty(handle, 
@@ -3547,8 +3556,8 @@
 			goto bail;
 		}
 
-		ocfs_journal_add_lock(handle, OCFS_DLM_EXCLUSIVE_LOCK, 0,
-				      main_bm_bh, main_bm_inode);
+		ocfs_handle_add_lock(handle, OCFS_DLM_EXCLUSIVE_LOCK, 0,
+				      main_bm_bh, main_bm_inode, 1);
 
 		status = ocfs_sync_local_to_main(osb, &(handle->commit_bits),
 						 NULL, 0);
@@ -3655,6 +3664,10 @@
 		use_global = 0;
 
 	if (!use_global) {
+		if (handle->flags & OCFS_HANDLE_LOCAL_ALLOC) {
+			printk("whoa, I already have local alloc sem!?!\n");
+			BUG();
+		}
 		down(&osb->local_alloc_sem);
 		handle->flags |= OCFS_HANDLE_LOCAL_ALLOC;
 		status = ocfs_find_space_from_local(osb, bitswanted, 

Modified: trunk/src/dir.c
===================================================================
--- trunk/src/dir.c	2004-06-08 22:18:55 UTC (rev 1023)
+++ trunk/src/dir.c	2004-06-08 22:52:28 UTC (rev 1024)
@@ -168,10 +168,9 @@
 
 
 /* ocfs_find_files_on_disk()
- * NOTE: this should always be called with inode->i_sem taken!
+ * NOTE: this should always be called with parent dir ip_io_sem taken!
  */
-/* parent off changed to file entry offset of parent! */
-int ocfs_find_files_on_disk (ocfs_super * osb, struct dentry * dentry, struct buffer_head ** fe_bh, struct inode *inode, struct inode *file_inode, int take_lock, struct buffer_head **dirent_bh, struct ocfs2_dir_entry **dirent)
+int ocfs_find_files_on_disk (ocfs_super * osb, struct dentry * dentry, __u64 *fe_off, struct inode *inode, int take_lock, struct buffer_head **dirent_bh, struct ocfs2_dir_entry **dirent)
 {
 	int status = -ENOENT;
 	int tmpstat;
@@ -180,7 +179,7 @@
 	__u32 lock_type = OCFS_DLM_ENABLE_CACHE_LOCK;
 	__u64 parent_off = GET_INODE_FEOFF(inode);
 	
-	LOG_ENTRY_ARGS ("(osb=%p, parent=%llu, dentry=%p, fe_bh=%p, inode=%p)\n", osb, parent_off, dentry, fe_bh, inode);
+	LOG_ENTRY_ARGS ("(osb=%p, parent=%llu, dentry=%p, inode=%p)\n", osb, parent_off, dentry, inode);
 
 	if (take_lock) {
 		/* Get a lock on the directory... */
@@ -200,12 +199,9 @@
 	if (!*dirent_bh || !*dirent)
 		goto leave;
 
-	status = ocfs_read_bh(osb, (*dirent)->inode, fe_bh, OCFS_BH_CACHED, file_inode);
-	if (status < 0) {
-		brelse(*dirent_bh);
-		LOG_ERROR_STATUS(status);
-		status = -ENOENT;
-	}
+	*fe_off = (*dirent)->inode;
+
+	status = 0;
 leave:
 
 	if (take_lock && lock_acq)

Modified: trunk/src/dlm.c
===================================================================
--- trunk/src/dlm.c	2004-06-08 22:18:55 UTC (rev 1023)
+++ trunk/src/dlm.c	2004-06-08 22:52:28 UTC (rev 1024)
@@ -684,8 +684,8 @@
 			status = ocfs_journal_dirty(handle, *bh);
 			lockres->lock_holders++;
 #warning I hope these lock flags are alright.
-			ocfs_journal_add_lock(handle, lockres->lock_type, 0, 
-					      *bh, inode);
+			ocfs_handle_add_lock(handle, lockres->lock_type, 0, 
+					     *bh, inode, 0);
 		} else
 			status = ocfs_write_bh (osb, *bh, 0, inode);
 		if (status < 0) 

Modified: trunk/src/file.c
===================================================================
--- trunk/src/file.c	2004-06-08 22:18:55 UTC (rev 1023)
+++ trunk/src/file.c	2004-06-08 22:52:28 UTC (rev 1024)
@@ -74,7 +74,7 @@
 /* 
  * ocfs_inode_notify_open()
  * 
- * you should be holding i_sem and priv_sem in this function. 
+ * you should be holding io_sem and priv_sem in this function. 
  * If needed add ourselves to the open map. Only call this 
  * on 1st open of a file. Marks the oin as "in use"
  */
@@ -177,7 +177,7 @@
 
 	/* kch - for an open request we are already given the 
 	* inode, and therefore we are given the oin too */
-	down(&inode->i_sem);
+	down (&(OCFS_I(inode)->ip_io_sem));
 	down (&(OCFS_I(inode)->priv_sem));
 	have_oin_sem = 1;
 
@@ -186,14 +186,14 @@
 		status = ocfs_read_bh(osb, GET_INODE_FEOFF(inode), &fe_bh, 
 				      OCFS_BH_CACHED, inode);
 		if (status < 0) {
-			up(&inode->i_sem);
+			up(&OCFS_I(inode)->ip_io_sem);
 			LOG_ERROR_STATUS(status);
 			goto leave;
 		}
 
 		status = ocfs_inode_notify_open(osb, fe_bh, NULL, inode);
 		if (status < 0) {
-			up(&inode->i_sem);
+			up(&OCFS_I(inode)->ip_io_sem);
 			LOG_ERROR_STATUS(status);
 			if (status != -EINTR) {
 				LOG_ERROR_ARGS("Open request made for nonexistent "
@@ -207,7 +207,7 @@
 
 		status = ocfs_inode_fill_ext_map (osb, fe_bh, inode);
 		if (status < 0) {
-			up(&inode->i_sem);
+			up(&OCFS_I(inode)->ip_io_sem);
 			LOG_ERROR_STATUS(status);
 			goto leave;
 		}
@@ -217,13 +217,13 @@
 		status = ocfs_verify_update_inode (osb, inode, &truncate_pages,
 						   0);
 		if (status < 0) {
-			up(&inode->i_sem);
+			up(&OCFS_I(inode)->ip_io_sem);
 			LOG_ERROR_STATUS (status);
 			goto leave;
 		}
 	}
 
-	up(&inode->i_sem);
+	up(&OCFS_I(inode)->ip_io_sem);
 	/* yes, hold onto priv_sem. */
 
 	if (OCFS_I(inode)->open_hndl_cnt > 0) {
@@ -660,8 +660,8 @@
 		if (status < 0) {
 			ocfs_abort_trans(handle);
 		} else {
-			ocfs_journal_add_lock(handle, locktype, lockFlags, 
-					      bh, inode);
+			ocfs_handle_add_lock(handle, locktype, lockFlags, 
+					     bh, inode, 0);
 			have_disk_lock = 0;
 
 			ocfs_commit_trans(handle);
@@ -760,10 +760,12 @@
 
 	if (OCFS_I(inode)->needs_verification) {
 		LOG_TRACE_STR ("OIN_NEEDS_VERIFICATION");
+		down (&(OCFS_I(inode)->ip_io_sem));
 		down (&(OCFS_I(inode)->priv_sem));
 		status = ocfs_verify_update_inode (osb, inode, &needs_trunc, 
 						   0);
 		up (&(OCFS_I(inode)->priv_sem));
+		up (&(OCFS_I(inode)->ip_io_sem));
 		if (needs_trunc)
 			ocfs_truncate_inode_pages(inode, 0);
 		if (status < 0) {
@@ -800,8 +802,9 @@
 		LOG_TRACE_ARGS
 		    ("Will need more allocation: have=%llu, need=%llu\n",
 		     OCFS_I(inode)->alloc_size, newsize);
-
+		down(&OCFS_I(inode)->ip_io_sem);
 		status = ocfs_extend_file (osb, newsize, GET_INODE_FEOFF(inode), NULL, inode, NULL);
+		up(&OCFS_I(inode)->ip_io_sem);
 		if (status < 0) {
 			if (status != -EINTR && status != -ENOSPC) {
 				LOG_ERROR_STATUS (status);
@@ -888,12 +891,12 @@
 
 	if (OCFS_I(inode)->needs_verification) {
 		/* yay, locking hell! */
-		down(&inode->i_sem);
+		down(&OCFS_I(inode)->ip_io_sem);
 		down (&(OCFS_I(inode)->priv_sem));
 		status = ocfs_verify_update_inode (osb, inode, &needs_trunc, 
 						   0);
 		up (&(OCFS_I(inode)->priv_sem));
-		up(&inode->i_sem);
+		up(&OCFS_I(inode)->ip_io_sem);
 		if (needs_trunc)
 			ocfs_truncate_inode_pages(inode, 0);
 		if (status < 0) {
@@ -1127,7 +1130,6 @@
 	OCFS_BH_PUT_DATA(bh);
 	fileEntry = NULL;
 
-
 	if (passed_handle == NULL) {
 		credits = ocfs_calc_extend_credits(((__u32) allocSize), 
 						   osb->vol_layout.cluster_size);
@@ -1292,8 +1294,8 @@
 			} else {
 				lockFlags |= FLAG_FILE_UPDATE_OIN;
 
-				ocfs_journal_add_lock(handle, locktype,
-						      lockFlags, bh, inode);
+				ocfs_handle_add_lock(handle, locktype,
+						     lockFlags, bh, inode, 0);
 				have_disk_lock = 0;
 
 				ocfs_commit_trans(handle);
@@ -1344,10 +1346,7 @@
 
 	osb = OCFS_SB(inode->i_sb);
 
-	/* NOTE: Other filesystems get away without locking this, but
-	 * we're clustered and this has to hit disk now... */
-	if (!(attr->ia_valid & ATTR_SIZE))
-		down(&inode->i_sem);
+	down(&OCFS_I(inode)->ip_io_sem);
 
 	if (!dentry->d_parent || !dentry->d_parent->d_inode) {
 		LOG_ERROR_STR ("bad inode or root inode");
@@ -1448,8 +1447,7 @@
 	inode_setattr (inode, attr);
 
 bail:
-	if (!(attr->ia_valid & ATTR_SIZE))
-		up(&inode->i_sem);
+	up(&OCFS_I(inode)->ip_io_sem);
 
 #ifndef BH_SEM_LEAK_CHECKING
 	if (error < 0)

Modified: trunk/src/hash.c
===================================================================
--- trunk/src/hash.c	2004-06-08 22:18:55 UTC (rev 1023)
+++ trunk/src/hash.c	2004-06-08 22:52:28 UTC (rev 1024)
@@ -251,6 +251,7 @@
 #ifdef VERBOSE_BH_SEM
 		LOG_TRACE_ARGS("need to wait... modified and pid is %d\n", sem->s_pid);
 #endif
+		LOG_ERROR_ARGS("Uhoh, read lock wanted on modified buffer! (pid=%d, block=%lu)\n", sem->s_pid, bh->b_blocknr);
 		ret = OCFS_BH_SEM_WAIT_ON_MODIFY;
 	} else {
 #ifdef VERBOSE_BH_SEM
@@ -285,8 +286,9 @@
 			/* refcount as if it weren't modified */
 			ocfs_bh_sem_get(sem);
 		} else if (sem->s_pid != current->pid) {
-			LOG_TRACE_ARGS("need to wait... modified and pid is %d\n", sem->s_pid);
+//			LOG_TRACE_ARGS("need to wait... modified and pid is %d\n", sem->s_pid);
 			ret = OCFS_BH_SEM_WAIT_ON_MODIFY;
+			LOG_ERROR_ARGS("Uhoh, write lock wanted on modified buffer! (pid=%d, block=%lu)\n", sem->s_pid, bh->b_blocknr);
 		}
 	} else {
 		//LOG_TRACE_ARGS("buffer NOT modified\n");

Modified: trunk/src/inc/io.h
===================================================================
--- trunk/src/inc/io.h	2004-06-08 22:18:55 UTC (rev 1023)
+++ trunk/src/inc/io.h	2004-06-08 22:52:28 UTC (rev 1024)
@@ -115,6 +115,8 @@
 
 	while (1) {
 		if (ocfs_bh_sem_lock(bh) == OCFS_BH_SEM_WAIT_ON_MODIFY) {
+			BUG();
+
 			ocfs_bh_sem_unlock(bh);
 			wait_on_buffer_modified(bh);
 		} else {
@@ -175,6 +177,7 @@
 			       "this process is not the lock "
 			       "holder!\n");
 #endif
+			BUG();
 			ocfs_bh_sem_unlock(bh);
 			wait_on_buffer_modified(bh);
 		} else {
@@ -205,6 +208,8 @@
 		       "this process is not the lock "
 		       "holder!\n");
 #endif
+		LOG_ERROR_STR("Trylock about to BUG()");
+		BUG();
 		ocfs_bh_sem_unlock(bh);
 		return NULL;
 	}

Modified: trunk/src/inc/ocfs.h
===================================================================
--- trunk/src/inc/ocfs.h	2004-06-08 22:18:55 UTC (rev 1023)
+++ trunk/src/inc/ocfs.h	2004-06-08 22:52:28 UTC (rev 1024)
@@ -272,7 +272,7 @@
 #define  FLAG_FILE_CREATE_DIR     0x00000040
 #define  FLAG_FILE_UPDATE_OIN     0x00000080
 #define  FLAG_FILE_RELEASE_MASTER 0x00000100
-#define  FLAG_FILE_UNUSED2        0x00000200
+#define  FLAG_DROP_LINK           0x00000200
 #define  FLAG_CHANGE_MASTER       0x00000400
 #define  FLAG_ADD_OIN_MAP         0x00000800
 #define  FLAG_DIR                 0x00001000
@@ -1106,18 +1106,13 @@
 	__u64 last_upd_seq_num;
 };
 
-/* OCFS2 Inode Private Data
- *
- * feoff/voteoff can change during rename. Luckily, rename takes a ton
- * of locks and does several checks, so you're safe reading these values
- * if any of the following is true:
- *  1) you have i_sem
- *  2) you have priv_sem
- *  3) open_hndl_cnt > 0 
- */
+struct _ocfs_journal_handle;
+
+/* OCFS2 Inode Private Data */
 typedef struct _ocfs_inode_private
 {
-	/* always valid, just a simple back pointer. */
+	/* inode and feoff fields never change and are always safe to
+	 * read. */
 	struct inode     *inode;
 
 	__u64             feoff;
@@ -1125,12 +1120,24 @@
 	/* These fields are protected by priv_sem */
 	struct semaphore  priv_sem;
 	__u32             open_hndl_cnt;
-	int              needs_verification;
+	int               needs_verification;
 	__u64             chng_seq_num;
 	ocfs_extent_map   map;
 	__s64             alloc_size;
 	__u32             oin_flags;
 
+	/* This protects io on the metadata buffers related to this
+	 * inode. We also consider an "abort_trans" an I/O as it will
+	 * revert the buffer back to a previous state. */
+	struct semaphore  ip_io_sem;
+
+	/* Used by the journalling code to attach an inode to a
+	 * handle.  These are protected by ip_io_sem in order to lock
+	 * out other I/O to the inode until we either commit or
+	 * abort. */
+	struct list_head            ip_handle_list;
+	struct _ocfs_journal_handle *ip_handle;
+
 	/* inode_extend_sem locks out extends on behalf of other nodes. */
 	struct semaphore  inode_extend_sem;
 
@@ -1152,8 +1159,6 @@
 
 	ocfs_lock_res     i_lockres;
 	__u32 		  i_dir_start_lookup;
-
-	struct list_head  handle_list;
 } ocfs_inode_private;
 
 /* Eventually, the 'flags' and 'oin_flags' fields need to be
@@ -1205,9 +1210,6 @@
 	return (unsigned long)((off >> sb->s_blocksize_bits) & (__u64)ULONG_MAX);
 }
 
-
-
-#warning take this out when all the lockres stuff checks out
 #define GET_INODE_LOCKRES(i) ({ if (i==NULL) BUG(); (&(OCFS_I(i)->i_lockres)); })
 
 typedef enum _ocfs_vol_state
@@ -1392,7 +1394,6 @@
 	__u32 cfg_numblocks;
 	struct semaphore publish_lock;  /* protects r/w to publish sector */
 	atomic_t node_req_vote;         /* set when node's vote req pending */
-	struct semaphore trans_lock;	/* serializes transactions */
 	int publish_dirty;
 	struct list_head needs_flush_head;
 	wait_queue_head_t flush_event;
@@ -2044,19 +2045,6 @@
 	de->file_type = ocfs_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
 }
 
-
-
-typedef struct _ocfs_find_inode_args
-{
-	__u64 feoff;
-	struct buffer_head *fe_bh;
-	unsigned long ino;
-	__u32 flags;
-}
-ocfs_find_inode_args;
-
-#define OCFS_FIND_INODE_FLAG_SYSFILE              0x00000002
-
 /* timeout structure taken from Ben's aio.c */
 typedef struct _ocfs_timeout {
 	struct timer_list	timer;
@@ -2159,23 +2147,6 @@
 	return ret;
 }
 
-
-
-/*  
- *  Trans Lock:
- *  Right now OCFS2 only supports a single transaction at a
- *  time. Transactions are locked out by using trans_lock. 
- */
-static inline void ocfs_take_trans_lock(ocfs_super *osb)
-{
-	down(&osb->trans_lock);
-}
-
-static inline void ocfs_release_trans_lock(ocfs_super *osb)
-{
-	up(&osb->trans_lock);
-}
-
 typedef struct _ocfs_journal_handle ocfs_journal_handle;
 
 #include "proto.h"

Modified: trunk/src/inc/ocfs_journal.h
===================================================================
--- trunk/src/inc/ocfs_journal.h	2004-06-08 22:18:55 UTC (rev 1023)
+++ trunk/src/inc/ocfs_journal.h	2004-06-08 22:52:28 UTC (rev 1024)
@@ -46,8 +46,6 @@
 struct _ocfs_file_entry;
 struct _ocfs_journal_handle;
 
-/* most of the ocfs_journal structure is protected by the
- * trans_lock. BEWARE. */
 typedef struct _ocfs_journal ocfs_journal;
 struct _ocfs_journal {
 	enum ocfs_journal_state   state;      /* Journals current state   */
@@ -71,12 +69,6 @@
 						 to access file entry	  */
 	atomic_t                  num_trans;  /* Number of transactions 
 					       * currently in the system. */
-	struct _ocfs_journal_handle *curr;    /* pointer to currently
-					       * running handle. In
-					       * the future when we do
-					       * multiple concurrent
-					       * transactions this may
-					       * become a list.*/
 	/* locking order: trans_lock -> commit_sem -> journal.curr.list_lock */
 	struct semaphore          commit_sem; /* protects *everything*
 					       * in the commited list
@@ -86,6 +78,7 @@
 	struct list_head          commited;   /* doubly linked list of all
 					       * commited handles awaiting
 					       * checkpointing.           */
+	struct rw_semaphore       trans_barrier;
 };
 
 typedef struct _ocfs_journal_lock ocfs_journal_lock;
@@ -94,6 +87,7 @@
 	__u32 flags;
 	struct buffer_head *bh;
 	struct inode *inode;
+	int req_io_sem;
 	struct list_head lock_list;
 };
 
@@ -120,7 +114,7 @@
 	int                 num_buffs;
 	struct buffer_head  **buffs;
 
-	/* The following three fields are for ocfs_journal_add_lock */
+	/* The following three fields are for ocfs_handle_add_lock */
 	spinlock_t          list_lock; /* Used to protect the 'locks'
 					* list. Only used if the
 					* handle is the same as
@@ -242,9 +236,9 @@
  *                          buffer. Will have to call ocfs_journal_dirty once
  *                          we've actually dirtied it. Type is one of . or .
  *  ocfs_journal_dirty    - Mark a journalled buffer as having dirty data.
- *  ocfs_journal_add_lock - Sometimes we need to delay lock release
+ *  ocfs_handle_add_lock  - Sometimes we need to delay lock release
  *                          until after a transaction has been completed. Use
- *                          ocfs_journal_add_lock to indicate that a lock needs
+ *                          ocfs_handle_add_lock to indicate that a lock needs
  *                          to be released at the end of that handle. Locks 
  *                          will be released in the order that they are added. 
  *  ocfs_handle_add_inode - Add a locked inode to a transaction.
@@ -293,10 +287,11 @@
  */
 int                  ocfs_journal_dirty(ocfs_journal_handle *handle, 
 					struct buffer_head *bh);
-void                 ocfs_journal_add_lock(ocfs_journal_handle *handle, 
-					   __u32 type, __u32 flags, 
-					   struct buffer_head *bh, 
-					   struct inode *inode);
+void                 ocfs_handle_add_lock(ocfs_journal_handle *handle, 
+					  __u32 type, __u32 flags, 
+					  struct buffer_head *bh, 
+					  struct inode *inode,
+					  int req_io_sem);
 /*
  * Some transactions require us to leave inodes in a locked state
  * until we either commit or abort because the buffer state can change

Modified: trunk/src/inc/proto.h
===================================================================
--- trunk/src/inc/proto.h	2004-06-08 22:18:55 UTC (rev 1023)
+++ trunk/src/inc/proto.h	2004-06-08 22:52:28 UTC (rev 1024)
@@ -102,10 +102,8 @@
 /* dir.c */
 int empty_dir(struct inode *inode);  /* FIXME: to namei.c */
 int ocfs_find_files_on_disk(ocfs_super *osb, struct dentry *dentry,
-			    struct buffer_head **fe_bh,
-			    struct inode *inode,
-			    struct inode *file_inode, int take_lock,
-			    struct buffer_head **dirent_bh,
+			    __u64 *fe_off, struct inode *inode,
+			    int take_lock, struct buffer_head **dirent_bh,
 			    struct ocfs2_dir_entry **dirent);
 int ocfs_readdir(struct file *filp, void *dirent, filldir_t filldir);
 
@@ -228,8 +226,7 @@
 			       struct inode * inode, int block,
 			       int create, int *err, int reada);
 void ocfs_clear_inode(struct inode *inode);
-struct inode *ocfs_iget(ocfs_super *osb, __u64 feoff, 
-			struct buffer_head *fe_bh);
+struct inode *ocfs_iget(ocfs_super *osb, __u64 feoff);
 int ocfs_inode_init_private(struct inode *inode);
 int ocfs_inode_revalidate(struct dentry *dentry);
 void ocfs_populate_inode(struct inode *inode, ocfs_file_entry *fe,

Modified: trunk/src/inode.c
===================================================================
--- trunk/src/inode.c	2004-06-08 22:18:55 UTC (rev 1023)
+++ trunk/src/inode.c	2004-06-08 22:52:28 UTC (rev 1024)
@@ -34,6 +34,16 @@
 
 extern struct semaphore recovery_list_sem;
 
+typedef struct _ocfs_find_inode_args
+{
+	__u64 feoff;
+	unsigned long ino;
+	__u32 flags;
+}
+ocfs_find_inode_args;
+
+#define OCFS_FIND_INODE_FLAG_SYSFILE              0x00000002
+
 static int ocfs_readpage (struct file *file, struct page *page);
 static int ocfs_prepare_write (struct file *file, struct page *page, unsigned from, unsigned to);
 static int ocfs_commit_write (struct file *file, struct page *page, unsigned from, unsigned to);
@@ -146,25 +156,16 @@
 
 /* 
  * ocfs_iget()
- *
- * Not all fields are required, pick your poison:
- *   * fe_bh only -- voteoff and feoff should both be zero then.
- *   * voteoff and feoff -- fe_bh can be NULL. 
- *     If AND ONLY IF the inode has no file entry (as in the main bitmap), 
- *     are you allowed to have feoff = 0.
- * 
- *   If you give me both, I'll prefer fe_bh.
+ * feoff is *required*
  */
-struct inode *ocfs_iget(ocfs_super *osb, __u64 feoff, 
-			struct buffer_head *fe_bh)
+struct inode *ocfs_iget(ocfs_super *osb, __u64 feoff)
 {
 	struct inode *inode = NULL;
 	struct super_block *sb = osb->sb;
-	ocfs_file_entry *fe;
 	ocfs_find_inode_args args;
 	__u32 flags = 0;
 
-	LOG_ENTRY_ARGS("(feoff = %llu, fe_bh = %p)\n", feoff, fe_bh);
+	LOG_ENTRY_ARGS("(feoff = %llu)\n", feoff);
 
 	/* Shortcut: if they ask for the root dirnode, just return
 	 * it. */
@@ -194,22 +195,6 @@
 		goto bail;
 	}
 
-	/* Ok, lets try to be smart here. We need a very specific set
-	 * of arguments to get our inode. Figure these out from the
-	 * available data. */
-	if (fe_bh) {
-		/* best case -- we can figure out what we need from
-		 * the file entry! */
-		fe = OCFS_BH_GET_DATA_READ(fe_bh);
-		if (!IS_VALID_FILE_ENTRY(fe)) {
-			OCFS_BH_PUT_DATA(fe_bh);
-			LOG_ERROR_STATUS(-EINVAL);
-			goto bail;
-		}
-		feoff = fe->this_sector;
-		OCFS_BH_PUT_DATA(fe_bh);
-	}
-
 	/* Ok. By now we've either got the offsets passed to us by the
 	 * caller, or we just pulled them off the bh. Lets do some
 	 * sanity checks to make sure they're OK. */
@@ -223,7 +208,6 @@
 		flags |= OCFS_FIND_INODE_FLAG_SYSFILE;
 
 	args.feoff = feoff;
-	args.fe_bh = fe_bh;
 	args.flags = flags;
 	args.ino = ino_from_off(sb, feoff);
 
@@ -285,7 +269,6 @@
 {
 	ocfs_find_inode_args *args = NULL;
 	int ret = 0;
-	ocfs_file_entry *fe = NULL;
 
 	LOG_ENTRY_ARGS ("(0x%p, %lu, 0x%p)\n", inode, ino, opaque);
 	
@@ -303,8 +286,6 @@
 
 	ret = 1;
 bail:
-	if (fe)
-		OCFS_BH_PUT_DATA(args->fe_bh);
 	LOG_EXIT_INT (ret);
 	return ret;
 }				/* ocfs_find_inode */
@@ -337,8 +318,11 @@
 	i->open_hndl_cnt = 0;
 	ocfs_extent_map_init (&i->map);
 	INIT_LIST_HEAD(&i->recovery_list);
-	INIT_LIST_HEAD(&i->handle_list);
+	INIT_LIST_HEAD(&i->ip_handle_list);
+	i->ip_handle = NULL;
 
+	init_MUTEX(&i->ip_io_sem);
+
 	/* These should be set in read_inode2. */
 	i->alloc_size = 0ULL;
 	i->feoff = 0ULL;
@@ -509,19 +493,15 @@
 	feoff = args->feoff;
 	sysfile = (args->flags & OCFS_FIND_INODE_FLAG_SYSFILE);
 
-	/* Uhoh, they didn't give us a buffer. Read the FE off
-	 * disk. This is safe because the kernel only does one
-	 * read_inode2 for a new inode, and if it doesn't exist yet
-	 * then nobody can be working on it! */
-	if (!args->fe_bh) {
-		status = ocfs_read_bh(osb, args->feoff, &bh, 0, NULL);
-		if (status < 0) {
-			LOG_ERROR_STATUS(status);
-			make_bad_inode (inode);
-			goto bail;
-		}
-	} else
-		bh = args->fe_bh;
+	/* Read the FE off disk. This is safe because the kernel only
+	 * does one read_inode2 for a new inode, and if it doesn't
+	 * exist yet then nobody can be working on it! */
+	status = ocfs_read_bh(osb, args->feoff, &bh, 0, NULL);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		make_bad_inode (inode);
+		goto bail;
+	}
 
 	fe = OCFS_BH_GET_DATA_READ(bh);
 
@@ -568,7 +548,7 @@
 	if (fe)
 		OCFS_BH_PUT_DATA(bh);
 
-	if (args && !args->fe_bh && bh)
+	if (args && bh)
 		brelse(bh);
 
 	LOG_EXIT ();
@@ -592,7 +572,6 @@
 {
 	ocfs_find_inode_args *args = NULL;
 	int ret = 0;
-	ocfs_file_entry *fe = NULL;
 
 	LOG_ENTRY_ARGS ("(0x%p, %lu, %llu, 0x%p)\n", inode, inode->i_ino, GET_INODE_FEOFF(inode), opaque);
 
@@ -612,8 +591,6 @@
 
 	ret = 1;
 bail:
-	if (fe)
-		OCFS_BH_PUT_DATA(args->fe_bh);
 	LOG_EXIT_INT (ret);
 	return ret;
 }				/* ocfs_find_actor */
@@ -684,7 +661,7 @@
 	osb = OCFS_SB(inode->i_sb);
 
 	if (!inode->u.generic_ip) {
-		LOG_ERROR_ARGS("inode %llu has no generic_ip!\n", GET_INODE_FEOFF(inode));
+		LOG_ERROR_ARGS("inode %lu has no generic_ip!\n", inode->i_ino);
 		goto bail;
 	}
 
@@ -1727,7 +1704,7 @@
 
 	osb = OCFS_SB(inode->i_sb);
 
-	down (&inode->i_sem);
+	down (&(OCFS_I(inode)->ip_io_sem));
 	down (&(OCFS_I(inode)->priv_sem));
 
 	if (INODE_DELETED(inode)) {
@@ -1756,7 +1733,7 @@
 
 bail:
 	up (&(OCFS_I(inode)->priv_sem));
-	up (&inode->i_sem);
+	up (&(OCFS_I(inode)->ip_io_sem));
 
 	if (needs_trunc)
 		ocfs_truncate_inode_pages(inode, 0);

Modified: trunk/src/journal.c
===================================================================
--- trunk/src/journal.c	2004-06-08 22:18:55 UTC (rev 1023)
+++ trunk/src/journal.c	2004-06-08 22:52:28 UTC (rev 1024)
@@ -38,21 +38,7 @@
 	TRANS_CACHE
 } release_locks_action;
 
-/*  
- *  Trans Lock:
- *  Right now OCFS2 only supports a single transaction at a
- *  time. Transactions are locked out by using trans_lock. 
- */
-#define ocfs_take_trans_lock(osb) 					\
-	do {								\
-		down(&osb->trans_lock);					\
-	} while (0)
 
-#define ocfs_release_trans_lock(osb) 					\
-	do {								\
-		up (&osb->trans_lock);					\
-	} while (0)
-
 static int ocfs_checkpoint_handle(ocfs_journal_handle *handle);
 static int ocfs_revoke_handle(ocfs_journal_handle *handle);
 static int ocfs_reset_publish (ocfs_super * osb, __u64 node_num);
@@ -63,6 +49,46 @@
 static int __ocfs_recovery_thread(void *arg);
 static int ocfs_commit_cache (ocfs_super * osb, int data_flush);
 
+/* 
+ * JBD in 2.4 kernels has a bug in that it doesn't do any locking of
+ * the t_updates transaction variable. If we don't serialize calls to
+ * journal_start/journal_stop, then it can get way out of whack,
+ * resulting in either a crash or a lockup. As far as I can tell, they
+ * never hit this bug in ext3 because those calls somehow manage to
+ * get serialized. I wish I didn't have to use lock_kernel here, but
+ * we actually want the "drop on sleep" behavior which we can't get
+ * with any other lock.
+ * 
+ * 2.6 does it the right way by spinlocking around it's structures.
+ * 
+ * These two should be moved to compat.h when it exists. 
+ */
+static inline handle_t *ocfs_journal_start(journal_t *journal, int nblocks)
+{
+	handle_t * h;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+	lock_kernel();
+#endif
+	h = journal_start(journal, nblocks);
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+	unlock_kernel();
+#endif
+	return(h);
+}
+
+static inline int ocfs_journal_stop(handle_t *handle)
+{
+	int status;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+	lock_kernel();
+#endif
+	status = journal_stop(handle);
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+	unlock_kernel();
+#endif
+	return(status);
+}
+
 /* DO NOT EVER CALL THIS FUNCTION WITH A LOCKED BUFFER HEAD! */
 ocfs_journal_handle * ocfs_start_trans(ocfs_super *osb, int max_buffs) 
 {
@@ -74,9 +100,11 @@
 	if (!osb || !osb->journal->k_journal)
 		BUG();
 
-	/* for now, we only do one transaction at a time. Eventually
-	 * this and trans_in_progress need to be replaced. */
-	ocfs_take_trans_lock(osb);
+	/* JBD might support this, but our journalling code doesn't yet. */
+	if (journal_current_handle()) {
+		LOG_ERROR_STR("Recursive transaction attempted!");
+		BUG();
+	}
 
 	retval = ocfs_malloc(sizeof(*retval));
 	if (!retval) {
@@ -112,9 +140,13 @@
 	retval->osb = osb;
 	retval->commit_bits = NULL;
 
+	down_read(&osb->journal->trans_barrier);
+
 	/* actually start the transaction now */
-	retval->k_handle = journal_start(journal, max_buffs);
+	retval->k_handle = ocfs_journal_start(journal, max_buffs);
 	if (IS_ERR(retval->k_handle)) {
+		up_read(&osb->journal->trans_barrier);
+
 		LOG_ERROR_STR("journal_start() failed!");
 		LOG_ERROR_STATUS((int)PTR_ERR(retval->k_handle));
 		retval->k_handle = NULL;
@@ -123,10 +155,6 @@
 
 	atomic_inc(&(osb->journal->num_trans));
 
-	down(&osb->journal->commit_sem);
-	osb->journal->curr = retval;
-	up(&osb->journal->commit_sem);
-
 	/* default handle flags! */
 	ocfs_handle_set_sync(retval, 1);
 	ocfs_handle_set_checkpoint(retval, 1);
@@ -135,7 +163,6 @@
 	return(retval);
 
 done_free:
-	ocfs_release_trans_lock(osb);
 
 	if (retval) {
 		if (retval->buffs)
@@ -202,7 +229,7 @@
 	if (!handle->num_buffs)
 		goto done;
 
-	new_handle = journal_start(journal->k_journal, handle->num_buffs);
+	new_handle = ocfs_journal_start(journal->k_journal, handle->num_buffs);
 	if (IS_ERR(new_handle)) {
 		retval = PTR_ERR(new_handle);
 		new_handle = NULL;
@@ -227,7 +254,7 @@
 
 done:
 	if (new_handle)
-		journal_stop(new_handle);
+		ocfs_journal_stop(new_handle);
 
 	LOG_EXIT_STATUS(retval);
 	return(retval);
@@ -241,22 +268,31 @@
 	if (!inode)
 		BUG();
 
-	if (!list_empty(&OCFS_I(inode)->handle_list)) {
+	if (OCFS_I(inode)->ip_handle == handle) {
+		/* sanity check */
+		if (list_empty(&OCFS_I(inode)->ip_handle_list))
+			BUG();
+
 		/* I think this can happen to the main bitmap inode if
 		 * we extend a regular file and also have to extend a
 		 * system file in the same transaction */
-		LOG_ERROR_ARGS("Inode %lu already has a transaction!\n",
+		LOG_TRACE_ARGS("Inode %lu already added to transaction!\n",
 			       inode->i_ino);
 		return;
 	}
 
 	atomic_inc(&inode->i_count);
 
-	down(&inode->i_sem);
+	down(&OCFS_I(inode)->ip_io_sem);
 
-	list_del(&(OCFS_I(inode)->handle_list));
-	list_add_tail(&(OCFS_I(inode)->handle_list), &(handle->inode_list));
+	/* sanity check */
+	if (OCFS_I(inode)->ip_handle)
+		BUG();
 
+	OCFS_I(inode)->ip_handle = handle;
+	list_del(&(OCFS_I(inode)->ip_handle_list));
+	list_add_tail(&(OCFS_I(inode)->ip_handle_list), &(handle->inode_list));
+
 	return;
 }
 
@@ -267,21 +303,22 @@
 	ocfs_inode_private *ip;
 
 	list_for_each_safe(p, n, &handle->inode_list) {
-		ip = list_entry(p, ocfs_inode_private, handle_list);
+		ip = list_entry(p, ocfs_inode_private, ip_handle_list);
 		inode = ip->inode;
 
-		list_del(&OCFS_I(inode)->handle_list);
-		INIT_LIST_HEAD(&OCFS_I(inode)->handle_list);
+		OCFS_I(inode)->ip_handle = NULL;
+		list_del(&OCFS_I(inode)->ip_handle_list);
+		INIT_LIST_HEAD(&OCFS_I(inode)->ip_handle_list);
 
-		up(&inode->i_sem);
+		up(&OCFS_I(inode)->ip_io_sem);
 		iput(inode);
 	}
 	return;
 }
 
-/* This does no locking of the handle, so make sure that the handle
- * isn't on journal->curr. If the handle is on journal->commited, then
- * you want to be holding the commit_sem before calling this. */
+/* This does no locking of the handle. If the handle is on
+ * journal->commited, then you want to be holding the commit_sem
+ * before calling this. */
 static int ocfs_journal_release_locks(ocfs_journal_handle *handle, 
 				      release_locks_action action)
 {
@@ -295,9 +332,6 @@
 
 	osb = handle->osb;
 
-	if (osb->journal->curr == handle)
-		BUG();
-
 	LOG_TRACE_ARGS("num_locks = %d\n", handle->num_locks);
 
 	list_for_each_safe(p, n, &(handle->locks)) {
@@ -307,13 +341,15 @@
 			BUG();
 
 		/* The cache list holds unlocked inodes */
+		if (action == TRANS_CACHE || lock->req_io_sem)
+			down(&OCFS_I(lock->inode)->ip_io_sem);
+
 		if (action == TRANS_CACHE)
-			down(&lock->inode->i_sem);	
+			printk("got sem on inode %lu\n", lock->inode->i_ino);
 
 		/* The file may have been deleted before we got to
 		 * this lock release. If so, just skip it.  */
-		if ((!lock->inode)
-		    || (lock->inode && !INODE_DELETED(lock->inode))) {
+		if (!INODE_DELETED(lock->inode)) {
 
 			tmpstat = ocfs_release_lock(osb, 
 						    GET_INODE_FEOFF(lock->inode),
@@ -331,13 +367,12 @@
 			}
 		}
 
-		if (action == TRANS_CACHE)
-			up(&lock->inode->i_sem);
+		if (action == TRANS_CACHE || lock->req_io_sem)
+			up(&OCFS_I(lock->inode)->ip_io_sem);
 
 		if (lock->bh != NULL)
 			brelse(lock->bh);
-		if (lock->inode)
-			iput(lock->inode);
+		iput(lock->inode);
 		list_del(&(lock->lock_list));
 		handle->num_locks--;
 		ocfs_free(lock);
@@ -392,6 +427,21 @@
 	}								\
 } while (0)
 
+static inline int ocfs_journal_flush(ocfs_journal *journal) 
+{
+	int retval;
+
+	down_write(&journal->trans_barrier);
+	journal_lock_updates(journal->k_journal);
+
+	retval = journal_flush(journal->k_journal);
+
+	journal_unlock_updates(journal->k_journal);
+	up_write(&journal->trans_barrier);
+
+	return(retval);
+}
+
 /*
  * ocfs_commit_trans
  */
@@ -429,15 +479,31 @@
 	else
 		kern_handle->h_sync = 0;
 
+	/* Ok, we're done changing these buffers now... */
+	for(i = 0; i < handle->num_buffs; i++)
+		ocfs_clear_buffer_modified(handle->buffs[i]);
+
+	/* release inode semaphores we took during this transaction */
+	ocfs_handle_unlock_inodes(handle);
+	if (handle->flags & OCFS_HANDLE_LOCAL_ALLOC)
+		up(&osb->local_alloc_sem);
+
 	/* actually stop the transaction. if we've set h_sync,
 	 * it'll have been commited when we return */
-	retval = journal_stop(kern_handle);
+	retval = ocfs_journal_stop(kern_handle);
 	if (retval < 0) {
 		LOG_ERROR_STATUS(retval);
 		LOG_ERROR_STR("Could not commit transaction");
 		BUG();
 	}
 
+	/* in the checkpoint case we decrement num_trans here, as there's
+	 * nothing for the commit thread to do on our behalf. */
+	if (checkpoint)
+		atomic_dec(&(osb->journal->num_trans));
+
+	up_read(&journal->trans_barrier);
+
 	handle->k_handle = NULL; /* it's been free'd in journal_stop */
 
 	/* In the future we'll try to queue up as many
@@ -445,41 +511,31 @@
 	 * will checkpoint and revoke everything from that
 	 * transaction. */
 	if (checkpoint) {
-		/* checkpoint from buffer_head list */
-		retval = ocfs_checkpoint_handle(handle);
-		if (retval < 0) {
-			LOG_ERROR_STR("Could not checkpoint transaction!");
-			BUG();
-		}
-
-		/* revoke from buffer_head list, commit revoke records */
-		retval = ocfs_revoke_handle(handle);
-		if (retval < 0) {
-			LOG_ERROR_STR("Could not completely revoke "
-				      "transaction!");
-			BUG();
-		}
-	} else { 
-		/* If we're not checkpointing, we have to be careful
-		 * to also clear the modified bits. */
-		for(i = 0; i < handle->num_buffs; i++)
-			ocfs_clear_buffer_modified(handle->buffs[i]);
+		retval = ocfs_journal_flush(journal);
+		if (retval < 0)
+			LOG_ERROR_STATUS(retval);
 	}
 
-/* done: */
+	/* Do the next few steps before we put the handle on any lists
+	 * where it might be freed! */
 	for(i = 0; i < handle->num_buffs; i++) {
 		brelse(handle->buffs[i]);
 		handle->buffs[i] = NULL;
 	}
 	handle->num_buffs = 0;
+	if (handle->buffs) {
+		ocfs_free(handle->buffs);
+		handle->buffs = NULL;
+	}
 
-	down(&journal->commit_sem);
-	journal->curr = NULL;
+	/* At this point, we don't need the copyout buffers. */
+	ocfs_handle_free_all_copyout(handle);
 
+	commit_head = handle->commit_bits;
+	handle->commit_bits = NULL;
+
+/* done: */
 	if (checkpoint) {
-		up(&journal->commit_sem);
-		atomic_dec(&(osb->journal->num_trans));
-
 		/* Release locks associated with this handle. */
 		retval = ocfs_journal_release_locks(handle, TRANS_COMMIT);
 		if (retval < 0)
@@ -488,37 +544,17 @@
 		/* If we're not going to checkpoint the handle on
 		 * commit then we need to add it to our journals list
 		 * so it can be done later */
+		down(&journal->commit_sem);
 		list_add_tail(&(handle->h_list), &(journal->commited));
 		osb->needs_flush = 1;
 		up(&journal->commit_sem);
+		/* Ok, any references to the handle after this are
+		 * unsafe as it might be processed (and free'd from
+		 * memory) by the commit thread! */
 	}
 
-	/* At this point, we don't need the copyout buffers. */
-	ocfs_handle_free_all_copyout(handle);
-
-	/* we don't free the kernel handle because jbd has freed it. */
-	if (handle->buffs) {
-		ocfs_free(handle->buffs);
-		handle->buffs = NULL;
-	}
-
-	/* save off while we still have trans lock */
-	commit_head = handle->commit_bits;
-	handle->commit_bits = NULL;
-
-	/* release inode semaphores we took during this transaction */
-	ocfs_handle_unlock_inodes(handle);
-	if (handle->flags | OCFS_HANDLE_LOCAL_ALLOC)
-		up(&osb->local_alloc_sem);
-
-	/* This has to happen after we release the other locks. */
-	ocfs_release_trans_lock(osb);
-
-	if (commit_head && (retval == 0)) {
-		if (!sync)
-			BUG();
+	if (commit_head && (retval == 0))
 		ocfs_process_bitmap_free_head(osb, commit_head);
-	}
 	ocfs_free_bitmap_free_head(commit_head);
 
 	if (checkpoint)
@@ -620,28 +656,33 @@
 		}
 	}
 
+	for(i = 0; i < handle->num_buffs; i++)
+		ocfs_clear_buffer_modified(handle->buffs[i]);
+
+	/* release inode semaphores we took during this transaction */
+	ocfs_handle_unlock_inodes(handle);
+	if (handle->flags & OCFS_HANDLE_LOCAL_ALLOC)
+		up(&osb->local_alloc_sem);
+
 	/* done copying them, free it now. */
 	ocfs_handle_free_all_copyout(handle);
 
 	/* want to force our handle to disk in abort case. */
 	handle->k_handle->h_sync = 1;
 
-	retval = journal_stop(handle->k_handle);
+	retval = ocfs_journal_stop(handle->k_handle);
 	if (retval < 0) {
 		LOG_ERROR_STR("Could not commit aborted transaction!");
 		LOG_ERROR_STATUS(retval);
 	}
+	atomic_dec(&(osb->journal->num_trans));
 
+	up_read(&journal->trans_barrier);
+
 	handle->k_handle = NULL;
 
-	atomic_dec(&(osb->journal->num_trans));
 
 /* done: */
-
-	down(&osb->journal->commit_sem);
-	osb->journal->curr = NULL;
-	up(&osb->journal->commit_sem);
-
 	if (handle->num_buffs) {
 		/* Ok, we now want to fill our buffers with the older (but
 		 * valid) data, instead of leaving them with the aborted
@@ -649,31 +690,19 @@
 		 * transactions in the journal so that we know that disk
 		 * reflects the latest correct blocks. After that, we just
 		 * repopulate the buffers from disk. */
-		journal_lock_updates(journal->k_journal);
-		retval = journal_flush(journal->k_journal);
-		journal_unlock_updates(journal->k_journal);
+		retval = ocfs_journal_flush(journal);
 		if (retval < 0)
 			LOG_ERROR_STATUS(retval);
 	}
 
-	for(i = 0; i < handle->num_buffs; i++) {
-		ocfs_clear_buffer_modified(handle->buffs[i]);
+	for(i = 0; i < handle->num_buffs; i++)
 		brelse(handle->buffs[i]);
-	}
 
 	/* drop locks associated with the handle here. */
 	retval = ocfs_journal_release_locks(handle, TRANS_ABORT);
 	if (retval < 0)
 		LOG_ERROR_STATUS(retval);
 
-	/* release inode semaphores we took during this transaction */
-	ocfs_handle_unlock_inodes(handle);
-	if (handle->flags | OCFS_HANDLE_LOCAL_ALLOC)
-		up(&osb->local_alloc_sem);
-
-	/* This has to happen after we release the other locks. */
-	ocfs_release_trans_lock(osb);
-
 	/* Should only be processed in commit. */
 	ocfs_free_bitmap_free_head(handle->commit_bits);
 
@@ -864,8 +893,9 @@
 /* We are expecting to be run on the current running transaction, so
  * we use the spin_lock here. You really shouldn't be calling this on
  * other transactions anyway... */
-void ocfs_journal_add_lock(ocfs_journal_handle *handle, __u32 type, __u32 flags, 
-			   struct buffer_head *bh, struct inode *inode) 
+void ocfs_handle_add_lock(ocfs_journal_handle *handle, __u32 type, 
+			  __u32 flags, struct buffer_head *bh, 
+			  struct inode *inode, int req_io_sem) 
 {
 	ocfs_journal_lock *lock;
 
@@ -884,13 +914,13 @@
 	lock->flags = flags;
 	lock->bh    = bh;
 	lock->inode = inode;
+	lock->req_io_sem  = req_io_sem;
 
 	if (bh)
 		get_bh(bh);
-	
-	if (inode)
-		atomic_inc(&inode->i_count);
 
+	atomic_inc(&inode->i_count);
+
 	spin_lock(&handle->list_lock);
 	list_add_tail(&(lock->lock_list), &(handle->locks));
 	handle->num_locks++;
@@ -925,7 +955,7 @@
 		  osb->vol_layout.root_int_off;
 
 	/* Ok, look up the inode for our journal */
-	inode = ocfs_iget(osb, lock_id, NULL);
+	inode = ocfs_iget(osb, lock_id);
 	if (inode == NULL) {
 		LOG_ERROR_STR("access error");
 		status = -EACCES;
@@ -938,12 +968,18 @@
 		status = -EACCES;
 		goto done;
 	}
+
+	down(&OCFS_I(inode)->ip_io_sem);
+
+
 	SET_INODE_JOURNAL(inode);
 
 	/* TODO: Use another type of lock. */
 	status = ocfs_acquire_lock (osb, lock_id, OCFS_DLM_EXCLUSIVE_LOCK,
 				    FLAG_FILE_CREATE, &bh, inode);
 	if (status < 0) {
+		up(&OCFS_I(inode)->ip_io_sem);
+
 		if (status != -EINTR)
 			LOG_ERROR_STR("Could not get lock on journal!");
 		goto done;
@@ -961,6 +997,8 @@
 	if (status < 0) {
 		OCFS_BH_PUT_DATA(bh);
 		fe = NULL;
+		up(&OCFS_I(inode)->ip_io_sem);
+
 		goto done;
 	}
 
@@ -983,6 +1021,8 @@
 					DLOCK_FLAG_OPEN_MAP|DLOCK_FLAG_ADD_SELF, 
 					&bh, inode, NULL);
 	if (status < 0) {
+		up(&OCFS_I(inode)->ip_io_sem);
+
 		LOG_ERROR_STATUS(status);
 		goto done;
 	}
@@ -990,6 +1030,8 @@
 	LOG_TRACE_ARGS("inode->alloc_size = %llu\n", 
 		       OCFS_I(inode)->alloc_size);
 
+	up(&OCFS_I(inode)->ip_io_sem);
+
 	/* call the kernels journal init function now */
 	k_journal = journal_init_inode(inode);
 	if (k_journal == NULL) {
@@ -1012,7 +1054,9 @@
 	osb->journal->lockbh = bh;
 	osb->journal->lock_id = lock_id;
 	atomic_set(&(osb->journal->num_trans), 0);
+	init_rwsem(&(osb->journal->trans_barrier));
 	osb->journal->state = OCFS_JOURNAL_LOADED;
+
 	status = 0;
 done:
 	if (status < 0) {
@@ -1021,13 +1065,15 @@
 				OCFS_BH_PUT_DATA(bh);
 			brelse(bh);
 		}
-		if (inode)
+		if (inode) {
 			OCFS_I(inode)->open_hndl_cnt--;
+			iput(inode);
+		}
 	}
 
 	LOG_EXIT_STATUS(status);
 	return(status);
-}
+} /* ocfs_journal_init */
 
 /*
   if the journal has been ocfs_malloc'd it needs to be freed after this call.
@@ -1065,9 +1111,7 @@
 	 * release any locks that are still held.
 	 * set the SHUTDOWN flag and release the trans lock.
 	 * the commit thread will take the trans lock for us below. */
-	down(&osb->trans_lock);
 	journal->state = OCFS_JOURNAL_IN_SHUTDOWN;
-	up(&osb->trans_lock);
 
 	/* wake the commit thread */
 	atomic_set (&osb->flush_event_woken, 1);
@@ -1090,20 +1134,22 @@
 
 	OCFS_I(inode)->open_hndl_cnt--;
 
+	down(&OCFS_I(inode)->ip_io_sem);
 	/* unlock our journal */
 	status = ocfs_release_lock (osb, journal->lock_id,
 				    OCFS_DLM_EXCLUSIVE_LOCK,
 				    FLAG_FILE_CREATE, 
 				    journal->lockbh, inode);
+	up(&OCFS_I(inode)->ip_io_sem);
 	if (status < 0)
 		LOG_ERROR_STATUS (status);
-	
+
 	brelse (journal->lockbh);
 	journal->lockbh = NULL;
 
 	journal->state = OCFS_JOURNAL_FREE;
 
-	up (&osb->trans_lock);
+//	up_write(&journal->trans_barrier);
 done:
 	if (inode)
 		iput(inode);
@@ -1363,7 +1409,7 @@
 		+ osb->vol_layout.root_int_off;
 
 	/* Ok, look up the inode for our journal */
-	inode = ocfs_iget(osb, lock_id, NULL);
+	inode = ocfs_iget(osb, lock_id);
 	if (inode == NULL) {
 		LOG_ERROR_STR("access error");
 		status = -EACCES;
@@ -1376,6 +1422,9 @@
 		status = -EACCES;
 		goto done;
 	}
+
+	down(&OCFS_I(inode)->ip_io_sem);
+
 	SET_INODE_JOURNAL(inode);
 
 	/* Should not ever be called to recover ourselves -- in that
@@ -1387,6 +1436,8 @@
 				    OCFS_DLM_EXCLUSIVE_LOCK,
 				    FLAG_FILE_CREATE|FLAG_FILE_RECOVERY, 
 				    &bh, inode);
+
+	up(&OCFS_I(inode)->ip_io_sem);
 	if (status < 0) {
 		LOG_TRACE_ARGS("status returned from acquire_lock=%d\n", 
 			       status);
@@ -1415,9 +1466,12 @@
 	OCFS_I(inode)->alloc_size = alloc_size;
 
 	/* add this node to openmap and update disk lock */
+	down(&OCFS_I(inode)->ip_io_sem);
+
 	status = ocfs_update_disk_lock (osb, 
 					DLOCK_FLAG_OPEN_MAP|DLOCK_FLAG_ADD_SELF, 
 					&bh, inode, NULL);
+	up(&OCFS_I(inode)->ip_io_sem);
 	if (status < 0) {
 		LOG_ERROR_STATUS(status);
 		goto done;
@@ -1483,11 +1537,15 @@
 		up(&(osb->recovery_lock));
 
 	/* drop the lock on this nodes journal */
-	if (got_lock)
+	if (got_lock) {
+		down(&OCFS_I(inode)->ip_io_sem);
+
 		status = ocfs_release_lock(osb, lock_id, 
 					   OCFS_DLM_EXCLUSIVE_LOCK, 
 					   FLAG_FILE_CREATE|FLAG_FILE_RECOVERY,
 					   bh, inode);
+		up(&OCFS_I(inode)->ip_io_sem);
+	}
 	if (inode)
 		iput(inode);
 
@@ -1628,15 +1686,6 @@
 		    (osb->osb_flags & OCFS_OSB_FLAGS_BEING_DISMOUNTED))
 			finish = 1;
 
-		if (down_trylock(&osb->trans_lock) != 0) {
-			LOG_TRACE_ARGS("commit thread: trylock failed, miss=%d\n", misses);
-			if (++misses < OCFS_COMMIT_MISS_MAX && finish == 0)
-				continue;
-			LOG_TRACE_ARGS("commit thread: about to down\n");
-			down(&osb->trans_lock);
-			misses = 0;
-		}
-
 		status = ocfs_commit_cache(osb, 0);
 		if (status < 0)
 			LOG_ERROR_STATUS(status);
@@ -1645,8 +1694,6 @@
 			break;
 	}
 
-
-
 	/* Flush all scheduled tasks */
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
 	flush_scheduled_work ();
@@ -1677,23 +1724,27 @@
 
 	LOG_ENTRY_ARGS("(data_flush = %u)\n", data_flush);
 
-	if (down_trylock(&osb->trans_lock) == 0)
-		BUG();
-
 	journal = osb->journal;
 
 	if (atomic_read(&journal->num_trans) == 0) {
-		up(&osb->trans_lock);
-
 		LOG_TRACE_STR("No transactions for me to flush!");
 		goto flush_data;
 	}
 
 	/* flush all pending commits and checkpoint the journal. */
+	down_write(&journal->trans_barrier);
+
+	/* check again, this time locked :) */
+	if (atomic_read(&journal->num_trans) == 0) {
+		up_write(&journal->trans_barrier);
+		goto flush_data;
+	}
+
 	journal_lock_updates(journal->k_journal);
 	status = journal_flush(journal->k_journal);
+
+	up_write(&journal->trans_barrier);
 	if (status < 0) {
-		up(&osb->trans_lock);
 		journal_unlock_updates(journal->k_journal);
 
 		LOG_ERROR_STATUS(status);
@@ -1703,8 +1754,6 @@
 	LOG_TRACE_ARGS("flushing %d transactions\n", 
 		       atomic_read(&journal->num_trans));
 
-	atomic_set(&journal->num_trans, 0);
-
 	/* now we can run an unlock against any pending handles and
 	 * release them. */
 	down(&journal->commit_sem);
@@ -1726,9 +1775,6 @@
 	up(&journal->commit_sem);
 
 	osb->needs_flush = 0;
-	/* shutdown code wants to hold the trans lock */
-	if (journal->state != OCFS_JOURNAL_IN_SHUTDOWN)
-		up(&osb->trans_lock);
 
 	down(&commit->c_lock);
 	list_for_each_safe(p, n, &commit->c_list) {
@@ -1738,6 +1784,8 @@
 			LOG_ERROR_STATUS((status = tmpstat));
 		list_del(&(handle->h_list));
 		ocfs_free(handle);
+
+		atomic_dec(&journal->num_trans);
 	}
 	up(&commit->c_lock);
 

Modified: trunk/src/lockres.c
===================================================================
--- trunk/src/lockres.c	2004-06-08 22:18:55 UTC (rev 1023)
+++ trunk/src/lockres.c	2004-06-08 22:52:28 UTC (rev 1024)
@@ -119,6 +119,10 @@
 	return status;
 }				/* ocfs_find_update_res */
 
+
+#define ocfs_container_of(ptr, type, member) ({                      \
+        const typeof( ((type *)0)->member ) *__mptr = (ptr);    \
+         (type *)( (char *)__mptr - offsetof(type,member) );})
 /*
  * ocfs_acquire_lockres()
  *
@@ -131,6 +135,8 @@
 	unsigned long jif = 0;
 	int status = 0;
 	int cnt = 0;
+	struct inode *inode;
+	ocfs_inode_private *ip;
 
 	LOG_ENTRY_ARGS ("(0x%p, %u)\n", lockres, timeout);
 
@@ -147,6 +153,11 @@
 		if (lockres->in_use) {
 			if (lockres->thread_id != mypid) {
 				spin_unlock (&lockres->lock_mutex);
+				LOG_ERROR_ARGS ("lockpid=%d, newpid=%d,"
+						" timedout\n",
+						lockres->thread_id, mypid);
+				BUG();
+
 				if (jif && jif < jiffies) {
 					LOG_TRACE_ARGS ("lockpid=%d, newpid=%d,"
 						" timedout\n",
@@ -162,7 +173,7 @@
 				}
 				ocfs_sleep (OCFS_NM_HEARTBEAT_TIME / 10);
 			}
-			else {
+			else  {
 				printk("lockres in_use=%d, pid=%d, mypid=%d\n", lockres->in_use, lockres->thread_id, mypid);
 				BUG();
 				lockres->in_use++;
@@ -173,6 +184,13 @@
 			lockres->in_use = 1;
 			lockres->thread_id = mypid;
 			spin_unlock (&lockres->lock_mutex);
+			ip = ocfs_container_of(lockres, ocfs_inode_private, i_lockres);
+			inode = ip->inode;
+			if (down_trylock(&OCFS_I(inode)->ip_io_sem) == 0) {
+				LOG_ERROR_ARGS("locking lockres without io_sem! ino = %lu, offset = %llu\n", inode->i_ino, OCFS_I(inode)->feoff);
+
+				BUG();
+			}
 			break;
 		}
 	}

Modified: trunk/src/namei.c
===================================================================
--- trunk/src/namei.c	2004-06-08 22:18:55 UTC (rev 1023)
+++ trunk/src/namei.c	2004-06-08 22:52:28 UTC (rev 1024)
@@ -69,8 +69,8 @@
 #endif
 {
 	int status;
-	ocfs_file_entry *fe;
-	struct buffer_head *fe_bh = NULL, *dirent_bh = NULL;
+	__u64 fe_off;
+	struct buffer_head *dirent_bh = NULL;
 	struct inode *inode = NULL;
 	struct super_block *sb = dir->i_sb;
 	struct dentry *ret;
@@ -89,20 +89,13 @@
 	LOG_TRACE_ARGS("about to call find_files_on_disk with inode=%p\n", 
 		       dir);
 
-	status = ocfs_find_files_on_disk (osb, dentry, &fe_bh, dir, inode, 1, &dirent_bh, &dirent);
+	down(&OCFS_I(dir)->ip_io_sem);
+	status = ocfs_find_files_on_disk (osb, dentry, &fe_off, dir, 1, &dirent_bh, &dirent);
+	up(&OCFS_I(dir)->ip_io_sem);
 	if (status < 0)
 		goto bail_add;
 	
-	fe = OCFS_BH_GET_DATA_READ(fe_bh);
-	if (!IS_VALID_FILE_ENTRY(fe)) {
-		printk("ocfs2: invalid file entry!  parent=%llu, name='%*s'\n",
-		       GET_INODE_FEOFF(dir), dentry->d_name.len, 
-		       dentry->d_name.name);
-		BUG();
-	}
-	OCFS_BH_PUT_DATA(fe_bh);
-
-	inode = ocfs_iget(osb, 0, fe_bh);
+	inode = ocfs_iget(osb, fe_off);
 	if (!inode) {
 		LOG_ERROR_STR("Could not create inode!");
 		ret = ERR_PTR (-EACCES);
@@ -115,8 +108,6 @@
 	ret = NULL;
 
 bail:
-	if (fe_bh)
-		brelse(fe_bh);
 	if (dirent_bh)
 		brelse(dirent_bh);
 	
@@ -139,6 +130,8 @@
 	LOG_ENTRY_ARGS ("(0x%p, 0x%p, %d, %d, '%*s')\n", dir, dentry, mode,
 			dev, dentry->d_name.len, dentry->d_name.name);
 
+	down(&OCFS_I(dir)->ip_io_sem);
+
 	/* get our super block */
 	osb = OCFS_SB(dir->i_sb);
 	if (osb->osb_flags & OCFS_OSB_FLAGS_SHUTDOWN) {
@@ -184,8 +177,9 @@
 	}
 
 	/* Ok, we got the lock -- we'd better add it to our transaction */
-	ocfs_journal_add_lock(handle, OCFS_DLM_ENABLE_CACHE_LOCK, 
-			      FLAG_FILE_CREATE | FLAG_DIR, parent_fe_bh, dir);
+	ocfs_handle_add_lock(handle, OCFS_DLM_ENABLE_CACHE_LOCK, 
+			     FLAG_FILE_CREATE | FLAG_DIR, parent_fe_bh, dir, 
+			     0);
 
 	/* do the real work now. */
 	status = ocfs_mknod_locked(osb, dir, dentry, mode, dev,
@@ -208,9 +202,9 @@
 
 	ocfs_init_lockres (osb, inode);
 
+	ocfs_handle_add_inode(handle, inode);
 	status = ocfs_update_lockres (osb, GET_INODE_FEOFF(inode), 
 				      &new_fe_bh, NULL, 0, inode, 0, 0);
-
 	if (S_ISDIR (mode)) {
 		struct buffer_head *newdirbh = NULL;
 		int retval = 0;
@@ -274,6 +268,8 @@
 	if ((status < 0) && handle)
 		ocfs_abort_trans(handle);
 
+	up(&OCFS_I(dir)->ip_io_sem);
+
 	if (status == -ENOSPC)
 		LOG_TRACE_STR ("Disk is full");
 	else if (status < 0 && status != -EINTR)
@@ -339,7 +335,7 @@
 		LOG_ERROR_STATUS (status);
 		goto leave;
 	}
-		
+
 	status = ocfs_read_bh(osb, bitmapOffset, new_fe_bh,
 				      OCFS_BH_CACHED, inode);
 
@@ -502,7 +498,7 @@
 	struct inode *inode = dentry->d_inode;
 	int retval = -EBUSY;
 	ocfs_super *osb = OCFS_SB(dir->i_sb);
-	__u64 fileOff = GET_INODE_FEOFF(inode);
+	__u64 fe_off = GET_INODE_FEOFF(inode);
 	struct inode *parentInode = dentry->d_parent->d_inode;
 	ocfs_file_entry *fe = NULL;
 	__u32 lockFlags = (S_ISDIR (inode->i_mode) ? (FLAG_FILE_DELETE | FLAG_DIR) : FLAG_FILE_DELETE);
@@ -518,7 +514,7 @@
 	LOG_ENTRY_ARGS ("(0x%p, 0x%p, '%*s')\n", dir, dentry,
 			dentry->d_name.len, dentry->d_name.name);
 
-	LOG_TRACE_ARGS("ino = %llu\n", fileOff);
+	LOG_TRACE_ARGS("ino = %llu\n", fe_off);
 
 	status = -EBUSY;
 
@@ -526,7 +522,7 @@
 		LOG_TRACE_STR ("dentry is not empty, cannot delete");
 		goto bail;
 	} else if (OCFS_I(inode)->open_hndl_cnt > 0) {
-		LOG_TRACE_ARGS ("Cannot remove an open file (open_hndl_cnt = %u, fileOff = %llu, d_count=%u)\n", OCFS_I(inode)->open_hndl_cnt, fileOff, atomic_read(&dentry->d_count));
+		LOG_TRACE_ARGS ("Cannot remove an open file (open_hndl_cnt = %u, fe_off = %llu, d_count=%u)\n", OCFS_I(inode)->open_hndl_cnt, fe_off, atomic_read(&dentry->d_count));
 		goto bail;
 	} else if (inode == osb->root_inode) {
 		LOG_TRACE_STR ("Cannot delete the root directory");
@@ -538,16 +534,19 @@
 	spin_lock(&oin_num_ext_lock);
 	if (OCFS_I(inode)->num_extends) {
 		LOG_ERROR_ARGS ("Cannot remove a file with = "
-				"%u, pending extends (fileOff "
+				"%u, pending extends (fe_off "
 				"= %llu)\n", 
 				OCFS_I(inode)->num_extends,
-				fileOff);
+				fe_off);
 		spin_unlock(&oin_num_ext_lock);
 		status = -EBUSY;
 		goto bail;
 	}
 	spin_unlock(&oin_num_ext_lock);
 
+	down(&OCFS_I(dir)->ip_io_sem);
+	down(&OCFS_I(inode)->ip_io_sem);
+
 	handle = ocfs_start_trans(osb, OCFS_FILE_DELETE_CREDITS);
 	if (handle == NULL) {
 		LOG_ERROR_STATUS (status = -ENOMEM);
@@ -567,14 +566,23 @@
 
 	/* this will re-read the directory now with the EXCLUSIVE */
 	/* lock already held; it will also return the fe_bh to us */
-	status = ocfs_find_files_on_disk (osb, dentry, &fe_bh, parentInode, 
-					  inode, 0, &dirent_bh, &dirent);
+	status = ocfs_find_files_on_disk (osb, dentry, &fe_off, parentInode, 
+					  0, &dirent_bh, &dirent);
 	if (status < 0) {
 		LOG_ERROR_STATUS(status);
 		goto leave;
 	}
 
-	status = ocfs_acquire_lock (osb, fileOff, OCFS_DLM_EXCLUSIVE_LOCK,
+	if (fe_off != GET_INODE_FEOFF(inode))
+		BUG();
+
+	status = ocfs_read_bh(osb, fe_off, &fe_bh, OCFS_BH_CACHED, inode);
+	if (status < 0) {
+		LOG_ERROR_STATUS(status);
+		goto leave;
+	}
+
+	status = ocfs_acquire_lock (osb, fe_off, OCFS_DLM_EXCLUSIVE_LOCK,
 			lockFlags, &fe_bh, inode);
 	if (status < 0) {
 		if (status != -EINTR)
@@ -688,7 +696,7 @@
 	/* need this to alert dentry-owners on other nodes */
 	/* Release the file lock if we acquired it */
 	if (got_file) {
-		tmpstat = ocfs_release_lock(osb, fileOff, 
+		tmpstat = ocfs_release_lock(osb, fe_off, 
 					    OCFS_DLM_EXCLUSIVE_LOCK, 
 					    lockFlags, fe_bh, inode);
 		if (tmpstat < 0)
@@ -718,6 +726,9 @@
 		if (drop_inode) 
 			SET_INODE_DELETED(inode);
 	}
+
+	up(&OCFS_I(inode)->ip_io_sem);
+	up(&OCFS_I(dir)->ip_io_sem);
 bail:
 	if (status < 0 && status != -ENOTEMPTY && 
 	    status != -EPERM && status != -EBUSY && status != -EINTR) {
@@ -828,10 +839,10 @@
 		}
 	} else if (handle) {
 		if (id2_locked)
-			ocfs_journal_add_lock(handle, type2, flags2, 
-					      *bh2, inode2);
-		ocfs_journal_add_lock(handle, type1, flags1, *bh1, 
-				      inode1);
+			ocfs_handle_add_lock(handle, type2, flags2, 
+					     *bh2, inode2, 0);
+		ocfs_handle_add_lock(handle, type1, flags1, *bh1, 
+				     inode1, 0);
 	}
 
 	LOG_EXIT_STATUS(status);
@@ -891,10 +902,13 @@
 	/* new parent dir offset */
 	newDirOff = GET_INODE_FEOFF(new_dir);
 	
+	double_down(&OCFS_I(old_dir)->ip_io_sem, &OCFS_I(new_dir)->ip_io_sem);
+	down(&OCFS_I(old_inode)->ip_io_sem);
 
 	if (new_inode) {
 		if (ocfs_inc_icount(new_inode) < 0)
 			BUG();
+		down(&OCFS_I(new_inode)->ip_io_sem);
 	}
 
 	if (atomic_read (&old_dentry->d_count) > 2) {
@@ -1021,8 +1035,8 @@
 
 	/* check if the target already exists (in which case we need
 	 * to delete it */
-	status = ocfs_find_files_on_disk(osb, new_dentry, &newfe_bh, 
-					 new_dir, new_inode, 0, &new_de_bh, &new_de);
+	status = ocfs_find_files_on_disk(osb, new_dentry, &newfe_lockid, 
+					 new_dir, 0, &new_de_bh, &new_de);
 	/* The only error we allow here is -ENOENT because the new
 	 * file not existing is perfectly valid. */
 	if ((status < 0) && (status != -ENOENT)) {
@@ -1035,13 +1049,25 @@
 	/* In case we need to overwrite an existing file, we blow it
 	 * away first */
 	if (new_de) {
+		if (newfe_lockid != GET_INODE_FEOFF(new_inode))
+			BUG();
+
+		status = ocfs_read_bh(osb, newfe_lockid, &newfe_bh, 
+				      OCFS_BH_CACHED, new_inode);
+		if (status < 0) {
+			LOG_ERROR_STATUS(status);
+			goto finally;
+		}
+
 		/* TODO: change this block to the ext3-style orphan model */
 		newfe = OCFS_BH_GET_DATA_READ(newfe_bh);
 		if (newfe->attribs & OCFS_ATTRIB_DIRECTORY)
 			newfe_flags = FLAG_DIR;
 		newfe_flags |= FLAG_FILE_DELETE;
-		newfe_lockid = newfe->this_sector;
 
+		if (newfe_lockid != newfe->this_sector)
+			BUG();
+
 		OCFS_BH_PUT_DATA(newfe_bh);
 		newfe = NULL;
 
@@ -1227,8 +1253,13 @@
 				  newfe_flags, NULL, new_inode);
 	}
 
-	if (new_inode)
+	double_up(&OCFS_I(old_dir)->ip_io_sem, &OCFS_I(new_dir)->ip_io_sem);
+	up(&OCFS_I(old_inode)->ip_io_sem);
+
+	if (new_inode) {
+		up(&OCFS_I(new_inode)->ip_io_sem);
 		iput(new_inode);
+	}
 
 	if (tmpfe)
 		ocfs_release_file_entry (tmpfe);
@@ -1290,17 +1321,23 @@
 	sb = dir->i_sb;
 	osb = OCFS_SB(sb);
 
+	down(&OCFS_I(dir)->ip_io_sem);
+
 	inode = new_inode (sb);
 	if (IS_ERR (inode)) {
 		status = PTR_ERR(inode);
+		inode = NULL;
 		LOG_ERROR_STR("new_inode failed!");
 		goto bail;
 	}
 
 	if (ocfs_inode_init_private(inode)) {
 		LOG_ERROR_STATUS(status = -ENOMEM);
+		iput(inode);
+		inode = NULL;
 		goto bail;
 	}
+	down(&OCFS_I(inode)->ip_io_sem);
 
 	l = strlen (symname) + 1;
 	newsize = l - 1;
@@ -1394,6 +1431,10 @@
 	}
 
 bail:
+	if (inode)
+		up(&OCFS_I(inode)->ip_io_sem);
+	up(&OCFS_I(dir)->ip_io_sem);
+
 	if (new_fe_bh) {
 		if (fe)
 			OCFS_BH_PUT_DATA(new_fe_bh);

Modified: trunk/src/nm.c
===================================================================
--- trunk/src/nm.c	2004-06-08 22:18:55 UTC (rev 1023)
+++ trunk/src/nm.c	2004-06-08 22:52:28 UTC (rev 1024)
@@ -67,7 +67,7 @@
 	"REMASTER_THIS",        // remaster lock to me
 	"REMASTER_REQUESTOR",   // remaster lock to requestor
 	"DROP_READONLY",        // RO cachelock needs to convert to RW
-	"READONLY"
+	"READONLY",
 };
 #endif
 
@@ -522,6 +522,11 @@
 			my_node_wins = (node_num < osb->node_num);
 	}
 
+//	if (flags & FLAG_DROP_LINK) {
+//		vote_type = RELEASE_DENTRY;
+//		goto done;
+//	}
+
 	if (flags & FLAG_DROP_READONLY) {
 		vote_type = DROP_READONLY;
 		goto done;
@@ -632,7 +637,7 @@
 	int inc_inode_seq = 0;
 	int disk_vote = (ctxt->request_method == DISK_VOTE);
 	int comm_vote = (ctxt->request_method == COMM_VOTE);
-	int have_i_sem = 0;
+	int have_io_sem = 0;
 	ocfs_publish *publish = (disk_vote ? ctxt->u.publish : NULL);
 	ocfs_dlm_msg *dlm_msg = (comm_vote ? ctxt->u.dlm_msg : NULL);
 	__u32 node_num = ctxt->node_num;
@@ -683,7 +688,7 @@
 	if ((flags & (FLAG_FILE_DELETE | FLAG_FILE_RENAME)) && (flags & FLAG_RELEASE_LOCK))
 		inode = NULL;
 	else {
-		inode = ocfs_iget(osb, lock_id, NULL);
+		inode = ocfs_iget(osb, lock_id);
 		if (!inode) {
 			status = -EFAIL;
 			LOG_ERROR_ARGS("Could not find inode: lock_id = %llu, "
@@ -693,8 +698,8 @@
 			goto leave;
 		}
 
-		down(&inode->i_sem);
-		have_i_sem = 1;
+		down(&OCFS_I(inode)->ip_io_sem);
+		have_io_sem = 1;
 
 		lockres = GET_INODE_LOCKRES(inode);
 		status = ocfs_update_lockres (osb, lock_id, NULL, NULL,
@@ -768,7 +773,21 @@
 			}
 			vote_response = FLAG_VOTE_OIN_UPDATED;
 			break;
-		
+
+#if 0
+		case RELEASE_DENTRY:
+			if (!inode)
+				BUG();
+
+			/* we always vote yes on this one. */
+			vote_response = FLAG_VOTE_NODE;
+			printk("going to prune dentries for inode %lu\n",
+			       inode->i_ino);
+
+			d_prune_aliases (inode);
+			inode->i_nlink--;
+			break;
+#endif
 		case DELETE_RENAME_RELEASE:
 			/* ACK and done */
 			vote_response = FLAG_VOTE_NODE;
@@ -838,15 +857,15 @@
 #else
 					fsync_inode_buffers (inode);
 #endif
-					up(&inode->i_sem);
-					have_i_sem = 0;
+					up(&OCFS_I(inode)->ip_io_sem);
+					have_io_sem = 0;
 				}
 				break;
 			}
 
 			if (inode) {
-				up(&inode->i_sem);
-				have_i_sem = 0;
+				up(&OCFS_I(inode)->ip_io_sem);
+				have_io_sem = 0;
 			}
 
 			/* Set the always update master on open flag */
@@ -949,7 +968,7 @@
 				lockres->readonly_map &= ~(1 << osb->node_num);
 				if (lockres->readonly_map != 0ULL) {
 					OCFS_ASSERT(lockres->readonly_node == osb->node_num);
-#warning need to make sure inode is not NULL in process_vote
+					OCFS_ASSERT(inode);
 					status = ocfs_drop_readonly_cache_lock(osb, inode, 1);
 					if (status < 0)
 						LOG_ERROR_STATUS(status);
@@ -1071,9 +1090,9 @@
 			 * the actual IO that a readdir may have in 
 			 * progress, if it's possible to have a corrupt 
 			 * readdir.  for now, skip it.
-			 * NOTE: can't just take i_sem because lock order
-			 * needs to be i_sem->lockres... would have to 
-			 * drop lockres, take i_sem, take lockres, then 
+			 * NOTE: can't just take io_sem because lock order
+			 * needs to be io_sem->lockres... would have to 
+			 * drop lockres, take io_sem, take lockres, then 
 			 * recheck all the conditions to see if still 
 			 * appropriate, then do the work and drop both.
 			 * seems like a lot of work.  almost as many lines
@@ -1228,8 +1247,8 @@
 	if (inode) {
 		if (inc_inode_seq)
 			ocfs_inc_inode_seq(osb, inode, 1);
-		if (have_i_sem)
-			up(&inode->i_sem);
+		if (have_io_sem)
+			up(&OCFS_I(inode)->ip_io_sem);
 		iput(inode);
 	}
 
@@ -1356,6 +1375,7 @@
 
 	if (yield) {
 		/* this will wait until process_vote gets to the release */
+		down(&OCFS_I(inode)->ip_io_sem);
 		ocfs_acquire_lockres(lockres, 0); // ocfs_process_vote ocfs_acquire_lock
 	}
 
@@ -1390,6 +1410,9 @@
 			if (yield) {
 				/* from nm thread, give some time to waiters */
 				ocfs_release_lockres(lockres); // ocfs_process_vote ocfs_acquire_lock
+				up(&OCFS_I(inode)->ip_io_sem);
+
+				down(&OCFS_I(inode)->ip_io_sem);
 				ocfs_acquire_lockres(lockres, 0); // ocfs_process_vote ocfs_acquire_lock
 			}
 			continue;
@@ -1406,8 +1429,10 @@
 	lockres->lock_state &= ~FLAG_READONLY_DROPPING;
 
 leave:
-	if (yield)
+	if (yield) {
 		ocfs_release_lockres(lockres); // ocfs_process_vote ocfs_acquire_lock
+		up(&OCFS_I(inode)->ip_io_sem);
+	}
 
 	if (inode)
 		iput(inode);

Modified: trunk/src/super.c
===================================================================
--- trunk/src/super.c	2004-06-08 22:18:55 UTC (rev 1023)
+++ trunk/src/super.c	2004-06-08 22:52:28 UTC (rev 1024)
@@ -206,7 +206,7 @@
 
 	/* the vol bitmap */
 	sys_off = OCFS_BITMAP_LOCK_OFFSET;
-	new = ocfs_iget(osb, sys_off, NULL);
+	new = ocfs_iget(osb, sys_off);
 	if (!new) {
 		status = -EINVAL;
 		LOG_ERROR_STATUS(status);
@@ -218,7 +218,7 @@
 	sys_off = osb->vol_layout.root_int_off + 
 		((OCFS_FILE_FILE_ALLOC_BITMAP + osb->node_num) 
 		 * osb->sect_size);
-	new = ocfs_iget(osb, sys_off, NULL);
+	new = ocfs_iget(osb, sys_off);
 	if (!new) {
 		status = -EINVAL;
 		LOG_ERROR_STATUS(status);
@@ -230,7 +230,7 @@
 	sys_off = osb->vol_layout.root_int_off + 
 		((OCFS_INODE_BITMAP + osb->node_num) 
 		 * osb->sect_size);
-	new = ocfs_iget(osb, sys_off, NULL);
+	new = ocfs_iget(osb, sys_off);
 	if (!new) {
 		status = -EINVAL;
 		LOG_ERROR_STATUS(status);
@@ -241,7 +241,7 @@
 	/* journal file */
 	sys_off = osb->vol_layout.root_int_off + 
 		((OCFS_JOURNAL_FILE + osb->node_num) * osb->sect_size);
-	new = ocfs_iget(osb, sys_off, NULL);
+	new = ocfs_iget(osb, sys_off);
 	if (!new) {
 		status = -EINVAL;
 		LOG_ERROR_STATUS(status);
@@ -993,7 +993,7 @@
 	osb->vol_state = VOLUME_ENABLED;
 	up (&(osb->osb_res));
 
-	inode = ocfs_iget(osb, OCFS_ROOT_INODE_FE_OFF(osb), NULL);
+	inode = ocfs_iget(osb, OCFS_ROOT_INODE_FE_OFF(osb));
 	if (!inode) {
 		status = -EIO;
 		LOG_ERROR_STATUS (status);
@@ -1389,7 +1389,6 @@
 	init_MUTEX (&(osb->osb_res));
 	init_MUTEX (&(osb->recovery_lock));
 	init_MUTEX (&(osb->comm_lock));
-	init_MUTEX (&(osb->trans_lock));
 	init_MUTEX (&(osb->extend_sem));
 	init_MUTEX (&(osb->cfg_lock));
 	init_MUTEX (&(osb->vote_sem));

Modified: trunk/src/sysfile.c
===================================================================
--- trunk/src/sysfile.c	2004-06-08 22:18:55 UTC (rev 1023)
+++ trunk/src/sysfile.c	2004-06-08 22:52:28 UTC (rev 1024)
@@ -249,6 +249,7 @@
 	int numbhs, i;
 	char *data;
 	struct buffer_head **bhs;
+	struct inode *ext_alloc_inode = NULL;
 
 	LOG_ENTRY_ARGS ("(FileId = %u, Size = %llu)\n", FileId, FileSize);
 
@@ -300,6 +301,14 @@
 		    osb->vol_layout.data_start_off;
 		actualLength = numClusterAlloc * osb->vol_layout.cluster_size;
 
+		ext_alloc_inode = igrab(osb->system_inodes[FILE_ALLOC_BITMAP_SYSTEM_INODE]);
+		if (!ext_alloc_inode) {
+			status = -EFAIL;
+			LOG_ERROR_STATUS(status);
+			goto leave;
+		}
+
+		ocfs_handle_add_inode(handle, ext_alloc_inode);
 		status = ocfs_allocate_extent (osb, fe_bh, handle,  
 					       actualDiskOffset, actualLength, NULL);
 		if (status < 0) {
@@ -366,6 +375,9 @@
 		OCFS_BH_PUT_DATA(fe_bh);
 	if (local_fe)
 		brelse(fe_bh);
+	if (ext_alloc_inode)
+		iput(ext_alloc_inode);
+
 	LOG_EXIT_STATUS (status);
 	return status;
 }				/* ocfs_extend_system_file */