[Ocfs2-commits] mfasheh commits r1106 - in trunk: . src src/inc

Tue Jun 15 18:55:35 CDT 2004

Author: mfasheh
Date: 2004-06-15 17:55:33 -0500 (Tue, 15 Jun 2004)
New Revision: 1106

Modified:
   trunk/TODO
   trunk/src/alloc.c
   trunk/src/file.c
   trunk/src/inc/ocfs.h
   trunk/src/inc/proto.h
   trunk/src/inode.c
   trunk/src/journal.c
   trunk/src/lockres.c
   trunk/src/namei.c
   trunk/src/nm.c
Log:
* turn ip_io_sem into an rwsem.

* finish locking down disk I/O. I believe I've got all paths now...

* turn inode_extend_sem into an rwsem. This way multiple people can be
  doing reads/writes to a file at the same time (it was blocking
  otherwise). Fix a deadlock that can happen between most other locks
  and the inode_extend_sem by taking it outside of any other lock
  context, the only exception being the lockres in process_vote.

* make lockres locking semaphore-based instead of our lame recursive
  lock. There are no more recursive locks in ocfs.

* increment i_count on an inode when we put it on the recovery list
  and iput it when we take it off. This needs some testing...



Modified: trunk/TODO
===================================================================

--- trunk/TODO	2004-06-15 22:29:45 UTC (rev 1105)
+++ trunk/TODO	2004-06-15 22:55:33 UTC (rev 1106)
@@ -20,16 +20,9 @@
 	DISK_LOCK_SEQNUM(fe) = changeSeqNum;
   code as it is equally useless.
 
-* get rid of as much of lockres as possible.
-
 * make slabs for: ocfs_journal_handle, and ocfs_journal_copyout and maybe
   ocfs_journal_lock
 
-* when we put the inode on the recovery list we should inc i_count, and just
-  be sure to iput it when we remove it off that list, and clean up the lists
-  during shutdown (before we start doing all our iputs otherwise we'll leak
-  those inodes)
-
 * fops, iops on bitmap file (and maybe other system files) should probably be
   different and / or special cased. In a related todo: Get rid of the 
   INODE_JOURNAL flag on our inodes and use the system file flag instead.

Modified: trunk/src/alloc.c
===================================================================
--- trunk/src/alloc.c	2004-06-15 22:29:45 UTC (rev 1105)
+++ trunk/src/alloc.c	2004-06-15 22:55:33 UTC (rev 1106)
@@ -321,7 +321,7 @@
 				LOG_ERROR_STATUS (status);
 				goto abort;
 			}
-			down(&OCFS_I(extnode_inode[i])->ip_io_sem);
+			down_write(&OCFS_I(extnode_inode[i])->ip_io_sem);
 
 			status = ocfs_acquire_lock (osb, 
 						    OCFS_DLM_EXCLUSIVE_LOCK,
@@ -329,7 +329,7 @@
 						    &ugly_hack_bh, 
 						    extnode_inode[i]);
 			if (status < 0) {
-				up(&OCFS_I(extnode_inode[i])->ip_io_sem);
+				up_write(&OCFS_I(extnode_inode[i])->ip_io_sem);
 				iput(extnode_inode[i]);
 				extnode_inode[i] = NULL;
 				if (status != -EINTR)
@@ -350,13 +350,13 @@
 			LOG_ERROR_STATUS (status);
 			goto abort;
 		}
-		down(&OCFS_I(vol_inode)->ip_io_sem);
+		down_write(&OCFS_I(vol_inode)->ip_io_sem);
 
 		status = ocfs_acquire_lock (osb, OCFS_DLM_EXCLUSIVE_LOCK,
 					    FLAG_FILE_CREATE,
 					    &globalbh, vol_inode);
 		if (status < 0) {
-			up(&OCFS_I(vol_inode)->ip_io_sem);
+			up_write(&OCFS_I(vol_inode)->ip_io_sem);
 			iput(vol_inode);
 			vol_inode = NULL;
 
@@ -442,7 +442,7 @@
 	if (extnode_inode) {
 		for (i = 0; i < OCFS_MAXIMUM_NODES; i++) {
 			if (extnode_inode[i]) {
-				up(&OCFS_I(extnode_inode[i])->ip_io_sem);
+				up_write(&OCFS_I(extnode_inode[i])->ip_io_sem);
 				iput(extnode_inode[i]);
                         }
 		}
@@ -450,7 +450,7 @@
 	}
 
 	if (vol_inode) {
-		up(&OCFS_I(vol_inode)->ip_io_sem);
+		up_write(&OCFS_I(vol_inode)->ip_io_sem);
 		iput(vol_inode);
 	}
 
@@ -2422,7 +2422,9 @@
  * decoded and updated in the extent map.
  *
  */
-int ocfs_lookup_file_allocation (ocfs_super * osb, __s64 Vbo, __s64 * Lbo, __u32 sectors, u32 *sector_count, struct inode *inode)
+int ocfs_lookup_file_allocation (ocfs_super * osb, __s64 Vbo, __s64 * Lbo, 
+				 __u32 sectors, u32 *sector_count, 
+				 struct inode *inode, int locked)
 {
 	int status = -EFAIL;
 	ocfs_file_entry *fe = NULL;
@@ -2434,28 +2436,76 @@
 	__s64 localVbo;
 	__u64 cnt;
 	__u32 NumIndex;
+	int have_io_sem = 0;
 
-	LOG_ENTRY_ARGS("(vbo=%llu, sectors=%u, inode=%llu)\n", Vbo, sectors, GET_INODE_FEOFF(inode));
+	LOG_ENTRY_ARGS("(vbo=%llu, sectors=%u, inode=%llu)\n", Vbo, sectors, 
+		       GET_INODE_FEOFF(inode));
 
 	OCFS_ASSERT (osb);
 	OCFS_ASSERT (inode);
 
-	if (INODE_JOURNAL(inode) || Vbo < OCFS_I(inode)->alloc_size) {
-		if (ocfs_lookup_extent_map_entry (osb, &(OCFS_I(inode)->map), 
-			   Vbo, Lbo, &cnt, &NumIndex) && cnt >= sectors) {
-			status = 0;
-			goto finally;
-		}
+	/* for direct io we want to skip all locking. If you're a
+	 * system file, ip_io_sem should already have been taken
+	 * before coming here.  */
+	if (((OCFS_I(inode)->oin_flags & OCFS_OIN_OPEN_FOR_DIRECTIO) 
+	     || OCFS_I(inode)->flags & OCFS_INODE_SYSTEM_FILE)
+	    && (!locked))
+		printk("ocfs2: inode %lu, locked = %d, open direct = %u, "
+		       "sysfile = %u\n", inode->i_ino, locked,
+		       (OCFS_I(inode)->oin_flags & OCFS_OIN_OPEN_FOR_DIRECTIO),
+		       (OCFS_I(inode)->flags & OCFS_INODE_SYSTEM_FILE));
+
+	if (!locked)
+		down(&(OCFS_I(inode)->priv_sem));
+
+check_alloc_sz:
+	status = 0;
+
+	if (Vbo < OCFS_I(inode)->alloc_size)
+		status = ocfs_lookup_extent_map_entry(osb, 
+						      &(OCFS_I(inode)->map), 
+						      Vbo, Lbo, &cnt, 
+						      &NumIndex);
+
+	if (!locked)
+		up(&(OCFS_I(inode)->priv_sem));
+
+	if (status && cnt >= sectors) {
+		/* Found a what we were looking for. */
+		status = 0;
+		goto finally;
 	}
 
+	/* Ok, we didn't find it in the extent map (or we need to
+	 * refresh as alloc sizes don't match up. */
+
+	if (!locked) {
+		/* yay for lock ordering. We must take ip_io_sem
+		 * before priv_sem. */
+		down_read(&OCFS_I(inode)->ip_io_sem);
+		down(&OCFS_I(inode)->priv_sem);
+	}
+
+	/* Make sure we still need to hit disk. */
+	if (Vbo >= OCFS_I(inode)->alloc_size) {
+		if (!locked)
+			up_read(&OCFS_I(inode)->ip_io_sem);
+		goto check_alloc_sz;
+	}
+	up(&OCFS_I(inode)->priv_sem);
+
+	if (!locked)
+		have_io_sem = 1;
+
 	remainingLength = sectors;
 	localVbo = Vbo;
-	    		
+
 	/*  We are looking for a Vbo, but it is not in the Map or not Valid. */
 	/*  Thus we have to go to the disk, and update the Map */
 
 	/* Read the file Entry corresponding to this */
-	status = ocfs_read_bh(osb, GET_INODE_FEOFF(inode), &fe_bh, OCFS_BH_COND_CACHED, inode);
+	status = ocfs_read_bh(osb, GET_INODE_FEOFF(inode), &fe_bh, 
+			      OCFS_BH_COND_CACHED, inode);
 	if (status < 0) {
 		LOG_ERROR_STATUS (status);
 		goto finally;
@@ -2468,7 +2518,7 @@
 		goto finally;
 	}
 
-	if (!INODE_JOURNAL(inode) && Vbo >= (__s64) fe->alloc_size) {
+	if (Vbo >= (__s64) fe->alloc_size) {
 		LOG_ERROR_ARGS ("vbo=%llu, fe->alloc_sz=%llu alloc_size=%llu", 
 				Vbo, fe->alloc_size,
 				OCFS_I(inode)->alloc_size);
@@ -2477,8 +2527,12 @@
 	}
 
 	if (fe->local_ext) {
+		if (!locked)
+			down(&(OCFS_I(inode)->priv_sem));
 		status = ocfs_update_extent_map (osb, &OCFS_I(inode)->map, fe,
 						 NULL, NULL, LOCAL_EXT);
+		if (!locked)
+			up(&(OCFS_I(inode)->priv_sem));
 		if (status < 0) {
 			LOG_ERROR_STATUS (status);
 			goto finally;
@@ -2487,7 +2541,8 @@
 		/* Extents are branched and we are no longer using
 		 * Local Extents for this File Entry. */
 
-		status = ocfs_get_leaf_extent (osb, fe, localVbo, &ext_bh, inode);
+		status = ocfs_get_leaf_extent(osb, fe, localVbo, &ext_bh, 
+					      inode);
 		if (status < 0) {
 			LOG_ERROR_STATUS (status);
 			goto finally;
@@ -2495,8 +2550,16 @@
 
 		OcfsExtent = OCFS_BH_GET_DATA_READ(ext_bh);
 		while (1) {
-			status = ocfs_update_extent_map (osb, &OCFS_I(inode)->map, OcfsExtent,
-						 &localVbo, &remainingLength, NONLOCAL_EXT);
+			if (!locked)
+				down(&(OCFS_I(inode)->priv_sem));
+			status = ocfs_update_extent_map (osb, 
+							 &OCFS_I(inode)->map, 
+							 OcfsExtent,
+							 &localVbo, 
+							 &remainingLength, 
+							 NONLOCAL_EXT);
+			if (!locked)
+				up(&(OCFS_I(inode)->priv_sem));
 			if (status < 0) {
 				LOG_ERROR_STATUS(status);
 				goto finally;
@@ -2523,7 +2586,8 @@
 
 				status = ocfs_read_bh(osb, next_data_ext, 
 						      &ext_bh, 
-						      OCFS_BH_COND_CACHED, inode);
+						      OCFS_BH_COND_CACHED, 
+						      inode);
 				if (status < 0) {
 					LOG_ERROR_STATUS(status);
 					goto finally;
@@ -2539,17 +2603,24 @@
 		}
 	}
 
-	if (ocfs_lookup_extent_map_entry (osb, &(OCFS_I(inode)->map), Vbo, Lbo, &cnt, &NumIndex) &&
-	    cnt >= sectors) {
+	if (!locked)
+		down(&(OCFS_I(inode)->priv_sem));
+	if (ocfs_lookup_extent_map_entry (osb, &(OCFS_I(inode)->map), Vbo, Lbo,
+					  &cnt, &NumIndex) && cnt >= sectors) {
 		status = 0;
 	} else
 		status = -EFAIL;
+	if (!locked)
+		up(&(OCFS_I(inode)->priv_sem));
 
 	/* want to return cnt only if asked for it */
 	if (sector_count)
 		*sector_count = (u32) cnt;
 
 finally:
+	if (have_io_sem)
+		up_read(&OCFS_I(inode)->ip_io_sem);
+
 	if (fe_bh) {
 		if (fe)
 			OCFS_BH_PUT_DATA(fe_bh);
@@ -2565,7 +2636,6 @@
 	return (status);
 }				/* ocfs_lookup_file_allocation */
 
-
 /* ocfs_get_leaf_extent()
  * '*data_exent_bh' should be NULL.
  */
@@ -2917,11 +2987,6 @@
 	/* Allocate a block of size blocksize from the relevant file/bitmap */
 	OCFS_ASSERT (blockSize);
 
-	if (down_trylock(&OCFS_I(inode)->ip_io_sem) == 0) {
-		LOG_TRACE_ARGS("Uhoh, asking me to allocate on an unlocked system file! (type = %u, i_ino = %lu)\n", Type, inode->i_ino);
-		BUG();
-	}
-
 	status = ocfs_acquire_lock (osb, OCFS_DLM_EXCLUSIVE_LOCK,
 			     FLAG_FILE_CREATE, &bh, inode);
 	if (status < 0) {

Modified: trunk/src/file.c
===================================================================
--- trunk/src/file.c	2004-06-15 22:29:45 UTC (rev 1105)
+++ trunk/src/file.c	2004-06-15 22:55:33 UTC (rev 1106)
@@ -165,7 +165,8 @@
 	int truncate_pages = 0;
 
 	LOG_ENTRY_ARGS ("(0x%p, 0x%p, '%*s')\n", inode, file, 
-			file->f_dentry->d_name.len, file->f_dentry->d_name.name);
+			file->f_dentry->d_name.len, 
+			file->f_dentry->d_name.name);
 
 	osb = OCFS_SB(inode->i_sb);
 
@@ -177,7 +178,7 @@
 
 	/* kch - for an open request we are already given the 
 	* inode, and therefore we are given the oin too */
-	down (&(OCFS_I(inode)->ip_io_sem));
+	down_write (&(OCFS_I(inode)->ip_io_sem));
 	down (&(OCFS_I(inode)->priv_sem));
 	have_oin_sem = 1;
 
@@ -186,14 +187,14 @@
 		status = ocfs_read_bh(osb, GET_INODE_FEOFF(inode), &fe_bh, 
 				      OCFS_BH_CACHED, inode);
 		if (status < 0) {
-			up(&OCFS_I(inode)->ip_io_sem);
+			up_write(&OCFS_I(inode)->ip_io_sem);
 			LOG_ERROR_STATUS(status);
 			goto leave;
 		}
 
 		status = ocfs_inode_notify_open(osb, fe_bh, NULL, inode);
 		if (status < 0) {
-			up(&OCFS_I(inode)->ip_io_sem);
+			up_write(&OCFS_I(inode)->ip_io_sem);
 			LOG_ERROR_STATUS(status);
 			if (status != -EINTR) {
 				LOG_ERROR_ARGS("Open request made for nonexistent "
@@ -207,7 +208,7 @@
 
 		status = ocfs_inode_fill_ext_map (osb, fe_bh, inode);
 		if (status < 0) {
-			up(&OCFS_I(inode)->ip_io_sem);
+			up_write(&OCFS_I(inode)->ip_io_sem);
 			LOG_ERROR_STATUS(status);
 			goto leave;
 		}
@@ -217,13 +218,13 @@
 		status = ocfs_verify_update_inode (osb, inode, &truncate_pages,
 						   0);
 		if (status < 0) {
-			up(&OCFS_I(inode)->ip_io_sem);
+			up_write(&OCFS_I(inode)->ip_io_sem);
 			LOG_ERROR_STATUS (status);
 			goto leave;
 		}
 	}
 
-	up(&OCFS_I(inode)->ip_io_sem);
+	up_write(&OCFS_I(inode)->ip_io_sem);
 	/* yes, hold onto priv_sem. */
 
 	if (OCFS_I(inode)->open_hndl_cnt > 0) {
@@ -321,6 +322,7 @@
 	ocfs_super * osb;
 	struct dentry *dentry;
 	int last_close = 0;
+	int dec = 0;
 
 	LOG_ENTRY_ARGS ("(0x%p, 0x%p, '%*s')\n", inode, file,
 			file->f_dentry->d_name.len, file->f_dentry->d_name.name);
@@ -368,7 +370,9 @@
 			OCFS_I(inode)->num_extends = 0;
 			list_del(&OCFS_I(inode)->recovery_list);
 			INIT_LIST_HEAD(&OCFS_I(inode)->recovery_list);
-			up(&OCFS_I(inode)->inode_extend_sem);
+			up_write(&OCFS_I(inode)->inode_extend_sem);
+
+			dec = 1;
 		}
 		spin_unlock(&oin_num_ext_lock);
 		up(&recovery_list_sem);
@@ -384,6 +388,9 @@
 bail:
 //	ocfs_bh_sem_hash_cleanup_pid(ocfs_getpid());
 
+	if (dec)
+		iput(inode);
+
 	LOG_EXIT_INT (0);
 	return 0;
 }				/* ocfs_file_release */
@@ -751,12 +758,12 @@
 
 	if (OCFS_I(inode)->needs_verification) {
 		LOG_TRACE_STR ("OIN_NEEDS_VERIFICATION");
-		down (&(OCFS_I(inode)->ip_io_sem));
+		down_read (&(OCFS_I(inode)->ip_io_sem));
 		down (&(OCFS_I(inode)->priv_sem));
 		status = ocfs_verify_update_inode (osb, inode, &needs_trunc, 
 						   0);
 		up (&(OCFS_I(inode)->priv_sem));
-		up (&(OCFS_I(inode)->ip_io_sem));
+		up_read (&(OCFS_I(inode)->ip_io_sem));
 		if (needs_trunc)
 			ocfs_truncate_inode_pages(inode, 0);
 		if (status < 0) {
@@ -793,9 +800,9 @@
 		LOG_TRACE_ARGS
 		    ("Will need more allocation: have=%llu, need=%llu\n",
 		     OCFS_I(inode)->alloc_size, newsize);
-		down(&OCFS_I(inode)->ip_io_sem);
+		down_write(&OCFS_I(inode)->ip_io_sem);
 		status = ocfs_extend_file (osb, newsize, GET_INODE_FEOFF(inode), NULL, inode, NULL);
-		up(&OCFS_I(inode)->ip_io_sem);
+		up_write(&OCFS_I(inode)->ip_io_sem);
 		if (status < 0) {
 			if (status != -EINTR && status != -ENOSPC) {
 				LOG_ERROR_STATUS (status);
@@ -882,12 +889,12 @@
 
 	if (OCFS_I(inode)->needs_verification) {
 		/* yay, locking hell! */
-		down(&OCFS_I(inode)->ip_io_sem);
+		down_read(&OCFS_I(inode)->ip_io_sem);
 		down (&(OCFS_I(inode)->priv_sem));
 		status = ocfs_verify_update_inode (osb, inode, &needs_trunc, 
 						   0);
 		up (&(OCFS_I(inode)->priv_sem));
-		up(&OCFS_I(inode)->ip_io_sem);
+		up_read(&OCFS_I(inode)->ip_io_sem);
 		if (needs_trunc)
 			ocfs_truncate_inode_pages(inode, 0);
 		if (status < 0) {
@@ -1331,7 +1338,7 @@
 
 	osb = OCFS_SB(inode->i_sb);
 
-	down(&OCFS_I(inode)->ip_io_sem);
+	down_write(&OCFS_I(inode)->ip_io_sem);
 
 	if (!dentry->d_parent || !dentry->d_parent->d_inode) {
 		LOG_ERROR_STR ("bad inode or root inode");
@@ -1432,7 +1439,7 @@
 	inode_setattr (inode, attr);
 
 bail:
-	up(&OCFS_I(inode)->ip_io_sem);
+	up_write(&OCFS_I(inode)->ip_io_sem);
 
 #ifndef BH_SEM_LEAK_CHECKING
 	if (error < 0)

Modified: trunk/src/inc/ocfs.h
===================================================================
--- trunk/src/inc/ocfs.h	2004-06-15 22:29:45 UTC (rev 1105)
+++ trunk/src/inc/ocfs.h	2004-06-15 22:55:33 UTC (rev 1106)
@@ -1084,11 +1084,9 @@
 	__u32 master_node_num;	/* Master Node */
 	__u32 lock_state;
 	__u32 lock_holders;
-	__u8 in_use;
 	__u8 lock_type;
-	int thread_id;		// XXX
 	atomic_t lr_ref_cnt;	/* When 0, freed */  // XXX
-	spinlock_t lock_mutex;  // XXX
+	struct semaphore lock_mutex;
 	__u32 readonly_node;
 	__u64 readonly_map;
 	__u64 oin_openmap;
@@ -1118,7 +1116,7 @@
 	/* This protects io on the metadata buffers related to this
 	 * inode. We also consider an "abort_trans" an I/O as it will
 	 * revert the buffer back to a previous state. */
-	struct semaphore  ip_io_sem;
+	struct rw_semaphore  ip_io_sem;
 
 	/* Used by the journalling code to attach an inode to a
 	 * handle.  These are protected by ip_io_sem in order to lock
@@ -1128,7 +1126,7 @@
 	struct _ocfs_journal_handle *ip_handle;
 
 	/* inode_extend_sem locks out extends on behalf of other nodes. */
-	struct semaphore  inode_extend_sem;
+	struct rw_semaphore  inode_extend_sem;
 
 	struct list_head  recovery_list; /* protected by recovery_list_sem */
 	__u32             num_extends; /* protected by oin_num_ext_lock */

Modified: trunk/src/inc/proto.h
===================================================================
--- trunk/src/inc/proto.h	2004-06-15 22:29:45 UTC (rev 1105)
+++ trunk/src/inc/proto.h	2004-06-15 22:55:33 UTC (rev 1106)
@@ -63,8 +63,9 @@
 			 struct inode *inode);
 int ocfs_load_local_alloc(ocfs_super *osb);
 int ocfs_lookup_file_allocation(ocfs_super *osb, __s64 Vbo,
-				__s64 *Lbo, __u32 sectors,
-				u32 *sector_count, struct inode *inode);
+				 __s64 *Lbo, __u32 sectors,
+				 u32 *sector_count, struct inode *inode, 
+				 int locked);
 int ocfs_process_bitmap_free_head(ocfs_super *osb,
 				  ocfs_bitmap_free_head *f);
 int ocfs_recover_local_alloc(ocfs_super *osb, int node_num);

Modified: trunk/src/inode.c
===================================================================
--- trunk/src/inode.c	2004-06-15 22:29:45 UTC (rev 1105)
+++ trunk/src/inode.c	2004-06-15 22:55:33 UTC (rev 1106)
@@ -314,14 +314,14 @@
 	i->flags = 0;
 	atomic_set(&i->i_clean_buffer_seq, 0);
 	init_MUTEX(&(i->priv_sem));
-	init_MUTEX(&(i->inode_extend_sem));
+	init_rwsem(&(i->inode_extend_sem));
 	i->open_hndl_cnt = 0;
 	ocfs_extent_map_init (&i->map);
 	INIT_LIST_HEAD(&i->recovery_list);
 	INIT_LIST_HEAD(&i->ip_handle_list);
 	i->ip_handle = NULL;
 
-	init_MUTEX(&i->ip_io_sem);
+	init_rwsem(&i->ip_io_sem);
 
 	/* These should be set in read_inode2. */
 	i->alloc_size = 0ULL;
@@ -717,12 +717,12 @@
 	/* take ip_io_sem on the inode, only to avoid a warning in
 	 * acquire_lockres. We can get rid of it when we get rid of
 	 * acquire_lockres */
-	down(&OCFS_I(inode)->ip_io_sem);
+	down_write(&OCFS_I(inode)->ip_io_sem);
 	if (S_ISDIR(inode->i_mode))
 		lock_flags |= FLAG_DIR;
 	status = ocfs_acquire_lock(osb, OCFS_DLM_EXCLUSIVE_LOCK, lock_flags, 
 				   &fe_bh, inode);
-	up(&OCFS_I(inode)->ip_io_sem);
+	up_write(&OCFS_I(inode)->ip_io_sem);
 	if (status < 0) {
 		/* EBUSY here is assumed to mean that other nodes are
 		 * still using the inode. We're done here though, so
@@ -788,10 +788,10 @@
 		ocfs_abort_trans(handle);
 
 	if (release_disk_lock) {
-		down(&OCFS_I(inode)->ip_io_sem);
+		down_write(&OCFS_I(inode)->ip_io_sem);
 		status = ocfs_release_lock(osb, OCFS_DLM_EXCLUSIVE_LOCK, 
 					   lock_flags, fe_bh, inode);
-		up(&OCFS_I(inode)->ip_io_sem);
+		up_write(&OCFS_I(inode)->ip_io_sem);
 		if (status < 0)
 			LOG_ERROR_STATUS(status);
 	}
@@ -1062,7 +1062,7 @@
 	}
 
 	// do we need extend sem?  no extend dlm message for dirs
-	tmperr = ocfs_lookup_file_allocation(osb, vbo, &lbo, 1, NULL, inode);
+	tmperr = ocfs_lookup_file_allocation(osb, vbo, &lbo, 1, NULL, inode,1);
 	if (tmperr < 0)
 		goto fail;
 
@@ -1119,13 +1119,19 @@
 	__s64 vbo = 0;
 	__s64 lbo = 0;
 	__u32 len;
-	int oin_locked = 0;
+	int open_direct;
 
 	LOG_ENTRY_ARGS ("(0x%p, %llu, 0x%p, %d)\n", inode,
 			(unsigned long long)iblock, bh_result, create);
 
+	open_direct = OCFS_I(inode)->oin_flags & OCFS_OIN_OPEN_FOR_DIRECTIO;
+
 	if (S_ISLNK (inode->i_mode)) {
-		err = ocfs_symlink_get_block (inode, iblock, bh_result, create);
+		/* this always does I/O for some reason. */
+		down_read(&OCFS_I(inode)->ip_io_sem);
+		err = ocfs_symlink_get_block (inode, iblock, bh_result, 
+					      create);
+		up_read(&OCFS_I(inode)->ip_io_sem);
 		goto bail;
 	}
 
@@ -1139,23 +1145,27 @@
 
 	if (!INODE_JOURNAL(inode) && vbo >= OCFS_I(inode)->alloc_size) {
 		LOG_TRACE_STR("Extending allocation");
-		err = ocfs_extend_file(osb, vbo + osb->sect_size, GET_INODE_FEOFF(inode), NULL, inode, NULL);
+		LOG_ERROR_ARGS("extending inode %lu in get_block!!\n", 
+			       inode->i_ino);
+		down_write(&OCFS_I(inode)->ip_io_sem);
+		err = ocfs_extend_file(osb, vbo + osb->sect_size, 
+				       GET_INODE_FEOFF(inode), NULL, inode, 
+				       NULL);
+		up_write(&OCFS_I(inode)->ip_io_sem);
 		if (err < 0) {
 			err = -ENOSPC;
 			LOG_ERROR_STATUS (err);
 			goto bail;
 		}
 	}
-	
-	if (!(OCFS_I(inode)->oin_flags & OCFS_OIN_OPEN_FOR_DIRECTIO)) {
-		down(&(OCFS_I(inode)->priv_sem));
-		oin_locked = 1;
-	}
 
 	len = 1;
-	down(&OCFS_I(inode)->inode_extend_sem);
-	err = ocfs_lookup_file_allocation(osb, vbo, &lbo, len, NULL, inode);
-	up(&OCFS_I(inode)->inode_extend_sem);
+	if (!open_direct)
+		down_read(&OCFS_I(inode)->inode_extend_sem);
+	err = ocfs_lookup_file_allocation(osb, vbo, &lbo, len, NULL, 
+					   inode, open_direct);
+	if (!open_direct)
+		up_read(&OCFS_I(inode)->inode_extend_sem);
 	if (err < 0) {
 		LOG_ERROR_ARGS ("vbo=%lld lbo=%lld len=%u", vbo, lbo, len);
 		goto bail;
@@ -1176,10 +1186,6 @@
 	if (err < 0)
 		err = -EIO;
 
-	if (oin_locked && !(OCFS_I(inode)->oin_flags & OCFS_OIN_OPEN_FOR_DIRECTIO)) {
-		up(&(OCFS_I(inode)->priv_sem));
-	}
-
 	LOG_EXIT_INT (err);
 	return err;
 }				/* ocfs_get_block */
@@ -1216,7 +1222,8 @@
 
 	vbo = (__s64) block << inode->i_sb->s_blocksize_bits;
 	len = 1;
-	err = ocfs_lookup_file_allocation(osb, vbo, &lbo, len, NULL, inode);
+	err = ocfs_lookup_file_allocation(osb, vbo, &lbo, len, NULL, 
+					   inode, 1);
 	if (err < 0) {
 		LOG_ERROR_ARGS ("vbo=%lld lbo=%lld len=%u", vbo,
 				lbo, len);
@@ -1255,7 +1262,8 @@
 
 	vbo = (__s64) iblock << inode->i_sb->s_blocksize_bits;
 	len = 1;
-	err = ocfs_lookup_file_allocation (osb, vbo, &lbo, len, NULL, inode);
+	err = ocfs_lookup_file_allocation(osb, vbo, &lbo, len, NULL, 
+					   inode, 1);
 	if (err < 0) {
 		LOG_ERROR_STATUS (err);
 		err = -1;
@@ -1404,7 +1412,7 @@
 	 * our logical offset */	
 	/* TODO: Try our damndest to give sizes in multiples of PAGE_SIZE */
 	status = ocfs_lookup_file_allocation(osb, vbo, &lbo, max_blocks, 
-					     &new_size, inode);
+					     &new_size, inode, 1);
 
 	/* Do whatever we need to the buffer_head */
 	if (set_new) {
@@ -1884,7 +1892,7 @@
 
 	osb = OCFS_SB(inode->i_sb);
 
-	down (&(OCFS_I(inode)->ip_io_sem));
+	down_read (&(OCFS_I(inode)->ip_io_sem));
 	down (&(OCFS_I(inode)->priv_sem));
 
 	if (INODE_DELETED(inode)) {
@@ -1913,7 +1921,7 @@
 
 bail:
 	up (&(OCFS_I(inode)->priv_sem));
-	up (&(OCFS_I(inode)->ip_io_sem));
+	up_read (&(OCFS_I(inode)->ip_io_sem));
 
 	if (needs_trunc)
 		ocfs_truncate_inode_pages(inode, 0);

Modified: trunk/src/journal.c
===================================================================
--- trunk/src/journal.c	2004-06-15 22:29:45 UTC (rev 1105)
+++ trunk/src/journal.c	2004-06-15 22:55:33 UTC (rev 1106)
@@ -196,7 +196,8 @@
 
 	atomic_inc(&inode->i_count);
 
-	down(&OCFS_I(inode)->ip_io_sem);
+	/* we're obviously changing it... */
+	down_write(&OCFS_I(inode)->ip_io_sem);
 
 	/* sanity check */
 	if (OCFS_I(inode)->ip_handle)
@@ -223,7 +224,7 @@
 		list_del(&OCFS_I(inode)->ip_handle_list);
 		INIT_LIST_HEAD(&OCFS_I(inode)->ip_handle_list);
 
-		up(&OCFS_I(inode)->ip_io_sem);
+		up_write(&OCFS_I(inode)->ip_io_sem);
 		iput(inode);
 	}
 	return;
@@ -255,7 +256,7 @@
 
 		/* The cache list holds unlocked inodes */
 		if (action == TRANS_CACHE || lock->req_io_sem)
-			down(&OCFS_I(lock->inode)->ip_io_sem);
+			down_write(&OCFS_I(lock->inode)->ip_io_sem);
 
 		/* The file may have been deleted before we got to
 		 * this lock release. If so, just skip it.  */
@@ -277,7 +278,7 @@
 		}
 
 		if (action == TRANS_CACHE || lock->req_io_sem)
-			up(&OCFS_I(lock->inode)->ip_io_sem);
+			up_write(&OCFS_I(lock->inode)->ip_io_sem);
 
 		if (lock->bh != NULL)
 			brelse(lock->bh);
@@ -882,7 +883,7 @@
 		goto done;
 	}
 
-	down(&OCFS_I(inode)->ip_io_sem);
+	down_write(&OCFS_I(inode)->ip_io_sem);
 
 
 	SET_INODE_JOURNAL(inode);
@@ -891,7 +892,7 @@
 	status = ocfs_acquire_lock (osb, OCFS_DLM_EXCLUSIVE_LOCK,
 				    FLAG_FILE_CREATE, &bh, inode);
 	if (status < 0) {
-		up(&OCFS_I(inode)->ip_io_sem);
+		up_write(&OCFS_I(inode)->ip_io_sem);
 
 		if (status != -EINTR)
 			LOG_ERROR_STR("Could not get lock on journal!");
@@ -910,7 +911,7 @@
 	if (status < 0) {
 		OCFS_BH_PUT_DATA(bh);
 		fe = NULL;
-		up(&OCFS_I(inode)->ip_io_sem);
+		up_write(&OCFS_I(inode)->ip_io_sem);
 
 		goto done;
 	}
@@ -934,7 +935,7 @@
 					DLOCK_FLAG_OPEN_MAP|DLOCK_FLAG_ADD_SELF, 
 					&bh, inode, NULL);
 	if (status < 0) {
-		up(&OCFS_I(inode)->ip_io_sem);
+		up_write(&OCFS_I(inode)->ip_io_sem);
 
 		LOG_ERROR_STATUS(status);
 		goto done;
@@ -943,7 +944,7 @@
 	LOG_TRACE_ARGS("inode->alloc_size = %llu\n", 
 		       OCFS_I(inode)->alloc_size);
 
-	up(&OCFS_I(inode)->ip_io_sem);
+	up_write(&OCFS_I(inode)->ip_io_sem);
 
 	/* call the kernels journal init function now */
 	k_journal = journal_init_inode(inode);
@@ -1047,12 +1048,12 @@
 
 	OCFS_I(inode)->open_hndl_cnt--;
 
-	down(&OCFS_I(inode)->ip_io_sem);
+	down_write(&OCFS_I(inode)->ip_io_sem);
 	/* unlock our journal */
 	status = ocfs_release_lock (osb, OCFS_DLM_EXCLUSIVE_LOCK,
 				    FLAG_FILE_CREATE, 
 				    journal->lockbh, inode);
-	up(&OCFS_I(inode)->ip_io_sem);
+	up_write(&OCFS_I(inode)->ip_io_sem);
 	if (status < 0)
 		LOG_ERROR_STATUS (status);
 
@@ -1161,7 +1162,7 @@
 	LOG_TRACE_ARGS("Force reading %u blocks\n", totalblks);
 
 	status = ocfs_lookup_file_allocation(osb, vbo, &lbo, size, NULL, 
-					     inode);
+					     inode, 1);
 	if (status < 0) {
 		LOG_ERROR_STATUS(status);
 		goto bail;
@@ -1335,7 +1336,7 @@
 		goto done;
 	}
 
-	down(&OCFS_I(inode)->ip_io_sem);
+	down_write(&OCFS_I(inode)->ip_io_sem);
 
 	SET_INODE_JOURNAL(inode);
 
@@ -1348,7 +1349,7 @@
 				    FLAG_FILE_CREATE|FLAG_FILE_RECOVERY, 
 				    &bh, inode);
 
-	up(&OCFS_I(inode)->ip_io_sem);
+	up_write(&OCFS_I(inode)->ip_io_sem);
 	if (status < 0) {
 		LOG_TRACE_ARGS("status returned from acquire_lock=%d\n", 
 			       status);
@@ -1377,12 +1378,12 @@
 	OCFS_I(inode)->alloc_size = alloc_size;
 
 	/* add this node to openmap and update disk lock */
-	down(&OCFS_I(inode)->ip_io_sem);
+	down_write(&OCFS_I(inode)->ip_io_sem);
 
 	status = ocfs_update_disk_lock (osb, 
 					DLOCK_FLAG_OPEN_MAP|DLOCK_FLAG_ADD_SELF, 
 					&bh, inode, NULL);
-	up(&OCFS_I(inode)->ip_io_sem);
+	up_write(&OCFS_I(inode)->ip_io_sem);
 	if (status < 0) {
 		LOG_ERROR_STATUS(status);
 		goto done;
@@ -1449,12 +1450,12 @@
 
 	/* drop the lock on this nodes journal */
 	if (got_lock) {
-		down(&OCFS_I(inode)->ip_io_sem);
+		down_write(&OCFS_I(inode)->ip_io_sem);
 
 		status = ocfs_release_lock(osb, OCFS_DLM_EXCLUSIVE_LOCK, 
 					   FLAG_FILE_CREATE|FLAG_FILE_RECOVERY,
 					   bh, inode);
-		up(&OCFS_I(inode)->ip_io_sem);
+		up_write(&OCFS_I(inode)->ip_io_sem);
 	}
 	if (inode)
 		iput(inode);

Modified: trunk/src/lockres.c
===================================================================
--- trunk/src/lockres.c	2004-06-15 22:29:45 UTC (rev 1105)
+++ trunk/src/lockres.c	2004-06-15 22:55:33 UTC (rev 1106)
@@ -119,10 +119,6 @@
 	return status;
 }				/* ocfs_find_update_res */
 
-
-#define ocfs_container_of(ptr, type, member) ({                      \
-        const typeof( ((type *)0)->member ) *__mptr = (ptr);    \
-         (type *)( (char *)__mptr - offsetof(type,member) );})
 /*
  * ocfs_acquire_lockres()
  *
@@ -131,68 +127,33 @@
  */
 int ocfs_acquire_lockres (ocfs_lock_res * lockres, __u32 timeout)
 {
-	int mypid;
-	unsigned long jif = 0;
-	int status = 0;
-	int cnt = 0;
-	struct inode *inode;
-	ocfs_inode_private *ip;
+	unsigned long jif;
+	int status;
 
 	LOG_ENTRY_ARGS ("(0x%p, %u)\n", lockres, timeout);
 
 	OCFS_ASSERT(lockres);
 
-	mypid = ocfs_getpid ();
+	if (!timeout) {
+		down(&lockres->lock_mutex);
+		status = 0;
+		goto bail;
+	}
 
-	if (timeout)
-		jif = jiffies + (timeout * HZ / 1000);
+	jif = jiffies + (timeout * HZ / 1000);
 
-	while (1) {
-		spin_lock (&lockres->lock_mutex);
+	while(1) {
+		if (!down_trylock(&lockres->lock_mutex)) {
+			status = 0;
+			break;
+		}
 
-		if (lockres->in_use) {
-			if (lockres->thread_id != mypid) {
-				spin_unlock (&lockres->lock_mutex);
-				LOG_ERROR_ARGS ("lockpid=%d, newpid=%d,"
-						" timedout\n",
-						lockres->thread_id, mypid);
-				BUG();
-
-				if (jif && jif < jiffies) {
-					LOG_TRACE_ARGS ("lockpid=%d, newpid=%d,"
-						" timedout\n",
-						lockres->thread_id, mypid);
-					status = -ETIMEDOUT;
-					goto bail;
-				}
-
-				if (++cnt == 10) {
-					LOG_TRACE_ARGS ("lockpid=%d, newpid=%d\n",
-						lockres->thread_id, mypid);
-					cnt = 0;
-				}
-				ocfs_sleep (OCFS_NM_HEARTBEAT_TIME / 10);
-			}
-			else  {
-				printk("lockres in_use=%d, pid=%d, mypid=%d\n", lockres->in_use, lockres->thread_id, mypid);
-				BUG();
-				lockres->in_use++;
-				spin_unlock (&lockres->lock_mutex);
-				break;
-			}
-		} else {
-			lockres->in_use = 1;
-			lockres->thread_id = mypid;
-			spin_unlock (&lockres->lock_mutex);
-			ip = ocfs_container_of(lockres, ocfs_inode_private, i_lockres);
-			inode = ip->inode;
-			if (down_trylock(&OCFS_I(inode)->ip_io_sem) == 0) {
-				LOG_ERROR_ARGS("locking lockres without io_sem! ino = %lu, offset = %llu\n", inode->i_ino, OCFS_I(inode)->feoff);
-
-				BUG();
-			}
+		if (jif < jiffies) {
+			status = -ETIMEDOUT;
 			break;
 		}
+
+		ocfs_sleep (OCFS_NM_HEARTBEAT_TIME / 10);
 	}
 
 bail:
@@ -210,20 +171,7 @@
 
 	OCFS_ASSERT(lockres);
 
-	spin_lock (&lockres->lock_mutex);
-	if (lockres->in_use == 0) {
-		LOG_ERROR_ARGS("Releasing lockres with inuse 0: 0x%p\n", lockres);
-		BUG();
-	} else {
-		if (lockres->thread_id != current->pid)
-			LOG_ERROR_ARGS("PID %d is trying to release lockres held by PID %d\n", 
-				       current->pid, lockres->thread_id);
-		lockres->in_use--;
-		if (lockres->in_use == 0) {
-			lockres->thread_id = 0;
-		}
-	}
-	spin_unlock (&lockres->lock_mutex);
+	up(&lockres->lock_mutex);
 
 	LOG_EXIT ();
 	return;
@@ -244,10 +192,9 @@
 	lockres->master_node_num = OCFS_INVALID_NODE_NUM;
 	lockres->last_upd_seq_num = 0;
 	lockres->oin_openmap = 0;
-	lockres->in_use = 0;
 	lockres->lock_state = 0;
 
-	spin_lock_init (&lockres->lock_mutex);
+	init_MUTEX(&lockres->lock_mutex);
 	atomic_set (&lockres->lr_ref_cnt, 0);
 
 	lockres->readonly_map = 0ULL;

Modified: trunk/src/namei.c
===================================================================
--- trunk/src/namei.c	2004-06-15 22:29:45 UTC (rev 1105)
+++ trunk/src/namei.c	2004-06-15 22:55:33 UTC (rev 1106)
@@ -32,8 +32,6 @@
 
 #define OCFS_DEBUG_CONTEXT    OCFS_DEBUG_CONTEXT_NAMEI
 
-extern spinlock_t oin_num_ext_lock;
-
 static int inline search_dirblock(struct buffer_head * bh, struct inode *dir, 
 				  const char *name, int namelen, 
 				  unsigned long offset, 
@@ -65,7 +63,7 @@
 			   struct inode *inode, ocfs_file_entry *fe);
 
 static struct dentry_operations ocfs_dentry_ops = {
-	.d_revalidate = ocfs_dentry_revalidate	// let's test it out!
+	.d_revalidate = ocfs_dentry_revalidate
 };
 
 static inline int ocfs_add_entry(ocfs_journal_handle *handle, 
@@ -110,11 +108,11 @@
 	LOG_TRACE_ARGS("about to call find_files_on_disk with inode=%p\n", 
 		       dir);
 
-	down(&OCFS_I(dir)->ip_io_sem);
+	down_read(&OCFS_I(dir)->ip_io_sem);
 	status = ocfs_find_files_on_disk(osb, dentry->d_name.name, 
 					 dentry->d_name.len, &fe_off, dir, 1, 
 					 &dirent_bh, &dirent);
-	up(&OCFS_I(dir)->ip_io_sem);
+	up_read(&OCFS_I(dir)->ip_io_sem);
 	if (status < 0)
 		goto bail_add;
 	
@@ -153,7 +151,7 @@
 	LOG_ENTRY_ARGS ("(0x%p, 0x%p, %d, %d, '%*s')\n", dir, dentry, mode,
 			dev, dentry->d_name.len, dentry->d_name.name);
 
-	down(&OCFS_I(dir)->ip_io_sem);
+	down_write(&OCFS_I(dir)->ip_io_sem);
 
 	/* get our super block */
 	osb = OCFS_SB(dir->i_sb);
@@ -287,7 +285,7 @@
 	if ((status < 0) && handle)
 		ocfs_abort_trans(handle);
 
-	up(&OCFS_I(dir)->ip_io_sem);
+	up_write(&OCFS_I(dir)->ip_io_sem);
 
 	if (status == -ENOSPC)
 		LOG_TRACE_STR ("Disk is full");
@@ -555,22 +553,9 @@
 	}
 	status = -EFAIL;
 
-	spin_lock(&oin_num_ext_lock);
-	if (OCFS_I(inode)->num_extends) {
-		LOG_ERROR_ARGS ("Cannot remove a file with = "
-				"%u, pending extends (fe_off "
-				"= %llu)\n", 
-				OCFS_I(inode)->num_extends,
-				fe_off);
-		spin_unlock(&oin_num_ext_lock);
-		status = -EBUSY;
-		goto bail;
-	}
-	spin_unlock(&oin_num_ext_lock);
+	down_write(&OCFS_I(dir)->ip_io_sem);
+	down_write(&OCFS_I(inode)->ip_io_sem);
 
-	down(&OCFS_I(dir)->ip_io_sem);
-	down(&OCFS_I(inode)->ip_io_sem);
-
 	handle = ocfs_start_trans(osb, OCFS_FILE_DELETE_CREDITS);
 	if (handle == NULL) {
 		LOG_ERROR_STATUS (status = -ENOMEM);
@@ -725,8 +710,8 @@
 
 	}
 
-	up(&OCFS_I(inode)->ip_io_sem);
-	up(&OCFS_I(dir)->ip_io_sem);
+	up_write(&OCFS_I(inode)->ip_io_sem);
+	up_write(&OCFS_I(dir)->ip_io_sem);
 bail:
 	if (status < 0 && status != -ENOTEMPTY && 
 	    status != -EPERM && status != -EBUSY && status != -EINTR) {
@@ -850,28 +835,27 @@
 	return(status);
 } /* ocfs_double_lock */
 
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
-static inline void double_down(struct semaphore *s1, struct semaphore *s2)
+static inline void double_down_write(struct rw_semaphore *s1, 
+				     struct rw_semaphore *s2)
 {
 	if (s1 != s2) {
 		if ((unsigned long) s1 < (unsigned long) s2) {
-			struct semaphore *tmp = s2;
+			struct rw_semaphore *tmp = s2;
 			s2 = s1; s1 = tmp;
 		}
-		down(s1);
+		down_write(s1);
 	}
-	down(s2);
+	down_write(s2);
 }
 
-static inline void double_up(struct semaphore *s1, struct semaphore *s2)
+static inline void double_up_write(struct rw_semaphore *s1, 
+				   struct rw_semaphore *s2)
 {
-	up(s1);
+	up_write(s1);
 	if (s1 != s2)
-		up(s2);
+		up_write(s2);
 }
 
-#endif
-
 #define PARENT_INO(buffer) \
 	((struct ocfs2_dir_entry *) ((char *) buffer + \
 	le16_to_cpu(((struct ocfs2_dir_entry *) buffer)->rec_len)))->inode
@@ -915,13 +899,14 @@
 
 	oldfe_lockid = GET_INODE_FEOFF(old_inode);
 
-	double_down(&OCFS_I(old_dir)->ip_io_sem, &OCFS_I(new_dir)->ip_io_sem);
-	down(&OCFS_I(old_inode)->ip_io_sem);
+	double_down_write(&OCFS_I(old_dir)->ip_io_sem, 
+			  &OCFS_I(new_dir)->ip_io_sem);
+	down_write(&OCFS_I(old_inode)->ip_io_sem);
 
 	if (new_inode) {
 		if (ocfs_inc_icount(new_inode) < 0)
 			BUG();
-		down(&OCFS_I(new_inode)->ip_io_sem);
+		down_write(&OCFS_I(new_inode)->ip_io_sem);
 	}
 
 	if (atomic_read (&old_dentry->d_count) > 2) {
@@ -943,13 +928,6 @@
 		status = -EBUSY;
 		goto bail;
 	}
-	spin_lock(&oin_num_ext_lock);
-	if (OCFS_I(old_inode)->num_extends) {
-		spin_unlock(&oin_num_ext_lock);
-		status = -EBUSY;
-		goto bail;
-	}
-	spin_unlock(&oin_num_ext_lock);
 
 	/* start our transaction */
 	handle = ocfs_start_trans(osb, OCFS_FILE_RENAME_CREDITS);
@@ -1220,11 +1198,12 @@
 				  newfe_flags, NULL, new_inode);
 	}
 
-	double_up(&OCFS_I(old_dir)->ip_io_sem, &OCFS_I(new_dir)->ip_io_sem);
-	up(&OCFS_I(old_inode)->ip_io_sem);
+	double_up_write(&OCFS_I(old_dir)->ip_io_sem, 
+			&OCFS_I(new_dir)->ip_io_sem);
+	up_write(&OCFS_I(old_inode)->ip_io_sem);
 
 	if (new_inode) {
-		up(&OCFS_I(new_inode)->ip_io_sem);
+		up_write(&OCFS_I(new_inode)->ip_io_sem);
 		iput(new_inode);
 	}
 
@@ -1283,7 +1262,7 @@
 	sb = dir->i_sb;
 	osb = OCFS_SB(sb);
 
-	down(&OCFS_I(dir)->ip_io_sem);
+	down_write(&OCFS_I(dir)->ip_io_sem);
 
 	inode = new_inode (sb);
 	if (IS_ERR (inode)) {
@@ -1299,7 +1278,7 @@
 		inode = NULL;
 		goto bail;
 	}
-	down(&OCFS_I(inode)->ip_io_sem);
+	down_write(&OCFS_I(inode)->ip_io_sem);
 
 	l = strlen (symname) + 1;
 	newsize = l - 1;
@@ -1391,8 +1370,8 @@
 
 bail:
 	if (inode)
-		up(&OCFS_I(inode)->ip_io_sem);
-	up(&OCFS_I(dir)->ip_io_sem);
+		up_write(&OCFS_I(inode)->ip_io_sem);
+	up_write(&OCFS_I(dir)->ip_io_sem);
 
 	if (new_fe_bh) {
 		if (fe)

Modified: trunk/src/nm.c
===================================================================
--- trunk/src/nm.c	2004-06-15 22:29:45 UTC (rev 1105)
+++ trunk/src/nm.c	2004-06-15 22:55:33 UTC (rev 1106)
@@ -51,6 +51,10 @@
 	int yield;
 } ocfs_ro_cache_drop_ctxt;
 
+static void ocfs_mark_inode_for_extend(ocfs_super *osb, struct inode *inode,
+				       __u32 node_num);
+static void ocfs_clear_inode_for_extend(ocfs_super *osb, struct inode *inode,
+					__u32 node_num);
 
 void ocfs_process_vote_worker(void *val);
 
@@ -576,7 +580,112 @@
 	return vote_type;
 }
 
+static void ocfs_mark_inode_for_extend(ocfs_super *osb, struct inode *inode,
+				       __u32 node_num)
+{
 
+	down(&OCFS_I(inode)->priv_sem);
+
+	/* Mark only inodes we have open; with no open handles, do nothing. */
+	if (!OCFS_I(inode)->open_hndl_cnt) {
+		up(&OCFS_I(inode)->priv_sem);
+		return;
+	}
+
+	spin_lock(&oin_num_ext_lock);
+
+	if (OCFS_I(inode)->num_extends < 0)
+		BUG();
+
+	/* An extend is already being tracked (num_extends > 0), so
+	 * inode_extend_sem is not taken here; just bump the counter. */
+	if (OCFS_I(inode)->num_extends > 0) {
+		OCFS_I(inode)->num_extends++;
+		spin_unlock(&oin_num_ext_lock);
+		up(&OCFS_I(inode)->priv_sem);
+		return;
+	}
+
+	/* First extend: we must take inode_extend_sem. That cannot be
+	 * done while holding priv_sem, so both locks are dropped here
+	 * and open_hndl_cnt is rechecked once the sem is held. */
+	spin_unlock(&oin_num_ext_lock);
+	up(&OCFS_I(inode)->priv_sem);
+
+	/* Take inode_extend_sem for writing on behalf of the remote
+	 * node. It is not released in this function on the success
+	 * path; ocfs_clear_inode_for_extend drops it when num_extends
+	 * falls back to zero, i.e. after that node's last release
+	 * broadcast. Holding it has the effect of locking out
+	 * lookup_file_allocation on this inode (per the original
+	 * author's note; that function is not visible here). */
+	down_write(&OCFS_I(inode)->inode_extend_sem);
+
+	down(&OCFS_I(inode)->priv_sem);
+	if (!OCFS_I(inode)->open_hndl_cnt) {
+		up_write(&OCFS_I(inode)->inode_extend_sem);
+		up(&OCFS_I(inode)->priv_sem);
+		return;
+	}
+
+	atomic_inc(&inode->i_count);
+
+	/* Still open. The i_count reference taken above pins the inode
+	 * while it is on node_num's recovery list, in case that node dies. */
+	down(&recovery_list_sem);
+	spin_lock(&oin_num_ext_lock);
+	OCFS_I(inode)->num_extends++;
+	list_add_tail(&OCFS_I(inode)->recovery_list, 
+		      &osb->lock_recovery_lists[node_num]);
+	spin_unlock(&oin_num_ext_lock);
+	up(&recovery_list_sem);
+
+	up(&OCFS_I(inode)->priv_sem);
+	return;
+}
+
+static void ocfs_clear_inode_for_extend(ocfs_super *osb, struct inode *inode,
+					__u32 node_num)
+{
+	int dec = 0;	/* set when the last tracked extend is cleared */
+
+	down(&OCFS_I(inode)->priv_sem);
+
+	/* With no open handles there is nothing to undo here; the close
+	 * path is expected to have cleaned up already (not shown here). */
+	if (!OCFS_I(inode)->open_hndl_cnt)
+		goto done;
+
+	down(&recovery_list_sem);
+	spin_lock(&oin_num_ext_lock);
+
+	OCFS_I(inode)->num_extends--;
+
+	if (OCFS_I(inode)->num_extends < 0)
+		BUG();	/* NOTE(review): release with no prior mark lands here? confirm */
+
+	if (!OCFS_I(inode)->num_extends) {
+		list_del(&OCFS_I(inode)->recovery_list);
+		INIT_LIST_HEAD(&OCFS_I(inode)->recovery_list);
+
+		up_write(&OCFS_I(inode)->inode_extend_sem);
+
+		dec = 1;
+	}
+
+	spin_unlock(&oin_num_ext_lock);
+	up(&recovery_list_sem);
+
+done:
+	up(&OCFS_I(inode)->priv_sem);
+
+	/* Drop the reference taken at mark time, outside the locks above. */
+	if (dec)
+		iput(inode);
+
+	return;
+}
+
 /* Search the journals committed transactions list for a given
  * inode. If it's in there, return true, zero otherwise and -1 on
  * error. Must hold the journal->commit_sem before going here! */
@@ -703,7 +812,7 @@
 		 * Please see the note in ocfs_delete_inode. */
 		osb->voting_ino = inode->i_ino;
 
-		down(&OCFS_I(inode)->ip_io_sem);
+		down_write(&OCFS_I(inode)->ip_io_sem);
 		have_io_sem = 1;
 
 		lockres = GET_INODE_LOCKRES(inode);
@@ -737,15 +846,17 @@
 		printk("Invalid request! flags = 0x%x\n", flags);
 
 #endif
-	/* get_process_vote_action will only allow CHANGE_MASTER, RELEASE_CACHE, and
-	 * ADD_OIN_MAP on a CACHE lock held by this node.  the CHANGE_MASTER/RELEASE_CACHE
-	 * path needs to check the readonly map to see if any nodes need to be updated.  this
-	 * is not necessary for the ADD_OIN_MAP path since it cannot actually modify any
-	 * data or metadata under the lock.
+	/* get_process_vote_action will only allow CHANGE_MASTER,
+	 * RELEASE_CACHE, and ADD_OIN_MAP on a CACHE lock held by this
+	 * node.  the CHANGE_MASTER/RELEASE_CACHE path needs to check
+	 * the readonly map to see if any nodes need to be updated.
+	 * this is not necessary for the ADD_OIN_MAP path since it
+	 * cannot actually modify any data or metadata under the lock.
 	 */
 
 	if (disk_vote) {
-		/* Zero out the vote for everybody, if any already set and hung */
+		/* Zero out the vote for everybody, if any already set
+		 * and hung */
 		vote = OCFS_BH_GET_DATA_WRITE(vote_bh);
 		for (i = 0; i < num_nodes; i++)
 			vote->vote[i] = 0;
@@ -1129,6 +1240,9 @@
 			break;
 	}
 
+	up_write(&OCFS_I(inode)->ip_io_sem);
+	have_io_sem = 0;
+
 	if (inode && (flags & (FLAG_FILE_EXTEND|FLAG_FILE_TRUNCATE)) && 
 	    ((flags & FLAG_ACQUIRE_LOCK && vote_response==FLAG_VOTE_NODE) ||
 	    (flags & FLAG_RELEASE_LOCK))) {
@@ -1137,57 +1251,14 @@
 				      "extend" : "truncate", flags & FLAG_RELEASE_LOCK ? 
 				      "release" : "acquire", inode, node_num);
 
-		down(&OCFS_I(inode)->priv_sem);
-		if (OCFS_I(inode)->open_hndl_cnt 
-		    && (flags & FLAG_ACQUIRE_LOCK)) {
-			spin_lock(&oin_num_ext_lock);
-
-			if (OCFS_I(inode)->num_extends < 0)
-				BUG();
-			
-			if (OCFS_I(inode)->num_extends > 0) {
-				OCFS_I(inode)->num_extends++;
-				spin_unlock(&oin_num_ext_lock);
-			} else {
-				spin_unlock(&oin_num_ext_lock);
-
-				/* take the extend_sem on behalf of
-				 * this other node. It won't be
-				 * released until he does his last
-				 * release broadcast. This has the
-				 * effect of locking out
-				 * lookup_file_allocation on this
-				 * inode. */
-				down(&OCFS_I(inode)->inode_extend_sem);
-
-				down(&recovery_list_sem);
-				spin_lock(&oin_num_ext_lock);
-				OCFS_I(inode)->num_extends++;
-				list_add_tail(&OCFS_I(inode)->recovery_list, &osb->lock_recovery_lists[node_num]);
-				spin_unlock(&oin_num_ext_lock);
-				up(&recovery_list_sem);
-			}
-		} else if (OCFS_I(inode)->open_hndl_cnt 
-			   && (flags & FLAG_RELEASE_LOCK)) {
-			down(&recovery_list_sem);
-			spin_lock(&oin_num_ext_lock);
-
-			OCFS_I(inode)->num_extends--;
-			
-			if (OCFS_I(inode)->num_extends < 0)
-				BUG();
-			
-			if (!OCFS_I(inode)->num_extends) {
-				list_del(&OCFS_I(inode)->recovery_list);
-				INIT_LIST_HEAD(&OCFS_I(inode)->recovery_list);
-
-				up(&OCFS_I(inode)->inode_extend_sem);
-			}
-			spin_unlock(&oin_num_ext_lock);
-
-			up(&recovery_list_sem);
+		if (flags & FLAG_ACQUIRE_LOCK)
+			ocfs_mark_inode_for_extend(osb, inode, node_num);
+		else if (flags & FLAG_RELEASE_LOCK)
+			ocfs_clear_inode_for_extend(osb, inode, node_num);
+		else {
+			printk("uhoh, bad vote flags! 0x%x\n", flags);
+			BUG();
 		}
-		up(&OCFS_I(inode)->priv_sem);
 	}
 
 	if (disk_vote) {
@@ -1223,16 +1294,15 @@
 		}
 	}
 
-	if (lockres) {
+	if (lockres)
 		ocfs_release_lockres (lockres); // ocfs_process_vote
-	}
 
 leave:
 	if (inode) {
 		if (inc_inode_seq)
 			ocfs_inc_inode_seq(osb, inode, 1);
 		if (have_io_sem)
-			up(&OCFS_I(inode)->ip_io_sem);
+			up_write(&OCFS_I(inode)->ip_io_sem);
 	}
 
 	if (inode)
@@ -1290,6 +1360,7 @@
 
 	LOG_ENTRY_ARGS("(node_num = %u)\n", node_num);
 
+start:
 	down(&recovery_list_sem);
 	list_for_each_safe (iter, temp, &osb->lock_recovery_lists[node_num]) {
 		i = list_entry (iter, ocfs_inode_private, recovery_list);
@@ -1301,7 +1372,12 @@
 			OCFS_I(inode)->num_extends = 0;
 			list_del(&OCFS_I(inode)->recovery_list);
 			INIT_LIST_HEAD(&OCFS_I(inode)->recovery_list);
-			up(&OCFS_I(inode)->inode_extend_sem);
+			up_write(&OCFS_I(inode)->inode_extend_sem);
+
+			spin_unlock(&oin_num_ext_lock);
+			up (&recovery_list_sem);
+			iput(inode);
+			goto start;
 		} else
 			LOG_ERROR_STR("oin is in recovery list, but has zero extend counter value!");
 
@@ -1362,7 +1438,7 @@
 
 	if (yield) {
 		/* this will wait until process_vote gets to the release */
-		down(&OCFS_I(inode)->ip_io_sem);
+		down_write(&OCFS_I(inode)->ip_io_sem);
 		ocfs_acquire_lockres(lockres, 0); // ocfs_process_vote ocfs_acquire_lock
 	}
 
@@ -1397,9 +1473,9 @@
 			if (yield) {
 				/* from nm thread, give some time to waiters */
 				ocfs_release_lockres(lockres); // ocfs_process_vote ocfs_acquire_lock
-				up(&OCFS_I(inode)->ip_io_sem);
+				up_write(&OCFS_I(inode)->ip_io_sem);
 
-				down(&OCFS_I(inode)->ip_io_sem);
+				down_write(&OCFS_I(inode)->ip_io_sem);
 				ocfs_acquire_lockres(lockres, 0); // ocfs_process_vote ocfs_acquire_lock
 			}
 			continue;
@@ -1418,7 +1494,7 @@
 leave:
 	if (yield) {
 		ocfs_release_lockres(lockres); // ocfs_process_vote ocfs_acquire_lock
-		up(&OCFS_I(inode)->ip_io_sem);
+		up_write(&OCFS_I(inode)->ip_io_sem);
 	}
 
 	if (inode)