[Ocfs2-commits] mfasheh commits r2389 - in trunk/fs/ocfs2: . cluster

Fri Jun 10 17:08:42 CDT 2005

Author: mfasheh
Signed-off-by: manish
Date: 2005-06-10 17:08:41 -0500 (Fri, 10 Jun 2005)
New Revision: 2389

Modified:
   trunk/fs/ocfs2/cluster/masklog.c
   trunk/fs/ocfs2/cluster/masklog.h
   trunk/fs/ocfs2/dcache.c
   trunk/fs/ocfs2/dir.c
   trunk/fs/ocfs2/dir.h
   trunk/fs/ocfs2/dlmglue.c
   trunk/fs/ocfs2/dlmglue.h
   trunk/fs/ocfs2/file.c
   trunk/fs/ocfs2/inode.c
   trunk/fs/ocfs2/inode.h
   trunk/fs/ocfs2/namei.c
   trunk/fs/ocfs2/ocfs2.h
   trunk/fs/ocfs2/sysfile.c
   trunk/fs/ocfs2/vote.c
   trunk/fs/ocfs2/vote.h
Log:
* have ocfs2_lookup handle its own cluster locking and don't drop the lock
  until we've completed our iget and d_add. Otherwise we race unlink of that
  name and can't rely on it still being valid.

* open and the vote thread were racing on the check / set of the
  INODE_DELETED flag. Fix this.

* unlink / rename votes were naively calling d_prune_aliases, which doesn't
  catch all of the dentry races we might have in a cluster. Pass through
  some unique identifying characteristics and find the exact dentry to give to
  d_delete instead.

* unsurprisingly, clustered rename has the same deadlock issues that local
  rename has. Mimic the VFS locking here and introduce a cluster rename lock.

* ocfs2_rename was bugging on a condition which we could just cleanly error
  out on.

* ocfs2_meta_lock was only warning on getting a lock for a wiped inode.
  Realizing that this is perfectly valid, change the code there to cleanly
  error out with -ENOENT.

* we weren't setting dentry ops on all dentries, so our d_revalidate
  callback wasn't being called on new names.

* introduce some dcache.c specific tracing

* proper locking around the i_nlink changes in the vote thread

* add some more tracing for certain conditions in dlmglue.

Signed-off-by: manish



Modified: trunk/fs/ocfs2/cluster/masklog.c
===================================================================

--- trunk/fs/ocfs2/cluster/masklog.c	2005-06-10 19:33:33 UTC (rev 2388)
+++ trunk/fs/ocfs2/cluster/masklog.c	2005-06-10 22:08:41 UTC (rev 2389)
@@ -208,6 +208,7 @@
 	set_a_string(NAMEI);
 	set_a_string(INODE);
 	set_a_string(VOTE);
+	set_a_string(DCACHE);
 	set_a_string(ERROR);
 	set_a_string(NOTICE);
 	set_a_string(KTHREAD);

Modified: trunk/fs/ocfs2/cluster/masklog.h
===================================================================
--- trunk/fs/ocfs2/cluster/masklog.h	2005-06-10 19:33:33 UTC (rev 2388)
+++ trunk/fs/ocfs2/cluster/masklog.h	2005-06-10 22:08:41 UTC (rev 2389)
@@ -105,6 +105,7 @@
 #define ML_NAMEI	0x0000000000400000ULL /* ocfs2 directory / namespace */
 #define ML_INODE	0x0000000000800000ULL /* ocfs2 inode manipulation */
 #define ML_VOTE		0x0000000001000000ULL /* ocfs2 node messaging  */
+#define ML_DCACHE	0x0000000002000000ULL /* ocfs2 dcache operations */
 /* bits that are infrequently given and frequently matched in the high word */
 #define ML_ERROR	0x0000000100000000ULL /* sent to KERN_ERR */
 #define ML_NOTICE	0x0000000200000000ULL /* setn to KERN_NOTICE */

Modified: trunk/fs/ocfs2/dcache.c
===================================================================
--- trunk/fs/ocfs2/dcache.c	2005-06-10 19:33:33 UTC (rev 2388)
+++ trunk/fs/ocfs2/dcache.c	2005-06-10 22:08:41 UTC (rev 2389)
@@ -28,7 +28,7 @@
 #include <linux/slab.h>
 #include <linux/namei.h>
 
-#define MLOG_MASK_PREFIX ML_NAMEI
+#define MLOG_MASK_PREFIX ML_DCACHE
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
@@ -48,8 +48,12 @@
 	mlog_entry("(0x%p, '%.*s')\n", dentry,
 		   dentry->d_name.len, dentry->d_name.name);
 
-	if (inode == NULL)
+	/* Never trust a negative dentry - force a new lookup. */
+	if (inode == NULL) {
+		mlog(0, "negative dentry: %.*s\n", dentry->d_name.len,
+		     dentry->d_name.name);
 		goto bail;
+	}
 
 	osb = OCFS2_SB(inode->i_sb);
 
@@ -58,7 +62,7 @@
 	if (inode != osb->root_inode) {
 		spin_lock(&OCFS2_I(inode)->ip_lock);
 		/* did we or someone else delete this inode? */
-		if (INODE_DELETED(inode)) {
+		if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
 			spin_unlock(&OCFS2_I(inode)->ip_lock);
 			mlog(0, "inode (%"MLFu64") deleted, returning false\n",
 			     OCFS2_I(inode)->ip_blkno);

Modified: trunk/fs/ocfs2/dir.c
===================================================================
--- trunk/fs/ocfs2/dir.c	2005-06-10 19:33:33 UTC (rev 2388)
+++ trunk/fs/ocfs2/dir.c	2005-06-10 22:08:41 UTC (rev 2389)
@@ -203,29 +203,20 @@
 /*
  * NOTE: this should always be called with parent dir i_sem taken.
  */
-int ocfs2_find_files_on_disk(ocfs2_super *osb, const char *name,
-			     int namelen, u64 *blkno,
-			     struct inode *inode, int take_lock,
+int ocfs2_find_files_on_disk(const char *name,
+			     int namelen,
+			     u64 *blkno,
+			     struct inode *inode,
 			     struct buffer_head **dirent_bh,
 			     struct ocfs2_dir_entry **dirent)
 {
 	int status = -ENOENT;
-	int lock_acq = 0;
+	ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	mlog_entry("(osb=%p, parent=%"MLFu64", name='%.*s', blkno=%p, "
 		   "inode=%p)\n",
 		   osb, OCFS2_I(inode)->ip_blkno, namelen, name, blkno, inode);
 
-	if (take_lock) {
-		status = ocfs2_meta_lock(inode, NULL, NULL, 0);
-		if (status < 0) {
-			if (status != -ENOENT)
-				mlog_errno(status);
-			goto leave;
-		}
-		lock_acq = 1;
-	}
-
 	*dirent_bh = ocfs2_find_entry(name, namelen, inode, dirent);
 	if (!*dirent_bh || !*dirent) {
 		status = -ENOENT;
@@ -236,10 +227,6 @@
 
 	status = 0;
 leave:
-
-	if (take_lock && lock_acq)
-		ocfs2_meta_unlock(inode, 0);
-
 	if (status < 0) {
 		*dirent = NULL;
 		if (*dirent_bh) {

Modified: trunk/fs/ocfs2/dir.h
===================================================================
--- trunk/fs/ocfs2/dir.h	2005-06-10 19:33:33 UTC (rev 2388)
+++ trunk/fs/ocfs2/dir.h	2005-06-10 22:08:41 UTC (rev 2389)
@@ -30,9 +30,10 @@
 			      const char *name,
 			      int namelen);
 int ocfs2_empty_dir(struct inode *inode);  /* FIXME: to namei.c */
-int ocfs2_find_files_on_disk(ocfs2_super *osb, const char *name,
-			     int namelen, u64 *blkno,
-			     struct inode *inode, int take_lock,
+int ocfs2_find_files_on_disk(const char *name,
+			     int namelen,
+			     u64 *blkno,
+			     struct inode *inode,
 			     struct buffer_head **dirent_bh,
 			     struct ocfs2_dir_entry **dirent);
 int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir);

Modified: trunk/fs/ocfs2/dlmglue.c
===================================================================
--- trunk/fs/ocfs2/dlmglue.c	2005-06-10 19:33:33 UTC (rev 2388)
+++ trunk/fs/ocfs2/dlmglue.c	2005-06-10 22:08:41 UTC (rev 2389)
@@ -66,9 +66,10 @@
 static spinlock_t clean_buffer_lock = SPIN_LOCK_UNLOCKED;
 
 static char ocfs2_lock_type_char[OCFS2_NUM_LOCK_TYPES] = {
-	[OCFS2_TYPE_META]  = 'M',
-	[OCFS2_TYPE_DATA]  = 'D',
-	[OCFS2_TYPE_SUPER] = 'S'
+	[OCFS2_TYPE_META]   = 'M',
+	[OCFS2_TYPE_DATA]   = 'D',
+	[OCFS2_TYPE_SUPER]  = 'S',
+	[OCFS2_TYPE_RENAME] = 'R'
 };
 
 static int ocfs2_build_lock_name(enum ocfs2_lock_type type,
@@ -82,6 +83,10 @@
 static void ocfs2_super_ast_func(void *opaque);
 static void ocfs2_super_bast_func(void *opaque,
 				  int level);
+static void ocfs2_rename_ast_func(void *opaque);
+static void ocfs2_rename_bast_func(void *opaque,
+				   int level);
+
 /* so far, all locks have gotten along with the same unlock ast */
 static void ocfs2_unlock_ast_func(void *opaque,
 				  dlm_status status);
@@ -91,8 +96,8 @@
 			      int *requeue);
 static int ocfs2_unblock_data(struct ocfs2_lock_res *lockres,
 			      int *requeue);
-static int ocfs2_unblock_super(struct ocfs2_lock_res *lockres,
-			       int *requeue);
+static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres,
+				  int *requeue);
 typedef void (ocfs2_convert_worker_t)(struct ocfs2_lock_res *, int);
 static int ocfs2_generic_unblock_lock(ocfs2_super *osb,
 				      struct ocfs2_lock_res *lockres,
@@ -107,29 +112,36 @@
 };
 
 static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = {
-	.ast = ocfs2_inode_ast_func,
-	.bast = ocfs2_inode_bast_func,
-	.unlock_ast = ocfs2_unlock_ast_func,
-	.unblock = ocfs2_unblock_meta,
+	.ast		= ocfs2_inode_ast_func,
+	.bast		= ocfs2_inode_bast_func,
+	.unlock_ast	= ocfs2_unlock_ast_func,
+	.unblock	= ocfs2_unblock_meta,
 };
 
 static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
 				      int blocking);
 
 static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = {
-	.ast = ocfs2_inode_ast_func,
-	.bast = ocfs2_inode_bast_func,
-	.unlock_ast = ocfs2_unlock_ast_func,
-	.unblock = ocfs2_unblock_data,
+	.ast		= ocfs2_inode_ast_func,
+	.bast		= ocfs2_inode_bast_func,
+	.unlock_ast	= ocfs2_unlock_ast_func,
+	.unblock	= ocfs2_unblock_data,
 };
 
 static struct ocfs2_lock_res_ops ocfs2_super_lops = {
-	.ast = ocfs2_super_ast_func,
-	.bast = ocfs2_super_bast_func,
-	.unlock_ast = ocfs2_unlock_ast_func,
-	.unblock = ocfs2_unblock_super,
+	.ast		= ocfs2_super_ast_func,
+	.bast		= ocfs2_super_bast_func,
+	.unlock_ast	= ocfs2_unlock_ast_func,
+	.unblock	= ocfs2_unblock_osb_lock,
 };
 
+static struct ocfs2_lock_res_ops ocfs2_rename_lops = {
+	.ast		= ocfs2_rename_ast_func,
+	.bast		= ocfs2_rename_bast_func,
+	.unlock_ast	= ocfs2_unlock_ast_func,
+	.unblock	= ocfs2_unblock_osb_lock,
+};
+
 static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
 {
 	return lockres->l_type == OCFS2_TYPE_META || 
@@ -141,9 +153,15 @@
 	return lockres->l_type == OCFS2_TYPE_SUPER;
 }
 
+static inline int ocfs2_is_rename_lock(struct ocfs2_lock_res *lockres)
+{
+	return lockres->l_type == OCFS2_TYPE_RENAME;
+}
+
 static inline ocfs2_super *ocfs2_lock_res_super(struct ocfs2_lock_res *lockres)
 {
-	OCFS2_ASSERT(ocfs2_is_super_lock(lockres));
+	BUG_ON(!ocfs2_is_super_lock(lockres) 
+	       && !ocfs2_is_rename_lock(lockres));
 
 	return (ocfs2_super *) lockres->l_priv;
 }
@@ -201,7 +219,7 @@
 				       int new_level);
 static int ocfs2_meta_lock_update(struct inode *inode,
 				  struct buffer_head **bh);
-static void ocfs2_drop_super_lock(ocfs2_super *osb);
+static void ocfs2_drop_osb_locks(ocfs2_super *osb);
 static inline int ocfs2_highest_compat_lock_level(int level);
 static int __ocfs2_downconvert_lock(ocfs2_super *osb,
 				    struct ocfs2_lock_res *lockres,
@@ -395,6 +413,29 @@
 	return status;
 }
 
+int ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
+			       ocfs2_super *osb)
+{
+	enum ocfs2_lock_type type = OCFS2_TYPE_RENAME;
+	int status;
+
+	mlog_entry_void();
+
+	ocfs2_lock_res_init_once(res);
+
+	status = ocfs2_build_lock_name(type, 0, 0, &res->l_name);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_lock_res_init_common(res, type, &ocfs2_rename_lops, osb);
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
 void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
 {
 	mlog_entry_void();
@@ -674,6 +715,11 @@
 		ocfs2_generic_handle_downconvert_action(lockres);
 		break;
 	default:
+		mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u "
+		     "lockres flags = 0x%lx, unlock action: %u\n",
+		     lockres->l_name, lockres->l_action, lockres->l_flags,
+		     lockres->l_unlock_action);
+
 		BUG();
 	}
 
@@ -752,21 +798,15 @@
 	mlog_exit_void();
 }
 
-static void ocfs2_super_ast_func(void *opaque)
+static void ocfs2_generic_ast_func(struct ocfs2_lock_res *lockres)
 {
-	struct ocfs2_lock_res *lockres = opaque;
-	dlm_lockstatus *lksb;
+	dlm_lockstatus *lksb = &lockres->l_lksb;
 
-	mlog_entry_void();
+	spin_lock(&lockres->l_lock);
 
-	mlog(0, "Superblock AST fired\n");
-
-	OCFS2_ASSERT(ocfs2_is_super_lock(lockres));
-
-	spin_lock(&lockres->l_lock);
-	lksb = &(lockres->l_lksb);
 	if (lksb->status != DLM_NORMAL) {
-		mlog(ML_ERROR, "lksb status value of %u!\n", lksb->status);
+		mlog(ML_ERROR, "lockres %s: lksb status value of %u!\n",
+		     lockres->l_name, lksb->status);
 		spin_unlock(&lockres->l_lock);
 		return;
 	}
@@ -784,30 +824,32 @@
 	default:
 		BUG();
 	}
+
 	/* set it to something invalid so if we get called again we
 	 * can catch it. */
 	lockres->l_action = OCFS2_AST_INVALID;
 	spin_unlock(&lockres->l_lock);
+
 	wake_up(&lockres->l_event);
-
-	mlog_exit_void();
 }
 
-static void ocfs2_super_bast_func(void *opaque, int level)
+static void ocfs2_generic_bast_func(ocfs2_super *osb,
+				    struct ocfs2_lock_res *lockres,
+				    int level,
+				    int ignore_refresh)
 {
-	struct ocfs2_lock_res *lockres = opaque;
-	ocfs2_super *osb;
 	int needs_downconvert;
 
 	mlog_entry_void();
-       	osb = ocfs2_lock_res_super(lockres);
 
-	mlog(0, "Superblock BAST fired\n");
-
 	spin_lock(&lockres->l_lock);
 	needs_downconvert = ocfs2_generic_handle_bast(lockres, level);
 	if (needs_downconvert)
 		ocfs2_schedule_blocked_lock(osb, lockres);
+
+	if (ignore_refresh)
+		lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
+
 	spin_unlock(&lockres->l_lock);
 
 	ocfs2_kick_vote_thread(osb);
@@ -816,6 +858,68 @@
 	mlog_exit_void();
 }
 
+static void ocfs2_super_ast_func(void *opaque)
+{
+	struct ocfs2_lock_res *lockres = opaque;
+
+	mlog_entry_void();
+	mlog(0, "Superblock AST fired\n");
+
+	BUG_ON(!ocfs2_is_super_lock(lockres));
+	ocfs2_generic_ast_func(lockres);
+
+	mlog_exit_void();
+}
+
+static void ocfs2_super_bast_func(void *opaque,
+				  int level)
+{
+	struct ocfs2_lock_res *lockres = opaque;
+	ocfs2_super *osb;
+
+	mlog_entry_void();
+	mlog(0, "Superblock BAST fired\n");
+
+	BUG_ON(!ocfs2_is_super_lock(lockres));
+       	osb = ocfs2_lock_res_super(lockres);
+	ocfs2_generic_bast_func(osb, lockres, level, 0);
+
+	mlog_exit_void();
+}
+
+static void ocfs2_rename_ast_func(void *opaque)
+{
+	struct ocfs2_lock_res *lockres = opaque;
+
+	mlog_entry_void();
+
+	mlog(0, "Rename AST fired\n");
+
+	BUG_ON(!ocfs2_is_rename_lock(lockres));
+
+	ocfs2_generic_ast_func(lockres);
+
+	mlog_exit_void();
+}
+
+static void ocfs2_rename_bast_func(void *opaque,
+				   int level)
+{
+	struct ocfs2_lock_res *lockres = opaque;
+	ocfs2_super *osb;
+
+	mlog_entry_void();
+
+	mlog(0, "Rename BAST fired\n");
+
+	BUG_ON(!ocfs2_is_rename_lock(lockres));
+
+	osb = ocfs2_lock_res_super(lockres);
+	ocfs2_generic_bast_func(osb, lockres, level, 1);
+
+	mlog_exit_void();
+}
+
 static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
 						int convert)
 {
@@ -1030,6 +1134,10 @@
 	}
 
 	if (level > lockres->l_level) {
+		if (lockres->l_action != OCFS2_AST_INVALID)
+			mlog(ML_ERROR, "lockres %s has action %u pending\n",
+			     lockres->l_name, lockres->l_action);
+
 		lockres->l_action = OCFS2_AST_CONVERT;
 		lockres->l_requested = level;
 		lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
@@ -1470,7 +1578,7 @@
 	mlog_entry_void();
 
 	spin_lock(&OCFS2_I(inode)->ip_lock);
-	if (INODE_DELETED(inode)) {
+	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
 		mlog(0, "Orphaned inode %"MLFu64" was deleted while we "
 		     "were waiting on a lock. ip_flags = 0x%x\n",
 		     OCFS2_I(inode)->ip_blkno, OCFS2_I(inode)->ip_flags);
@@ -1698,7 +1806,7 @@
 {
 	int status;
 	int level = ex ? LKM_EXMODE : LKM_PRMODE;
-	struct ocfs2_lock_res *lockres = &osb->super_lockres;
+	struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
 	struct buffer_head *bh;
 	ocfs2_slot_info *si = osb->slot_info;
 
@@ -1740,11 +1848,30 @@
 			int ex)
 {
 	int level = ex ? LKM_EXMODE : LKM_PRMODE;
-	struct ocfs2_lock_res *lockres = &osb->super_lockres;
+	struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
 
 	ocfs2_cluster_unlock(osb, lockres, level);
 }
 
+int ocfs2_rename_lock(ocfs2_super *osb)
+{
+	int status;
+	struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
+
+	status = ocfs2_cluster_lock(osb, lockres, LKM_EXMODE, 0, NULL, 0);
+	if (status < 0)
+		mlog_errno(status);
+
+	return status;
+}
+
+void ocfs2_rename_unlock(ocfs2_super *osb)
+{
+	struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
+
+	ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE);
+}
+
 int ocfs2_dlm_init(ocfs2_super *osb)
 {
 	int status;
@@ -1778,9 +1905,16 @@
 
 	osb->dlm = dlm;
 
-	status = ocfs2_super_lock_res_init(&osb->super_lockres, osb);
+	status = ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
 	if (status < 0)
 		mlog_errno(status);
+
 bail:
 
 	mlog_exit(status);
@@ -1791,14 +1925,16 @@
 {
 	mlog_entry_void();
 
-	ocfs2_drop_super_lock(osb);
+	ocfs2_drop_osb_locks(osb);
 
 	if (osb->vote_task) {
 		kthread_stop(osb->vote_task);
 		osb->vote_task = NULL;
 	}
 
-	ocfs2_lock_res_free(&osb->super_lockres);
+	ocfs2_lock_res_free(&osb->osb_super_lockres);
+	ocfs2_lock_res_free(&osb->osb_rename_lockres);
+
 	dlm_unregister_domain(osb->dlm);
 	osb->dlm = NULL;
 
@@ -1878,6 +2014,7 @@
 			   lockres);
 	if (status != DLM_NORMAL) {
 		ocfs2_log_dlm_error("dlmunlock", status, lockres);
+		mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
 		dlm_print_one_lock(lockres->l_lksb.lockid);
 		BUG();
 	}
@@ -1931,16 +2068,20 @@
 	return  __ocfs2_drop_lock(osb, lockres);
 }
 
-static void ocfs2_drop_super_lock(ocfs2_super *osb)
+static void ocfs2_drop_osb_locks(ocfs2_super *osb)
 {
 	int status;
 
 	mlog_entry_void();
 
-	status = ocfs2_drop_lock(osb, &osb->super_lockres, NULL);
+	status = ocfs2_drop_lock(osb, &osb->osb_super_lockres, NULL);
 	if (status < 0)
 		mlog_errno(status);
 
+	status = ocfs2_drop_lock(osb, &osb->osb_rename_lockres, NULL);
+	if (status < 0)
+		mlog_errno(status);
+
 	mlog_exit(status);
 }
 
@@ -2372,14 +2513,18 @@
 	return status;
 }
 
-static int ocfs2_unblock_super(struct ocfs2_lock_res *lockres,
-			       int *requeue)
+/* Generic unblock function for any lockres whose private data is an
+ * ocfs2_super pointer. */
+static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres,
+				  int *requeue)
 {
 	int status;
 	ocfs2_super *osb;
 
 	mlog_entry_void();
 
+	mlog(0, "Unblock lockres %s\n", lockres->l_name);
+
 	osb = ocfs2_lock_res_super(lockres);
 
 	status = ocfs2_generic_unblock_lock(osb,

Modified: trunk/fs/ocfs2/dlmglue.h
===================================================================
--- trunk/fs/ocfs2/dlmglue.h	2005-06-10 19:33:33 UTC (rev 2388)
+++ trunk/fs/ocfs2/dlmglue.h	2005-06-10 22:08:41 UTC (rev 2389)
@@ -113,6 +113,8 @@
 		     int ex);
 void ocfs2_super_unlock(ocfs2_super *osb,
 			int ex);
+int ocfs2_rename_lock(ocfs2_super *osb);
+void ocfs2_rename_unlock(ocfs2_super *osb);
 /* for the vote thread */
 void ocfs2_process_blocked_lock(ocfs2_super *osb,
 				struct ocfs2_lock_res *lockres);

Modified: trunk/fs/ocfs2/file.c
===================================================================
--- trunk/fs/ocfs2/file.c	2005-06-10 19:33:33 UTC (rev 2388)
+++ trunk/fs/ocfs2/file.c	2005-06-10 22:08:41 UTC (rev 2389)
@@ -88,13 +88,24 @@
 	mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, 
 		   file->f_dentry->d_name.len, file->f_dentry->d_name.name);
 
-	status = -EACCES;
+	spin_lock(&oi->ip_lock);
 
-	spin_lock(&oi->ip_lock);
+	/* Check that the inode hasn't been wiped from disk by another
+	 * node. If it hasn't then we're safe as long as we hold the
+	 * spin lock until our increment of open count. */
+	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
+		spin_unlock(&oi->ip_lock);
+
+		status = -ENOENT;
+		goto leave;
+	}
+
 	if (oi->ip_open_count &&
 	    !ocfs2_valid_open(mode, 
 			      oi->ip_flags & OCFS2_INODE_OPEN_DIRECT)) {
 		spin_unlock(&oi->ip_lock);
+
+		status = -EACCES;
 		goto leave;
 	}
 
@@ -480,7 +491,11 @@
 
 	fe = (ocfs2_dinode *) fe_bh->b_data;
 	OCFS2_BUG_ON_INVALID_DINODE(fe);
-	OCFS2_ASSERT(fe->i_size == i_size_read(inode));
+	mlog_bug_on_msg(fe->i_size != i_size_read(inode),
+			"Inode %"MLFu64", inode i_size = %"MLFu64" != di "
+			"i_size = %"MLFu64", i_flags = 0x%x\n",
+			OCFS2_I(inode)->ip_blkno, i_size_read(inode),
+			fe->i_size, fe->i_flags);
 
 	if (new_i_size > fe->i_size) {
 		mlog(0, "asked to truncate file with size (%"MLFu64") "

Modified: trunk/fs/ocfs2/inode.c
===================================================================
--- trunk/fs/ocfs2/inode.c	2005-06-10 19:33:33 UTC (rev 2388)
+++ trunk/fs/ocfs2/inode.c	2005-06-10 22:08:41 UTC (rev 2389)
@@ -554,7 +554,7 @@
 
 		/* XXX: Is this really necessary? */
 		spin_lock(&OCFS2_I(inode)->ip_lock);
-		SET_INODE_DELETED(inode);
+		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED;
 		spin_unlock(&OCFS2_I(inode)->ip_lock);
 		goto bail_unblock;
 	}
@@ -670,7 +670,7 @@
 		goto bail_unblock;
 	}
 
-	SET_INODE_DELETED(inode);
+	OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED;
 bail_unblock:
 	status = sigprocmask(SIG_SETMASK, &oldset, NULL);
 	if (status < 0)
@@ -716,7 +716,7 @@
 	/* We very well may get a clear_inode before all an inodes
 	 * metadata has hit disk. Of course, we can't drop any cluster
 	 * locks until the journal has finished with it. */
-	if (!INODE_DELETED(inode))
+	if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED))
 		ocfs2_checkpoint_inode(inode);
 
 	mlog_bug_on_msg(!list_empty(&oi->ip_io_markers),
@@ -847,7 +847,7 @@
 	osb = OCFS2_SB(inode->i_sb);
 
 	spin_lock(&OCFS2_I(inode)->ip_lock);
-	if (INODE_DELETED(inode)) {
+	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
 		spin_unlock(&OCFS2_I(inode)->ip_lock);
 		mlog(0, "inode deleted!\n");
 		status = -ENOENT;

Modified: trunk/fs/ocfs2/inode.h
===================================================================
--- trunk/fs/ocfs2/inode.h	2005-06-10 19:33:33 UTC (rev 2388)
+++ trunk/fs/ocfs2/inode.h	2005-06-10 22:08:41 UTC (rev 2389)
@@ -91,13 +91,8 @@
 	return container_of(inode, struct ocfs2_inode_info, vfs_inode);
 }
 
-#define INODE_DELETED(i) (OCFS2_I(i)->ip_flags & OCFS2_INODE_DELETED)
-#define SET_INODE_DELETED(i) (OCFS2_I(i)->ip_flags |= OCFS2_INODE_DELETED)
-#define CLEAR_INODE_DELETED(i) (OCFS2_I(i)->ip_flags &= (~OCFS2_INODE_DELETED))
-
 #define INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags & OCFS2_INODE_JOURNAL)
 #define SET_INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags |= OCFS2_INODE_JOURNAL)
-#define CLEAR_INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags &= (~OCFS2_INODE_JOURNAL))
 
 extern kmem_cache_t *ocfs2_inode_cache;
 

Modified: trunk/fs/ocfs2/namei.c
===================================================================
--- trunk/fs/ocfs2/namei.c	2005-06-10 19:33:33 UTC (rev 2388)
+++ trunk/fs/ocfs2/namei.c	2005-06-10 22:08:41 UTC (rev 2389)
@@ -129,10 +129,10 @@
 				  struct dentry *dentry, 
 				  struct inode *inode, u64 blkno, 
 				  struct buffer_head *parent_fe_bh,
-				  struct buffer_head *insert_bh) 
+				  struct buffer_head *insert_bh)
 {
 	return __ocfs2_add_entry(handle, dentry->d_parent->d_inode, 
-				 dentry->d_name.name, dentry->d_name.len, 
+				 dentry->d_name.name, dentry->d_name.len,
 				 inode, blkno, parent_fe_bh, insert_bh);
 }
 
@@ -146,9 +146,7 @@
 	u64 blkno;
 	struct buffer_head *dirent_bh = NULL;
 	struct inode *inode = NULL;
-	struct super_block *sb = dir->i_sb;
 	struct dentry *ret;
-	ocfs2_super *osb = OCFS2_SB(sb);
 	struct ocfs2_dir_entry *dirent;
 
 	mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
@@ -159,26 +157,43 @@
 		goto bail;
 	}
 
-	mlog(0, "about to call find_files_on_disk with inode=%p\n", dir);
+	mlog(0, "find name %.*s in directory %"MLFu64"\n", dentry->d_name.len,
+	     dentry->d_name.name, OCFS2_I(dir)->ip_blkno);
 
-	status = ocfs2_find_files_on_disk(osb, dentry->d_name.name,
+	status = ocfs2_meta_lock(dir, NULL, NULL, 0);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		ret = ERR_PTR(status);
+		goto bail;
+	}
+
+	status = ocfs2_find_files_on_disk(dentry->d_name.name,
 					  dentry->d_name.len, &blkno,
-					  dir, 1, &dirent_bh, &dirent);
+					  dir, &dirent_bh, &dirent);
 	if (status < 0)
 		goto bail_add;
-	
-	inode = ocfs2_iget(osb, blkno);
+
+	inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno);
 	if (!inode) {
-		mlog(ML_ERROR, "Could not create inode!\n");
+		mlog(ML_ERROR, "Unable to create inode %"MLFu64"\n", blkno);
 		ret = ERR_PTR(-EACCES);
-		goto bail;
+		goto bail_unlock;
 	}
 
 bail_add:
+
 	dentry->d_op = &ocfs2_dentry_ops;
 	d_add(dentry, inode);
 	ret = NULL;
 
+bail_unlock:
+	/* Don't drop the cluster lock until *after* the d_add --
+	 * unlink on another node will message us to remove that
+	 * dentry under this lock so otherwise we can race this with
+	 * the vote thread and have a stale dentry. */
+	ocfs2_meta_unlock(dir, 0);
+
 bail:
 	if (dirent_bh)
 		brelse(dirent_bh);
@@ -195,28 +210,35 @@
 	struct dentry *parent;
 	struct inode *inode;
 	struct inode *dir = child->d_inode;
-	struct super_block *sb = dir->i_sb;
-	ocfs2_super *osb = OCFS2_SB(sb);
 	struct buffer_head *dirent_bh = NULL;
 	struct ocfs2_dir_entry *dirent;
 
 	mlog_entry("(0x%p, '%.*s')\n", child,
 		   child->d_name.len, child->d_name.name);
 
-	mlog(0, "about to call find_files_on_disk with inode=%p\n", dir);
+	mlog(0, "find parent of directory %"MLFu64"\n",
+	     OCFS2_I(dir)->ip_blkno);
 
-	status = ocfs2_find_files_on_disk(osb, "..", 2, &blkno,
-					  dir, 1, &dirent_bh, &dirent);
+	status = ocfs2_meta_lock(dir, NULL, NULL, 0);
 	if (status < 0) {
-		parent = ERR_PTR(-ENOENT);
+		if (status != -ENOENT)
+			mlog_errno(status);
+		parent = ERR_PTR(status);
 		goto bail;
 	}
 
-	inode = ocfs2_iget(osb, blkno);
+	status = ocfs2_find_files_on_disk("..", 2, &blkno, dir, &dirent_bh,
+					  &dirent);
+	if (status < 0) {
+		parent = ERR_PTR(-ENOENT);
+		goto bail_unlock;
+	}
+
+	inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno);
 	if (!inode) {
-		mlog(ML_ERROR, "Could not create inode!\n");
+		mlog(ML_ERROR, "Unable to create inode %"MLFu64"\n", blkno);
 		parent = ERR_PTR(-EACCES);
-		goto bail;
+		goto bail_unlock;
 	}
 
 	parent = d_alloc_anon(inode);
@@ -225,10 +247,13 @@
 		parent = ERR_PTR(-ENOMEM);
 	}
 
-bail:
+bail_unlock:
+	ocfs2_meta_unlock(dir, 0);
+
 	if (dirent_bh)
 		brelse(dirent_bh);
 
+bail:
 	mlog_exit_ptr(parent);
 
 	return parent;
@@ -310,7 +335,6 @@
 {
 	int status = 0;
 	struct buffer_head *parent_fe_bh = NULL;
-	u64 file_off;
 	ocfs2_journal_handle *handle = NULL;
 	ocfs2_super *osb;
 	ocfs2_dinode *fe = NULL;
@@ -428,8 +452,6 @@
 	if (status < 0)
 		mlog_errno(status);
 
-	file_off = fe->i_blkno << dir->i_sb->s_blocksize_bits;
-
 	if (S_ISDIR(mode)) {
 		status = ocfs2_fill_new_dir(osb, handle, dir, inode, 
 					    new_fe_bh, data_ac);
@@ -461,8 +483,8 @@
 	}
 
 	insert_inode_hash(inode);
+	dentry->d_op = &ocfs2_dentry_ops;
 	d_instantiate(dentry, inode);
-
 	status = 0;
 leave:
 	if (handle)
@@ -737,6 +759,7 @@
 	}
 
 	atomic_inc(&inode->i_count);
+	dentry->d_op = &ocfs2_dentry_ops;
 	d_instantiate(dentry, inode);
 bail:
 	if (handle)
@@ -797,10 +820,9 @@
 		goto leave;
 	}
 
-	status = ocfs2_find_files_on_disk(osb, dentry->d_name.name,
+	status = ocfs2_find_files_on_disk(dentry->d_name.name,
 					  dentry->d_name.len, &blkno,
-					  dir, 0, &dirent_bh,
-					  &dirent);
+					  dir, &dirent_bh, &dirent);
 	if (status < 0) {
 		if (status != -ENOENT)
 			mlog_errno(status);
@@ -843,7 +865,7 @@
 	else
 		inode->i_nlink--;
 
-	status = ocfs2_request_unlink_vote(inode,
+	status = ocfs2_request_unlink_vote(inode, dentry,
 					   (unsigned int) inode->i_nlink);
 	if (status < 0) {
 		/* This vote should succeed under all normal
@@ -1009,7 +1031,7 @@
 			struct inode *new_dir,
 			struct dentry *new_dentry)
 {
-	int status = 0;
+	int status = 0, rename_lock = 0;
 	struct inode *old_inode = old_dentry->d_inode;
 	struct inode *new_inode = new_dentry->d_inode;
 	ocfs2_dinode *newfe = NULL;
@@ -1053,6 +1075,26 @@
 		}
 	}
 
+	/* Assume a directory heirarchy thusly:
+	 * a/b/c
+	 * a/d
+	 * a,b,c, and d are all directories.
+	 *
+	 * from cwd of 'a' on both nodes:
+	 * node1: mv b/c d
+	 * node2: mv d   b/c
+	 *
+	 * And that's why, just like the VFS, we need a file system
+	 * rename lock. */
+	if (old_dentry != new_dentry) {
+		status = ocfs2_rename_lock(osb);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		rename_lock = 1;
+	}
+
 	handle = ocfs2_alloc_handle(osb);
 	if (handle == NULL) {
 		status = -ENOMEM;
@@ -1093,7 +1135,7 @@
 			goto bail;
 		}
 
-		status = ocfs2_request_rename_vote(old_inode);
+		status = ocfs2_request_rename_vote(old_inode, old_dentry);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -1115,7 +1157,7 @@
 	} else {
 		/* Ah, the simple case - we're a file so just send a
 		 * message. */
-		status = ocfs2_request_rename_vote(old_inode);
+		status = ocfs2_request_rename_vote(old_inode, old_dentry);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -1140,10 +1182,10 @@
 
 	/* check if the target already exists (in which case we need
 	 * to delete it */
-	status = ocfs2_find_files_on_disk(osb, new_dentry->d_name.name,
+	status = ocfs2_find_files_on_disk(new_dentry->d_name.name,
 					  new_dentry->d_name.len, 
-					  &newfe_blkno, new_dir, 0,
-					  &new_de_bh, &new_de);
+					  &newfe_blkno, new_dir, &new_de_bh,
+					  &new_de);
 	/* The only error we allow here is -ENOENT because the new
 	 * file not existing is perfectly valid. */
 	if ((status < 0) && (status != -ENOENT)) {
@@ -1160,10 +1202,22 @@
 	/* In case we need to overwrite an existing file, we blow it
 	 * away first */
 	if (new_de) {
-		BUG_ON(!new_inode);
+		/* VFS didn't think there existed an inode here, but
+		 * someone else in the cluster must have raced our
+		 * rename to create one. Today we error cleanly, in
+		 * the future we should consider calling iget to build
+		 * a new struct inode for this entry. */
+		if (!new_inode) {
+			status = -EACCES;
 
+			mlog(0, "We found an inode for name %.*s but VFS "
+			     "didn't give us one.\n", new_dentry->d_name.len,
+			     new_dentry->d_name.name);
+			goto bail;
+		}
+
 		if (OCFS2_I(new_inode)->ip_blkno != newfe_blkno) {
-			status = -ENOENT;
+			status = -EACCES;
 
 			mlog(0, "Inode blkno (%"MLFu64") and dir (%"MLFu64") "
 			     "disagree. ip_flags = %x\n",
@@ -1184,7 +1238,8 @@
 		else
 			links_count = (unsigned int) (new_inode->i_nlink - 1);
 
-		status = ocfs2_request_unlink_vote(new_inode, links_count);
+		status = ocfs2_request_unlink_vote(new_inode, new_dentry,
+						   links_count);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -1208,7 +1263,7 @@
 			}
 		}
 	} else {
-		OCFS2_ASSERT(new_dentry->d_parent->d_inode == new_dir);
+		BUG_ON(new_dentry->d_parent->d_inode != new_dir);
 
 		status = ocfs2_check_dir_for_entry(new_dir,
 						   new_dentry->d_name.name,
@@ -1343,7 +1398,7 @@
 				status = ocfs2_journal_dirty(handle, new_dir_bh);
 			}
 		}
-	
+
 	if (old_dir_nlink != old_dir->i_nlink) {
 		if (!old_dir_bh) {
 			mlog(ML_ERROR, "need to change nlink for old dir "
@@ -1364,6 +1419,9 @@
 
 	status = 0;
 bail:
+	if (rename_lock)
+		ocfs2_rename_unlock(osb);
+
 	if (handle)
 		ocfs2_commit_trans(handle);
 
@@ -1668,6 +1726,7 @@
 	}
 
 	insert_inode_hash(inode);
+	dentry->d_op = &ocfs2_dentry_ops;
 	d_instantiate(dentry, inode);
 bail:
 	if (handle)
@@ -1723,7 +1782,7 @@
 			     const char *name, int namelen, 
 			     struct inode *inode, u64 blkno, 
 			     struct buffer_head *parent_fe_bh, 
-			     struct buffer_head *insert_bh) 
+			     struct buffer_head *insert_bh)
 {
 	unsigned long offset;
 	unsigned short rec_len;
@@ -1821,6 +1880,7 @@
 	while (i < bh->b_size) {
 		if (!ocfs2_check_dir_entry(dir, de, bh, i)) {
 			status = -EIO;
+			mlog_errno(status);
 			goto bail;
 		}
 		if (de == de_del)  {
@@ -1828,6 +1888,7 @@
 						      OCFS2_JOURNAL_ACCESS_WRITE);
 			if (status < 0) {
 				status = -EIO;
+				mlog_errno(status);
 				goto bail;
 			}
 			if (pde)
@@ -1902,8 +1963,6 @@
 	return ret;
 }
 
-
-
 struct buffer_head *ocfs2_find_entry(const char *name, int namelen, 
 				     struct inode *dir, 
 				     struct ocfs2_dir_entry **res_dir)
@@ -1929,6 +1988,7 @@
 	if (start >= nblocks)
 		start = 0;
 	block = start;
+
 restart:
 	do {
 		/*
@@ -2141,8 +2201,7 @@
 	status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
 				   OCFS2_ORPHAN_NAMELEN, inode,
 				   OCFS2_I(inode)->ip_blkno,
-				   orphan_dir_bh,
-				   de_bh);
+				   orphan_dir_bh, de_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;

Modified: trunk/fs/ocfs2/ocfs2.h
===================================================================
--- trunk/fs/ocfs2/ocfs2.h	2005-06-10 19:33:33 UTC (rev 2388)
+++ trunk/fs/ocfs2/ocfs2.h	2005-06-10 22:08:41 UTC (rev 2389)
@@ -73,6 +73,7 @@
 	OCFS2_TYPE_META = 0,
 	OCFS2_TYPE_DATA,
 	OCFS2_TYPE_SUPER,
+	OCFS2_TYPE_RENAME,
 	OCFS2_NUM_LOCK_TYPES
 };
 
@@ -230,7 +231,8 @@
 	char dev_str[20];		/* "major,minor" of the device */
 
 	dlm_ctxt *dlm;
-	struct ocfs2_lock_res super_lockres;
+	struct ocfs2_lock_res osb_super_lockres;
+	struct ocfs2_lock_res osb_rename_lockres;
 
 	wait_queue_head_t recovery_event;
 

Modified: trunk/fs/ocfs2/sysfile.c
===================================================================
--- trunk/fs/ocfs2/sysfile.c	2005-06-10 19:33:33 UTC (rev 2388)
+++ trunk/fs/ocfs2/sysfile.c	2005-06-10 22:08:41 UTC (rev 2389)
@@ -110,9 +110,9 @@
 					sizeof(namebuf),
 					type, slot);
 	
-	status = ocfs2_find_files_on_disk(osb, namebuf, strlen(namebuf),
+	status = ocfs2_find_files_on_disk(namebuf, strlen(namebuf),
 					  &blkno, osb->sys_root_inode, 
-					  0, &dirent_bh, &de);
+					  &dirent_bh, &de);
 	if (status < 0) {
 		goto bail;
 	}

Modified: trunk/fs/ocfs2/vote.c
===================================================================
--- trunk/fs/ocfs2/vote.c	2005-06-10 19:33:33 UTC (rev 2388)
+++ trunk/fs/ocfs2/vote.c	2005-06-10 22:08:41 UTC (rev 2389)
@@ -63,6 +63,9 @@
 	u32 h_node_num;    /* node sending this particular message. */
 } ocfs2_msg_hdr;
 
+/* OCFS2_MAX_FILENAME_LEN is 255 characters, but we want to align this
+ * for the network. */
+#define OCFS2_VOTE_FILENAME_LEN 256
 typedef struct _ocfs2_vote_msg
 {
 	ocfs2_msg_hdr v_hdr;
@@ -71,6 +74,9 @@
 		s32 v_orphaned_slot;	/* Used during delete votes */
 		u32 v_nlink;		/* Used during unlink votes */
 	} md1;				/* Message type dependant 1 */
+	u32 v_unlink_namelen;
+	u64 v_unlink_parent;
+	u8  v_unlink_dirent[OCFS2_VOTE_FILENAME_LEN];
 } ocfs2_vote_msg;
 
 /* Responses are given these values to maintain backwards
@@ -198,7 +204,7 @@
 
 	spin_lock(&OCFS2_I(inode)->ip_lock);
 	/* vote no if the file is still open. */
-	if (OCFS2_I(inode)->ip_open_count > 0) {
+	if (OCFS2_I(inode)->ip_open_count) {
 		mlog(0, "open count = %u\n",
 		     OCFS2_I(inode)->ip_open_count);
 		spin_unlock(&OCFS2_I(inode)->ip_lock);
@@ -223,32 +229,111 @@
 	truncate_inode_pages(inode->i_mapping, 0);
 	ocfs2_extent_map_trunc(inode, 0);
 
+	spin_lock(&OCFS2_I(inode)->ip_lock);	
+	/* double check open count - someone might have raced this
+	 * thread into ocfs2_file_open while we were writing out
+	 * data. If we're to allow a wipe of this inode now, we *must*
+	 * hold the spinlock until we've marked it. */
+	if (OCFS2_I(inode)->ip_open_count) {
+		mlog(0, "Raced to wipe! open count = %u\n",
+		     OCFS2_I(inode)->ip_open_count);
+		spin_unlock(&OCFS2_I(inode)->ip_lock);
+		goto done;
+	}
+
+	/* Mark the inode as being wiped from disk. */
+	OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED;
+
 	/* If we get here, then we're voting 'yes', so commit the
 	 * delete on our side. */
 	response = OCFS2_RESPONSE_OK;
 
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-	SET_INODE_DELETED(inode);
 	/* We set the SKIP_DELETE flag on the inode so we don't try to
 	 * delete it in delete_inode ourselves. */
 	OCFS2_I(inode)->ip_flags |=  OCFS2_INODE_SKIP_DELETE;
 	spin_unlock(&OCFS2_I(inode)->ip_lock);
 
+	/* Not sure this is necessary anymore. */
 	d_prune_aliases(inode);
 
 done:
 	return response;
 }
 
+static int ocfs2_match_dentry(struct dentry *dentry,
+			      u64 parent_blkno,
+			      unsigned int namelen,
+			      const char *name)
+{
+	struct inode *parent;
+
+	if (!dentry->d_parent) {
+		mlog(0, "Detached from parent.\n");
+		return 0;
+	}
+
+	parent = dentry->d_parent->d_inode;
+	/* Negative parent dentry? */
+	if (!parent)
+		return 0;
+
+	/* Name is in a different directory. */
+	if (OCFS2_I(parent)->ip_blkno != parent_blkno)
+		return 0;
+
+	if (dentry->d_name.len != namelen)
+		return 0;
+
+	/* comparison above guarantees this is safe. */
+	if (memcmp(dentry->d_name.name, name, namelen))
+		return 0;
+
+	return 1;
+}
+
 static void ocfs2_process_dentry_request(struct inode *inode,
 					 int rename,
-					 unsigned int new_nlink)
+					 unsigned int new_nlink,
+					 u64 parent_blkno,
+					 unsigned int namelen,
+					 const char *name)
 {
-	d_prune_aliases(inode);
+	struct dentry *dentry = NULL;
+	struct list_head *p;
 
+	mlog(0, "parent %"MLFu64", namelen = %u, name = %.*s\n", parent_blkno,
+	     namelen, namelen, name);
+
+	spin_lock(&dcache_lock);
+
+	/* Another node is removing this name from the system. It is
+	 * up to us to find the corresponding dentry and if it exists,
+	 * unhash it from the dcache. */
+	list_for_each(p, &inode->i_dentry) {
+		dentry = list_entry(p, struct dentry, d_alias);
+
+		if (ocfs2_match_dentry(dentry, parent_blkno, namelen, name)) {
+			mlog(0, "dentry found: %.*s\n",
+			     dentry->d_name.len, dentry->d_name.name);
+
+			dget_locked(dentry);
+			break;
+		}
+
+		dentry = NULL;
+	}
+
+	spin_unlock(&dcache_lock);
+
+	if (dentry) {
+		d_delete(dentry);
+		dput(dentry);
+	}
+
 	/* for rename, we don't change link counts */
 	if (!rename) {
 		mlog(0, "new_nlink = %u\n", new_nlink);
+
 		inode->i_nlink = new_nlink;
 	}
 }
@@ -259,8 +344,8 @@
 	int net_status, vote_response;
 	int orphaned_slot = 0;
 	int rename = 0;
-	unsigned int node_num, generation, new_nlink;
-	u64 blkno;
+	unsigned int node_num, generation, new_nlink, namelen;
+	u64 blkno, parent_blkno;
 	enum ocfs2_vote_request request;
 	struct inode *inode = NULL;
 	ocfs2_msg_hdr *hdr = &msg->v_hdr;
@@ -341,9 +426,13 @@
 		rename = 1;
 		/* fall through */
 	case OCFS2_VOTE_REQ_UNLINK:
+		parent_blkno = be64_to_cpu(msg->v_unlink_parent);
+		namelen = ntohl(msg->v_unlink_namelen);
 		/* new_nlink will be ignored in case of a rename vote */
 		new_nlink = ntohl(msg->md1.v_nlink);
-		ocfs2_process_dentry_request(inode, rename, new_nlink);
+		ocfs2_process_dentry_request(inode, rename, new_nlink,
+					     parent_blkno, namelen,
+					     msg->v_unlink_dirent);
 		break;
 	default:
 		mlog(ML_ERROR, "node %u, invalid request: %u\n",
@@ -634,37 +723,47 @@
 	return status;
 }
 
-static int ocfs2_do_request_vote(ocfs2_super *osb,
-				 u64 blkno,
-				 unsigned int generation,
-				 enum ocfs2_vote_request type,
-				 u32 priv,
-				 struct ocfs2_net_response_cb *callback)
+static ocfs2_vote_msg * ocfs2_new_vote_request(ocfs2_super *osb,
+					       u64 blkno,
+					       unsigned int generation,
+					       enum ocfs2_vote_request type,
+					       u32 priv)
 {
-	int status, response;
-	unsigned int response_id;
-	ocfs2_vote_msg *request = NULL;
+	ocfs2_vote_msg *request;
 	ocfs2_msg_hdr *hdr;
 
-	OCFS2_ASSERT(ocfs2_is_valid_vote_request(type));
+	BUG_ON(!ocfs2_is_valid_vote_request(type));
 
 	request = kcalloc(1, sizeof(*request), GFP_KERNEL);
 	if (!request) {
-		status = -ENOMEM;
-		mlog_errno(status);
-		goto bail;
+		mlog_errno(-ENOMEM);
+	} else {
+		hdr = &request->v_hdr;
+		hdr->h_node_num = htonl((unsigned int) osb->node_num);
+		hdr->h_request = htonl(type);
+		hdr->h_blkno = cpu_to_be64(blkno);
+		hdr->h_generation = htonl(generation);
+
+		request->md1.v_generic1 = htonl(priv);
 	}
 
-	hdr = &request->v_hdr;
+	return request;
+}
 
+/* Complete the buildup of a new vote request and process the
+ * broadcast return value. */
+static int ocfs2_do_request_vote(ocfs2_super *osb,
+				 ocfs2_vote_msg *request,
+				 struct ocfs2_net_response_cb *callback)
+{
+	int status, response;
+	unsigned int response_id;
+	ocfs2_msg_hdr *hdr;
+
 	response_id = ocfs2_new_response_id(osb);
 
+	hdr = &request->v_hdr;
 	hdr->h_response_id = htonl(response_id);
-	hdr->h_request = htonl(type);
-	hdr->h_blkno = cpu_to_be64(blkno);
-	hdr->h_generation = htonl(generation);
-	hdr->h_node_num = htonl((unsigned int) osb->node_num);
-	request->md1.v_generic1 = htonl(priv);
 
 	status = ocfs2_broadcast_vote(osb, request, response_id, &response,
 				      callback);
@@ -675,15 +774,12 @@
 
 	status = response;
 bail:
-	if (request)
-		kfree(request);
 
 	return status;
 }
 
 static int ocfs2_request_vote(struct inode *inode,
-			      enum ocfs2_vote_request type,
-			      u32 priv,
+			      ocfs2_vote_msg *request,
 			      struct ocfs2_net_response_cb *callback)
 {
 	int status;
@@ -706,12 +802,7 @@
 		status = 0;
 		if (!ocfs2_node_map_is_only(osb, &osb->mounted_map,
 					   osb->node_num))
-			status = ocfs2_do_request_vote(osb, 
-						       OCFS2_I(inode)->ip_blkno,
-						       inode->i_generation,
-						       type,
-						       priv,
-						       callback);
+			status = ocfs2_do_request_vote(osb, request, callback);
 
 		ocfs2_super_unlock(osb, 0);
 	}
@@ -752,8 +843,10 @@
 
 int ocfs2_request_delete_vote(struct inode *inode)
 {
-	int orphaned_slot;
+	int orphaned_slot, status;
 	struct ocfs2_net_response_cb delete_cb;
+	ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	ocfs2_vote_msg *request;
 
 	spin_lock(&OCFS2_I(inode)->ip_lock);
 	orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
@@ -765,66 +858,153 @@
 	mlog(0, "Inode %"MLFu64", we start thinking orphaned slot is %d\n",
 	     OCFS2_I(inode)->ip_blkno, orphaned_slot);
 
-	return ocfs2_request_vote(inode,
-				  OCFS2_VOTE_REQ_DELETE,
-				  orphaned_slot,
-				  &delete_cb);
+	status = -ENOMEM;
+	request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno,
+					 inode->i_generation,
+					 OCFS2_VOTE_REQ_DELETE, orphaned_slot);
+	if (request) {
+		status = ocfs2_request_vote(inode, request, &delete_cb);
+
+		kfree(request);
+	}
+
+	return status;
 }
 
+static void ocfs2_setup_unlink_vote(ocfs2_vote_msg *request,
+				    struct dentry *dentry)
+{
+	struct inode *parent = dentry->d_parent->d_inode;
+
+	/* We need some values which will uniquely identify a dentry
+	 * on the other nodes so that they can find it and run
+	 * d_delete against it. Parent directory block and full name
+	 * should suffice. */
+
+	mlog(0, "unlink/rename request: parent: %"MLFu64" name: %.*s\n",
+	     OCFS2_I(parent)->ip_blkno, dentry->d_name.len,
+	     dentry->d_name.name);
+
+	request->v_unlink_parent = cpu_to_be64(OCFS2_I(parent)->ip_blkno);
+	request->v_unlink_namelen = htonl(dentry->d_name.len);
+	memcpy(request->v_unlink_dirent, dentry->d_name.name,
+	       dentry->d_name.len);
+}
+
 int ocfs2_request_unlink_vote(struct inode *inode,
+			      struct dentry *dentry,
 			      unsigned int nlink)
 {
-	return ocfs2_request_vote(inode,
-				  OCFS2_VOTE_REQ_UNLINK,
-				  nlink,
-				  NULL);
+	int status;
+	ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	ocfs2_vote_msg *request;
+
+	if (dentry->d_name.len > OCFS2_VOTE_FILENAME_LEN)
+		return -ENAMETOOLONG;
+
+	status = -ENOMEM;
+	request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno,
+					 inode->i_generation,
+					 OCFS2_VOTE_REQ_UNLINK, nlink);
+	if (request) {
+		ocfs2_setup_unlink_vote(request, dentry);
+
+		status = ocfs2_request_vote(inode, request, NULL);
+
+		kfree(request);
+	}
+	return status;
 }
 
-int ocfs2_request_rename_vote(struct inode *inode)
+int ocfs2_request_rename_vote(struct inode *inode,
+			      struct dentry *dentry)
 {
-	return ocfs2_request_vote(inode,
-				  OCFS2_VOTE_REQ_RENAME,
-				  0,
-				  NULL);
+	int status;
+	ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	ocfs2_vote_msg *request;
+
+	if (dentry->d_name.len > OCFS2_VOTE_FILENAME_LEN)
+		return -ENAMETOOLONG;
+
+	status = -ENOMEM;
+	request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno,
+					 inode->i_generation,
+					 OCFS2_VOTE_REQ_RENAME, 0);
+	if (request) {
+		ocfs2_setup_unlink_vote(request, dentry);
+
+		status = ocfs2_request_vote(inode, request, NULL);
+
+		kfree(request);
+	}
+	return status;
 }
 
 int ocfs2_request_mount_vote(ocfs2_super *osb)
 {
 	int status;
+	ocfs2_vote_msg *request = NULL;
 
+	request = ocfs2_new_vote_request(osb, 0ULL, 0, 
+					 OCFS2_VOTE_REQ_MOUNT, 0);
+	if (!request) {
+		status = -ENOMEM;
+		goto bail;
+	}
+
 	status = -EAGAIN;
 	while (status == -EAGAIN) {
-		if (signal_pending(current))
-			return -ERESTARTSYS;
+		if (signal_pending(current)) {
+			status = -ERESTARTSYS;
+			goto bail;
+		}
 
 		if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
-					   osb->node_num))
-			return 0;
+					   osb->node_num)) {
+			status = 0;
+			goto bail;
+		}
 
-		status = ocfs2_do_request_vote(osb, 0ULL, 0,
-					       OCFS2_VOTE_REQ_MOUNT,
-					       0, NULL);
+		status = ocfs2_do_request_vote(osb, request, NULL);
 	}
+
+bail:
+	if (request)
+		kfree(request);
+
 	return status;
 }
 
 int ocfs2_request_umount_vote(ocfs2_super *osb)
 {
 	int status;
+	ocfs2_vote_msg *request = NULL;
 
+	request = ocfs2_new_vote_request(osb, 0ULL, 0, 
+					 OCFS2_VOTE_REQ_UMOUNT, 0);
+	if (!request) {
+		status = -ENOMEM;
+		goto bail;
+	}
+
 	status = -EAGAIN;
 	while (status == -EAGAIN) {
 		/* Do not check signals on this vote... We really want
 		 * this one to go all the way through. */
 
 		if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
-					   osb->node_num))
-			return 0;
+					   osb->node_num)) {
+			status = 0;
+			goto bail;
+		}
 
-		status = ocfs2_do_request_vote(osb, 0ULL, 0,
-					       OCFS2_VOTE_REQ_UMOUNT,
-					       0, NULL);
+		status = ocfs2_do_request_vote(osb, request, NULL);
 	}
+
+bail:
+	if (request)
+		kfree(request);
+
 	return status;
 }
 

Modified: trunk/fs/ocfs2/vote.h
===================================================================
--- trunk/fs/ocfs2/vote.h	2005-06-10 19:33:33 UTC (rev 2388)
+++ trunk/fs/ocfs2/vote.h	2005-06-10 22:08:41 UTC (rev 2389)
@@ -40,8 +40,10 @@
 
 int ocfs2_request_delete_vote(struct inode *inode);
 int ocfs2_request_unlink_vote(struct inode *inode,
+			      struct dentry *dentry,
 			      unsigned int nlink);
-int ocfs2_request_rename_vote(struct inode *inode);
+int ocfs2_request_rename_vote(struct inode *inode,
+			      struct dentry *dentry);
 int ocfs2_request_mount_vote(ocfs2_super *osb);
 int ocfs2_request_umount_vote(ocfs2_super *osb);
 int ocfs2_register_net_handlers(ocfs2_super *osb);