[Ocfs2-commits] mfasheh commits r2467 - trunk/fs/ocfs2

Mon Jul 11 16:43:23 CDT 2005

Author: mfasheh
Signed-off-by: jlbec
Date: 2005-07-11 16:43:21 -0500 (Mon, 11 Jul 2005)
New Revision: 2467

Modified:
   trunk/fs/ocfs2/Makefile
   trunk/fs/ocfs2/dlmglue.c
   trunk/fs/ocfs2/file.c
   trunk/fs/ocfs2/inode.c
   trunk/fs/ocfs2/vote.c
Log:
* Add some workaround code for a problem where delete_inode calls
  truncate_inode_pages before OCFS2 can check whether to actually wipe the
  inode. This is enabled right now by default as the kernel patch to fix  
  this is still under review.     

* Fix up delete_inode to separate out the parts which do a cluster query
  from the parts which actually handle the removal of an inode from the 
  system.            

Signed-off-by: jlbec



Modified: trunk/fs/ocfs2/Makefile
===================================================================

--- trunk/fs/ocfs2/Makefile	2005-07-11 19:03:00 UTC (rev 2466)
+++ trunk/fs/ocfs2/Makefile	2005-07-11 21:43:21 UTC (rev 2467)
@@ -26,6 +26,8 @@
 EXTRA_CFLAGS += -DJOURNAL_ACCESS_WITH_CREDITS
 endif
 
+EXTRA_CFLAGS += -DOCFS2_DELETE_INODE_WORKAROUND
+
 #
 # Since SUBDIRS means something to kbuild, define them safely.  Do not
 # include trailing slashes.

Modified: trunk/fs/ocfs2/dlmglue.c
===================================================================
--- trunk/fs/ocfs2/dlmglue.c	2005-07-11 19:03:00 UTC (rev 2466)
+++ trunk/fs/ocfs2/dlmglue.c	2005-07-11 21:43:21 UTC (rev 2467)
@@ -1436,23 +1436,24 @@
 {
 	int status = 0;
 	u32 trustable_clusters = 0;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct ocfs2_lock_res *lockres;
 	ocfs2_dinode *fe;
 
 	mlog_entry_void();
 
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
+	spin_lock(&oi->ip_lock);
+	if (oi->ip_flags & OCFS2_INODE_DELETED) {
 		mlog(0, "Orphaned inode %"MLFu64" was deleted while we "
 		     "were waiting on a lock. ip_flags = 0x%x\n",
-		     OCFS2_I(inode)->ip_blkno, OCFS2_I(inode)->ip_flags);
-		spin_unlock(&OCFS2_I(inode)->ip_lock);
+		     oi->ip_blkno, oi->ip_flags);
+		spin_unlock(&oi->ip_lock);
 		status = -ENOENT;
 		goto bail;
 	}
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
+	spin_unlock(&oi->ip_lock);
 
-	lockres = &OCFS2_I(inode)->ip_meta_lockres;
+	lockres = &oi->ip_meta_lockres;
 
 	if (!ocfs2_should_refresh_lock_res(lockres))
 		goto bail;
@@ -1469,9 +1470,8 @@
 	} else {
 		/* Boo, we have to go to disk. */
 		/* read bh, cast, ocfs2_refresh_inode */
-		status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-					  OCFS2_I(inode)->ip_blkno, bh,
-					  OCFS2_BH_CACHED, inode);
+		status = ocfs2_read_block(OCFS2_SB(inode->i_sb), oi->ip_blkno,
+					  bh, OCFS2_BH_CACHED, inode);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail_refresh;
@@ -1491,13 +1491,12 @@
 				le32_to_cpu(fe->i_generation),
 				"Invalid dinode %"MLFu64" disk generation: %u "
 				"inode->i_generation: %u\n",
-				OCFS2_I(inode)->ip_blkno,
-				le32_to_cpu(fe->i_generation),
+				oi->ip_blkno, le32_to_cpu(fe->i_generation),
 				inode->i_generation);
 		mlog_bug_on_msg(fe->i_dtime || !(fe->i_flags & OCFS2_VALID_FL),
 				"Stale dinode %"MLFu64" dtime: %"MLFu64" "
-				"flags: 0x%x\n", OCFS2_I(inode)->ip_blkno,
-				fe->i_dtime, fe->i_flags);
+				"flags: 0x%x\n", oi->ip_blkno, fe->i_dtime,
+				fe->i_flags);
 
 		ocfs2_refresh_inode(inode, fe);
 	}
@@ -1512,6 +1511,21 @@
 	ocfs2_set_local_seq_from_lvb(lockres);
 	ocfs2_reset_meta_lvb_values(inode);
 
+#ifdef OCFS2_DELETE_INODE_WORKAROUND
+	/* We might as well check this here - since the inode is now
+	 * locked, an up to date view will indicate whether this was
+	 * never actually orphaned -- i_nlink should be zero for an
+	 * orphaned inode. */
+	spin_lock(&oi->ip_lock);
+	if (inode->i_nlink &&
+	    oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) {
+		mlog(0, "Inode %"MLFu64": clearing maybe_orphaned flag\n",
+		     oi->ip_blkno);
+		oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED;
+	}
+	spin_unlock(&oi->ip_lock);
+#endif
+
 	status = 0;
 bail_refresh:
 	ocfs2_complete_lock_res_refresh(lockres, status);

Modified: trunk/fs/ocfs2/file.c
===================================================================
--- trunk/fs/ocfs2/file.c	2005-07-11 19:03:00 UTC (rev 2466)
+++ trunk/fs/ocfs2/file.c	2005-07-11 21:43:21 UTC (rev 2467)
@@ -102,6 +102,17 @@
 		       file->f_dentry->d_name.name);
 
 	spin_lock(&oi->ip_lock);
+#ifdef OCFS2_DELETE_INODE_WORKAROUND
+	/* Do the sync *before* decrementing ip_open_count as
+	 * otherwise the voting code might allow this inode to be
+	 * wiped. */
+	if (oi->ip_open_count == 1 &&
+	    oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) {
+		spin_unlock(&oi->ip_lock);
+		write_inode_now(inode, 1);
+		spin_lock(&oi->ip_lock);
+	}
+#endif
 	if (!--oi->ip_open_count)
 		oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
 	spin_unlock(&oi->ip_lock);

Modified: trunk/fs/ocfs2/inode.c
===================================================================
--- trunk/fs/ocfs2/inode.c	2005-07-11 19:03:00 UTC (rev 2466)
+++ trunk/fs/ocfs2/inode.c	2005-07-11 21:43:21 UTC (rev 2467)
@@ -441,44 +441,165 @@
 	return status;
 }
 
-void ocfs2_delete_inode(struct inode *inode)
+static int ocfs2_remove_inode(struct inode *inode,
+			      struct buffer_head *di_bh,
+			      struct inode *orphan_dir_inode,
+			      struct buffer_head *orphan_dir_bh)
 {
-	int status = 0;
-	int unlock = 0;
-	int orphaned_slot;
-	struct inode *orphan_dir_inode = NULL;
+	int status;
 	struct inode *inode_alloc_inode = NULL;
-	ocfs2_journal_handle *handle = NULL;
+	struct buffer_head *inode_alloc_bh = NULL;
+	ocfs2_journal_handle *handle;
 	ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct buffer_head *orphan_dir_bh = NULL;
-	struct buffer_head *inode_alloc_bh = NULL;
-	struct buffer_head *fe_bh = NULL;
-	ocfs2_dinode *fe;
-	sigset_t blocked, oldset;
+	ocfs2_dinode *di = (ocfs2_dinode *) di_bh->b_data;
 
-	mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
+	inode_alloc_inode =
+		ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
+					    le16_to_cpu(di->i_suballoc_slot));
+	if (!inode_alloc_inode) {
+		status = -EEXIST;
+		mlog_errno(status);
+		goto bail;
+	}
 
-	if (is_bad_inode(inode))
+	down(&inode_alloc_inode->i_sem);
+	status = ocfs2_meta_lock(inode_alloc_inode, NULL, &inode_alloc_bh, 1);
+	if (status < 0) {
+		up(&inode_alloc_inode->i_sem);
+
+		mlog_errno(status);
 		goto bail;
+	}
 
-	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
-		mlog(0, "Skipping system file delete.\n");
+	handle = ocfs2_start_trans(osb, NULL, OCFS2_DELETE_INODE_CREDITS);
+	if (handle == NULL) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail_unlock;
+	}
+
+	ocfs2_set_inode_lock_trans(osb->journal, inode_alloc_inode);
+	ocfs2_set_inode_lock_trans(osb->journal, orphan_dir_inode);
+	/* Set the inode locking information, even though we're wiping
+	 * the inode - if we error before completing the wipe, we'll
+	 * want to checkpoint our progress so other nodes get an
+	 * up-to-date picture. */
+	ocfs2_set_inode_lock_trans(osb->journal, inode);
+
+	status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
+				  orphan_dir_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_commit;
+	}
+
+	/* set the inodes dtime */
+	status = ocfs2_journal_access(handle, inode, di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_commit;
+	}
+
+	di->i_dtime = CURRENT_TIME.tv_sec;
+	di->i_flags &= (~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
+
+	status = ocfs2_journal_dirty(handle, di_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_commit;
+	}
+
+	ocfs2_remove_from_cache(inode, di_bh);
+
+	status = ocfs2_free_dinode(handle, inode_alloc_inode,
+				   inode_alloc_bh, di);
+	if (status < 0)
+		mlog_errno(status);
+
+bail_commit:
+	ocfs2_commit_trans(handle);
+bail_unlock:
+	ocfs2_meta_unlock(inode_alloc_inode, 1);
+	up(&inode_alloc_inode->i_sem);
+	brelse(inode_alloc_bh);
+bail:
+	iput(inode_alloc_inode);
+
+	return status;
+}
+
+static int ocfs2_wipe_inode(struct inode *inode,
+			    struct buffer_head *di_bh)
+{
+	int status, orphaned_slot;
+	struct inode *orphan_dir_inode = NULL;
+	struct buffer_head *orphan_dir_bh = NULL;
+	ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	/* We've already voted on this so it should be readonly - no
+	 * spinlock needed. */
+	orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
+	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
+						       ORPHAN_DIR_SYSTEM_INODE,
+						       orphaned_slot);
+	if (!orphan_dir_inode) {
+		status = -EEXIST;
+		mlog_errno(status);
 		goto bail;
 	}
 
-	if (inode == osb->root_inode) {
-		mlog(0, "Skipping root inode delete.\n");
+	/* Lock the orphan dir. The lock will be held for the entire
+	 * delete_inode operation. We do this now to avoid races with
+	 * recovery completion on other nodes. */
+	down(&orphan_dir_inode->i_sem);
+	status = ocfs2_meta_lock(orphan_dir_inode, NULL, &orphan_dir_bh, 1);
+	if (status < 0) {
+		up(&orphan_dir_inode->i_sem);
+
+		mlog_errno(status);
 		goto bail;
 	}
 
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_DELETE) {
-		spin_unlock(&OCFS2_I(inode)->ip_lock);
-		mlog(0, "Skipping delete of %lu because another node "
-		     "has done this for us.\n", inode->i_ino);
+	/* we do this while holding the orphan dir lock because we
+	 * don't want recovery being run from another node to vote for
+	 * an inode delete on us -- this will result in two nodes
+	 * truncating the same file! */
+	status = ocfs2_truncate_for_delete(osb, inode, di_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_unlock_dir;
+	}
+
+	status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode,
+				    orphan_dir_bh);
+	if (status < 0)
+		mlog_errno(status);
+
+bail_unlock_dir:
+	ocfs2_meta_unlock(orphan_dir_inode, 1);
+	up(&orphan_dir_inode->i_sem);
+	brelse(orphan_dir_bh);
+bail:
+	iput(orphan_dir_inode);
+
+	return status;
+}
+
+/* There is a series of simple checks that should be done before a
+ * vote is even considered. Encapsulate those in this function. */
+static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
+{
+	int ret = 0;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	/* We shouldn't be getting here for the root directory
+	 * inode.. */
+	if (inode == osb->root_inode) {
+		mlog(ML_ERROR, "Skipping delete of root inode.\n");
 		goto bail;
 	}
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
 
 	/* If we're coming from process_vote we can't go into our own
 	 * voting [hello, deadlock city!], so unforuntately we just
@@ -491,211 +612,215 @@
 		goto bail;
 	}
 
-	/* We want to blocks signals in delete_inode as the lock and
-	 * messaging paths may return us -ERESTARTSYS. This however
-	 * could result in inodes being orphaned forever. */
-	sigfillset(&blocked);
-
-	status = sigprocmask(SIG_BLOCK, &blocked, &oldset);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
+	spin_lock(&oi->ip_lock);
+	/* OCFS2 *never* deletes system files. This should technically
+	 * never get here as system file inodes should always have a
+	 * positive link count. */
+	if (oi->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
+		mlog(ML_ERROR, "Skipping delete of system file %"MLFu64".\n",
+		     oi->ip_blkno);
+		goto bail_unlock;
 	}
 
-	status = ocfs2_meta_lock(inode, NULL, &fe_bh, 1);
-	if (status < 0) {
-		if (status != -ENOENT)
-			mlog_errno(status);
-		goto bail_unblock;
-	}
-	unlock = 1;
-
-	/* While we were waiting for the lock, another node might have
-	 * asked to delete the inode. Recheck our flags to catch this
-	 * race and just clear_inode instead.*/
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_DELETE) {
-		spin_unlock(&OCFS2_I(inode)->ip_lock);
+	/* If we have voted "yes" on the wipe of this inode for
+	 * another node, it will be marked here so we can safely skip
+	 * it. Recovery will cleanup any inodes we might inadvertantly
+	 * skip here. */
+	if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) {
 		mlog(0, "Skipping delete of %lu because another node "
 		     "has done this for us.\n", inode->i_ino);
-		goto bail_unblock;
+		goto bail_unlock;
 	}
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
 
-	status = ocfs2_request_delete_vote(inode);
-	if (status < 0) {
-		/* EBUSY here is assumed to mean that other nodes are
-		 * still using the inode. We're done here though, so
-		 * avoid doing anything on disk and let them worry
-		 * about deleting it. */
-		if (status != -EBUSY)
-			mlog_errno(status);
-		goto bail_unblock;
-	}
+	ret = 1;
+bail_unlock:
+	spin_unlock(&oi->ip_lock);
+bail:
+	return ret;
+}
 
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-	orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
+/* Query the cluster to determine whether we should wipe an inode from
+ * disk or not.
+ *
+ * Requires the inode to have the cluster lock. */
+static int ocfs2_query_inode_wipe(struct inode *inode,
+				  struct buffer_head *di_bh,
+				  int *wipe)
+{
+	int status = 0;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	ocfs2_dinode *di;
 
-	if (orphaned_slot == OCFS2_INVALID_SLOT) {
-		/* Nobody knew which slot this inode was orphaned
-		 * into. This may happen during node death and
-		 * recovery knows how to clean it up so we can safely
-		 * ignore this inode for now on. */
-		mlog(0, "Nobody knew where inode %"MLFu64" was orphaned!\n",
-		     OCFS2_I(inode)->ip_blkno);
+	*wipe = 0;
 
-		/* XXX: Is this really necessary? */
-		spin_lock(&OCFS2_I(inode)->ip_lock);
-		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED;
-		spin_unlock(&OCFS2_I(inode)->ip_lock);
-		goto bail_unblock;
+	/* While we were waiting for the cluster lock in
+	 * ocfs2_delete_inode, another node might have asked to delete
+	 * the inode. Recheck our flags to catch this. */
+	if (!ocfs2_inode_is_valid_to_delete(inode)) {
+		mlog(0, "Skipping delete of %"MLFu64" because flags changed\n",
+		     oi->ip_blkno);
+		goto bail;
 	}
 
-	mlog(0, "Inode %"MLFu64" is ok to wipe from orphan dir slot %d\n",
-	     OCFS2_I(inode)->ip_blkno, orphaned_slot);
+	/* Now that we have an up to date inode, we can double check
+	 * the link count. */
+	if (inode->i_nlink) {
+		mlog(0, "Skipping delete of %"MLFu64" because nlink = %u\n",
+		     oi->ip_blkno, inode->i_nlink);
+		goto bail;
+	}
 
-	fe = (ocfs2_dinode *) fe_bh->b_data;
-	if (!(fe->i_flags & OCFS2_ORPHANED_FL)) {
+	/* Do some basic inode verification... */
+	di = (ocfs2_dinode *) di_bh->b_data;
+	if (!(di->i_flags & OCFS2_ORPHANED_FL)) {
 		/* for lack of a better error? */
 		status = -EEXIST;
 		mlog(ML_ERROR,
 		     "Inode %"MLFu64" (on-disk %"MLFu64") not orphaned!\n",
 		     OCFS2_I(inode)->ip_blkno,
-		     fe->i_blkno);
-		goto bail_unblock;
+		     di->i_blkno);
+		goto bail;
 	}
 
 	/* has someone already deleted us?! baaad... */
-	if (fe->i_dtime) {
+	if (di->i_dtime) {
 		status = -EEXIST;
 		mlog_errno(status);
-		goto bail_unblock;
+		goto bail;
 	}
 
-	if (fe->i_links_count) {
-		status = -EBUSY;
-		mlog_errno(status);
-		goto bail_unblock;
+	status = ocfs2_request_delete_vote(inode);
+	/* -EBUSY means that other nodes are still using the
+	 * inode. We're done here though, so avoid doing anything on
+	 * disk and let them worry about deleting it. */
+	if (status == -EBUSY) {
+		status = 0;
+		mlog(0, "Skipping delete of %"MLFu64" because it is in use on"
+		     "other nodes\n", oi->ip_blkno);
+		goto bail;
 	}
-
-	/* Oop, lets be carefull of lock / trans ordering here... */
-	handle = ocfs2_alloc_handle(osb);
-	if (handle == NULL) {
-		status = -ENOMEM;
-		mlog_errno(status);
-		goto bail_unblock;
-	}
-
-	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
-						       ORPHAN_DIR_SYSTEM_INODE,
-						       orphaned_slot);
-	if (!orphan_dir_inode) {
-		status = -EEXIST;
-		mlog_errno(status);
-		goto bail_unblock;
-	}
-	ocfs2_handle_add_inode(handle, orphan_dir_inode);
-	status = ocfs2_meta_lock(orphan_dir_inode, handle, &orphan_dir_bh, 1);
 	if (status < 0) {
 		mlog_errno(status);
-		goto bail_unblock;
+		goto bail;
 	}
 
-	/* we do this while holding the orphan dir lock because we
-	 * don't want recovery being run from another node to vote for
-	 * an inode delete on us -- this will result in two nodes
-	 * truncating the same file! */
-	status = ocfs2_truncate_for_delete(osb, inode, fe_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail_unblock;
+	spin_lock(&oi->ip_lock);
+	if (oi->ip_orphaned_slot == OCFS2_INVALID_SLOT) {
+		/* Nobody knew which slot this inode was orphaned
+		 * into. This may happen during node death and
+		 * recovery knows how to clean it up so we can safely
+		 * ignore this inode for now on. */
+		mlog(0, "Nobody knew where inode %"MLFu64" was orphaned!\n",
+		     oi->ip_blkno);
+	} else {
+		*wipe = 1;
+
+		mlog(0, "Inode %"MLFu64" is ok to wipe from orphan dir %d\n",
+		     oi->ip_blkno, oi->ip_orphaned_slot);
 	}
+	spin_unlock(&oi->ip_lock);
 
-	inode_alloc_inode =
-		ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
-					    le16_to_cpu(fe->i_suballoc_slot));
-	if (!inode_alloc_inode) {
-		status = -EEXIST;
-		mlog_errno(status);
-		goto bail_unblock;
+bail:
+	return status;
+}
+
+/* Support function for ocfs2_delete_inode. Will help us keep the
+ * inode data in a consistent state for clear_inode. Always truncates
+ * pages, optionally sync's them first. */
+static void ocfs2_cleanup_delete_inode(struct inode *inode,
+				       int sync_data)
+{
+	mlog(0, "Cleanup inode %"MLFu64", sync = %d\n",
+	     OCFS2_I(inode)->ip_blkno, sync_data);
+#ifndef OCFS2_DELETE_INODE_WORKAROUND
+	if (sync_data)
+		write_inode_now(inode, 1);
+	truncate_inode_pages(&inode->i_data, 0);
+#endif
+}
+
+void ocfs2_delete_inode(struct inode *inode)
+{
+	int wipe, status;
+	sigset_t blocked, oldset;
+	struct buffer_head *di_bh = NULL;
+
+	mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
+
+	if (is_bad_inode(inode)) {
+		mlog(0, "Skipping delete of bad inode\n");
+		goto bail;
 	}
-	ocfs2_handle_add_inode(handle, inode_alloc_inode);
-	status = ocfs2_meta_lock(inode_alloc_inode, handle, &inode_alloc_bh,
-				 1);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail_unblock;
-	}
 
-	handle = ocfs2_start_trans(osb, handle, OCFS2_DELETE_INODE_CREDITS);
-	if (handle == NULL) {
-		status = -ENOMEM;
-		mlog_errno(status);
-		goto bail_unblock;
+	if (!ocfs2_inode_is_valid_to_delete(inode)) {
+		/* It's probably not necessary to truncate_inode_pages
+		 * here but we do it for safety anyway (it will most
+		 * likely be a no-op anyway) */
+		ocfs2_cleanup_delete_inode(inode, 0);
+		goto bail;
 	}
-	/* Set the locking information, even though we're wiping the
-	 * inode - if we error before completing the wipe, we'll want
-	 * to checkpoint our progress so other nodes get an up-to-date
-	 * picture. */
-	ocfs2_set_inode_lock_trans(osb->journal, inode);
 
-	status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
-				  orphan_dir_bh);
+	/* We want to block signals in delete_inode as the lock and
+	 * messaging paths may return us -ERESTARTSYS. Which would
+	 * cause us to exit early, resulting in inodes being orphaned
+	 * forever. */
+	sigfillset(&blocked);
+	status = sigprocmask(SIG_BLOCK, &blocked, &oldset);
 	if (status < 0) {
 		mlog_errno(status);
-		goto bail_unblock;
+		ocfs2_cleanup_delete_inode(inode, 1);
+		goto bail;
 	}
 
-	/* set the inodes dtime */
-	status = ocfs2_journal_access(handle, inode, fe_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	/* Lock down the inode. This gives us an up to date view of
+	 * it's metadata (for verification), and allows us to
+	 * serialize delete_inode votes. */
+	status = ocfs2_meta_lock(inode, NULL, &di_bh, 1);
 	if (status < 0) {
-		mlog_errno(status);
+		if (status != -ENOENT)
+			mlog_errno(status);
+		ocfs2_cleanup_delete_inode(inode, 0);
 		goto bail_unblock;
 	}
 
-	fe->i_dtime = CURRENT_TIME.tv_sec;
-	fe->i_flags &= (~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
+	/* Query the cluster. This will be the final decision made
+	 * before we go ahead and wipe the inode. */
+	status = ocfs2_query_inode_wipe(inode, di_bh, &wipe);
+	if (!wipe || status < 0) {
+		/* Error and inode busy vote both mean we won't be
+		 * removing the inode, so they take almost the same
+		 * path. */
+		if (status < 0)
+			mlog_errno(status);
 
-	status = ocfs2_journal_dirty(handle, fe_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail_unblock;
+		/* Someone in the cluster has voted to not wipe this
+		 * inode, or it was never completely orphaned. Write
+		 * out the pages and exit now. */
+		ocfs2_cleanup_delete_inode(inode, 1);
+		goto bail_unlock_inode;
 	}
 
-	ocfs2_remove_from_cache(inode, fe_bh);
+	ocfs2_cleanup_delete_inode(inode, 0);
 
-	status = ocfs2_free_dinode(handle, inode_alloc_inode,
-				   inode_alloc_bh, fe);
+	status = ocfs2_wipe_inode(inode, di_bh);
 	if (status < 0) {
 		mlog_errno(status);
-		goto bail_unblock;
+		goto bail_unlock_inode;
 	}
 
+	/* Mark the inode as successfully deleted. This is important
+	 * for ocfs2_clear_inode as it will check this flag and skip
+	 * any checkpointing work */
 	OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED;
+
+bail_unlock_inode:
+	ocfs2_meta_unlock(inode, 1);
+	brelse(di_bh);
 bail_unblock:
 	status = sigprocmask(SIG_SETMASK, &oldset, NULL);
 	if (status < 0)
 		mlog_errno(status);
-
 bail:
-	if (handle)
-		ocfs2_commit_trans(handle);
-	if (unlock)
-		ocfs2_meta_unlock(inode, 1);
-	if (orphan_dir_bh)
-		brelse(orphan_dir_bh);
-	if (inode_alloc_bh)
-		brelse(inode_alloc_bh);
-	if (fe_bh)
-		brelse(fe_bh);
-	if (orphan_dir_inode)
-		iput(orphan_dir_inode);
-	if (inode_alloc_inode)
-		iput(inode_alloc_inode);
-
-	/* we must clear inode. */
 	clear_inode(inode);
 	mlog_exit_void();
 }

Modified: trunk/fs/ocfs2/vote.c
===================================================================
--- trunk/fs/ocfs2/vote.c	2005-07-11 19:03:00 UTC (rev 2466)
+++ trunk/fs/ocfs2/vote.c	2005-07-11 21:43:21 UTC (rev 2467)
@@ -295,6 +295,7 @@
 {
 	struct dentry *dentry = NULL;
 	struct list_head *p;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 
 	mlog(0, "parent %"MLFu64", namelen = %u, name = %.*s\n", parent_blkno,
 	     namelen, namelen, name);
@@ -336,8 +337,23 @@
 		 * to force ocfs2_delete_inode, who will take the
 		 * proper cluster locks to sort things out. */
 		if (new_nlink == 0) {
-			spin_lock(&OCFS2_I(inode)->ip_lock);
-			OCFS2_I(inode)->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
+			spin_lock(&oi->ip_lock);
+			oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
+
+#ifdef OCFS2_DELETE_INODE_WORKAROUND
+			/* Do a sync now as we can't be sure whether
+			 * the inode will actually be orphaned or
+			 * not. We condition this on the open count as
+			 * otherwise, ocfs2_file_release will handle
+			 * it for us. */
+			if (!oi->ip_open_count) {
+				spin_unlock(&oi->ip_lock);
+				write_inode_now(inode, 1);
+				/* strange indentation past the
+				 * 'else', but I want to keep the non
+				 * hack code purty :) */
+			} else
+#endif
 			spin_unlock(&OCFS2_I(inode)->ip_lock);
 		}
 	}