[Ocfs2-commits] mfasheh commits r2391 - in trunk: . fs/ocfs2

Mon Jun 13 17:58:45 CDT 2005

Author: mfasheh
Signed-off-by: jlbec
Date: 2005-06-13 17:58:43 -0500 (Mon, 13 Jun 2005)
New Revision: 2391

Modified:
   trunk/Config.make.in
   trunk/configure.in
   trunk/fs/ocfs2/Makefile
   trunk/fs/ocfs2/inode.c
   trunk/fs/ocfs2/inode.h
   trunk/fs/ocfs2/journal.c
   trunk/fs/ocfs2/super.c
   trunk/fs/ocfs2/vote.c
Log:
* Don't set i_nlink in the vote thread anymore. Instead provide our own
  drop_inode which can clear it when no other processes have references.

* Document our delete_inode path better. 

* ocfs2_recover_orphans might have taken a reference on an inode which has
  already passed through the vote thread, in which case we weren't clearing
  the state set up there.

* The atomic_inc / atomic_dec in ocfs2_delete_inode are no longer necessary,
  so remove them.

* Set checkpointing information on inodes in ocfs2_delete_inode so that if
  the operation fails, we don't give up the lock without a checkpoint.     

Signed-off-by: jlbec



Modified: trunk/Config.make.in
===================================================================

--- trunk/Config.make.in	2005-06-13 18:37:09 UTC (rev 2390)
+++ trunk/Config.make.in	2005-06-13 22:58:43 UTC (rev 2391)
@@ -51,6 +51,7 @@
 EXTRA_CFLAGS += @KAPI_COMPAT_CFLAGS@
 
 MISSING_SOCK_CREATE_LITE = @MISSING_SOCK_CREATE_LITE@
+MISSING_GENERIC_DROP_INODE = @MISSING_GENERIC_DROP_INODE@
 JOURNAL_ACCESS_WITH_CREDITS = @JOURNAL_ACCESS_WITH_CREDITS@
 BACKING_DEV_CAPABILITIES = @BACKING_DEV_CAPABILITIES@
 IDR_GET_NEW_RETURNS_ID = @IDR_GET_NEW_RETURNS_ID@

Modified: trunk/configure.in
===================================================================
--- trunk/configure.in	2005-06-13 18:37:09 UTC (rev 2390)
+++ trunk/configure.in	2005-06-13 22:58:43 UTC (rev 2391)
@@ -212,6 +212,10 @@
 OCFS2_CHECK_KERNEL(sock_create_lite, net.h,, MISSING_SOCK_CREATE_LITE=yes)
 AC_SUBST(MISSING_SOCK_CREATE_LITE)
 
+MISSING_GENERIC_DROP_INODE=
+OCFS2_CHECK_KERNEL(generic_drop_inode, fs.h,, MISSING_GENERIC_DROP_INODE=yes)
+AC_SUBST(MISSING_GENERIC_DROP_INODE)
+
 OCFS2_KERNEL_COMPAT(assert_spin_locked, spinlock.h)
 
 OCFS2_CHECK_KERNEL([kref_init with release callback], kref.h,

Modified: trunk/fs/ocfs2/Makefile
===================================================================
--- trunk/fs/ocfs2/Makefile	2005-06-13 18:37:09 UTC (rev 2390)
+++ trunk/fs/ocfs2/Makefile	2005-06-13 22:58:43 UTC (rev 2391)
@@ -26,6 +26,10 @@
 EXTRA_CFLAGS += -DJOURNAL_ACCESS_WITH_CREDITS
 endif
 
+ifdef MISSING_GENERIC_DROP_INODE
+EXTRA_CFLAGS += -DMISSING_GENERIC_DROP_INODE
+endif
+
 #
 # Since SUBDIRS means something to kbuild, define them safely.  Do not
 # include trailing slashes.

Modified: trunk/fs/ocfs2/inode.c
===================================================================
--- trunk/fs/ocfs2/inode.c	2005-06-13 18:37:09 UTC (rev 2390)
+++ trunk/fs/ocfs2/inode.c	2005-06-13 22:58:43 UTC (rev 2391)
@@ -488,7 +488,7 @@
 	 * the node who's doing the actual deleting should handle it
 	 * anyway. */
 	if (current == osb->vote_task) {
-		mlog(0, "Skipping delete of %lu because we're currently"
+		mlog(0, "Skipping delete of %lu because we're currently "
 		     "in process_vote\n", inode->i_ino);
 		goto bail;
 	}
@@ -504,12 +504,7 @@
 		goto bail;
 	}
 
-	/* ocfs2_meta_lock and friends might igrab / iput this guy, so we
-	 * take an extra ref. to avoid recursive calls to
-	 * delete_inode. */
-	atomic_inc(&inode->i_count);
 	status = ocfs2_meta_lock(inode, NULL, &fe_bh, 1);
-	atomic_dec(&inode->i_count);
 	if (status < 0) {
 		if (status != -ENOENT)
 			mlog_errno(status);
@@ -638,6 +633,11 @@
 		mlog_errno(status);
 		goto bail_unblock;
 	}
+	/* Set the locking information, even though we're wiping the
+	 * inode - if we error before completing the wipe, we'll want
+	 * to checkpoint our progress so other nodes get an up-to-date
+	 * picture. */
+	ocfs2_set_inode_lock_trans(osb->journal, inode);
 
 	status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, 
 				  orphan_dir_bh);
@@ -707,7 +707,7 @@
 	if (!inode)
 		goto bail;
 
-	mlog(0, "Clearing inode: %"MLFu64", nlink = %u)\n",
+	mlog(0, "Clearing inode: %"MLFu64", nlink = %u\n",
 	     OCFS2_I(inode)->ip_blkno, inode->i_nlink);
 
 	mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
@@ -715,7 +715,10 @@
 
 	/* We very well may get a clear_inode before all an inodes
 	 * metadata has hit disk. Of course, we can't drop any cluster
-	 * locks until the journal has finished with it. */
+	 * locks until the journal has finished with it. The only
+	 * exceptions here are successfully wiped inodes - their
+	 * metadata can now be considered to be part of the system
+	 * inodes from which it came. */
 	if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED))
 		ocfs2_checkpoint_inode(inode);
 
@@ -772,6 +775,51 @@
 	mlog_exit_void();
 }
 
+#ifdef MISSING_GENERIC_DROP_INODE
+/* Compatibility fallback: this is *only* a hack for kernels which
+ * don't have generic_drop_inode exported yet. */
+static void generic_drop_inode(struct inode *inode)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	if (inode->i_nlink) {
+		/* Nasty: the inode still has links, but since we can't
+		 * get to generic_forget_inode, we are forced to call
+		 * generic_delete_inode. Set SKIP_DELETE first to tell
+		 * ocfs2_delete_inode to skip the deletion path. */
+		spin_lock(&oi->ip_lock);
+		oi->ip_flags |= OCFS2_INODE_SKIP_DELETE;
+		spin_unlock(&oi->ip_lock);
+	}
+	generic_delete_inode(inode);
+}
+#endif
+
+/* Our drop_inode super operation. Called under inode_lock, with no
+ * more references on the struct inode, so it's safe here to check
+ * ip_flags and to manipulate i_nlink without any other locks. */
+void ocfs2_drop_inode(struct inode *inode)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	mlog_entry_void();
+
+	mlog(0, "Drop inode %"MLFu64", nlink = %u, ip_flags = 0x%x\n",
+	     oi->ip_blkno, inode->i_nlink, oi->ip_flags);
+
+	/* We test the MAYBE_ORPHANED hint instead of ip_orphaned_slot:
+	 * the slot check wouldn't work because we may not have gotten
+	 * a delete_inode vote from any other nodes yet. */
+	if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) {
+		mlog(0, "Inode was orphaned on another node, clearing nlink.\n");
+		inode->i_nlink = 0;
+	}
+
+	generic_drop_inode(inode);
+
+	mlog_exit_void();
+}
+
 /*
  * TODO: this should probably be merged into ocfs2_get_block
  * 

Modified: trunk/fs/ocfs2/inode.h
===================================================================
--- trunk/fs/ocfs2/inode.h	2005-06-13 18:37:09 UTC (rev 2390)
+++ trunk/fs/ocfs2/inode.h	2005-06-13 22:58:43 UTC (rev 2391)
@@ -75,16 +75,32 @@
 /*
  * Flags for the ip_flags field
  */
-/* Special types of inodes */
+/* System file inodes  */
 #define OCFS2_INODE_SYSTEM_FILE		0x00000001
 #define OCFS2_INODE_JOURNAL		0x00000002
 #define OCFS2_INODE_BITMAP		0x00000004
-/* Has the inode been deleted yet? */
+/* This inode has been wiped from disk */
 #define OCFS2_INODE_DELETED		0x00000008
 /* Another node is deleting, so our delete is a nop */
 #define OCFS2_INODE_SKIP_DELETE		0x00000010
+/* 
+ * Has the inode been orphaned on another node? 
+ *
+ * This hints to ocfs2_drop_inode that it should clear i_nlink before
+ * continuing.
+ *
+ * We *only* set this on unlink vote from another node. If the inode
+ * was locally orphaned, then we're sure of the state and don't need
+ * to twiddle i_nlink later - it's either zero or not depending on
+ * whether our unlink succeeded. Otherwise we got this from a node
+ * whose intention was to orphan the inode, however he may have
+ * crashed, failed etc, so we let ocfs2_drop_inode zero the value and
+ * rely on ocfs2_delete_inode to sort things out under the proper
+ * cluster locks.
+ */
+#define OCFS2_INODE_MAYBE_ORPHANED	0x00000020
 /* Does someone have the file open O_DIRECT */
-#define OCFS2_INODE_OPEN_DIRECT		0x00000020
+#define OCFS2_INODE_OPEN_DIRECT		0x00000040
 
 static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
 {
@@ -100,6 +116,7 @@
 				int *err, int reada);
 void ocfs2_clear_inode(struct inode *inode);
 void ocfs2_delete_inode(struct inode *inode);
+void ocfs2_drop_inode(struct inode *inode);
 struct inode *ocfs2_iget(ocfs2_super *osb, u64 feoff);
 struct inode *ocfs2_ilookup(ocfs2_super *osb, u64 feoff);
 int ocfs2_inode_init_private(struct inode *inode);

Modified: trunk/fs/ocfs2/journal.c
===================================================================
--- trunk/fs/ocfs2/journal.c	2005-06-13 18:37:09 UTC (rev 2390)
+++ trunk/fs/ocfs2/journal.c	2005-06-13 22:58:43 UTC (rev 2391)
@@ -1366,6 +1366,7 @@
 	struct buffer_head *bh = NULL;
 	struct ocfs2_dir_entry *de;
 	struct super_block *sb = osb->sb;
+	struct ocfs2_inode_info *oi;
 
 	mlog(0, "Recover inodes from orphan dir in slot %d\n", slot);
 
@@ -1463,15 +1464,24 @@
 	orphan_dir_inode = NULL;
 
 	while (inode) {
-		mlog(0, "iput orphan %"MLFu64"\n",
-		     OCFS2_I(inode)->ip_blkno);
+		oi = OCFS2_I(inode);
+		mlog(0, "iput orphan %"MLFu64"\n", oi->ip_blkno);
 
-		iter = OCFS2_I(inode)->ip_next_orphan;
+		iter = oi->ip_next_orphan;
 
-		spin_lock(&OCFS2_I(inode)->ip_lock);
-		OCFS2_I(inode)->ip_orphaned_slot = slot;
-		spin_unlock(&OCFS2_I(inode)->ip_lock);
+		spin_lock(&oi->ip_lock);
+		/* Delete voting may have set these on the assumption
+		 * that the other node would wipe them successfully.
+		 * If they are still in the node's orphan dir, we need
+		 * to reset that state. */
+		oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE);
 
+		/* Set the proper information to get us going into
+		 * ocfs2_delete_inode. */
+		oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
+		oi->ip_orphaned_slot = slot;
+		spin_unlock(&oi->ip_lock);
+
 		iput(inode);
 
 		inode = iter;

Modified: trunk/fs/ocfs2/super.c
===================================================================
--- trunk/fs/ocfs2/super.c	2005-06-13 18:37:09 UTC (rev 2390)
+++ trunk/fs/ocfs2/super.c	2005-06-13 22:58:43 UTC (rev 2391)
@@ -113,6 +113,7 @@
 	.statfs		= ocfs2_statfs,
 	.alloc_inode	= ocfs2_alloc_inode,
 	.destroy_inode	= ocfs2_destroy_inode,
+	.drop_inode	= ocfs2_drop_inode,
 	.clear_inode	= ocfs2_clear_inode,
 	.delete_inode	= ocfs2_delete_inode, 
 	.sync_fs	= ocfs2_sync_fs,

Modified: trunk/fs/ocfs2/vote.c
===================================================================
--- trunk/fs/ocfs2/vote.c	2005-06-13 18:37:09 UTC (rev 2390)
+++ trunk/fs/ocfs2/vote.c	2005-06-13 22:58:43 UTC (rev 2391)
@@ -164,8 +164,7 @@
 	mlog(0, "DELETE vote on inode %lu, read lnk_cnt = %u, slot = %d\n",
 	     inode->i_ino, inode->i_nlink, *orphaned_slot);
 
-	/* force this as ours may be out of date. */
-	inode->i_nlink = 0;
+	spin_lock(&OCFS2_I(inode)->ip_lock);
 
 	/* Whatever our vote response is, we want to make sure that
 	 * the orphaned slot is recorded properly on this node *and*
@@ -174,8 +173,6 @@
 	 * respond with BUSY he doesn't actually need the orphaned
 	 * slot, but it doesn't hurt to do it here anyway. */
 	if ((*orphaned_slot) != OCFS2_INVALID_SLOT) {
-		spin_lock(&OCFS2_I(inode)->ip_lock);
-
 		mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot != 
 				OCFS2_INVALID_SLOT && 
 				OCFS2_I(inode)->ip_orphaned_slot != 
@@ -190,19 +187,14 @@
 		     OCFS2_I(inode)->ip_blkno, *orphaned_slot);
 
 		OCFS2_I(inode)->ip_orphaned_slot = *orphaned_slot;
-		spin_unlock(&OCFS2_I(inode)->ip_lock);
 	} else {
-		spin_lock(&OCFS2_I(inode)->ip_lock);
-
 		mlog(0, "Sending back orphaned slot %d for inode %"MLFu64"\n",
 		     OCFS2_I(inode)->ip_orphaned_slot,
 		     OCFS2_I(inode)->ip_blkno);
 
 		*orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
-		spin_unlock(&OCFS2_I(inode)->ip_lock);
 	}
 
-	spin_lock(&OCFS2_I(inode)->ip_lock);
 	/* vote no if the file is still open. */
 	if (OCFS2_I(inode)->ip_open_count) {
 		mlog(0, "open count = %u\n",
@@ -249,7 +241,10 @@
 	response = OCFS2_RESPONSE_OK;
 
 	/* We set the SKIP_DELETE flag on the inode so we don't try to
-	 * delete it in delete_inode ourselves. */
+	 * delete it in delete_inode ourselves, thus avoiding
+	 * unnecessary lock pinging. If the other node failed to wipe
+	 * the inode as a result of a crash, then recovery will pick
+	 * up the slack. */
 	OCFS2_I(inode)->ip_flags |=  OCFS2_INODE_SKIP_DELETE;
 	spin_unlock(&OCFS2_I(inode)->ip_lock);
 
@@ -330,11 +325,21 @@
 		dput(dentry);
 	}
 
-	/* for rename, we don't change link counts */
+	/* rename votes don't send link counts */
 	if (!rename) {
 		mlog(0, "new_nlink = %u\n", new_nlink);
 
-		inode->i_nlink = new_nlink;
+		/* We don't have the proper locks here to directly
+		 * change i_nlink and besides, the vote is sent
+		 * *before* the operation so it may have failed on the
+		 * other node. This passes a hint to ocfs2_drop_inode
+		 * to force ocfs2_delete_inode, who will take the
+		 * proper cluster locks to sort things out. */
+		if (new_nlink == 0) {
+			spin_lock(&OCFS2_I(inode)->ip_lock);
+			OCFS2_I(inode)->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
+			spin_unlock(&OCFS2_I(inode)->ip_lock);
+		}
 	}
 }