[Ocfs2-devel] [PATCH] ocfs2: flush dentry lock drop when sync ocfs2 volume.
Jan Kara
jack at suse.cz
Mon Jul 20 05:34:01 PDT 2009
Hi,
On Mon 20-07-09 17:08:42, Tao Ma wrote:
> In commit ea455f8ab68338ba69f5d3362b342c115bea8e13, we move the
> dentry lock put process into ocfs2_wq. This is OK for most case,
> but as for umount, it lead to at least 2 bugs. See
> http://oss.oracle.com/bugzilla/show_bug.cgi?id=1133 and
> http://oss.oracle.com/bugzilla/show_bug.cgi?id=1135. And it happens
> easily if we have opened a lot of inodes.
>
> For 1135, the reason is that during umount will call
> generic_shutdown_super and it will do:
> 1. shrink_dcache_for_umount
> 2. sync_filesystem.
> 3. invalidate_inodes.
>
> In shrink_dcache_for_umount, we will drop the dentry, and queue
> ocfs2_wq for dentry lock put. While in invalidate_inodes we will
> call invalidate_list which will iterate all the inodes for the sb.
> The bad thing is that in this function it will call
> cond_resched_lock(&inode_lock). So if in any case, we are scheduled
> out and ocfs2_wq is scheduled and drop some inodes, the "next" in
> invalidate_list will get damaged(have next->next = next). And the
> invalidate_list will enter dead loop and cause very high cpu.
Aw, well spotted. Sorry for the bug.
> So the only chance that we can solve this problem is flush dentry put
> in step 2 of generic_shutdown_super, that is sync_filesystem. And
> this patch is just adding dentry put flush process in ocfs2_sync_fs.
>
> Jan,
> Will dentry put in sync_fs have potential dead lock with quota
> lock? If yes, maybe we have to revert that commit which cause this umount
> problem and find other ways instead.
Well, it might be OK, but IMHO it's a crude hack. I think the right fix
should be different: OCFS2 should provide its own .kill_sb function. In
that function we should make sure ocfs2_wq stops putting inode references
and then flush the list from ocfs2_put_super.
Regarding quota this should be safe as the only lock we hold is
umount_sem.
Below is a patch doing what I'd imagine (lightly tested only). Could you
verify whether it fixes the issue you can see?
Honza
--
Jan Kara <jack at suse.cz>
SUSE Labs, CR
---
From f96286432cc05f80e3aabc0c4795c62ffdd37d9a Mon Sep 17 00:00:00 2001
From: Jan Kara <jack at suse.cz>
Date: Mon, 20 Jul 2009 12:12:36 +0200
Subject: [PATCH] ocfs2: Fix deadlock on umount
In commit ea455f8ab68338ba69f5d3362b342c115bea8e13, we moved the
dentry lock put process into ocfs2_wq. This is OK for most cases,
but for umount it leads to at least 2 bugs. See
http://oss.oracle.com/bugzilla/show_bug.cgi?id=1133 and
http://oss.oracle.com/bugzilla/show_bug.cgi?id=1135. And it happens
easily if we have opened a lot of inodes.
For 1135, the reason is that umount calls
generic_shutdown_super, which will do:
1. shrink_dcache_for_umount
2. sync_filesystem.
3. invalidate_inodes.
In shrink_dcache_for_umount, we will drop the dentry, and queue
ocfs2_wq for dentry lock put. While in invalidate_inodes we will
call invalidate_list which will iterate all the inodes for the sb.
The bad thing is that in this function it will call
cond_resched_lock(&inode_lock). So if in any case, we are scheduled
out and ocfs2_wq is scheduled and drops some inodes, the "next" in
invalidate_list will get damaged (having next->next = next). Then
invalidate_list will enter a dead loop and cause very high CPU usage.
Fix the problem by making sure that ocfs2_wq does no more putting of
inode references once umounting starts, and drop all the remaining
references from ocfs2_put_super().
Signed-off-by: Jan Kara <jack at suse.cz>
---
fs/ocfs2/dcache.c | 35 +++++++++++++++++++++++++++--------
fs/ocfs2/dcache.h | 3 +++
fs/ocfs2/ocfs2.h | 13 +++++++++++++
fs/ocfs2/super.c | 25 ++++++++++++++++++++++---
4 files changed, 65 insertions(+), 11 deletions(-)
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index b574431..fd4fa65 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -310,22 +310,19 @@ out_attach:
return ret;
}
-static DEFINE_SPINLOCK(dentry_list_lock);
+DEFINE_SPINLOCK(dentry_list_lock);
/* We limit the number of dentry locks to drop in one go. We have
* this limit so that we don't starve other users of ocfs2_wq. */
#define DL_INODE_DROP_COUNT 64
/* Drop inode references from dentry locks */
-void ocfs2_drop_dl_inodes(struct work_struct *work)
+static void __ocfs2_drop_dl_inodes(struct ocfs2_super *osb, int drop_count)
{
- struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
- dentry_lock_work);
struct ocfs2_dentry_lock *dl;
- int drop_count = DL_INODE_DROP_COUNT;
spin_lock(&dentry_list_lock);
- while (osb->dentry_lock_list && drop_count--) {
+ while (osb->dentry_lock_list && (drop_count < 0 || drop_count--)) {
dl = osb->dentry_lock_list;
osb->dentry_lock_list = dl->dl_next;
spin_unlock(&dentry_list_lock);
@@ -333,11 +330,32 @@ void ocfs2_drop_dl_inodes(struct work_struct *work)
kfree(dl);
spin_lock(&dentry_list_lock);
}
- if (osb->dentry_lock_list)
+ spin_unlock(&dentry_list_lock);
+}
+
+void ocfs2_drop_dl_inodes(struct work_struct *work)
+{
+ struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
+ dentry_lock_work);
+
+ __ocfs2_drop_dl_inodes(osb, DL_INODE_DROP_COUNT);
+ /*
+ * Don't queue dropping if umount is in progress. We flush the
+ * list in ocfs2_dismount_volume
+ */
+ spin_lock(&dentry_list_lock);
+ if (osb->dentry_lock_list &&
+ !ocfs2_test_osb_flag(osb, OCFS2_OSB_UMOUNT))
queue_work(ocfs2_wq, &osb->dentry_lock_work);
spin_unlock(&dentry_list_lock);
}
+/* Flush the whole work queue */
+void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb)
+{
+ __ocfs2_drop_dl_inodes(osb, -1);
+}
+
/*
* ocfs2_dentry_iput() and friends.
*
@@ -368,7 +386,8 @@ static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb,
/* We leave dropping of inode reference to ocfs2_wq as that can
* possibly lead to inode deletion which gets tricky */
spin_lock(&dentry_list_lock);
- if (!osb->dentry_lock_list)
+ if (!osb->dentry_lock_list &&
+ !ocfs2_test_osb_flag(osb, OCFS2_OSB_UMOUNT))
queue_work(ocfs2_wq, &osb->dentry_lock_work);
dl->dl_next = osb->dentry_lock_list;
osb->dentry_lock_list = dl;
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index faa12e7..f5dd178 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -49,10 +49,13 @@ struct ocfs2_dentry_lock {
int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode,
u64 parent_blkno);
+extern spinlock_t dentry_list_lock;
+
void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
struct ocfs2_dentry_lock *dl);
void ocfs2_drop_dl_inodes(struct work_struct *work);
+void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb);
struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno,
int skip_unhashed);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index c9345eb..cfa7102 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -227,6 +227,7 @@ enum ocfs2_mount_options
#define OCFS2_OSB_SOFT_RO 0x0001
#define OCFS2_OSB_HARD_RO 0x0002
#define OCFS2_OSB_ERROR_FS 0x0004
+#define OCFS2_OSB_UMOUNT 0x0008
#define OCFS2_DEFAULT_ATIME_QUANTUM 60
struct ocfs2_journal;
@@ -490,6 +491,18 @@ static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb,
spin_unlock(&osb->osb_lock);
}
+
+static inline unsigned long ocfs2_test_osb_flag(struct ocfs2_super *osb,
+ unsigned long flag)
+{
+ unsigned long ret;
+
+ spin_lock(&osb->osb_lock);
+ ret = osb->osb_flags & flag;
+ spin_unlock(&osb->osb_lock);
+ return ret;
+}
+
static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb,
int hard)
{
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 7efb349..6cf5ab8 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1213,14 +1213,27 @@ static int ocfs2_get_sb(struct file_system_type *fs_type,
mnt);
}
+static void ocfs2_kill_sb(struct super_block *sb)
+{
+ struct ocfs2_super *osb = OCFS2_SB(sb);
+
+ /* Prevent further queueing of inode drop events */
+ spin_lock(&dentry_list_lock);
+ ocfs2_set_osb_flag(osb, OCFS2_OSB_UMOUNT);
+ spin_unlock(&dentry_list_lock);
+ /* Wait for work to finish and/or remove it */
+ cancel_work_sync(&osb->dentry_lock_work);
+
+ kill_block_super(sb);
+}
+
static struct file_system_type ocfs2_fs_type = {
.owner = THIS_MODULE,
.name = "ocfs2",
.get_sb = ocfs2_get_sb, /* is this called when we mount
* the fs? */
- .kill_sb = kill_block_super, /* set to the generic one
- * right now, but do we
- * need to change that? */
+ .kill_sb = ocfs2_kill_sb,
+
.fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE,
.next = NULL
};
@@ -1819,6 +1832,12 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
debugfs_remove(osb->osb_ctxt);
+ /*
+ * Flush inode dropping work queue so that deletes are
+ * performed while the filesystem is still working
+ */
+ ocfs2_drop_all_dl_inodes(osb);
+
/* Orphan scan should be stopped as early as possible */
ocfs2_orphan_scan_stop(osb);
--
1.6.0.2
More information about the Ocfs2-devel
mailing list