[Ocfs2-devel] [PATCH 5/5] ocfs2: Implement delayed dropping of last dquot reference

Srinivas Eeda srinivas.eeda at oracle.com
Mon Jan 20 22:47:43 PST 2014


On 01/20/2014 07:31 AM, Goldwyn Rodrigues wrote:
> On 01/16/2014 04:58 PM, Jan Kara wrote:
>> On Thu 16-01-14 23:28:49, Jan Kara wrote:
>>> We cannot drop last dquot reference from downconvert thread as that
>>> creates the following deadlock:
>>>
>>> NODE 1                                  NODE2
>>> holds dentry lock for 'foo'
>>> holds inode lock for GLOBAL_BITMAP_SYSTEM_INODE
>>>                                          dquot_initialize(bar)
>>> ocfs2_dquot_acquire()
>>> ocfs2_inode_lock(USER_QUOTA_SYSTEM_INODE)
>>>                                              ...
>>> downconvert thread (triggered from another
>>> node or a different process from NODE2)
>>>    ocfs2_dentry_post_unlock()
>>>      ...
>>>      iput(foo)
>>>        ocfs2_evict_inode(foo)
>>>          ocfs2_clear_inode(foo)
>>>            dquot_drop(inode)
>>>              ...
>>>         ocfs2_dquot_release()
>>>                ocfs2_inode_lock(USER_QUOTA_SYSTEM_INODE)
>>>                 - blocks
>>>                                              finds we need more 
>>> space in
>>>                                              quota file
>>>                                              ...
>>> ocfs2_extend_no_holes()
>>> ocfs2_inode_lock(GLOBAL_BITMAP_SYSTEM_INODE)
>>>                                                  - deadlocks waiting 
>>> for
>>>                                                    downconvert thread
>>>
>>> We solve the problem by postponing dropping of the last dquot reference
>>> to a workqueue if it happens from the downconvert thread.
>>    Hum, now looking again into ocfs2_clear_inode() there are more 
>> problems
>> than I originally thought. Look for example at
>> ocfs2_mark_lockres_freeing(). That will block on rw/inode/open lock if
>> there is downconvert pending waiting for that downconvert to finish.
>> However that never happens when ocfs2_clear_inode() is called from the
>> downconvert thread.
>>
>> So we are back to square one - I don't see a way how to fix these 
>> deadlocks
>> without postponing dropping of inode reference to a workqueue :(.
>>
>
> Since the reason of the unlink performance is the delay in calling 
> ocfs2_open_unlock(), and the ocfs2_mark_lockres_freeing() comes after 
> ocfs2_open_unlock(): can we move the call to ocfs2_open_unlock() to 
> ocfs2_evict_inode() and then perform ocfs2_clear_inode() in a deferred 
> way?
once ocfs2_evict_inode is returned, vfs would destroy the inode, so I 
think we should do the cleanup before that and hence we cannot differ 
ocfs2_clear_inode from inside ocfs2_evict_inode

May be we should queue ocfs2_blocking_ast call itself to down convert 
thread. That way it doesn't prevent down convert thread from clearing 
the inode. Once when down convert thread comes to processes the queued 
ast/bast and finds lockres cleared it can just return.


>
>
>>                                 Honza
>>
>>
>>>
>>> Signed-off-by: Jan Kara <jack at suse.cz>
>>> ---
>>>   fs/ocfs2/ocfs2.h        |  5 +++++
>>>   fs/ocfs2/quota.h        |  2 ++
>>>   fs/ocfs2/quota_global.c | 35 +++++++++++++++++++++++++++++++++++
>>>   fs/ocfs2/super.c        |  8 ++++++++
>>>   4 files changed, 50 insertions(+)
>>>
>>> diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
>>> index ca81f6b49236..f6134345fe42 100644
>>> --- a/fs/ocfs2/ocfs2.h
>>> +++ b/fs/ocfs2/ocfs2.h
>>> @@ -30,6 +30,7 @@
>>>   #include <linux/sched.h>
>>>   #include <linux/wait.h>
>>>   #include <linux/list.h>
>>> +#include <linux/llist.h>
>>>   #include <linux/rbtree.h>
>>>   #include <linux/workqueue.h>
>>>   #include <linux/kref.h>
>>> @@ -410,6 +411,10 @@ struct ocfs2_super
>>>       struct list_head blocked_lock_list;
>>>       unsigned long blocked_lock_count;
>>>
>>> +    /* List of dquot structures to drop last reference to */
>>> +    struct llist_head dquot_drop_list;
>>> +    struct work_struct dquot_drop_work;
>>> +
>>>       wait_queue_head_t        osb_mount_event;
>>>
>>>       /* Truncate log info */
>>> diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
>>> index d5ab56cbe5c5..f266d67df3c6 100644
>>> --- a/fs/ocfs2/quota.h
>>> +++ b/fs/ocfs2/quota.h
>>> @@ -28,6 +28,7 @@ struct ocfs2_dquot {
>>>       unsigned int dq_use_count;    /* Number of nodes having 
>>> reference to this entry in global quota file */
>>>       s64 dq_origspace;    /* Last globally synced space usage */
>>>       s64 dq_originodes;    /* Last globally synced inode usage */
>>> +    struct llist_node list;    /* Member of list of dquots to drop */
>>>   };
>>>
>>>   /* Description of one chunk to recover in memory */
>>> @@ -110,6 +111,7 @@ int ocfs2_read_quota_phys_block(struct inode 
>>> *inode, u64 p_block,
>>>   int ocfs2_create_local_dquot(struct dquot *dquot);
>>>   int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot);
>>>   int ocfs2_local_write_dquot(struct dquot *dquot);
>>> +void ocfs2_drop_dquot_refs(struct work_struct *work);
>>>
>>>   extern const struct dquot_operations ocfs2_quota_operations;
>>>   extern struct quota_format_type ocfs2_quota_format;
>>> diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
>>> index aaa50611ec66..7921e209c64b 100644
>>> --- a/fs/ocfs2/quota_global.c
>>> +++ b/fs/ocfs2/quota_global.c
>>> @@ -10,6 +10,7 @@
>>>   #include <linux/jiffies.h>
>>>   #include <linux/writeback.h>
>>>   #include <linux/workqueue.h>
>>> +#include <linux/llist.h>
>>>
>>>   #include <cluster/masklog.h>
>>>
>>> @@ -679,6 +680,27 @@ static int ocfs2_calc_qdel_credits(struct 
>>> super_block *sb, int type)
>>>              OCFS2_INODE_UPDATE_CREDITS;
>>>   }
>>>
>>> +void ocfs2_drop_dquot_refs(struct work_struct *work)
>>> +{
>>> +    struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
>>> +                           dquot_drop_work);
>>> +    struct llist_node *list;
>>> +    struct ocfs2_dquot *odquot, *next_odquot;
>>> +
>>> +    list = llist_del_all(&osb->dquot_drop_list);
>>> +    llist_for_each_entry_safe(odquot, next_odquot, list, list) {
>>> +        /* Drop the reference we acquired in ocfs2_dquot_release() */
>>> +        dqput(&odquot->dq_dquot);
>>> +    }
>>> +}
>>> +
>>> +/*
>>> + * Called when the last reference to dquot is dropped. If we are 
>>> called from
>>> + * downconvert thread, we cannot do all the handling here because 
>>> grabbing
>>> + * quota lock could deadlock (the node holding the quota lock could 
>>> need some
>>> + * other cluster lock to proceed but with blocked downconvert 
>>> thread we cannot
>>> + * release any lock).
>>> + */
>>>   static int ocfs2_release_dquot(struct dquot *dquot)
>>>   {
>>>       handle_t *handle;
>>> @@ -694,6 +716,19 @@ static int ocfs2_release_dquot(struct dquot 
>>> *dquot)
>>>       /* Check whether we are not racing with some other dqget() */
>>>       if (atomic_read(&dquot->dq_count) > 1)
>>>           goto out;
>>> +    /* Running from downconvert thread? Postpone quota processing 
>>> to wq */
>>> +    if (current == osb->dc_task) {
>>> +        /*
>>> +         * Grab our own reference to dquot and queue it for delayed
>>> +         * dropping.  Quota code rechecks after calling
>>> +         * ->release_dquot() and won't free dquot structure.
>>> +         */
>>> +        dqgrab(dquot);
>>> +        /* First entry on list -> queue work */
>>> +        if (llist_add(&OCFS2_DQUOT(dquot)->list, 
>>> &osb->dquot_drop_list))
>>> +            queue_work(ocfs2_wq, &osb->dquot_drop_work);
>>> +        goto out;
>>> +    }
>>>       status = ocfs2_lock_global_qf(oinfo, 1);
>>>       if (status < 0)
>>>           goto out;
>>> diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
>>> index c7f71360666f..0c5ea9454967 100644
>>> --- a/fs/ocfs2/super.c
>>> +++ b/fs/ocfs2/super.c
>>> @@ -1920,6 +1920,11 @@ static void ocfs2_dismount_volume(struct 
>>> super_block *sb, int mnt_err)
>>>
>>>       ocfs2_disable_quotas(osb);
>>>
>>> +    /* All dquots should be freed by now */
>>> +    WARN_ON(!llist_empty(&osb->dquot_drop_list));
>>> +    /* Wait for worker to be done with the work structure in osb */
>>> +    cancel_work_sync(&osb->dquot_drop_work);
>>> +
>>>       ocfs2_shutdown_local_alloc(osb);
>>>
>>>       ocfs2_truncate_log_shutdown(osb);
>>> @@ -2247,6 +2252,9 @@ static int ocfs2_initialize_super(struct 
>>> super_block *sb,
>>>       INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery);
>>>       journal->j_state = OCFS2_JOURNAL_FREE;
>>>
>>> +    INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs);
>>> +    init_llist_head(&osb->dquot_drop_list);
>>> +
>>>       /* get some pseudo constants for clustersize bits */
>>>       osb->s_clustersize_bits =
>>>           le32_to_cpu(di->id2.i_super.s_clustersize_bits);
>>> -- 
>>> 1.8.1.4
>>>
>
>




More information about the Ocfs2-devel mailing list