[Ocfs2-devel] [PATCH] ocfs2/dlm: remove potential deadlock -V2

Sunil Mushran sunil.mushran at oracle.com
Mon Jul 19 16:14:12 PDT 2010


A better (and simpler) fix would be to not dlm_grab() in dlm_init_lockres().
Then we can remove the corresponding dlm_put() in dlm_lockres_release().
That grab is not required because we don't call dlm_unregister_domain()
based on refcount.

On 07/19/2010 03:11 AM, Wengang Wang wrote:
> Any comment?
>
> On 10-06-21 21:43, Wengang Wang wrote:
>    
>> When we need to take both dlm_domain_lock and dlm->spinlock, we should take
>> them in order of: dlm_domain_lock then dlm->spinlock.
>>
>> There is pathes disobey this order. That is calling dlm_lockres_put() with
>> dlm->spinlock held in dlm_run_purge_list. dlm_lockres_put() calls dlm_put() at
>> the ref and dlm_put() locks on dlm_domain_lock.
>>
>> Fix:
>> In dlm_run_purge_list, if it could the last ref on the lockres, unlock
>> dlm->spinlock before calling dlm_lockres_put() and lock it back after
>> dlm_lockres_put().
>>
>> Signed-off-by: Wengang Wang<wen.gang.wang at oracle.com>
>> ---
>>   fs/ocfs2/dlm/dlmthread.c |   27 +++++++++++++++++++--------
>>   1 files changed, 19 insertions(+), 8 deletions(-)
>>
>> diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
>> index d4f73ca..b1bd624 100644
>> --- a/fs/ocfs2/dlm/dlmthread.c
>> +++ b/fs/ocfs2/dlm/dlmthread.c
>> @@ -152,6 +152,7 @@ void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
>>   	spin_unlock(&dlm->spinlock);
>>   }
>>
>> +/* returns 1 if the lockres is removed from purge list, 0 otherwise */
>>   static int dlm_purge_lockres(struct dlm_ctxt *dlm,
>>   			     struct dlm_lock_resource *res)
>>   {
>> @@ -217,7 +218,9 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
>>   		     res, master);
>>   		list_del_init(&res->purge);
>>   		spin_unlock(&res->spinlock);
>> +		/* not the last ref, dlm_run_purge_list() holds another */
>>   		dlm_lockres_put(res);
>> +		ret = 1;
>>   		dlm->purge_count--;
>>   	} else
>>   		spin_unlock(&res->spinlock);
>> @@ -232,13 +235,13 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
>>   		spin_unlock(&res->spinlock);
>>   		wake_up(&res->wq);
>>   	}
>> -	return 0;
>> +	return ret;
>>   }
>>
>>   static void dlm_run_purge_list(struct dlm_ctxt *dlm,
>>   			       int purge_now)
>>   {
>> -	unsigned int run_max, unused;
>> +	unsigned int run_max, unused, removed;
>>   	unsigned long purge_jiffies;
>>   	struct dlm_lock_resource *lockres;
>>
>> @@ -280,13 +283,21 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
>>
>>   		/* This may drop and reacquire the dlm spinlock if it
>>   		 * has to do migration. */
>> -		if (dlm_purge_lockres(dlm, lockres))
>> -			BUG();
>> -
>> +		removed = dlm_purge_lockres(dlm, lockres);
>> +		/* If the lockres is removed from purge list, this could be
>> +		 * the last ref. Unlock dlm->spinlock to avoid deadlock
>> +		 * --dlm_lockres_put() does spin_lock on dlm_domain_lock on the
>> +		 * last ref while there, in other path, could be lock requests
>> +		 * in normal order: dlm_domain_lock then dlm->spinlock.
>> +		 */
>> +		if (removed)
>> +			spin_unlock(&dlm->spinlock);
>>   		dlm_lockres_put(lockres);
>> -
>> -		/* Avoid adding any scheduling latencies */
>> -		cond_resched_lock(&dlm->spinlock);
>> +		if (removed)
>> +			spin_lock(&dlm->spinlock);
>> +		else
>> +			/* Avoid adding any scheduling latencies */
>> +			cond_resched_lock(&dlm->spinlock);
>>   	}
>>
>>   	spin_unlock(&dlm->spinlock);
>> -- 
>> 1.6.6.1
>>
>>
>> _______________________________________________
>> Ocfs2-devel mailing list
>> Ocfs2-devel at oss.oracle.com
>> http://oss.oracle.com/mailman/listinfo/ocfs2-devel
>>      
> _______________________________________________
> Ocfs2-devel mailing list
> Ocfs2-devel at oss.oracle.com
> http://oss.oracle.com/mailman/listinfo/ocfs2-devel
>    




More information about the Ocfs2-devel mailing list