[Ocfs2-devel] [PATCH] Dynamic lockres hash table

Jan Kara jack at suse.cz
Wed Mar 5 10:26:44 PST 2008


On Tue 04-03-08 18:33:03, Sunil Mushran wrote:
> My main problem with a mount option is that it is not dynamic.
>
> I was thinking along lines of having a sysfs param that will
> allow users to dynamically resize the number of pages allotted
> to the hash. This will definitely require us running tests to see
> how long it takes to rehash with 500K lockres under the
> dlm_spinlock.
   I see. I didn't know you intended to do this dynamically. But yes, it's
better than what I have if rehashing will be fast enough.

> I guess as a first step, we should add an avg lookup time stat.
  Actually, it's non-trivial to measure (other than by profiling).
You cannot use standard time functions because their resolution is too low
- we are speaking about microseconds here...

> But all this will take time.
>
> How about we increase the defaults in 1.4 from 4 pages to 16 or
> even 32 pages. This will be for Enterprise Kernels only and we
> should be able to assume that they will have 128K per mount to
> spare.
  Definitely, they should have 16 or even 32 pages per mount.  With 500K
lockres, which is not so extreme on bigger FS, 16 pages mean hash chains of
average length 61 on x86_64. It is not ideal but I guess it should be
sufficient.

								Honza

> Jan Kara wrote:
>>   Hello,
>>
>>   because SLES10 SP2 is closer than I thought, I've written the patch to
>> dynamically size the hash table with locks in DLM. First, there's new 
>> mount
>> option hash_buckets which allows you to set number of hash buckets
>> explicitly. Then there is also code which tries to estimate reasonable
>> hash size when mounting the filesystem - what I put there is:
>>  1) we estimate the number of possible files as device_size / max(64KB,
>> 4*cluster_size) - this is used as the number of buckets (number of locks
>> we need to store in memory is roughly twice the number of cached files in
>> memory).
>>  2) we never take more than 1/2048 of total ram
>>
>>   If you think the estimates should be different, please speak up.
>>
>> 									Honza
>>
>>   ------------------------------------------------------------------------
>>
>> From: Jan Kara <jack at suse.cz>
>> Subject: Allow setting of size of lockres hash
>>
>> Hash table with cluster locks had a fixed size of 2048 entries on 64-bit 
>> archs.
>> This is too few when used for a larger filesystem. Add the possibility to 
>> set
>> the size of the hash table as a mount option and also introduce some 
>> better
>> estimation on the needed table size.
>>
>> Signed-off-by: Jan Kara <jack at suse.cz>
>>
>> Index: linux-2.6.16-SLES10_SP2_BRANCH/fs/ocfs2/dlm/dlmapi.h
>> ===================================================================
>> --- linux-2.6.16-SLES10_SP2_BRANCH.orig/fs/ocfs2/dlm/dlmapi.h
>> +++ linux-2.6.16-SLES10_SP2_BRANCH/fs/ocfs2/dlm/dlmapi.h
>> @@ -193,7 +193,8 @@ enum dlm_status dlmunlock(struct dlm_ctx
>>  			  dlm_astunlockfunc_t *unlockast,
>>  			  void *data);
>>  -struct dlm_ctxt * dlm_register_domain(const char *domain, u32 key);
>> +struct dlm_ctxt * dlm_register_domain(const char *domain, u32 key,
>> +	unsigned int buckets);
>>   void dlm_unregister_domain(struct dlm_ctxt *dlm);
>>  Index: linux-2.6.16-SLES10_SP2_BRANCH/fs/ocfs2/dlm/dlmcommon.h
>> ===================================================================
>> --- linux-2.6.16-SLES10_SP2_BRANCH.orig/fs/ocfs2/dlm/dlmcommon.h
>> +++ linux-2.6.16-SLES10_SP2_BRANCH/fs/ocfs2/dlm/dlmcommon.h
>> @@ -37,14 +37,8 @@
>>  #define DLM_THREAD_SHUFFLE_INTERVAL    5     // flush everything every 5 
>> passes
>>  #define DLM_THREAD_MS                  200   // flush at least every 200 
>> ms
>>  -#define DLM_HASH_SIZE_DEFAULT	(1 << 14)
>> -#if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE
>> -# define DLM_HASH_PAGES		1
>> -#else
>> -# define DLM_HASH_PAGES		(DLM_HASH_SIZE_DEFAULT / PAGE_SIZE)
>> -#endif
>> +#define DLM_DEFAULT_HASH_BUCKETS (1 << 14)
>>  #define DLM_BUCKETS_PER_PAGE	(PAGE_SIZE / sizeof(struct hlist_head))
>> -#define DLM_HASH_BUCKETS	(DLM_HASH_PAGES * DLM_BUCKETS_PER_PAGE)
>>   /* Intended to make it easier for us to switch out hash functions */
>>  #define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l)
>> @@ -96,6 +90,7 @@ enum dlm_ctxt_state {
>>  struct dlm_ctxt
>>  {
>>  	struct list_head list;
>> +	unsigned int lockres_hash_buckets;
>>  	struct hlist_head **lockres_hash;
>>  	struct list_head dirty_list;
>>  	struct list_head purge_list;
>> @@ -148,7 +143,7 @@ struct dlm_ctxt
>>   static inline struct hlist_head *dlm_lockres_hash(struct dlm_ctxt *dlm, 
>> unsigned i)
>>  {
>> -	return dlm->lockres_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] + 
>> (i % DLM_BUCKETS_PER_PAGE);
>> +	return dlm->lockres_hash[(i % dlm->lockres_hash_buckets) / 
>> DLM_BUCKETS_PER_PAGE] + (i % DLM_BUCKETS_PER_PAGE);
>>  }
>>   /* these keventd work queue items are for less-frequently
>> Index: linux-2.6.16-SLES10_SP2_BRANCH/fs/ocfs2/dlm/dlmdebug.c
>> ===================================================================
>> --- linux-2.6.16-SLES10_SP2_BRANCH.orig/fs/ocfs2/dlm/dlmdebug.c
>> +++ linux-2.6.16-SLES10_SP2_BRANCH/fs/ocfs2/dlm/dlmdebug.c
>> @@ -381,7 +381,7 @@ void dlm_dump_lock_resources(struct dlm_
>>  	}
>>   	spin_lock(&dlm->spinlock);
>> -	for (i=0; i<DLM_HASH_BUCKETS; i++) {
>> +	for (i=0; i<dlm->lockres_hash_buckets; i++) {
>>  		bucket = dlm_lockres_hash(dlm, i);
>>  		hlist_for_each_entry(res, iter, bucket, hash_node)
>>  			dlm_print_one_lock_resource(res);
>> Index: linux-2.6.16-SLES10_SP2_BRANCH/fs/ocfs2/dlm/dlmdomain.c
>> ===================================================================
>> --- linux-2.6.16-SLES10_SP2_BRANCH.orig/fs/ocfs2/dlm/dlmdomain.c
>> +++ linux-2.6.16-SLES10_SP2_BRANCH/fs/ocfs2/dlm/dlmdomain.c
>> @@ -98,9 +98,8 @@ static void **dlm_alloc_pagevec(int page
>>  		if (!(vec[i] = (void *)__get_free_page(GFP_KERNEL)))
>>  			goto out_free;
>>  -	mlog(0, "Allocated DLM hash pagevec; %d pages (%lu expected), %lu 
>> buckets per page\n",
>> -	     pages, (unsigned long)DLM_HASH_PAGES,
>> -	     (unsigned long)DLM_BUCKETS_PER_PAGE);
>> +	mlog(0, "Allocated DLM hash pagevec; %d pages, %lu buckets per page\n",
>> +	     pages, (unsigned long)DLM_BUCKETS_PER_PAGE);
>>  	return vec;
>>  out_free:
>>  	dlm_free_pagevec(vec, i);
>> @@ -289,7 +288,8 @@ static void dlm_free_ctxt_mem(struct dlm
>>  	dlm_proc_del_domain(dlm);
>>   	if (dlm->lockres_hash)
>> -		dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
>> +		dlm_free_pagevec((void **)dlm->lockres_hash,
>> +			dlm->lockres_hash_buckets / DLM_BUCKETS_PER_PAGE);
>>   	if (dlm->name)
>>  		kfree(dlm->name);
>> @@ -412,7 +412,7 @@ static int dlm_migrate_all_locks(struct   	num = 0;
>>  	spin_lock(&dlm->spinlock);
>> -	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
>> +	for (i = 0; i < dlm->lockres_hash_buckets; i++) {
>>  redo_bucket:
>>  		n = 0;
>>  		bucket = dlm_lockres_hash(dlm, i);
>> @@ -1360,8 +1360,8 @@ bail:
>>  	return status;
>>  }
>>  -static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
>> -				u32 key)
>> +static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, u32 key,
>> +				unsigned int buckets)
>>  {
>>  	int i;
>>  	struct dlm_ctxt *dlm = NULL;
>> @@ -1380,7 +1380,14 @@ static struct dlm_ctxt *dlm_alloc_ctxt(c
>>  		goto leave;
>>  	}
>>  -	dlm->lockres_hash = (struct hlist_head 
>> **)dlm_alloc_pagevec(DLM_HASH_PAGES);
>> +	if (!buckets)
>> +		buckets = DLM_DEFAULT_HASH_BUCKETS;
>> +	buckets = (buckets + DLM_BUCKETS_PER_PAGE - 1) / DLM_BUCKETS_PER_PAGE
>> +		  * DLM_BUCKETS_PER_PAGE;
>> +	dlm->lockres_hash_buckets = buckets;
>> +
>> +	dlm->lockres_hash = (struct hlist_head **)dlm_alloc_pagevec(buckets
>> +				/ DLM_BUCKETS_PER_PAGE);
>>  	if (!dlm->lockres_hash) {
>>  		mlog_errno(-ENOMEM);
>>  		kfree(dlm->name);
>> @@ -1389,7 +1396,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(c
>>  		goto leave;
>>  	}
>>  -	for (i = 0; i < DLM_HASH_BUCKETS; i++)
>> +	for (i = 0; i < dlm->lockres_hash_buckets; i++)
>>  		INIT_HLIST_HEAD(dlm_lockres_hash(dlm, i));
>>   	strcpy(dlm->name, domain);
>> @@ -1458,8 +1465,8 @@ leave:
>>  /*
>>   * dlm_register_domain: one-time setup per "domain"
>>   */
>> -struct dlm_ctxt * dlm_register_domain(const char *domain,
>> -			       u32 key)
>> +struct dlm_ctxt * dlm_register_domain(const char *domain, u32 key,
>> +			unsigned int buckets)
>>  {
>>  	int ret;
>>  	struct dlm_ctxt *dlm = NULL;
>> @@ -1515,7 +1522,7 @@ retry:
>>  	if (!new_ctxt) {
>>  		spin_unlock(&dlm_domain_lock);
>>  -		new_ctxt = dlm_alloc_ctxt(domain, key);
>> +		new_ctxt = dlm_alloc_ctxt(domain, key, buckets);
>>  		if (new_ctxt)
>>  			goto retry;
>>  Index: linux-2.6.16-SLES10_SP2_BRANCH/fs/ocfs2/dlm/dlmrecovery.c
>> ===================================================================
>> --- linux-2.6.16-SLES10_SP2_BRANCH.orig/fs/ocfs2/dlm/dlmrecovery.c
>> +++ linux-2.6.16-SLES10_SP2_BRANCH/fs/ocfs2/dlm/dlmrecovery.c
>> @@ -2020,7 +2020,7 @@ static void dlm_finish_local_lockres_rec
>>  	 * for now we need to run the whole hash, clear
>>  	 * the RECOVERING state and set the owner
>>  	 * if necessary */
>> -	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
>> +	for (i = 0; i < dlm->lockres_hash_buckets; i++) {
>>  		bucket = dlm_lockres_hash(dlm, i);
>>  		hlist_for_each_entry(res, hash_iter, bucket, hash_node) {
>>  			if (res->state & DLM_LOCK_RES_RECOVERING) {
>> @@ -2201,7 +2201,7 @@ static void dlm_do_local_recovery_cleanu
>>  	 *    can be kicked again to see if any ASTs or BASTs
>>  	 *    need to be fired as a result.
>>  	 */
>> -	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
>> +	for (i = 0; i < dlm->lockres_hash_buckets; i++) {
>>  		bucket = dlm_lockres_hash(dlm, i);
>>  		hlist_for_each_entry(res, iter, bucket, hash_node) {
>>   			/* always prune any $RECOVERY entries for dead nodes,
>> Index: linux-2.6.16-SLES10_SP2_BRANCH/fs/ocfs2/dlm/userdlm.c
>> ===================================================================
>> --- linux-2.6.16-SLES10_SP2_BRANCH.orig/fs/ocfs2/dlm/userdlm.c
>> +++ linux-2.6.16-SLES10_SP2_BRANCH/fs/ocfs2/dlm/userdlm.c
>> @@ -661,7 +661,7 @@ struct dlm_ctxt *user_dlm_register_conte
>>   	snprintf(domain, name->len + 1, "%.*s", name->len, name->name);
>>  -	dlm = dlm_register_domain(domain, dlm_key);
>> +	dlm = dlm_register_domain(domain, dlm_key, 0);
>>  	if (IS_ERR(dlm))
>>  		mlog_errno(PTR_ERR(dlm));
>>  Index: linux-2.6.16-SLES10_SP2_BRANCH/fs/ocfs2/dlmglue.c
>> ===================================================================
>> --- linux-2.6.16-SLES10_SP2_BRANCH.orig/fs/ocfs2/dlmglue.c
>> +++ linux-2.6.16-SLES10_SP2_BRANCH/fs/ocfs2/dlmglue.c
>> @@ -2514,7 +2514,8 @@ int ocfs2_dlm_init(struct ocfs2_super *o
>>  	dlm_key = crc32_le(0, osb->uuid_str, strlen(osb->uuid_str));
>>   	/* for now, uuid == domain */
>> -	dlm = dlm_register_domain(osb->uuid_str, dlm_key);
>> +	dlm = dlm_register_domain(osb->uuid_str, dlm_key,
>> +			osb->dlm_hash_buckets);
>>  	if (IS_ERR(dlm)) {
>>  		status = PTR_ERR(dlm);
>>  		mlog_errno(status);
>> Index: linux-2.6.16-SLES10_SP2_BRANCH/fs/ocfs2/ocfs2.h
>> ===================================================================
>> --- linux-2.6.16-SLES10_SP2_BRANCH.orig/fs/ocfs2/ocfs2.h
>> +++ linux-2.6.16-SLES10_SP2_BRANCH/fs/ocfs2/ocfs2.h
>> @@ -218,6 +218,7 @@ struct ocfs2_super
>>   	unsigned long s_mount_opt;
>>  	unsigned int s_atime_quantum;
>> +	unsigned int dlm_hash_buckets;
>>   	u16 max_slots;
>>  	s16 node_num;
>> Index: linux-2.6.16-SLES10_SP2_BRANCH/fs/ocfs2/super.c
>> ===================================================================
>> --- linux-2.6.16-SLES10_SP2_BRANCH.orig/fs/ocfs2/super.c
>> +++ linux-2.6.16-SLES10_SP2_BRANCH/fs/ocfs2/super.c
>> @@ -40,6 +40,7 @@
>>  #include <linux/crc32.h>
>>  #include <linux/debugfs.h>
>>  #include <linux/mount.h>
>> +#include <linux/mm.h>
>>   #include <cluster/nodemanager.h>
>>  @@ -88,6 +89,7 @@ struct mount_options
>>  	unsigned int	atime_quantum;
>>  	signed short	slot;
>>  	unsigned int	localalloc_opt;
>> +	unsigned int	dlm_hash_buckets;
>>  };
>>   static int ocfs2_parse_options(struct super_block *sb, char *options,
>> @@ -169,6 +171,7 @@ enum {
>>  	Opt_commit,
>>  	Opt_localalloc,
>>  	Opt_localflocks,
>> +	Opt_dlm_hash_buckets,
>>  #ifdef OCFS2_ORACORE_WORKAROUNDS
>>  	Opt_datavolume,
>>  #endif
>> @@ -190,6 +193,7 @@ static match_table_t tokens = {
>>  	{Opt_commit, "commit=%u"},
>>  	{Opt_localalloc, "localalloc=%d"},
>>  	{Opt_localflocks, "localflocks"},
>> +	{Opt_dlm_hash_buckets, "hash_buckets=%u"},
>>  #ifdef OCFS2_ORACORE_WORKAROUNDS
>>  	{Opt_datavolume, "datavolume"},
>>  #endif
>> @@ -633,6 +637,22 @@ static int ocfs2_fill_super(struct super
>>  	osb->preferred_slot = parsed_options.slot;
>>  	osb->osb_commit_interval = parsed_options.commit_interval;
>>  	osb->local_alloc_size = parsed_options.localalloc_opt;
>> +	if (parsed_options.dlm_hash_buckets)
>> +		osb->dlm_hash_buckets = parsed_options.dlm_hash_buckets;
>> +	else {
>> +		/* Let's count 4 clusters per file, 64 KB at least */
>> +		unsigned int exp_file_size_shift =
>> +				max(16, osb->s_clustersize_bits + 2);
>> +		struct sysinfo i;
>> +
>> +		si_meminfo(&i);
>> +		/* Estimate number of files on FS and limit space used by
>> +		 * hash table by 1/2048 of kernel memory */
>> +		osb->dlm_hash_buckets = min_t(unsigned long long,
>> +			sb->s_bdev->bd_inode->i_size >> exp_file_size_shift,
>> +			(i.totalram >> 11) * (PAGE_SIZE /
>> +					sizeof(struct hlist_head)));
>> +	}
>>   #ifdef OCFS2_ORACORE_WORKAROUNDS
>>  	if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS)
>> @@ -807,6 +827,7 @@ static int ocfs2_parse_options(struct su
>>  	mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
>>  	mopt->slot = OCFS2_INVALID_SLOT;
>>  	mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
>> +	mopt->dlm_hash_buckets = 0;
>>   	if (!options) {
>>  		status = 1;
>> @@ -919,6 +940,19 @@ static int ocfs2_parse_options(struct su
>>  			if (!is_remount)
>>  				mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS;
>>  			break;
>> +		case Opt_dlm_hash_buckets:
>> +			if (is_remount) {
>> +				mlog(ML_ERROR, "Changing number of hash buckets"
>> +					" during remount is not supported.\n");
>> +				status = 0;
>> +				goto bail;
>> +			}
>> +			if (match_int(&args[0], &option) || option <= 0) {
>> +				status = 0;
>> +				goto bail;
>> +			}
>> +			mopt->dlm_hash_buckets = option;
>> +			break;
>>  		default:
>>  			mlog(ML_ERROR,
>>  			     "Unrecognized mount option \"%s\" "
>>   
>
-- 
Jan Kara <jack at suse.cz>
SUSE Labs, CR



More information about the Ocfs2-devel mailing list