From srinivas.eeda at oracle.com  Mon May  7 16:21:28 2012
From: srinivas.eeda at oracle.com (Srinivas Eeda)
Date: Mon,  7 May 2012 16:21:28 -0700
Subject: [Ocfs2-devel] [PATCH 1/3] ocfs2: new structure to implement
	discontiguous local alloc bitmap
In-Reply-To: <1336432890-18638-1-git-send-email-srinivas.eeda@oracle.com>
References: <1336432890-18638-1-git-send-email-srinivas.eeda@oracle.com>
Message-ID: <1336432890-18638-2-git-send-email-srinivas.eeda@oracle.com>

The current local alloc handles a single contiguous free chunk of clusters.
This patch enhances local alloc to handle discontiguous free chunks. It adds a
new ocfs2_local_alloc_rec structure, which tracks a single contiguous free
chunk. An array of these records sits in the bitmap area itself and tracks the
discontiguous chunks. In the best case there is only one record; the number of
records increases as the filesystem gets fragmented. The number of records at
any time is limited by the size of the bitmap, and the maximum is defined by
OCFS2_MAX_LOCAL_ALLOC_REC_LIMIT.

Signed-off-by: Srinivas Eeda <srinivas.eeda at oracle.com>
---
 fs/ocfs2/localalloc.c |   10 ++++++++++
 fs/ocfs2/ocfs2.h      |    8 ++++++++
 fs/ocfs2/ocfs2_fs.h   |   48 ++++++++++++++++++++++++++++++++++++++++++------
 3 files changed, 60 insertions(+), 6 deletions(-)

diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 210c352..4190e53 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -48,6 +48,16 @@
 
 #define OCFS2_LOCAL_ALLOC(dinode)	(&((dinode)->id2.i_lab))
 
+#define OCFS2_LOCAL_ALLOC_REC_SZ(la)	(le16_to_cpu(la->la_rec_count) *\
+					 sizeof(struct ocfs2_local_alloc_rec))
+#define OCFS2_LOCAL_ALLOC_BITMAP(la)    ((char *)(&(la->la_recs)) +\
+					 OCFS2_LOCAL_ALLOC_REC_SZ(la))
+#define OCFS2_LOCAL_ALLOC_BITS_PER_REC (sizeof(struct ocfs2_local_alloc_rec)*8)
+
+/* Maximum number of local alloc records */
+#define OCFS2_MAX_LOCAL_ALLOC_REC_LIMIT	128
+
+
 static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);
 
 static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index d355e6e..d4c36d2 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -367,6 +367,7 @@ struct ocfs2_super
 							 * by osb_lock */
 
 	struct buffer_head *local_alloc_bh;
+	struct inode	   *local_alloc_inode;
 
 	u64 la_last_gd;
 
@@ -522,6 +523,13 @@ static inline int ocfs2_supports_discontig_bg(struct ocfs2_super *osb)
 	return 0;
 }
 
+static inline int ocfs2_supports_discontig_la(struct ocfs2_super *osb)
+{
+	if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_LA)
+		return 1;
+	return 0;
+}
+
 static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb)
 {
 	if (ocfs2_supports_indexed_dirs(osb))
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 938387a..6a0fe02 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -102,7 +102,8 @@
 					 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
 					 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \
 					 | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG	\
-					 | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO)
+					 | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO \
+					 | OCFS2_FEATURE_INCOMPAT_DISCONTIG_LA)
 #define OCFS2_FEATURE_RO_COMPAT_SUPP	(OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
 					 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
 					 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
@@ -177,6 +178,9 @@
  */
 #define OCFS2_FEATURE_INCOMPAT_CLUSTERINFO	0x4000
 
+/* Discontiguous local alloc */
+#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_LA	0x8000
+
 /*
  * backup superblock flag is used to indicate that this volume
  * has backup superblocks.
@@ -664,14 +668,19 @@ struct ocfs2_super_block {
  * Local allocation bitmap for OCFS2 slots
  * Note that it exists inside an ocfs2_dinode, so all offsets are
  * relative to the start of ocfs2_dinode.id2.
+ * Each ocfs2_local_alloc_rec tracks one contiguous chunk of clusters.
  */
+struct ocfs2_local_alloc_rec {
+	__le32 la_start;	/* 1st cluster in this extent */
+	__le32 la_clusters;	/* Number of contiguous clusters */
+};
+
 struct ocfs2_local_alloc
 {
 /*00*/	__le32 la_bm_off;	/* Starting bit offset in main bitmap */
 	__le16 la_size;		/* Size of included bitmap, in bytes */
-	__le16 la_reserved1;
-	__le64 la_reserved2;
-/*10*/	__u8   la_bitmap[0];
+	__le16 la_rec_count;	/* Number of discontiguous records */
+	struct ocfs2_local_alloc_rec la_recs[0]; /* Localalloc records */
 };
 
 /*
@@ -1380,11 +1389,24 @@ static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
 	u16 size;
 
 	size = sb->s_blocksize -
-		offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
+		offsetof(struct ocfs2_dinode, id2.i_lab.la_recs);
+	size -= sizeof(struct ocfs2_local_alloc_rec);
 
 	return size;
 }
 
+/* Sum la_clusters over all records; effectively also the bitmap size */
+static inline u32 ocfs2_local_alloc_cluster_count(struct ocfs2_local_alloc *la)
+{
+	u32 i, clusters;
+
+	clusters = 0;
+	for (i = 0; i < le16_to_cpu(la->la_rec_count); i++)
+		clusters +=  le32_to_cpu(la->la_recs[i].la_clusters);
+
+	return clusters;
+}
+
 static inline int ocfs2_group_bitmap_size(struct super_block *sb,
 					  int suballocator,
 					  u32 feature_incompat)
@@ -1528,11 +1550,25 @@ static inline int ocfs2_local_alloc_size(int blocksize)
 	int size;
 
 	size = blocksize -
-		offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
+		offsetof(struct ocfs2_dinode, id2.i_lab.la_recs);
+	size -= sizeof(struct ocfs2_local_alloc_rec);
 
 	return size;
 }
 
+/* Sum la_clusters over all records; effectively also the bitmap size */
+static inline uint32_t
+ocfs2_local_alloc_cluster_count(struct ocfs2_local_alloc *la)
+{
+	uint32_t i, clusters;
+
+	clusters = 0;
+	for (i = 0; i < le16_to_cpu(la->la_rec_count); i++)
+		clusters +=  le32_to_cpu(la->la_recs[i].la_clusters);
+
+	return clusters;
+}
+
 static inline int ocfs2_group_bitmap_size(int blocksize,
 					  int suballocator,
 					  uint32_t feature_incompat)
-- 
1.5.4.3


From srinivas.eeda at oracle.com  Mon May  7 16:21:29 2012
From: srinivas.eeda at oracle.com (Srinivas Eeda)
Date: Mon,  7 May 2012 16:21:29 -0700
Subject: [Ocfs2-devel] [PATCH 2/3] ocfs2: implement discontiguous localalloc
	bitmap
In-Reply-To: <1336432890-18638-1-git-send-email-srinivas.eeda@oracle.com>
References: <1336432890-18638-1-git-send-email-srinivas.eeda@oracle.com>
Message-ID: <1336432890-18638-3-git-send-email-srinivas.eeda@oracle.com>

This patch adds supporting functions and modifies localalloc code to implement
discontiguous localalloc bitmap.

Signed-off-by: Srinivas Eeda <srinivas.eeda at oracle.com>
---
 fs/ocfs2/localalloc.c |  523 ++++++++++++++++++++++++++++++++-----------------
 1 files changed, 342 insertions(+), 181 deletions(-)

diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 4190e53..f63381e 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -48,6 +48,9 @@
 
 #define OCFS2_LOCAL_ALLOC(dinode)	(&((dinode)->id2.i_lab))
 
+/* defines minimum contiguous required */
+#define OCFS2_LOCAL_ALLOC_MIN_BITS	2
+
 #define OCFS2_LOCAL_ALLOC_REC_SZ(la)	(le16_to_cpu(la->la_rec_count) *\
 					 sizeof(struct ocfs2_local_alloc_rec))
 #define OCFS2_LOCAL_ALLOC_BITMAP(la)    ((char *)(&(la->la_recs)) +\
@@ -58,7 +61,8 @@
 #define OCFS2_MAX_LOCAL_ALLOC_REC_LIMIT	128
 
 
-static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);
+static u32 ocfs2_local_alloc_count_bits(struct ocfs2_super *osb,
+					struct ocfs2_dinode *alloc);
 
 static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
 					     struct ocfs2_dinode *alloc,
@@ -82,8 +86,7 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
 					handle_t *handle,
 					struct ocfs2_alloc_context *ac);
 
-static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
-					  struct inode *local_alloc_inode);
+static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb);
 
 /*
  * ocfs2_la_default_mb() - determine a default size, in megabytes of
@@ -202,6 +205,74 @@ unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb)
 	return la_mb;
 }
 
+/* Translate bit @bit of the local alloc bitmap into its cluster offset. */
+static u32 ocfs2_local_bitmap_to_cluster(struct ocfs2_local_alloc *la, u32 bit)
+{
+	u32 start = 0, prev = 0;
+	int rec;
+
+	for (rec = 0; rec < le16_to_cpu(la->la_rec_count); rec++) {
+		prev = start;
+		start += le32_to_cpu(la->la_recs[rec].la_clusters);
+		if (bit < start)
+			break;
+	}
+	/* no record covers @bit: la_recs[rec] would be read out of bounds */
+	BUG_ON(rec == le16_to_cpu(la->la_rec_count));
+	return le32_to_cpu(la->la_recs[rec].la_start) + (bit - prev);
+}
+
+/*
+ * This function is called before allocating a new chunk for the localalloc
+ * bitmap to make sure there is enough space in the bitmap for the new record.
+ * Returns how many more bits may be requested, or 0 if the request is
+ * already satisfied or another record does not fit. May lower
+ * ac->ac_bits_wanted so that it never exceeds what the bitmap can track.
+ */
+static u32 ocfs2_local_alloc_adjust_bits_wanted(struct ocfs2_local_alloc *la,
+						struct ocfs2_alloc_context *ac)
+{
+	u32 required, available, used, cluster_cnt;
+
+	if (ac->ac_bits_given == ac->ac_bits_wanted)
+		return 0;
+
+	/* total bits available in bitmap */
+	available   = le16_to_cpu(la->la_size) << 3;
+	cluster_cnt = ocfs2_local_alloc_cluster_count(la);
+
+	/*
+	 * Wanted shouldn't be greater than bitmap size and given should be
+	 * equal to cluster count
+	 */
+	BUG_ON(ac->ac_bits_given > ac->ac_bits_wanted);
+	BUG_ON(ac->ac_bits_wanted > available);
+	BUG_ON(ac->ac_bits_given != cluster_cnt);
+
+	/*
+	 * Bits already spoken for: the existing record structures, one bit per
+	 * allocated cluster, and the record structure a new chunk would need.
+	 * Compare first: unsigned subtraction would wrap, not go negative.
+	 */
+	used = (le16_to_cpu(la->la_rec_count) + 1) *
+		OCFS2_LOCAL_ALLOC_BITS_PER_REC + cluster_cnt;
+	if (available <= used)
+		return 0;
+	available -= used;
+
+	required = ac->ac_bits_wanted - ac->ac_bits_given;
+
+	/*
+	 * we can't allocate clusters more than the bits available. Adjust
+	 * bits wanted
+	 */
+	if (required > available) {
+		ac->ac_bits_wanted = ac->ac_bits_given + available;
+		return available;
+	} else
+		return required;
+}
+
 void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb)
 {
 	struct super_block *sb = osb->sb;
@@ -239,12 +310,14 @@ void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb,
 				      unsigned int num_clusters)
 {
 	spin_lock(&osb->osb_lock);
-	if (osb->local_alloc_state == OCFS2_LA_DISABLED ||
-	    osb->local_alloc_state == OCFS2_LA_THROTTLED)
-		if (num_clusters >= osb->local_alloc_default_bits) {
-			cancel_delayed_work(&osb->la_enable_wq);
+	if (osb->local_alloc_state == OCFS2_LA_DISABLED) {
+		cancel_delayed_work(&osb->la_enable_wq);
+		if (num_clusters >= osb->local_alloc_bits)
+			osb->local_alloc_state = OCFS2_LA_THROTTLED;
+
+		if (num_clusters >= osb->local_alloc_default_bits)
 			osb->local_alloc_state = OCFS2_LA_ENABLED;
-		}
+	}
 	spin_unlock(&osb->osb_lock);
 }
 
@@ -280,7 +353,7 @@ int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
 	 * a new block group. We want to be sure block group
 	 * allocations go through the local alloc, so allow an
 	 * allocation to take up to half the bitmap. */
-	if (bits > (la_bits / 2))
+	if ((la_bits > OCFS2_LOCAL_ALLOC_MIN_BITS) && (bits > (la_bits / 2)))
 		goto bail;
 
 	ret = 1;
@@ -348,21 +421,21 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
 	}
 
 	/* do a little verification. */
-	num_used = ocfs2_local_alloc_count_bits(alloc);
+	num_used = ocfs2_local_alloc_count_bits(osb, alloc);
 
 	/* hopefully the local alloc has always been recovered before
 	 * we load it. */
 	if (num_used
 	    || alloc->id1.bitmap1.i_used
 	    || alloc->id1.bitmap1.i_total
-	    || la->la_bm_off)
+	    || la->la_rec_count)
 		mlog(ML_ERROR, "Local alloc hasn't been recovered!\n"
-		     "found = %u, set = %u, taken = %u, off = %u\n",
+		     "found = %u, set = %u, taken = %u\n",
 		     num_used, le32_to_cpu(alloc->id1.bitmap1.i_used),
-		     le32_to_cpu(alloc->id1.bitmap1.i_total),
-		     OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
+		     le32_to_cpu(alloc->id1.bitmap1.i_total));
 
-	osb->local_alloc_bh = alloc_bh;
+	osb->local_alloc_bh    = alloc_bh;
+	osb->local_alloc_inode = inode;
 	osb->local_alloc_state = OCFS2_LA_ENABLED;
 
 bail:
@@ -389,7 +462,6 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
 {
 	int status;
 	handle_t *handle;
-	struct inode *local_alloc_inode = NULL;
 	struct buffer_head *bh = NULL;
 	struct buffer_head *main_bm_bh = NULL;
 	struct inode *main_bm_inode = NULL;
@@ -402,16 +474,6 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
 	if (osb->local_alloc_state == OCFS2_LA_UNUSED)
 		goto out;
 
-	local_alloc_inode =
-		ocfs2_get_system_file_inode(osb,
-					    LOCAL_ALLOC_SYSTEM_INODE,
-					    osb->slot_num);
-	if (!local_alloc_inode) {
-		status = -ENOENT;
-		mlog_errno(status);
-		goto out;
-	}
-
 	osb->local_alloc_state = OCFS2_LA_DISABLED;
 
 	ocfs2_resmap_uninit(&osb->osb_la_resmap);
@@ -451,13 +513,19 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
 	}
 	memcpy(alloc_copy, alloc, bh->b_size);
 
-	status = ocfs2_journal_access_di(handle, INODE_CACHE(local_alloc_inode),
+	status = ocfs2_journal_access_di(handle,
+					 INODE_CACHE(osb->local_alloc_inode),
 					 bh, OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto out_commit;
 	}
 
+	status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
+					  main_bm_inode, main_bm_bh);
+	if (status < 0)
+		mlog_errno(status);
+
 	ocfs2_clear_local_alloc(alloc);
 	ocfs2_journal_dirty(handle, bh);
 
@@ -465,11 +533,6 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
 	osb->local_alloc_bh = NULL;
 	osb->local_alloc_state = OCFS2_LA_UNUSED;
 
-	status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
-					  main_bm_inode, main_bm_bh);
-	if (status < 0)
-		mlog_errno(status);
-
 out_commit:
 	ocfs2_commit_trans(osb, handle);
 
@@ -483,9 +546,6 @@ out_mutex:
 	iput(main_bm_inode);
 
 out:
-	if (local_alloc_inode)
-		iput(local_alloc_inode);
-
 	if (alloc_copy)
 		kfree(alloc_copy);
 }
@@ -641,22 +701,11 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
 {
 	int status;
 	struct ocfs2_dinode *alloc;
-	struct inode *local_alloc_inode;
 	unsigned int free_bits;
 
 	BUG_ON(!ac);
 
-	local_alloc_inode =
-		ocfs2_get_system_file_inode(osb,
-					    LOCAL_ALLOC_SYSTEM_INODE,
-					    osb->slot_num);
-	if (!local_alloc_inode) {
-		status = -ENOENT;
-		mlog_errno(status);
-		goto bail;
-	}
-
-	mutex_lock(&local_alloc_inode->i_mutex);
+	mutex_lock(&osb->local_alloc_inode->i_mutex);
 
 	/*
 	 * We must double check state and allocator bits because
@@ -675,12 +724,12 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
 
 #ifdef CONFIG_OCFS2_DEBUG_FS
 	if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
-	    ocfs2_local_alloc_count_bits(alloc)) {
+	    ocfs2_local_alloc_count_bits(osb, alloc)) {
 		ocfs2_error(osb->sb, "local alloc inode %llu says it has "
 			    "%u free bits, but a count shows %u",
 			    (unsigned long long)le64_to_cpu(alloc->i_blkno),
 			    le32_to_cpu(alloc->id1.bitmap1.i_used),
-			    ocfs2_local_alloc_count_bits(alloc));
+			    ocfs2_local_alloc_count_bits(osb, alloc));
 		status = -EIO;
 		goto bail;
 	}
@@ -690,8 +739,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
 		le32_to_cpu(alloc->id1.bitmap1.i_used);
 	if (bits_wanted > free_bits) {
 		/* uhoh, window change time. */
-		status =
-			ocfs2_local_alloc_slide_window(osb, local_alloc_inode);
+		status = ocfs2_local_alloc_slide_window(osb);
 		if (status < 0) {
 			if (status != -ENOSPC)
 				mlog_errno(status);
@@ -714,7 +762,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
 			goto bail;
 	}
 
-	ac->ac_inode = local_alloc_inode;
+	ac->ac_inode = osb->local_alloc_inode;
 	/* We should never use localalloc from another slot */
 	ac->ac_alloc_slot = osb->slot_num;
 	ac->ac_which = OCFS2_AC_USE_LOCAL;
@@ -722,9 +770,8 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
 	ac->ac_bh = osb->local_alloc_bh;
 	status = 0;
 bail:
-	if (status < 0 && local_alloc_inode) {
-		mutex_unlock(&local_alloc_inode->i_mutex);
-		iput(local_alloc_inode);
+	if (status < 0 && osb->local_alloc_inode) {
+		mutex_unlock(&osb->local_alloc_inode->i_mutex);
 	}
 
 	trace_ocfs2_reserve_local_alloc_bits(
@@ -745,7 +792,7 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
 {
 	int status, start;
 	struct inode *local_alloc_inode;
-	void *bitmap;
+	u8 *bitmap;
 	struct ocfs2_dinode *alloc;
 	struct ocfs2_local_alloc *la;
 
@@ -764,8 +811,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	bitmap = la->la_bitmap;
-	*bit_off = le32_to_cpu(la->la_bm_off) + start;
+	bitmap = OCFS2_LOCAL_ALLOC_BITMAP(la);
+	*bit_off = ocfs2_local_bitmap_to_cluster(la, start);
 	*num_bits = bits_wanted;
 
 	status = ocfs2_journal_access_di(handle,
@@ -792,16 +839,29 @@ bail:
 	return status;
 }
 
-static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
+static u32 ocfs2_local_alloc_count_bits(struct ocfs2_super *osb,
+					struct ocfs2_dinode *alloc)
 {
 	int i;
-	u8 *buffer;
+	u8 *bitmap;
 	u32 count = 0;
 	struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
 
-	buffer = la->la_bitmap;
-	for (i = 0; i < le16_to_cpu(la->la_size); i++)
-		count += hweight8(buffer[i]);
+	/*
+	 * if discontig is not enabled then lets update the first localalloc
+	 * record with the current bitmap block info. We are doing this because
+	 * old disk formats are not aware of the records.
+	 */
+	if (!ocfs2_supports_discontig_la(osb) && la->la_bm_off) {
+		la->la_rec_count = cpu_to_le16(1);
+		la->la_recs[0].la_start = la->la_bm_off;
+		la->la_recs[0].la_clusters = alloc->id1.bitmap1.i_total;
+	}
+
+	bitmap = OCFS2_LOCAL_ALLOC_BITMAP(la);
+	for (i = 0; i < le32_to_cpu(alloc->id1.bitmap1.i_total); i++)
+		if (ocfs2_test_bit(i, bitmap))
+			count++;
 
 	trace_ocfs2_local_alloc_count_bits(count);
 	return count;
@@ -812,10 +872,11 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
 				     u32 *numbits,
 				     struct ocfs2_alloc_reservation *resv)
 {
-	int numfound, bitoff, left, startoff, lastzero;
-	int local_resv = 0;
+	int numfound, bitoff, left, startoff;
+	int i, local_resv = 0;
 	struct ocfs2_alloc_reservation r;
-	void *bitmap = NULL;
+	struct ocfs2_local_alloc *la;
+	u8 *bitmap = NULL;
 	struct ocfs2_reservation_map *resmap = &osb->osb_la_resmap;
 
 	if (!alloc->id1.bitmap1.i_total) {
@@ -847,37 +908,44 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
 	 * Reservations are disabled. Handle this the old way.
 	 */
 
-	bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap;
+	la     = OCFS2_LOCAL_ALLOC(alloc);
+	bitmap = OCFS2_LOCAL_ALLOC_BITMAP(la);
 
-	numfound = bitoff = startoff = 0;
-	lastzero = -1;
-	left = le32_to_cpu(alloc->id1.bitmap1.i_total);
-	while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff)) != -1) {
-		if (bitoff == left) {
-			/* mlog(0, "bitoff (%d) == left", bitoff); */
-			break;
-		}
-		/* mlog(0, "Found a zero: bitoff = %d, startoff = %d, "
-		   "numfound = %d\n", bitoff, startoff, numfound);*/
-
-		/* Ok, we found a zero bit... is it contig. or do we
-		 * start over?*/
-		if (bitoff == startoff) {
-			/* we found a zero */
-			numfound++;
-			startoff++;
-		} else {
-			/* got a zero after some ones */
-			numfound = 1;
-			startoff = bitoff+1;
-		}
-		/* we got everything we needed */
-		if (numfound == *numbits) {
-			/* mlog(0, "Found it all!\n"); */
-			break;
+	left = numfound = bitoff = startoff = 0;
+	for (i = 0; i < le16_to_cpu(la->la_rec_count); i++) {
+
+		numfound  = 0;
+		startoff += left;
+		left      = le32_to_cpu(la->la_recs[i].la_clusters);
+
+		while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left,
+							  startoff)) != -1) {
+			if (bitoff == left) {
+				/* mlog(0, "bitoff (%d) == left", bitoff); */
+				break;
+			}
+			/* mlog(0, "Found a zero: bitoff = %d, startoff = %d, "
+			 * "numfound = %d\n", bitoff, startoff, numfound);*/
+
+			/* Ok, we found a zero bit... is it contig. or do we
+			 * start over?*/
+			if (bitoff == startoff) {
+				/* we found a zero */
+				numfound++;
+				startoff++;
+			} else {
+				/* got a zero after some ones */
+				numfound = 1;
+				startoff = bitoff+1;
+			}
+			/* we got everything we needed */
+			if (numfound == *numbits) {
+				/* mlog(0, "Found it all!\n"); */
+				goto out;
+			}
 		}
 	}
-
+out:
 	trace_ocfs2_local_alloc_find_clear_bits_search_bitmap(bitoff, numfound);
 
 	if (numfound == *numbits)
@@ -900,12 +968,18 @@ static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc)
 {
 	struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
 	int i;
+	u8 *bitmap;
 
 	alloc->id1.bitmap1.i_total = 0;
 	alloc->id1.bitmap1.i_used = 0;
+	la->la_rec_count = 0;
 	la->la_bm_off = 0;
+
+	/* We reset the rec count so following will clear records as well */
+	bitmap = OCFS2_LOCAL_ALLOC_BITMAP(la);
+	bitmap += sizeof(struct ocfs2_local_alloc);
 	for(i = 0; i < le16_to_cpu(la->la_size); i++)
-		la->la_bitmap[i] = 0;
+		bitmap[i] = 0;
 }
 
 #if 0
@@ -933,17 +1007,64 @@ static void ocfs2_verify_zero_bits(unsigned long *bitmap,
  * assumes you've already locked the main bitmap -- the bitmap inode
  * passed is used for caching.
  */
+static int ocfs2_sync_local_rec_to_main(struct ocfs2_super *osb,
+					handle_t *handle,
+					struct ocfs2_dinode *alloc,
+					struct inode *main_bm_inode,
+					struct buffer_head *main_bm_bh,
+					u8 *bitmap, u64 la_start_blk,
+					int start, int left)
+{
+	int bit_off = 0, status = 0, prev, count;
+	u64 blkno;
+
+	prev = start;
+	count = 0;
+	while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left,
+						   start)) != -1) {
+		if ((bit_off < left) && (bit_off == start)) {
+			count++;
+			start++;
+			continue;
+		}
+		if (count) {
+			blkno = la_start_blk +
+				ocfs2_clusters_to_blocks(osb->sb,
+						 (start - prev) - count);
+			mlog(0, "\nfreeing %u bits starting at local "
+			     "alloc bit %u (la_start_blk = %llu, "
+			     "blkno = %llu)\n",
+			     count, ((start - prev) - count),
+			     (unsigned long long)la_start_blk,
+			     (unsigned long long)blkno);
+			status = ocfs2_release_clusters(handle, main_bm_inode,
+							main_bm_bh, blkno,
+							count);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+		if (bit_off >= left)
+			break;
+		count = 1;
+		start = bit_off + 1;
+	}
+bail:
+	return status;
+}
+
 static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
 				    handle_t *handle,
 				    struct ocfs2_dinode *alloc,
 				    struct inode *main_bm_inode,
 				    struct buffer_head *main_bm_bh)
 {
-	int status = 0;
-	int bit_off, left, count, start;
+	int i, status = 0;
+	int total, start, rec_cnt, credits;
+	u32 clusters;
 	u64 la_start_blk;
-	u64 blkno;
-	void *bitmap;
+	u8 *bitmap;
 	struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
 
 	trace_ocfs2_sync_local_to_main(
@@ -954,49 +1075,58 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
 		goto bail;
 	}
 
+	/* if all bits are used nothing to sync, just return */
 	if (le32_to_cpu(alloc->id1.bitmap1.i_used) ==
 	    le32_to_cpu(alloc->id1.bitmap1.i_total)) {
 		goto bail;
 	}
 
-	la_start_blk = ocfs2_clusters_to_blocks(osb->sb,
-						le32_to_cpu(la->la_bm_off));
-	bitmap = la->la_bitmap;
-	start = count = bit_off = 0;
-	left = le32_to_cpu(alloc->id1.bitmap1.i_total);
+	bitmap  = OCFS2_LOCAL_ALLOC_BITMAP(la);
+	rec_cnt = le16_to_cpu(la->la_rec_count) - 1;
 
-	while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, start))
-	       != -1) {
-		if ((bit_off < left) && (bit_off == start)) {
-			count++;
-			start++;
-			continue;
-		}
-		if (count) {
-			blkno = la_start_blk +
-				ocfs2_clusters_to_blocks(osb->sb,
-							 start - count);
+	for (i = rec_cnt; i >= 0 ; i--) {
+		la_start_blk = ocfs2_clusters_to_blocks(osb->sb,
+					le32_to_cpu(la->la_recs[i].la_start));
 
-			trace_ocfs2_sync_local_to_main_free(
-			     count, start - count,
-			     (unsigned long long)la_start_blk,
-			     (unsigned long long)blkno);
+		total    = le32_to_cpu(alloc->id1.bitmap1.i_total);
+		clusters = le32_to_cpu(la->la_recs[i].la_clusters);
+		start    = total - clusters;
 
-			status = ocfs2_release_clusters(handle,
-							main_bm_inode,
-							main_bm_bh, blkno,
-							count);
+		status = ocfs2_sync_local_rec_to_main(osb, handle, alloc,
+						      main_bm_inode,
+						      main_bm_bh, bitmap,
+						      la_start_blk, start,
+						      total);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		la->la_bm_off              = 0;
+		la->la_recs[i].la_start    = 0;
+		la->la_recs[i].la_clusters = 0;
+		le16_add_cpu(&la->la_rec_count, -1);
+		le32_add_cpu(&alloc->id1.bitmap1.i_total, -clusters);
+
+		ocfs2_journal_dirty(handle, osb->local_alloc_bh);
+
+		/* if we need more credits extend the transaction */
+		credits = OCFS2_WINDOW_MOVE_CREDITS - handle->h_buffer_credits;
+		if (credits > 0) {
+			status = ocfs2_extend_trans(handle, credits);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+			status = ocfs2_journal_access_di(handle,
+					 INODE_CACHE(osb->local_alloc_inode),
+					 osb->local_alloc_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 			if (status < 0) {
 				mlog_errno(status);
 				goto bail;
 			}
 		}
-		if (bit_off >= left)
-			break;
-		count = 1;
-		start = bit_off + 1;
 	}
-
 bail:
 	if (status)
 		mlog_errno(status);
@@ -1046,9 +1176,12 @@ static int ocfs2_recalc_la_window(struct ocfs2_super *osb,
 		 * We ran out of contiguous space in the primary
 		 * bitmap. Drastically reduce the number of bits used
 		 * by local alloc until we have to disable it.
+		 * In general we will be seeing at least a few contiguous free
+		 * bits. It should be ok to keep local alloc enabled even
+		 * in extreme case where max available contiguous free bit is 1
 		 */
 		bits = osb->local_alloc_bits >> 1;
-		if (bits > ocfs2_megabytes_to_clusters(osb->sb, 1)) {
+		if (bits) {
 			/*
 			 * By setting state to THROTTLED, we'll keep
 			 * the number of local alloc bits used down
@@ -1096,8 +1229,9 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
 		goto bail;
 	}
 
+	osb->local_alloc_bits = osb->local_alloc_default_bits;
 retry_enospc:
-	(*ac)->ac_bits_wanted = osb->local_alloc_default_bits;
+	(*ac)->ac_bits_wanted = osb->local_alloc_bits;
 	status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
 	if (status == -ENOSPC) {
 		if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) ==
@@ -1137,9 +1271,11 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
 					struct ocfs2_alloc_context *ac)
 {
 	int status = 0;
-	u32 cluster_off, cluster_count;
+	u32 wanted, cluster_off, cluster_count;
 	struct ocfs2_dinode *alloc = NULL;
 	struct ocfs2_local_alloc *la;
+	u8 *bitmap;
+	int i, rec_cnt, credits;
 
 	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
 	la = OCFS2_LOCAL_ALLOC(alloc);
@@ -1156,72 +1292,97 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
 	/* we used the generic suballoc reserve function, but we set
 	 * everything up nicely, so there's no reason why we can't use
 	 * the more specific cluster api to claim bits. */
-	status = ocfs2_claim_clusters(handle, ac, osb->local_alloc_bits,
-				      &cluster_off, &cluster_count);
-	if (status == -ENOSPC) {
-retry_enospc:
-		/*
-		 * Note: We could also try syncing the journal here to
-		 * allow use of any free bits which the current
-		 * transaction can't give us access to. --Mark
-		 */
-		if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_FRAGMENTED) ==
-		    OCFS2_LA_DISABLED)
-			goto bail;
-
-		ac->ac_bits_wanted = osb->local_alloc_default_bits;
-		status = ocfs2_claim_clusters(handle, ac,
-					      osb->local_alloc_bits,
-					      &cluster_off,
+	rec_cnt = 0;
+	wanted = osb->local_alloc_bits;
+	while (1) {
+		status = ocfs2_claim_clusters(handle, ac, wanted, &cluster_off,
 					      &cluster_count);
-		if (status == -ENOSPC)
-			goto retry_enospc;
-		/*
-		 * We only shrunk the *minimum* number of in our
-		 * request - it's entirely possible that the allocator
-		 * might give us more than we asked for.
-		 */
-		if (status == 0) {
-			spin_lock(&osb->osb_lock);
-			osb->local_alloc_bits = cluster_count;
-			spin_unlock(&osb->osb_lock);
+		if (status == -ENOSPC) {
+			/* reduce window size and retry */
+			if (ocfs2_recalc_la_window(osb,
+			   OCFS2_LA_EVENT_FRAGMENTED) == OCFS2_LA_DISABLED)
+				break;
+			wanted = osb->local_alloc_bits;
+			continue;
+		} else if (status < 0)
+			break;
+
+		BUG_ON(ac->ac_bits_given > ac->ac_bits_wanted);
+
+		/* found a window */
+		la->la_recs[rec_cnt].la_start    = cpu_to_le32(cluster_off);
+		la->la_recs[rec_cnt].la_clusters = cpu_to_le32(cluster_count);
+		rec_cnt++;
+		la->la_rec_count = cpu_to_le16(rec_cnt);
+		le32_add_cpu(&alloc->id1.bitmap1.i_total, cluster_count);
+
+		ocfs2_journal_dirty(handle, osb->local_alloc_bh);
+
+		if (!ocfs2_supports_discontig_la(osb)) {
+			la->la_bm_off = cpu_to_le32(cluster_off);
+			break;
+		}
+
+		/* exit if we can't fit another record */
+		wanted = ocfs2_local_alloc_adjust_bits_wanted(la, ac);
+		if (!wanted)
+			break;
+
+		if (wanted > osb->local_alloc_bits)
+			wanted = osb->local_alloc_bits;
+
+		/* if we need more credits extend the transaction */
+		if (rec_cnt >= OCFS2_MAX_LOCAL_ALLOC_REC_LIMIT)
+			break;
+
+		credits = OCFS2_WINDOW_MOVE_CREDITS - handle->h_buffer_credits;
+		if (credits > 0) {
+			status = ocfs2_extend_trans(handle, credits);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+			status = ocfs2_journal_access_di(handle,
+					 INODE_CACHE(osb->local_alloc_inode),
+					 osb->local_alloc_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
 		}
 	}
-	if (status < 0) {
-		if (status != -ENOSPC)
-			mlog_errno(status);
+	if (!rec_cnt)
 		goto bail;
-	}
 
+	osb->local_alloc_state = OCFS2_LA_ENABLED;
+	spin_lock(&osb->osb_lock);
+	if (cluster_count > osb->local_alloc_bits)
+		osb->local_alloc_bits = cluster_count;
+	spin_unlock(&osb->osb_lock);
 	osb->la_last_gd = ac->ac_last_group;
 
-	la->la_bm_off = cpu_to_le32(cluster_off);
-	alloc->id1.bitmap1.i_total = cpu_to_le32(cluster_count);
-	/* just in case... In the future when we find space ourselves,
-	 * we don't have to get all contiguous -- but we'll have to
-	 * set all previously used bits in bitmap and update
-	 * la_bits_set before setting the bits in the main bitmap. */
-	alloc->id1.bitmap1.i_used = 0;
-	memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0,
-	       le16_to_cpu(la->la_size));
-
-	ocfs2_resmap_restart(&osb->osb_la_resmap, cluster_count,
-			     OCFS2_LOCAL_ALLOC(alloc)->la_bitmap);
+	bitmap = OCFS2_LOCAL_ALLOC_BITMAP(la);
+	ocfs2_resmap_restart(&osb->osb_la_resmap, rec_cnt,
+			     alloc->id1.bitmap1.i_total, bitmap);
+	for (i = 0; i < rec_cnt; i++)
+		ocfs2_resmap_set_ext(&osb->osb_la_resmap, i,
+				     le32_to_cpu(la->la_recs[i].la_clusters));
 
-	trace_ocfs2_local_alloc_new_window_result(
-		OCFS2_LOCAL_ALLOC(alloc)->la_bm_off,
+	trace_ocfs2_local_alloc_new_window_result
+		(OCFS2_LOCAL_ALLOC(alloc)->la_recs[0].la_start,
 		le32_to_cpu(alloc->id1.bitmap1.i_total));
 
 bail:
-	if (status)
+	if ((status < 0) && (status != -ENOSPC))
 		mlog_errno(status);
+
 	return status;
 }
 
 /* Note that we do *NOT* lock the local alloc inode here as
  * it's been locked already for us. */
-static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
-					  struct inode *local_alloc_inode)
+static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb)
 {
 	int status = 0;
 	struct buffer_head *main_bm_bh = NULL;
@@ -1268,7 +1429,7 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
 	memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size);
 
 	status = ocfs2_journal_access_di(handle,
-					 INODE_CACHE(local_alloc_inode),
+					 INODE_CACHE(osb->local_alloc_inode),
 					 osb->local_alloc_bh,
 					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
-- 
1.5.4.3


From srinivas.eeda at oracle.com  Mon May  7 16:21:30 2012
From: srinivas.eeda at oracle.com (Srinivas Eeda)
Date: Mon,  7 May 2012 16:21:30 -0700
Subject: [Ocfs2-devel] [PATCH 3/3] ocfs2: modify reservation code to support
	discontigous localalloc
In-Reply-To: <1336432890-18638-1-git-send-email-srinivas.eeda@oracle.com>
References: <1336432890-18638-1-git-send-email-srinivas.eeda@oracle.com>
Message-ID: <1336432890-18638-4-git-send-email-srinivas.eeda@oracle.com>

Currently the reservation code assumes that a bitmap given to it is one
contiguous chunk. This patch enhances it to handle discontiguous chunks. It adds
new fields m_bitmap_ext_cnt and m_bitmap_ext_arr. m_bitmap_ext_arr tracks the
size of each contiguous run of free bits, and m_bitmap_ext_cnt tracks the number
of entries in m_bitmap_ext_arr.

Signed-off-by: Srinivas Eeda <srinivas.eeda at oracle.com>
---
 fs/ocfs2/reservations.c |   41 ++++++++++++++++++++++++++++++++++-------
 fs/ocfs2/reservations.h |    7 ++++++-
 2 files changed, 40 insertions(+), 8 deletions(-)

diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
index 41ffd36..fea93d7 100644
--- a/fs/ocfs2/reservations.c
+++ b/fs/ocfs2/reservations.c
@@ -291,7 +291,15 @@ static void ocfs2_resmap_clear_all_resv(struct ocfs2_reservation_map *resmap)
 	}
 }
 
-void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap,
+void ocfs2_resmap_set_ext(struct ocfs2_reservation_map *resmap, int arr, u32 sz)
+{
+	if (ocfs2_resmap_disabled(resmap))
+		return;
+
+	resmap->m_bitmap_ext_arr[arr] = sz;
+}
+
+void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap, u32 ext_cnt,
 			  unsigned int clen, char *disk_bitmap)
 {
 	if (ocfs2_resmap_disabled(resmap))
@@ -300,9 +308,21 @@ void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap,
 	spin_lock(&resv_lock);
 
 	ocfs2_resmap_clear_all_resv(resmap);
+
+	/* free existing extent array */
+	if (resmap->m_bitmap_ext_arr)
+		kfree(resmap->m_bitmap_ext_arr);
+
 	resmap->m_bitmap_len = clen;
 	resmap->m_disk_bitmap = disk_bitmap;
 
+	resmap->m_bitmap_ext_cnt = ext_cnt;
+	resmap->m_bitmap_ext_arr = kmalloc((sizeof(u32) * ext_cnt), GFP_NOFS);
+	if (!resmap->m_bitmap_ext_arr) {
+		mlog_errno(-ENOMEM);
+		resmap->m_osb->osb_resv_level = 0;
+	}
+
 	spin_unlock(&resv_lock);
 }
 
@@ -419,20 +439,26 @@ static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap,
 				       unsigned int *rlen)
 {
 	void *bitmap = resmap->m_disk_bitmap;
-	unsigned int best_start, best_len = 0;
+	unsigned int best_start, len, ext, best_len = 0;
 	int offset, start, found;
 
 	trace_ocfs2_resmap_find_free_bits_begin(search_start, search_len,
 						wanted, resmap->m_bitmap_len);
 
-	found = best_start = best_len = 0;
-
+	found = best_start = best_len = ext = 0;
 	start = search_start;
+	len = resmap->m_bitmap_ext_arr[ext++];
 	while ((offset = ocfs2_find_next_zero_bit(bitmap, resmap->m_bitmap_len,
-						 start)) != -1) {
+						  start)) != -1) {
 		/* Search reached end of the region */
 		if (offset >= (search_start + search_len))
-			break;
+			goto out;
+
+		if (offset >= len) {
+			len += resmap->m_bitmap_ext_arr[ext];
+			found = 1;
+			start = offset + 1;
+		}
 
 		if (offset == start) {
 			/* we found a zero */
@@ -450,9 +476,10 @@ static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap,
 		}
 
 		if (found >= wanted)
-			break;
+			goto out;
 	}
 
+out:
 	if (best_len == 0)
 		return 0;
 
diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h
index 42c2b80..bb5e94f 100644
--- a/fs/ocfs2/reservations.h
+++ b/fs/ocfs2/reservations.h
@@ -56,6 +56,8 @@ struct ocfs2_reservation_map {
 	u32			m_bitmap_len;	/* Number of valid
 						 * bits available */
 
+	u32			m_bitmap_ext_cnt;
+	u32			*m_bitmap_ext_arr;
 	struct list_head	m_lru;		/* LRU of reservations
 						 * structures. */
 
@@ -94,6 +96,9 @@ void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
 int ocfs2_resmap_init(struct ocfs2_super *osb,
 		      struct ocfs2_reservation_map *resmap);
 
+void ocfs2_resmap_set_ext(struct ocfs2_reservation_map *resmap, int arr,
+			  u32 sz);
+
 /**
  * ocfs2_resmap_restart() - "restart" a reservation bitmap
  * @resmap: reservations bitmap
@@ -107,7 +112,7 @@ int ocfs2_resmap_init(struct ocfs2_super *osb,
  * reservations. A future version will recalculate existing
  * reservations based on the new bitmap.
  */
-void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap,
+void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap, u32 ext_cnt,
 			  unsigned int clen, char *disk_bitmap);
 
 /**
-- 
1.5.4.3


From srinivas.eeda at oracle.com  Mon May  7 16:21:27 2012
From: srinivas.eeda at oracle.com (Srinivas Eeda)
Date: Mon,  7 May 2012 16:21:27 -0700
Subject: [Ocfs2-devel] ocfs2 discontiguous localalloc patches
Message-ID: <1336432890-18638-1-git-send-email-srinivas.eeda@oracle.com>

Hi all,

Can you please review the following 3 patches, which implement discontiguous
localalloc bitmap support for the ocfs2 file system? This feature helps
applications that significantly fragment the filesystem.

These fixes need changes to ocfs2 tools as well. I am sending those patches
for review separately.

A write up on this feature is available at
http://oss.oracle.com/osswiki/OCFS2/DesignDocs/DiscontiguousLocalAlloc.html

Thanks,
--Srini


From jlbec at evilplan.org  Mon May  7 17:01:01 2012
From: jlbec at evilplan.org (Joel Becker)
Date: Mon, 7 May 2012 17:01:01 -0700
Subject: [Ocfs2-devel] ocfs2 discontiguous localalloc patches
In-Reply-To: <1336432890-18638-1-git-send-email-srinivas.eeda@oracle.com>
References: <1336432890-18638-1-git-send-email-srinivas.eeda@oracle.com>
Message-ID: <20120508000100.GB4713@dhcp-172-17-9-228.mtv.corp.google.com>

On Mon, May 07, 2012 at 04:21:27PM -0700, Srinivas Eeda wrote:
> can you please review following 3 patches that implement discontiguous
> localalloc bitmap support for ocfs2 file system. This feature helps
> applications that significantly fragment the filesystem.

	Hi Srini.  Have you some performance numbers backing this?  That
is, I believe that the described filesystem turned off local alloc.  Do
you have proof that these patches, turning it back on, improved the
customer's performance?

Joel

-- 

"But all my words come back to me
 In shades of mediocrity.
 Like emptiness in harmony
 I need someone to comfort me."

			http://www.jlbec.org/
			jlbec at evilplan.org


From jlbec at evilplan.org  Mon May  7 17:05:33 2012
From: jlbec at evilplan.org (Joel Becker)
Date: Mon, 7 May 2012 17:05:33 -0700
Subject: [Ocfs2-devel] [PATCH 1/3] ocfs2: new structure to implment
 discontiguous local alloc bitmap
In-Reply-To: <1336432890-18638-2-git-send-email-srinivas.eeda@oracle.com>
References: <1336432890-18638-1-git-send-email-srinivas.eeda@oracle.com>
	<1336432890-18638-2-git-send-email-srinivas.eeda@oracle.com>
Message-ID: <20120508000532.GC4713@dhcp-172-17-9-228.mtv.corp.google.com>

On Mon, May 07, 2012 at 04:21:28PM -0700, Srinivas Eeda wrote:
> Current local alloc handles single contiguous free chunk of clusters. This
> patch enhances local alloc to handle discontigous free chunks. It adds a new
> ocfs2_local_alloc_rec structure which tracks single contiguous free chunk. An
> array of these sit in the bitmap itself and track discontiguous chunks. In
> best case there is only one record and increases as the filesystem gets
> fragmented. Number of records at a time are limited depending on the size
> of the bitmap and the max limit is defined by OCFS2_MAX_LOCAL_ALLOC_RECS.
> 
> Signed-off-by: Srinivas Eeda <srinivas.eeda at oracle.com>
> ---
>  fs/ocfs2/localalloc.c |   10 ++++++++++
>  fs/ocfs2/ocfs2.h      |    8 ++++++++
>  fs/ocfs2/ocfs2_fs.h   |   48 ++++++++++++++++++++++++++++++++++++++++++------
>  3 files changed, 60 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
> index 210c352..4190e53 100644
> --- a/fs/ocfs2/localalloc.c
> +++ b/fs/ocfs2/localalloc.c
> @@ -48,6 +48,16 @@
>  
>  #define OCFS2_LOCAL_ALLOC(dinode)	(&((dinode)->id2.i_lab))
>  
> +#define OCFS2_LOCAL_ALLOC_REC_SZ(la)	(le16_to_cpu(la->la_rec_count) *\
> +					 sizeof(struct ocfs2_local_alloc_rec))
> +#define OCFS2_LOCAL_ALLOC_BITMAP(la)    ((char *)(&(la->la_recs)) +\
> +					 OCFS2_LOCAL_ALLOC_REC_SZ(la))
> +#define OCFS2_LOCAL_ALLOC_BITS_PER_REC (sizeof(struct ocfs2_local_alloc_rec)*8)
> +
> +/* Maximum number of local alloc records */
> +#define OCFS2_MAX_LOCAL_ALLOC_REC_LIMIT	128
> +
> +
>  static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);
>  
>  static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
> diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
> index d355e6e..d4c36d2 100644
> --- a/fs/ocfs2/ocfs2.h
> +++ b/fs/ocfs2/ocfs2.h
> @@ -367,6 +367,7 @@ struct ocfs2_super
>  							 * by osb_lock */
>  
>  	struct buffer_head *local_alloc_bh;
> +	struct inode	   *local_alloc_inode;
>  
>  	u64 la_last_gd;
>  
> @@ -522,6 +523,13 @@ static inline int ocfs2_supports_discontig_bg(struct ocfs2_super *osb)
>  	return 0;
>  }
>  
> +static inline int ocfs2_supports_discontig_la(struct ocfs2_super *osb)
> +{
> +	if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_LA)
> +		return 1;
> +	return 0;
> +}
> +
>  static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb)
>  {
>  	if (ocfs2_supports_indexed_dirs(osb))
> diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
> index 938387a..6a0fe02 100644
> --- a/fs/ocfs2/ocfs2_fs.h
> +++ b/fs/ocfs2/ocfs2_fs.h
> @@ -102,7 +102,8 @@
>  					 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
>  					 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \
>  					 | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG	\
> -					 | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO)
> +					 | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO \
> +					 | OCFS2_FEATURE_INCOMPAT_DISCONTIG_LA)
>  #define OCFS2_FEATURE_RO_COMPAT_SUPP	(OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
>  					 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
>  					 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
> @@ -177,6 +178,9 @@
>   */
>  #define OCFS2_FEATURE_INCOMPAT_CLUSTERINFO	0x4000
>  
> +/* Discontiguous local alloc */
> +#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_LA	0x8000

	I really wish this could be an RO_COMPAT flag, but I think that
recovery on RO mounts will break with this.  Mark, please confirm, but I
think it has to be INCOMPAT.

> @@ -664,14 +668,19 @@ struct ocfs2_super_block {
>   * Local allocation bitmap for OCFS2 slots
>   * Note that it exists inside an ocfs2_dinode, so all offsets are
>   * relative to the start of ocfs2_dinode.id2.
> + * Each ocfs2_local_alloc_rec tracks one contigous chunk of clusters.
>   */
> +struct ocfs2_local_alloc_rec {
> +	__le32 la_start;	/* 1st cluster in this extent */
> +	__le32 la_clusters;	/* Number of contiguous clusters */
> +};
> +
>  struct ocfs2_local_alloc
>  {
>  /*00*/	__le32 la_bm_off;	/* Starting bit offset in main bitmap */
>  	__le16 la_size;		/* Size of included bitmap, in bytes */
> -	__le16 la_reserved1;
> -	__le64 la_reserved2;
> -/*10*/	__u8   la_bitmap[0];
> +	__le16 la_rec_count;	/* Number of discontiguous records */
> +	struct ocfs2_local_alloc_rec la_recs[0]; /* Localalloc records */
>  };

	You can't delete la_bitmap.  Any filesystem without DISCONTIG_LA
will be expecting the inline bitmap to start there.

> @@ -1380,11 +1389,24 @@ static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
>  	u16 size;
>  
>  	size = sb->s_blocksize -
> -		offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
> +		offsetof(struct ocfs2_dinode, id2.i_lab.la_recs);
> +	size -= sizeof(struct ocfs2_local_alloc_rec);

	You can't do this without checking for DISCONTIG_LA.  Again,
filesystems without DISCONTIG_LA will be starting at la_bitmap.

Joel


-- 

"If at first you don't succeed, cover all traces that you tried."
                                                        -Unknown

			http://www.jlbec.org/
			jlbec at evilplan.org


From jlbec at evilplan.org  Mon May  7 17:22:58 2012
From: jlbec at evilplan.org (Joel Becker)
Date: Mon, 7 May 2012 17:22:58 -0700
Subject: [Ocfs2-devel] [PATCH 2/3] ocfs2: implement discontiguous
	localalloc bitmap
In-Reply-To: <1336432890-18638-3-git-send-email-srinivas.eeda@oracle.com>
References: <1336432890-18638-1-git-send-email-srinivas.eeda@oracle.com>
	<1336432890-18638-3-git-send-email-srinivas.eeda@oracle.com>
Message-ID: <20120508002256.GD4713@dhcp-172-17-9-228.mtv.corp.google.com>

On Mon, May 07, 2012 at 04:21:29PM -0700, Srinivas Eeda wrote:
> This patch adds supporting functions and modifies localalloc code to implement
> discontiguous localalloc bitmap.
> 
> Signed-off-by: Srinivas Eeda <srinivas.eeda at oracle.com>
> ---
>  fs/ocfs2/localalloc.c |  523 ++++++++++++++++++++++++++++++++-----------------
>  1 files changed, 342 insertions(+), 181 deletions(-)
> 
> diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
> index 4190e53..f63381e 100644
> --- a/fs/ocfs2/localalloc.c
> +++ b/fs/ocfs2/localalloc.c
> @@ -48,6 +48,9 @@
>  
>  #define OCFS2_LOCAL_ALLOC(dinode)	(&((dinode)->id2.i_lab))
>  
> +/* defines minimum contiguous required */
> +#define OCFS2_LOCAL_ALLOC_MIN_BITS	2
> +
>  #define OCFS2_LOCAL_ALLOC_REC_SZ(la)	(le16_to_cpu(la->la_rec_count) *\
>  					 sizeof(struct ocfs2_local_alloc_rec))
>  #define OCFS2_LOCAL_ALLOC_BITMAP(la)    ((char *)(&(la->la_recs)) +\
> @@ -58,7 +61,8 @@
>  #define OCFS2_MAX_LOCAL_ALLOC_REC_LIMIT	128
>  
>  
> -static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);
> +static u32 ocfs2_local_alloc_count_bits(struct ocfs2_super *osb,
> +					struct ocfs2_dinode *alloc);
>  
>  static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
>  					     struct ocfs2_dinode *alloc,
> @@ -82,8 +86,7 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
>  					handle_t *handle,
>  					struct ocfs2_alloc_context *ac);
>  
> -static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
> -					  struct inode *local_alloc_inode);
> +static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb);

	I noted that you moved local_alloc_inode into ocfs2_super in the
previous patch.  Lifting that into the super should be one distinct
patch.  It should add the field to ocfs2_super and change the function
signatures at the same time.  Munging it with other patches confuses the
issue.
 
> @@ -202,6 +205,74 @@ unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb)
>  	return la_mb;
>  }
>  
> +static u32 ocfs2_local_bitmap_to_cluster(struct ocfs2_local_alloc *la, u32 bit)
> +{
> +	u32 start, prev, offset;
> +	int rec;
> +
> +	rec = start = prev = 0;
> +	for (rec = 0; rec < le16_to_cpu(la->la_rec_count); rec++) {
> +		prev = start;
> +		start += le32_to_cpu(la->la_recs[rec].la_clusters);
> +		if (bit < start)
> +			break;
> +	}
> +	offset = le32_to_cpu(la->la_recs[rec].la_start) + (bit - prev);
> +
> +	return offset;
> +}

	This can't work for non-DISCONTIG_LA filesystems.  I looked, and
you call this regardless of the feature bits.  Old filesystems will
crash, because they have bitmap bits instead of la_rec_count.  This is
why I said you couldn't remove la_bitmap.

> +/*
> + * This function is called before allocating a new chunk for the localalloc
> + * bitmap to make sure there is enough space in the bitmap for the new record
> + */
> +static u32 ocfs2_local_alloc_adjust_bits_wanted(struct ocfs2_local_alloc *la,
> +						struct ocfs2_alloc_context *ac)
> +{
> +	u32 required, available, cluster_cnt;
> +
> +	if (ac->ac_bits_given == ac->ac_bits_wanted)
> +		return 0;
> +
> +	/* total bits available in bitmap */
> +	available   = le16_to_cpu(la->la_size) << 3;
> +	cluster_cnt = ocfs2_local_alloc_cluster_count(la);
> +
> +	/*
> +	 * Wanted shouldn't be greater than bitmap size and given should be
> +	 * equal to cluster count
> +	 */
> +	BUG_ON(ac->ac_bits_given > ac->ac_bits_wanted);
> +	BUG_ON(ac->ac_bits_wanted > available);
> +	BUG_ON(ac->ac_bits_given != cluster_cnt);
> +
> +	/* reduce bits taken by each record structure */
> +	available -= (le16_to_cpu(la->la_rec_count) *
> +		      OCFS2_LOCAL_ALLOC_BITS_PER_REC);

	Again, no check for DISCONTIG_LA.  I'm going to stop mentioning
this.  Just assume that every place you want to touch la_rec_count, you
need to make sure you have a DISCONTIG_LA filesystem.

> @@ -348,21 +421,21 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
>  	}
>  
>  	/* do a little verification. */
> -	num_used = ocfs2_local_alloc_count_bits(alloc);
> +	num_used = ocfs2_local_alloc_count_bits(osb, alloc);
>  
>  	/* hopefully the local alloc has always been recovered before
>  	 * we load it. */
>  	if (num_used
>  	    || alloc->id1.bitmap1.i_used
>  	    || alloc->id1.bitmap1.i_total
> -	    || la->la_bm_off)
> +	    || la->la_rec_count)

	I lied.  You can't trust la_rec_count for non-DISCONTIG_LA
filesystems, so you can't have a naked check here.  Conversely,
la_bm_off is the valid check for those filesystems.  You need to
alternate based on the feature.

> @@ -690,8 +739,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
>  		le32_to_cpu(alloc->id1.bitmap1.i_used);
>  	if (bits_wanted > free_bits) {
>  		/* uhoh, window change time. */
> -		status =
> -			ocfs2_local_alloc_slide_window(osb, local_alloc_inode);
> +		status = ocfs2_local_alloc_slide_window(osb);

	This is what I mean about osb->local_alloc_inode.  There should
be a first patch that does these changes only.

> @@ -745,7 +792,7 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
>  {
>  	int status, start;
>  	struct inode *local_alloc_inode;
> -	void *bitmap;
> +	u8 *bitmap;

	I'm not sure about this.  Do you have a reason?

>  	struct ocfs2_dinode *alloc;
>  	struct ocfs2_local_alloc *la;
>  
> @@ -764,8 +811,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
>  		goto bail;
>  	}
>  
> -	bitmap = la->la_bitmap;
> -	*bit_off = le32_to_cpu(la->la_bm_off) + start;
> +	bitmap = OCFS2_LOCAL_ALLOC_BITMAP(la);
> +	*bit_off = ocfs2_local_bitmap_to_cluster(la, start);

	Here is the call that assumes a DISCONTIG_LA filesystem.

>  	*num_bits = bits_wanted;
>  
>  	status = ocfs2_journal_access_di(handle,
> @@ -792,16 +839,29 @@ bail:
>  	return status;
>  }
>  
> -static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
> +static u32 ocfs2_local_alloc_count_bits(struct ocfs2_super *osb,
> +					struct ocfs2_dinode *alloc)
>  {
>  	int i;
> -	u8 *buffer;
> +	u8 *bitmap;
>  	u32 count = 0;
>  	struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
>  
> -	buffer = la->la_bitmap;
> -	for (i = 0; i < le16_to_cpu(la->la_size); i++)
> -		count += hweight8(buffer[i]);
> +	/*
> +	 * if discontig is not enabled then lets update the first localalloc
> +	 * record with the current bitmap block info. We are doing this because
> +	 * old disk formats are not aware of the records.
> +	 */
> +	if (!ocfs2_supports_discontig_la(osb) && la->la_bm_off) {
> +		la->la_rec_count = cpu_to_le16(1);
> +		la->la_recs[0].la_start = la->la_bm_off;
> +		la->la_recs[0].la_clusters = alloc->id1.bitmap1.i_total;
> +	}

	OH MY DOG NO.  NEVER EVER DO THIS.  You cannot update an old
filesystem on the fly!  What about other nodes that are running older
versions of the software?  They will crash or corrupt data!  The entire
point of feature bits is to make sure all nodes are speaking the same
code.

NAK NAK NAK

	This explains why you trusted la_rec_count earlier.  But that is
broken.  When your patches are done, the code should use la_bm_off and
la_bitmap when !DISCONTIG_LA and then use la_rec_count, etc when
DISCONTIG_LA.  The only way to transition between them is a tunefs.ocfs2
operation that walks the filesystem, flushes the bitmap, and then
sets/clears la_rec_count appropriately depending on the direction.

Joel

-- 

"I inject pure kryptonite into my brain.
 It improves my kung fu, and it eases the pain."

			http://www.jlbec.org/
			jlbec at evilplan.org


From jlbec at evilplan.org  Mon May  7 17:28:23 2012
From: jlbec at evilplan.org (Joel Becker)
Date: Mon, 7 May 2012 17:28:23 -0700
Subject: [Ocfs2-devel] [PATCH 1/3] ocfs2: new structure to implment
 discontiguous local alloc bitmap
In-Reply-To: <1336432890-18638-2-git-send-email-srinivas.eeda@oracle.com>
References: <1336432890-18638-1-git-send-email-srinivas.eeda@oracle.com>
	<1336432890-18638-2-git-send-email-srinivas.eeda@oracle.com>
Message-ID: <20120508002820.GE4713@dhcp-172-17-9-228.mtv.corp.google.com>

On Mon, May 07, 2012 at 04:21:28PM -0700, Srinivas Eeda wrote:
> Current local alloc handles single contiguous free chunk of clusters. This
> patch enhances local alloc to handle discontigous free chunks. It adds a new
> ocfs2_local_alloc_rec structure which tracks single contiguous free chunk. An
> array of these sit in the bitmap itself and track discontiguous chunks. In
> best case there is only one record and increases as the filesystem gets
> fragmented. Number of records at a time are limited depending on the size
> of the bitmap and the max limit is defined by OCFS2_MAX_LOCAL_ALLOC_RECS.
> 
> Signed-off-by: Srinivas Eeda <srinivas.eeda at oracle.com>
> ---
>  fs/ocfs2/localalloc.c |   10 ++++++++++
>  fs/ocfs2/ocfs2.h      |    8 ++++++++
>  fs/ocfs2/ocfs2_fs.h   |   48 ++++++++++++++++++++++++++++++++++++++++++------
>  3 files changed, 60 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
> index 210c352..4190e53 100644
> --- a/fs/ocfs2/localalloc.c
> +++ b/fs/ocfs2/localalloc.c
> @@ -48,6 +48,16 @@
>  
>  #define OCFS2_LOCAL_ALLOC(dinode)	(&((dinode)->id2.i_lab))
>  
> +#define OCFS2_LOCAL_ALLOC_REC_SZ(la)	(le16_to_cpu(la->la_rec_count) *\
> +					 sizeof(struct ocfs2_local_alloc_rec))
> +#define OCFS2_LOCAL_ALLOC_BITMAP(la)    ((char *)(&(la->la_recs)) +\
> +					 OCFS2_LOCAL_ALLOC_REC_SZ(la))

	Another point.  Not only does this macro not handle
!DISCONTIG_LA filesystems (as described in my other email about this
patch), it should be a static inline function.  See eg: INODE_CACHE() in
fs/ocfs2/inode.h

Joel

-- 

Life's Little Instruction Book #456

	"Send your loved one flowers.  Think of a reason later."

			http://www.jlbec.org/
			jlbec at evilplan.org


From jlbec at evilplan.org  Mon May  7 17:34:31 2012
From: jlbec at evilplan.org (Joel Becker)
Date: Mon, 7 May 2012 17:34:31 -0700
Subject: [Ocfs2-devel] [PATCH 3/3] ocfs2: modify reservation code to
 support discontigous localalloc
In-Reply-To: <1336432890-18638-4-git-send-email-srinivas.eeda@oracle.com>
References: <1336432890-18638-1-git-send-email-srinivas.eeda@oracle.com>
	<1336432890-18638-4-git-send-email-srinivas.eeda@oracle.com>
Message-ID: <20120508003431.GF4713@dhcp-172-17-9-228.mtv.corp.google.com>

On Mon, May 07, 2012 at 04:21:30PM -0700, Srinivas Eeda wrote:
> Currently reservation code assumes a bitmap given to it is all one contigous
> chunk. This patch enhances it to handle a discontigous chunks. It adds new
> fields m_bitmap_ext_cnt and m_bitmap_ext_arr. m_bitmap_ext_arr tracks the sizes
> of each contigous free bits and m_bitmap_ext_cnt trackes number of
> m_bitmap_ext_arr.
> 
> Signed-off-by: Srinivas Eeda <srinivas.eeda at oracle.com>

Hi Srini,
	A patch like this should come before the feature patch.  Once
this code can treat the old single-range bitmap as a one-element
multiple-range bitmap, you can add the multiple-range change easily.

> +void ocfs2_resmap_set_ext(struct ocfs2_reservation_map *resmap, int arr, u32 sz)
> +{
> +	if (ocfs2_resmap_disabled(resmap))
> +		return;
> +
> +	resmap->m_bitmap_ext_arr[arr] = sz;
> +}

	I don't see this function called anywhere.  And please don't use
needless abbreviations.  If you want to say ocfs2_resmap_set_extent(),
write it out.  I don't quite get the arguments, and since it isn't
called, I can't figure out how they are used.

Joel

-- 

"To announce that there must be no criticism of them president, or
 that we are to stand by the president, right or wrong, is not only
 unpatriotic and servile, but is morally treasonable to the American
 public."
	- Theodore Roosevelt

			http://www.jlbec.org/
			jlbec at evilplan.org


From srinivas.eeda at oracle.com  Mon May  7 18:26:58 2012
From: srinivas.eeda at oracle.com (Srinivas Eeda)
Date: Mon, 07 May 2012 18:26:58 -0700
Subject: [Ocfs2-devel] ocfs2 discontiguous localalloc patches
In-Reply-To: <20120508000100.GB4713@dhcp-172-17-9-228.mtv.corp.google.com>
References: <1336432890-18638-1-git-send-email-srinivas.eeda@oracle.com>
	<20120508000100.GB4713@dhcp-172-17-9-228.mtv.corp.google.com>
Message-ID: <4FA87662.9040006@oracle.com>

Joel Becker wrote:
> On Mon, May 07, 2012 at 04:21:27PM -0700, Srinivas Eeda wrote:
>   
>> can you please review following 3 patches that implement discontiguous
>> localalloc bitmap support for ocfs2 file system. This feature helps
>> applications that significantly fragment the filesystem.
>>     
>
> 	Hi Srini.  Have you some performance numbers backing this?  That
> is, I believe that the described filesystem turned off local alloc.  Do
> you have proof that these patches, turning it back on, improved the
> customer's performance?
>
> Joel
>   
Hi Joel,

thanks a lot for the quick reply.

I have some stat_sysdir.sh snapshots at 
http://oss.oracle.com/~seeda/diag/stat_sysdir/ collected from a system. 
It has 4 snapshots collected when the file system usage is at 8%, 19%, 
21% and 52%.

In file stat_sysdir_52_percent_usage_slow_del.out, for the filesystem 
that has UUID: 3A6F54DF288C4AF2ABD1E00FC49BE7ED, you can see that the 
local_alloc:0000 bitmap total is 38 and that it is 0 (disabled) for 
local_alloc:0001 and local_alloc:0002. For the filesystem that has UUID 
AC444DB162AE427C899BA89E076DD479, all localalloc appears to be disabled. 
Sorry, I didn't collect /sys/kernel/debug/fs/<uuid>/fs_state. But given 
the file system state, even if localalloc is not disabled, it needs to 
be refilled every 40 clusters.

Thanks,
--Srini


From srinivas.eeda at oracle.com  Mon May  7 19:10:32 2012
From: srinivas.eeda at oracle.com (Srinivas Eeda)
Date: Mon, 07 May 2012 19:10:32 -0700
Subject: [Ocfs2-devel] [PATCH 2/3] ocfs2: implement discontiguous
	localalloc bitmap
In-Reply-To: <20120508002256.GD4713@dhcp-172-17-9-228.mtv.corp.google.com>
References: <1336432890-18638-1-git-send-email-srinivas.eeda@oracle.com>
	<1336432890-18638-3-git-send-email-srinivas.eeda@oracle.com>
	<20120508002256.GD4713@dhcp-172-17-9-228.mtv.corp.google.com>
Message-ID: <4FA88098.9070301@oracle.com>

Joel Becker wrote:
> On Mon, May 07, 2012 at 04:21:29PM -0700, Srinivas Eeda wrote:
>   
>
> 	OH MY DOG NO.  NEVER EVER DO THIS.  You cannot update an old
> filesystem on the fly!  What about other nodes that are running older
> versions of the software?  They will crash or corrupt data!  The entire
> point of feature bits is to make sure all nodes are speaking the same
> code.
>
> NAK NAK NAK
>
> 	This explains why you trusted la_rec_count earlier.  But that is
> broken.  When your patches are done, the code should use la_bm_off and
> la_bitmap when !DISCONTIG_LA and then use la_rec_count, etc when
> DISCONTIG_LA.  The only way to transition between them is a tunefs.ocfs2
> operation that walks the filesystem, flushes the bitmap, and then
> sets/clears la_rec_count appropriately depending on the direction..
>   
Please please don't hate me :( ... the changes take care of old formats 
as well ...  I used the reserved space in the structure so that the code 
changes will be minimal and still compatible with old file system 
formats. I agree that we need to have some reserved space still 
available. So as discussed I'll redo the changes accordingly. Please 
ignore all the patches.

Thanks,
--Srini


From xiaowei.hu at oracle.com  Thu May 24 22:53:22 2012
From: xiaowei.hu at oracle.com (xiaowei.hu at oracle.com)
Date: Fri, 25 May 2012 13:53:22 +0800
Subject: [Ocfs2-devel] [PATCH] Fix waiting status race condition in dlm
	recovery
Message-ID: <1337925202-13086-1-git-send-email-xiaowei.hu@oracle.com>

From: "Xiaowei.Hu" <xiaowei.hu at oracle.com>

When the recovery master requests locks but one or more of the live nodes die
after receiving the request message and before sending out the lock packages,
recovery falls into an endless loop, waiting for the status to change to finalize.

NodeA                                     NodeB
selected as recovery master
dlm_remaster_locks
  -> dlm_requeset_all_locks
  this send request locks msg to B
                                          received the msg from A,
                                          queue worker dlm_request_all_locks_worker
                                          return 0
go on set state to requested
wait for the state become done
                                          NodeB lost connection due to network
                                          before the worker begin, or it die.
NodeA still waiting for the
change of reco state.
It won't end if it not get data done msg
At this point NodeB does not realize this (or it has simply died), so it will
never send the message, and NodeA is left in the recovery process forever.

This patch lets the recovery master check whether the node is still in the live
node map while it stays in the REQUESTED state.

Signed-off-by: Xiaowei.Hu <xiaowei.hu at oracle.com>
---
 fs/ocfs2/dlm/dlmrecovery.c |    9 +++++++++
 1 files changed, 9 insertions(+), 0 deletions(-)

diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 01ebfd0..62659e8 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -555,6 +555,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 	int all_nodes_done;
 	int destroy = 0;
 	int pass = 0;
+	int dying = 0;
 
 	do {
 		/* we have become recovery master.  there is no escaping
@@ -659,6 +660,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 		list_for_each_entry(ndata, &dlm->reco.node_data, list) {
 			mlog(0, "checking recovery state of node %u\n",
 			     ndata->node_num);
+			dying = 0;
 			switch (ndata->state) {
 				case DLM_RECO_NODE_DATA_INIT:
 				case DLM_RECO_NODE_DATA_REQUESTING:
@@ -679,6 +681,13 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 					     dlm->name, ndata->node_num,
 					     ndata->state==DLM_RECO_NODE_DATA_RECEIVING ?
 					     "receiving" : "requested");
+					spin_lock(&dlm->spinlock);
+					dying = !test_bit(ndata->node_num, dlm->live_nodes_map);
+					spin_unlock(&dlm->spinlock);
+					if (dying) {
+						ndata->state = DLM_RECO_NODE_DATA_DEAD;
+						break;
+					}
 					all_nodes_done = 0;
 					break;
 				case DLM_RECO_NODE_DATA_DONE:
-- 
1.7.7.6


From srinivas.eeda at oracle.com  Fri May 25 15:17:57 2012
From: srinivas.eeda at oracle.com (srinivas eeda)
Date: Fri, 25 May 2012 15:17:57 -0700
Subject: [Ocfs2-devel] [PATCH] Fix waiting status race condition in dlm
	recovery
In-Reply-To: <1337925202-13086-1-git-send-email-xiaowei.hu@oracle.com>
References: <1337925202-13086-1-git-send-email-xiaowei.hu@oracle.com>
Message-ID: <4FC00515.1060105@oracle.com>

comments inline

On 5/24/2012 10:53 PM, xiaowei.hu at oracle.com wrote:
> From: "Xiaowei.Hu"<xiaowei.hu at oracle.com>
>
> when the master requested locks ,but one/some of the live nodes died,
> after it received the request msg and before send out the locks packages,
> the recovery will fall into endless loop,waiting for the status changed to finalize
>
> NodeA                                     NodeB
> selected as recovery master
> dlm_remaster_locks
>    ->  dlm_requeset_all_locks
>    this send request locks msg to B
>                                            received the msg from A,
>                                            queue worker dlm_request_all_locks_worker
>                                            return 0
> go on set state to requested
> wait for the state become done
>                                            NodeB lost connection due to network
>                                            before the worker begin, or it die.
> NodeA still waiting for the
> change of reco state.
> It won't end if it not get data done msg
> And at this time nodeB do not realize this (or it just died),
> it won't send the msg for ever, nodeA left in the recovery process forever.
>
> This patch let the recovery master check if the node still in live node
> map when it stay in REQUESTED status.
>
> Signed-off-by: Xiaowei.Hu<xiaowei.hu at oracle.com>
> ---
>   fs/ocfs2/dlm/dlmrecovery.c |    9 +++++++++
>   1 files changed, 9 insertions(+), 0 deletions(-)
>
> diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
> index 01ebfd0..62659e8 100644
> --- a/fs/ocfs2/dlm/dlmrecovery.c
> +++ b/fs/ocfs2/dlm/dlmrecovery.c
> @@ -555,6 +555,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
>   	int all_nodes_done;
>   	int destroy = 0;
>   	int pass = 0;
> +	int dying = 0;
>
>   	do {
>   		/* we have become recovery master.  there is no escaping
> @@ -659,6 +660,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
>   		list_for_each_entry(ndata,&dlm->reco.node_data, list) {
>   			mlog(0, "checking recovery state of node %u\n",
>   			     ndata->node_num);
> +			dying = 0;
>   			switch (ndata->state) {
>   				case DLM_RECO_NODE_DATA_INIT:
>   				case DLM_RECO_NODE_DATA_REQUESTING:
> @@ -679,6 +681,13 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
>   					     dlm->name, ndata->node_num,
>   					     ndata->state==DLM_RECO_NODE_DATA_RECEIVING ?
>   					     "receiving" : "requested");
> +					spin_lock(&dlm->spinlock);
> +					dying = !test_bit(ndata->node_num, dlm->live_nodes_map);
> +					spin_unlock(&dlm->spinlock);
> +					if (dying) {
> +						ndata->state = DLM_RECO_NODE_DATA_DEAD;
> +						break;
> +					}
>   					all_nodes_done = 0;
>   					break;
>   				case DLM_RECO_NODE_DATA_DONE:
fix seems to address the issue, but can you please add a function 
dlm_is_node_in_livemap similar to dlm_is_node_dead so that it improves 
readability. You can then add the following to check if the node is 
still alive
+        if (!dlm_is_node_in_livemap(dlm, ndata->node_num))
+            ndata->state = DLM_RECO_NODE_DATA_DEAD;
+        else
+            all_nodes_done = 0;


From xiaowei.hu at oracle.com  Fri May 25 19:05:14 2012
From: xiaowei.hu at oracle.com (Xiaowei)
Date: Sat, 26 May 2012 10:05:14 +0800
Subject: [Ocfs2-devel] [PATCH] Fix waiting status race condition in dlm
	recovery
In-Reply-To: <4FC00515.1060105@oracle.com>
References: <1337925202-13086-1-git-send-email-xiaowei.hu@oracle.com>
	<4FC00515.1060105@oracle.com>
Message-ID: <4FC03A5A.6090705@oracle.com>

Thanks Srini ,
This sounds good, I tried to use dlm_is_node_dead in this patch , but 
this function can't report
another node is dead if this node already in recovery process. It was 
blocked to set the bit in domain_map,
but the live_nodes_map could always reflect the really live nodes.

I will reformat the patch.

Thanks,
Xiaowei

On 05/26/2012 06:17 AM, srinivas eeda wrote:
> comments inline
>
> On 5/24/2012 10:53 PM, xiaowei.hu at oracle.com wrote:
>> From: "Xiaowei.Hu"<xiaowei.hu at oracle.com>
>>
>> when the master requested locks ,but one/some of the live nodes died,
>> after it received the request msg and before send out the locks 
>> packages,
>> the recovery will fall into endless loop,waiting for the status 
>> changed to finalize
>>
>> NodeA                                     NodeB
>> selected as recovery master
>> dlm_remaster_locks
>>    ->  dlm_requeset_all_locks
>>    this send request locks msg to B
>>                                            received the msg from A,
>>                                            queue worker 
>> dlm_request_all_locks_worker
>>                                            return 0
>> go on set state to requested
>> wait for the state become done
>>                                            NodeB lost connection due 
>> to network
>>                                            before the worker begin, 
>> or it die.
>> NodeA still waiting for the
>> change of reco state.
>> It won't end if it not get data done msg
>> And at this time nodeB do not realize this (or it just died),
>> it won't send the msg for ever, nodeA left in the recovery process 
>> forever.
>>
>> This patch let the recovery master check if the node still in live node
>> map when it stay in REQUESTED status.
>>
>> Signed-off-by: Xiaowei.Hu<xiaowei.hu at oracle.com>
>> ---
>>   fs/ocfs2/dlm/dlmrecovery.c |    9 +++++++++
>>   1 files changed, 9 insertions(+), 0 deletions(-)
>>
>> diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
>> index 01ebfd0..62659e8 100644
>> --- a/fs/ocfs2/dlm/dlmrecovery.c
>> +++ b/fs/ocfs2/dlm/dlmrecovery.c
>> @@ -555,6 +555,7 @@ static int dlm_remaster_locks(struct dlm_ctxt 
>> *dlm, u8 dead_node)
>>       int all_nodes_done;
>>       int destroy = 0;
>>       int pass = 0;
>> +    int dying = 0;
>>
>>       do {
>>           /* we have become recovery master.  there is no escaping
>> @@ -659,6 +660,7 @@ static int dlm_remaster_locks(struct dlm_ctxt 
>> *dlm, u8 dead_node)
>>           list_for_each_entry(ndata,&dlm->reco.node_data, list) {
>>               mlog(0, "checking recovery state of node %u\n",
>>                    ndata->node_num);
>> +            dying = 0;
>>               switch (ndata->state) {
>>                   case DLM_RECO_NODE_DATA_INIT:
>>                   case DLM_RECO_NODE_DATA_REQUESTING:
>> @@ -679,6 +681,13 @@ static int dlm_remaster_locks(struct dlm_ctxt 
>> *dlm, u8 dead_node)
>>                            dlm->name, ndata->node_num,
>>                            ndata->state==DLM_RECO_NODE_DATA_RECEIVING ?
>>                            "receiving" : "requested");
>> +                    spin_lock(&dlm->spinlock);
>> +                    dying = !test_bit(ndata->node_num, 
>> dlm->live_nodes_map);
>> +                    spin_unlock(&dlm->spinlock);
>> +                    if (dying) {
>> +                        ndata->state = DLM_RECO_NODE_DATA_DEAD;
>> +                        break;
>> +                    }
>>                       all_nodes_done = 0;
>>                       break;
>>                   case DLM_RECO_NODE_DATA_DONE:
> fix seems to address the issue, but can you please add a function 
> dlm_is_node_in_livemap similar to dlm_is_node_dead so that it' 
> improves readability. You can then add the following to check if the 
> node is still alive
> +        if (!dlm_is_node_in_livemap(dlm, ndata->node_num))
> +            ndate->state = DLM_RECO_NODE_DATA_DEAD;
> +        else
> +            all_nodes_done = 0;


From xiaowei.hu at oracle.com  Fri May 25 19:27:29 2012
From: xiaowei.hu at oracle.com (xiaowei.hu at oracle.com)
Date: Sat, 26 May 2012 10:27:29 +0800
Subject: [Ocfs2-devel] [PATCH] Fix waiting status race condition in dlm
	recovery V2
Message-ID: <1337999249-15242-1-git-send-email-xiaowei.hu@oracle.com>

From: "Xiaowei.Hu" <xiaowei.hu at oracle.com>

When the master has requested locks, but one/some of the live nodes die
after receiving the request msg and before sending out the lock packages,
the recovery falls into an endless loop, waiting for the status to change to finalize.

NodeA                                     NodeB
selected as recovery master
dlm_remaster_locks
  -> dlm_request_all_locks
  this send request locks msg to B
                                          received the msg from A,
                                          queue worker dlm_request_all_locks_worker
                                          return 0
go on set state to requested
wait for the state become done
                                          NodeB lost connection due to network
                                          before the worker begin, or it die.

NodeA keeps waiting for the reco state to change.
The wait never ends unless it gets the data done msg.
At this point NodeB does not realize this (or it has just died), so it will
never send the msg, and NodeA is left in the recovery process forever.

This patch lets the recovery master check whether the node is still in the
live node map while it stays in REQUESTED status.

Signed-off-by: Xiaowei.Hu <xiaowei.hu at oracle.com>
---
 fs/ocfs2/dlm/dlmrecovery.c |   16 +++++++++++++++-
 1 files changed, 15 insertions(+), 1 deletions(-)

diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 01ebfd0..546c5b5 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -339,6 +339,17 @@ static int dlm_reco_master_ready(struct dlm_ctxt *dlm)
 	return ready;
 }
 
+/* returns true if node is still in the live node map;
+ * that map is cleared before the domain map, so it can be checked in recovery */
+int dlm_is_node_in_livemap(struct dlm_ctxt *dlm, u8 node)
+{
+	int live;
+	spin_lock(&dlm->spinlock);
+	live = !!test_bit(node, dlm->live_nodes_map);
+	spin_unlock(&dlm->spinlock);
+	return live;
+}
+
 /* returns true if node is no longer in the domain
  * could be dead or just not joined */
 int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node)
@@ -679,7 +690,10 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 					     dlm->name, ndata->node_num,
 					     ndata->state==DLM_RECO_NODE_DATA_RECEIVING ?
 					     "receiving" : "requested");
-					all_nodes_done = 0;
+					if (!dlm_is_node_in_livemap(dlm, ndata->node_num))
+						ndata->state = DLM_RECO_NODE_DATA_DEAD;
+					else
+						all_nodes_done = 0;
 					break;
 				case DLM_RECO_NODE_DATA_DONE:
 					mlog(0, "%s: node %u state is done\n",
-- 
1.7.7.6


From sunil.mushran at gmail.com  Tue May 29 15:09:08 2012
From: sunil.mushran at gmail.com (Sunil Mushran)
Date: Tue, 29 May 2012 15:09:08 -0700
Subject: [Ocfs2-devel] [PATCH] Fix waiting status race condition in dlm
	recovery
In-Reply-To: <1337925202-13086-1-git-send-email-xiaowei.hu@oracle.com>
References: <1337925202-13086-1-git-send-email-xiaowei.hu@oracle.com>
Message-ID: <CAEeiSHXcaKXi7Qm5vLBmTp2CjiB7DCrUee5qmr03YpuJbzP5yg@mail.gmail.com>

On Thu, May 24, 2012 at 10:53 PM, <xiaowei.hu at oracle.com> wrote:

>
> diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
> index 01ebfd0..62659e8 100644
> --- a/fs/ocfs2/dlm/dlmrecovery.c
> +++ b/fs/ocfs2/dlm/dlmrecovery.c
> @@ -555,6 +555,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8
> dead_node)
>        int all_nodes_done;
>        int destroy = 0;
>        int pass = 0;
> +       int dying = 0;
>
>        do {
>                /* we have become recovery master.  there is no escaping
> @@ -659,6 +660,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8
> dead_node)
>                list_for_each_entry(ndata, &dlm->reco.node_data, list) {
>                        mlog(0, "checking recovery state of node %u\n",
>                             ndata->node_num);
> +                       dying = 0;
>                        switch (ndata->state) {
>                                case DLM_RECO_NODE_DATA_INIT:
>                                case DLM_RECO_NODE_DATA_REQUESTING:
> @@ -679,6 +681,13 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm,
> u8 dead_node)
>                                             dlm->name, ndata->node_num,
>
> ndata->state==DLM_RECO_NODE_DATA_RECEIVING ?
>                                             "receiving" : "requested");
> +                                       spin_lock(&dlm->spinlock);
> +                                       dying = !test_bit(ndata->node_num,
> dlm->live_nodes_map);
> +                                       spin_unlock(&dlm->spinlock);
> +                                       if (dying) {
> +                                               ndata->state =
> DLM_RECO_NODE_DATA_DEAD;
> +                                               break;
> +                                       }
>


I would suggest exploring adding this in dlm hb down event. Checking live
map all
over the place is hacky. We do it more than we should right now. Let's not
add to the
mess.


>                                        all_nodes_done = 0;
>                                        break;
>                                case DLM_RECO_NODE_DATA_DONE:
> --
> 1.7.7.6
>
>
> _______________________________________________
> Ocfs2-devel mailing list
> Ocfs2-devel at oss.oracle.com
> http://oss.oracle.com/mailman/listinfo/ocfs2-devel
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: http://oss.oracle.com/pipermail/ocfs2-devel/attachments/20120529/1080a567/attachment.html 

From xiaowei.hu at oracle.com  Tue May 29 17:41:09 2012
From: xiaowei.hu at oracle.com (Xiaowei)
Date: Wed, 30 May 2012 08:41:09 +0800
Subject: [Ocfs2-devel] [PATCH] Fix waiting status race condition in dlm
 recovery
In-Reply-To: <CAEeiSHXcaKXi7Qm5vLBmTp2CjiB7DCrUee5qmr03YpuJbzP5yg@mail.gmail.com>
References: <1337925202-13086-1-git-send-email-xiaowei.hu@oracle.com>
	<CAEeiSHXcaKXi7Qm5vLBmTp2CjiB7DCrUee5qmr03YpuJbzP5yg@mail.gmail.com>
Message-ID: <4FC56CA5.8040902@oracle.com>

On 05/30/2012 06:09 AM, Sunil Mushran wrote:
> On Thu, May 24, 2012 at 10:53 PM, <xiaowei.hu at oracle.com 
> <mailto:xiaowei.hu at oracle.com>> wrote:
>
>
>     diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
>     index 01ebfd0..62659e8 100644
>     --- a/fs/ocfs2/dlm/dlmrecovery.c
>     +++ b/fs/ocfs2/dlm/dlmrecovery.c
>     @@ -555,6 +555,7 @@ static int dlm_remaster_locks(struct dlm_ctxt
>     *dlm, u8 dead_node)
>            int all_nodes_done;
>            int destroy = 0;
>            int pass = 0;
>     +       int dying = 0;
>
>            do {
>                    /* we have become recovery master.  there is no
>     escaping
>     @@ -659,6 +660,7 @@ static int dlm_remaster_locks(struct dlm_ctxt
>     *dlm, u8 dead_node)
>                    list_for_each_entry(ndata, &dlm->reco.node_data,
>     list) {
>                            mlog(0, "checking recovery state of node %u\n",
>                                 ndata->node_num);
>     +                       dying = 0;
>                            switch (ndata->state) {
>                                    case DLM_RECO_NODE_DATA_INIT:
>                                    case DLM_RECO_NODE_DATA_REQUESTING:
>     @@ -679,6 +681,13 @@ static int dlm_remaster_locks(struct dlm_ctxt
>     *dlm, u8 dead_node)
>                                                 dlm->name,
>     ndata->node_num,
>                                                
>     ndata->state==DLM_RECO_NODE_DATA_RECEIVING ?
>                                                 "receiving" :
>     "requested");
>     +                                       spin_lock(&dlm->spinlock);
>     +                                       dying =
>     !test_bit(ndata->node_num, dlm->live_nodes_map);
>     +                                       spin_unlock(&dlm->spinlock);
>     +                                       if (dying) {
>     +                                               ndata->state =
>     DLM_RECO_NODE_DATA_DEAD;
>     +                                               break;
>     +                                       }
>
>
>
>
>
> I would suggest exploring adding this in dlm hb down event. Checking 
> live map all
> over the place is hacky. We do it more than we should right now. Let's 
> not add to the
> mess.
HI Sunil,

Do you mean we should clear the bit in domain map in dlm hb down event 
directly when the node down
and check with dlm_is_node_dead at here?
Or how could we explore and ensure the node is alive during the whole 
migrate process?One node could die even after it sends out one locks 
package and before the next if there were too many locks on that lockres.

Thanks,
Xiaowei
>
>
>
>                                            all_nodes_done = 0;
>                                            break;
>                                    case DLM_RECO_NODE_DATA_DONE:
>     --
>     1.7.7.6
>
>
>     _______________________________________________
>     Ocfs2-devel mailing list
>     Ocfs2-devel at oss.oracle.com <mailto:Ocfs2-devel at oss.oracle.com>
>     http://oss.oracle.com/mailman/listinfo/ocfs2-devel
>
>

-------------- next part --------------
An HTML attachment was scrubbed...
URL: http://oss.oracle.com/pipermail/ocfs2-devel/attachments/20120530/5fcb3ea7/attachment.html 

From sunil.mushran at gmail.com  Wed May 30 18:18:12 2012
From: sunil.mushran at gmail.com (Sunil Mushran)
Date: Wed, 30 May 2012 18:18:12 -0700
Subject: [Ocfs2-devel] [PATCH] Fix waiting status race condition in dlm
	recovery
In-Reply-To: <4FC56CA5.8040902@oracle.com>
References: <1337925202-13086-1-git-send-email-xiaowei.hu@oracle.com>
	<CAEeiSHXcaKXi7Qm5vLBmTp2CjiB7DCrUee5qmr03YpuJbzP5yg@mail.gmail.com>
	<4FC56CA5.8040902@oracle.com>
Message-ID: <CAEeiSHWkhD8x8nrix2+Wc1nesH8CExU6kA10nCH0J1nCwUaDtg@mail.gmail.com>

On Tue, May 29, 2012 at 5:41 PM, Xiaowei <xiaowei.hu at oracle.com> wrote:
> On 05/30/2012 06:09 AM, Sunil Mushran wrote:
> I would suggest exploring adding this in dlm hb down event. Checking live
> map all
> over the place is hacky. We do it more than we should right now. Let's not
> add to the
> mess.
>
> HI Sunil,
>
> Do you mean we should clear the bit in domain map in dlm hb down event
> directly when the node down
> and check with dlm_is_node_dead at here?
> Or how could we explore and ensure the node is alive during the whole
> migrate process?One node could die even after it sends out one locks package
> and before the next if there were too many locks on that lockres.

dlm hb down event is triggered when a node is declared dead. That's where we
clean up pending mles, etc. You can add a check for recovery and add logic to
change the reco state for that node there.


From junxiao.bi at oracle.com  Wed May 30 21:12:29 2012
From: junxiao.bi at oracle.com (Junxiao Bi)
Date: Thu, 31 May 2012 12:12:29 +0800
Subject: [Ocfs2-devel] [PATCH 1/2] aio: make kiocb->private NUll in
	init_sync_kiocb()
Message-ID: <1338437550-24499-1-git-send-email-junxiao.bi@oracle.com>

Ocfs2 uses kiocb.*private as a flag of unsigned long size. In
commit a11f7e6 ("ocfs2: serialize unaligned aio"), an unaligned
io flag was added to it to serialize unaligned aio. As
*private is not initialized in init_sync_kiocb() of do_sync_write(),
this unaligned io flag may be unexpectedly set in an aligned dio.
This causes OCFS2_I(inode)->ip_unaligned_aio to be decremented
to -1 in ocfs2_dio_end_io(), so the following unaligned dio
will hang forever at ocfs2_aiodio_wait() in ocfs2_file_write_iter().

We can't initialize this flag in ocfs2_file_write_iter() since
it may be invoked several times by do_sync_write(). So we initialize
it in init_sync_kiocb(); this is also useful for other similar uses of
it in the future.

Signed-off-by: Junxiao Bi <junxiao.bi at oracle.com>
---
 include/linux/aio.h |    1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/aio.h b/include/linux/aio.h
index 2314ad8..b1a520e 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -140,6 +140,7 @@ struct kiocb {
 		(x)->ki_dtor = NULL;			\
 		(x)->ki_obj.tsk = tsk;			\
 		(x)->ki_user_data = 0;                  \
+		(x)->private = NULL;			\
 	} while (0)
 
 #define AIO_RING_MAGIC			0xa10a10a1
-- 
1.7.9.5


From junxiao.bi at oracle.com  Wed May 30 21:12:30 2012
From: junxiao.bi at oracle.com (Junxiao Bi)
Date: Thu, 31 May 2012 12:12:30 +0800
Subject: [Ocfs2-devel] [PATCH 2/2] ocfs2: clear unaligned io flag when dio
	fails
In-Reply-To: <1338437550-24499-1-git-send-email-junxiao.bi@oracle.com>
References: <1338437550-24499-1-git-send-email-junxiao.bi@oracle.com>
Message-ID: <1338437550-24499-2-git-send-email-junxiao.bi@oracle.com>

The unaligned io flag is set in the kiocb when an unaligned
dio is issued, it should be cleared even when the dio fails,
or it may affect the following io which are using the same
kiocb.

Signed-off-by: Junxiao Bi <junxiao.bi at oracle.com>
---
 fs/ocfs2/file.c |    4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 061591a..98513c8 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2422,8 +2422,10 @@ out_dio:
 		unaligned_dio = 0;
 	}
 
-	if (unaligned_dio)
+	if (unaligned_dio) {
+		ocfs2_iocb_clear_unaligned_aio(iocb);
 		atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio);
+	}
 
 out:
 	if (rw_level != -1)
-- 
1.7.9.5


From jmoyer at redhat.com  Thu May 31 07:08:13 2012
From: jmoyer at redhat.com (Jeff Moyer)
Date: Thu, 31 May 2012 10:08:13 -0400
Subject: [Ocfs2-devel] [PATCH 1/2] aio: make kiocb->private NUll in
	init_sync_kiocb()
In-Reply-To: <1338437550-24499-1-git-send-email-junxiao.bi@oracle.com>
	(Junxiao Bi's message of "Thu, 31 May 2012 12:12:29 +0800")
References: <1338437550-24499-1-git-send-email-junxiao.bi@oracle.com>
Message-ID: <x49k3zscuk2.fsf@segfault.boston.devel.redhat.com>

Junxiao Bi <junxiao.bi at oracle.com> writes:

> Ocfs2 uses kiocb.*private as a flag of unsigned long size. In
> commit a11f7e6 ocfs2: serialize unaligned aio, the unaligned
> io flag is involved in it to serialize the unaligned aio. As
> *private is not initialized in init_sync_kiocb() of do_sync_write(),
> this unaligned io flag may be unexpectly set in an aligned dio.
> And this will cause OCFS2_I(inode)->ip_unaligned_aio decreased
> to -1 in ocfs2_dio_end_io(), thus the following unaligned dio
> will hang forever at ocfs2_aiodio_wait() in ocfs2_file_write_iter().

> We can't initialized this flag in ocfs2_file_write_iter() since
> it may be invoked several times by do_sync_write(). So we initialize
> it in init_sync_kiocb(), it's also useful for other similiar use of
> it in the future.

I don't see any ocfs2_file_write_iter in the upstream kernel.
ocfs2_file_aio_write most certainly could set ->private to 0, it
will only be called once for a given kiocb.

That point aside, I have no issues with setting private to NULL in
init_sync_kiocb.  If you fix up the comment to reflect reality
w.r.t. the upstream kernel source, I'll ack the patch.

Cheers,
Jeff


From jmoyer at redhat.com  Thu May 31 07:09:09 2012
From: jmoyer at redhat.com (Jeff Moyer)
Date: Thu, 31 May 2012 10:09:09 -0400
Subject: [Ocfs2-devel] [PATCH 2/2] ocfs2: clear unaligned io flag when
	dio fails
In-Reply-To: <1338437550-24499-2-git-send-email-junxiao.bi@oracle.com>
	(Junxiao Bi's message of "Thu, 31 May 2012 12:12:30 +0800")
References: <1338437550-24499-1-git-send-email-junxiao.bi@oracle.com>
	<1338437550-24499-2-git-send-email-junxiao.bi@oracle.com>
Message-ID: <x49fwagcuii.fsf@segfault.boston.devel.redhat.com>

Junxiao Bi <junxiao.bi at oracle.com> writes:

> The unaligned io flag is set in the kiocb when an unaligned
> dio is issued, it should be cleared even when the dio fails,
> or it may affect the following io which are using the same
> kiocb.

What code is re-using kiocbs, much less re-using them without
re-initializing them?

-Jeff


From junxiao.bi at oracle.com  Thu May 31 18:41:52 2012
From: junxiao.bi at oracle.com (Junxiao Bi)
Date: Fri, 01 Jun 2012 09:41:52 +0800
Subject: [Ocfs2-devel] [PATCH 1/2] aio: make kiocb->private NUll in
	init_sync_kiocb()
In-Reply-To: <x49k3zscuk2.fsf@segfault.boston.devel.redhat.com>
References: <1338437550-24499-1-git-send-email-junxiao.bi@oracle.com>
	<x49k3zscuk2.fsf@segfault.boston.devel.redhat.com>
Message-ID: <4FC81DE0.5080403@oracle.com>

On 05/31/2012 10:08 PM, Jeff Moyer wrote:
> Junxiao Bi <junxiao.bi at oracle.com> writes:
>
>> Ocfs2 uses kiocb.*private as a flag of unsigned long size. In
>> commit a11f7e6 ocfs2: serialize unaligned aio, the unaligned
>> io flag is involved in it to serialize the unaligned aio. As
>> *private is not initialized in init_sync_kiocb() of do_sync_write(),
>> this unaligned io flag may be unexpectly set in an aligned dio.
>> And this will cause OCFS2_I(inode)->ip_unaligned_aio decreased
>> to -1 in ocfs2_dio_end_io(), thus the following unaligned dio
>> will hang forever at ocfs2_aiodio_wait() in ocfs2_file_write_iter().
>> We can't initialized this flag in ocfs2_file_write_iter() since
>> it may be invoked several times by do_sync_write(). So we initialize
>> it in init_sync_kiocb(), it's also useful for other similiar use of
>> it in the future.
> I don't see any ocfs2_file_write_iter in the upstream kernel.
> ocfs2_file_aio_write most certainly could set ->private to 0, it
> will only be called once for a given kiocb.
>From sys_io_submit->..->io_submit_one->aio_run_iocb->aio_rw_vect_retry, 
it seems that aio_write could be called two times. See the following
scenario.
1. There is a file opened with direct io flag, in aio_rw_vect_retry,
aio_write is called first time. If the direct io can
not be completed, it will fall back into buffer io, see line 2329 in
aio_write.
2. If the very buffer io is a partial write, then it will return back
to  aio_rw_vect_retry and issue the second aio_write.
>
> That point aside, I have no issues with setting private to NULL in
> init_sync_kiocb.  If you fix up the comment to reflect reality
> w.r.t. the upstream kernel source, I'll ack the patch.
OK, I will fix the comment.
>
> Cheers,
> Jeff


From junxiao.bi at oracle.com  Thu May 31 18:44:25 2012
From: junxiao.bi at oracle.com (Junxiao Bi)
Date: Fri, 01 Jun 2012 09:44:25 +0800
Subject: [Ocfs2-devel] [PATCH 2/2] ocfs2: clear unaligned io flag when
	dio fails
In-Reply-To: <x49fwagcuii.fsf@segfault.boston.devel.redhat.com>
References: <1338437550-24499-1-git-send-email-junxiao.bi@oracle.com>
	<1338437550-24499-2-git-send-email-junxiao.bi@oracle.com>
	<x49fwagcuii.fsf@segfault.boston.devel.redhat.com>
Message-ID: <4FC81E79.1080003@oracle.com>

On 05/31/2012 10:09 PM, Jeff Moyer wrote:
> Junxiao Bi <junxiao.bi at oracle.com> writes:
>
>> The unaligned io flag is set in the kiocb when an unaligned
>> dio is issued, it should be cleared even when the dio fails,
>> or it may affect the following io which are using the same
>> kiocb.
> What code is re-using kiocbs, much less re-using them without
> re-initializing them?
See my comment in another thread. aio_write seems called two times with
the same kiocb.
>
> -Jeff


From akinobu.mita at gmail.com  Sun May 20 06:24:03 2012
From: akinobu.mita at gmail.com (Akinobu Mita)
Date: Sun, 20 May 2012 13:24:03 -0000
Subject: [Ocfs2-devel] [PATCH 01/10] string: introduce memweight
Message-ID: <1337520203-29147-1-git-send-email-akinobu.mita@gmail.com>

memweight() is the function that counts the total number of bits set
in memory area.  The memory area doesn't need to be aligned to
long-word boundary unlike bitmap_weight().

Signed-off-by: Akinobu Mita <akinobu.mita at gmail.com>
Cc: Anders Larsen <al at alarsen.net>
Cc: Alasdair Kergon <agk at redhat.com>
Cc: dm-devel at redhat.com
Cc: linux-fsdevel at vger.kernel.org
Cc: Laurent Pinchart <laurent.pinchart at ideasonboard.com>
Cc: linux-media at vger.kernel.org
Cc: Mark Fasheh <mfasheh at suse.com>
Cc: Joel Becker <jlbec at evilplan.org>
Cc: ocfs2-devel at oss.oracle.com
Cc: Jan Kara <jack at suse.cz>
Cc: linux-ext4 at vger.kernel.org
Cc: Andrew Morton <akpm at linux-foundation.org>
Cc: Andreas Dilger <adilger.kernel at dilger.ca>
Cc: "Theodore Ts'o" <tytso at mit.edu>
---
 include/linux/string.h |    3 +++
 lib/string.c           |   37 +++++++++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 0 deletions(-)

diff --git a/include/linux/string.h b/include/linux/string.h
index e033564..ffe0442 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -145,4 +145,7 @@ static inline bool strstarts(const char *str, const char *prefix)
 	return strncmp(str, prefix, strlen(prefix)) == 0;
 }
 #endif
+
+extern size_t memweight(const void *ptr, size_t bytes);
+
 #endif /* _LINUX_STRING_H_ */
diff --git a/lib/string.c b/lib/string.c
index e5878de..c8b92a0 100644
--- a/lib/string.c
+++ b/lib/string.c
@@ -26,6 +26,7 @@
 #include <linux/export.h>
 #include <linux/bug.h>
 #include <linux/errno.h>
+#include <linux/bitmap.h>
 
 #ifndef __HAVE_ARCH_STRNICMP
 /**
@@ -824,3 +825,39 @@ void *memchr_inv(const void *start, int c, size_t bytes)
 	return check_bytes8(start, value, bytes % 8);
 }
 EXPORT_SYMBOL(memchr_inv);
+
+/**
+ * memweight - count the total number of bits set in memory area
+ * @ptr: pointer to the start of the area (need not be long-word aligned)
+ * @bytes: the size of the area in bytes
+ */
+size_t memweight(const void *ptr, size_t bytes)
+{
+	size_t w = 0;
+	size_t longs;
+	union {
+		const void *ptr;
+		const unsigned char *b;
+		unsigned long address;
+	} bitmap;
+
+	for (bitmap.ptr = ptr; bytes > 0 && bitmap.address % sizeof(long);
+			bytes--, bitmap.address++)
+		w += hweight8(*bitmap.b);
+
+	for (longs = bytes / sizeof(long); longs > 0; ) {
+		size_t bits = min_t(size_t, INT_MAX & ~(BITS_PER_LONG - 1),
+					longs * BITS_PER_LONG);
+
+		w += bitmap_weight(bitmap.ptr, bits);
+		bytes -= bits / BITS_PER_BYTE;
+		bitmap.address += bits / BITS_PER_BYTE;
+		longs -= bits / BITS_PER_LONG;
+	}
+
+	for (; bytes > 0; bytes--, bitmap.address++)
+		w += hweight8(*bitmap.b);
+
+	return w;
+}
+EXPORT_SYMBOL(memweight);
-- 
1.7.7.6


From akinobu.mita at gmail.com  Sun May 20 06:24:09 2012
From: akinobu.mita at gmail.com (Akinobu Mita)
Date: Sun, 20 May 2012 13:24:09 -0000
Subject: [Ocfs2-devel] [PATCH 07/10] ocfs2: use memweight()
In-Reply-To: <1337520203-29147-1-git-send-email-akinobu.mita@gmail.com>
References: <1337520203-29147-1-git-send-email-akinobu.mita@gmail.com>
Message-ID: <1337520203-29147-7-git-send-email-akinobu.mita@gmail.com>

Use memweight to count the total number of bits set in memory area.

Signed-off-by: Akinobu Mita <akinobu.mita at gmail.com>
Cc: Mark Fasheh <mfasheh at suse.com>
Cc: Joel Becker <jlbec at evilplan.org>
Cc: ocfs2-devel at oss.oracle.com
---
 fs/ocfs2/localalloc.c |    8 ++------
 1 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 210c352..a9f78c7 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -784,14 +784,10 @@ bail:
 
 static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
 {
-	int i;
-	u8 *buffer;
-	u32 count = 0;
+	u32 count;
 	struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
 
-	buffer = la->la_bitmap;
-	for (i = 0; i < le16_to_cpu(la->la_size); i++)
-		count += hweight8(buffer[i]);
+	count = memweight(la->la_bitmap, le16_to_cpu(la->la_size));
 
 	trace_ocfs2_local_alloc_count_bits(count);
 	return count;
-- 
1.7.7.6


From akinobu.mita at gmail.com  Wed May 23 05:12:21 2012
From: akinobu.mita at gmail.com (Akinobu Mita)
Date: Wed, 23 May 2012 12:12:21 -0000
Subject: [Ocfs2-devel] [PATCH 01/10] string: introduce memweight
In-Reply-To: <20120523092113.GG10452@quack.suse.cz>
References: <1337520203-29147-1-git-send-email-akinobu.mita@gmail.com>
	<20120523092113.GG10452@quack.suse.cz>
Message-ID: <CAC5umyi=ridqRZGGh0+_xw0-GCN+69B33Qz82-9x4dVODGGx6w@mail.gmail.com>

2012/5/23 Jan Kara <jack at suse.cz>:
> On Sun 20-05-12 22:23:14, Akinobu Mita wrote:
>> memweight() is the function that counts the total number of bits set
>> in memory area.  The memory area doesn't need to be aligned to
>> long-word boundary unlike bitmap_weight().
>  Thanks for the patch. I have some comments below.

Thanks for the review.

>> @@ -824,3 +825,39 @@ void *memchr_inv(const void *start, int c, size_t bytes)
>>       return check_bytes8(start, value, bytes % 8);
>>  }
>>  EXPORT_SYMBOL(memchr_inv);
>> +
>> +/**
>> + * memweight - count the total number of bits set in memory area
>> + * @ptr: pointer to the start of the area
>> + * @bytes: the size of the area
>> + */
>> +size_t memweight(const void *ptr, size_t bytes)
>> +{
>> +     size_t w = 0;
>> +     size_t longs;
>> +     union {
>> +             const void *ptr;
>> +             const unsigned char *b;
>> +             unsigned long address;
>> +     } bitmap;
>  Ugh, this is ugly and mostly unnecessary. Just use "const unsigned char
> *bitmap".
>
>> +
>> +     for (bitmap.ptr = ptr; bytes > 0 && bitmap.address % sizeof(long);
>> +                     bytes--, bitmap.address++)
>> +             w += hweight8(*bitmap.b);
>  This can be:
>        count = ((unsigned long)bitmap) % sizeof(long);

The count should be the size of unaligned area and it can be greater than
bytes. So

        count = min(bytes,
                    sizeof(long) - ((unsigned long)bitmap) % sizeof(long));

>        while (count--) {
>                w += hweight(*bitmap);
>                bitmap++;
>                bytes--;
>        }
>> +
>> +     for (longs = bytes / sizeof(long); longs > 0; ) {
>> +             size_t bits = min_t(size_t, INT_MAX & ~(BITS_PER_LONG - 1),
>> +                                     longs * BITS_PER_LONG);
>  I find it highly unlikely that someone would have such a large bitmap
> (256 MB or more on 32-bit). Also the condition as you wrote it can just
> overflow so it won't have the desired effect. Just do
>        BUG_ON(longs >= ULONG_MAX / BITS_PER_LONG);

The bits argument of bitmap_weight() is int type. So this should be

        BUG_ON(longs >= INT_MAX / BITS_PER_LONG);

> and remove the loop completely. If someone comes with such a huge bitmap,
> the code can be modified easily (after really closely inspecting whether
> such a huge bitmap is really well justified).

/**
 * memweight - count the total number of bits set in memory area
 * @ptr: pointer to the start of the area
 * @bytes: the size of the area
 *
 * Unlike bitmap_weight(), the area does not need to start or end on a
 * long-word boundary.
 *
 * Returns the number of bits set in the @bytes bytes starting at @ptr.
 */
size_t memweight(const void *ptr, size_t bytes)
{
	size_t w = 0;
	size_t longs;
	const unsigned char *bitmap = ptr;

	/* Count byte by byte until the pointer is long-word aligned. */
	for (; bytes > 0 && ((unsigned long)bitmap) % sizeof(long);
			bytes--, bitmap++)
		w += hweight8(*bitmap);

	/*
	 * Hand the aligned run of whole longs to bitmap_weight().  Its bit
	 * count argument is an int, hence the INT_MAX based limit.  The cast
	 * keeps the const qualifier: the caller's buffer is only read.
	 */
	longs = bytes / sizeof(long);
	BUG_ON(longs >= INT_MAX / BITS_PER_LONG);
	w += bitmap_weight((const unsigned long *)bitmap,
			   longs * BITS_PER_LONG);
	bytes -= longs * sizeof(long);
	bitmap += longs * sizeof(long);

	/*
	 * The remaining tail (shorter than one long) is counted byte by byte
	 * rather than passed to bitmap_weight() as a partial word: that
	 * shortcut only gives the right answer on little-endian machines.
	 */
	for (; bytes > 0; bytes--, bitmap++)
		w += hweight8(*bitmap);

	return w;
}


From akinobu.mita at gmail.com  Thu May 24 04:54:21 2012
From: akinobu.mita at gmail.com (Akinobu Mita)
Date: Thu, 24 May 2012 11:54:21 -0000
Subject: [Ocfs2-devel] [PATCH 01/10] string: introduce memweight
In-Reply-To: <20120523131559.GA7064@parisc-linux.org>
References: <1337520203-29147-1-git-send-email-akinobu.mita@gmail.com>
	<20120523092113.GG10452@quack.suse.cz>
	<CAC5umyi=ridqRZGGh0+_xw0-GCN+69B33Qz82-9x4dVODGGx6w@mail.gmail.com>
	<20120523131559.GA7064@parisc-linux.org>
Message-ID: <CAC5umyhPzvAhX8Y-oa5Kr-G7ZAJ15HV_H2HMgxrV2KZUJadsNw@mail.gmail.com>

2012/5/23 Matthew Wilcox <matthew at wil.cx>:
> On Wed, May 23, 2012 at 09:12:18PM +0900, Akinobu Mita wrote:
>> size_t memweight(const void *ptr, size_t bytes)
>
> Why should this return size_t instead of unsigned long?

I just use the same type as the bytes argument without mature
consideration.  If unsigned long is better than size_t, I'll
change the return type.

>> {
>>       size_t w = 0;
>>       size_t longs;
>>       const unsigned char *bitmap = ptr;
>>
>>       for (; bytes > 0 && ((unsigned long)bitmap) % sizeof(long);
>>                       bytes--, bitmap++)
>>               w += hweight8(*bitmap);
>>
>>       longs = bytes / sizeof(long);
>>       BUG_ON(longs >= INT_MAX / BITS_PER_LONG);
>>       w += bitmap_weight((unsigned long *)bitmap, longs * BITS_PER_LONG);
>>       bytes -= longs * sizeof(long);
>>       bitmap += longs * sizeof(long);
>>
>>       for (; bytes > 0; bytes--, bitmap++)
>>               w += hweight8(*bitmap);
>>
>>       return w;
>> }
>
> bitmap_weight copes with a bitmask that isn't a multiple of BITS_PER_LONG
> in size already.  So I think this can be done as:
>
> unsigned long memweight(const void *s, size_t n)
> {
>        const unsigned char *ptr = s;
>        unsigned long r = 0;
>
>        while (n > 0 && (unsigned long)ptr % sizeof(long)) {
>                r += hweight8(*ptr);
>                n--;
>                ptr++;
>        }
>
>        BUG_ON(n >= INT_MAX / 8)
>
>        return r + bitmap_weight((unsigned long *)ptr, n * 8);
> }

This works perfectly on little-endian machines.  But it doesn't work
on big-endian machines, if the bottom edge of memory area is not
aligned on long word boundary.