From srinivas.eeda at oracle.com Mon May 7 16:21:28 2012 From: srinivas.eeda at oracle.com (Srinivas Eeda) Date: Mon, 7 May 2012 16:21:28 -0700 Subject: [Ocfs2-devel] [PATCH 1/3] ocfs2: new structure to implement discontiguous local alloc bitmap In-Reply-To: <1336432890-18638-1-git-send-email-srinivas.eeda@oracle.com> References: <1336432890-18638-1-git-send-email-srinivas.eeda@oracle.com> Message-ID: <1336432890-18638-2-git-send-email-srinivas.eeda@oracle.com> The current local alloc handles a single contiguous free chunk of clusters. This patch enhances local alloc to handle discontiguous free chunks. It adds a new ocfs2_local_alloc_rec structure which tracks a single contiguous free chunk. An array of these sits in the bitmap itself and tracks the discontiguous chunks. In the best case there is only one record; the number of records grows as the filesystem gets fragmented. The number of records at any one time is limited by the size of the bitmap, and the maximum is defined by OCFS2_MAX_LOCAL_ALLOC_REC_LIMIT. Signed-off-by: Srinivas Eeda --- fs/ocfs2/localalloc.c | 10 ++++++++++ fs/ocfs2/ocfs2.h | 8 ++++++++ fs/ocfs2/ocfs2_fs.h | 48 ++++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 60 insertions(+), 6 deletions(-) diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 210c352..4190e53 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -48,6 +48,16 @@ #define OCFS2_LOCAL_ALLOC(dinode) (&((dinode)->id2.i_lab)) +#define OCFS2_LOCAL_ALLOC_REC_SZ(la) (le16_to_cpu(la->la_rec_count) *\ + sizeof(struct ocfs2_local_alloc_rec)) +#define OCFS2_LOCAL_ALLOC_BITMAP(la) ((char *)(&(la->la_recs)) +\ + OCFS2_LOCAL_ALLOC_REC_SZ(la)) +#define OCFS2_LOCAL_ALLOC_BITS_PER_REC (sizeof(struct ocfs2_local_alloc_rec)*8) + +/* Maximum number of local alloc records */ +#define OCFS2_MAX_LOCAL_ALLOC_REC_LIMIT 128 + + static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc); static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, diff --git a/fs/ocfs2/ocfs2.h 
b/fs/ocfs2/ocfs2.h index d355e6e..d4c36d2 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -367,6 +367,7 @@ struct ocfs2_super * by osb_lock */ struct buffer_head *local_alloc_bh; + struct inode *local_alloc_inode; u64 la_last_gd; @@ -522,6 +523,13 @@ static inline int ocfs2_supports_discontig_bg(struct ocfs2_super *osb) return 0; } +static inline int ocfs2_supports_discontig_la(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_LA) + return 1; + return 0; +} + static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb) { if (ocfs2_supports_indexed_dirs(osb)) diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 938387a..6a0fe02 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -102,7 +102,8 @@ | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \ | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \ | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG \ - | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO) + | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO \ + | OCFS2_FEATURE_INCOMPAT_DISCONTIG_LA) #define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) @@ -177,6 +178,9 @@ */ #define OCFS2_FEATURE_INCOMPAT_CLUSTERINFO 0x4000 +/* Discontiguous local alloc */ +#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_LA 0x8000 + /* * backup superblock flag is used to indicate that this volume * has backup superblocks. @@ -664,14 +668,19 @@ struct ocfs2_super_block { * Local allocation bitmap for OCFS2 slots * Note that it exists inside an ocfs2_dinode, so all offsets are * relative to the start of ocfs2_dinode.id2. + * Each ocfs2_local_alloc_rec tracks one contigous chunk of clusters. 
*/ +struct ocfs2_local_alloc_rec { + __le32 la_start; /* 1st cluster in this extent */ + __le32 la_clusters; /* Number of contiguous clusters */ +}; + struct ocfs2_local_alloc { /*00*/ __le32 la_bm_off; /* Starting bit offset in main bitmap */ __le16 la_size; /* Size of included bitmap, in bytes */ - __le16 la_reserved1; - __le64 la_reserved2; -/*10*/ __u8 la_bitmap[0]; + __le16 la_rec_count; /* Number of discontiguous records */ + struct ocfs2_local_alloc_rec la_recs[0]; /* Localalloc records */ }; /* @@ -1380,11 +1389,24 @@ static inline u16 ocfs2_local_alloc_size(struct super_block *sb) u16 size; size = sb->s_blocksize - - offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap); + offsetof(struct ocfs2_dinode, id2.i_lab.la_recs); + size -= sizeof(struct ocfs2_local_alloc_rec); return size; } +/* effectively this is also the bitmap size */ +static inline u32 ocfs2_local_alloc_cluster_count(struct ocfs2_local_alloc *la) +{ + u32 i, clusters; + + clusters = 0; + for (i = 0; i < le16_to_cpu(la->la_rec_count); i++) + clusters += le32_to_cpu(la->la_recs[i].la_clusters); + + return clusters; +} + static inline int ocfs2_group_bitmap_size(struct super_block *sb, int suballocator, u32 feature_incompat) @@ -1528,11 +1550,25 @@ static inline int ocfs2_local_alloc_size(int blocksize) int size; size = blocksize - - offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap); + offsetof(struct ocfs2_dinode, id2.i_lab.la_recs); + size -= sizeof(struct ocfs2_local_alloc_rec); return size; } +/* effectively this is also the bitmap size */ +static inline uint32_t +ocfs2_local_alloc_cluster_count(struct ocfs2_local_alloc *la) +{ + uint32_t i, clusters; + + clusters = 0; + for (i = 0; i < le16_to_cpu(la->la_rec_count); i++) + clusters += le32_to_cpu(la->la_recs[i].la_clusters); + + return clusters; +} + static inline int ocfs2_group_bitmap_size(int blocksize, int suballocator, uint32_t feature_incompat) -- 1.5.4.3 From srinivas.eeda at oracle.com Mon May 7 16:21:29 2012 From: srinivas.eeda at 
oracle.com (Srinivas Eeda) Date: Mon, 7 May 2012 16:21:29 -0700 Subject: [Ocfs2-devel] [PATCH 2/3] ocfs2: implement discontiguous localalloc bitmap In-Reply-To: <1336432890-18638-1-git-send-email-srinivas.eeda@oracle.com> References: <1336432890-18638-1-git-send-email-srinivas.eeda@oracle.com> Message-ID: <1336432890-18638-3-git-send-email-srinivas.eeda@oracle.com> This patch adds supporting functions and modifies localalloc code to implement discontiguous localalloc bitmap. Signed-off-by: Srinivas Eeda --- fs/ocfs2/localalloc.c | 523 ++++++++++++++++++++++++++++++++----------------- 1 files changed, 342 insertions(+), 181 deletions(-) diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 4190e53..f63381e 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -48,6 +48,9 @@ #define OCFS2_LOCAL_ALLOC(dinode) (&((dinode)->id2.i_lab)) +/* defines minimum contiguous required */ +#define OCFS2_LOCAL_ALLOC_MIN_BITS 2 + #define OCFS2_LOCAL_ALLOC_REC_SZ(la) (le16_to_cpu(la->la_rec_count) *\ sizeof(struct ocfs2_local_alloc_rec)) #define OCFS2_LOCAL_ALLOC_BITMAP(la) ((char *)(&(la->la_recs)) +\ @@ -58,7 +61,8 @@ #define OCFS2_MAX_LOCAL_ALLOC_REC_LIMIT 128 -static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc); +static u32 ocfs2_local_alloc_count_bits(struct ocfs2_super *osb, + struct ocfs2_dinode *alloc); static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, struct ocfs2_dinode *alloc, @@ -82,8 +86,7 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, handle_t *handle, struct ocfs2_alloc_context *ac); -static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, - struct inode *local_alloc_inode); +static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb); /* * ocfs2_la_default_mb() - determine a default size, in megabytes of @@ -202,6 +205,74 @@ unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb) return la_mb; } +static u32 ocfs2_local_bitmap_to_cluster(struct ocfs2_local_alloc *la, u32 
bit) +{ + u32 start, prev, offset; + int rec; + + rec = start = prev = 0; + for (rec = 0; rec < le16_to_cpu(la->la_rec_count); rec++) { + prev = start; + start += le32_to_cpu(la->la_recs[rec].la_clusters); + if (bit < start) + break; + } + offset = le32_to_cpu(la->la_recs[rec].la_start) + (bit - prev); + + return offset; +} + +/* + * This function is called before allocating a new chunk for the localalloc + * bitmap to make sure there is enough space in the bitmap for the new record + */ +static u32 ocfs2_local_alloc_adjust_bits_wanted(struct ocfs2_local_alloc *la, + struct ocfs2_alloc_context *ac) +{ + u32 required, available, cluster_cnt; + + if (ac->ac_bits_given == ac->ac_bits_wanted) + return 0; + + /* total bits available in bitmap */ + available = le16_to_cpu(la->la_size) << 3; + cluster_cnt = ocfs2_local_alloc_cluster_count(la); + + /* + * Wanted shouldn't be greater than bitmap size and given should be + * equal to cluster count + */ + BUG_ON(ac->ac_bits_given > ac->ac_bits_wanted); + BUG_ON(ac->ac_bits_wanted > available); + BUG_ON(ac->ac_bits_given != cluster_cnt); + + /* reduce bits taken by each record structure */ + available -= (le16_to_cpu(la->la_rec_count) * + OCFS2_LOCAL_ALLOC_BITS_PER_REC); + + /* reduce space reserved for bitmap for already allocated clusters */ + available -= cluster_cnt; + + /* if available bits are not enough to fit a new record return 0 */ + if (available < (OCFS2_LOCAL_ALLOC_BITS_PER_REC + 1)) + return 0; + + /* Adjust space that will be consumed by new record structure */ + available -= OCFS2_LOCAL_ALLOC_BITS_PER_REC; + + required = ac->ac_bits_wanted - ac->ac_bits_given; + + /* + * we can't allocate clusters more than the bits available. 
Adjust + * bits wanted + */ + if (required > available) { + ac->ac_bits_wanted = ac->ac_bits_given + available; + return available; + } else + return required; +} + void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb) { struct super_block *sb = osb->sb; @@ -239,12 +310,14 @@ void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb, unsigned int num_clusters) { spin_lock(&osb->osb_lock); - if (osb->local_alloc_state == OCFS2_LA_DISABLED || - osb->local_alloc_state == OCFS2_LA_THROTTLED) - if (num_clusters >= osb->local_alloc_default_bits) { - cancel_delayed_work(&osb->la_enable_wq); + if (osb->local_alloc_state == OCFS2_LA_DISABLED) { + cancel_delayed_work(&osb->la_enable_wq); + if (num_clusters >= osb->local_alloc_bits) + osb->local_alloc_state = OCFS2_LA_THROTTLED; + + if (num_clusters >= osb->local_alloc_default_bits) osb->local_alloc_state = OCFS2_LA_ENABLED; - } + } spin_unlock(&osb->osb_lock); } @@ -280,7 +353,7 @@ int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits) * a new block group. We want to be sure block group * allocations go through the local alloc, so allow an * allocation to take up to half the bitmap. */ - if (bits > (la_bits / 2)) + if ((la_bits > OCFS2_LOCAL_ALLOC_MIN_BITS) && (bits > (la_bits / 2))) goto bail; ret = 1; @@ -348,21 +421,21 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) } /* do a little verification. */ - num_used = ocfs2_local_alloc_count_bits(alloc); + num_used = ocfs2_local_alloc_count_bits(osb, alloc); /* hopefully the local alloc has always been recovered before * we load it. 
*/ if (num_used || alloc->id1.bitmap1.i_used || alloc->id1.bitmap1.i_total - || la->la_bm_off) + || la->la_rec_count) mlog(ML_ERROR, "Local alloc hasn't been recovered!\n" - "found = %u, set = %u, taken = %u, off = %u\n", + "found = %u, set = %u, taken = %u\n", num_used, le32_to_cpu(alloc->id1.bitmap1.i_used), - le32_to_cpu(alloc->id1.bitmap1.i_total), - OCFS2_LOCAL_ALLOC(alloc)->la_bm_off); + le32_to_cpu(alloc->id1.bitmap1.i_total)); - osb->local_alloc_bh = alloc_bh; + osb->local_alloc_bh = alloc_bh; + osb->local_alloc_inode = inode; osb->local_alloc_state = OCFS2_LA_ENABLED; bail: @@ -389,7 +462,6 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) { int status; handle_t *handle; - struct inode *local_alloc_inode = NULL; struct buffer_head *bh = NULL; struct buffer_head *main_bm_bh = NULL; struct inode *main_bm_inode = NULL; @@ -402,16 +474,6 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) if (osb->local_alloc_state == OCFS2_LA_UNUSED) goto out; - local_alloc_inode = - ocfs2_get_system_file_inode(osb, - LOCAL_ALLOC_SYSTEM_INODE, - osb->slot_num); - if (!local_alloc_inode) { - status = -ENOENT; - mlog_errno(status); - goto out; - } - osb->local_alloc_state = OCFS2_LA_DISABLED; ocfs2_resmap_uninit(&osb->osb_la_resmap); @@ -451,13 +513,19 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) } memcpy(alloc_copy, alloc, bh->b_size); - status = ocfs2_journal_access_di(handle, INODE_CACHE(local_alloc_inode), + status = ocfs2_journal_access_di(handle, + INODE_CACHE(osb->local_alloc_inode), bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto out_commit; } + status = ocfs2_sync_local_to_main(osb, handle, alloc_copy, + main_bm_inode, main_bm_bh); + if (status < 0) + mlog_errno(status); + ocfs2_clear_local_alloc(alloc); ocfs2_journal_dirty(handle, bh); @@ -465,11 +533,6 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) osb->local_alloc_bh = NULL; osb->local_alloc_state = OCFS2_LA_UNUSED; - status = 
ocfs2_sync_local_to_main(osb, handle, alloc_copy, - main_bm_inode, main_bm_bh); - if (status < 0) - mlog_errno(status); - out_commit: ocfs2_commit_trans(osb, handle); @@ -483,9 +546,6 @@ out_mutex: iput(main_bm_inode); out: - if (local_alloc_inode) - iput(local_alloc_inode); - if (alloc_copy) kfree(alloc_copy); } @@ -641,22 +701,11 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, { int status; struct ocfs2_dinode *alloc; - struct inode *local_alloc_inode; unsigned int free_bits; BUG_ON(!ac); - local_alloc_inode = - ocfs2_get_system_file_inode(osb, - LOCAL_ALLOC_SYSTEM_INODE, - osb->slot_num); - if (!local_alloc_inode) { - status = -ENOENT; - mlog_errno(status); - goto bail; - } - - mutex_lock(&local_alloc_inode->i_mutex); + mutex_lock(&osb->local_alloc_inode->i_mutex); /* * We must double check state and allocator bits because @@ -675,12 +724,12 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, #ifdef CONFIG_OCFS2_DEBUG_FS if (le32_to_cpu(alloc->id1.bitmap1.i_used) != - ocfs2_local_alloc_count_bits(alloc)) { + ocfs2_local_alloc_count_bits(osb, alloc)) { ocfs2_error(osb->sb, "local alloc inode %llu says it has " "%u free bits, but a count shows %u", (unsigned long long)le64_to_cpu(alloc->i_blkno), le32_to_cpu(alloc->id1.bitmap1.i_used), - ocfs2_local_alloc_count_bits(alloc)); + ocfs2_local_alloc_count_bits(osb, alloc)); status = -EIO; goto bail; } @@ -690,8 +739,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, le32_to_cpu(alloc->id1.bitmap1.i_used); if (bits_wanted > free_bits) { /* uhoh, window change time. 
*/ - status = - ocfs2_local_alloc_slide_window(osb, local_alloc_inode); + status = ocfs2_local_alloc_slide_window(osb); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -714,7 +762,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, goto bail; } - ac->ac_inode = local_alloc_inode; + ac->ac_inode = osb->local_alloc_inode; /* We should never use localalloc from another slot */ ac->ac_alloc_slot = osb->slot_num; ac->ac_which = OCFS2_AC_USE_LOCAL; @@ -722,9 +770,8 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, ac->ac_bh = osb->local_alloc_bh; status = 0; bail: - if (status < 0 && local_alloc_inode) { - mutex_unlock(&local_alloc_inode->i_mutex); - iput(local_alloc_inode); + if (status < 0 && osb->local_alloc_inode) { + mutex_unlock(&osb->local_alloc_inode->i_mutex); } trace_ocfs2_reserve_local_alloc_bits( @@ -745,7 +792,7 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, { int status, start; struct inode *local_alloc_inode; - void *bitmap; + u8 *bitmap; struct ocfs2_dinode *alloc; struct ocfs2_local_alloc *la; @@ -764,8 +811,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, goto bail; } - bitmap = la->la_bitmap; - *bit_off = le32_to_cpu(la->la_bm_off) + start; + bitmap = OCFS2_LOCAL_ALLOC_BITMAP(la); + *bit_off = ocfs2_local_bitmap_to_cluster(la, start); *num_bits = bits_wanted; status = ocfs2_journal_access_di(handle, @@ -792,16 +839,29 @@ bail: return status; } -static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc) +static u32 ocfs2_local_alloc_count_bits(struct ocfs2_super *osb, + struct ocfs2_dinode *alloc) { int i; - u8 *buffer; + u8 *bitmap; u32 count = 0; struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); - buffer = la->la_bitmap; - for (i = 0; i < le16_to_cpu(la->la_size); i++) - count += hweight8(buffer[i]); + /* + * if discontig is not enabled then lets update the first localalloc + * record with the current bitmap block info. 
We are doing this because + * old disk formats are not aware of the records. + */ + if (!ocfs2_supports_discontig_la(osb) && la->la_bm_off) { + la->la_rec_count = cpu_to_le16(1); + la->la_recs[0].la_start = la->la_bm_off; + la->la_recs[0].la_clusters = alloc->id1.bitmap1.i_total; + } + + bitmap = OCFS2_LOCAL_ALLOC_BITMAP(la); + for (i = 0; i < le32_to_cpu(alloc->id1.bitmap1.i_total); i++) + if (ocfs2_test_bit(i, bitmap)) + count++; trace_ocfs2_local_alloc_count_bits(count); return count; @@ -812,10 +872,11 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, u32 *numbits, struct ocfs2_alloc_reservation *resv) { - int numfound, bitoff, left, startoff, lastzero; - int local_resv = 0; + int numfound, bitoff, left, startoff; + int i, local_resv = 0; struct ocfs2_alloc_reservation r; - void *bitmap = NULL; + struct ocfs2_local_alloc *la; + u8 *bitmap = NULL; struct ocfs2_reservation_map *resmap = &osb->osb_la_resmap; if (!alloc->id1.bitmap1.i_total) { @@ -847,37 +908,44 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, * Reservations are disabled. Handle this the old way. */ - bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap; + la = OCFS2_LOCAL_ALLOC(alloc); + bitmap = OCFS2_LOCAL_ALLOC_BITMAP(la); - numfound = bitoff = startoff = 0; - lastzero = -1; - left = le32_to_cpu(alloc->id1.bitmap1.i_total); - while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff)) != -1) { - if (bitoff == left) { - /* mlog(0, "bitoff (%d) == left", bitoff); */ - break; - } - /* mlog(0, "Found a zero: bitoff = %d, startoff = %d, " - "numfound = %d\n", bitoff, startoff, numfound);*/ - - /* Ok, we found a zero bit... is it contig. 
or do we - * start over?*/ - if (bitoff == startoff) { - /* we found a zero */ - numfound++; - startoff++; - } else { - /* got a zero after some ones */ - numfound = 1; - startoff = bitoff+1; - } - /* we got everything we needed */ - if (numfound == *numbits) { - /* mlog(0, "Found it all!\n"); */ - break; + left = numfound = bitoff = startoff = 0; + for (i = 0; i < le16_to_cpu(la->la_rec_count); i++) { + + numfound = 0; + startoff += left; + left = le32_to_cpu(la->la_recs[i].la_clusters); + + while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, + startoff)) != -1) { + if (bitoff == left) { + /* mlog(0, "bitoff (%d) == left", bitoff); */ + break; + } + /* mlog(0, "Found a zero: bitoff = %d, startoff = %d, " + * "numfound = %d\n", bitoff, startoff, numfound);*/ + + /* Ok, we found a zero bit... is it contig. or do we + * start over?*/ + if (bitoff == startoff) { + /* we found a zero */ + numfound++; + startoff++; + } else { + /* got a zero after some ones */ + numfound = 1; + startoff = bitoff+1; + } + /* we got everything we needed */ + if (numfound == *numbits) { + /* mlog(0, "Found it all!\n"); */ + goto out; + } } } - +out: trace_ocfs2_local_alloc_find_clear_bits_search_bitmap(bitoff, numfound); if (numfound == *numbits) @@ -900,12 +968,18 @@ static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc) { struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); int i; + u8 *bitmap; alloc->id1.bitmap1.i_total = 0; alloc->id1.bitmap1.i_used = 0; + la->la_rec_count = 0; la->la_bm_off = 0; + + /* We reset the rec count so following will clear records as well */ + bitmap = OCFS2_LOCAL_ALLOC_BITMAP(la); + bitmap += sizeof(struct ocfs2_local_alloc); for(i = 0; i < le16_to_cpu(la->la_size); i++) - la->la_bitmap[i] = 0; + bitmap[i] = 0; } #if 0 @@ -933,17 +1007,64 @@ static void ocfs2_verify_zero_bits(unsigned long *bitmap, * assumes you've already locked the main bitmap -- the bitmap inode * passed is used for caching. 
*/ +static int ocfs2_sync_local_rec_to_main(struct ocfs2_super *osb, + handle_t *handle, + struct ocfs2_dinode *alloc, + struct inode *main_bm_inode, + struct buffer_head *main_bm_bh, + u8 *bitmap, u64 la_start_blk, + int start, int left) +{ + int bit_off = 0, status = 0, prev, count; + u64 blkno; + + prev = start; + count = 0; + while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, + start)) != -1) { + if ((bit_off < left) && (bit_off == start)) { + count++; + start++; + continue; + } + if (count) { + blkno = la_start_blk + + ocfs2_clusters_to_blocks(osb->sb, + (start - prev) - count); + mlog(0, "\nfreeing %u bits starting at local " + "alloc bit %u (la_start_blk = %llu, " + "blkno = %llu)\n", + count, ((start - prev) - count), + (unsigned long long)la_start_blk, + (unsigned long long)blkno); + status = ocfs2_release_clusters(handle, main_bm_inode, + main_bm_bh, blkno, + count); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + if (bit_off >= left) + break; + count = 1; + start = bit_off + 1; + } +bail: + return status; +} + static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, handle_t *handle, struct ocfs2_dinode *alloc, struct inode *main_bm_inode, struct buffer_head *main_bm_bh) { - int status = 0; - int bit_off, left, count, start; + int i, status = 0; + int total, start, rec_cnt, credits; + u32 clusters; u64 la_start_blk; - u64 blkno; - void *bitmap; + u8 *bitmap; struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); trace_ocfs2_sync_local_to_main( @@ -954,49 +1075,58 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, goto bail; } + /* if all bits are used nothing to sync, just return */ if (le32_to_cpu(alloc->id1.bitmap1.i_used) == le32_to_cpu(alloc->id1.bitmap1.i_total)) { goto bail; } - la_start_blk = ocfs2_clusters_to_blocks(osb->sb, - le32_to_cpu(la->la_bm_off)); - bitmap = la->la_bitmap; - start = count = bit_off = 0; - left = le32_to_cpu(alloc->id1.bitmap1.i_total); + bitmap = OCFS2_LOCAL_ALLOC_BITMAP(la); + 
rec_cnt = le16_to_cpu(la->la_rec_count) - 1; - while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, start)) - != -1) { - if ((bit_off < left) && (bit_off == start)) { - count++; - start++; - continue; - } - if (count) { - blkno = la_start_blk + - ocfs2_clusters_to_blocks(osb->sb, - start - count); + for (i = rec_cnt; i >= 0 ; i--) { + la_start_blk = ocfs2_clusters_to_blocks(osb->sb, + le32_to_cpu(la->la_recs[i].la_start)); - trace_ocfs2_sync_local_to_main_free( - count, start - count, - (unsigned long long)la_start_blk, - (unsigned long long)blkno); + total = le32_to_cpu(alloc->id1.bitmap1.i_total); + clusters = le32_to_cpu(la->la_recs[i].la_clusters); + start = total - clusters; - status = ocfs2_release_clusters(handle, - main_bm_inode, - main_bm_bh, blkno, - count); + status = ocfs2_sync_local_rec_to_main(osb, handle, alloc, + main_bm_inode, + main_bm_bh, bitmap, + la_start_blk, start, + total); + if (status < 0) { + mlog_errno(status); + goto bail; + } + la->la_bm_off = 0; + la->la_recs[i].la_start = 0; + la->la_recs[i].la_clusters = 0; + le16_add_cpu(&la->la_rec_count, -1); + le32_add_cpu(&alloc->id1.bitmap1.i_total, -clusters); + + ocfs2_journal_dirty(handle, osb->local_alloc_bh); + + /* if we need more credits extend the transaction */ + credits = OCFS2_WINDOW_MOVE_CREDITS - handle->h_buffer_credits; + if (credits > 0) { + status = ocfs2_extend_trans(handle, credits); + if (status < 0) { + mlog_errno(status); + goto bail; + } + status = ocfs2_journal_access_di(handle, + INODE_CACHE(osb->local_alloc_inode), + osb->local_alloc_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; } } - if (bit_off >= left) - break; - count = 1; - start = bit_off + 1; } - bail: if (status) mlog_errno(status); @@ -1046,9 +1176,12 @@ static int ocfs2_recalc_la_window(struct ocfs2_super *osb, * We ran out of contiguous space in the primary * bitmap. Drastically reduce the number of bits used * by local alloc until we have to disable it. 
+ * In general we will be seeing atleast few contiguous free + * bits. It should be ok to keep local alloc enabled even + * in extreme case where max available contiguous free bit is 1 */ bits = osb->local_alloc_bits >> 1; - if (bits > ocfs2_megabytes_to_clusters(osb->sb, 1)) { + if (bits) { /* * By setting state to THROTTLED, we'll keep * the number of local alloc bits used down @@ -1096,8 +1229,9 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb, goto bail; } + osb->local_alloc_bits = osb->local_alloc_default_bits; retry_enospc: - (*ac)->ac_bits_wanted = osb->local_alloc_default_bits; + (*ac)->ac_bits_wanted = osb->local_alloc_bits; status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); if (status == -ENOSPC) { if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) == @@ -1137,9 +1271,11 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, struct ocfs2_alloc_context *ac) { int status = 0; - u32 cluster_off, cluster_count; + u32 wanted, cluster_off, cluster_count; struct ocfs2_dinode *alloc = NULL; struct ocfs2_local_alloc *la; + u8 *bitmap; + int i, rec_cnt, credits; alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; la = OCFS2_LOCAL_ALLOC(alloc); @@ -1156,72 +1292,97 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, /* we used the generic suballoc reserve function, but we set * everything up nicely, so there's no reason why we can't use * the more specific cluster api to claim bits. */ - status = ocfs2_claim_clusters(handle, ac, osb->local_alloc_bits, - &cluster_off, &cluster_count); - if (status == -ENOSPC) { -retry_enospc: - /* - * Note: We could also try syncing the journal here to - * allow use of any free bits which the current - * transaction can't give us access to. 
--Mark - */ - if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_FRAGMENTED) == - OCFS2_LA_DISABLED) - goto bail; - - ac->ac_bits_wanted = osb->local_alloc_default_bits; - status = ocfs2_claim_clusters(handle, ac, - osb->local_alloc_bits, - &cluster_off, + rec_cnt = 0; + wanted = osb->local_alloc_bits; + while (1) { + status = ocfs2_claim_clusters(handle, ac, wanted, &cluster_off, &cluster_count); - if (status == -ENOSPC) - goto retry_enospc; - /* - * We only shrunk the *minimum* number of in our - * request - it's entirely possible that the allocator - * might give us more than we asked for. - */ - if (status == 0) { - spin_lock(&osb->osb_lock); - osb->local_alloc_bits = cluster_count; - spin_unlock(&osb->osb_lock); + if (status == -ENOSPC) { + /* reduce window size and retry */ + if (ocfs2_recalc_la_window(osb, + OCFS2_LA_EVENT_FRAGMENTED) == OCFS2_LA_DISABLED) + break; + wanted = osb->local_alloc_bits; + continue; + } else if (status < 0) + break; + + BUG_ON(ac->ac_bits_given > ac->ac_bits_wanted); + + /* found a window */ + la->la_recs[rec_cnt].la_start = cpu_to_le32(cluster_off); + la->la_recs[rec_cnt].la_clusters = cpu_to_le32(cluster_count); + rec_cnt++; + la->la_rec_count = cpu_to_le16(rec_cnt); + le32_add_cpu(&alloc->id1.bitmap1.i_total, cluster_count); + + ocfs2_journal_dirty(handle, osb->local_alloc_bh); + + if (!ocfs2_supports_discontig_la(osb)) { + la->la_bm_off = cpu_to_le32(cluster_off); + break; + } + + /* exit if we can't fit another record */ + wanted = ocfs2_local_alloc_adjust_bits_wanted(la, ac); + if (!wanted) + break; + + if (wanted > osb->local_alloc_bits) + wanted = osb->local_alloc_bits; + + /* if we need more credits extend the transaction */ + if (rec_cnt >= OCFS2_MAX_LOCAL_ALLOC_REC_LIMIT) + break; + + credits = OCFS2_WINDOW_MOVE_CREDITS - handle->h_buffer_credits; + if (credits > 0) { + status = ocfs2_extend_trans(handle, credits); + if (status < 0) { + mlog_errno(status); + goto bail; + } + status = ocfs2_journal_access_di(handle, + 
INODE_CACHE(osb->local_alloc_inode), + osb->local_alloc_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } } } - if (status < 0) { - if (status != -ENOSPC) - mlog_errno(status); + if (!rec_cnt) goto bail; - } + osb->local_alloc_state = OCFS2_LA_ENABLED; + spin_lock(&osb->osb_lock); + if (cluster_count > osb->local_alloc_bits) + osb->local_alloc_bits = cluster_count; + spin_unlock(&osb->osb_lock); osb->la_last_gd = ac->ac_last_group; - la->la_bm_off = cpu_to_le32(cluster_off); - alloc->id1.bitmap1.i_total = cpu_to_le32(cluster_count); - /* just in case... In the future when we find space ourselves, - * we don't have to get all contiguous -- but we'll have to - * set all previously used bits in bitmap and update - * la_bits_set before setting the bits in the main bitmap. */ - alloc->id1.bitmap1.i_used = 0; - memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0, - le16_to_cpu(la->la_size)); - - ocfs2_resmap_restart(&osb->osb_la_resmap, cluster_count, - OCFS2_LOCAL_ALLOC(alloc)->la_bitmap); + bitmap = OCFS2_LOCAL_ALLOC_BITMAP(la); + ocfs2_resmap_restart(&osb->osb_la_resmap, rec_cnt, + alloc->id1.bitmap1.i_total, bitmap); + for (i = 0; i < rec_cnt; i++) + ocfs2_resmap_set_ext(&osb->osb_la_resmap, i, + le32_to_cpu(la->la_recs[i].la_clusters)); - trace_ocfs2_local_alloc_new_window_result( - OCFS2_LOCAL_ALLOC(alloc)->la_bm_off, + trace_ocfs2_local_alloc_new_window_result + (OCFS2_LOCAL_ALLOC(alloc)->la_recs[0].la_start, le32_to_cpu(alloc->id1.bitmap1.i_total)); bail: - if (status) + if ((status < 0) && (status != -ENOSPC)) mlog_errno(status); + return status; } /* Note that we do *NOT* lock the local alloc inode here as * it's been locked already for us. 
*/ -static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, - struct inode *local_alloc_inode) +static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb) { int status = 0; struct buffer_head *main_bm_bh = NULL; @@ -1268,7 +1429,7 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size); status = ocfs2_journal_access_di(handle, - INODE_CACHE(local_alloc_inode), + INODE_CACHE(osb->local_alloc_inode), osb->local_alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { -- 1.5.4.3 From srinivas.eeda at oracle.com Mon May 7 16:21:30 2012 From: srinivas.eeda at oracle.com (Srinivas Eeda) Date: Mon, 7 May 2012 16:21:30 -0700 Subject: [Ocfs2-devel] [PATCH 3/3] ocfs2: modify reservation code to support discontiguous localalloc In-Reply-To: <1336432890-18638-1-git-send-email-srinivas.eeda@oracle.com> References: <1336432890-18638-1-git-send-email-srinivas.eeda@oracle.com> Message-ID: <1336432890-18638-4-git-send-email-srinivas.eeda@oracle.com> Currently the reservation code assumes that the bitmap given to it is one contiguous chunk. This patch enhances it to handle discontiguous chunks. It adds new fields m_bitmap_ext_cnt and m_bitmap_ext_arr. m_bitmap_ext_arr tracks the size of each contiguous extent of the bitmap, and m_bitmap_ext_cnt tracks the number of entries in m_bitmap_ext_arr. 
Signed-off-by: Srinivas Eeda --- fs/ocfs2/reservations.c | 41 ++++++++++++++++++++++++++++++++++------- fs/ocfs2/reservations.h | 7 ++++++- 2 files changed, 40 insertions(+), 8 deletions(-) diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c index 41ffd36..fea93d7 100644 --- a/fs/ocfs2/reservations.c +++ b/fs/ocfs2/reservations.c @@ -291,7 +291,15 @@ static void ocfs2_resmap_clear_all_resv(struct ocfs2_reservation_map *resmap) } } -void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap, +void ocfs2_resmap_set_ext(struct ocfs2_reservation_map *resmap, int arr, u32 sz) +{ + if (ocfs2_resmap_disabled(resmap)) + return; + + resmap->m_bitmap_ext_arr[arr] = sz; +} + +void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap, u32 ext_cnt, unsigned int clen, char *disk_bitmap) { if (ocfs2_resmap_disabled(resmap)) @@ -300,9 +308,21 @@ void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap, spin_lock(&resv_lock); ocfs2_resmap_clear_all_resv(resmap); + + /* free existing extent array */ + if (resmap->m_bitmap_ext_arr) + kfree(resmap->m_bitmap_ext_arr); + resmap->m_bitmap_len = clen; resmap->m_disk_bitmap = disk_bitmap; + resmap->m_bitmap_ext_cnt = ext_cnt; + resmap->m_bitmap_ext_arr = kmalloc((sizeof(u32) * ext_cnt), GFP_NOFS); + if (!resmap->m_bitmap_ext_arr) { + mlog_errno(-ENOMEM); + resmap->m_osb->osb_resv_level = 0; + } + spin_unlock(&resv_lock); } @@ -419,20 +439,26 @@ static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap, unsigned int *rlen) { void *bitmap = resmap->m_disk_bitmap; - unsigned int best_start, best_len = 0; + unsigned int best_start, len, ext, best_len = 0; int offset, start, found; trace_ocfs2_resmap_find_free_bits_begin(search_start, search_len, wanted, resmap->m_bitmap_len); - found = best_start = best_len = 0; - + found = best_start = best_len = ext = 0; start = search_start; + len = resmap->m_bitmap_ext_arr[ext++]; while ((offset = ocfs2_find_next_zero_bit(bitmap, resmap->m_bitmap_len, - start)) 
!= -1) { + start)) != -1) { /* Search reached end of the region */ if (offset >= (search_start + search_len)) - break; + goto out; + + if (offset >= len) { + len += resmap->m_bitmap_ext_arr[ext]; + found = 1; + start = offset + 1; + } if (offset == start) { /* we found a zero */ @@ -450,9 +476,10 @@ static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap, } if (found >= wanted) - break; + goto out; } +out: if (best_len == 0) return 0; diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h index 42c2b80..bb5e94f 100644 --- a/fs/ocfs2/reservations.h +++ b/fs/ocfs2/reservations.h @@ -56,6 +56,8 @@ struct ocfs2_reservation_map { u32 m_bitmap_len; /* Number of valid * bits available */ + u32 m_bitmap_ext_cnt; + u32 *m_bitmap_ext_arr; struct list_head m_lru; /* LRU of reservations * structures. */ @@ -94,6 +96,9 @@ void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap, int ocfs2_resmap_init(struct ocfs2_super *osb, struct ocfs2_reservation_map *resmap); +void ocfs2_resmap_set_ext(struct ocfs2_reservation_map *resmap, int arr, + u32 sz); + /** * ocfs2_resmap_restart() - "restart" a reservation bitmap * @resmap: reservations bitmap @@ -107,7 +112,7 @@ int ocfs2_resmap_init(struct ocfs2_super *osb, * reservations. A future version will recalculate existing * reservations based on the new bitmap. */ -void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap, +void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap, u32 ext_cnt, unsigned int clen, char *disk_bitmap); /** -- 1.5.4.3 From srinivas.eeda at oracle.com Mon May 7 16:21:27 2012 From: srinivas.eeda at oracle.com (Srinivas Eeda) Date: Mon, 7 May 2012 16:21:27 -0700 Subject: [Ocfs2-devel] ocfs2 discontiguous localalloc patches Message-ID: <1336432890-18638-1-git-send-email-srinivas.eeda@oracle.com> Hi all, can you please review following 3 patches that implement discontiguous localalloc bitmap support for ocfs2 file system. 
This feature helps applications that significantly fragment the filesystem. These fixes need changes to ocfs2 tools as well. I am sending those patches for review separately. A write-up on this feature is available at http://oss.oracle.com/osswiki/OCFS2/DesignDocs/DiscontiguousLocalAlloc.html Thanks, --Srini From jlbec at evilplan.org Mon May 7 17:01:01 2012 From: jlbec at evilplan.org (Joel Becker) Date: Mon, 7 May 2012 17:01:01 -0700 Subject: [Ocfs2-devel] ocfs2 discontiguous localalloc patches In-Reply-To: <1336432890-18638-1-git-send-email-srinivas.eeda@oracle.com> References: <1336432890-18638-1-git-send-email-srinivas.eeda@oracle.com> Message-ID: <20120508000100.GB4713@dhcp-172-17-9-228.mtv.corp.google.com> On Mon, May 07, 2012 at 04:21:27PM -0700, Srinivas Eeda wrote: > can you please review following 3 patches that implement discontiguous > localalloc bitmap support for ocfs2 file system. This feature helps > applications that significantly fragment the filesystem. Hi Srini. Have you some performance numbers backing this? That is, I believe that the described filesystem turned off local alloc. Do you have proof that these patches, turning it back on, improved the customer's performance? Joel -- "But all my words come back to me In shades of mediocrity. Like emptiness in harmony I need someone to comfort me."
http://www.jlbec.org/ jlbec at evilplan.org From jlbec at evilplan.org Mon May 7 17:05:33 2012 From: jlbec at evilplan.org (Joel Becker) Date: Mon, 7 May 2012 17:05:33 -0700 Subject: [Ocfs2-devel] [PATCH 1/3] ocfs2: new structure to implment discontiguous local alloc bitmap In-Reply-To: <1336432890-18638-2-git-send-email-srinivas.eeda@oracle.com> References: <1336432890-18638-1-git-send-email-srinivas.eeda@oracle.com> <1336432890-18638-2-git-send-email-srinivas.eeda@oracle.com> Message-ID: <20120508000532.GC4713@dhcp-172-17-9-228.mtv.corp.google.com> On Mon, May 07, 2012 at 04:21:28PM -0700, Srinivas Eeda wrote: > Current local alloc handles single contiguous free chunk of clusters. This > patch enhances local alloc to handle discontigous free chunks. It adds a new > ocfs2_local_alloc_rec structure which tracks single contiguous free chunk. An > array of these sit in the bitmap itself and track discontiguous chunks. In > best case there is only one record and increases as the filesystem gets > fragmented. Number of records at a time are limited depending on the size > of the bitmap and the max limit is defined by OCFS2_MAX_LOCAL_ALLOC_RECS. 
> > Signed-off-by: Srinivas Eeda > --- > fs/ocfs2/localalloc.c | 10 ++++++++++ > fs/ocfs2/ocfs2.h | 8 ++++++++ > fs/ocfs2/ocfs2_fs.h | 48 ++++++++++++++++++++++++++++++++++++++++++------ > 3 files changed, 60 insertions(+), 6 deletions(-) > > diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c > index 210c352..4190e53 100644 > --- a/fs/ocfs2/localalloc.c > +++ b/fs/ocfs2/localalloc.c > @@ -48,6 +48,16 @@ > > #define OCFS2_LOCAL_ALLOC(dinode) (&((dinode)->id2.i_lab)) > > +#define OCFS2_LOCAL_ALLOC_REC_SZ(la) (le16_to_cpu(la->la_rec_count) *\ > + sizeof(struct ocfs2_local_alloc_rec)) > +#define OCFS2_LOCAL_ALLOC_BITMAP(la) ((char *)(&(la->la_recs)) +\ > + OCFS2_LOCAL_ALLOC_REC_SZ(la)) > +#define OCFS2_LOCAL_ALLOC_BITS_PER_REC (sizeof(struct ocfs2_local_alloc_rec)*8) > + > +/* Maximum number of local alloc records */ > +#define OCFS2_MAX_LOCAL_ALLOC_REC_LIMIT 128 > + > + > static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc); > > static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, > diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h > index d355e6e..d4c36d2 100644 > --- a/fs/ocfs2/ocfs2.h > +++ b/fs/ocfs2/ocfs2.h > @@ -367,6 +367,7 @@ struct ocfs2_super > * by osb_lock */ > > struct buffer_head *local_alloc_bh; > + struct inode *local_alloc_inode; > > u64 la_last_gd; > > @@ -522,6 +523,13 @@ static inline int ocfs2_supports_discontig_bg(struct ocfs2_super *osb) > return 0; > } > > +static inline int ocfs2_supports_discontig_la(struct ocfs2_super *osb) > +{ > + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_LA) > + return 1; > + return 0; > +} > + > static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb) > { > if (ocfs2_supports_indexed_dirs(osb)) > diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h > index 938387a..6a0fe02 100644 > --- a/fs/ocfs2/ocfs2_fs.h > +++ b/fs/ocfs2/ocfs2_fs.h > @@ -102,7 +102,8 @@ > | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \ > | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \ > | 
OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG \ > - | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO) > + | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO \ > + | OCFS2_FEATURE_INCOMPAT_DISCONTIG_LA) > #define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ > | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ > | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) > @@ -177,6 +178,9 @@ > */ > #define OCFS2_FEATURE_INCOMPAT_CLUSTERINFO 0x4000 > > +/* Discontiguous local alloc */ > +#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_LA 0x8000 I really wish this could be an RO_COMPAT flag, but I think that recovery on RO mounts will break with this. Mark, please confirm, but I think it has to be INCOMPAT. > @@ -664,14 +668,19 @@ struct ocfs2_super_block { > * Local allocation bitmap for OCFS2 slots > * Note that it exists inside an ocfs2_dinode, so all offsets are > * relative to the start of ocfs2_dinode.id2. > + * Each ocfs2_local_alloc_rec tracks one contigous chunk of clusters. > */ > +struct ocfs2_local_alloc_rec { > + __le32 la_start; /* 1st cluster in this extent */ > + __le32 la_clusters; /* Number of contiguous clusters */ > +}; > + > struct ocfs2_local_alloc > { > /*00*/ __le32 la_bm_off; /* Starting bit offset in main bitmap */ > __le16 la_size; /* Size of included bitmap, in bytes */ > - __le16 la_reserved1; > - __le64 la_reserved2; > -/*10*/ __u8 la_bitmap[0]; > + __le16 la_rec_count; /* Number of discontiguous records */ > + struct ocfs2_local_alloc_rec la_recs[0]; /* Localalloc records */ > }; You can't delete la_bitmap. Any filesystem without DISCONTIG_LA will be expecting the inline bitmap to start there. > @@ -1380,11 +1389,24 @@ static inline u16 ocfs2_local_alloc_size(struct super_block *sb) > u16 size; > > size = sb->s_blocksize - > - offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap); > + offsetof(struct ocfs2_dinode, id2.i_lab.la_recs); > + size -= sizeof(struct ocfs2_local_alloc_rec); You can't do this without checking for DISCONTIG_LA. 
Again, filesystems without DISCONTIG_LA will be starting at la_bitmap. Joel -- "If at first you don't succeed, cover all traces that you tried." -Unknown http://www.jlbec.org/ jlbec at evilplan.org From jlbec at evilplan.org Mon May 7 17:22:58 2012 From: jlbec at evilplan.org (Joel Becker) Date: Mon, 7 May 2012 17:22:58 -0700 Subject: [Ocfs2-devel] [PATCH 2/3] ocfs2: implement discontiguous localalloc bitmap In-Reply-To: <1336432890-18638-3-git-send-email-srinivas.eeda@oracle.com> References: <1336432890-18638-1-git-send-email-srinivas.eeda@oracle.com> <1336432890-18638-3-git-send-email-srinivas.eeda@oracle.com> Message-ID: <20120508002256.GD4713@dhcp-172-17-9-228.mtv.corp.google.com> On Mon, May 07, 2012 at 04:21:29PM -0700, Srinivas Eeda wrote: > This patch adds supporting functions and modifies localalloc code to implement > discontiguous localalloc bitmap. > > Signed-off-by: Srinivas Eeda > --- > fs/ocfs2/localalloc.c | 523 ++++++++++++++++++++++++++++++++----------------- > 1 files changed, 342 insertions(+), 181 deletions(-) > > diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c > index 4190e53..f63381e 100644 > --- a/fs/ocfs2/localalloc.c > +++ b/fs/ocfs2/localalloc.c > @@ -48,6 +48,9 @@ > > #define OCFS2_LOCAL_ALLOC(dinode) (&((dinode)->id2.i_lab)) > > +/* defines minimum contiguous required */ > +#define OCFS2_LOCAL_ALLOC_MIN_BITS 2 > + > #define OCFS2_LOCAL_ALLOC_REC_SZ(la) (le16_to_cpu(la->la_rec_count) *\ > sizeof(struct ocfs2_local_alloc_rec)) > #define OCFS2_LOCAL_ALLOC_BITMAP(la) ((char *)(&(la->la_recs)) +\ > @@ -58,7 +61,8 @@ > #define OCFS2_MAX_LOCAL_ALLOC_REC_LIMIT 128 > > > -static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc); > +static u32 ocfs2_local_alloc_count_bits(struct ocfs2_super *osb, > + struct ocfs2_dinode *alloc); > > static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, > struct ocfs2_dinode *alloc, > @@ -82,8 +86,7 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, > handle_t 
*handle, > struct ocfs2_alloc_context *ac); > > -static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, > - struct inode *local_alloc_inode); > +static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb); I noted that you moved local_alloc_inode into ocfs2_super in the previous patch. Lifting that into the super should be one distinct patch. It should add the field to ocfs2_super and change the function signatures at the same time. Munging it with other patches confuses the issue. > @@ -202,6 +205,74 @@ unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb) > return la_mb; > } > > +static u32 ocfs2_local_bitmap_to_cluster(struct ocfs2_local_alloc *la, u32 bit) > +{ > + u32 start, prev, offset; > + int rec; > + > + rec = start = prev = 0; > + for (rec = 0; rec < le16_to_cpu(la->la_rec_count); rec++) { > + prev = start; > + start += le32_to_cpu(la->la_recs[rec].la_clusters); > + if (bit < start) > + break; > + } > + offset = le32_to_cpu(la->la_recs[rec].la_start) + (bit - prev); > + > + return offset; > +} This can't work for non-DISCONTIG_LA filesystems. I looked, and you call this regardless of the feature bits. Old filesystems will crash, because they have bitmap bits instead of la_rec_count. This is why I said you couldn't remove la_bitmap. 
> +/* > + * This function is called before allocating a new chunk for the localalloc > + * bitmap to make sure there is enough space in the bitmap for the new record > + */ > +static u32 ocfs2_local_alloc_adjust_bits_wanted(struct ocfs2_local_alloc *la, > + struct ocfs2_alloc_context *ac) > +{ > + u32 required, available, cluster_cnt; > + > + if (ac->ac_bits_given == ac->ac_bits_wanted) > + return 0; > + > + /* total bits available in bitmap */ > + available = le16_to_cpu(la->la_size) << 3; > + cluster_cnt = ocfs2_local_alloc_cluster_count(la); > + > + /* > + * Wanted shouldn't be greater than bitmap size and given should be > + * equal to cluster count > + */ > + BUG_ON(ac->ac_bits_given > ac->ac_bits_wanted); > + BUG_ON(ac->ac_bits_wanted > available); > + BUG_ON(ac->ac_bits_given != cluster_cnt); > + > + /* reduce bits taken by each record structure */ > + available -= (le16_to_cpu(la->la_rec_count) * > + OCFS2_LOCAL_ALLOC_BITS_PER_REC); Again, no check for DISCONTIG_LA. I'm going to stop mentioning this. Just assume that every place you want to touch la_rec_count, you need to make sure you have a DISCONTIG_LA filesystem. > @@ -348,21 +421,21 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) > } > > /* do a little verification. */ > - num_used = ocfs2_local_alloc_count_bits(alloc); > + num_used = ocfs2_local_alloc_count_bits(osb, alloc); > > /* hopefully the local alloc has always been recovered before > * we load it. */ > if (num_used > || alloc->id1.bitmap1.i_used > || alloc->id1.bitmap1.i_total > - || la->la_bm_off) > + || la->la_rec_count) I lied. You can't trust la_rec_count for non-DISCONTIG_LA filesystems, so you can't have a naked check here. Conversely, la_bm_off is the valid check for those filesystems. You need to alternate based on the feature. > @@ -690,8 +739,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, > le32_to_cpu(alloc->id1.bitmap1.i_used); > if (bits_wanted > free_bits) { > /* uhoh, window change time. 
*/ > - status = > - ocfs2_local_alloc_slide_window(osb, local_alloc_inode); > + status = ocfs2_local_alloc_slide_window(osb); This is what I mean about osb->local_alloc_inode. There should be a first patch that does these changes only. > @@ -745,7 +792,7 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, > { > int status, start; > struct inode *local_alloc_inode; > - void *bitmap; > + u8 *bitmap; I'm not sure about this. Do you have a reason? > struct ocfs2_dinode *alloc; > struct ocfs2_local_alloc *la; > > @@ -764,8 +811,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, > goto bail; > } > > - bitmap = la->la_bitmap; > - *bit_off = le32_to_cpu(la->la_bm_off) + start; > + bitmap = OCFS2_LOCAL_ALLOC_BITMAP(la); > + *bit_off = ocfs2_local_bitmap_to_cluster(la, start); Here is the call that assumes a DISCONTIG_LA filesystem. > *num_bits = bits_wanted; > > status = ocfs2_journal_access_di(handle, > @@ -792,16 +839,29 @@ bail: > return status; > } > > -static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc) > +static u32 ocfs2_local_alloc_count_bits(struct ocfs2_super *osb, > + struct ocfs2_dinode *alloc) > { > int i; > - u8 *buffer; > + u8 *bitmap; > u32 count = 0; > struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); > > - buffer = la->la_bitmap; > - for (i = 0; i < le16_to_cpu(la->la_size); i++) > - count += hweight8(buffer[i]); > + /* > + * if discontig is not enabled then lets update the first localalloc > + * record with the current bitmap block info. We are doing this because > + * old disk formats are not aware of the records. > + */ > + if (!ocfs2_supports_discontig_la(osb) && la->la_bm_off) { > + la->la_rec_count = cpu_to_le16(1); > + la->la_recs[0].la_start = la->la_bm_off; > + la->la_recs[0].la_clusters = alloc->id1.bitmap1.i_total; > + } OH MY DOG NO. NEVER EVER DO THIS. You cannot update an old filesystem on the fly! What about other nodes that are running older versions of the software? 
They will crash or corrupt data! The entire point of feature bits is to make sure all nodes are speaking the same code. NAK NAK NAK This explains why you trusted la_rec_count earlier. But that is broken. When your patches are done, the code should use la_bm_off and la_bitmap when !DISCONTIG_LA and then use la_rec_count, etc when DISCONTIG_LA. The only way to transition between them is a tunefs.ocfs2 operation that walks the filesystem, flushes the bitmap, and then sets/clears la_rec_count appropriately depending on the direction.. Joel -- "I inject pure kryptonite into my brain. It improves my kung fu, and it eases the pain." http://www.jlbec.org/ jlbec at evilplan.org From jlbec at evilplan.org Mon May 7 17:28:23 2012 From: jlbec at evilplan.org (Joel Becker) Date: Mon, 7 May 2012 17:28:23 -0700 Subject: [Ocfs2-devel] [PATCH 1/3] ocfs2: new structure to implment discontiguous local alloc bitmap In-Reply-To: <1336432890-18638-2-git-send-email-srinivas.eeda@oracle.com> References: <1336432890-18638-1-git-send-email-srinivas.eeda@oracle.com> <1336432890-18638-2-git-send-email-srinivas.eeda@oracle.com> Message-ID: <20120508002820.GE4713@dhcp-172-17-9-228.mtv.corp.google.com> On Mon, May 07, 2012 at 04:21:28PM -0700, Srinivas Eeda wrote: > Current local alloc handles single contiguous free chunk of clusters. This > patch enhances local alloc to handle discontigous free chunks. It adds a new > ocfs2_local_alloc_rec structure which tracks single contiguous free chunk. An > array of these sit in the bitmap itself and track discontiguous chunks. In > best case there is only one record and increases as the filesystem gets > fragmented. Number of records at a time are limited depending on the size > of the bitmap and the max limit is defined by OCFS2_MAX_LOCAL_ALLOC_RECS. 
> > Signed-off-by: Srinivas Eeda > --- > fs/ocfs2/localalloc.c | 10 ++++++++++ > fs/ocfs2/ocfs2.h | 8 ++++++++ > fs/ocfs2/ocfs2_fs.h | 48 ++++++++++++++++++++++++++++++++++++++++++------ > 3 files changed, 60 insertions(+), 6 deletions(-) > > diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c > index 210c352..4190e53 100644 > --- a/fs/ocfs2/localalloc.c > +++ b/fs/ocfs2/localalloc.c > @@ -48,6 +48,16 @@ > > #define OCFS2_LOCAL_ALLOC(dinode) (&((dinode)->id2.i_lab)) > > +#define OCFS2_LOCAL_ALLOC_REC_SZ(la) (le16_to_cpu(la->la_rec_count) *\ > + sizeof(struct ocfs2_local_alloc_rec)) > +#define OCFS2_LOCAL_ALLOC_BITMAP(la) ((char *)(&(la->la_recs)) +\ > + OCFS2_LOCAL_ALLOC_REC_SZ(la)) Another point. Not only does this macro not handle !DISCONTIG_LA filesystems (as described in my other email about this patch), it should be a static inline function. See eg: INODE_CACHE() in fs/ocfs2/inode.h Joel -- Life's Little Instruction Book #456 "Send your loved one flowers. Think of a reason later." http://www.jlbec.org/ jlbec at evilplan.org From jlbec at evilplan.org Mon May 7 17:34:31 2012 From: jlbec at evilplan.org (Joel Becker) Date: Mon, 7 May 2012 17:34:31 -0700 Subject: [Ocfs2-devel] [PATCH 3/3] ocfs2: modify reservation code to support discontigous localalloc In-Reply-To: <1336432890-18638-4-git-send-email-srinivas.eeda@oracle.com> References: <1336432890-18638-1-git-send-email-srinivas.eeda@oracle.com> <1336432890-18638-4-git-send-email-srinivas.eeda@oracle.com> Message-ID: <20120508003431.GF4713@dhcp-172-17-9-228.mtv.corp.google.com> On Mon, May 07, 2012 at 04:21:30PM -0700, Srinivas Eeda wrote: > Currently reservation code assumes a bitmap given to it is all one contigous > chunk. This patch enhances it to handle a discontigous chunks. It adds new > fields m_bitmap_ext_cnt and m_bitmap_ext_arr. m_bitmap_ext_arr tracks the sizes > of each contigous free bits and m_bitmap_ext_cnt trackes number of > m_bitmap_ext_arr. 
> > Signed-off-by: Srinivas Eeda Hi Srini, A patch like this should come before the feature patch. Once this code can treat the old single-range bitmap as a one-element multiple-range bitmap, you can add the multiple-range change easily. > +void ocfs2_resmap_set_ext(struct ocfs2_reservation_map *resmap, int arr, u32 sz) > +{ > + if (ocfs2_resmap_disabled(resmap)) > + return; > + > + resmap->m_bitmap_ext_arr[arr] = sz; > +} I don't see this function called anywhere. And please don't use needless abbreviations. If you want to say ocfs2_resmap_set_extent(), write it out. I don't quite get the arguments, and since it isn't called, I can't figure out how they are used. Joel -- "To announce that there must be no criticism of them president, or that we are to stand by the president, right or wrong, is not only unpatriotic and servile, but is morally treasonable to the American public." - Theodore Roosevelt http://www.jlbec.org/ jlbec at evilplan.org From srinivas.eeda at oracle.com Mon May 7 18:26:58 2012 From: srinivas.eeda at oracle.com (Srinivas Eeda) Date: Mon, 07 May 2012 18:26:58 -0700 Subject: [Ocfs2-devel] ocfs2 discontiguous localalloc patches In-Reply-To: <20120508000100.GB4713@dhcp-172-17-9-228.mtv.corp.google.com> References: <1336432890-18638-1-git-send-email-srinivas.eeda@oracle.com> <20120508000100.GB4713@dhcp-172-17-9-228.mtv.corp.google.com> Message-ID: <4FA87662.9040006@oracle.com> Joel Becker wrote: > On Mon, May 07, 2012 at 04:21:27PM -0700, Srinivas Eeda wrote: > >> can you please review following 3 patches that implement discontiguous >> localalloc bitmap support for ocfs2 file system. This feature helps >> applications that significantly fragment the filesystem. >> > > Hi Srini. Have you some performance numbers backing this? That > is, I believe that the described filesystem turned off local alloc. Do > you have proof that these patches, turning it back on, improved the > customer's performance? > > Joel > Hi Joel, thanks a lot for the quick reply. 
I have some stat_sysdir.sh snapshots at http://oss.oracle.com/~seeda/diag/stat_sysdir/ collected from a system. It has 4 snapshots collected when the file system usage is at 8%, 19%, 21% and 52%. In file stat_sysdir_52_percent_usage_slow_del.out, for the filesystem that has UUID: 3A6F54DF288C4AF2ABD1E00FC49BE7ED you can see that local_alloc:0000 bitmap total is 38 and is 0 (disabled) for local_alloc:0001 and local_alloc:0002. For the filesystem that has UUID AC444DB162AE427C899BA89E076DD479, all localalloc appears to be disabled. Sorry I didn't collect /sys/kernel/debug/fs/<uuid>/fs_state. But, given the file system state, even if localalloc is not disabled, localalloc needs to be refilled every 40 clusters. Thanks, --Srini From srinivas.eeda at oracle.com Mon May 7 19:10:32 2012 From: srinivas.eeda at oracle.com (Srinivas Eeda) Date: Mon, 07 May 2012 19:10:32 -0700 Subject: [Ocfs2-devel] [PATCH 2/3] ocfs2: implement discontiguous localalloc bitmap In-Reply-To: <20120508002256.GD4713@dhcp-172-17-9-228.mtv.corp.google.com> References: <1336432890-18638-1-git-send-email-srinivas.eeda@oracle.com> <1336432890-18638-3-git-send-email-srinivas.eeda@oracle.com> <20120508002256.GD4713@dhcp-172-17-9-228.mtv.corp.google.com> Message-ID: <4FA88098.9070301@oracle.com> Joel Becker wrote: > On Mon, May 07, 2012 at 04:21:29PM -0700, Srinivas Eeda wrote: > > > OH MY DOG NO. NEVER EVER DO THIS. You cannot update an old > filesystem on the fly! What about other nodes that are running older > versions of the software? They will crash or corrupt data! The entire > point of feature bits is to make sure all nodes are speaking the same > code. > > NAK NAK NAK > > This explains why you trusted la_rec_count earlier. But that is > broken. When your patches are done, the code should use la_bm_off and > la_bitmap when !DISCONTIG_LA and then use la_rec_count, etc when > DISCONTIG_LA.
The only way to transition between them is a tunefs.ocfs2 > operation that walks the filesystem, flushes the bitmap, and then > sets/clears la_rec_count appropriately depending on the direction.. > Please please don't hate me :( ... the changes take care of old formats as well ... I used the reserved space in the structure so that the code changes will be minimal and still compatible with old file system formats. I agree that we need to have some reserved space still available. So as discussed I'll redo the changes accordingly. Please ignore all the patches. Thanks, --Srini From xiaowei.hu at oracle.com Thu May 24 22:53:22 2012 From: xiaowei.hu at oracle.com (xiaowei.hu at oracle.com) Date: Fri, 25 May 2012 13:53:22 +0800 Subject: [Ocfs2-devel] [PATCH] Fix waiting status race condition in dlm recovery Message-ID: <1337925202-13086-1-git-send-email-xiaowei.hu@oracle.com> From: "Xiaowei.Hu" When the recovery master has requested locks but one or more of the live nodes die after receiving the request msg and before sending out their lock packages, the recovery falls into an endless loop, waiting for the status to change to finalize. NodeA NodeB selected as recovery master dlm_remaster_locks -> dlm_request_all_locks this send request locks msg to B received the msg from A, queue worker dlm_request_all_locks_worker return 0 go on set state to requested wait for the state become done NodeB lost connection due to network before the worker begin, or it die. NodeA still waiting for the change of reco state. It won't end if it not get data done msg And at this time nodeB does not realize this (or it has just died), so it will never send the msg, and nodeA is left in the recovery process forever. This patch lets the recovery master check whether the node is still in the live node map while it stays in REQUESTED status.
Signed-off-by: Xiaowei.Hu --- fs/ocfs2/dlm/dlmrecovery.c | 9 +++++++++ 1 files changed, 9 insertions(+), 0 deletions(-) diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 01ebfd0..62659e8 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -555,6 +555,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) int all_nodes_done; int destroy = 0; int pass = 0; + int dying = 0; do { /* we have become recovery master. there is no escaping @@ -659,6 +660,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) list_for_each_entry(ndata, &dlm->reco.node_data, list) { mlog(0, "checking recovery state of node %u\n", ndata->node_num); + dying = 0; switch (ndata->state) { case DLM_RECO_NODE_DATA_INIT: case DLM_RECO_NODE_DATA_REQUESTING: @@ -679,6 +681,13 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) dlm->name, ndata->node_num, ndata->state==DLM_RECO_NODE_DATA_RECEIVING ? "receiving" : "requested"); + spin_lock(&dlm->spinlock); + dying = !test_bit(ndata->node_num, dlm->live_nodes_map); + spin_unlock(&dlm->spinlock); + if (dying) { + ndata->state = DLM_RECO_NODE_DATA_DEAD; + break; + } all_nodes_done = 0; break; case DLM_RECO_NODE_DATA_DONE: -- 1.7.7.6 From srinivas.eeda at oracle.com Fri May 25 15:17:57 2012 From: srinivas.eeda at oracle.com (srinivas eeda) Date: Fri, 25 May 2012 15:17:57 -0700 Subject: [Ocfs2-devel] [PATCH] Fix waiting status race condition in dlm recovery In-Reply-To: <1337925202-13086-1-git-send-email-xiaowei.hu@oracle.com> References: <1337925202-13086-1-git-send-email-xiaowei.hu@oracle.com> Message-ID: <4FC00515.1060105@oracle.com> comments inline On 5/24/2012 10:53 PM, xiaowei.hu at oracle.com wrote: > From: "Xiaowei.Hu" > > when the master requested locks ,but one/some of the live nodes died, > after it received the request msg and before send out the locks packages, > the recovery will fall into endless loop,waiting for the status changed to finalize > > 
NodeA NodeB > selected as recovery master > dlm_remaster_locks > -> dlm_requeset_all_locks > this send request locks msg to B > received the msg from A, > queue worker dlm_request_all_locks_worker > return 0 > go on set state to requested > wait for the state become done > NodeB lost connection due to network > before the worker begin, or it die. > NodeA still waiting for the > change of reco state. > It won't end if it not get data done msg > And at this time nodeB do not realize this (or it just died), > it won't send the msg for ever, nodeA left in the recovery process forever. > > This patch let the recovery master check if the node still in live node > map when it stay in REQUESTED status. > > Signed-off-by: Xiaowei.Hu > --- > fs/ocfs2/dlm/dlmrecovery.c | 9 +++++++++ > 1 files changed, 9 insertions(+), 0 deletions(-) > > diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c > index 01ebfd0..62659e8 100644 > --- a/fs/ocfs2/dlm/dlmrecovery.c > +++ b/fs/ocfs2/dlm/dlmrecovery.c > @@ -555,6 +555,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) > int all_nodes_done; > int destroy = 0; > int pass = 0; > + int dying = 0; > > do { > /* we have become recovery master. there is no escaping > @@ -659,6 +660,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) > list_for_each_entry(ndata,&dlm->reco.node_data, list) { > mlog(0, "checking recovery state of node %u\n", > ndata->node_num); > + dying = 0; > switch (ndata->state) { > case DLM_RECO_NODE_DATA_INIT: > case DLM_RECO_NODE_DATA_REQUESTING: > @@ -679,6 +681,13 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) > dlm->name, ndata->node_num, > ndata->state==DLM_RECO_NODE_DATA_RECEIVING ? 
> "receiving" : "requested"); > + spin_lock(&dlm->spinlock); > + dying = !test_bit(ndata->node_num, dlm->live_nodes_map); > + spin_unlock(&dlm->spinlock); > + if (dying) { > + ndata->state = DLM_RECO_NODE_DATA_DEAD; > + break; > + } > all_nodes_done = 0; > break; > case DLM_RECO_NODE_DATA_DONE: fix seems to address the issue, but can you please add a function dlm_is_node_in_livemap similar to dlm_is_node_dead so that it' improves readability. You can then add the following to check if the node is still alive + if (!dlm_is_node_in_livemap(dlm, ndata->node_num)) + ndate->state = DLM_RECO_NODE_DATA_DEAD; + else + all_nodes_done = 0; From xiaowei.hu at oracle.com Fri May 25 19:05:14 2012 From: xiaowei.hu at oracle.com (Xiaowei) Date: Sat, 26 May 2012 10:05:14 +0800 Subject: [Ocfs2-devel] [PATCH] Fix waiting status race condition in dlm recovery In-Reply-To: <4FC00515.1060105@oracle.com> References: <1337925202-13086-1-git-send-email-xiaowei.hu@oracle.com> <4FC00515.1060105@oracle.com> Message-ID: <4FC03A5A.6090705@oracle.com> Thanks Srini , This sounds good, I tried to use dlm_is_node_dead in this patch , but this function can't report another node is dead if this node already in recovery process. It was blocked to set the bit in domain_map, but the live_nodes_map could always reflect the really live nodes. I will reformat the patch. 
Thanks, Xiaowei On 05/26/2012 06:17 AM, srinivas eeda wrote: > comments inline > > On 5/24/2012 10:53 PM, xiaowei.hu at oracle.com wrote: >> From: "Xiaowei.Hu" >> >> when the master requested locks ,but one/some of the live nodes died, >> after it received the request msg and before send out the locks >> packages, >> the recovery will fall into endless loop,waiting for the status >> changed to finalize >> >> NodeA NodeB >> selected as recovery master >> dlm_remaster_locks >> -> dlm_requeset_all_locks >> this send request locks msg to B >> received the msg from A, >> queue worker >> dlm_request_all_locks_worker >> return 0 >> go on set state to requested >> wait for the state become done >> NodeB lost connection due >> to network >> before the worker begin, >> or it die. >> NodeA still waiting for the >> change of reco state. >> It won't end if it not get data done msg >> And at this time nodeB do not realize this (or it just died), >> it won't send the msg for ever, nodeA left in the recovery process >> forever. >> >> This patch let the recovery master check if the node still in live node >> map when it stay in REQUESTED status. >> >> Signed-off-by: Xiaowei.Hu >> --- >> fs/ocfs2/dlm/dlmrecovery.c | 9 +++++++++ >> 1 files changed, 9 insertions(+), 0 deletions(-) >> >> diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c >> index 01ebfd0..62659e8 100644 >> --- a/fs/ocfs2/dlm/dlmrecovery.c >> +++ b/fs/ocfs2/dlm/dlmrecovery.c >> @@ -555,6 +555,7 @@ static int dlm_remaster_locks(struct dlm_ctxt >> *dlm, u8 dead_node) >> int all_nodes_done; >> int destroy = 0; >> int pass = 0; >> + int dying = 0; >> >> do { >> /* we have become recovery master. 
there is no escaping >> @@ -659,6 +660,7 @@ static int dlm_remaster_locks(struct dlm_ctxt >> *dlm, u8 dead_node) >> list_for_each_entry(ndata,&dlm->reco.node_data, list) { >> mlog(0, "checking recovery state of node %u\n", >> ndata->node_num); >> + dying = 0; >> switch (ndata->state) { >> case DLM_RECO_NODE_DATA_INIT: >> case DLM_RECO_NODE_DATA_REQUESTING: >> @@ -679,6 +681,13 @@ static int dlm_remaster_locks(struct dlm_ctxt >> *dlm, u8 dead_node) >> dlm->name, ndata->node_num, >> ndata->state==DLM_RECO_NODE_DATA_RECEIVING ? >> "receiving" : "requested"); >> + spin_lock(&dlm->spinlock); >> + dying = !test_bit(ndata->node_num, >> dlm->live_nodes_map); >> + spin_unlock(&dlm->spinlock); >> + if (dying) { >> + ndata->state = DLM_RECO_NODE_DATA_DEAD; >> + break; >> + } >> all_nodes_done = 0; >> break; >> case DLM_RECO_NODE_DATA_DONE: > fix seems to address the issue, but can you please add a function > dlm_is_node_in_livemap similar to dlm_is_node_dead so that it' > improves readability. You can then add the following to check if the > node is still alive > + if (!dlm_is_node_in_livemap(dlm, ndata->node_num)) > + ndate->state = DLM_RECO_NODE_DATA_DEAD; > + else > + all_nodes_done = 0; From xiaowei.hu at oracle.com Fri May 25 19:27:29 2012 From: xiaowei.hu at oracle.com (xiaowei.hu at oracle.com) Date: Sat, 26 May 2012 10:27:29 +0800 Subject: [Ocfs2-devel] [PATCH] Fix waiting status race condition in dlm recovery V2 Message-ID: <1337999249-15242-1-git-send-email-xiaowei.hu@oracle.com> From: "Xiaowei.Hu" when the master requested locks ,but one/some of the live nodes died, after it received the request msg and before send out the locks packages, the recovery will fall into endless loop,waiting for the status changed to finalize NodeA NodeB selected as recovery master dlm_remaster_locks -> dlm_requeset_all_locks this send request locks msg to B received the msg from A, queue worker dlm_request_all_locks_worker return 0 go on set state to requested wait for the state become 
done NodeB lost connection due to network before the worker begin, or it die. NodeA still waiting for the change of reco state. It won't end if it not get data done msg. And at this time nodeB do not realize this (or it just died), it won't send the msg for ever, nodeA left in the recovery process forever. This patch let the recovery master check if the node still in live node map when it stay in REQUESTED status. Signed-off-by: Xiaowei.Hu --- fs/ocfs2/dlm/dlmrecovery.c | 16 +++++++++++++++- 1 files changed, 15 insertions(+), 1 deletions(-) diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 01ebfd0..546c5b5 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -339,6 +339,17 @@ static int dlm_reco_master_ready(struct dlm_ctxt *dlm) return ready; } +/* returns true if node is still in the live node map + * this map is cleared before domain map,could be checked in recovery*/ +int dlm_is_node_in_livemap(struct dlm_ctxt *dlm, u8 node) +{ + int live; + spin_lock(&dlm->spinlock); + live = !test_bit(node, dlm->live_nodes_map); + spin_unlock(&dlm->spinlock); + return live; +} + /* returns true if node is no longer in the domain * could be dead or just not joined */ int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node) @@ -679,7 +690,10 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) dlm->name, ndata->node_num, ndata->state==DLM_RECO_NODE_DATA_RECEIVING ? 
"receiving" : "requested"); - all_nodes_done = 0; + if (!dlm_is_node_in_livemap(dlm, ndata->node_num)) + ndata->state = DLM_RECO_NODE_DATA_DEAD; + else + all_nodes_done = 0; break; case DLM_RECO_NODE_DATA_DONE: mlog(0, "%s: node %u state is done\n", -- 1.7.7.6 From sunil.mushran at gmail.com Tue May 29 15:09:08 2012 From: sunil.mushran at gmail.com (Sunil Mushran) Date: Tue, 29 May 2012 15:09:08 -0700 Subject: [Ocfs2-devel] [PATCH] Fix waiting status race condition in dlm recovery In-Reply-To: <1337925202-13086-1-git-send-email-xiaowei.hu@oracle.com> References: <1337925202-13086-1-git-send-email-xiaowei.hu@oracle.com> Message-ID: On Thu, May 24, 2012 at 10:53 PM, wrote: > > diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c > index 01ebfd0..62659e8 100644 > --- a/fs/ocfs2/dlm/dlmrecovery.c > +++ b/fs/ocfs2/dlm/dlmrecovery.c > @@ -555,6 +555,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 > dead_node) > int all_nodes_done; > int destroy = 0; > int pass = 0; > + int dying = 0; > > do { > /* we have become recovery master. there is no escaping > @@ -659,6 +660,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 > dead_node) > list_for_each_entry(ndata, &dlm->reco.node_data, list) { > mlog(0, "checking recovery state of node %u\n", > ndata->node_num); > + dying = 0; > switch (ndata->state) { > case DLM_RECO_NODE_DATA_INIT: > case DLM_RECO_NODE_DATA_REQUESTING: > @@ -679,6 +681,13 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, > u8 dead_node) > dlm->name, ndata->node_num, > > ndata->state==DLM_RECO_NODE_DATA_RECEIVING ? > "receiving" : "requested"); > + spin_lock(&dlm->spinlock); > + dying = !test_bit(ndata->node_num, > dlm->live_nodes_map); > + spin_unlock(&dlm->spinlock); > + if (dying) { > + ndata->state = > DLM_RECO_NODE_DATA_DEAD; > + break; > + } > I would suggest exploring adding this in dlm hb down event. Checking live map all over the place is hacky. We do it more than we should right now. 
Let's not add to the mess. > all_nodes_done = 0; > break; > case DLM_RECO_NODE_DATA_DONE: > -- > 1.7.7.6 > > > _______________________________________________ > Ocfs2-devel mailing list > Ocfs2-devel at oss.oracle.com > http://oss.oracle.com/mailman/listinfo/ocfs2-devel > -------------- next part -------------- An HTML attachment was scrubbed... URL: http://oss.oracle.com/pipermail/ocfs2-devel/attachments/20120529/1080a567/attachment.html From xiaowei.hu at oracle.com Tue May 29 17:41:09 2012 From: xiaowei.hu at oracle.com (Xiaowei) Date: Wed, 30 May 2012 08:41:09 +0800 Subject: [Ocfs2-devel] [PATCH] Fix waiting status race condition in dlm recovery In-Reply-To: References: <1337925202-13086-1-git-send-email-xiaowei.hu@oracle.com> Message-ID: <4FC56CA5.8040902@oracle.com> On 05/30/2012 06:09 AM, Sunil Mushran wrote: > On Thu, May 24, 2012 at 10:53 PM, > wrote: > > > diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c > index 01ebfd0..62659e8 100644 > --- a/fs/ocfs2/dlm/dlmrecovery.c > +++ b/fs/ocfs2/dlm/dlmrecovery.c > @@ -555,6 +555,7 @@ static int dlm_remaster_locks(struct dlm_ctxt > *dlm, u8 dead_node) > int all_nodes_done; > int destroy = 0; > int pass = 0; > + int dying = 0; > > do { > /* we have become recovery master. there is no > escaping > @@ -659,6 +660,7 @@ static int dlm_remaster_locks(struct dlm_ctxt > *dlm, u8 dead_node) > list_for_each_entry(ndata, &dlm->reco.node_data, > list) { > mlog(0, "checking recovery state of node %u\n", > ndata->node_num); > + dying = 0; > switch (ndata->state) { > case DLM_RECO_NODE_DATA_INIT: > case DLM_RECO_NODE_DATA_REQUESTING: > @@ -679,6 +681,13 @@ static int dlm_remaster_locks(struct dlm_ctxt > *dlm, u8 dead_node) > dlm->name, > ndata->node_num, > > ndata->state==DLM_RECO_NODE_DATA_RECEIVING ? 
> "receiving" : > "requested"); > + spin_lock(&dlm->spinlock); > + dying = > !test_bit(ndata->node_num, dlm->live_nodes_map); > + spin_unlock(&dlm->spinlock); > + if (dying) { > + ndata->state = > DLM_RECO_NODE_DATA_DEAD; > + break; > + } > > > > > > I would suggest exploring adding this in dlm hb down event. Checking > live map all > over the place is hacky. We do it more than we should right now. Let's > not add to the > mess. HI Sunil, Do you mean we should clear the bit in domain map in dlm hb down event directly when the node down and check with dlm_is_node_dead at here? Or how could we explore and ensure the node is alive during the whole migrate process?One node could die even after it sends out one locks package and before the next if there were too many locks on that lockres. Thanks, Xiaowei > > > > all_nodes_done = 0; > break; > case DLM_RECO_NODE_DATA_DONE: > -- > 1.7.7.6 > > > _______________________________________________ > Ocfs2-devel mailing list > Ocfs2-devel at oss.oracle.com > http://oss.oracle.com/mailman/listinfo/ocfs2-devel > > -------------- next part -------------- An HTML attachment was scrubbed... URL: http://oss.oracle.com/pipermail/ocfs2-devel/attachments/20120530/5fcb3ea7/attachment.html From sunil.mushran at gmail.com Wed May 30 18:18:12 2012 From: sunil.mushran at gmail.com (Sunil Mushran) Date: Wed, 30 May 2012 18:18:12 -0700 Subject: [Ocfs2-devel] [PATCH] Fix waiting status race condition in dlm recovery In-Reply-To: <4FC56CA5.8040902@oracle.com> References: <1337925202-13086-1-git-send-email-xiaowei.hu@oracle.com> <4FC56CA5.8040902@oracle.com> Message-ID: On Tue, May 29, 2012 at 5:41 PM, Xiaowei wrote: > On 05/30/2012 06:09 AM, Sunil Mushran wrote: > I would suggest exploring adding this in dlm hb down event. Checking live > map all > over the place is hacky. We do it more than we should right now. Let's not > add to the > mess. 
> > HI Sunil, > > Do you mean we should clear the bit in domain map in dlm hb down event > directly when the node down > and check with dlm_is_node_dead at here? > Or how could we explore and ensure the node is alive during the whole > migrate process?One node could die even after it sends out one locks package > and before the next if there were too many locks on that lockres. dlm hb down event is triggered when a node is declared dead. That's where we clean up pending mles, etc. You can add a check for recovery and add logic to change the reco state for that node there. From junxiao.bi at oracle.com Wed May 30 21:12:29 2012 From: junxiao.bi at oracle.com (Junxiao Bi) Date: Thu, 31 May 2012 12:12:29 +0800 Subject: [Ocfs2-devel] [PATCH 1/2] aio: make kiocb->private NUll in init_sync_kiocb() Message-ID: <1338437550-24499-1-git-send-email-junxiao.bi@oracle.com> Ocfs2 uses kiocb.*private as a flag of unsigned long size. In commit a11f7e6 ocfs2: serialize unaligned aio, the unaligned io flag is involved in it to serialize the unaligned aio. As *private is not initialized in init_sync_kiocb() of do_sync_write(), this unaligned io flag may be unexpectly set in an aligned dio. And this will cause OCFS2_I(inode)->ip_unaligned_aio decreased to -1 in ocfs2_dio_end_io(), thus the following unaligned dio will hang forever at ocfs2_aiodio_wait() in ocfs2_file_write_iter(). We can't initialized this flag in ocfs2_file_write_iter() since it may be invoked several times by do_sync_write(). So we initialize it in init_sync_kiocb(), it's also useful for other similiar use of it in the future. 
Signed-off-by: Junxiao Bi --- include/linux/aio.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/aio.h b/include/linux/aio.h index 2314ad8..b1a520e 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -140,6 +140,7 @@ struct kiocb { (x)->ki_dtor = NULL; \ (x)->ki_obj.tsk = tsk; \ (x)->ki_user_data = 0; \ + (x)->private = NULL; \ } while (0) #define AIO_RING_MAGIC 0xa10a10a1 -- 1.7.9.5 From junxiao.bi at oracle.com Wed May 30 21:12:30 2012 From: junxiao.bi at oracle.com (Junxiao Bi) Date: Thu, 31 May 2012 12:12:30 +0800 Subject: [Ocfs2-devel] [PATCH 2/2] ocfs2: clear unaligned io flag when dio fails In-Reply-To: <1338437550-24499-1-git-send-email-junxiao.bi@oracle.com> References: <1338437550-24499-1-git-send-email-junxiao.bi@oracle.com> Message-ID: <1338437550-24499-2-git-send-email-junxiao.bi@oracle.com> The unaligned io flag is set in the kiocb when an unaligned dio is issued, it should be cleared even when the dio fails, or it may affect the following io which are using the same kiocb. Signed-off-by: Junxiao Bi --- fs/ocfs2/file.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 061591a..98513c8 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2422,8 +2422,10 @@ out_dio: unaligned_dio = 0; } - if (unaligned_dio) + if (unaligned_dio) { + ocfs2_iocb_clear_unaligned_aio(iocb); atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio); + } out: if (rw_level != -1) -- 1.7.9.5 From jmoyer at redhat.com Thu May 31 07:08:13 2012 From: jmoyer at redhat.com (Jeff Moyer) Date: Thu, 31 May 2012 10:08:13 -0400 Subject: [Ocfs2-devel] [PATCH 1/2] aio: make kiocb->private NUll in init_sync_kiocb() In-Reply-To: <1338437550-24499-1-git-send-email-junxiao.bi@oracle.com> (Junxiao Bi's message of "Thu, 31 May 2012 12:12:29 +0800") References: <1338437550-24499-1-git-send-email-junxiao.bi@oracle.com> Message-ID: Junxiao Bi writes: > Ocfs2 uses kiocb.*private as a flag of unsigned long size. 
In > commit a11f7e6 ocfs2: serialize unaligned aio, the unaligned > io flag is involved in it to serialize the unaligned aio. As > *private is not initialized in init_sync_kiocb() of do_sync_write(), > this unaligned io flag may be unexpectly set in an aligned dio. > And this will cause OCFS2_I(inode)->ip_unaligned_aio decreased > to -1 in ocfs2_dio_end_io(), thus the following unaligned dio > will hang forever at ocfs2_aiodio_wait() in ocfs2_file_write_iter(). > We can't initialized this flag in ocfs2_file_write_iter() since > it may be invoked several times by do_sync_write(). So we initialize > it in init_sync_kiocb(), it's also useful for other similiar use of > it in the future. I don't see any ocfs2_file_write_iter in the upstream kernel. ocfs2_file_aio_write most certainly could set ->private to 0, it will only be called once for a given kiocb. That point aside, I have no issues with setting private to NULL in init_sync_kiocb. If you fix up the comment to reflect reality w.r.t. the upstream kernel source, I'll ack the patch. Cheers, Jeff From jmoyer at redhat.com Thu May 31 07:09:09 2012 From: jmoyer at redhat.com (Jeff Moyer) Date: Thu, 31 May 2012 10:09:09 -0400 Subject: [Ocfs2-devel] [PATCH 2/2] ocfs2: clear unaligned io flag when dio fails In-Reply-To: <1338437550-24499-2-git-send-email-junxiao.bi@oracle.com> (Junxiao Bi's message of "Thu, 31 May 2012 12:12:30 +0800") References: <1338437550-24499-1-git-send-email-junxiao.bi@oracle.com> <1338437550-24499-2-git-send-email-junxiao.bi@oracle.com> Message-ID: Junxiao Bi writes: > The unaligned io flag is set in the kiocb when an unaligned > dio is issued, it should be cleared even when the dio fails, > or it may affect the following io which are using the same > kiocb. What code is re-using kiocbs, much less re-using them without re-initializing them? 
-Jeff From junxiao.bi at oracle.com Thu May 31 18:41:52 2012 From: junxiao.bi at oracle.com (Junxiao Bi) Date: Fri, 01 Jun 2012 09:41:52 +0800 Subject: [Ocfs2-devel] [PATCH 1/2] aio: make kiocb->private NUll in init_sync_kiocb() In-Reply-To: References: <1338437550-24499-1-git-send-email-junxiao.bi@oracle.com> Message-ID: <4FC81DE0.5080403@oracle.com> On 05/31/2012 10:08 PM, Jeff Moyer wrote: > Junxiao Bi writes: > >> Ocfs2 uses kiocb.*private as a flag of unsigned long size. In >> commit a11f7e6 ocfs2: serialize unaligned aio, the unaligned >> io flag is involved in it to serialize the unaligned aio. As >> *private is not initialized in init_sync_kiocb() of do_sync_write(), >> this unaligned io flag may be unexpectly set in an aligned dio. >> And this will cause OCFS2_I(inode)->ip_unaligned_aio decreased >> to -1 in ocfs2_dio_end_io(), thus the following unaligned dio >> will hang forever at ocfs2_aiodio_wait() in ocfs2_file_write_iter(). >> We can't initialized this flag in ocfs2_file_write_iter() since >> it may be invoked several times by do_sync_write(). So we initialize >> it in init_sync_kiocb(), it's also useful for other similiar use of >> it in the future. > I don't see any ocfs2_file_write_iter in the upstream kernel. > ocfs2_file_aio_write most certainly could set ->private to 0, it > will only be called once for a given kiocb. From sys_io_submit->..->io_submit_one->aio_run_iocb->aio_rw_vect_retry, it seems that aio_write could be called two times. See the following scenario. 1. There is a file opened with direct io flag, in aio_rw_vect_retry, aio_write is called first time. If the direct io can not be completed, it will fall back into buffer io, see line 2329 in aio_write. 2. If the very buffer io is a partial write, then it will return back to aio_rw_vect_retry and issue the second aio_write. > > That point aside, I have no issues with setting private to NULL in > init_sync_kiocb. If you fix up the comment to reflect reality > w.r.t.
the upstream kernel source, I'll ack the patch. OK, I will fix the comment. > > Cheers, > Jeff From junxiao.bi at oracle.com Thu May 31 18:44:25 2012 From: junxiao.bi at oracle.com (Junxiao Bi) Date: Fri, 01 Jun 2012 09:44:25 +0800 Subject: [Ocfs2-devel] [PATCH 2/2] ocfs2: clear unaligned io flag when dio fails In-Reply-To: References: <1338437550-24499-1-git-send-email-junxiao.bi@oracle.com> <1338437550-24499-2-git-send-email-junxiao.bi@oracle.com> Message-ID: <4FC81E79.1080003@oracle.com> On 05/31/2012 10:09 PM, Jeff Moyer wrote: > Junxiao Bi writes: > >> The unaligned io flag is set in the kiocb when an unaligned >> dio is issued, it should be cleared even when the dio fails, >> or it may affect the following io which are using the same >> kiocb. > What code is re-using kiocbs, much less re-using them without > re-initializing them? See my comment in another thread. aio_write seems called two times with the same kiocb. > > -Jeff From akinobu.mita at gmail.com Sun May 20 06:24:03 2012 From: akinobu.mita at gmail.com (Akinobu Mita) Date: Sun, 20 May 2012 13:24:03 -0000 Subject: [Ocfs2-devel] [PATCH 01/10] string: introduce memweight Message-ID: <1337520203-29147-1-git-send-email-akinobu.mita@gmail.com> memweight() is the function that counts the total number of bits set in memory area. The memory area doesn't need to be aligned to long-word boundary unlike bitmap_weight(). 
Signed-off-by: Akinobu Mita Cc: Anders Larsen Cc: Alasdair Kergon Cc: dm-devel at redhat.com Cc: linux-fsdevel at vger.kernel.org Cc: Laurent Pinchart Cc: linux-media at vger.kernel.org Cc: Mark Fasheh Cc: Joel Becker Cc: ocfs2-devel at oss.oracle.com Cc: Jan Kara Cc: linux-ext4 at vger.kernel.org Cc: Andrew Morton Cc: Andreas Dilger Cc: "Theodore Ts'o" --- include/linux/string.h | 3 +++ lib/string.c | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 0 deletions(-) diff --git a/include/linux/string.h b/include/linux/string.h index e033564..ffe0442 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -145,4 +145,7 @@ static inline bool strstarts(const char *str, const char *prefix) return strncmp(str, prefix, strlen(prefix)) == 0; } #endif + +extern size_t memweight(const void *ptr, size_t bytes); + #endif /* _LINUX_STRING_H_ */ diff --git a/lib/string.c b/lib/string.c index e5878de..c8b92a0 100644 --- a/lib/string.c +++ b/lib/string.c @@ -26,6 +26,7 @@ #include #include #include +#include #ifndef __HAVE_ARCH_STRNICMP /** @@ -824,3 +825,39 @@ void *memchr_inv(const void *start, int c, size_t bytes) return check_bytes8(start, value, bytes % 8); } EXPORT_SYMBOL(memchr_inv); + +/** + * memweight - count the total number of bits set in memory area + * @ptr: pointer to the start of the area + * @bytes: the size of the area + */ +size_t memweight(const void *ptr, size_t bytes) +{ + size_t w = 0; + size_t longs; + union { + const void *ptr; + const unsigned char *b; + unsigned long address; + } bitmap; + + for (bitmap.ptr = ptr; bytes > 0 && bitmap.address % sizeof(long); + bytes--, bitmap.address++) + w += hweight8(*bitmap.b); + + for (longs = bytes / sizeof(long); longs > 0; ) { + size_t bits = min_t(size_t, INT_MAX & ~(BITS_PER_LONG - 1), + longs * BITS_PER_LONG); + + w += bitmap_weight(bitmap.ptr, bits); + bytes -= bits / BITS_PER_BYTE; + bitmap.address += bits / BITS_PER_BYTE; + longs -= bits / BITS_PER_LONG; + } + + for (; 
bytes > 0; bytes--, bitmap.address++) + w += hweight8(*bitmap.b); + + return w; +} +EXPORT_SYMBOL(memweight); -- 1.7.7.6 From akinobu.mita at gmail.com Sun May 20 06:24:09 2012 From: akinobu.mita at gmail.com (Akinobu Mita) Date: Sun, 20 May 2012 13:24:09 -0000 Subject: [Ocfs2-devel] [PATCH 07/10] ocfs2: use memweight() In-Reply-To: <1337520203-29147-1-git-send-email-akinobu.mita@gmail.com> References: <1337520203-29147-1-git-send-email-akinobu.mita@gmail.com> Message-ID: <1337520203-29147-7-git-send-email-akinobu.mita@gmail.com> Use memweight to count the total number of bits set in memory area. Signed-off-by: Akinobu Mita Cc: Mark Fasheh Cc: Joel Becker Cc: ocfs2-devel at oss.oracle.com --- fs/ocfs2/localalloc.c | 8 ++------ 1 files changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 210c352..a9f78c7 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -784,14 +784,10 @@ bail: static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc) { - int i; - u8 *buffer; - u32 count = 0; + u32 count; struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); - buffer = la->la_bitmap; - for (i = 0; i < le16_to_cpu(la->la_size); i++) - count += hweight8(buffer[i]); + count = memweight(la->la_bitmap, le16_to_cpu(la->la_size)); trace_ocfs2_local_alloc_count_bits(count); return count; -- 1.7.7.6 From akinobu.mita at gmail.com Wed May 23 05:12:21 2012 From: akinobu.mita at gmail.com (Akinobu Mita) Date: Wed, 23 May 2012 12:12:21 -0000 Subject: [Ocfs2-devel] [PATCH 01/10] string: introduce memweight In-Reply-To: <20120523092113.GG10452@quack.suse.cz> References: <1337520203-29147-1-git-send-email-akinobu.mita@gmail.com> <20120523092113.GG10452@quack.suse.cz> Message-ID: 2012/5/23 Jan Kara : > On Sun 20-05-12 22:23:14, Akinobu Mita wrote: >> memweight() is the function that counts the total number of bits set >> in memory area. 
The memory area doesn't need to be aligned to >> long-word boundary unlike bitmap_weight(). > Thanks for the patch. I have some comments below. Thanks for the review. >> @@ -824,3 +825,39 @@ void *memchr_inv(const void *start, int c, size_t bytes) >> return check_bytes8(start, value, bytes % 8); >> } >> EXPORT_SYMBOL(memchr_inv); >> + >> +/** >> + * memweight - count the total number of bits set in memory area >> + * @ptr: pointer to the start of the area >> + * @bytes: the size of the area >> + */ >> +size_t memweight(const void *ptr, size_t bytes) >> +{ >> + size_t w = 0; >> + size_t longs; >> + union { >> + const void *ptr; >> + const unsigned char *b; >> + unsigned long address; >> + } bitmap; > Ugh, this is ugly and mostly unnecessary. Just use "const unsigned char > *bitmap". > >> + >> + for (bitmap.ptr = ptr; bytes > 0 && bitmap.address % sizeof(long); >> + bytes--, bitmap.address++) >> + w += hweight8(*bitmap.b); > This can be: > count = ((unsigned long)bitmap) % sizeof(long); The count should be the size of unaligned area and it can be greater than bytes. So count = min(bytes, sizeof(long) - ((unsigned long)bitmap) % sizeof(long)); > while (count--) { > w += hweight(*bitmap); > bitmap++; > bytes--; > } >> + >> + for (longs = bytes / sizeof(long); longs > 0; ) { >> + size_t bits = min_t(size_t, INT_MAX & ~(BITS_PER_LONG - 1), >> + longs * BITS_PER_LONG); > I find it highly unlikely that someone would have such a large bitmap > (256 MB or more on 32-bit). Also the condition as you wrote it can just > overflow so it won't have the desired effect. Just do > BUG_ON(longs >= ULONG_MAX / BITS_PER_LONG); The bits argument of bitmap_weight() is int type.
So this should be BUG_ON(longs >= INT_MAX / BITS_PER_LONG); > and remove the loop completely. If someone comes with such a huge bitmap, > the code can be modified easily (after really closely inspecting whether > such a huge bitmap is really well justified). size_t memweight(const void *ptr, size_t bytes) { size_t w = 0; size_t longs; const unsigned char *bitmap = ptr; for (; bytes > 0 && ((unsigned long)bitmap) % sizeof(long); bytes--, bitmap++) w += hweight8(*bitmap); longs = bytes / sizeof(long); BUG_ON(longs >= INT_MAX / BITS_PER_LONG); w += bitmap_weight((unsigned long *)bitmap, longs * BITS_PER_LONG); bytes -= longs * sizeof(long); bitmap += longs * sizeof(long); for (; bytes > 0; bytes--, bitmap++) w += hweight8(*bitmap); return w; } From akinobu.mita at gmail.com Thu May 24 04:54:21 2012 From: akinobu.mita at gmail.com (Akinobu Mita) Date: Thu, 24 May 2012 11:54:21 -0000 Subject: [Ocfs2-devel] [PATCH 01/10] string: introduce memweight In-Reply-To: <20120523131559.GA7064@parisc-linux.org> References: <1337520203-29147-1-git-send-email-akinobu.mita@gmail.com> <20120523092113.GG10452@quack.suse.cz> <20120523131559.GA7064@parisc-linux.org> Message-ID: 2012/5/23 Matthew Wilcox : > On Wed, May 23, 2012 at 09:12:18PM +0900, Akinobu Mita wrote: >> size_t memweight(const void *ptr, size_t bytes) > > Why should this return size_t instead of unsigned long? I just use the same type as the bytes argument without mature consideration. If unsigned long is better than size_t, I'll change the return type. >> { >> size_t w = 0; >> size_t longs; >> const unsigned char *bitmap = ptr; >> >> for (; bytes > 0 && ((unsigned long)bitmap) % sizeof(long); >> bytes--, bitmap++) >> w += hweight8(*bitmap); >> >> longs = bytes / sizeof(long); >> BUG_ON(longs >= INT_MAX / BITS_PER_LONG); >> w += bitmap_weight((unsigned long *)bitmap, longs * BITS_PER_LONG); >> bytes -= longs * sizeof(long); >>
bitmap += longs * sizeof(long); >> >> for (; bytes > 0; bytes--, bitmap++) >> w += hweight8(*bitmap); >> >> return w; >> } > > bitmap_weight copes with a bitmask that isn't a multiple of BITS_PER_LONG > in size already. So I think this can be done as: > > unsigned long memweight(const void *s, size_t n) > { > const unsigned char *ptr = s; > unsigned long r = 0; > > while (n > 0 && (unsigned long)ptr % sizeof(long)) { > r += hweight8(*ptr); > n--; > ptr++; > } > > BUG_ON(n >= INT_MAX / 8) > > return r + bitmap_weight((unsigned long *)ptr, n * 8); > } This works perfectly on little-endian machines. But it doesn't work on big-endian machines, if the bottom edge of memory area is not aligned on long word boundary.