[Ocfs2-tools-devel] [PATCH 3/4] Defrag directories

tristan tristan.ye at oracle.com
Tue Mar 23 05:05:00 PDT 2010


Hi Goldwyn,

Comments are inlined.

Goldwyn Rodrigues wrote:
> Directories with holes left be deleted entries are defragged.
> Algorithm:
> 1. Read first extent and coalesce entries.
> 2. Maintain a sliding window of extents, read the next extent
>    and move dirents from last read to the first extent.

I loved your algorithm of sliding window, and buffering;)

> 3. Truncate the directory file to the computed size.
>
> Signed-off-by: Goldwyn Rodrigues <rgoldwyn at suse.de>
>
> ---
>  defrag.ocfs2/Makefile         |    2 +-
>  defrag.ocfs2/defrag.c         |   18 ++-
>  defrag.ocfs2/dir.c            |  335 +++++++++++++++++++++++++++++++++++++++++
>  defrag.ocfs2/include/defrag.h |    3 +
>  4 files changed, 355 insertions(+), 3 deletions(-)
>  create mode 100644 defrag.ocfs2/dir.c
>
> diff --git a/defrag.ocfs2/Makefile b/defrag.ocfs2/Makefile
> index 8bc48e0..ed0bc37 100644
> --- a/defrag.ocfs2/Makefile
> +++ b/defrag.ocfs2/Makefile
> @@ -13,7 +13,7 @@ LIBOCFS2_DEPS = $(TOPDIR)/libocfs2/libocfs2.a
>  
>  CFLAGS += -g
>  
> -CFILES =	defrag.c alloc.c
> +CFILES =	defrag.c alloc.c dir.c
>  
>  HFILES = 	include/defrag.h
>  
> diff --git a/defrag.ocfs2/defrag.c b/defrag.ocfs2/defrag.c
> index 22b71d7..4a5ad7e 100644
> --- a/defrag.ocfs2/defrag.c
> +++ b/defrag.ocfs2/defrag.c
> @@ -48,6 +48,7 @@ static errcode_t check_superblock(struct defrag_state *dst)
>  		printf("The superblock max_slots field is set to 0.\n");
>  		ret = OCFS2_ET_CORRUPT_SUPERBLOCK;
>  	}
> +	dst->fs_generation = di->i_fs_generation;
>  
>  	return ret;
>  }
> @@ -269,14 +270,27 @@ int main(int argc, char **argv)
>  	printf("Pass 1: Clearing unused groups in Allocators\n");
>  	ret = allocators_clear_unused(dst);
>  	if (ret) {
> -		fprintf(stderr, "Error while clearing unused inode groups\n"
> -				"Please execute fsck.ocfs2\n");
>  		retval |= DEFRAG_ERROR;
> +		goto error;
> +	}
> +
> +	printf("Pass 2: Defrag directories\n");
> +	ret = defrag_dirs(dst);
> +	if (ret) {
> +		retval |= DEFRAG_ERROR;
> +		goto error;
>  	}
>  
>  	ocfs2_write_super(dst->dst_fs);
>  	ocfs2_close(dst->dst_fs);
> +	printf("All passes successfully completed.\n");
>  	
>  out:
>  	return retval;
> +
> +error:
> +	fprintf(stderr, "Error while defragging filesystem. Please execute"
> +			"fsck.ocfs2\n");
> +	return retval;
> +
>  }
> diff --git a/defrag.ocfs2/dir.c b/defrag.ocfs2/dir.c
> new file mode 100644
> index 0000000..739d9da
> --- /dev/null
> +++ b/defrag.ocfs2/dir.c
> @@ -0,0 +1,335 @@

Need a GPL license here.

> +#include "defrag.h"
> +#include <inttypes.h>
> +
> +#define MAX_DIR_EXTENTS 256 /* Maximum window size */
> +
> +/*Returns the write offset of the buffer */
> +static int coalesce_dirents(ocfs2_filesys *fs, char *buf,
> +		int offset, int end, struct ocfs2_dir_entry **p)
> +{
> +	struct ocfs2_dir_entry *dirent = NULL, *prev = NULL;
> +	int s_rec_len, write_off = offset, bs, next_blk_lim;
> +	next_blk_lim = bs = fs->fs_blocksize;
> +
> +	while (offset < end) {
> +		dirent = (struct ocfs2_dir_entry *)(buf + offset);
> +		if (dirent->inode == 0) {
> +			offset += dirent->rec_len;
> +			continue;

Oh, here is a bug, we definitely hit deadloop in following case:

1. mkfs with 4k blocksize and 32k clustersize.

2. mkdir a directory, touch 200 files inside.

3. I4 blocks(one cluster, and of course one extent) was allocated to the 
directory, while only 2 blocks were consumed.


think about when we reach the 3rd block, its content was zero'd, we 
therefore run into deadloop here, since the dirent->inode and 
dirent->rec_len of first dentry in the 3rd block were always zero.

It can survive when the extent is not last extent(in that case, we 
consumed all blocks in this extent), however, it definitely failed when 
the directory files only have one extent.

So the solution is, not define the 'end' in terms of the ending of 
extent, just define it as the ending of last consumed block instead.

> +		}
> +		/* Set r_offset to next block start for corrupted blocks*/
> +		if (dirent->rec_len <= 0) {
> +			offset = ((offset + bs - 1) / bs) * bs;
> +			next_blk_lim = offset + bs;
> +			continue;
> +		}
> +
> +		if (prev && prev->inode) {
> +			prev->rec_len = OCFS2_DIR_REC_LEN(prev->name_len);
> +			write_off += prev->rec_len;
> +
> +			/* If doing multi-block - check if moving dirent
> +			   will move it to block boundary
> +			 */
> +			if (write_off + OCFS2_DIR_REC_LEN(dirent->name_len)
> +					> next_blk_lim) {
> +				prev->rec_len += next_blk_lim - write_off;
> +				write_off = next_blk_lim;
> +				next_blk_lim += bs;
> +				verbosef("Reset - off: %u write:%u "
> +					"next_blk:%u d:%u/%u p:%u\n",
> +					offset, write_off, next_blk_lim,
> +					dirent->name_len, dirent->rec_len,
> +					prev->rec_len);
> +			}
> +
> +		}
> +		s_rec_len = dirent->rec_len;
> +		if (write_off < offset) {
> +
> +			memmove(buf + write_off, buf + offset,
> +					OCFS2_DIR_REC_LEN(dirent->name_len));
> +			dirent = (struct ocfs2_dir_entry *)(buf + write_off);
> +		}
> +		verbosef("off: %u w:%u nb:%u d:%u/%u %.*s %llu\n",
> +				offset, write_off, next_blk_lim,
> +				dirent->name_len, dirent->rec_len,
> +				dirent->name_len, dirent->name, dirent->inode);
> +		offset += s_rec_len;
> +		prev = dirent;
> +	}
> +	write_off += OCFS2_DIR_REC_LEN(dirent->name_len);
> +	memset(buf + write_off, 0, end - write_off);
> +	dirent->rec_len = end - write_off +
> +		OCFS2_DIR_REC_LEN(dirent->name_len);
> +	*p = prev;
> +	return write_off;
> +}
> +
> +
> +struct defrag_dir_context {
> +	struct ocfs2_extent_rec *er[MAX_DIR_EXTENTS];
> +	char *buf[MAX_DIR_EXTENTS];
> +	int r_offset;
> +	int w_offset;
> +	int ext_count;
> +	struct ocfs2_dir_entry *prev;
> +	uint64_t dirsize;
> +};
> +
> +enum move_rets {
> +	READER_EXHAUSTED = 0,
> +	WRITER_EXHAUSTED
> +};
> +
> +static enum move_rets move_dirents(ocfs2_filesys *fs,
> +		struct defrag_dir_context *c)
> +{
> +	struct ocfs2_dir_entry *dirent = NULL;
> +	int next_blk_lim, bs, len, idx, r_end, w_end;
> +	char *rbuf, *wbuf;
> +	bs = fs->fs_blocksize;
> +	next_blk_lim = ((c->w_offset + bs - 1)/bs) * bs;
> +
> +	idx = c->ext_count - 1;
> +	r_end = ocfs2_clusters_to_bytes(fs, c->er[idx]->e_leaf_clusters);
> +	w_end = ocfs2_clusters_to_bytes(fs, c->er[0]->e_leaf_clusters);
> +	rbuf = c->buf[idx];
> +	wbuf = c->buf[0];
> +
> +	while (c->r_offset < r_end) {
> +		dirent = (struct ocfs2_dir_entry *)(rbuf + c->r_offset);
> +		if (dirent->rec_len <= 0) {
> +			/* Set r_offset to next block start */
> +			c->r_offset = ((c->r_offset + bs) / bs) * bs;
> +			continue;
> +		}
> +		if (dirent->inode == 0)
> +			goto next;

See, the same issue as above also hurt your move_dirents(), if the 
extent you're dealing is the last extent, now the problem may happen. 
you're still required to be able to treat the 'r_end' as the real end of 
last consumed block...

> +
> +
> +		c->prev->rec_len = OCFS2_DIR_REC_LEN(c->prev->name_len);
> +		len = OCFS2_DIR_REC_LEN(dirent->name_len);
> +		if (c->w_offset + len > w_end) {
> +			c->prev->rec_len += next_blk_lim - c->w_offset;
> +			return WRITER_EXHAUSTED;
> +		}
> +
> +		/* Make sure a block limit does not cut across a
> +		   dirent */
> +		if (c->w_offset + len > next_blk_lim) {
> +			c->prev->rec_len += next_blk_lim - c->w_offset;
> +			c->w_offset = next_blk_lim;
> +			next_blk_lim += bs;
> +		}
> +
> +		verbosef("w %d/%d r %d/%d %.*s i %lu s %lu\n",
> +				c->w_offset, w_end,
> +				c->r_offset, r_end, dirent->name_len,
> +				dirent->name, (uint64_t)dirent->inode,
> +				c->dirsize);
> +		memmove((wbuf + c->w_offset), (rbuf + c->r_offset),
> +				dirent->rec_len);
> +		c->prev = (struct ocfs2_dir_entry *)(wbuf + c->w_offset);
> +		c->w_offset += len;
> +		c->dirsize += len;
> +next:
> +		c->r_offset += dirent->rec_len;
> +	}
> +	return READER_EXHAUSTED;
> +}
> +
> +#ifdef DEBUG
> +static void print_dirents(char *buf, int end)
> +{
> +	struct ocfs2_dir_entry *dirent;
> +	int off = 0;
> +
> +	while (off < end) {
> +		dirent = (struct ocfs2_dir_entry *)(buf+off);
> +		verbosef("o %d/%d %.*s i:%llu\n", off, end,
> +				dirent->name_len, dirent->name, dirent->inode);
> +		if (!dirent->inode)
> +			break;
> +		off += dirent->rec_len;
> +	}
> +
> +}
> +#endif
> +
> +static int defrag_dir(ocfs2_filesys *fs,
> +		struct ocfs2_extent_rec *rec,
> +		int tree_depth, uint32_t ccount, uint64_t ref_blkno,
> +		int ref_recno, void *priv_data)
> +{
> +	struct defrag_dir_context *c = (struct defrag_dir_context *)priv_data;
> +	errcode_t ret = 0;
> +	int i = c->ext_count, nblks, end;
> +	enum move_rets m;
> +
> +	verbosef("Reading extent %llu w:%d r:%d\n", rec->e_blkno, c->w_offset,
> +			c->r_offset);
> +
> +	c->r_offset = 0;
> +	/* allocate/copy because underlying buffer can be freed */
> +	ret = ocfs2_malloc(sizeof(struct ocfs2_extent_rec), &c->er[i]);
> +	if (ret)
> +		goto fail;
> +	memcpy(c->er[i], rec, sizeof(struct ocfs2_extent_rec));
> +	nblks = ocfs2_clusters_to_blocks(fs, rec->e_leaf_clusters);
> +	ret = ocfs2_malloc_blocks(fs->fs_io, nblks, &c->buf[i]);

buf for last extent may need less blocks.

> +	if (ret)
> +		goto fail;
> +	ret = ocfs2_read_blocks(fs, c->er[i]->e_blkno, nblks, c->buf[i]);
> +	if (ret)
> +		goto fail;
> +	c->ext_count++;
> +
> +	if (!i) {
> +		/* First extent, just coalesce and return*/

As I mentioned about, please deal with the 'end' here in following two 
cases:

1. the directory only has one extent, so we only need to 
coalese_dirents(), and the end should be the last consumed block(hint: 
please take i_size into account).


2. when directory has multiple extent, it's fine, since the end of 
extent is equal to the end of last consumed block in that case for first 
extent.

> +		end = nblks * fs->fs_blocksize;
> +		c->w_offset = c->dirsize =
> +			coalesce_dirents(fs, c->buf[0], 0, end, &c->prev);
> +		verbosef("w_offset:%u dirsize %lu\n", c->w_offset, c->dirsize);
> +		return 0;
> +	}
> +
> +	do {
> +		m = move_dirents(fs, c);
> +		if (m == READER_EXHAUSTED)
> +			return 0;
> +		/* else m==WRITER_EXHAUSTED */
> +		nblks = ocfs2_clusters_to_blocks(fs, c->er[0]->e_leaf_clusters);
> +#ifdef DEBUG
> +		print_dirents(c->buf[0],
> +			ocfs2_clusters_to_bytes(fs, c->er[0]->e_leaf_clusters));
> +#endif
> +		io_write_block(fs->fs_io, c->er[0]->e_blkno, nblks, c->buf[0]);
> +		verbosef("Writing extent %llu\n", c->er[0]->e_blkno);
> +		ocfs2_free(&c->buf[0]);
> +		ocfs2_free(&c->er[0]);
> +		/* shift window */
> +		for (i = 0; i < c->ext_count-1; i++) {
> +			c->er[i] = c->er[i+1];
> +			c->buf[i] = c->buf[i+1];
> +		}
> +		c->w_offset = 0;
> +		c->ext_count--;
> +	} while (m == WRITER_EXHAUSTED);
> +
> +	return 0;
> +fail:
> +	if (c->buf[i])
> +		ocfs2_free(&c->buf[i]);
> +	if (c->er[i])
> +		ocfs2_free(&c->er[i]);
> +	return OCFS2_EXTENT_ABORT;
> +}
> +
> +errcode_t defrag_dirs(struct defrag_state *dst)
> +{
> +	ocfs2_inode_scan *scan;
> +	ocfs2_filesys *fs = dst->dst_fs;
> +	uint64_t blkno;
> +	errcode_t ret = 0;
> +	char *buf = NULL;
> +	struct ocfs2_dinode *di;
> +	int offset, end, blockbits, i, nblks, bs;
> +	struct defrag_dir_context dc;
> +	blockbits = OCFS2_RAW_SB(dst->dst_fs->fs_super)->s_blocksize_bits;
> +	bs = fs->fs_blocksize;
> +	ret = ocfs2_malloc_block(fs->fs_io, &buf);
> +	ret = ocfs2_open_inode_scan(fs, &scan);
> +	if (ret)
> +		goto out;
> +
> +	di = (struct ocfs2_dinode *)buf;
> +	memset(&dc, 0, sizeof(struct defrag_dir_context));
> +	for (;;) {
> +		ret = ocfs2_get_next_inode(scan, &blkno, buf);
> +
> +		if (blkno == 0)
> +			break;
> +
> +		if (memcmp(di->i_signature, OCFS2_INODE_SIGNATURE,
> +					strlen(OCFS2_INODE_SIGNATURE)))
> +			continue;
> +
> +		ocfs2_swap_inode_to_cpu(fs, di);
> +
> +		if (di->i_fs_generation != dst->fs_generation)
> +			continue;
> +
> +		if (!(di->i_flags & OCFS2_VALID_FL) ||
> +				(di->i_flags & OCFS2_SYSTEM_FL))
> +			continue;
> +		verbosef("Inode %"PRIu64" blkno %"PRIu64" flags:0x%x "
> +				"dyn_flags:0x%x\n",
> +				(uint64_t)di->i_blkno, blkno,
> +				di->i_flags, di->i_dyn_features);
> +		dst->num_inodes++;
> +		if (S_ISDIR(di->i_mode)) {
> +			if (di->i_dyn_features & OCFS2_INLINE_DATA_FL) {
> +				struct ocfs2_dir_entry *prev = NULL;
> +				offset = offsetof(struct ocfs2_dinode,
> +						id2.i_data.id_data);
> +				end = fs->fs_blocksize;
> +				coalesce_dirents(fs, buf, offset, end, &prev);
> +				ocfs2_write_inode(fs, blkno, (char *)di);
> +				continue;
> +			}
> +			memset(&dc, 0, sizeof(struct defrag_dir_context));
> +			ret = ocfs2_extent_iterate_inode(dst->dst_fs, di,
> +					OCFS2_EXTENT_FLAG_DATA_ONLY, NULL,
> +					defrag_dir, &dc);
> +			if (ret)
> +				goto out;
> +
> +			end = ocfs2_clusters_to_bytes(fs,
> +					dc.er[0]->e_leaf_clusters);
> +			memset(dc.buf[0] + dc.w_offset, 0, end - dc.w_offset);
> +
> +			/*Set the last prev */
> +			end = ((dc.w_offset + bs - 1) / bs) * bs;
> +			verbosef("w_off %u end %u bb:%d l %d i %llu\n",
> +					dc.w_offset, end, blockbits,
> +					dc.prev->rec_len, dc.prev->inode);
> +			dc.prev->rec_len = end - dc.w_offset +
> +				OCFS2_DIR_REC_LEN(dc.prev->name_len);
> +			nblks = ocfs2_clusters_to_blocks(fs,
> +					dc.er[0]->e_leaf_clusters);
> +			io_write_block(fs->fs_io, dc.er[0]->e_blkno,
> +					nblks, dc.buf[0]);
> +
> +			/*Making sure all the rest are set to zeros*/
> +			for (i = 1; i < dc.ext_count; i++) {
> +				nblks = ocfs2_clusters_to_blocks(fs,
> +						dc.er[i]->e_leaf_clusters);
> +				end = ocfs2_clusters_to_bytes(fs,
> +						dc.er[i]->e_leaf_clusters);
> +				verbosef("Clearing i: %d end %u\n", i, end);
> +				memset(dc.buf[i], 0, end);
> +				io_write_block(fs->fs_io, dc.er[i]->e_blkno,
> +						nblks, dc.buf[i]);
> +				ocfs2_free(&dc.buf[i]);
> +				ocfs2_free(&dc.er[i]);

Oh, I'm curious where did you release the dc.buf[0] and dc.er[0]?

> +			}
> +
> +			dc.dirsize = ((dc.dirsize + bs - 1) / bs) * bs;
> +			verbosef("Truncating %lu from %lu to %lu\n",
> +				(uint64_t)di->i_blkno, (uint64_t)di->i_size,
> +				dc.dirsize);
> +			if (dc.dirsize < di->i_size)
> +				/* XXX can we call truncate if the inode is
> +				   read in memory */
> +				ocfs2_truncate(dst->dst_fs, di->i_blkno,
> +						dc.dirsize);
> +		}
> +	}
> +out:
> +	if (scan)
> +		ocfs2_close_inode_scan(scan);
> +	ocfs2_free(&buf);
> +	return ret;
> +}
> +
> diff --git a/defrag.ocfs2/include/defrag.h b/defrag.ocfs2/include/defrag.h
> index e845893..08b12c7 100644
> --- a/defrag.ocfs2/include/defrag.h
> +++ b/defrag.ocfs2/include/defrag.h
> @@ -33,6 +33,8 @@
>  
>  struct defrag_state {
>  	ocfs2_filesys 	*dst_fs;
> +	uint32_t        fs_generation;
> +	int num_inodes;
>  };
>  
>  extern int verbose;
> @@ -43,6 +45,7 @@ extern int verbose;
>  
>  
>  errcode_t allocators_clear_unused(struct defrag_state *);
> +errcode_t defrag_dirs(struct defrag_state *dst);
>  
>  #endif /* __OCFS2_DEFRAG_H__ */
>  




More information about the Ocfs2-tools-devel mailing list