[Ocfs2-tools-devel] [PATCH 4/4] defrag.ocfs2: Pass 3: Defrag filesystem

Goldwyn Rodrigues rgoldwyn at gmail.com
Tue May 11 21:02:45 PDT 2010


Defrags the filesystem. For each group_desc, scan for holes.
If a hole is found, try and find extent rec which fits the hole best
and is greater than the block number of the hole. Move the data in
the hole.

TODO:
* If a good fit is not found for a group_desc, try to shift
  the data to the top of the group_desc and try again.
* Currently, the extents scanned are from inodes with depths=0 only.
  This must be changed to scan inodes greater than zero-depth.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn at suse.de>
---
 defrag.ocfs2/fs.c |  602 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 602 insertions(+), 0 deletions(-)
 create mode 100644 defrag.ocfs2/fs.c

diff --git a/defrag.ocfs2/fs.c b/defrag.ocfs2/fs.c
new file mode 100644
index 0000000..86fccbf
--- /dev/null
+++ b/defrag.ocfs2/fs.c
@@ -0,0 +1,602 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+   * vim: noexpandtab sw=8 ts=8 sts=0:
+   *
+   * fs.c
+   *
+   * Copyright (C) 2010 Oracle. All rights reserved.
+
+   *
+   * This program is free software; you can redistribute it and/or
+   * modify it under the terms of the GNU General Public
+   * License version 2 as published by the Free Software Foundation.
+   *
+   * This program is distributed in the hope that it will be useful,
+   * but WITHOUT ANY WARRANTY; without even the implied warranty of
+   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   * General Public License for more details.
+   */
+
+#include "defrag.h"
+#include <inttypes.h>
+#include "ocfs2/bitops.h"
+
+extern char *whoami;
+
+#ifdef DEBUG
+static void print_gd(struct ocfs2_group_desc *gd)
+{
+	int i, c;
+	printf("gd blkno %"PRIu64" size %d bits %d free:%d\n",
+			(uint64_t)gd->bg_blkno,
+			gd->bg_size, gd->bg_bits, gd->bg_free_bits_count);
+	for (i=0; i<gd->bg_size; i+=4) {
+		c = ((int) gd->bg_bitmap[i] << 24) +
+			((int)gd->bg_bitmap[i+1] << 16) +
+			((int)gd->bg_bitmap[i+2] << 8) +
+			(int)gd->bg_bitmap[i+3];
+		printf ("%9x", c);
+		if (!(i%32))
+			printf(" %"PRIu64"\n", (uint64_t)gd->bg_blkno + i*8);
+	}
+	printf("\n");
+}
+#endif
+
+/* A simple selection sort for now */
+static void sort_gds(struct defrag_state *dst)
+{
+	int i, j, n=dst->num_gds;
+	struct ocfs2_group_desc **gd = dst->gds;
+	struct ocfs2_group_desc *t;
+
+	for (i=0; i<n-1; i++)
+		for (j=i+1; j<n; j++)
+			if (gd[i]->bg_blkno > gd[j]->bg_blkno) {
+				t = gd[i];
+				gd[i]=gd[j];
+				gd[j] = t;
+			}
+}
+
+static errcode_t cr_read_gds(struct defrag_state *dst,
+		struct ocfs2_chain_rec *cr)
+{
+	ocfs2_filesys *fs = dst->dst_fs;
+	uint64_t blkno = 0;
+	errcode_t ret;
+	struct ocfs2_group_desc *gd = NULL;
+	char *buf = NULL;
+
+	verbosef("free %d tot %d\n", cr->c_free, cr->c_total);
+
+	blkno = cr->c_blkno;
+	while (blkno) {
+		ret = ocfs2_malloc_block(fs->fs_io, &buf);
+		if (ret) {
+			com_err(whoami, ret, "while allocating block\n");
+			goto out;
+		}
+		verbosef("Reading gd[%d] %"PRIu64"\n", dst->num_gds,
+				blkno);
+		ret = ocfs2_read_group_desc(fs, blkno, buf);
+		if (ret) {
+			com_err(whoami, ret, "while reading group\n");
+			goto out;
+		}
+		gd = (struct ocfs2_group_desc *)buf;
+		dst->gds[dst->num_gds++] = gd;
+		blkno = gd->bg_next_group;
+	}
+out:
+	return ret;
+}
+
+
+/*Read all gds and store in the dst structure */
+static errcode_t read_gds(struct defrag_state *dst)
+{
+	ocfs2_filesys *fs = dst->dst_fs;
+	uint64_t blkno;
+	struct ocfs2_dinode *di;
+	struct ocfs2_chain_list *cl;
+	char *buf;
+	int i, n;
+	errcode_t ret = 0;
+
+	ret = ocfs2_lookup_system_inode(fs, GLOBAL_BITMAP_SYSTEM_INODE,
+			0, &blkno);
+	if (ret) {
+		com_err(whoami, ret, "while looking up system inode\n");
+		return ret;
+	}
+	ret = ocfs2_malloc_block(fs->fs_io, &buf);
+	if (ret) {
+		com_err(whoami, ret, "while allocating block\n");
+		return ret;
+	}
+	ret = ocfs2_read_inode(fs, blkno, buf);
+	if (ret) {
+		com_err(whoami, ret, "while reading inode\n");
+		ocfs2_free(&buf);
+		return ret;
+	}
+
+	di = (struct ocfs2_dinode *)buf;
+	cl = &di->id2.i_chain;
+	dst->max_gds = di->i_clusters / cl->cl_cpg;
+	dst->max_gds++;
+	ret = ocfs2_malloc(sizeof(char *) * dst->max_gds, &dst->gds);
+	if (ret) {
+		com_err(whoami, ret, "while allocating block\n");
+		return ret;
+	}
+	memset(dst->gds, 0, sizeof(char *) * dst->max_gds);
+	dst->num_gds = 0;
+
+	n = ocfs2_chain_recs_per_inode(fs->fs_blocksize);
+	n--;
+	while ((cl->cl_recs[n].c_blkno==0) && n>0)
+		n--;
+
+	verbosef("Total gds: %d recs: %d\n", dst->max_gds, n);
+
+	for(i=0; i<=n; i++) {
+		struct ocfs2_chain_rec *cr = &cl->cl_recs[i];
+		ret = cr_read_gds(dst, cr);
+		if (ret) {
+			com_err(whoami, ret, "while reading group descs\n");
+			return ret;
+		}
+	}
+
+	dst->global_bitmap_inode = di;
+	return 0;
+}
+
+static errcode_t get_num_inodes(struct defrag_state *dst)
+{
+	ocfs2_inode_scan *scan;
+	ocfs2_filesys *fs = dst->dst_fs;
+	uint64_t blkno;
+	errcode_t ret = 0;
+	char *buf = NULL;
+	struct ocfs2_dinode *di;
+
+	ret = ocfs2_malloc_block(fs->fs_io, &buf);
+	if (ret) {
+		com_err(whoami, ret, "while allocating block\n");
+		goto out;
+	}
+
+	ret = ocfs2_open_inode_scan(fs, &scan);
+	if (ret) {
+		com_err(whoami, ret, "while scanning for inodes\n");
+		goto out;
+	}
+	ret = ocfs2_malloc(sizeof(struct ocfs2_dinode *) * dst->num_inodes,
+			&dst->inodes);
+	if (ret) {
+		com_err(whoami, ret, "while allocating block\n");
+		goto out_close_scan;
+	}
+	memset(dst->inodes, 0, sizeof(void *) * dst->num_inodes);
+
+	di = (struct ocfs2_dinode *)buf;
+	for (;;) {
+		ret = ocfs2_get_next_inode(scan, &blkno, buf);
+		if (ret) {
+			com_err(whoami, ret, "while getting next inode\n");
+			goto out_close_scan;
+		}
+
+		if (blkno == 0)
+			break;
+
+		if (memcmp(di->i_signature, OCFS2_INODE_SIGNATURE,
+					strlen(OCFS2_INODE_SIGNATURE)))
+			continue;
+
+		ocfs2_swap_inode_to_cpu(fs, di);
+
+		if (di->i_fs_generation != dst->fs_generation)
+			continue;
+
+		if (!(di->i_flags & OCFS2_VALID_FL) ||
+				(di->i_flags & OCFS2_SYSTEM_FL))
+			continue;
+
+		dst->num_inodes++;
+	}
+
+out_close_scan:
+	ocfs2_close_inode_scan(scan);
+out:
+	ocfs2_free(&buf);
+	return ret;
+}
+
+static errcode_t read_inodes(struct defrag_state *dst)
+{
+	ocfs2_inode_scan *scan;
+	ocfs2_filesys *fs = dst->dst_fs;
+	uint64_t blkno;
+	errcode_t ret = 0;
+	char *buf = NULL, *tbuf = NULL;
+	struct ocfs2_dinode *di;
+	int ninodes = 0;
+
+
+	ret = ocfs2_malloc_block(fs->fs_io, &buf);
+	if (ret) {
+		com_err(whoami, ret, "while allocating block\n");
+		return ret;
+	}
+
+	ret = ocfs2_open_inode_scan(fs, &scan);
+	if (ret) {
+		com_err(whoami, ret, "while scanning for inodes\n");
+		goto out;
+	}
+	ret = ocfs2_malloc(sizeof(struct ocfs2_dinode *) * dst->num_inodes,
+			&dst->inodes);
+	if (ret)
+		goto out_close_scan;
+	memset(dst->inodes, 0, sizeof(void *) * dst->num_inodes);
+
+
+	di = (struct ocfs2_dinode *)buf;
+
+	for (;;) {
+		ret = ocfs2_get_next_inode(scan, &blkno, buf);
+		if (ret)
+			goto out_close_scan;
+
+		if (blkno == 0)
+			break;
+
+		if (memcmp(di->i_signature, OCFS2_INODE_SIGNATURE,
+					strlen(OCFS2_INODE_SIGNATURE)))
+			continue;
+
+		ocfs2_swap_inode_to_cpu(fs, di);
+
+		if (di->i_fs_generation != dst->fs_generation)
+			continue;
+
+		if (!(di->i_flags & OCFS2_VALID_FL) ||
+				(di->i_flags & OCFS2_SYSTEM_FL))
+			continue;
+
+		verbosef("Inode %"PRIu64" blkno %"PRIu64" flags:0x%x "
+				"dyn_flags:0x%x\n",
+				(uint64_t)di->i_blkno, (uint64_t)blkno,
+				di->i_flags, di->i_dyn_features);
+
+		ret = ocfs2_malloc_block(fs->fs_io, &tbuf);
+		if (ret)
+			goto out_close_scan;
+		memcpy(tbuf, buf, fs->fs_blocksize);
+		dst->inodes[ninodes++] = (struct ocfs2_dinode *) tbuf;
+		if (ninodes>dst->num_inodes) {
+			fprintf(stderr, "Incorrect number of inodes "
+					"found in previous pass.\n");
+			goto out_close_scan;
+		}
+	}
+
+out_close_scan:
+	ocfs2_close_inode_scan(scan);
+out:
+	if (buf)
+		ocfs2_free(&buf);
+	return ret;
+}
+
+static struct ocfs2_extent_rec *find_di(struct defrag_state *dst,
+		uint64_t blkno,	struct ocfs2_dinode **di)
+{
+	struct ocfs2_dinode **inode = dst->inodes;
+	struct ocfs2_extent_rec *rec, *er = NULL;
+	struct ocfs2_extent_list *el;
+	int i, j;
+
+	verbosef("Finding di associated with %"PRIu64"\n", blkno);
+	for (i=dst->num_inodes-1; i>=0; i--) {
+		el = &inode[i]->id2.i_list;
+		if (el->l_tree_depth)
+			continue;
+		for (j=0; j<el->l_count; j++) {
+			rec = &el->l_recs[j];
+			if (rec->e_blkno == blkno) {
+				er = rec;
+				break;
+			}
+		}
+	}
+	return er;
+}
+
+static struct ocfs2_extent_rec *find_best_fit_extent(
+		struct defrag_state *dst, int size,
+		uint64_t blkno, struct ocfs2_dinode **di)
+{
+	struct ocfs2_dinode **inode = dst->inodes;
+	struct ocfs2_extent_rec *rec, *er = NULL;
+	struct ocfs2_extent_list *el;
+	int i, j, nb, cblks = -1;
+	*di = NULL;
+
+	for (i=dst->num_inodes-1; i>=0; i--) {
+		el = &inode[i]->id2.i_list;
+		/* XXX - temporarily work only on inodes of depth 0 */
+		if (el->l_tree_depth)
+			continue;
+		for(j=0; j<el->l_count; j++) {
+			rec = &el->l_recs[j];
+			nb = ocfs2_clusters_to_blocks(dst->dst_fs,
+					rec->e_leaf_clusters);
+			if ((rec->e_blkno > blkno) && (nb <= size) &&
+				((cblks==-1) ||	(cblks < nb))) {
+				*di = inode[i];
+				if (nb == size)
+					return rec;
+				er = rec;
+				cblks = nb;
+			}
+		}
+	}
+	return er;
+}
+
+
+static struct ocfs2_group_desc *find_gd(struct defrag_state *dst,
+		uint64_t blkno)
+{
+	struct ocfs2_group_desc **gd = dst->gds;
+	int i;
+	uint64_t end;
+	verbosef("Finding gd corresponding to %"PRIu64"\n", blkno);
+	for (i = dst->num_gds - 1; i>=0; i--) {
+		end = gd[i]->bg_blkno +
+			ocfs2_clusters_to_blocks(dst->dst_fs, gd[i]->bg_bits);
+		if (gd[i] && (blkno > gd[i]->bg_blkno) &&
+				(blkno < end))
+			return gd[i];
+	}
+	return NULL;
+}
+
+
+
+
+static errcode_t gd_bitmap_set_value(struct defrag_state *dst,
+		struct ocfs2_group_desc *gd,
+		int start, int len, int value)
+{
+	int i;
+	struct ocfs2_chain_list *cl = &dst->global_bitmap_inode->id2.i_chain;
+	struct ocfs2_chain_rec *cr;
+
+	for (i=start; i<start+len; i++)
+		if (value)
+			ocfs2_set_bit(i, gd->bg_bitmap);
+		else
+			ocfs2_clear_bit(i, gd->bg_bitmap);
+
+
+	i = gd->bg_chain;
+	cr = &cl->cl_recs[i];
+	if (value) {
+		cr->c_free-=len;
+		gd->bg_free_bits_count-=len;
+	}
+	else {
+		cr->c_free+=len;
+		gd->bg_free_bits_count+=len;
+	}
+	return ocfs2_write_group_desc(dst->dst_fs, gd->bg_blkno, (char *)gd);
+}
+
+
+static errcode_t gd_set_bits(struct defrag_state *dst, struct
ocfs2_group_desc *gd,
+		int start, int len)
+{
+	return gd_bitmap_set_value(dst, gd, start, len, 1);
+}
+
+
+static errcode_t gd_clear_bits(struct defrag_state *dst,
+		struct ocfs2_group_desc *gd,
+		int start, int len)
+{
+	return gd_bitmap_set_value(dst, gd, start, len, 0);
+}
+
+
+static errcode_t move_extent(struct defrag_state *dst,
+		struct ocfs2_group_desc *from_gd,
+		struct ocfs2_group_desc *to_gd, struct
+		ocfs2_extent_rec *rec, int to_offset)
+{
+	char *buf;
+	errcode_t ret;
+	uint64_t from = rec->e_blkno, to;
+	uint64_t first_gd_blkno =
+		(OCFS2_RAW_SB(dst->dst_fs->fs_super))->s_first_cluster_group;
+	ocfs2_filesys *fs = dst->dst_fs;
+	int len = ocfs2_clusters_to_blocks(fs, rec->e_leaf_clusters);
+
+	if (to_gd->bg_blkno == first_gd_blkno)
+		to = to_offset;
+	else
+		to = to_offset + to_gd->bg_blkno;
+	verbosef("Moving %d blocks from %"PRIu64" to %"PRIu64"\n",
+			len, from, to);
+
+	ret = ocfs2_malloc_blocks(fs->fs_io, len, &buf);
+	if (ret)
+		return ret;
+	ret = ocfs2_read_blocks(fs, from, len, buf);
+	if (ret)
+		goto out;
+	ret = io_write_block(fs->fs_io, to, len, buf);
+	if (ret)
+		goto out;
+
+	/* First clear and then set, to accomodate same gds.
+	   XXX If any of the following fails and to/from gds are same
+	   it will lead to data corruption.
+	 */
+	ret = gd_clear_bits(dst, from_gd,
+			ocfs2_blocks_to_clusters(fs, from - from_gd->bg_blkno),
+			rec->e_leaf_clusters);
+	if (ret)
+		goto out;
+	ret = gd_set_bits(dst, to_gd,
+			ocfs2_blocks_to_clusters(fs, to_offset),
+			rec->e_leaf_clusters);
+	rec->e_blkno = to;
+out:
+	ocfs2_free(&buf);
+	return ret;
+}
+
+static errcode_t defrag_gd(struct defrag_state *dst,
+		struct ocfs2_group_desc *gd)
+{
+	int start = 0, end = 0, changed = 0, bstart, bend, last_hole=0;
+	struct ocfs2_extent_rec *er;
+	struct ocfs2_dinode *di;
+	struct ocfs2_group_desc *from_gd;
+	errcode_t ret = 0;
+
+	while (end < gd->bg_bits) {
+		start = ocfs2_find_next_bit_clear (gd->bg_bitmap,
+				gd->bg_bits, end);
+
+		if (start > gd->bg_bits)
+			break;
+
+		end = ocfs2_find_next_bit_set(gd->bg_bitmap, gd->bg_bits, start);
+
+		if (end > gd->bg_bits)
+			break;
+
+		bstart = ocfs2_clusters_to_blocks(dst->dst_fs, start);
+		bend = ocfs2_clusters_to_blocks(dst->dst_fs, end);
+		verbosef("gd %"PRIu64" start %d end %d bstart %d bend %d\n",
+				(uint64_t) gd->bg_blkno, start, end,
+				bstart, bend);
+
+		/* Avoid best fit if last_hole is equal to current one */
+
+		if (last_hole >= bend - bstart)
+			goto no_best_fit;
+
+		er = NULL;
+		di = NULL;
+		er = find_best_fit_extent(dst, bend - bstart,
+				gd->bg_blkno + bstart, &di);
+
+		if (er) {
+			from_gd = find_gd(dst, er->e_blkno);
+			if (from_gd) {
+				ret = move_extent(dst, from_gd, gd, er,
+						bstart);
+				if (ret)
+					break;
+				/* XXX For efficiency, try to move
+				 * the rest of the file to
+				 * this gd, if space allows */
+				ret = ocfs2_write_inode(dst->dst_fs,
+						di->i_blkno , (char *) di);
+				if (ret)
+					break;
+				changed = 1;
+				last_hole = 0;
+			}
+			continue;
+		}
+
+no_best_fit:
+		if (end >= gd->bg_bits)
+			break;
+
+		/* Best fit not found. Move the following extent to cover the
+		 * hole */
+		er = NULL;
+		di = NULL;
+		er = find_di(dst, bend, &di);
+		if (er) {
+			ret = move_extent(dst, gd, gd, er, bstart);
+			if (ret)
+				break;
+			ret = ocfs2_write_inode(dst->dst_fs, di->i_blkno,
+					(char *)di);
+			if (ret)
+				break;
+			changed = 1;
+			last_hole = bend - bstart;
+		} else
+			verbosef("%"PRIu64" not found\n", bend);
+		
+	}
+	verbosef("Writing gd %"PRIu64"\n", (uint64_t)gd->bg_blkno);
+	if (changed)
+		ret = ocfs2_write_group_desc(dst->dst_fs,
+				gd->bg_blkno, (char *)gd);
+	return ret;
+}
+
+static errcode_t defrag_gds(struct defrag_state *dst)
+{
+	int i;
+	errcode_t ret = 0;
+	struct ocfs2_group_desc **gd = dst->gds;
+
+	for (i=0; i<dst->num_gds; i++)
+		if (gd[i]->bg_free_bits_count != 0)
+			ret = defrag_gd(dst, gd[i]);
+	return ret;
+}
+
+
+errcode_t defrag_fs(struct defrag_state *dst)
+{
+	errcode_t ret = read_gds(dst);
+	sort_gds(dst);
+#ifdef DEBUG
+	int i;
+	for (i=0; i<dst->num_gds; i++)
+		print_gd(dst->gds[i]);
+#endif
+	/* We did not run the previous passes, so get the num_inodes */
+	if (!dst->num_inodes) {
+		ret = get_num_inodes(dst);
+		if (ret) {
+			com_err(whoami, ret, "while getting number of inodes\n");
+			goto out;
+		}
+	}
+	ret = read_inodes(dst);
+	if (ret) {
+		com_err(whoami, ret, "while reading indoes\n");
+		goto out;
+	}
+
+	ret = defrag_gds(dst);
+	if (ret) {
+		com_err(whoami, ret, "while defragging gds\n");
+		goto out;
+	}
+
+#ifdef DEBUG
+	for (i=0; i<dst->num_gds; i++)
+		print_gd(dst->gds[i]);
+#endif
+	ocfs2_write_inode(dst->dst_fs, dst->global_bitmap_inode->i_blkno,
+			(char *)dst->global_bitmap_inode);
+out:
+	return ret;
+}
+
-- 
1.6.4.2



More information about the Ocfs2-tools-devel mailing list