[Ocfs2-tools-devel] [PATCH 2/4] defrag.ocfs2: Defrag files

Goldwyn Rodrigues rgoldwyn at gmail.com
Tue Sep 7 20:49:16 PDT 2010


Algorithm:
Allocate the maximum possible extent and copy the file into the
newly allocated extent. If the allocated extent is smaller than
the extent read from the file, write the collected data, and
re-use the file extent. At any point, if the number of extents
in the new inode increases than in the existing inode, abort.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn at suse.de>
---
 defrag.ocfs2/Makefile         |    2 +-
 defrag.ocfs2/defrag.c         |   11 +
 defrag.ocfs2/file.c           |  511 +++++++++++++++++++++++++++++++++++++++++
 defrag.ocfs2/include/defrag.h |    3 +
 4 files changed, 526 insertions(+), 1 deletions(-)
 create mode 100644 defrag.ocfs2/file.c

diff --git a/defrag.ocfs2/Makefile b/defrag.ocfs2/Makefile
index 2079d0c..e4314b8 100644
--- a/defrag.ocfs2/Makefile
+++ b/defrag.ocfs2/Makefile
@@ -17,7 +17,7 @@ LIBO2CB_DEPS = $(TOPDIR)/libo2cb/libo2cb.a

 CFLAGS += -g

-CFILES =	defrag.c
+CFILES =	defrag.c file.c

 HFILES = 	include/defrag.h

diff --git a/defrag.ocfs2/defrag.c b/defrag.ocfs2/defrag.c
index 4238a4a..6b9ddd8 100644
--- a/defrag.ocfs2/defrag.c
+++ b/defrag.ocfs2/defrag.c
@@ -329,6 +329,17 @@ int main(int argc, char **argv)
 	if (!(dst->pass1 || dst->pass2))
 		dst->pass1 = dst->pass2 = 1;

+	if (dst->pass1) {
+		printf("Pass 1: Defrag files and directories\n");
+		ret = defrag_files_and_dirs(dst);
+		if (ret) {
+		    fprintf(stderr, "Error while defraging individual files\n"
+				"Please	execute	fsck.ocfs2\n");
+		    retval |= DEFRAG_ERROR;
+		    goto close;
+		}
+	}
+
 close:
 	block_signals(SIG_BLOCK);
 	if (dst->dst_fs->fs_dlm_ctxt)
diff --git a/defrag.ocfs2/file.c b/defrag.ocfs2/file.c
new file mode 100644
index 0000000..01bf993
--- /dev/null
+++ b/defrag.ocfs2/file.c
@@ -0,0 +1,511 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+   * vim: noexpandtab sw=8 ts=8 sts=0:
+   *
+   * file.c
+   *
+   * Copyright (C) 2010 Novell. All rights reserved.
+   *
+   * This program is free software; you can redistribute it and/or
+   * modify it under the terms of the GNU General Public
+   * License version 2 as published by the Free Software Foundation.
+   *
+   * This program is distributed in the hope that it will be useful,
+   * but WITHOUT ANY WARRANTY; without even the implied warranty of
+   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   * General Public License for more details.
+   */
+
+#include "defrag.h"
+#include <inttypes.h>
+
+extern char *whoami;
+
+static errcode_t calc_num_extents(ocfs2_filesys *fs,
+		struct ocfs2_extent_list *el,
+		uint32_t *ne)
+{
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_rec *rec;
+	errcode_t ret = 0;
+	char *buf = NULL;
+	int i;
+	uint32_t clusters;
+	uint32_t extents = 0;
+
+	ret = ocfs2_malloc_block(fs->fs_io, &buf);
+	if (ret)
+		goto bail;
+
+	*ne = 0;
+	for (i = 0; i < el->l_next_free_rec; ++i) {
+		rec = &(el->l_recs[i]);
+		clusters = ocfs2_rec_clusters(el->l_tree_depth, rec);
+
+		/*
+		 * In a unsuccessful insertion, we may shift a tree
+		 * add a new branch for it and do no insertion. So we
+		 * may meet a extent block which have
+		 * clusters == 0, this should only be happen
+		 * in the last extent rec. */
+		if (!clusters && i == el->l_next_free_rec - 1)
+			break;
+
+		extents = 1;
+
+		if (el->l_tree_depth) {
+			memset(buf, 0, fs->fs_blocksize);
+
+			ret = ocfs2_read_extent_block(fs, rec->e_blkno, buf);
+			if (ret)
+				goto bail;
+
+			eb = (struct ocfs2_extent_block *)buf;
+
+			ret = calc_num_extents(fs, &(eb->h_list), &extents);
+			if (ret)
+				goto bail;
+
+		}
+		*ne = *ne + extents;
+	}
+
+bail:
+	if (buf)
+		ocfs2_free(&buf);
+	return ret;
+}
+
+#define MAX_RECS	4096
+
+struct save_cluster {
+	struct {
+		uint64_t start;
+		uint32_t len;
+	} rec[MAX_RECS];
+	int num_recs;
+};
+
+static errcode_t free_extents(ocfs2_filesys *fs, uint32_t len,
+		uint64_t start, void *private)
+{
+	struct save_cluster *sc = (struct save_cluster *)private;
+	int i;
+	for (i = 0; i < sc->num_recs; i++) {
+		if ((start == sc->rec[i].start) && (len == sc->rec[i].len)) {
+			verbosef("Preserved %"PRIu64" l %d\n",
+				sc->rec[i].start, sc->rec[i].len);
+			return 0;
+		}
+	}
+	/* TODO: change call ocfs2_truncate_clusters to incorporate
+	   refcounting */
+	verbosef("Generic free: %"PRIu64" l %d\n", start, len);
+	return ocfs2_free_clusters(fs, len, start);
+}
+
+
+static errcode_t copy_clusters(ocfs2_filesys *fs,
+		uint64_t from, uint64_t to, int n)
+{
+	char *buf = NULL;
+	int i, c = ocfs2_clusters_in_blocks(fs, 1);
+	errcode_t ret = ocfs2_malloc_blocks(fs->fs_io, c, &buf);
+	verbosef("Writing %d clusters to %"PRIu64" from %"PRIu64"\n", n,
+			to, from);
+	if (ret)
+		return ret;
+	for (i = 0; i < n; i++) {
+		ret = ocfs2_read_blocks(fs, from + i*c, c, buf);
+		if (ret)
+			goto out;
+		ret = io_write_block(fs->fs_io, to + i*c, c, buf);
+		if (ret)
+			goto out;
+	}
+out:
+	ocfs2_free(&buf);
+	return ret;
+}
+
+
+struct defrag_file_context {
+	struct defrag_state *dst;
+	ocfs2_cached_inode *new_inode;
+	struct ocfs2_dinode *old_inode;
+	uint64_t blkno;
+	/* Maintained in clusters */
+	unsigned int c_offset;
+	unsigned int c_end;
+	int clusters_left;
+	int cpos;
+	int expected_cpos;
+	unsigned int num_extents;
+	unsigned int orig_extents;
+	int status;
+	struct save_cluster sc;
+};
+
+#define MIN_REQUEST	1
+
+static errcode_t alloc_clusters(struct defrag_file_context *ctxt)
+{
+	errcode_t ret;
+	int request = ctxt->clusters_left;
+
+	if (request <= 0) {
+		verbosef("Requested: %d\n", request);
+		return OCFS2_ET_INVALID_ARGUMENT;
+	}
+
+	ret = ocfs2_new_clusters(ctxt->dst->dst_fs, MIN_REQUEST,
+			ctxt->clusters_left, &ctxt->blkno, &ctxt->c_end);
+	verbosef("alloc'd %d clusters req %d/%d at %"PRIu64"\n", ctxt->c_end,
+			request, ctxt->clusters_left, ctxt->blkno);
+	ctxt->c_offset = 0;
+	return ret;
+}
+
+static errcode_t free_clusters(struct defrag_file_context *ctxt)
+{
+	errcode_t ret = 0;
+	int l = ctxt->c_end - ctxt->c_offset;
+	/* Discard any remaining clusters */
+	if (l) {
+		uint64_t blk = ctxt->blkno +
+			ocfs2_clusters_to_blocks(ctxt->dst->dst_fs,
+				ctxt->c_offset);
+		verbosef("Freeing %"PRIu64" l %d\n", blk, l);
+		ret = ocfs2_free_clusters(ctxt->dst->dst_fs, l, blk);
+	}
+	ctxt->c_offset = ctxt->c_end = 0;
+	return ret;
+}
+
+static int copy_file_data(ocfs2_filesys *fs,
+		struct ocfs2_extent_rec *rec,
+		int tree_depth, uint32_t ccount, uint64_t ref_blkno,
+		int ref_recno, void *private)
+{
+	errcode_t ret;
+	struct defrag_file_context *ctxt =
+				(struct defrag_file_context *)private;
+	int nc, left, num_blocks = ocfs2_clusters_to_blocks(fs, 1);
+
+	verbosef("nc %d offset %d/%d pos %"PRIu64" blkno %"PRIu64
+			" exts %d/%d\n",
+			rec->e_leaf_clusters, ctxt->c_offset, ctxt->c_end,
+			(uint64_t)ctxt->cpos, (uint64_t)ctxt->blkno,
+			ctxt->num_extents, ctxt->orig_extents);
+
+	/* If our cluster length is more than existing
+	   file num extents, abort or we'll make it worse! */
+	if (ctxt->num_extents > ctxt->orig_extents) {
+		verbosef("Extents exceeded orig %d now %d cpos %d\n",
+			ctxt->orig_extents, ctxt->num_extents, ctxt->cpos);
+		ctxt->status = DEFRAG_EXTENTS_EXCEEDED;
+		return OCFS2_EXTENT_ABORT;
+	}
+
+	/* Handle holes */
+	if (ctxt->expected_cpos < rec->e_cpos) {
+		verbosef("Hole detected expected/rec cpos %d/%d\n",
+				ctxt->expected_cpos, rec->e_cpos);
+
+		if (ctxt->c_offset) {
+			verbosef("Adding collected extent rec p %d blk %"
+				PRIu64"l %d\n", ctxt->cpos, ctxt->blkno,
+				ctxt->c_offset);
+			ret = ocfs2_cached_inode_insert_extent(
+				ctxt->new_inode, ctxt->cpos, ctxt->blkno,
+				ctxt->c_offset, 0);
+			if (ret) {
+				com_err(whoami, ret,
+						"while adding extent rec\n");
+				ctxt->status = DEFRAG_ERROR;
+				return OCFS2_EXTENT_ABORT;
+			}
+			ctxt->num_extents++;
+			ctxt->c_offset = 0;
+		}
+		ctxt->expected_cpos = rec->e_cpos;
+		ctxt->clusters_left -= rec->e_cpos - ctxt->expected_cpos;
+		if (!ctxt->clusters_left)
+			return 0;
+		ret = free_clusters(ctxt);
+		if (ret) {
+			com_err(whoami, ret, "while freeing clusters\n");
+			ctxt->status = DEFRAG_ERROR;
+			return OCFS2_EXTENT_ABORT;
+		}
+	}
+
+	ctxt->expected_cpos += rec->e_leaf_clusters;
+
+	if (!ctxt->c_end) {
+		ret = alloc_clusters(ctxt);
+		if (ret) {
+			com_err(whoami, ret, "while allocating clusters\n");
+			ctxt->status = DEFRAG_ERROR;
+			return OCFS2_EXTENT_ABORT;
+		}
+
+	}
+
+	/* Reuse clusters if they are bigger than allocated cluster */
+	if (ctxt->c_end <= rec->e_leaf_clusters) {
+		if (ctxt->c_offset) {
+			verbosef("Adding collected extent rec p %d blk %"
+				PRIu64"l %d\n", ctxt->cpos, ctxt->blkno,
+				ctxt->c_offset);
+			ret = ocfs2_cached_inode_insert_extent(
+				ctxt->new_inode, ctxt->cpos, ctxt->blkno,
+				ctxt->c_offset, 0);
+			if (ret) {
+				com_err(whoami, ret,
+						"while adding extent rec\n");
+				ctxt->status = DEFRAG_ERROR;
+				return OCFS2_EXTENT_ABORT;
+			}
+			ctxt->num_extents++;
+		}
+		verbosef("Adding old extent rec %d b %"PRIu64" l %d\n",
+			rec->e_cpos, (uint64_t)rec->e_blkno,
+			rec->e_leaf_clusters);
+		ret = ocfs2_cached_inode_insert_extent(ctxt->new_inode,
+			rec->e_cpos, rec->e_blkno, rec->e_leaf_clusters, 0);
+		if (ret) {
+			com_err(whoami, ret, "while adding extent\n");
+			ctxt->status = DEFRAG_ERROR;
+			return OCFS2_EXTENT_ABORT;
+		}
+		ctxt->num_extents++;
+		ctxt->sc.rec[ctxt->sc.num_recs].start = rec->e_blkno;
+		ctxt->sc.rec[ctxt->sc.num_recs++].len = rec->e_leaf_clusters;
+		if (ctxt->sc.num_recs > MAX_RECS) {
+			verbosef("Exceeded MAX_RECS limit %d\n",
+					ctxt->sc.num_recs);
+			ctxt->status = DEFRAG_ERROR;
+			return OCFS2_EXTENT_ABORT;
+		}
+		ctxt->cpos = rec->e_cpos + rec->e_leaf_clusters;
+		ctxt->clusters_left -= rec->e_leaf_clusters;
+		ret = free_clusters(ctxt);
+		if (ret) {
+			com_err(whoami, ret, "while freeing clusters\n");
+			ctxt->status = DEFRAG_ERROR;
+			return OCFS2_EXTENT_ABORT;
+		}
+		return 0;
+	}
+
+	/* Remaining clusters short. Fill allocated clusters and write */
+	nc = 0;
+	left = rec->e_leaf_clusters;
+	while (ctxt->c_end < ctxt->c_offset + left) {
+		/* Copy enough to fill the rest of the block */
+		nc = ctxt->c_end - ctxt->c_offset;
+		if (nc) {
+			ret = copy_clusters(ctxt->dst->dst_fs, rec->e_blkno,
+				ctxt->blkno + num_blocks*ctxt->c_offset, nc);
+			if (ret) {
+				com_err(whoami, ret, "while reading %d blocks"
+					" from %"PRIu64"\n", num_blocks,
+					(uint64_t)rec->e_blkno);
+				ctxt->status = DEFRAG_ERROR;
+				return OCFS2_EXTENT_ABORT;
+			}
+			ctxt->clusters_left -= nc;
+			ctxt->c_offset += nc;
+			left -= nc;
+		}
+
+		verbosef("Inserting extent b %"PRIu64" l %d\n",
+			ctxt->blkno, ctxt->c_end);
+
+		/*Block is full. Insert an extent_rec*/
+		ret = ocfs2_cached_inode_insert_extent(ctxt->new_inode,
+				ctxt->cpos, ctxt->blkno, ctxt->c_end, 0);
+		if (ret) {
+			com_err(whoami, ret, "while inserting extent_rec\n");
+			ctxt->status = DEFRAG_ERROR;
+			return OCFS2_EXTENT_ABORT;
+		}
+		ctxt->cpos += ctxt->c_end;
+		ctxt->num_extents++;
+
+		/* Get a new extent allocation */
+		if (ctxt->clusters_left && (ctxt->c_offset == ctxt->c_end)) {
+			ret = alloc_clusters(ctxt);
+			if (ret) {
+				com_err(whoami, ret, "while allocating "
+						"clusters\n");
+				ctxt->status = DEFRAG_ERROR;
+				return OCFS2_EXTENT_ABORT;
+			}
+		}
+	}
+
+	if (left) {
+		ret = copy_clusters(ctxt->dst->dst_fs,
+			rec->e_blkno + nc*num_blocks,
+			ctxt->blkno + ctxt->c_offset * num_blocks, left);
+		ctxt->c_offset += left;
+		ctxt->clusters_left -= left;
+		if (ret) {
+			com_err(whoami, ret, "while copying %d clusters from"
+				" %"PRIu64"\n", nc, (uint64_t)rec->e_blkno);
+			ctxt->status = DEFRAG_ERROR;
+			return OCFS2_EXTENT_ABORT;
+		}
+	}
+	return 0;
+}
+
+errcode_t defrag_file(struct defrag_state *dst, struct ocfs2_dinode *di)
+{
+	struct defrag_file_context fc;
+	unsigned int best_exts;
+	uint64_t tmpblkno;
+	errcode_t ret;
+	struct ocfs2_extent_list *el = &di->id2.i_list;
+	int offset = 0;
+
+	/* XXX: Ignore refcounted file for now */
+	if (di->i_dyn_features &
+			(OCFS2_INLINE_DATA_FL | OCFS2_HAS_REFCOUNT_FL))
+		return 0;
+
+	/*Initialize fc */
+	memset(&fc, 0, sizeof(struct defrag_file_context));
+	fc.dst = dst;
+
+	/* What is the best number of extents for this file */
+	best_exts = (ocfs2_bytes_to_clusters(dst->dst_fs,
+			di->i_size))/dst->max_esize + 1;
+	ret = calc_num_extents(dst->dst_fs, el, &fc.orig_extents);
+	if (best_exts >= fc.orig_extents)
+		return 0;
+
+	ret = ocfs2_new_inode(dst->dst_fs, &tmpblkno, di->i_mode);
+	ret = ocfs2_read_cached_inode(dst->dst_fs, tmpblkno, &fc.new_inode);
+	fc.new_inode->ci_inode->i_size = di->i_size;
+
+	verbosef("Defragging ino: %"PRIu64" size %"PRIu64" best %d num %d to"
+			"%"PRIu64"\n", (uint64_t)di->i_blkno,
+			(uint64_t)di->i_size, best_exts, fc.orig_extents,
+			tmpblkno);
+
+	fc.clusters_left = ocfs2_bytes_to_clusters(dst->dst_fs,
+				di->i_size) + 1;
+	ret = ocfs2_extent_iterate_inode(dst->dst_fs, di,
+			OCFS2_EXTENT_FLAG_DATA_ONLY, NULL,
+			copy_file_data, &fc);
+	switch(fc.status) {
+		case DEFRAG_EXTENTS_EXCEEDED:
+			fprintf(stderr, "Number of extents for inode %"PRIu64
+					" exceeded. Aborting\n",
+					(uint64_t)di->i_blkno);
+			fc.c_offset = 0;
+			ret = free_clusters(&fc);
+			ret = ocfs2_truncate(dst->dst_fs, tmpblkno, 0);
+			goto out;
+		case DEFRAG_ERROR:
+			fprintf(stderr, "Error while defraging file %"PRIu64
+					"\n", (uint64_t) di->i_blkno);
+			fc.c_offset = 0;
+			ret = free_clusters(&fc);
+			ret = ocfs2_truncate(dst->dst_fs, tmpblkno, 0);
+			goto out;
+		default:
+			break;
+	}
+
+	/* Insert data representing remaining extent_rec */
+	if (fc.c_offset) {
+		ret = ocfs2_cached_inode_insert_extent(fc.new_inode, fc.cpos,
+			fc.blkno, fc.c_offset, 0);
+		if (ret) {
+			com_err(whoami, ret, "while writing clusters\n");
+			ret = ocfs2_truncate(dst->dst_fs, tmpblkno, 0);
+			goto out;
+		}
+		free_clusters(&fc);
+	}
+
+	/* Truncate the old file */
+	ocfs2_truncate_full(dst->dst_fs, di->i_blkno, 0, free_extents,
+			(void *)&fc.sc);
+	di->i_size = fc.new_inode->ci_inode->i_size;
+
+	/* Set the inode data */
+	el = &di->id2.i_list;
+	offset = offsetof(struct ocfs2_dinode, id2.i_list);
+	memcpy(el, &fc.new_inode->ci_inode->id2.i_list,
+			dst->dst_fs->fs_blocksize - offset);
+	ret = ocfs2_write_inode(dst->dst_fs, di->i_blkno, (char *)di);
+
+out:
+	/* Cleaning up temporary stuff */
+	ocfs2_free_cached_inode(dst->dst_fs, fc.new_inode);
+	ocfs2_delete_inode(dst->dst_fs, tmpblkno);
+	return ret;
+}
+
+
+errcode_t defrag_files_and_dirs(struct defrag_state *dst)
+{
+	ocfs2_inode_scan *scan;
+	ocfs2_filesys *fs = dst->dst_fs;
+	uint64_t blkno;
+	errcode_t ret = 0;
+	char *buf = NULL;
+	struct ocfs2_dinode *di;
+	int bsbits, cbits;
+
+	ret = ocfs2_malloc_block(fs->fs_io, &buf);
+	ret = ocfs2_open_inode_scan(fs, &scan);
+	if (ret)
+		goto out;
+
+	di = (struct ocfs2_dinode *)buf;
+	bsbits = OCFS2_RAW_SB(dst->dst_fs->fs_super)->s_blocksize_bits;
+	cbits = OCFS2_RAW_SB(dst->dst_fs->fs_super)->s_clustersize_bits;
+	/* XXX: Hard-coded for now. To be extracted from gd*/
+	dst->max_esize = 8192;
+
+	for (;;) {
+		ret = ocfs2_get_next_inode(scan, &blkno, buf);
+
+		if (blkno == 0)
+			break;
+
+		if (memcmp(di->i_signature, OCFS2_INODE_SIGNATURE,
+					strlen(OCFS2_INODE_SIGNATURE)))
+			continue;
+
+		ocfs2_swap_inode_to_cpu(fs, di);
+
+		if (di->i_fs_generation != dst->fs_generation)
+			continue;
+
+		if (!(di->i_flags & OCFS2_VALID_FL) ||
+				(di->i_flags & OCFS2_SYSTEM_FL))
+			continue;
+
+		dst->num_inodes++;
+
+		verbosef("Defragging inode %"PRIu64"\n", (uint64_t)di->i_blkno);
+
+		if (S_ISREG(di->i_mode))
+			ret = defrag_file(dst, di);
+
+		if (ret)
+			break;
+	}
+out:
+	if (scan)
+		ocfs2_close_inode_scan(scan);
+	ocfs2_free(&buf);
+	return ret;
+}
+
diff --git a/defrag.ocfs2/include/defrag.h b/defrag.ocfs2/include/defrag.h
index 536b6b3..264efa0 100644
--- a/defrag.ocfs2/include/defrag.h
+++ b/defrag.ocfs2/include/defrag.h
@@ -41,5 +41,8 @@ extern int verbose;
 		printf("%s:%d | " fmt, __FUNCTION__, __LINE__, args);\
 } while (0)

+errcode_t defrag_files_and_dirs(struct defrag_state *dst);
+errcode_t defrag_file(struct defrag_state *dst, struct ocfs2_dinode *);
+
 #endif /* __OCFS2_DEFRAG_H__ */

-- 
1.7.1


-- 
Goldwyn



More information about the Ocfs2-tools-devel mailing list