[Ocfs2-tools-devel] [PATCH 2/4] defrag.ocfs2: Pass 1: Defrag individual files and directories

Goldwyn Rodrigues rgoldwyn at gmail.com
Tue May 11 21:02:34 PDT 2010


Defragging file -
Allocate the maximum possible extent and copy the file into the
newly allocated extent. If the allocated extent is smaller than
the extent read from the file, write the collected data, and
re-use the file extent. At any point, if the number of extents
in the new inode exceeds the number in the existing inode, abort.

Defragging directory -
Allocate an extent, and copy dirents to the new extent, skipping
holes and empty dirents. For each dirent, the dirent length
is recalculated to optimize on space.

TODO: Find the optimum extent size, perhaps by reading a
group_desc

Signed-off-by: Goldwyn Rodrigues <rgoldwyn at suse.de>
---
 Makefile                      |    2 +-
 defrag.ocfs2/Makefile         |   36 +++
 defrag.ocfs2/defrag.c         |  401 +++++++++++++++++++++++++++++++++
 defrag.ocfs2/dir.c            |  262 +++++++++++++++++++++
 defrag.ocfs2/file.c           |  499 +++++++++++++++++++++++++++++++++++++++++
 defrag.ocfs2/include/defrag.h |   48 ++++
 6 files changed, 1247 insertions(+), 1 deletions(-)
 create mode 100644 defrag.ocfs2/Makefile
 create mode 100644 defrag.ocfs2/defrag.c
 create mode 100644 defrag.ocfs2/dir.c
 create mode 100644 defrag.ocfs2/file.c
 create mode 100644 defrag.ocfs2/include/defrag.h

diff --git a/Makefile b/Makefile
index 88106fb..ecb56dc 100644
--- a/Makefile
+++ b/Makefile
@@ -20,7 +20,7 @@ CHKCONFIG_DEP = chkconfig
 COMPILE_PY = 1
 endif

-SUBDIRS = include libtools-internal libo2dlm libo2cb libocfs2
fsck.ocfs2 mkfs.ocfs2 mounted.ocfs2 tunefs.ocfs2 debugfs.ocfs2
o2cb_ctl ocfs2_hb_ctl mount.ocfs2 ocfs2_controld o2image listuuid
sizetest extras fswreck patches
+SUBDIRS = include libtools-internal libo2dlm libo2cb libocfs2
fsck.ocfs2 mkfs.ocfs2 mounted.ocfs2 tunefs.ocfs2 debugfs.ocfs2
o2cb_ctl ocfs2_hb_ctl mount.ocfs2 ocfs2_controld o2image listuuid
sizetest extras fswreck patches defrag.ocfs2

 ifdef BUILD_OCFS2CONSOLE
 SUBDIRS += ocfs2console
diff --git a/defrag.ocfs2/Makefile b/defrag.ocfs2/Makefile
new file mode 100644
index 0000000..685624e
--- /dev/null
+++ b/defrag.ocfs2/Makefile
@@ -0,0 +1,36 @@
+TOPDIR = ..
+
+include $(TOPDIR)/Preamble.make
+
+sbindir = $(root_sbindir)
+SBIN_PROGRAMS = defrag.ocfs2
+
+DEFINES += -DVERSION=\"$(VERSION)\"
+
+INCLUDES = -I$(TOPDIR)/include -Iinclude -I$(TOPDIR)/libocfs2
+LIBOCFS2_LIBS = -L$(TOPDIR)/libocfs2 -locfs2
+LIBOCFS2_DEPS = $(TOPDIR)/libocfs2/libocfs2.a
+LIBO2DLM_LIBS = -L$(TOPDIR)/libo2dlm -lo2dlm $(DL_LIBS)
+LIBO2DLM_DEPS = $(TOPDIR)/libo2dlm/libo2dlm.a
+LIBO2CB_LIBS = -L$(TOPDIR)/libo2cb -lo2cb
+LIBO2CB_DEPS = $(TOPDIR)/libo2cb/libo2cb.a
+
+CFLAGS += -g
+
+CFILES =	defrag.c file.c dir.c
+
+HFILES = 	include/defrag.h
+
+
+OBJS = $(subst .c,.o,$(CFILES))
+
+DIST_FILES = $(CFILES) $(HFILES) $(addsuffix .in,$(MANS))
+DIST_RULES = dist-subdircreate
+
+dist-subdircreate:
+	$(TOPDIR)/mkinstalldirs $(DIST_DIR)/include
+
+defrag.ocfs2: $(OBJS) $(LIBOCFS2_DEPS) $(LIBO2DLM_DEPS) $(LIBO2CB_DEPS)
+	$(LINK) $(LIBOCFS2_LIBS) $(LIBO2DLM_LIBS) $(LIBO2CB_LIBS) $(COM_ERR_LIBS)
+
+include $(TOPDIR)/Postamble.make
diff --git a/defrag.ocfs2/defrag.c b/defrag.ocfs2/defrag.c
new file mode 100644
index 0000000..65c1a13
--- /dev/null
+++ b/defrag.ocfs2/defrag.c
@@ -0,0 +1,401 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+   * vim: noexpandtab sw=8 ts=8 sts=0:
+   *
+   * defrag.c
+   *
+   * Copyright (C) 2010 Oracle. All rights reserved.
+
+   *
+   * This program is free software; you can redistribute it and/or
+   * modify it under the terms of the GNU General Public
+   * License version 2 as published by the Free Software Foundation.
+   *
+   * This program is distributed in the hope that it will be useful,
+   * but WITHOUT ANY WARRANTY; without even the implied warranty of
+   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   * General Public License for more details.
+   */
+
+#include <getopt.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <signal.h>
+
+#include "defrag.h"
+
+
+int verbose = 0;
+char *whoami = "defrag.ocfs2";
+static struct defrag_state dfrag;
+static int cluster_locked = 0;
+
+extern int opterr, optind;
+extern char *optarg;
+
+/*
+ * Handler for SIGTERM and SIGINT: releases the cluster lock if it is held,
+ * shuts down the DLM, closes the filesystem and exits with status 1.
+ *
+ * main() installs this handler before the filesystem is opened, so
+ * dfrag.dst_fs can still be NULL here and is checked before any
+ * dereference.
+ */
+static void handle_signal(int sig)
+{
+	ocfs2_filesys *fs = dfrag.dst_fs;
+
+	switch (sig) {
+		case SIGTERM:
+		case SIGINT:
+			printf("\nProcess Interrupted.\n");
+
+			if (fs) {
+				if (cluster_locked && fs->fs_dlm_ctxt) {
+					ocfs2_release_cluster(fs);
+					cluster_locked = 0;
+				}
+
+				if (fs->fs_dlm_ctxt)
+					ocfs2_shutdown_dlm(fs, whoami);
+
+				ocfs2_close(fs);
+			}
+
+			exit(1);
+	}
+
+	return ;
+}
+
+/*
+ * Apply 'how' (SIG_BLOCK to block, SIG_UNBLOCK to unblock) to every
+ * signal except SIGTRAP and SIGSEGV.
+ */
+static void block_signals(int how)
+{
+	sigset_t mask;
+
+	sigfillset(&mask);
+
+	/* These two are left out of the set so they are never blocked here */
+	sigdelset(&mask, SIGTRAP);
+	sigdelset(&mask, SIGSEGV);
+
+	sigprocmask(how, &mask, NULL);
+}
+
+
+/*
+ * Sanity-check the superblock of the opened filesystem and remember its
+ * generation in dst->fs_generation.
+ *
+ * Returns OCFS2_ET_CORRUPT_SUPERBLOCK when s_max_slots is 0, else 0.
+ */
+static errcode_t check_superblock(struct defrag_state *dst)
+{
+	struct ocfs2_dinode *super_di = dst->dst_fs->fs_super;
+	errcode_t err = 0;
+
+	if (OCFS2_RAW_SB(super_di)->s_max_slots == 0) {
+		printf("The superblock max_slots field is set to 0.\n");
+		err = OCFS2_ET_CORRUPT_SUPERBLOCK;
+	}
+
+	/* The generation is recorded even when the check above failed */
+	dst->fs_generation = super_di->i_fs_generation;
+
+	return err;
+}
+
+/*
+ * Convert a duration in seconds to the largest unit (seconds, minutes,
+ * hours, days) in which it is still non-zero.  The integer value is
+ * stored in *scaled and a static unit name in *units.
+ */
+static void scale_time(time_t secs, unsigned *scaled, char **units)
+{
+	time_t value = secs;
+	char *name = "seconds";
+
+	if (value >= 60) {
+		value /= 60;
+		name = "minutes";
+
+		if (value >= 60) {
+			value /= 60;
+			name = "hours";
+
+			if (value >= 24) {
+				value /= 24;
+				name = "days";
+			}
+		}
+	}
+
+	*units = name;
+	*scaled = value;
+}
+
+/*
+ * Decide from the superblock whether the filesystem should be checked
+ * before it is defragmented.
+ *
+ * Returns 1 when neither the check interval has elapsed nor the mount
+ * count is non-zero; otherwise prints the reason to stderr and returns 0.
+ * 'filename' is currently unused.
+ *
+ * The mount-count message has no trailing newline in this function; the
+ * line is left open for the caller to finish.
+ */
+static int fs_clean(struct ocfs2_super_block *sb, char *filename)
+{
+	time_t now = time(NULL);
+	time_t next = sb->s_lastcheck + sb->s_checkinterval;
+
+	if (sb->s_checkinterval > 0 && now >= next) {
+		unsigned scaled_time;
+		char *scaled_units;
+
+		scale_time(now - sb->s_lastcheck, &scaled_time, &scaled_units);
+		/* Adjacent literals are concatenated: keep the separating space */
+		fprintf(stderr, "Filesystem has gone %u %s without "
+				"being checked\n", scaled_time, scaled_units);
+		return 0;
+	}
+
+	if (sb->s_mnt_count > 0) {
+		fprintf(stderr, "Filesystem mounted for %u mounts "
+				"without fsck. ", sb->s_mnt_count);
+		return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Print the volume label followed by a newline.  Non-printable bytes are
+ * shown as '.', and an empty label is shown as "<NONE>".
+ */
+static void print_label(struct defrag_state *dst)
+{
+	struct ocfs2_super_block *sb = OCFS2_RAW_SB(dst->dst_fs->fs_super);
+	unsigned char *text = sb->s_label;
+	size_t n = 0;
+
+	/* Stop at the first NUL or at the end of the fixed-size field */
+	while (n < sizeof(sb->s_label) && text[n]) {
+		printf("%c", isprint(text[n]) ? text[n] : '.');
+		n++;
+	}
+
+	if (n == 0)
+		printf("<NONE>");
+
+	printf("\n");
+}
+
+/*
+ * Print every byte of the volume UUID as two hex digits followed by a
+ * space, then a newline.
+ */
+static void print_uuid(struct defrag_state *dst)
+{
+	struct ocfs2_super_block *sb = OCFS2_RAW_SB(dst->dst_fs->fs_super);
+	unsigned char *byte = sb->s_uuid;
+	unsigned char *end = byte + sizeof(sb->s_uuid);
+
+	while (byte < end) {
+		printf("%02x ", *byte);
+		byte++;
+	}
+
+	printf("\n");
+}
+
+/*
+ * Print the program name and version to stderr, and an instability
+ * warning to stdout.
+ */
+static void print_version(void)
+{
+	fprintf(stderr, "%s %s\n", whoami, VERSION);
+	fputs("WARNING: Unstable version.\n", stdout);
+}
+
+/*
+ * Print the usage summary to stderr.  Every option accepted by the
+ * getopt() string in main() ("FvVh") is listed, including -F.
+ */
+static void print_help(void)
+{
+	fprintf(stderr, "Usage: %s [-FhvV] device\n", whoami);
+	fprintf(stderr, "Options:\n");
+	fprintf(stderr, "-F\t\t Skip cluster stack setup and locking\n");
+	fprintf(stderr, "-h\t\t Print this help\n");
+	fprintf(stderr, "-v\t\t Verbose output\n");
+	fprintf(stderr, "-V\t\t Print version information\n");
+}
+
+/*
+ * Open 'filename' as an OCFS2 filesystem into dst->dst_fs and run
+ * check_superblock() on it.
+ *
+ * blkno and blksize are passed straight to ocfs2_open().
+ * Returns 0 on success or the first error encountered; a message is
+ * printed for either failure.
+ */
+static errcode_t open_and_check(struct defrag_state *dst, char *filename,
+				int open_flags, uint64_t blkno,
+				uint64_t blksize)
+{
+	errcode_t ret;
+
+	ret = ocfs2_open(filename, open_flags, blkno, blksize, &dst->dst_fs);
+	if (ret) {
+		com_err(whoami, ret, "while opening \"%s\"", filename);
+		goto out;
+	}
+
+	ret = check_superblock(dst);
+	if (ret) {
+		/* Adjacent literals are concatenated: keep the separating space */
+		printf("Unrecoverable errors in the super block. "
+				"Cannot continue.\n");
+		goto out;
+	}
+
+out:
+	return ret;
+}
+
+
+/*
+ * Fetch the running cluster description and the one recorded on disk.
+ * Returns the error from either lookup, or 0.
+ *
+ * NOTE(review): the two descriptions are compared, but nothing is done
+ * with the result yet - both outcomes fall through to the cleanup.
+ */
+static errcode_t recover_cluster_info(struct defrag_state *dst)
+{
+	struct o2cb_cluster_desc on_disk = {NULL, NULL};
+	struct o2cb_cluster_desc live = {NULL, NULL};
+	errcode_t ret = 0;
+	int both_unset, both_equal;
+
+	ret = o2cb_running_cluster_desc(&live);
+	if (ret)
+		goto bail;
+
+	ret = ocfs2_fill_cluster_desc(dst->dst_fs, &on_disk);
+	if (ret)
+		goto bail;
+
+	/*
+	 * If the disk matches the running cluster, there is nothing we
+	 * can fix.
+	 */
+	both_unset = !live.c_stack && !on_disk.c_stack;
+	both_equal = live.c_stack && live.c_cluster &&
+		     on_disk.c_stack && on_disk.c_cluster &&
+		     !strcmp(live.c_stack, on_disk.c_stack) &&
+		     !strcmp(live.c_cluster, on_disk.c_cluster);
+	if (both_unset || both_equal)
+		goto bail;
+
+bail:
+	o2cb_free_cluster_desc(&live);
+	o2cb_free_cluster_desc(&on_disk);
+
+	return ret;
+}
+
+
+/*
+ * Entry point: parse options, refuse to run on a mounted device, open the
+ * filesystem, take the cluster lock (unless -F was given or the volume is
+ * local), print a summary and run defrag_files_and_dirs().
+ *
+ * Returns a bitwise OR of DEFRAG_* values; DEFRAG_OK (0) means success.
+ * Every failure path, including cluster/DLM setup, sets DEFRAG_ERROR so
+ * the exit status reflects it.  The cluster lock is released on the
+ * normal exit path as well as in the signal handler.
+ */
+int main(int argc, char **argv)
+{
+	char *filename;
+	int64_t blkno, blksize;
+	struct defrag_state *dst = &dfrag;
+	int c, open_flags = OCFS2_FLAG_RW | OCFS2_FLAG_STRICT_COMPAT_CHECK;
+	int retval = DEFRAG_OK;
+	errcode_t ret;
+	int mount_flags;
+
+	memset(dst, 0, sizeof(struct defrag_state));
+
+	/* These mean "autodetect" */
+	blksize = 0;
+	blkno = 0;
+
+	initialize_ocfs_error_table();
+	setlinebuf(stderr);
+	setlinebuf(stdout);
+
+	while ((c = getopt(argc, argv, "FvVh")) != EOF) {
+		switch (c) {
+			case 'F':
+				dst->skip_o2cb = 1;
+				break;
+			case 'v':
+				verbose = 1;
+				break;
+
+			case 'V':
+				print_version();
+				exit(DEFRAG_USAGE);
+				break;
+			case 'h':
+				print_help();
+				exit(0);
+				break;
+
+			default:
+				retval |= DEFRAG_USAGE;
+				print_help();
+				goto out;
+				break;
+		}
+	}
+
+	if (optind >= argc) {
+		fprintf(stderr, "Missing filename\n");
+		retval |= DEFRAG_USAGE;
+		print_help();
+		goto out;
+	}
+
+	filename = argv[optind];
+	print_version();
+
+	ret = ocfs2_check_if_mounted(filename, &mount_flags);
+	if (ret) {
+		com_err(whoami, ret, "while determining whether %s is mounted.",
+				filename);
+		retval |= DEFRAG_ERROR;
+		goto out;
+	}
+
+	if (mount_flags & (OCFS2_MF_MOUNTED | OCFS2_MF_BUSY)) {
+		fprintf(stdout, "\n Running defrag.ocfs2 on a "
+			"mounted filesystem will cause filesystem "
+			"damage.\n\n");
+		retval |= DEFRAG_ERROR;
+		goto out;
+	}
+
+	if (signal(SIGTERM, handle_signal) == SIG_ERR) {
+		com_err(whoami, 0, "Could not set SIGTERM");
+		exit(1);
+	}
+
+	if (signal(SIGINT, handle_signal) == SIG_ERR) {
+		com_err(whoami, 0, "Could not set SIGINT");
+		exit(1);
+	}
+
+	ret = open_and_check(dst, filename, open_flags, blkno, blksize);
+	if (ret) {
+		retval |= DEFRAG_ERROR;
+		goto out;
+	}
+
+	if (!dst->skip_o2cb && !ocfs2_mount_local(dst->dst_fs)) {
+		ret = o2cb_init();
+		if (ret) {
+			com_err(whoami, ret, "while initializing the cluster");
+			retval |= DEFRAG_ERROR;
+			goto close;
+		}
+
+		/* Signals stay blocked while DLM and lock state is changing */
+		block_signals(SIG_BLOCK);
+		ret = ocfs2_initialize_dlm(dst->dst_fs, whoami);
+		if (ret == O2CB_ET_INVALID_STACK_NAME) {
+			block_signals(SIG_UNBLOCK);
+			ret = recover_cluster_info(dst);
+			if (ret) {
+				com_err(whoami, ret,
+					"while recovering cluster information");
+				retval |= DEFRAG_ERROR;
+				goto close;
+			}
+			block_signals(SIG_BLOCK);
+			ret = ocfs2_initialize_dlm(dst->dst_fs, whoami);
+		}
+		if (ret) {
+			block_signals(SIG_UNBLOCK);
+			com_err(whoami, ret, "while initializing the DLM");
+			retval |= DEFRAG_ERROR;
+			goto close;
+		}
+
+		ret = ocfs2_lock_down_cluster(dst->dst_fs);
+		if (ret) {
+			block_signals(SIG_UNBLOCK);
+			com_err(whoami, ret, "while locking down the cluster");
+			retval |= DEFRAG_ERROR;
+			goto close;
+		}
+		cluster_locked = 1;
+		block_signals(SIG_UNBLOCK);
+	}
+
+	printf("  De-fragmenting OCFS2 filesystem in %s:\n", filename);
+	printf("  label:              ");
+	print_label(dst);
+	printf("  uuid:               ");
+	print_uuid(dst);
+	printf("  number of blocks:   %"PRIu64"\n", dst->dst_fs->fs_blocks);
+	printf("  bytes per block:    %u\n", dst->dst_fs->fs_blocksize);
+	printf("  number of clusters: %"PRIu32"\n", dst->dst_fs->fs_clusters);
+	printf("  bytes per cluster:  %u\n", dst->dst_fs->fs_clustersize);
+	printf("  max slots:          %u\n\n",
+			OCFS2_RAW_SB(dst->dst_fs->fs_super)->s_max_slots);
+	/* Block numbers are 64-bit: use PRIu64 as for fs_blocks above */
+	printf(" System dir block number - %"PRIu64"\n",
+			dst->dst_fs->fs_sysdir_blkno);
+	printf(" Root dir block number - %"PRIu64"\n",
+			dst->dst_fs->fs_root_blkno);
+	printf(" FS blocks - %"PRIu64"\n", dst->dst_fs->fs_blocks);
+
+	if (!fs_clean(OCFS2_RAW_SB(dst->dst_fs->fs_super), filename)) {
+		fprintf(stderr, "Please run fsck.ocfs2 before performing"
+				" defrag\n");
+		retval = DEFRAG_ERROR;
+		goto close;
+	}
+
+	ret = defrag_files_and_dirs(dst);
+	if (ret) {
+		fprintf(stderr, "Error while defraging individual files\n"
+				"Please execute fsck.ocfs2\n");
+		retval |= DEFRAG_ERROR;
+		goto close;
+	}
+close:
+	block_signals(SIG_BLOCK);
+	if (dst->dst_fs->fs_dlm_ctxt) {
+		/* Drop the cluster lock before the DLM goes away */
+		if (cluster_locked) {
+			ocfs2_release_cluster(dst->dst_fs);
+			cluster_locked = 0;
+		}
+		ocfs2_shutdown_dlm(dst->dst_fs, whoami);
+	}
+	block_signals(SIG_UNBLOCK);
+
+	ret = ocfs2_close(dst->dst_fs);
+	if (ret) {
+		com_err(whoami, ret, "while closing file \"%s\"", filename);
+		retval |= DEFRAG_ERROR;
+	}
+out:
+	return retval;
+}
diff --git a/defrag.ocfs2/dir.c b/defrag.ocfs2/dir.c
new file mode 100644
index 0000000..90f03df
--- /dev/null
+++ b/defrag.ocfs2/dir.c
@@ -0,0 +1,262 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+   * vim: noexpandtab sw=8 ts=8 sts=0:
+   *
+   * dir.c
+   *
+   * Copyright (C) 2010 Oracle. All rights reserved.
+   *
+   * This program is free software; you can redistribute it and/or
+   * modify it under the terms of the GNU General Public
+   * License version 2 as published by the Free Software Foundation.
+   *
+   * This program is distributed in the hope that it will be useful,
+   * but WITHOUT ANY WARRANTY; without even the implied warranty of
+   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   * General Public License for more details.
+   */
+#include "defrag.h"
+#include <inttypes.h>
+
+extern char *whoami;
+
+struct defrag_dir_context {
+	struct defrag_state *dst;	/* tool state; dst->dst_fs is the filesystem */
+	ocfs2_cached_inode *new_inode;	/* temporary inode that receives the new extents */
+	struct ocfs2_dinode *old_inode;	/* directory inode being defragmented */
+	uint64_t blkno;	/* first block of the current allocation */
+	uint64_t w_blkno;	/* next block to write inside the allocation */
+	int w_len;	/* length of the current allocation, in blocks */
+	char *w_buf;	/* contains one block of data */
+	int prev_offset;	/* offset in w_buf of the last dirent copied; -1 when w_buf is empty */
+	/* Maintained in clusters */
+	int clusters_left;	/* size of the most recent allocation request */
+	int cpos;	/* cluster offset in new_inode for the next extent insert */
+	uint64_t r_offset;	/* bytes of the old directory consumed so far */
+	uint64_t dirsize;	/* bytes written to the new directory so far */
+	int status;	/* compared against DEFRAG_ERROR by defrag_dir() */
+};
+
+
+/*
+ * Allocate clusters for the part of the old directory that has not been
+ * read yet (i_size - r_offset, rounded up to clusters).
+ *
+ * On success c->blkno and c->w_blkno point at the first allocated block
+ * and c->w_len holds the allocation length in blocks.  On failure those
+ * fields are left untouched and the error is returned.
+ * Returns OCFS2_ET_INVALID_ARGUMENT when nothing is left to allocate.
+ */
+static errcode_t alloc_clusters(struct defrag_dir_context *c)
+{
+	errcode_t ret;
+	unsigned int l = 0;
+
+	c->clusters_left = ocfs2_bytes_to_clusters(c->dst->dst_fs,
+			c->old_inode->i_size - c->r_offset);
+
+	if (c->clusters_left <= 0) {
+		verbosef("Invalid request: %d\n", c->clusters_left);
+		return OCFS2_ET_INVALID_ARGUMENT;
+	}
+
+	verbosef("Allocating %d\n", c->clusters_left);
+
+	ret = ocfs2_new_clusters(c->dst->dst_fs, 1, c->clusters_left,
+			&c->blkno, &l);
+	/* 'l' and c->blkno are only meaningful when the allocation worked */
+	if (ret)
+		return ret;
+
+	c->w_len = ocfs2_clusters_to_blocks(c->dst->dst_fs, l);
+	c->w_blkno = c->blkno;
+	verbosef("alloc'd %d cluster/%d blocks req %d at %"PRIu64"\n", l,
+			c->w_len, c->clusters_left, c->blkno);
+	return 0;
+}
+
+
+/*
+ * Extent-iteration callback: copy the live dirents of one extent of the
+ * old directory into the write buffer, flushing the buffer to the new
+ * allocation block by block.
+ *
+ * Dirents with inode == 0 are skipped.  Every copied dirent is shrunk to
+ * OCFS2_DIR_REC_LEN(name_len) once a successor is appended; the last
+ * dirent in the buffer always spans the rest of the block.
+ *
+ * 'private' is the struct defrag_dir_context.  Returns 0 to continue; on
+ * any failure sets c->status to DEFRAG_ERROR and returns
+ * OCFS2_EXTENT_ABORT, the same convention copy_file_data() uses.
+ */
+static int copy_dirents(ocfs2_filesys *fs,
+		struct ocfs2_extent_rec *rec,
+		int tree_depth, uint32_t ccount, uint64_t ref_blkno,
+		int ref_recno, void *private)
+{
+	struct ocfs2_dir_entry *dirent, *prev = NULL;
+	struct defrag_dir_context *c = (struct defrag_dir_context *)private;
+	int i, offset, end, n, len, bs = fs->fs_blocksize;
+	char *buf = NULL;
+	errcode_t ret;
+
+	ret = ocfs2_malloc_block(fs->fs_io, &buf);
+	if (ret) {
+		com_err(whoami, ret, "while allocating memory\n");
+		goto fail;
+	}
+
+	n = ocfs2_clusters_to_blocks(fs, rec->e_leaf_clusters);
+	verbosef("rec %"PRIu64" l %d\n", (uint64_t)rec->e_blkno,
+			rec->e_leaf_clusters);
+
+	/* Blocks of the extent that lie beyond i_size hold no dirents */
+	for (i=0; i < n && c->r_offset < c->old_inode->i_size; i++) {
+		ret = ocfs2_read_blocks(fs, rec->e_blkno + i, 1, buf);
+		if (ret) {
+			com_err(whoami, ret, "while reading block\n");
+			goto fail;
+		}
+
+		offset = 0;
+		end = bs;
+		while (offset < end) {
+			if (c->prev_offset >= 0)
+				prev = (struct ocfs2_dir_entry *)
+					(c->w_buf + c->prev_offset);
+
+			dirent = (struct ocfs2_dir_entry *)(buf + offset);
+			len = dirent->rec_len;
+
+			if (len <=0) {
+				verbosef("Short dirent %d\n", dirent->rec_len);
+				c->r_offset += end - offset;
+				break;
+			}
+
+			if (dirent->inode == 0)
+				goto next;
+
+			/* No room left in w_buf: flush it and start a new block */
+			if (prev && (len + c->prev_offset
+				+ OCFS2_DIR_REC_LEN(prev->name_len) > bs)) {
+				ret = io_write_block(fs->fs_io,
+					c->w_blkno, 1, c->w_buf);
+				if (ret) {
+					com_err(whoami, ret,
+						"while writing block\n");
+					goto fail;
+				}
+				verbosef("Writing b %"PRIu64"\n", c->w_blkno);
+				c->w_blkno += 1;
+				c->dirsize += bs;
+				/*
+				 * Valid blocks are blkno .. blkno + w_len - 1,
+				 * so the allocation is full as soon as
+				 * w_blkno reaches blkno + w_len.
+				 */
+				if (c->w_blkno >= c->blkno + c->w_len) {
+					int clen =
+					    ocfs2_blocks_to_clusters(fs, c->w_len);
+					verbosef("Inserting b %"PRIu64"\n", c->blkno);
+					ret = ocfs2_cached_inode_insert_extent(
+						c->new_inode, c->cpos,
+						c->blkno, clen, 0);
+					if (ret) {
+						com_err(whoami, ret, "while inerting extent\n");
+						goto fail;
+					}
+					c->cpos += clen;
+					ret = alloc_clusters(c);
+					if (ret) {
+						com_err(whoami, ret,
+							"while allocating clusters\n");
+						goto fail;
+					}
+				}
+				memset(c->w_buf, 0, bs);
+				c->prev_offset = -1;
+				prev = NULL;
+			}
+
+			verbosef("dirent %"PRIu64" %.*s\n",
+					(uint64_t)dirent->inode,
+					dirent->name_len, dirent->name);
+
+			if (c->prev_offset >= 0) {
+				/* Shrink the previous dirent and append after it */
+				prev->rec_len = OCFS2_DIR_REC_LEN(prev->name_len);
+				c->prev_offset += prev->rec_len;
+				memcpy((c->w_buf + c->prev_offset), dirent,
+					dirent->rec_len);
+			} else {
+				memcpy(c->w_buf, dirent, dirent->rec_len);
+				c->prev_offset = 0;
+			}
+			/* The newest dirent spans the rest of the block */
+			prev = (struct ocfs2_dir_entry *)(c->w_buf + c->prev_offset);
+			prev->rec_len = bs - c->prev_offset;
+next:
+			offset += len;
+			c->r_offset += len;
+
+			if (c->r_offset >= c->old_inode->i_size)
+				break;
+		}
+	}
+fail:
+	if (buf)
+		ocfs2_free(&buf);
+	if (ret) {
+		c->status = DEFRAG_ERROR;
+		return OCFS2_EXTENT_ABORT;
+	}
+	return 0;
+}
+
+
+/*
+ * Defragment one directory: copy its live dirents into freshly allocated
+ * clusters attached to a temporary inode, then move the temporary
+ * inode's extent list into 'di' and write it back.
+ *
+ * Inline-data and refcounted directories are skipped (returns 0).
+ * The temporary inode, its cached copy and the write buffer are released
+ * on every path, and only if they were actually created.
+ */
+errcode_t defrag_dir(struct defrag_state *dst, struct ocfs2_dinode *di)
+{
+	struct defrag_dir_context dc;
+	uint64_t tmpblkno = 0;
+	errcode_t ret;
+	int have_tmp = 0;	/* set once the temporary inode exists */
+	int offset = 0, bs = dst->dst_fs->fs_blocksize;
+
+	/* XXX: Ignore refcounted dir for now */
+	if (di->i_dyn_features & (OCFS2_INLINE_DATA_FL|OCFS2_HAS_REFCOUNT_FL))
+		return 0;
+
+	/*Initialize dc */
+	memset(&dc, 0, sizeof(struct defrag_dir_context));
+	dc.dst = dst;
+	dc.prev_offset = -1;
+	dc.old_inode = di;
+
+	ret = ocfs2_malloc_block(dst->dst_fs->fs_io, &dc.w_buf);
+	if (ret) {
+		com_err(whoami, ret, "while allocating memory\n");
+		goto out;
+	}
+	memset(dc.w_buf, 0, bs);
+
+	ret = ocfs2_new_inode(dst->dst_fs, &tmpblkno, di->i_mode);
+	if (ret) {
+		com_err(whoami, ret, "while creating inode\n");
+		goto out;
+	}
+	have_tmp = 1;
+
+	ret = ocfs2_read_cached_inode(dst->dst_fs, tmpblkno, &dc.new_inode);
+	if (ret) {
+		com_err(whoami, ret, "while reading cached inode\n");
+		goto out;
+	}
+	/* XXX Hackish - reversing what ocfs2_init_inode did to the cached
+	   inode */
+	dc.new_inode->ci_inode->i_dyn_features &= ~OCFS2_INLINE_DATA_FL;
+	ocfs2_dinode_new_extent_list(dst->dst_fs, dc.new_inode->ci_inode);
+
+	verbosef("Defragging dir %"PRIu64" size %"PRIu64" tmp %"PRIu64"\n",
+			(uint64_t)di->i_blkno, (uint64_t)di->i_size, tmpblkno);
+
+	dc.clusters_left = ocfs2_bytes_to_clusters(dst->dst_fs, di->i_size);
+	ret = alloc_clusters(&dc);
+	if (ret) {
+		com_err(whoami, ret, "while allocating clusters\n");
+		goto out;
+	}
+	ret = ocfs2_extent_iterate_inode(dst->dst_fs, di,
+			OCFS2_EXTENT_FLAG_DATA_ONLY, NULL,
+			copy_dirents, &dc);
+	if (ret) {
+		com_err(whoami, ret, "while iterating extents\n");
+		/* Keep 'ret'; the old directory has not been touched */
+		ocfs2_truncate(dst->dst_fs, tmpblkno, 0);
+		goto out;
+	}
+
+	if (dc.status == DEFRAG_ERROR) {
+		fprintf(stderr, "Error while defraging file %"PRIu64
+				"\n", (uint64_t) di->i_blkno);
+		ret = ocfs2_truncate(dst->dst_fs, tmpblkno, 0);
+		goto out;
+	}
+
+	/* Insert data representing remaining extent_rec */
+	if ((dc.prev_offset >= 0) || (dc.w_blkno > dc.blkno)) {
+		int clen = ocfs2_blocks_to_clusters(dst->dst_fs, dc.w_len);
+		if (dc.prev_offset >= 0) {
+			verbosef("Writing final b %"PRIu64"\n", dc.w_blkno);
+			ret = io_write_block(dst->dst_fs->fs_io, dc.w_blkno, 1,
+					dc.w_buf);
+			if (ret) {
+				com_err(whoami, ret, "while writing block");
+				goto out;
+			}
+			dc.dirsize += bs;
+		}
+
+		verbosef("Inserting rec %"PRIu64" l %d\n", dc.blkno, clen);
+		ret = ocfs2_cached_inode_insert_extent(dc.new_inode, dc.cpos,
+				dc.blkno, clen, 0);
+		if (ret) {
+			com_err(whoami, ret, "while writing clusters\n");
+			ret = ocfs2_truncate(dst->dst_fs, tmpblkno, 0);
+			goto out;
+		}
+	}
+
+	/* Truncating the new file to trim additional allocated clusters */
+	verbosef("Truncating to %"PRIu64"\n", dc.dirsize);
+	ocfs2_truncate(dst->dst_fs, dc.new_inode->ci_blkno, dc.dirsize);
+
+	/* Truncate the old file to free clusters held by them */
+	ocfs2_truncate(dst->dst_fs, di->i_blkno, 0);
+
+	/* Set the inode data */
+	offset = offsetof(struct ocfs2_dinode, id2.i_list);
+	memcpy(&di->id2.i_list, &dc.new_inode->ci_inode->id2.i_list,
+			bs - offset);
+	ret = ocfs2_write_inode(dst->dst_fs, di->i_blkno, (char *)di);
+	ocfs2_truncate(dst->dst_fs, di->i_blkno, dc.dirsize);
+
+out:
+	/* Cleaning up temporary stuff */
+	if (dc.new_inode)
+		ocfs2_free_cached_inode(dst->dst_fs, dc.new_inode);
+	if (have_tmp)
+		ocfs2_delete_inode(dst->dst_fs, tmpblkno);
+	if (dc.w_buf)
+		ocfs2_free(&dc.w_buf);
+	return ret;
+}
+
diff --git a/defrag.ocfs2/file.c b/defrag.ocfs2/file.c
new file mode 100644
index 0000000..8153272
--- /dev/null
+++ b/defrag.ocfs2/file.c
@@ -0,0 +1,499 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+   * vim: noexpandtab sw=8 ts=8 sts=0:
+   *
+   * file.c
+   *
+   * Copyright (C) 2010 Oracle. All rights reserved.
+
+   *
+   * This program is free software; you can redistribute it and/or
+   * modify it under the terms of the GNU General Public
+   * License version 2 as published by the Free Software Foundation.
+   *
+   * This program is distributed in the hope that it will be useful,
+   * but WITHOUT ANY WARRANTY; without even the implied warranty of
+   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   * General Public License for more details.
+   */
+
+#include "defrag.h"
+#include <inttypes.h>
+
+extern char *whoami;
+
+/*
+ * Count the leaf extent records reachable from extent list 'el' and
+ * store the total in *ne.
+ *
+ * For a non-leaf list every record's extent block is read and counted
+ * recursively.  A single block buffer is allocated on first use and
+ * reused for each child, then freed once on exit.
+ * Returns 0, or the first allocation/read error (*ne is then partial).
+ */
+static errcode_t calc_num_extents(ocfs2_filesys *fs,
+		struct ocfs2_extent_list *el,
+		uint32_t *ne)
+{
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_rec *rec;
+	errcode_t ret = 0;
+	char *buf = NULL;
+	int i;
+	uint32_t clusters;
+	uint32_t extents = 0;
+
+	*ne = 0;
+	for (i = 0; i < el->l_next_free_rec; ++i) {
+		rec = &(el->l_recs[i]);
+		clusters = ocfs2_rec_clusters(el->l_tree_depth, rec);
+
+		/*
+		 * In a unsuccessful insertion, we may shift a tree
+		 * add a new branch for it and do no insertion. So we
+		 * may meet a extent block which have
+		 * clusters == 0, this should only be happen
+		 * in the last extent rec. */
+		if (!clusters && i == el->l_next_free_rec - 1)
+			break;
+
+		extents = 1;
+
+		if (el->l_tree_depth) {
+			/* The child's contents are fully consumed by the
+			 * recursive call, so one buffer serves every record */
+			if (!buf) {
+				ret = ocfs2_malloc_block(fs->fs_io, &buf);
+				if (ret)
+					goto bail;
+			}
+
+			ret = ocfs2_read_extent_block(fs, rec->e_blkno, buf);
+			if (ret)
+				goto bail;
+
+			eb = (struct ocfs2_extent_block *)buf;
+
+			ret = calc_num_extents(fs, &(eb->h_list), &extents);
+			if (ret)
+				goto bail;
+
+		}
+		*ne = *ne + extents;
+	}
+
+bail:
+	if (buf)
+		ocfs2_free(&buf);
+	return ret;
+}
+
+#define MAX_RECS	4096	/* capacity of save_cluster.rec[] */
+
+struct save_cluster {	/* extents of the old file that were re-used and must not be freed */
+	struct {
+		uint64_t start;	/* e_blkno of the re-used extent */
+		uint32_t len;	/* e_leaf_clusters of the re-used extent */
+	} rec[MAX_RECS];
+	int num_recs;	/* number of entries of rec[] in use */
+};
+
+/*
+ * Callback passed to ocfs2_truncate_full() for the old inode: called
+ * with each extent (start block, length in clusters) being released.
+ *
+ * Extents recorded in the struct save_cluster passed as 'private' were
+ * re-used by the new inode and are kept; both start and length must
+ * match.  Everything else is handed to ocfs2_free_clusters().
+ */
+static errcode_t free_extents(ocfs2_filesys *fs, uint32_t len,
+		uint64_t start, void *private)
+{
+	struct save_cluster *sc = (struct save_cluster *)private;
+	int i;
+	for (i=0; i < sc->num_recs; i++) {
+		/* Comparison, not assignment: 'len' must stay intact for
+		 * the free below */
+		if ((start == sc->rec[i].start) && (len == sc->rec[i].len)) {
+			verbosef("Preserved %"PRIu64" l %d\n", sc->rec[i].start,
+					sc->rec[i].len);
+			return 0;
+		}
+	}
+	/* TODO: change call ocfs2_truncate_clusters to incorporate
+	   refcounting */
+	verbosef("Generic free: %"PRIu64" l %d\n", start, len);
+	return ocfs2_free_clusters(fs, len, start);
+}
+
+
+/*
+ * Copy 'n' clusters of data from block 'from' to block 'to', one cluster
+ * at a time.
+ *
+ * The per-iteration unit is the number of blocks in one cluster, computed
+ * with ocfs2_clusters_to_blocks() exactly as copy_file_data() does for
+ * its block offsets.
+ * Returns 0, or the first allocation/read/write error.
+ */
+static errcode_t copy_clusters(ocfs2_filesys *fs,
+		uint64_t from, uint64_t to, int n)
+{
+	char *buf = NULL;
+	int i, c = ocfs2_clusters_to_blocks(fs, 1);
+	errcode_t ret = ocfs2_malloc_blocks(fs->fs_io, c, &buf);
+	verbosef("Writing %d clusters to %"PRIu64" from %"PRIu64"\n", n,
+			to, from);
+	if (ret)
+		return ret;
+	for (i=0; i < n; i++) {
+		ret = ocfs2_read_blocks(fs, from + i*c, c, buf);
+		if (ret)
+			goto out;
+		ret = io_write_block(fs->fs_io, to + i*c, c, buf);
+		if (ret)
+			goto out;
+	}
+out:
+	ocfs2_free(&buf);
+	return ret;
+}
+
+
+struct defrag_file_context {
+	struct defrag_state *dst;	/* tool state; dst->dst_fs is the filesystem */
+	ocfs2_cached_inode *new_inode;	/* temporary inode that receives the new extents */
+	struct ocfs2_dinode *old_inode;	/* NOTE(review): never assigned in this file */
+	uint64_t blkno;	/* first block of the current allocation */
+	/* Maintained in clusters */
+	unsigned int c_offset;	/* clusters of the current allocation already filled */
+	unsigned int c_end;	/* size of the current allocation; 0 when none is held */
+	int clusters_left;	/* clusters still to be placed; used as the allocation request */
+	int cpos;	/* cluster offset in new_inode for the next collected extent */
+	int expected_cpos;	/* e_cpos the next record has if there is no hole */
+	unsigned int num_extents;	/* extents inserted into new_inode so far */
+	unsigned int orig_extents;	/* extent count of the original file */
+	int status;	/* DEFRAG_* result inspected by defrag_file() */
+	struct save_cluster sc;	/* old extents re-used as-is by new_inode */
+};
+
+/* Smallest allocation ocfs2_new_clusters() is asked to accept */
+#define MIN_REQUEST	1
+
+/*
+ * Ask for up to c->clusters_left clusters.  The start block goes to
+ * c->blkno, the number of clusters obtained to c->c_end, and c->c_offset
+ * is reset to 0.  Returns OCFS2_ET_INVALID_ARGUMENT when clusters_left
+ * is not positive, otherwise the result of ocfs2_new_clusters().
+ */
+static errcode_t alloc_clusters(struct defrag_file_context *c)
+{
+	ocfs2_filesys *fs = c->dst->dst_fs;
+	int wanted = c->clusters_left;
+	errcode_t err;
+
+	if (wanted <= 0) {
+		verbosef("Requested: %d\n", wanted);
+		return OCFS2_ET_INVALID_ARGUMENT;
+	}
+
+	err = ocfs2_new_clusters(fs, MIN_REQUEST, c->clusters_left,
+			&c->blkno, &c->c_end);
+	verbosef("alloc'd %d clusters req %d/%d at %"PRIu64"\n", c->c_end,
+			wanted, c->clusters_left, c->blkno);
+
+	/* Nothing of the new allocation has been filled yet */
+	c->c_offset = 0;
+
+	return err;
+}
+
+/*
+ * Give back the part of the current allocation that lies beyond
+ * c->c_offset, then mark the context as holding no allocation
+ * (c_offset = c_end = 0).  Returns the result of ocfs2_free_clusters(),
+ * or 0 when there was nothing to free.
+ */
+static errcode_t free_clusters(struct defrag_file_context *c)
+{
+	ocfs2_filesys *fs = c->dst->dst_fs;
+	int unused = c->c_end - c->c_offset;
+	errcode_t err = 0;
+
+	/* Discard any remaining clusters */
+	if (unused) {
+		uint64_t first = c->blkno +
+			ocfs2_clusters_to_blocks(fs, c->c_offset);
+
+		verbosef("Freeing %"PRIu64" l %d\n", first, unused);
+		err = ocfs2_free_clusters(fs, unused, first);
+	}
+
+	c->c_end = 0;
+	c->c_offset = 0;
+
+	return err;
+}
+
+/*
+ * Extent-iteration callback: place one extent record of the old file
+ * into the new inode.
+ *
+ * Three cases are handled:
+ *  - a hole before 'rec': the collected run is inserted, the unused tail
+ *    of the allocation is freed, and the next run starts at rec->e_cpos;
+ *  - the current allocation is no larger than 'rec': the old extent is
+ *    re-used as-is and remembered in c->sc so it is not freed later;
+ *  - otherwise the data is copied into the allocation, inserting an
+ *    extent and allocating again each time the allocation fills up.
+ *
+ * 'private' is the struct defrag_file_context.  Returns 0 to continue,
+ * or OCFS2_EXTENT_ABORT with c->status set to DEFRAG_ERROR or
+ * DEFRAG_EXTENTS_EXCEEDED.
+ */
+static int copy_file_data(ocfs2_filesys *fs,
+		struct ocfs2_extent_rec *rec,
+		int tree_depth, uint32_t ccount, uint64_t ref_blkno,
+		int ref_recno, void *private)
+{
+	errcode_t ret;
+	struct defrag_file_context *c = (struct defrag_file_context *)private;
+	int nc, left, b = ocfs2_clusters_to_blocks(fs, 1);
+	int copied = 0;	/* clusters of 'rec' already copied */
+
+	verbosef("nc %d offset %d/%d pos %"PRIu64" blkno %"PRIu64
+			" exts %d/%d\n",
+			rec->e_leaf_clusters, c->c_offset, c->c_end,
+			(uint64_t)c->cpos, (uint64_t)c->blkno,
+			c->num_extents, c->orig_extents);
+
+	/* If our cluster length is more than existing
+	   file num extents, abort or we'll make it worse! */
+	if (c->num_extents > c->orig_extents) {
+		verbosef("Extents exceeded orig %d now %d cpos %d\n",
+			c->orig_extents, c->num_extents, c->cpos);
+		c->status = DEFRAG_EXTENTS_EXCEEDED;
+		return OCFS2_EXTENT_ABORT;
+	}
+
+	/* Handle holes */
+	if (c->expected_cpos < rec->e_cpos) {
+		verbosef("Hole detected expected/rec cpos %d/%d\n",
+				c->expected_cpos, rec->e_cpos);
+
+		if (c->c_offset) {
+			verbosef("Adding collected extent rec p %d blk %"
+				PRIu64"l %d\n", c->cpos, c->blkno, c->c_offset);
+			ret = ocfs2_cached_inode_insert_extent(c->new_inode,
+					c->cpos, c->blkno, c->c_offset, 0);
+			if (ret) {
+				com_err(whoami, ret,
+						"while adding extent rec\n");
+				c->status = DEFRAG_ERROR;
+				return OCFS2_EXTENT_ABORT;
+			}
+			c->num_extents++;
+		}
+
+		/*
+		 * c_offset is still the length of the run just inserted,
+		 * so only the unused tail is freed here.  free_clusters()
+		 * resets c_offset and c_end itself.
+		 */
+		ret = free_clusters(c);
+		if (ret) {
+			com_err(whoami, ret, "while freeing clusters\n");
+			c->status = DEFRAG_ERROR;
+			return OCFS2_EXTENT_ABORT;
+		}
+
+		/* Account for the hole before expected_cpos is moved */
+		c->clusters_left -= rec->e_cpos - c->expected_cpos;
+		c->expected_cpos = rec->e_cpos;
+		/* The next collected run begins right after the hole */
+		c->cpos = rec->e_cpos;
+		if (!c->clusters_left)
+			return 0;
+	}
+
+	c->expected_cpos += rec->e_leaf_clusters;
+
+	if (!c->c_end) {
+		ret = alloc_clusters(c);
+		if (ret) {
+			com_err(whoami, ret, "while allocating clusters\n");
+			c->status = DEFRAG_ERROR;
+			return OCFS2_EXTENT_ABORT;
+		}
+
+	}
+
+	/* Reuse clusters if they are bigger than allocated cluster */
+	if (c->c_end <= rec->e_leaf_clusters) {
+		/* Check the table has room before anything is recorded */
+		if (c->sc.num_recs >= MAX_RECS) {
+			verbosef("Exceeded MAX_RECS limit %d\n",
+					c->sc.num_recs);
+			c->status = DEFRAG_ERROR;
+			return OCFS2_EXTENT_ABORT;
+		}
+		if (c->c_offset) {
+			verbosef("Adding collected extent rec p %d blk %"
+				PRIu64"l %d\n", c->cpos, c->blkno, c->c_offset);
+			ret = ocfs2_cached_inode_insert_extent(c->new_inode,
+					c->cpos, c->blkno, c->c_offset, 0);
+			if (ret) {
+				com_err(whoami, ret,
+						"while adding extent rec\n");
+				c->status = DEFRAG_ERROR;
+				return OCFS2_EXTENT_ABORT;
+			}
+			c->num_extents++;
+		}
+		verbosef("Adding old extent rec %d b %"PRIu64" l %d\n",
+			rec->e_cpos, (uint64_t)rec->e_blkno, rec->e_leaf_clusters);
+		ret = ocfs2_cached_inode_insert_extent(c->new_inode,
+			rec->e_cpos, rec->e_blkno, rec->e_leaf_clusters, 0);
+		if (ret) {
+			com_err(whoami, ret, "while adding extent\n");
+			c->status = DEFRAG_ERROR;
+			return OCFS2_EXTENT_ABORT;
+		}
+		c->num_extents++;
+		/* Remember the extent so free_extents() preserves it */
+		c->sc.rec[c->sc.num_recs].start = rec->e_blkno;
+		c->sc.rec[c->sc.num_recs++].len = rec->e_leaf_clusters;
+		c->cpos = rec->e_cpos + rec->e_leaf_clusters;
+		c->clusters_left -= rec->e_leaf_clusters;
+		ret = free_clusters(c);
+		if (ret) {
+			com_err(whoami, ret, "while freeing clusters\n");
+			c->status = DEFRAG_ERROR;
+			return OCFS2_EXTENT_ABORT;
+		}
+		return 0;
+	}
+
+	/* Remaining clusters short. Fill allocated clusters and write */
+	nc = 0;
+	left = rec->e_leaf_clusters;
+	while (c->c_end < c->c_offset + left) {
+		/* Copy enough to fill the rest of the block */
+		nc = c->c_end - c->c_offset;
+		if (nc) {
+			/* Source advances by what earlier passes copied */
+			ret = copy_clusters(c->dst->dst_fs,
+					rec->e_blkno + (uint64_t)copied * b,
+					c->blkno + b*c->c_offset, nc);
+			if (ret) {
+				com_err(whoami, ret, "while reading %d blocks"
+					" from %"PRIu64"\n", b,
+					(uint64_t)rec->e_blkno);
+				c->status = DEFRAG_ERROR;
+				return OCFS2_EXTENT_ABORT;
+			}
+			c->clusters_left -= nc;
+			c->c_offset += nc;
+			left -= nc;
+			copied += nc;
+		}
+
+		verbosef("Inserting extent b %"PRIu64" l %d\n",
+			c->blkno, c->c_end);
+
+		/*Block is full. Insert an extent_rec*/
+		ret = ocfs2_cached_inode_insert_extent(c->new_inode, c->cpos,
+			c->blkno, c->c_end, 0);
+		if (ret) {
+			com_err(whoami, ret, "while inserting extent_rec\n");
+			c->status = DEFRAG_ERROR;
+			return OCFS2_EXTENT_ABORT;
+		}
+		c->cpos += c->c_end;
+		c->num_extents++;
+
+		/* Get a new extent allocation */
+		if (c->clusters_left && (c->c_offset==c->c_end)) {
+			ret = alloc_clusters(c);
+			if (ret) {
+				com_err(whoami, ret, "while allocating "
+						"clusters\n");
+				c->status = DEFRAG_ERROR;
+				return OCFS2_EXTENT_ABORT;
+			}
+		}
+	}
+
+	if (left) {
+		ret = copy_clusters(c->dst->dst_fs,
+				rec->e_blkno + (uint64_t)copied * b,
+				c->blkno + c->c_offset*b, left);
+		if (ret) {
+			com_err(whoami, ret, "while copying %d clusters from %"
+				PRIu64"\n", left, (uint64_t)rec->e_blkno);
+			c->status = DEFRAG_ERROR;
+			return OCFS2_EXTENT_ABORT;
+		}
+		c->c_offset += left;
+		c->clusters_left -= left;
+	}
+	return 0;
+}
+
+/*
+ * Defragment one regular file: build a new extent list in a temporary
+ * inode via copy_file_data(), then truncate the old file (keeping the
+ * extents that were re-used) and move the new list into 'di'.
+ *
+ * Inline-data and refcounted files are skipped, as are files whose
+ * extent count is already at or below the estimate derived from
+ * dst->max_esize.  Returns 0 in those cases.
+ * Failures to count extents or to create/read the temporary inode are
+ * returned to the caller before anything is modified.
+ */
+errcode_t defrag_file(struct defrag_state *dst, struct ocfs2_dinode *di)
+{
+	struct defrag_file_context fc;
+	unsigned int best_exts;
+	uint64_t tmpblkno;
+	errcode_t ret;
+	struct ocfs2_extent_list *el = &di->id2.i_list;
+	int offset = 0;
+
+	/* XXX: Ignore refcounted file for now */
+	if (di->i_dyn_features & (OCFS2_INLINE_DATA_FL | OCFS2_HAS_REFCOUNT_FL))
+		return 0;
+
+	/*Initialize fc */
+	memset(&fc, 0, sizeof(struct defrag_file_context));
+	fc.dst = dst;
+
+	/* What is the best number of extents for this file */
+	best_exts = (ocfs2_bytes_to_clusters(dst->dst_fs,
+				di->i_size))/dst->max_esize + 1;
+	ret = calc_num_extents(dst->dst_fs, el, &fc.orig_extents);
+	if (ret) {
+		com_err(whoami, ret, "while counting extents\n");
+		return ret;
+	}
+	if (best_exts >= fc.orig_extents)
+		return 0;
+
+	ret = ocfs2_new_inode(dst->dst_fs, &tmpblkno, di->i_mode);
+	if (ret) {
+		com_err(whoami, ret, "while creating inode\n");
+		return ret;
+	}
+	ret = ocfs2_read_cached_inode(dst->dst_fs, tmpblkno, &fc.new_inode);
+	if (ret) {
+		com_err(whoami, ret, "while reading cached inode\n");
+		ocfs2_delete_inode(dst->dst_fs, tmpblkno);
+		return ret;
+	}
+	fc.new_inode->ci_inode->i_size = di->i_size;
+
+	verbosef("Defragging ino: %"PRIu64" size %"PRIu64" best %d num %d to %"
+			PRIu64"\n",
+			(uint64_t)di->i_blkno, (uint64_t)di->i_size,
+			best_exts, fc.orig_extents, tmpblkno);
+
+	fc.clusters_left = ocfs2_bytes_to_clusters(dst->dst_fs, di->i_size) + 1;
+	ret = ocfs2_extent_iterate_inode(dst->dst_fs, di,
+			OCFS2_EXTENT_FLAG_DATA_ONLY, NULL,
+			copy_file_data, &fc);
+	/* An iteration failure not flagged by the callback is an error too */
+	if (ret && fc.status == DEFRAG_OK) {
+		com_err(whoami, ret, "while iterating extents\n");
+		fc.status = DEFRAG_ERROR;
+	}
+	switch(fc.status) {
+		case DEFRAG_EXTENTS_EXCEEDED:
+			fprintf(stderr, "Number of extents for inode %"PRIu64
+					" exceeded. Aborting\n",
+					(uint64_t)di->i_blkno);
+			fc.c_offset = 0;
+			ret = free_clusters(&fc);
+			ret = ocfs2_truncate(dst->dst_fs, tmpblkno, 0);
+			goto out;
+		case DEFRAG_ERROR:
+			fprintf(stderr, "Error while defraging file %"PRIu64
+					"\n", (uint64_t) di->i_blkno);
+			fc.c_offset = 0;
+			ret = free_clusters(&fc);
+			ret = ocfs2_truncate(dst->dst_fs, tmpblkno, 0);
+			goto out;
+		default:
+			break;
+	}
+
+	/* Insert data representing remaining extent_rec */
+	if (fc.c_offset) {
+		ret = ocfs2_cached_inode_insert_extent(fc.new_inode, fc.cpos,
+			fc.blkno, fc.c_offset, 0);
+		if (ret) {
+			com_err(whoami, ret, "while writing clusters\n");
+			ret = ocfs2_truncate(dst->dst_fs, tmpblkno, 0);
+			goto out;
+		}
+	}
+	/* Also releases an allocation nothing was copied into; it is a
+	 * no-op when no allocation is held */
+	free_clusters(&fc);
+
+	/* Truncate the old file */
+	ocfs2_truncate_full(dst->dst_fs, di->i_blkno, 0, free_extents,
+			(void *)&fc.sc);
+	di->i_size = fc.new_inode->ci_inode->i_size;
+
+	/* Set the inode data */
+	el = &di->id2.i_list;
+	offset = offsetof(struct ocfs2_dinode, id2.i_list);
+	memcpy(el, &fc.new_inode->ci_inode->id2.i_list,
+			dst->dst_fs->fs_blocksize - offset);
+	ret = ocfs2_write_inode(dst->dst_fs, di->i_blkno, (char *)di);
+
+out:
+	/* Cleaning up temporary stuff */
+	ocfs2_free_cached_inode(dst->dst_fs, fc.new_inode);
+	ocfs2_delete_inode(dst->dst_fs, tmpblkno);
+	return ret;
+}
+
+
+/*
+ * Pass 1: scan every inode and defragment each valid, non-system regular
+ * file and directory that belongs to this filesystem generation.
+ *
+ * dst->num_inodes counts the inodes that passed the filters.
+ * Returns 0 when the scan completes, or the first error from the scan or
+ * from defrag_file()/defrag_dir(), which stops the pass.
+ */
+errcode_t defrag_files_and_dirs(struct defrag_state *dst)
+{
+	ocfs2_inode_scan *scan = NULL;
+	ocfs2_filesys *fs = dst->dst_fs;
+	uint64_t blkno = 0;
+	errcode_t ret = 0;
+	char *buf = NULL;
+	struct ocfs2_dinode *di;
+
+	ret = ocfs2_malloc_block(fs->fs_io, &buf);
+	if (ret) {
+		com_err(whoami, ret, "while allocating memory\n");
+		goto out;
+	}
+
+	ret = ocfs2_open_inode_scan(fs, &scan);
+	if (ret)
+		goto out;
+
+	di = (struct ocfs2_dinode *)buf;
+	/* XXX: Hard-coded for now. To be extracted from gd*/
+	dst->max_esize = 8192;
+
+	for (;;) {
+		ret = ocfs2_get_next_inode(scan, &blkno, buf);
+		if (ret) {
+			com_err(whoami, ret, "while reading next inode\n");
+			break;
+		}
+
+		/* A block number of 0 marks the end of the scan */
+		if (blkno == 0)
+			break;
+
+		if (memcmp(di->i_signature, OCFS2_INODE_SIGNATURE,
+					strlen(OCFS2_INODE_SIGNATURE)))
+			continue;
+
+		ocfs2_swap_inode_to_cpu(fs, di);
+
+		if (di->i_fs_generation != dst->fs_generation)
+			continue;
+
+		if (!(di->i_flags & OCFS2_VALID_FL) ||
+				(di->i_flags & OCFS2_SYSTEM_FL))
+			continue;
+
+		dst->num_inodes++;
+
+		if (S_ISREG(di->i_mode))
+			ret = defrag_file(dst, di);
+		else if (S_ISDIR(di->i_mode))
+			ret = defrag_dir(dst, di);
+
+		if (ret)
+			break;
+	}
+out:
+	if (scan)
+		ocfs2_close_inode_scan(scan);
+	if (buf)
+		ocfs2_free(&buf);
+	return ret;
+}
+
diff --git a/defrag.ocfs2/include/defrag.h b/defrag.ocfs2/include/defrag.h
new file mode 100644
index 0000000..0dabc86
--- /dev/null
+++ b/defrag.ocfs2/include/defrag.h
@@ -0,0 +1,48 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * defrag.h
+ *
+ * Copyright (C) 2010 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef __OCFS2_DEFRAG_H__
+#define __OCFS2_DEFRAG_H__
+
+#include "ocfs2/ocfs2.h"
+
+#define DEFRAG_OK			0
+#define DEFRAG_USAGE			1
+#define DEFRAG_ERROR			2
+#define DEFRAG_EXTENTS_EXCEEDED		3
+
+struct defrag_state {
+	ocfs2_filesys 	*dst_fs;	/* filesystem being defragmented, set by ocfs2_open() */
+	int max_esize;	/* divisor for the best-case extent estimate in defrag_file() */
+	int fs_generation;	/* i_fs_generation copied from the superblock inode */
+	int num_inodes;	/* inodes that passed the scan filters */
+	unsigned	skip_o2cb:1;	/* set by -F: skip o2cb/DLM setup and cluster lock */
+};
+
+extern int verbose;	/* non-zero enables verbosef() output; defined in defrag.c */
+#define verbosef(fmt, args...) do {					\
+	if (verbose)							\
+		printf("%s:%d | " fmt, __FUNCTION__, __LINE__, args);\
+} while (0)
+
+
+errcode_t defrag_files_and_dirs(struct defrag_state *dst);
+errcode_t defrag_file(struct defrag_state *dst, struct ocfs2_dinode *);
+errcode_t defrag_dir(struct defrag_state *dst, struct ocfs2_dinode *);
+
+#endif /* __OCFS2_DEFRAG_H__ */
+
-- 
1.6.4.2



More information about the Ocfs2-tools-devel mailing list