[Ocfs2-tools-devel] [PATCH 9/9] fsck.ocfs2: Implement Pass 1D resolution of multiply-claimed clusters.
Tao Ma
tao.ma at oracle.com
Fri Jul 31 00:13:56 PDT 2009
Hi Joel,
I just have one qs.
It seems that you cp/paste the whole content of a file regardless the
number of multiply-claimed clusters. So why not just CoW the
multiply-claimed ones? If we can do this, it is also helpful for
libocfs2 support for reflink and I think it is worthwhile.
Regards,
Tao
Joel Becker wrote:
> Inodes that share multiply-claimed clusters can be duplicated or cloned.
> By copying the inode data off to other clusters, this removes the
> multiple claims on the original clusters. We then remove the cloned
> inode, which points to the originally shared clusters. We use
> ocfs2_truncate_full() to make sure we don't free our multiply-claimed
> clusters until they're really unused.
>
> We also implement deleting inodes that share multiply claimed clusters,
> because it can use the same ocfs2_truncate_full() code. Here we also
> mark the inode as unused. Pass 2 will let the user remove the directory
> entry (behavior copied from e2fsck).
>
> Signed-off-by: Joel Becker <joel.becker at oracle.com>
> ---
> fsck.ocfs2/pass1b.c | 310 +++++++++++++++++++++++++++++++++++++++++++++++++--
> 1 files changed, 300 insertions(+), 10 deletions(-)
>
> diff --git a/fsck.ocfs2/pass1b.c b/fsck.ocfs2/pass1b.c
> index f45f0f9..a078144 100644
> --- a/fsck.ocfs2/pass1b.c
> +++ b/fsck.ocfs2/pass1b.c
> @@ -43,7 +43,18 @@
> *
> * Pass 1D does the actual fixing. Each inode with duplicate clusters can
> * cloned to an entirely new file or deleted. Regardless of the choice,
> - * and inode that is fixed no longer has duplicate clusters.
> + * an inode that is fixed no longer has duplicate clusters. Cloning is
> + * done by creating a new inode and copying the data to it. Then the
> + * extent trees are swapped between the original and clone inode. This
> + * leaves the original inode with a good extent tree. Finally, the clone
> + * inode is removed and its extent tree released. If deletion is chosen
> + * instead of cloning, the original inode is removed. Either way, we end
> + * up over-freeing the clusters in the main bitmap. At the end, we run
> + * the list of multi-claimed clusters again. If the cluster still has
> + * claimers, it is forced on in the bitmap. If it does not, it is forced
> + * clear in the bitmap. If we crash in the middle, we're still safe. A
> + * re-run of fsck will determine whether the over-freed clusters are
> + * actually in use.
> *
> * Once Pass1D is complete, the ost_duplicate_clusters bitmap can be
> * freed.
> @@ -1019,9 +1030,283 @@ static void print_chain_warning(void)
> chain_warning = 1;
> }
>
> +static errcode_t new_clone(ocfs2_filesys *fs, ocfs2_cached_inode *orig_ci,
> + ocfs2_cached_inode **clone_ci)
> +{
> + errcode_t ret, ret2;
> + uint64_t clone_blkno = 0;
> + uint64_t bytes = orig_ci->ci_inode->i_size;
> + uint32_t clusters = ocfs2_clusters_in_bytes(fs, bytes);
> +
> + ret = ocfs2_new_inode(fs, &clone_blkno, orig_ci->ci_inode->i_mode);
> + if (ret) {
> + com_err(whoami, ret, "while allocating a clone inode");
> + return ret;
> + }
> +
> + /*
> + * Let's get the clusters in the best way we can. We make sure
> + * i_size is updated so that ocfs2_file_write() is happy.
> + */
> + if (ocfs2_writes_unwritten_extents(OCFS2_RAW_SB(fs->fs_super)) &&
> + !(orig_ci->ci_inode->i_flags & OCFS2_SYSTEM_FL))
> + ret = ocfs2_allocate_unwritten_extents(fs, clone_blkno, 0,
> + bytes);
> + else {
> + ret = ocfs2_extend_allocation(fs, clone_blkno, clusters);
> + if (!ret)
> + ret = ocfs2_extend_file(fs, clone_blkno, bytes);
> + }
> + if (ret) {
> + com_err(whoami, ret,
> + "while allocating data clusters for a clone inode");
> + goto out;
> + }
> +
> + ret = ocfs2_read_cached_inode(fs, clone_blkno, clone_ci);
> + if (ret)
> + com_err(whoami, ret, "while reading temporary clone inode");
> +
> + /*
> + * It is so tempting to link the temporary clone inode into
> + * the orphan directory here. But we can't, because later in
> + * the clone process it will point to multiply-claimed clusters.
> + * Orphan cleanup would free them, which is even worse than
> + * leaving the temporary clone inode around.
> + */
> +
> +out:
> + if (ret && clone_blkno) {
> + ret2 = ocfs2_delete_inode(fs, clone_blkno);
> + if (ret2)
> + com_err(whoami, ret2,
> + "while removing temporary clone inode");
> + }
> + return ret;
> +}
> +
> +static errcode_t copy_clone(ocfs2_filesys *fs, ocfs2_cached_inode *orig_ci,
> + ocfs2_cached_inode *clone_ci)
> +{
> + char *buf;
> + errcode_t ret;
> + uint64_t offset = 0;
> + uint64_t filesize = orig_ci->ci_inode->i_size;
> + unsigned int iosize = 1024 * 1024; /* Let's read in 1MB hunks */
> + unsigned int got, wrote;
> +
> + ret = ocfs2_malloc_blocks(fs->fs_io, iosize / fs->fs_blocksize,
> + &buf);
> + if (ret) {
> + com_err(whoami, ret, "while allocating clone buffer");
> + return ret;
> + }
> +
> + while (offset < filesize) {
> + if ((filesize - offset) < iosize)
> + iosize = filesize - offset;
> + ret = ocfs2_file_read(orig_ci, buf, iosize, offset, &got);
> + if (ret) {
> + com_err(whoami, ret, "while reading inode to clone");
> + break;
> + }
> +
> + ret = ocfs2_file_write(clone_ci, buf, iosize, offset, &wrote);
> + if (ret) {
> + com_err(whoami, ret, "while writing clone data");
> + break;
> + }
> + assert(got == wrote);
> + offset += wrote;
> + }
> +
> + ocfs2_free(&buf);
> + return ret;
> +}
> +
> +static errcode_t swap_clone(ocfs2_filesys *fs, ocfs2_cached_inode *orig_ci,
> + ocfs2_cached_inode *clone_ci)
> +{
> + errcode_t ret;
> + struct ocfs2_extent_list *tmp_el = NULL;
> + struct ocfs2_extent_list *orig_el = &orig_ci->ci_inode->id2.i_list;
> + struct ocfs2_extent_list *clone_el = &clone_ci->ci_inode->id2.i_list;
> + int el_size = offsetof(struct ocfs2_extent_list, l_recs) +
> + sizeof(struct ocfs2_extent_rec) * orig_el->l_count;
> +
> + ret = ocfs2_malloc0(el_size, &tmp_el);
> + if (ret) {
> + com_err(whoami, ret,
> + "while allocating temporary memory to swap a "
> + "cloned inode");
> + goto out;
> + }
> +
> + memcpy(tmp_el, orig_el, el_size);
> + memcpy(orig_el, clone_el, el_size);
> + memcpy(clone_el, tmp_el, el_size);
> +
> + /*
> + * We write the cloned inode with the original extent list first.
> + * If we crash between writing the cloned inode and the original
> + * one, the cloned inode will appear to share the same extents
> + * as the original and the extents we just allocated to the clone
> + * will look unused to a subsequent fsck run. They'll be reusable
> + * for recovery.
> + */
> + ret = ocfs2_write_cached_inode(fs, clone_ci);
> + if (ret) {
> + com_err(whoami, ret,
> + "while writing out clone inode %"PRIu64,
> + clone_ci->ci_blkno);
> + goto out;
> + }
> +
> + ret = ocfs2_write_cached_inode(fs, orig_ci);
> + if (ret)
> + com_err(whoami, ret, "while writing out inode %"PRIu64,
> + orig_ci->ci_blkno);
> +
> +out:
> + if (tmp_el)
> + ocfs2_free(&tmp_el);
> + return ret;
> +}
> +
> +static int can_free(struct dup_context *dct, uint32_t cpos)
> +{
> + struct dup_cluster *dc;
> + int unhandled = 0;
> +
> + dc = dup_cluster_lookup(dct, cpos);
> + /* We don't call can_free unless it's in the dup bitmap */
> + assert(dc);
> +
> + /*
> + * See how many inodes still point to it. It can't be zero,
> + * because we're working on an inode that points to it RIGHT
> + * NOW.
> + */
> + for_each_owner(dct, dc, count_func, &unhandled);
> + assert(unhandled > 0);
> + if (unhandled > 1)
> + return 0;
> + return 1;
> +}
> +
> +static errcode_t pass1d_free_clusters(ocfs2_filesys *fs, uint32_t len,
> + uint64_t start, void *free_data)
> +{
> + errcode_t ret = 0;
> + int was_set;
> + struct fix_dup_context *fd = free_data;
> + uint32_t p_cpos, p_start = ocfs2_blocks_to_clusters(fs, start);
> +
> + for (p_cpos = p_start; p_cpos < (p_start + len); p_cpos++) {
> + verbosef("checking cpos %"PRIu32"\n", p_cpos);
> + ret = ocfs2_bitmap_test(fd->fd_ost->ost_duplicate_clusters,
> + p_cpos, &was_set);
> + if (ret) {
> + com_err(whoami, ret,
> + "while testing cluster %"PRIu32" in "
> + "the duplicate cluster map",
> + p_cpos);
> + break;
> + }
> +
> + verbosef("cpos %"PRIu32" was_set == %d\n", p_cpos, was_set);
> + if (was_set) {
> + if (can_free(fd->fd_dct, p_cpos))
> + verbosef("Freeing multiply-claimed cluster "
> + "%"PRIu32", as it is no longer used\n",
> + p_cpos);
> + else
> + continue;
> + }
> +
> + verbosef("Freeing cluster %"PRIu32"\n", p_cpos);
> + ret = ocfs2_free_clusters(fd->fd_ost->ost_fs, 1,
> + ocfs2_clusters_to_blocks(fs, p_cpos));
> + if (ret) {
> + com_err(whoami, ret,
> + "while freeing duplicate cluster "
> + "%"PRIu32,
> + p_cpos);
> + break;
> + }
> + }
> +
> + return ret;
> +}
> +
> +static int delete_one_inode(struct fix_dup_context *fd, uint64_t ino)
> +{
> + errcode_t ret;
> + o2fsck_state *ost = fd->fd_ost;
> +
> + verbosef("Truncating inode %"PRIu64"\n", ino);
> + ret = ocfs2_truncate_full(ost->ost_fs, ino, 0,
> + pass1d_free_clusters, fd);
> + if (ret) {
> + com_err(whoami, ret,
> + "while truncating inode %"PRIu64" to remove it",
> + ino);
> + goto out;
> + }
> +
> + verbosef("Deleting inode %"PRIu64"\n", ino);
> + ret = ocfs2_delete_inode(ost->ost_fs, ino);
> + if (ret)
> + com_err(whoami, ret, "while removing inode %"PRIu64, ino);
> + else
> + o2fsck_icount_set(ost->ost_icount_in_inodes, ino, 0);
> +
> +out:
> + return ret ? 1 : 0;
> +}
> +
> +static int clone_one_inode(struct fix_dup_context *fd, struct dup_inode *di)
> +{
> + errcode_t ret, tmpret;
> + ocfs2_filesys *fs = fd->fd_ost->ost_fs;
> + ocfs2_cached_inode *orig_ci = NULL, *clone_ci = NULL;
> +
> + ret = ocfs2_read_cached_inode(fs, di->di_ino, &orig_ci);
> + if (ret) {
> + com_err(whoami, ret,
> + "while reading inode \"%s\" to clone it",
> + di->di_path);
> + goto out;
> + }
> +
> + ret = new_clone(fs, orig_ci, &clone_ci);
> + if (ret)
> + goto out;
> +
> + verbosef("Copying inode \"%s\" to clone %"PRIu64"\n", di->di_path,
> + clone_ci->ci_blkno);
> + ret = copy_clone(fs, orig_ci, clone_ci);
> + if (ret)
> + goto out;
> +
> + ret = swap_clone(fs, orig_ci, clone_ci);
> +
> +out:
> + if (orig_ci)
> + ocfs2_free_cached_inode(fs, orig_ci);
> + if (clone_ci) {
> + tmpret = delete_one_inode(fd, clone_ci->ci_blkno);
> + if (!ret)
> + ret = tmpret;
> + ocfs2_free_cached_inode(fs, clone_ci);
> + }
> + return ret ? 1 : 0;
> +}
> +
> static int fix_dups_func(struct dup_cluster *dc, struct dup_inode *di,
> void *priv_data)
> {
> + int ret = 0;
> struct fix_dup_context *fd = priv_data;
>
> if (di->di_flags & OCFS2_CHAIN_FL) {
> @@ -1039,7 +1324,9 @@ static int fix_dups_func(struct dup_cluster *dc, struct dup_inode *di,
> "break claims on clusters it shares with other "
> "inodes?",
> di->di_path, di->di_path)) {
> - return 0;
> + ret = clone_one_inode(fd, di);
> + if (!ret)
> + di->di_state |= DUP_INODE_CLONED;
> }
> } else {
> if (prompt(fd->fd_ost, PY, PR_DUP_CLUSTERS_CLONE,
> @@ -1048,17 +1335,20 @@ static int fix_dups_func(struct dup_cluster *dc, struct dup_inode *di,
> "Clone inode \"%s\" to break claims on "
> "clusters it shares with other inodes?",
> di->di_path, di->di_path)) {
> - return 0;
> - }
> - if (prompt(fd->fd_ost, PN, PR_DUP_CLUSTERS_DELETE,
> - "Delete inode \"%s\" to break claims on "
> - "clusters it shares with other inodes?",
> - di->di_path)) {
> - return 0;
> + ret = clone_one_inode(fd, di);
> + if (!ret)
> + di->di_state |= DUP_INODE_CLONED;
> + } else if (prompt(fd->fd_ost, PN, PR_DUP_CLUSTERS_DELETE,
> + "Delete inode \"%s\" to break claims on "
> + "clusters it shares with other inodes?",
> + di->di_path)) {
> + ret = delete_one_inode(fd, di->di_ino);
> + if (!ret)
> + di->di_state |= DUP_INODE_REMOVED;
> }
> }
>
> - return 0;
> + return ret;
> }
>
> static errcode_t o2fsck_pass1d(o2fsck_state *ost, struct dup_context *dct)
More information about the Ocfs2-tools-devel
mailing list