[Ocfs2-devel] [PATCH 2/2] ocfs2: issue zeroout to EOF blocks
Joseph Qi
joseph.qi at linux.alibaba.com
Tue Jul 20 20:32:48 PDT 2021
On 7/14/21 5:13 AM, Junxiao Bi wrote:
> When punching holes in EOF blocks, fallocate used buffered writes to zero
> the EOF blocks in the last cluster. But since ->writepage will ignore
> EOF pages, those zeros will not be flushed. This "looks" ok as
> commit 6bba4471f0cc ("ocfs2: fix data corruption by fallocate")
> will zero the EOF blocks when extending the file size, but it isn't.
> The problem happens on those EOF pages. Before writeback, those
> pages had the DIRTY flag set and all buffer_heads in them also had the
> DIRTY flag set. When writeback was run by write_cache_pages(), the DIRTY
> flag on the page was cleared, but the DIRTY flag on the buffer_heads
> was not. When the next write happened to those EOF pages, since the
> buffer_heads already had the DIRTY flag set, it would not mark the page
> DIRTY again. That made writeback ignore them forever, which causes data
> corruption. Even a direct I/O write can't work, because it fails
> when trying to drop the page cache before the direct I/O: it finds that
> the buffer_heads for those pages still have the DIRTY flag set, and then
> falls back to buffered I/O mode.
> To summarize the issue: as writeback ignores EOF pages,
> once any EOF page is generated, any write to it will only go
> to the page cache; it will never be flushed to disk, even after the file
> size is extended and that page is no longer an EOF page.
> The fix is to avoid zeroing EOF blocks with buffered writes.
>
> The following code snippet from qemu-img could trigger the corruption.
>
> 656 open("6b3711ae-3306-4bdd-823c-cf1c0060a095.conv.2", O_RDWR|O_DIRECT|O_CLOEXEC) = 11
> ...
> 660 fallocate(11, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE, 2275868672, 327680 <unfinished ...>
> 660 fallocate(11, 0, 2275868672, 327680) = 0
> 658 pwrite64(11, "\0\31\237\v\0\336\330\f\0\373~\r\0\300\270\16\0\335^\17\0\242\230\20\0\277>\21\0\204x\22"..., 311296, 2275868672) = 311296
>
> Cc: <stable at vger.kernel.org>
> Signed-off-by: Junxiao Bi <junxiao.bi at oracle.com>
> ---
> fs/ocfs2/file.c | 99 ++++++++++++++++++++++++++++++-------------------
> 1 file changed, 60 insertions(+), 39 deletions(-)
>
> diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
> index 53bb46ce3cbb..984b950f5abc 100644
> --- a/fs/ocfs2/file.c
> +++ b/fs/ocfs2/file.c
> @@ -1529,6 +1529,45 @@ static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
> }
> }
>
> +/*
> + * zero out partial blocks of one cluster.
> + *
> + * start: file offset where zero starts, will be made upper block aligned.
> + * len: it will be trimmed to the end of current cluster if "start + len"
> + * is bigger than it.
> + */
> +static int ocfs2_zeroout_partial_cluster(struct inode *inode,
> + u64 start, u64 len)
> +{
> + int ret;
> + u64 start_block, end_block, nr_blocks;
> + u64 p_block, offset;
> + u32 cluster, p_cluster, nr_clusters;
> + struct super_block *sb = inode->i_sb;
> + u64 end = ocfs2_align_bytes_to_clusters(sb, start);
> +
> + if (start + len < end)
> + end = start + len;
> +
> + start_block = ocfs2_blocks_for_bytes(sb, start);
> + end_block = ocfs2_blocks_for_bytes(sb, end);
> + nr_blocks = end_block - start_block;
> + if (!nr_blocks)
> + return 0;
> +
> + cluster = ocfs2_bytes_to_clusters(sb, start);
> + ret = ocfs2_get_clusters(inode, cluster, &p_cluster,
> + &nr_clusters, NULL);
> + if (ret)
> + return ret;
> + if (!p_cluster)
> + return 0;
> +
> + offset = start_block - ocfs2_clusters_to_blocks(sb, cluster);
> + p_block = ocfs2_clusters_to_blocks(sb, p_cluster) + offset;
> + return sb_issue_zeroout(sb, p_block, nr_blocks, GFP_NOFS);
> +}
> +
> static int ocfs2_zero_partial_clusters(struct inode *inode,
> u64 start, u64 len)
> {
> @@ -1538,6 +1577,7 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
> struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
> unsigned int csize = osb->s_clustersize;
> handle_t *handle;
> + loff_t isize = i_size_read(inode);
>
> /*
> * The "start" and "end" values are NOT necessarily part of
> @@ -1558,6 +1598,26 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
> if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
> goto out;
>
> + /* No page cache for EOF blocks, issue zero out to disk. */
> + if (end > isize) {
> + /*
> + * zeroout eof blocks in last cluster starting from
> + * "isize" even "start" > "isize" because it is
> + * complicated to zeroout just at "start" as "start"
> + * may be not aligned with block size, buffer write
> + * would be required to do that, but out of eof buffer
> + * write is not supported.
> + */
> + ret = ocfs2_zeroout_partial_cluster(inode, isize,
> + end - isize);
> + if (ret) {
> + mlog_errno(ret);
> + return ret;
Better to use "goto out" to keep code consistent.
> + }
> + if (start >= isize)
> + return ret;
Ditto.
> + end = isize;
> + }
> handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
> if (IS_ERR(handle)) {
> ret = PTR_ERR(handle);
> @@ -1855,45 +1915,6 @@ int ocfs2_remove_inode_range(struct inode *inode,
> return ret;
> }
>
> -/*
> - * zero out partial blocks of one cluster.
> - *
> - * start: file offset where zero starts, will be made upper block aligned.
> - * len: it will be trimmed to the end of current cluster if "start + len"
> - * is bigger than it.
> - */
> -static int ocfs2_zeroout_partial_cluster(struct inode *inode,
> - u64 start, u64 len)
> -{
> - int ret;
> - u64 start_block, end_block, nr_blocks;
> - u64 p_block, offset;
> - u32 cluster, p_cluster, nr_clusters;
> - struct super_block *sb = inode->i_sb;
> - u64 end = ocfs2_align_bytes_to_clusters(sb, start);
> -
> - if (start + len < end)
> - end = start + len;
> -
> - start_block = ocfs2_blocks_for_bytes(sb, start);
> - end_block = ocfs2_blocks_for_bytes(sb, end);
> - nr_blocks = end_block - start_block;
> - if (!nr_blocks)
> - return 0;
> -
> - cluster = ocfs2_bytes_to_clusters(sb, start);
> - ret = ocfs2_get_clusters(inode, cluster, &p_cluster,
> - &nr_clusters, NULL);
> - if (ret)
> - return ret;
> - if (!p_cluster)
> - return 0;
> -
> - offset = start_block - ocfs2_clusters_to_blocks(sb, cluster);
> - p_block = ocfs2_clusters_to_blocks(sb, p_cluster) + offset;
> - return sb_issue_zeroout(sb, p_block, nr_blocks, GFP_NOFS);
> -}
> -
> /*
> * Parts of this function taken from xfs_change_file_space()
> */
>
More information about the Ocfs2-devel
mailing list