[Ocfs2-devel] [PATCH 2/2] ocfs2: issue zeroout to EOF blocks

Tue Jul 20 20:32:48 PDT 2021

On 7/14/21 5:13 AM, Junxiao Bi wrote:
> For punch holes in EOF blocks, fallocate used buffer write to zero
> the EOF blocks in last cluster. But since ->writepage will ignore
> EOF pages, those zeros will not be flushed. This "looks" ok as
> commit 6bba4471f0cc ("ocfs2: fix data corruption by fallocate")
> will zero the EOF blocks when extend the file size, but it isn't.
> The problem happened on those EOF pages, before writeback, those
> pages had DIRTY flag set and all buffer_head in them also had
> DIRTY flag set, when writeback run by write_cache_pages(), DIRTY
> flag on the page was cleared, but DIRTY flag on the buffer_head
> not. When next write happened to those EOF pages, since buffer_head
> already had DIRTY flag set, it would not mark page DIRTY again.
> That made writeback ignore them forever. That will cause data
> corruption. Even directio write can't work because it will fail
> when trying to drop pages caches before direct io, as it found
> the buffer_head for those pages still had DIRTY flag set, then
> it will fall back to buffer io mode.
> To make a summary of the issue, as writeback ingores EOF pages,
> once any EOF page is generated, any write to it will only go
> to the page cache, it will never be flushed to disk even file
> size extends and that page is not EOF page any more.
> The fix is to avoid zero EOF blocks with buffer write.
> 
> The following code snippet from qemu-img could trigger the corruption.
> 
> 656   open("6b3711ae-3306-4bdd-823c-cf1c0060a095.conv.2", O_RDWR|O_DIRECT|O_CLOEXEC) = 11
> ...
> 660   fallocate(11, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE, 2275868672, 327680 <unfinished ...>
> 660   fallocate(11, 0, 2275868672, 327680) = 0
> 658   pwrite64(11, "\0\31\237\v\0\336\330\f\0\373~\r\0\300\270\16\0\335^\17\0\242\230\20\0\277>\21\0\204x\22"..., 311296, 2275868672) = 311296
> 
> Cc: <stable at vger.kernel.org>
> Signed-off-by: Junxiao Bi <junxiao.bi at oracle.com>
> ---
>  fs/ocfs2/file.c | 99 ++++++++++++++++++++++++++++++-------------------
>  1 file changed, 60 insertions(+), 39 deletions(-)
> 
> diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
> index 53bb46ce3cbb..984b950f5abc 100644
> --- a/fs/ocfs2/file.c
> +++ b/fs/ocfs2/file.c
> @@ -1529,6 +1529,45 @@ static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
>  	}
>  }
>  
> +/*
> + * zero out partial blocks of one cluster.
> + *
> + * start: file offset where zero starts, will be made upper block aligned.
> + * len: it will be trimmed to the end of current cluster if "start + len"
> + *      is bigger than it.
> + */
> +static int ocfs2_zeroout_partial_cluster(struct inode *inode,
> +					u64 start, u64 len)
> +{
> +	int ret;
> +	u64 start_block, end_block, nr_blocks;
> +	u64 p_block, offset;
> +	u32 cluster, p_cluster, nr_clusters;
> +	struct super_block *sb = inode->i_sb;
> +	u64 end = ocfs2_align_bytes_to_clusters(sb, start);
> +
> +	if (start + len < end)
> +		end = start + len;
> +
> +	start_block = ocfs2_blocks_for_bytes(sb, start);
> +	end_block = ocfs2_blocks_for_bytes(sb, end);
> +	nr_blocks = end_block - start_block;
> +	if (!nr_blocks)
> +		return 0;
> +
> +	cluster = ocfs2_bytes_to_clusters(sb, start);
> +	ret = ocfs2_get_clusters(inode, cluster, &p_cluster,
> +				&nr_clusters, NULL);
> +	if (ret)
> +		return ret;
> +	if (!p_cluster)
> +		return 0;
> +
> +	offset = start_block - ocfs2_clusters_to_blocks(sb, cluster);
> +	p_block = ocfs2_clusters_to_blocks(sb, p_cluster) + offset;
> +	return sb_issue_zeroout(sb, p_block, nr_blocks, GFP_NOFS);
> +}
> +
>  static int ocfs2_zero_partial_clusters(struct inode *inode,
>  				       u64 start, u64 len)
>  {
> @@ -1538,6 +1577,7 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
>  	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
>  	unsigned int csize = osb->s_clustersize;
>  	handle_t *handle;
> +	loff_t isize = i_size_read(inode);
>  
>  	/*
>  	 * The "start" and "end" values are NOT necessarily part of
> @@ -1558,6 +1598,26 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
>  	if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
>  		goto out;
>  
> +	/* No page cache for EOF blocks, issue zero out to disk. */
> +	if (end > isize) {
> +		/*
> +		 * zeroout eof blocks in last cluster starting from
> +		 * "isize" even "start" > "isize" because it is
> +		 * complicated to zeroout just at "start" as "start"
> +		 * may be not aligned with block size, buffer write
> +		 * would be required to do that, but out of eof buffer
> +		 * write is not supported.
> +		 */
> +		ret = ocfs2_zeroout_partial_cluster(inode, isize,
> +					end - isize);
> +		if (ret) {
> +			mlog_errno(ret);
> +			return ret;

Better to use "goto out" to keep code consistent.

> +		}
> +		if (start >= isize)
> +			return ret;

Ditto.

> +		end = isize;
> +	}
>  	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
>  	if (IS_ERR(handle)) {
>  		ret = PTR_ERR(handle);
> @@ -1855,45 +1915,6 @@ int ocfs2_remove_inode_range(struct inode *inode,
>  	return ret;
>  }
>  
> -/*
> - * zero out partial blocks of one cluster.
> - *
> - * start: file offset where zero starts, will be made upper block aligned.
> - * len: it will be trimmed to the end of current cluster if "start + len"
> - *      is bigger than it.
> - */
> -static int ocfs2_zeroout_partial_cluster(struct inode *inode,
> -					u64 start, u64 len)
> -{
> -	int ret;
> -	u64 start_block, end_block, nr_blocks;
> -	u64 p_block, offset;
> -	u32 cluster, p_cluster, nr_clusters;
> -	struct super_block *sb = inode->i_sb;
> -	u64 end = ocfs2_align_bytes_to_clusters(sb, start);
> -
> -	if (start + len < end)
> -		end = start + len;
> -
> -	start_block = ocfs2_blocks_for_bytes(sb, start);
> -	end_block = ocfs2_blocks_for_bytes(sb, end);
> -	nr_blocks = end_block - start_block;
> -	if (!nr_blocks)
> -		return 0;
> -
> -	cluster = ocfs2_bytes_to_clusters(sb, start);
> -	ret = ocfs2_get_clusters(inode, cluster, &p_cluster,
> -				&nr_clusters, NULL);
> -	if (ret)
> -		return ret;
> -	if (!p_cluster)
> -		return 0;
> -
> -	offset = start_block - ocfs2_clusters_to_blocks(sb, cluster);
> -	p_block = ocfs2_clusters_to_blocks(sb, p_cluster) + offset;
> -	return sb_issue_zeroout(sb, p_block, nr_blocks, GFP_NOFS);
> -}
> -
>  /*
>   * Parts of this function taken from xfs_change_file_space()
>   */
>