[Ocfs2-devel] ocfs2: serialize unaligned aio
Tao Ma
tm at tao.ma
Wed Jun 22 20:44:42 PDT 2011
Hi Mark,
On 06/23/2011 05:23 AM, Mark Fasheh wrote:
> Fix a corruption that can happen when we have (two or more) outstanding
> aio's to an overlapping unaligned region. Ext4
> (e9e3bcecf44c04b9e6b505fd8e2eb9cea58fb94d) and xfs recently had to fix
> similar issues.
>
> In our case what happens is that we can have an outstanding aio on a region
> and if a write comes in with some bytes overlapping the original aio we may
> decide to read that region into a page before continuing (typically because
> of buffered-io fallback). Since we have no ordering guarantees with the
> aio, we can read stale or bad data into the page and then write it back out.
>
> If the i/o is page and block aligned, then we avoid this issue as there
> won't be any need to read data from disk.
>
> I took the same approach as Eric in the ext4 patch and introduced some
> serialization of unaligned async direct i/o. I don't expect this to have an
> effect on the most common cases of AIO. Unaligned aio will be slower
> though, but that's far more acceptable than data corruption.
>
> Signed-off-by: Mark Fasheh <mfasheh at suse.com>
>
> ---
> fs/ocfs2/aops.c | 10 ++++++++++
> fs/ocfs2/aops.h | 14 ++++++++++++++
> fs/ocfs2/file.c | 38 ++++++++++++++++++++++++++++++++++++++
> fs/ocfs2/inode.h | 3 +++
> fs/ocfs2/super.c | 10 ++++++++--
> 5 files changed, 73 insertions(+), 2 deletions(-)
>
> diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
> index ac97bca..4c1ec8f 100644
> --- a/fs/ocfs2/aops.c
> +++ b/fs/ocfs2/aops.c
> @@ -564,6 +564,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
> {
> struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
> int level;
> + wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
>
> /* this io's submitter should not have unlocked this before we could */
> BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
> @@ -573,6 +574,15 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
> ocfs2_iocb_clear_sem_locked(iocb);
> }
>
> + if (ocfs2_iocb_is_unaligned_aio(iocb)) {
> + ocfs2_iocb_clear_unaligned_aio(iocb);
> +
> + if (atomic_dec_and_test(&OCFS2_I(inode)->ip_unaligned_aio) &&
> + waitqueue_active(wq)) {
> + wake_up_all(wq);
> + }
> + }
> +
> ocfs2_iocb_clear_rw_locked(iocb);
>
> level = ocfs2_iocb_rw_locked_level(iocb);
> diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
> index 75cf3ad..ffb2da3 100644
> --- a/fs/ocfs2/aops.h
> +++ b/fs/ocfs2/aops.h
> @@ -78,6 +78,7 @@ enum ocfs2_iocb_lock_bits {
> OCFS2_IOCB_RW_LOCK = 0,
> OCFS2_IOCB_RW_LOCK_LEVEL,
> OCFS2_IOCB_SEM,
> + OCFS2_IOCB_UNALIGNED_IO,
> OCFS2_IOCB_NUM_LOCKS
> };
>
> @@ -91,4 +92,17 @@ enum ocfs2_iocb_lock_bits {
> clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
> #define ocfs2_iocb_is_sem_locked(iocb) \
> test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
> +
> +#define ocfs2_iocb_set_unaligned_aio(iocb) \
> + set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
> +#define ocfs2_iocb_clear_unaligned_aio(iocb) \
> + clear_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
> +#define ocfs2_iocb_is_unaligned_aio(iocb) \
> + test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
> +
> +#define OCFS2_IOEND_WQ_HASH_SZ 37
> +#define ocfs2_ioend_wq(v) (&ocfs2__ioend_wq[((unsigned long)(v)) %\
> + OCFS2_IOEND_WQ_HASH_SZ])
> +extern wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ];
> +
> #endif /* OCFS2_FILE_H */
> diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
> index 89659d6..4c909c9 100644
> --- a/fs/ocfs2/file.c
> +++ b/fs/ocfs2/file.c
> @@ -2038,6 +2038,23 @@ out:
> return ret;
> }
>
> +static void ocfs2_aiodio_wait(struct inode *inode)
> +{
> + wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
> +
> + wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0));
> +}
> +
> +static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
> +{
> + int blockmask = inode->i_sb->s_blocksize - 1;
> + loff_t final_size = pos + count;
> +
> + if ((pos & blockmask) || (final_size & blockmask))
> + return 1;
> + return 0;
> +}
> +
> static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
> struct file *file,
> loff_t pos, size_t count,
> @@ -2216,6 +2233,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
> struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
> int full_coherency = !(osb->s_mount_opt &
> OCFS2_MOUNT_COHERENCY_BUFFERED);
> + int unaligned_dio = 0;
>
> trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
> (unsigned long long)OCFS2_I(inode)->ip_blkno,
> @@ -2284,6 +2302,10 @@ relock:
> goto out;
> }
>
> + if (direct_io && !is_sync_kiocb(iocb))
> + unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left,
> + *ppos);
> +
> /*
> * We can't complete the direct I/O as requested, fall back to
> * buffered I/O.
> @@ -2299,6 +2321,18 @@ relock:
> goto relock;
> }
>
> + if (unaligned_dio) {
> + /*
> + * Wait on previous unaligned aio to complete before
> + * proceeding.
> + */
> + ocfs2_aiodio_wait(inode);
> +
> + /* Mark the iocb as needing a decrement in ocfs2_dio_end_io */
> + atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio);
> + ocfs2_iocb_set_unaligned_aio(iocb);
> + }
> +
I just have one qs here. In ocfs2_aiodio_wait, we just wait for the
event and in ocfs2_dio_end_io, we will wake up all the waiters.
So if there are 2 aios comes when 1 aio is proceeding, these 2 will wait
here and after the first 1 aio is completed, both of these 2 will be
waken up and they will be running at the same time actually. Maybe we
have to used exclusive wait here and use wake_up so that only 1 of these
2 will be waken up to proceed? Or is it designed like this intentionally?
I have gone through the patches of ext4 and yes, it does the similar
thing, but I am not sure whether the case is the same or not since I am
not quite familiar with ext4 by now.
Regards,
Tao
> /*
> * To later detect whether a journal commit for sync writes is
> * necessary, we sample i_size, and cluster count here.
> @@ -2371,8 +2405,12 @@ out_dio:
> if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
> rw_level = -1;
> have_alloc_sem = 0;
> + unaligned_dio = 0;
> }
>
> + if (unaligned_dio)
> + atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio);
> +
> out:
> if (rw_level != -1)
> ocfs2_rw_unlock(inode, rw_level);
> diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
> index 1c508b1..88924a3 100644
> --- a/fs/ocfs2/inode.h
> +++ b/fs/ocfs2/inode.h
> @@ -43,6 +43,9 @@ struct ocfs2_inode_info
> /* protects extended attribute changes on this inode */
> struct rw_semaphore ip_xattr_sem;
>
> + /* Number of outstanding AIO's which are not page aligned */
> + atomic_t ip_unaligned_aio;
> +
> /* These fields are protected by ip_lock */
> spinlock_t ip_lock;
> u32 ip_open_count;
> diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
> index 5a521c7..6b7b415 100644
> --- a/fs/ocfs2/super.c
> +++ b/fs/ocfs2/super.c
> @@ -53,6 +53,7 @@
> #include "ocfs1_fs_compat.h"
>
> #include "alloc.h"
> +#include "aops.h"
> #include "blockcheck.h"
> #include "dlmglue.h"
> #include "export.h"
> @@ -1615,12 +1616,17 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
> return 0;
> }
>
> +wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ];
> +
> static int __init ocfs2_init(void)
> {
> - int status;
> + int status, i;
>
> ocfs2_print_version();
>
> + for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++)
> + init_waitqueue_head(&ocfs2__ioend_wq[i]);
> +
> status = init_ocfs2_uptodate_cache();
> if (status < 0) {
> mlog_errno(status);
> @@ -1759,7 +1765,7 @@ static void ocfs2_inode_init_once(void *data)
> ocfs2_extent_map_init(&oi->vfs_inode);
> INIT_LIST_HEAD(&oi->ip_io_markers);
> oi->ip_dir_start_lookup = 0;
> -
> + atomic_set(&oi->ip_unaligned_aio, 0);
> init_rwsem(&oi->ip_alloc_sem);
> init_rwsem(&oi->ip_xattr_sem);
> mutex_init(&oi->ip_io_mutex);
More information about the Ocfs2-devel
mailing list