Subject: [PATCH] NFS: Use asynchronous reads to handle direct read requests

Category: Performance enhancement

Description:
 The initial implementation of NFS direct reads was entirely synchronous. The direct read logic issued one NFS READ operation at a time, and waited for the server's reply before issuing the next one. For large direct read requests, this is very slow.

 This patch changes the NFS direct read path to dispatch all of the NFS READ operations for a single direct read request in parallel, and then wait once for all of them to complete. The direct read path is still synchronous from the caller's point of view, but because the NFS READ operations proceed in parallel, the completion wait should be much shorter.

 *** This patch is an experimental prototype. It is safe to use, but the contents of the patch may change before it is integrated into the kernel.

 Note that a similar patch is possible in the direct write path. It will be forthcoming once we are confident that the read path is working satisfactorily.

Test-plan:
 Millions of operations with fsx-odirect. OraSim with a direct job file and small rsize. Use sio with -direct to generate large sequential reads or large random reads.
fs/nfs/direct.c        |   74 --------------
fs/nfs/read.c          |  212 +++++++++++++++++++++++++++++++++++++++++
include/linux/nfs_fs.h |    2
3 files changed, 215 insertions(+), 73 deletions(-)

Signed-off-by: Chuck Lever
Applies-to: 2.6.8-rc4
Created: Fri, 13 Aug 2004 15:26:33 -0400
System-ID: Linux climax.citi.umich.edu 2.6.8-rc4 i386

diff -X /home/cel/src/linux/dont-diff -Naurp 07-nfs-sigmask/fs/nfs/direct.c 08-nfs-odirect-read/fs/nfs/direct.c
--- 07-nfs-sigmask/fs/nfs/direct.c	2004-08-10 11:01:36.221666000 -0400
+++ 08-nfs-odirect-read/fs/nfs/direct.c	2004-08-10 11:22:17.405809000 -0400
@@ -33,6 +33,7 @@
  * 08 Jul 2002 Version for 2.4.19, with bug fixes --trondmy
  * 08 Jun 2003 Port to 2.5 APIs --cel
  * 31 Mar 2004 Handle direct I/O without VFS support --cel
+ * 13 Jul 2004 Dispatch multiple reads in parallel --cel
  *
  */
 
@@ -107,79 +108,6 @@ nfs_free_user_pages(struct page **pages,
 }
 
 /**
- * nfs_direct_read_seg - Read in one iov segment.  Generate separate
- *                        read RPCs for each "rsize" bytes.
- * @inode: target inode
- * @file: target file (may be NULL)
- * user_addr: starting address of this segment of user's buffer
- * count: size of this segment
- * file_offset: offset in file to begin the operation
- * @pages: array of addresses of page structs defining user's buffer
- * nr_pages: size of pages array
- */
-static int
-nfs_direct_read_seg(struct inode *inode, struct file *file,
-		unsigned long user_addr, size_t count, loff_t file_offset,
-		struct page **pages, int nr_pages)
-{
-	const unsigned int rsize = NFS_SERVER(inode)->rsize;
-	int tot_bytes = 0;
-	int curpage = 0;
-	struct nfs_read_data	rdata = {
-		.inode		= inode,
-		.args		= {
-			.fh		= NFS_FH(inode),
-			.lockowner	= current->files,
-		},
-		.res		= {
-			.fattr		= &rdata.fattr,
-		},
-	};
-
-	rdata.args.pgbase = user_addr & ~PAGE_MASK;
-	rdata.args.offset = file_offset;
-	do {
-		int result;
-
-		rdata.args.count = count;
-		if (rdata.args.count > rsize)
-			rdata.args.count = rsize;
-		rdata.args.pages = &pages[curpage];
-
-		dprintk("NFS: direct read: c=%u o=%Ld ua=%lu, pb=%u, cp=%u\n",
-			rdata.args.count, (long long) rdata.args.offset,
-			user_addr + tot_bytes, rdata.args.pgbase, curpage);
-
-		lock_kernel();
-		result = NFS_PROTO(inode)->read(&rdata, file);
-		unlock_kernel();
-
-		if (result <= 0) {
-			if (tot_bytes > 0)
-				break;
-			if (result == -EISDIR)
-				result = -EINVAL;
-			return result;
-		}
-
-		tot_bytes += result;
-		if (rdata.res.eof)
-			break;
-
-		rdata.args.offset += result;
-		rdata.args.pgbase += result;
-		curpage += rdata.args.pgbase >> PAGE_SHIFT;
-		rdata.args.pgbase &= ~PAGE_MASK;
-		count -= result;
-	} while (count != 0);
-
-	/* XXX: should we zero the rest of the user's buffer if we
-	 *      hit eof? */
-
-	return tot_bytes;
-}
-
-/**
  * nfs_direct_read - For each iov segment, map the user's buffer
  *                   then generate read RPCs.
  * @inode: target inode
diff -X /home/cel/src/linux/dont-diff -Naurp 07-nfs-sigmask/fs/nfs/read.c 08-nfs-odirect-read/fs/nfs/read.c
--- 07-nfs-sigmask/fs/nfs/read.c	2004-08-10 11:20:33.329074000 -0400
+++ 08-nfs-odirect-read/fs/nfs/read.c	2004-08-10 11:22:17.410809000 -0400
@@ -35,6 +35,7 @@
 #define NFSDBG_FACILITY NFSDBG_PAGECACHE
 
 static int nfs_pagein_one(struct list_head *, struct inode *);
+static void nfs_readpage_result_direct(struct nfs_read_data *, int);
 static void nfs_readpage_result_partial(struct nfs_read_data *, int);
 static void nfs_readpage_result_full(struct nfs_read_data *, int);
 
@@ -400,6 +401,196 @@ nfs_pagein_list(struct list_head *head, 
 	return error;
 }
 
+struct nfs_direct_read_req {
+	struct completion	done;		/* completed by the last reply */
+	atomic_t		complete,	/* i/os we're waiting for */
+				count,		/* bytes actually read */
+				error;		/* any reported error */
+};
+
+/**
+ * nfs_direct_read_seg - Read in one iov segment in a direct read
+ *                       request.  Dispatch an async read RPC for
+ *                       each "rsize" bytes.
+ * @inode: target inode
+ * @file: target file (may be NULL)
+ * @user_addr: starting address of this segment of user's buffer
+ * @count: size of this segment
+ * @file_offset: offset in file to begin the operation
+ * @pages: array of addresses of page structs defining user's buffer
+ * @nr_pages: size of pages array
+ *
+ * When we get here, the user's buffer has already been pinned down.
+ * We loop over the buffer and issue asynchronous NFS READ operations,
+ * then wait for them all to complete.
+ *
+ * The wait is never cut short, not even by a signal: every READ we
+ * dispatch carries a pointer to the request tracker on our stack, so
+ * we must not return until the last reply has been handled or a late
+ * reply would write into a dead stack frame.
+ *
+ * Returns the number of bytes read, or a negative errno if memory
+ * could not be allocated or any of the READs reported an error.
+ */
+int
+nfs_direct_read_seg(struct inode *inode, struct file *file,
+		unsigned long user_addr, size_t count, loff_t file_offset,
+		struct page **pages, int nr_pages)
+{
+	sigset_t oldset;
+	unsigned int nbytes, curpage, pgbase, buf_offset;
+	int result;
+
+	const unsigned int rsize = NFS_SERVER(inode)->rsize;
+	struct rpc_clnt *clnt = NFS_CLIENT(inode);
+	LIST_HEAD(list);
+	struct nfs_page req = {
+		.wb_inode = inode,
+	};
+	struct nfs_direct_read_req dreq = {
+		.complete = ATOMIC_INIT(0),
+		.count = ATOMIC_INIT(0),
+		.error = ATOMIC_INIT(0),
+	};
+
+	init_completion(&dreq.done);
+
+	/*
+	 * Set up credential and state information for this request
+	 */
+	NFS_PROTO(inode)->request_init(&req, file);
+
+	/*
+	 * Fail immediately if we can't get all our dynamic resources
+	 * first; this gives us a better guarantee of request idempotency
+	 * in the case of low memory, and makes our failure case very
+	 * easy.
+	 */
+	nbytes = count;
+	for(;;) {
+		struct nfs_read_data *data = nfs_readdata_alloc();
+
+		if (unlikely(!data)) {
+			while (!list_empty(&list)) {
+				data = list_entry(list.next,
+					struct nfs_read_data, pages);
+				list_del(&data->pages);
+				nfs_readdata_free(data);
+			}
+			put_rpccred(req.wb_cred);
+			dfprintk(VFS, "NFS: direct read failed, -ENOMEM\n");
+			return -ENOMEM;
+		}
+
+		INIT_LIST_HEAD(&data->pages);
+		list_add(&data->pages, &list);
+
+		data->req = (struct nfs_page *) &dreq;
+		data->inode = inode;
+		data->cred = req.wb_cred;
+
+		data->args.fh = NFS_FH(inode);
+		data->args.lockowner = req.wb_lockowner;
+		data->args.state = req.wb_state;
+
+		data->res.fattr = &data->fattr;
+		data->res.eof = 0;
+
+		atomic_inc(&dreq.complete);
+
+		if (nbytes <= rsize)
+			break;
+		nbytes -= rsize;
+	}
+
+	/*
+	 * We need these for the rpc_execute() call below.  Both of
+	 * these can be computationally expensive on a busy SMP system,
+	 * so we do this once before dispatching the whole I/O request;
+	 * otherwise we wait before every rpc_execute() call.  Note that
+	 * the rpc_execute() path has a periodic schedule() call, so
+	 * pre-emption points in this loop should be unnecessary.
+	 */
+	rpc_clnt_sigmask(clnt, &oldset);
+	lock_kernel();
+
+	/*
+	 * For each read_data struct that was allocated above, dispatch
+	 * an NFS READ operation
+	 */
+	curpage = 0;
+	buf_offset = 0;
+	pgbase = user_addr & ~PAGE_MASK;
+	do {
+		struct nfs_read_data *data;
+		unsigned int bytes;
+
+		bytes = rsize;
+		if (count < rsize)
+			bytes = count;
+
+		data = list_entry(list.next, struct nfs_read_data, pages);
+		list_del_init(&data->pages);
+
+		data->args.offset = file_offset;
+		data->args.pgbase = pgbase;
+		data->args.pages = &pages[curpage];
+		data->args.count = bytes;
+		data->res.count = bytes;
+
+		dfprintk(VFS, "NFS: direct read: c=%u o=%Ld ua=%lu, pb=%u, cp=%u\n",
+			data->args.count, (long long) data->args.offset,
+			user_addr + buf_offset, data->args.pgbase, curpage);
+
+		NFS_PROTO(inode)->read_setup(data);
+
+		data->task.tk_cookie = (unsigned long) inode;
+		data->task.tk_calldata = data;
+		data->task.tk_release = nfs_readdata_release;
+		data->complete = nfs_readpage_result_direct;
+
+		rpc_execute(&data->task);
+
+		dfprintk(VFS, "NFS: %4d initiated direct read call (req %s/%Ld, %u bytes @ offset %Lu)\n",
+				data->task.tk_pid,
+				inode->i_sb->s_id,
+				(long long)NFS_FILEID(inode),
+				bytes,
+				(unsigned long long)data->args.offset);
+
+		file_offset += bytes;
+		buf_offset += bytes;
+		pgbase += bytes;
+		curpage += pgbase >> PAGE_SHIFT;
+		pgbase &= ~PAGE_MASK;
+
+		count -= bytes;
+	} while (count != 0);
+
+	unlock_kernel();
+
+	/*
+	 * Wait here for async reads to finish.  The reply handler
+	 * completes dreq.done once the last READ has been accounted
+	 * for, so a reply that beats us to this point is not missed.
+	 */
+	wait_for_completion(&dreq.done);
+
+	rpc_clnt_sigunmask(clnt, &oldset);
+
+	put_rpccred(req.wb_cred);
+	result = atomic_read(&dreq.error);
+	if (result) {
+		dfprintk(VFS, "NFS: direct read call failed, result=%d\n",
+				result);
+		return result;
+	}
+	result = atomic_read(&dreq.count);
+	dfprintk(VFS, "NFS: direct read call succeeded, %d bytes read\n",
+			result);
+	return result;
+}
+
 /*
  * Handle a read reply that fills part of a page.
  */
@@ -457,6 +648,27 @@ static void nfs_readpage_result_full(str
 }
 
 /*
+ * Handle a read reply for a direct read request
+ *
+ * Fold this reply's byte count or error status into the request
+ * tracker, and complete the request when the last outstanding reply
+ * is in.  The tracker is a local variable of the waiting
+ * nfs_direct_read_seg() call, so don't touch it after complete().
+ */
+static void nfs_readpage_result_direct(struct nfs_read_data *data, int status)
+{
+	struct nfs_direct_read_req *req = (struct nfs_direct_read_req *) data->req;
+
+	if (status >= 0)
+		atomic_add(data->res.count, &req->count);
+	else
+		atomic_set(&req->error, status);
+
+	if (atomic_dec_and_test(&req->complete))
+		complete(&req->done);
+}
+
+/*
  * This is the callback from RPC telling us whether a reply was
  * received or some error occurred (timeout or socket shutdown).
  */
diff -X /home/cel/src/linux/dont-diff -Naurp 07-nfs-sigmask/include/linux/nfs_fs.h 08-nfs-odirect-read/include/linux/nfs_fs.h
--- 07-nfs-sigmask/include/linux/nfs_fs.h	2004-08-10 11:20:33.342073000 -0400
+++ 08-nfs-odirect-read/include/linux/nfs_fs.h	2004-08-10 11:22:17.415809000 -0400
@@ -405,6 +405,8 @@ extern int nfs_readpages(struct file *,
 		struct list_head *, unsigned);
 extern int nfs_pagein_list(struct list_head *, int);
 extern void nfs_readpage_result(struct rpc_task *);
+extern int nfs_direct_read_seg(struct inode *, struct file *,
+		unsigned long, size_t, loff_t, struct page **, int);
 
 /*
  * linux/fs/mount_clnt.c