diff -X /home/cel/src/linux/dont-diff -Naurp old/fs/exec.c new/fs/exec.c --- old/fs/exec.c Fri Oct 31 15:50:21 2003 +++ new/fs/exec.c Fri Oct 31 15:54:25 2003 @@ -371,7 +371,9 @@ struct file *open_exec(const char *name) if (err) { fput(file); file = ERR_PTR(err); - } + } else + /* make sure these are always cached */ + file->f_flags &= ~O_DIRECT; } out: return file; diff -X /home/cel/src/linux/dont-diff -Naurp old/fs/nfs/Makefile new/fs/nfs/Makefile --- old/fs/nfs/Makefile Fri Dec 29 17:07:23 2000 +++ new/fs/nfs/Makefile Fri Oct 31 15:54:25 2003 @@ -10,7 +10,7 @@ O_TARGET := nfs.o obj-y := inode.o file.o read.o write.o dir.o symlink.o proc.o \ - nfs2xdr.o flushd.o unlink.o + nfs2xdr.o flushd.o unlink.o direct.o obj-$(CONFIG_ROOT_NFS) += nfsroot.o mount_clnt.o obj-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o diff -X /home/cel/src/linux/dont-diff -Naurp old/fs/nfs/direct.c new/fs/nfs/direct.c --- old/fs/nfs/direct.c Wed Dec 31 19:00:00 1969 +++ new/fs/nfs/direct.c Fri Oct 31 15:54:25 2003 @@ -0,0 +1,888 @@ +/* + * linux/fs/nfs/direct.c + * + * High-performance direct I/O for the NFS client + * + * When an application requests uncached I/O, all read and write requests + * are made directly to the server; data stored or fetched via these + * requests is not cached in the Linux page cache. The client does not + * correct unaligned requests from applications. All requested bytes are + * held on permanent storage before a direct write system call returns to + * an application. Applications that manage their own data caching, such + * as databases, make very good use of direct I/O on local file systems. + * + * Solaris implements an uncached I/O facility called directio() that + * is used for backups and sequential I/O to very large files. Solaris + * also supports uncaching whole NFS partitions with "-o forcedirectio," + * an undocumented mount option. + * + * Note that I/O to read in executables (e.g. 
kernel_read) cannot use + * direct (kiobuf) reads because there is no vma backing the passed-in + * data buffer. + * + * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust. + * + * Initial implementation: 12/2001 by Chuck Lever + * RHAS 2.1 backport: 09/2003 by Chuck Lever + * kmap-friendly 2.1: 10/2003 by Chuck Lever + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define NFSDBG_FACILITY (NFSDBG_PAGECACHE | NFSDBG_VFS) +#define VERF_SIZE (2 * sizeof(__u32)) + +#define NFS_DIRECT_DEBUG 0 +#define NFS_DIRECT_IOVEC_INTERFACE 0 + +static inline int +nfs_direct_read_rpc(struct file *file, struct nfs_readargs *arg) +{ + int result; + struct inode * inode = file->f_dentry->d_inode; + struct nfs_fattr fattr; + struct rpc_message msg; + struct nfs_readres res = { + .fattr = &fattr, + .count = arg->count, + .eof = 0, + }; + +#if NFS_DIRECT_DEBUG + int i; + printk(KERN_ERR "%s: count=%d, offset=%Lu\n", __FUNCTION__, + arg->count, arg->offset); + for (i = 0; i < arg->nriov; i++) + printk(KERN_ERR "%s: arg->iov[%d]: base=%p, len=%u\n", + __FUNCTION__, i, arg->iov[i].iov_base, + arg->iov[i].iov_len); +#endif + +#ifdef CONFIG_NFS_V3 + msg.rpc_proc = (NFS_PROTO(inode)->version == 3) ? 
+		NFS3PROC_READ : NFSPROC_READ;
+#else
+	msg.rpc_proc = NFSPROC_READ;
+#endif
+	msg.rpc_argp = arg;
+	msg.rpc_resp = &res;
+
+	lock_kernel();
+	msg.rpc_cred = nfs_file_cred(file);
+	fattr.valid = 0;
+	result = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+	nfs_refresh_inode(inode, &fattr);
+	unlock_kernel();
+
+	return result;
+}
+/* Issue one synchronous WRITE; -EIO if a v3 server ignores FILE_SYNC. */
+static inline int
+nfs_direct_write_rpc(struct file *file, struct nfs_writeargs *arg,
+	struct nfs_writeverf *verf)
+{
+	int result;
+	struct inode *inode = file->f_dentry->d_inode;
+	struct nfs_fattr fattr;
+	struct rpc_message msg;
+	struct nfs_writeres res = {
+		.fattr		= &fattr,
+		.verf		= verf,
+		.count		= 0,
+	};
+
+#if NFS_DIRECT_DEBUG
+	int i;
+	printk(KERN_ERR "%s: count=%d, offset=%Lu\n", __FUNCTION__,
+		arg->count, arg->offset);
+	for (i = 0; i < arg->nriov; i++)	/* dump every iovec entry */
+		printk(KERN_ERR "%s: arg->iov[%d]: base=%p, len=%u\n",
+			__FUNCTION__, i, arg->iov[i].iov_base,
+			arg->iov[i].iov_len);
+#endif
+
+#ifdef CONFIG_NFS_V3
+	msg.rpc_proc = (NFS_PROTO(inode)->version == 3) ?
+		NFS3PROC_WRITE : NFSPROC_WRITE;
+#else
+	msg.rpc_proc = NFSPROC_WRITE;
+#endif
+	msg.rpc_argp = arg;
+	msg.rpc_resp = &res;
+
+	lock_kernel();
+	msg.rpc_cred = get_rpccred(nfs_file_cred(file));	/* held across the call */
+	fattr.valid = 0;
+	result = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+	nfs_refresh_inode(inode, &fattr);
+	put_rpccred(msg.rpc_cred);
+	unlock_kernel();
+
+#ifdef CONFIG_NFS_V3
+	if (NFS_PROTO(inode)->version == 3) {
+		if (result > 0) {
+			if ((arg->stable == NFS_FILE_SYNC) &&
+			    (verf->committed != NFS_FILE_SYNC)) {
+				printk(KERN_ERR
+				"%s: server didn't sync stable write request\n",
+				__FUNCTION__);
+				return -EIO;
+			}
+
+			if (result != arg->count) {	/* logged only; caller sees result */
+				printk(KERN_INFO
+					"%s: short write, count=%u, result=%d\n",
+					__FUNCTION__, arg->count, result);
+			}
+		}
+		return result;
+	} else {
+#endif
+		verf->committed = NFS_FILE_SYNC;	/* NFSv2 always syncs data */
+		if (result == 0)	/* a zero result is reported as a full write */
+			return arg->count;
+		return result;
+#ifdef CONFIG_NFS_V3
+	}
+#endif
+}
+/* Synchronous COMMIT of count bytes at offset; stub returns 0 without V3. */
+#ifdef CONFIG_NFS_V3
+static inline int
+nfs_direct_commit_rpc(struct inode *inode, loff_t offset, size_t count,
+	struct nfs_writeverf *verf)
+{
+	int result;
+	struct nfs_fattr fattr;
+	struct nfs_writeargs	arg = {
+		.fh		= NFS_FH(inode),
+		.offset		= offset,
+		.count		= count,
+	};
+	struct nfs_writeres res = {
+		.fattr		= &fattr,
+		.verf		= verf,
+		.count		= 0,
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= NFS3PROC_COMMIT,
+		.rpc_argp	= &arg,
+		.rpc_resp	= &res,
+		.rpc_cred	= NULL,
+	};
+
+	fattr.valid = 0;
+
+	lock_kernel();
+	result = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+	nfs_refresh_inode(inode, &fattr);
+	unlock_kernel();
+
+	return result;
+}
+#else
+static inline int
+nfs_direct_commit_rpc(struct inode *inode, loff_t offset, size_t count,
+	struct nfs_writeverf *verf)
+{
+	return 0;
+}
+#endif
+
+#if NFS_DIRECT_IOVEC_INTERFACE
+/*
+ * Walk through the iobuf and create an iovec for each "rsize" bytes.
+ */ +static int +nfs_kiobuf_read(struct file *file, struct kiobuf *iobuf, loff_t offset, + size_t count) +{ + int curpage, total; + struct inode *inode = file->f_dentry->d_inode; + int rsize = NFS_SERVER(inode)->rsize; + struct nfs_readargs args = { + .fh = NFS_FH(inode), + }; + + total = 0; + curpage = 0; + while (count) { + int starting_offset, request, result, first, last, i; + struct iovec *iovec = args.iov; + + request = count; + if (count > rsize) + request = rsize; + args.count = request; + args.offset = offset; + args.nriov = 0; + + starting_offset = iobuf->offset; + first = last = curpage; + do { + struct page *page = iobuf->maplist[curpage]; + + if (curpage >= iobuf->nr_pages || !page) + return -EFAULT; + + args.nriov++; + iovec->iov_base = kmap(page) + starting_offset; + iovec->iov_len = (PAGE_SIZE - starting_offset); + if ((starting_offset + request) < PAGE_SIZE) + iovec->iov_len = request; + + request -= iovec->iov_len; + last = curpage; + + if (!request && (starting_offset + iovec->iov_len < PAGE_SIZE)) + break; + + starting_offset = 0; /* zero after the first page */ + curpage++; + iovec++; + } while (request != 0); + + result = nfs_direct_read_rpc(file, &args); + + for (i = first; i <= last; i++) { + flush_dcache_page(iobuf->maplist[i]); + kunmap(iobuf->maplist[i]); + } + + if (result <= 0) { + if (!total) { + if (result == -EISDIR) + total = -EINVAL; + else + total = result; + } + break; + } + + total += result; + count -= result; + offset += result; + + if (result < args.count) /* NFSv2ism */ + break; + }; + return total; +} +#else +/* + * Walk through the iobuf and create an iovec for each "rsize" bytes. 
+ */
+static int
+nfs_kiobuf_read(struct file *file, struct kiobuf *iobuf, loff_t offset,
+	size_t count)
+{
+	int curpage, total;
+	int result = 0;
+	struct inode *inode = file->f_dentry->d_inode;
+	int rsize = NFS_SERVER(inode)->rsize;
+	struct page *pages[NFS_READ_MAXIOV];
+	struct nfs_readargs args = {
+		.fh		= NFS_FH(inode),
+		.offset		= offset,
+		.count		= 0,
+		.pgbase		= iobuf->offset,
+		.pages		= pages,
+	};
+
+	total = 0;
+	curpage = 0;
+	while (count) {
+		int len, request;
+		struct page **dest = pages;
+
+		request = count;
+		if (count > rsize)	/* one READ carries at most rsize bytes */
+			request = rsize;
+		args.count = request;
+		args.offset = offset;
+		args.pgbase = (iobuf->offset + total) & ~PAGE_MASK;
+		len = PAGE_SIZE - args.pgbase;	/* bytes left in the first page */
+
+		do {
+			struct page *page = (curpage < iobuf->nr_pages) ?
+				iobuf->maplist[curpage] : NULL;	/* index only when in range */
+			if (!page) {
+				result = -EFAULT;
+				goto out_err;
+			}
+
+			*dest++ = page;
+			/* request ends inside this page: stay on it */
+			if (request < len)
+				break;
+			request -= len;
+			len = PAGE_SIZE;
+			curpage++;
+		} while (request != 0);
+
+		result = nfs_direct_read_rpc(file, &args);
+
+		if (result < 0)
+			break;
+
+		total += result;
+		if (result < args.count)	/* NFSv2ism */
+			break;
+		count -= result;
+		offset += result;
+	};
+out_err:
+	if (!total)	/* nothing transferred: hand back the last status */
+		return result;
+	return total;
+}
+#endif
+
+#if NFS_DIRECT_IOVEC_INTERFACE
+/*
+ * Walk through the iobuf and create an iovec for each "wsize" bytes.
+ * If only one network write is necessary, or if the O_SYNC flag or
+ * 'sync' mount option are present, or if this is a V2 inode, use
+ * FILE_SYNC.  Otherwise, use UNSTABLE and finish with a COMMIT.
+ *
+ * The mechanics of this function are much the same as nfs_direct_read,
+ * with the added complexity of committing unstable writes.
+ */ +static int +nfs_kiobuf_write(struct file *file, struct kiobuf *iobuf, + loff_t offset, size_t count) +{ + int curpage, total; + int need_commit = 0; + loff_t save_offset = offset; + struct inode *inode = file->f_dentry->d_inode; + int wsize = NFS_SERVER(inode)->wsize; + struct nfs_writeverf first_verf, ret_verf; + struct nfs_writeargs args = { + .fh = NFS_FH(inode), + .stable = NFS_FILE_SYNC, + }; + +#ifdef CONFIG_NFS_V3 + if ((NFS_PROTO(inode)->version == 3) && (count > wsize) && + (!IS_SYNC(inode))) + args.stable = NFS_UNSTABLE; +#endif + +retry: + total = 0; + curpage = 0; + while (count) { + int starting_offset, request, result, first, last, i; + struct iovec *iovec = args.iov; + + request = count; + if (count > wsize) + request = wsize; + args.count = request; + args.offset = offset; + args.nriov = 0; + + starting_offset = iobuf->offset; + first = last = curpage; + do { + struct page *page = iobuf->maplist[curpage]; + + if (curpage >= iobuf->nr_pages || !page) + return -EFAULT; + + args.nriov++; + iovec->iov_base = kmap(page) + starting_offset; + iovec->iov_len = (PAGE_SIZE - starting_offset); + if ((starting_offset + request) < PAGE_SIZE) + iovec->iov_len = request; + + request -= iovec->iov_len; + last = curpage; + + if (!request && (starting_offset + iovec->iov_len < PAGE_SIZE)) + break; + + starting_offset = 0; /* zero after the first page */ + curpage++; + iovec++; + } while (request != 0); + + result = nfs_direct_write_rpc(file, &args, &ret_verf); + + for (i = first; i <= last; i++) { + kunmap(iobuf->maplist[i]); + } + + if (result <= 0) { + if (!total) + total = result; + break; + } + + if (!total) + memcpy(&first_verf.verifier, &ret_verf.verifier, + VERF_SIZE); + if (ret_verf.committed != NFS_FILE_SYNC) { + need_commit = 1; + if (memcmp(&first_verf.verifier, &ret_verf.verifier, + VERF_SIZE)) + goto print_retry; + } + + total += result; + count -= result; + offset += result; + }; + + /* + * Commit data written so far, even in the event of an error 
+	 */
+	if (need_commit) {
+		if (nfs_direct_commit_rpc(inode, save_offset,
+					iobuf->length - count, &ret_verf))
+			goto print_retry;
+		if (memcmp(&first_verf.verifier, &ret_verf.verifier,
+								VERF_SIZE))
+			goto print_retry;
+	}
+
+	return total;
+
+print_retry:
+	printk(KERN_INFO
+		"%s: detected server restart; retrying with FILE_SYNC\n",
+			__FUNCTION__);
+	args.stable = NFS_FILE_SYNC;
+	offset = save_offset;
+	count = iobuf->length;
+	goto retry;
+}
+#else
+/*
+ * Walk through the iobuf and create an iovec for each "wsize" bytes.
+ * If only one network write is necessary, or if the O_SYNC flag or
+ * 'sync' mount option are present, or if this is a V2 inode, use
+ * FILE_SYNC.  Otherwise, use UNSTABLE and finish with a COMMIT.
+ *
+ * The mechanics of this function are much the same as nfs_direct_read,
+ * with the added complexity of committing unstable writes.
+ */
+static int
+nfs_kiobuf_write(struct file *file, struct kiobuf *iobuf,
+	loff_t offset, size_t count)
+{
+	int curpage, total;
+	int need_commit = 0;
+	int result = 0;
+	loff_t save_offset = offset;
+	struct inode *inode = file->f_dentry->d_inode;
+	int wsize = NFS_SERVER(inode)->wsize;
+	struct nfs_writeverf first_verf, ret_verf;
+	struct page *pages[NFS_WRITE_MAXIOV];
+	struct nfs_writeargs args = {
+		.fh		= NFS_FH(inode),
+		.offset		= 0,
+		.count		= 0,
+		.stable		= NFS_FILE_SYNC,
+		.pgbase		= 0,
+		.pages		= pages,
+	};
+
+#ifdef CONFIG_NFS_V3
+	if ((NFS_PROTO(inode)->version == 3) && (count > wsize) &&
+							(!IS_SYNC(inode)))
+		args.stable = NFS_UNSTABLE;
+#endif
+
+retry:
+	total = curpage = 0;
+	need_commit = 0;	/* a (re)started pass has no unstable data yet */
+	while (count) {
+		int len, request;
+		struct page **dest = pages;
+
+		request = count;
+		if (count > wsize)	/* one WRITE carries at most wsize bytes */
+			request = wsize;
+		args.count = request;
+		args.offset = offset;
+		args.pgbase = (iobuf->offset + total) & ~PAGE_MASK;
+		len = PAGE_SIZE - args.pgbase;	/* bytes left in the first page */
+
+		do {
+			struct page *page = (curpage < iobuf->nr_pages) ?
+				iobuf->maplist[curpage] : NULL;	/* index only when in range */
+			if (!page) {
+				result = -EFAULT;
+				goto out_err;
+			}
+
+			*dest++ = page;
+			/* request ends inside this page: stay on it */
+			if (request < len)
+				break;
+			request -= len;
+			len = PAGE_SIZE;
+			curpage++;
+		} while (request != 0);
+
+		result = nfs_direct_write_rpc(file, &args, &ret_verf);
+
+		if (result < 0)
+			break;
+
+		if (!total)	/* remember the verifier of the first WRITE */
+			memcpy(&first_verf.verifier, &ret_verf.verifier,
+								VERF_SIZE);
+		if (ret_verf.committed != NFS_FILE_SYNC) {
+			need_commit = 1;
+			if (memcmp(&first_verf.verifier, &ret_verf.verifier,
+								VERF_SIZE))
+				goto print_retry;
+		}
+
+		total += result;
+		count -= result;
+		offset += result;
+	};
+
+out_err:
+	/*
+	 * Commit data written so far, even in the event of an error
+	 */
+	if (need_commit) {
+		if (nfs_direct_commit_rpc(inode, save_offset,
+					iobuf->length - count, &ret_verf))
+			goto print_retry;
+		if (memcmp(&first_verf.verifier, &ret_verf.verifier,
+								VERF_SIZE))
+			goto print_retry;
+	}
+
+	if (!total)	/* nothing transferred: hand back the last status */
+		return result;
+	return total;
+
+print_retry:
+	printk(KERN_INFO "%s: detected server restart; retrying with FILE_SYNC\n",
+			__FUNCTION__);
+	args.stable = NFS_FILE_SYNC;
+	offset = save_offset;
+	count = iobuf->length;
+	goto retry;
+}
+#endif
+
+/*
+ * The following functions are an adapter layer that provides the same
+ * function as generic_file_direct_IO and friends from later kernels.
+ */
+
+/* how much buffer data one iobuf can represent.  keep the maplist
+ * array small, but try to stay efficient. */
+#define NFS_KIOBUF_SIZE	(4096 << 10)	/* one page of 32-bit addresses */
+
+/*
+ * Mark all of the pages in a kiobuf as dirty
+ *
+ * This is the same as mark_dirty_kiobuf, but we avoid adding an EXPORT
+ * and breaking ABI compatibility by including our own copy here.
+ */
+static void
+nfs_mark_dirty_kiobuf(struct kiobuf *iobuf, int bytes)
+{
+	int index, offset, remaining;
+	struct page *page;
+
+	index = iobuf->offset >> PAGE_SHIFT;
+	offset = iobuf->offset & ~PAGE_MASK;
+	remaining = bytes;
+	if (remaining > iobuf->length)
+		remaining = iobuf->length;
+
+	while (remaining > 0 && index < iobuf->nr_pages) {
+		page = iobuf->maplist[index];
+
+		if (!PageReserved(page))
+			SetPageDirty(page);
+
+		remaining -= (PAGE_SIZE - offset);
+		offset = 0;	/* only the first page starts mid-page */
+		index++;
+	}
+}
+
+/*
+ * O_DIRECT read: map the user buffer into a kiobuf, NFS_KIOBUF_SIZE bytes
+ * at most per pass, and read into it.  Returns bytes read or a -errno.
+ */
+ssize_t
+nfs_direct_file_read(struct file *file, char *buf, size_t count,
+	loff_t *ppos)
+{
+	ssize_t result;
+	int iosize, progress;
+	loff_t offset = *ppos;
+	struct dentry *dentry = file->f_dentry;
+	struct inode *inode = dentry->d_inode;
+	struct kiobuf *iobuf;
+	loff_t size;
+
+	if (!count)
+		return 0;
+
+	iobuf = kmalloc(sizeof(struct kiobuf), GFP_KERNEL);
+	if (!iobuf)
+		return -ENOMEM;
+	memset(iobuf, 0, sizeof(struct kiobuf));
+
+	filemap_fdatasync(inode->i_mapping);
+	down(&inode->i_sem);
+	result = nfs_wb_all(inode);
+	up(&inode->i_sem);
+	filemap_fdatawait(inode->i_mapping);
+	if (result < 0) {
+		goto out_free;
+	}
+
+	result = 0;
+	size = inode->i_size;
+	if (*ppos > size)	/* starting beyond EOF reads nothing */
+		goto out_free;
+	if (offset + count > size)
+		count = size - offset;
+
+	dfprintk(VFS, "NFS: direct_IO(READ) (%s/%s) off/cnt(%Lu/%d)\n",
+		dentry->d_parent->d_name.name, dentry->d_name.name,
+		offset, count);
+
+	progress = 0;
+	while (count > 0) {
+		iosize = NFS_KIOBUF_SIZE;	/* clamp before narrowing to int */
+		if (count < NFS_KIOBUF_SIZE)
+			iosize = count;
+
+		result = map_user_kiobuf(READ, iobuf, (unsigned long) buf,
+							iosize);
+		if (result)
+			break;
+
+		result = nfs_kiobuf_read(file, iobuf, offset + progress, iosize);
+		if (result > 0) {
+			nfs_mark_dirty_kiobuf(iobuf, result);
+			count -= result;
+			buf += result;
+			progress += result;
+		}
+
+		unmap_kiobuf(iobuf);
+		kfree(iobuf->maplist);
+		memset(iobuf, 0, sizeof(struct kiobuf));	/* maplist is gone: start clean */
+		if (result != iosize)
+			break;
+	}
+
+	if (progress) {
+		result = progress;
+		*ppos += progress;
+	}
+
+out_free:
+	kfree(iobuf);
+	return result;
+}
+
+/*
+ * Check the conditions on a file descriptor prior to beginning a write
+ * on it.  See 2.4.22's mm/filemap.c:precheck_file_write
+ *
+ * In the direct I/O case, can we simply let the server handle
+ * bounds checking?  seems that might close some race windows.
+ */
+static int
+nfs_precheck_file_write(struct file *file, struct inode *inode,
+	size_t *count, loff_t *offset)
+{
+	ssize_t err;
+	unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
+	loff_t pos = *offset;
+
+	err = -EINVAL;
+	if (pos < 0)
+		goto out;
+
+	err = file->f_error;
+	if (err) {
+		file->f_error = 0;
+		goto out;
+	}
+
+#if 0
+	if (file->f_flags & O_APPEND)
+		*offset = pos = inode->i_size;
+
+	/*
+	 * Check whether we've reached the file size limit.
+	 */
+	err = -EFBIG;
+	if (limit != RLIM_INFINITY) {
+		if (pos >= limit) {
+			send_sig(SIGXFSZ, current, 0);
+			goto out;
+		}
+		if (pos > 0xFFFFFFFFULL || *count > limit - (u32)pos)
+			*count = limit - (u32)pos;
+	}
+
+	/*
+	 *	LFS rule
+	 */
+	if (pos + *count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) {
+		if (pos >= MAX_NON_LFS) {
+			send_sig(SIGXFSZ, current, 0);
+			goto out;
+		}
+		if (*count > MAX_NON_LFS - (u32)pos)
+			*count = MAX_NON_LFS - (u32)pos;
+	}
+
+	/*
+	 *	Are we about to exceed the fs block limit ?
+	 *
+	 *	If we have written data it becomes a short write
+	 *	If we have exceeded without writing data we send
+	 *	a signal and give them an EFBIG.
+	 */
+	if (pos >= inode->i_sb->s_maxbytes) {
+		if (*count || pos > inode->i_sb->s_maxbytes) {
+			send_sig(SIGXFSZ, current, 0);
+			err = -EFBIG;
+			goto out;
+		}
+		/* zero-length writes at ->s_maxbytes are OK */
+	}
+
+	if (pos + *count > inode->i_sb->s_maxbytes)
+		*count = inode->i_sb->s_maxbytes - pos;
+
+	err = 0;
+#endif
+out:
+	return err;
+}
+/* Drop SUID/SGID bits for writers lacking CAP_FSETID; returns 0 or an error. */
+static int
+nfs_remove_suid(struct inode * inode)
+{
+	int error = 0;
+	struct nfs_fattr fattr;
+	struct iattr attr;
+
+	memset(&attr, 0, sizeof(struct iattr));
+
+	/* set S_ISGID if S_IXGRP is set, and always set S_ISUID */
+	attr.ia_mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
+
+	/* any of those bits set?  NOTE(review): ia_valid is left 0 -- confirm */
+	attr.ia_mode &= inode->i_mode;
+	if (attr.ia_mode && !capable(CAP_FSETID)) {
+		inode->i_mode &= ~attr.ia_mode;
+		printk(KERN_ERR "%s: resetting SUID bits\n", __FUNCTION__);
+		error = NFS_PROTO(inode)->setattr(inode, &fattr, &attr);
+		if (!error)
+			error = nfs_refresh_inode(inode, &fattr);
+	}
+
+	return error;
+}
+/* O_DIRECT write: flush cached pages, then write buf in kiobuf-sized pieces. */
+ssize_t
+nfs_direct_file_write(struct file *file, const char *buf, size_t count,
+	loff_t *ppos)
+{
+	ssize_t result;
+	int iosize, progress;
+	loff_t offset = *ppos;
+	struct dentry *dentry = file->f_dentry;
+	struct inode *inode = dentry->d_inode;
+	struct kiobuf *iobuf;
+
+	result = nfs_precheck_file_write(file, inode, &count, &offset);
+	if (result)
+		return result;
+
+	iobuf = kmalloc(sizeof(struct kiobuf), GFP_KERNEL);
+	if (!iobuf)
+		return -ENOMEM;
+	memset(iobuf, 0, sizeof(struct kiobuf));
+
+	down(&inode->i_sem);
+
+	/* start any latent normal read requests now to maintain
+	 * request ordering */
+	nfs_pagein_inode(inode, 0, 0);
+
+	filemap_fdatasync(inode->i_mapping);
+	result = nfs_wb_all(inode);
+	filemap_fdatawait(inode->i_mapping);
+	if (result < 0)
+		goto out;
+
+	result = nfs_remove_suid(inode);	/* report the failure, not 0 bytes */
+	if (result)
+		goto out;
+
+	dfprintk(VFS, "NFS: direct_IO(WRITE) (%s/%s) off/cnt(%Lu/%d)\n",
+		dentry->d_parent->d_name.name, dentry->d_name.name,
+		offset, count);
+
+	progress = 0;
+	while (count > 0) {
+		iosize = NFS_KIOBUF_SIZE;	/* clamp before narrowing to int */
+		if (count < NFS_KIOBUF_SIZE)
+			iosize = count;
+
+		result = map_user_kiobuf(WRITE, iobuf, (unsigned long) buf,
+							iosize);
+		if (result)
+			break;
+
+		result = nfs_kiobuf_write(file, iobuf, offset + progress, iosize);
+		if (result > 0) {
+			count -= result;
+			buf += result;
+			progress += result;
+		}
+
+		unmap_kiobuf(iobuf);
+		kfree(iobuf->maplist);
+		memset(iobuf, 0, sizeof(struct kiobuf));	/* maplist is gone: start clean */
+		if (result != iosize)
+			break;
+	}
+
+	if (progress) {
+		result = progress;
+		*ppos += progress;
+	}
+
+	/* local non-O_DIRECT accessors need to see these changes */
+	invalidate_inode_pages(inode);
+
+out:
+	up(&inode->i_sem);
+	kfree(iobuf);
+	return result;
+}
diff -X /home/cel/src/linux/dont-diff -Naurp old/fs/nfs/file.c new/fs/nfs/file.c --- old/fs/nfs/file.c Fri Oct 31 15:50:16 2003 +++ new/fs/nfs/file.c Fri Oct 31 15:54:25 2003 @@ -128,8 +128,12 @@ nfs_file_read(struct file * file, char * (unsigned long) count, (unsigned long) *ppos); result = nfs_revalidate_inode(NFS_SERVER(inode), inode); - if (!result) - result = generic_file_read(file, buf, count, ppos); + if (!result) { + if (file->f_flags & O_DIRECT) + result = nfs_direct_file_read(file, buf, count, ppos); + else + result = generic_file_read(file, buf, count, ppos); + } return result; } @@ -262,6 +266,7 @@ nfs_file_write(struct file *file, const result = -EBUSY; if (IS_SWAPFILE(inode)) goto out_swapfile; + result = nfs_revalidate_inode(NFS_SERVER(inode), inode); if (result) goto out; @@ -270,7 +275,10 @@ nfs_file_write(struct file *file, const if (!count) goto out; - result = generic_file_write(file, buf, count, ppos); + if (file->f_flags & O_DIRECT) + result = nfs_direct_file_write(file, buf, count, ppos); + else + result = generic_file_write(file, buf, count, ppos); out: return result; diff -X /home/cel/src/linux/dont-diff -Naurp old/fs/nfs/inode.c new/fs/nfs/inode.c --- old/fs/nfs/inode.c Fri Oct 31 15:50:21 2003 +++ new/fs/nfs/inode.c Fri Oct 31 15:54:25 2003 @@ -32,6 +32,7 @@ #include 
#include #include +#include #include #include @@ -54,6 +55,8 @@ static void nfs_put_super(struct super_b static void nfs_umount_begin(struct super_block *); static int nfs_statfs(struct super_block *, struct statfs *); +static unsigned int nfs_uncached_io = 0; + static struct super_operations nfs_sops = { read_inode: nfs_read_inode, clear_inode: nfs_clear_inode, @@ -300,10 +303,25 @@ nfs_read_super(struct super_block *sb, v server->flags = data->flags & NFS_MOUNT_FLAGMASK; if (data->flags & NFS_MOUNT_NOAC) { - data->acregmin = data->acregmax = 0; - data->acdirmin = data->acdirmax = 0; - sb->s_flags |= MS_SYNCHRONOUS; + if (nfs_uncached_io) { + printk(KERN_NOTICE "NFS: uncached I/O enabled for noac mount\n"); + server->flags &= ~NFS_MOUNT_NOAC; + server->flags |= NFS_MOUNT_FORCE_DIRECT; + /* + * Due to ABI issues: non-buffered I/O r/wsizes can + * only support 16kb buffers + */ + if (server->rsize > 16384) + server->rsize = 16384; + if (server->wsize > 16384) + server->wsize = 16384; + } else { + data->acregmin = data->acregmax = 0; + data->acdirmin = data->acdirmax = 0; + sb->s_flags |= MS_SYNCHRONOUS; + } } + server->acregmin = data->acregmin*HZ; server->acregmax = data->acregmax*HZ; server->acdirmin = data->acdirmin*HZ; @@ -863,6 +881,9 @@ int nfs_open(struct inode *inode, struct struct rpc_auth *auth; struct rpc_cred *cred; + if (NFS_SERVER(inode)->flags & NFS_MOUNT_FORCE_DIRECT) + filp->f_flags |= O_DIRECT; + lock_kernel(); auth = NFS_CLIENT(inode)->cl_auth; cred = rpcauth_lookupcred(auth, 0); @@ -870,6 +891,7 @@ int nfs_open(struct inode *inode, struct if (filp->f_mode & FMODE_WRITE) nfs_set_mmcred(inode, cred); unlock_kernel(); + return 0; } @@ -1157,6 +1179,27 @@ void nfs_destroy_inodecache(void) if (kmem_cache_destroy(nfs_inode_cachep)) printk(KERN_INFO "nfs_inode_cache: not all structures were freed\n"); } +#ifdef CONFIG_PROC_FS +#define CTL_NFS 2049 /* arbitrary and hopefully unused */ +static struct ctl_table_header *nfs_table_header=NULL; +enum { + 
CTL_UNCACHEDIO = 1, +}; +static ctl_table nfs_entries[] = { + {CTL_UNCACHEDIO, "uncached_io", &nfs_uncached_io, + sizeof(unsigned int), 0444, NULL, &proc_dointvec}, + {0} +}; +static ctl_table nfs_table[] = { + {CTL_NFS, "nfs", NULL, 0, 0555, nfs_entries, + NULL, NULL, NULL, NULL}, + {0} +}; +#endif + +MODULE_PARM(nfs_uncached_io, "0-1i"); +MODULE_PARM_DESC(nfs_uncached_io, + "enable uncached I/O with the noac mount option"); /* * Initialize NFS @@ -1179,6 +1222,8 @@ static int __init init_nfs_fs(void) #ifdef CONFIG_PROC_FS rpc_proc_register(&nfs_rpcstat); + if (!nfs_table_header) + nfs_table_header = register_sysctl_table(nfs_table, 1); #endif err = register_filesystem(&nfs_fs_type); if (err) @@ -1186,6 +1231,10 @@ static int __init init_nfs_fs(void) return 0; out: #ifdef CONFIG_PROC_FS + if (nfs_table_header) { + unregister_sysctl_table(nfs_table_header); + nfs_table_header = NULL; + } rpc_proc_unregister("nfs"); #endif nfs_destroy_readpagecache(); @@ -1203,6 +1252,10 @@ static void __exit exit_nfs_fs(void) nfs_destroy_inodecache(); nfs_destroy_nfspagecache(); #ifdef CONFIG_PROC_FS + if (nfs_table_header) { + unregister_sysctl_table(nfs_table_header); + nfs_table_header = NULL; + } rpc_proc_unregister("nfs"); #endif unregister_filesystem(&nfs_fs_type); diff -X /home/cel/src/linux/dont-diff -Naurp old/include/linux/nfs_fs.h new/include/linux/nfs_fs.h --- old/include/linux/nfs_fs.h Fri Oct 31 15:50:00 2003 +++ new/include/linux/nfs_fs.h Fri Oct 31 15:54:25 2003 @@ -213,6 +213,12 @@ extern int nfs_commit_file(struct inode extern int nfs_commit_timeout(struct inode *, int); #endif +/* + * linux/fs/nfs/direct.c + */ +extern ssize_t nfs_direct_file_read(struct file *, char *, size_t, loff_t *); +extern ssize_t nfs_direct_file_write(struct file *, const char *, size_t, loff_t *); + static inline int nfs_have_read(struct inode *inode) { diff -X /home/cel/src/linux/dont-diff -Naurp old/include/linux/nfs_mount.h new/include/linux/nfs_mount.h --- 
old/include/linux/nfs_mount.h Fri Oct 31 15:50:00 2003 +++ new/include/linux/nfs_mount.h Fri Oct 31 15:54:25 2003 @@ -54,6 +54,7 @@ struct nfs_mount_data { #define NFS_MOUNT_KERBEROS 0x0100 /* 3 */ #define NFS_MOUNT_NONLM 0x0200 /* 3 */ #define NFS_MOUNT_BROKEN_SUID 0x0400 /* 4 */ +#define NFS_MOUNT_FORCE_DIRECT 0x0800 /* 4 */ #define NFS_MOUNT_FLAGMASK 0xFFFF #endif