[PATCH] RPC: skip over transport-specific headers automatically Add a generic mechanism for skipping over transport-specific headers when constructing an RPC request. This removes another "xprt->stream" dependency. Test-plan: Write-intensive workload on a single mount point. Version: Thu, 10 Mar 2005 18:48:34 -0500 Signed-off-by: Chuck Lever --- include/linux/sunrpc/msg_prot.h | 25 +++++++++++++++++++++++++ include/linux/sunrpc/xprt.h | 7 +++++++ net/sunrpc/auth_gss/auth_gss.c | 4 +--- net/sunrpc/clnt.c | 4 +--- net/sunrpc/xprtsock.c | 29 ++++++++++++++++++++++------- 5 files changed, 56 insertions(+), 13 deletions(-) diff -X /home/cel/src/linux/dont-diff -Naurp 32-xprt-send_request/include/linux/sunrpc/msg_prot.h 33-xprt_tsh_size/include/linux/sunrpc/msg_prot.h --- 32-xprt-send_request/include/linux/sunrpc/msg_prot.h 2005-03-02 02:38:38.000000000 -0500 +++ 33-xprt_tsh_size/include/linux/sunrpc/msg_prot.h 2005-03-07 16:22:23.838177000 -0500 @@ -76,5 +76,30 @@ enum rpc_auth_stat { #define RPC_MAXNETNAMELEN 256 +/* + * From RFC 1831: + * + * "A record is composed of one or more record fragments. A record + * fragment is a four-byte header followed by 0 to (2**31) - 1 bytes of + * fragment data. The bytes encode an unsigned binary number; as with + * XDR integers, the byte order is from highest to lowest. The number + * encodes two values -- a boolean which indicates whether the fragment + * is the last fragment of the record (bit value 1 implies the fragment + * is the last fragment) and a 31-bit unsigned binary value which is the + * length in bytes of the fragment's data. The boolean value is the + * highest-order bit of the header; the length is the 31 low-order bits. + * (Note that this record specification is NOT in XDR standard form!)" + * + * The Linux RPC client always sends its requests in a single record + * fragment, limiting the maximum payload size for stream transports to + * 2GB. 
+ */ + +typedef __u32 rpc_fraghdr; + +#define RPC_LAST_STREAM_FRAGMENT (1U << 31) +#define RPC_FRAGMENT_SIZE_MASK (~RPC_LAST_STREAM_FRAGMENT) +#define RPC_MAX_FRAGMENT_SIZE ((1U << 31) - 1) + #endif /* __KERNEL__ */ #endif /* _LINUX_SUNRPC_MSGPROT_H_ */ diff -X /home/cel/src/linux/dont-diff -Naurp 32-xprt-send_request/include/linux/sunrpc/xprt.h 33-xprt_tsh_size/include/linux/sunrpc/xprt.h --- 32-xprt-send_request/include/linux/sunrpc/xprt.h 2005-03-07 16:00:44.386951000 -0500 +++ 33-xprt_tsh_size/include/linux/sunrpc/xprt.h 2005-03-07 16:22:23.861177000 -0500 @@ -165,6 +165,8 @@ struct rpc_xprt { size_t max_payload; /* largest RPC payload size, in bytes */ + size_t tsh_size; /* size of transport specific + header, in bytes */ struct rpc_wait_queue sending; /* requests waiting to send */ struct rpc_wait_queue resend; /* requests waiting to resend */ @@ -243,6 +245,11 @@ int xs_setup_udp(struct rpc_xprt *, int xs_setup_tcp(struct rpc_xprt *, struct rpc_timeout *); +static inline size_t xprt_tsh_size(struct rpc_xprt *xprt) +{ + return xprt->tsh_size; +} + /* * Reserved bit positions in xprt->state */ diff -X /home/cel/src/linux/dont-diff -Naurp 32-xprt-send_request/net/sunrpc/auth_gss/auth_gss.c 33-xprt_tsh_size/net/sunrpc/auth_gss/auth_gss.c --- 32-xprt-send_request/net/sunrpc/auth_gss/auth_gss.c 2005-03-07 15:55:45.972841000 -0500 +++ 33-xprt_tsh_size/net/sunrpc/auth_gss/auth_gss.c 2005-03-07 16:22:23.892177000 -0500 @@ -1061,9 +1061,7 @@ gss_marshal(struct rpc_task *task, u32 * /* We compute the checksum for the verifier over the xdr-encoded bytes * starting with the xid and ending at the end of the credential: */ iov.iov_base = req->rq_snd_buf.head[0].iov_base; - if (task->tk_client->cl_xprt->stream) - /* See clnt.c:call_header() */ - iov.iov_base += 4; + iov.iov_base += xprt_tsh_size(task->tk_xprt); iov.iov_len = (u8 *)p - (u8 *)iov.iov_base; xdr_buf_from_iov(&iov, &verf_buf); diff -X /home/cel/src/linux/dont-diff -Naurp 32-xprt-send_request/net/sunrpc/clnt.c 
33-xprt_tsh_size/net/sunrpc/clnt.c --- 32-xprt-send_request/net/sunrpc/clnt.c 2005-03-07 16:00:44.404933000 -0500 +++ 33-xprt_tsh_size/net/sunrpc/clnt.c 2005-03-07 16:22:23.912177000 -0500 @@ -1008,13 +1008,11 @@ static u32 * call_header(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; - struct rpc_xprt *xprt = task->tk_xprt; struct rpc_rqst *req = task->tk_rqstp; u32 *p = req->rq_svec[0].iov_base; /* FIXME: check buffer size? */ - if (xprt->stream) - *p++ = 0; /* fill in later */ + p += xprt_tsh_size(task->tk_xprt) / sizeof(*p); *p++ = req->rq_xid; /* XID */ *p++ = htonl(RPC_CALL); /* CALL */ *p++ = htonl(RPC_VERSION); /* RPC version */ diff -X /home/cel/src/linux/dont-diff -Naurp 32-xprt-send_request/net/sunrpc/xprtsock.c 33-xprt_tsh_size/net/sunrpc/xprtsock.c --- 32-xprt-send_request/net/sunrpc/xprtsock.c 2005-03-07 16:20:04.008882000 -0500 +++ 33-xprt_tsh_size/net/sunrpc/xprtsock.c 2005-03-07 16:22:23.928177000 -0500 @@ -276,6 +276,18 @@ static int xs_udp_send_request(struct rp } /** + * xs_encode_tcp_record_marker - add a record marker to an RPC message + * @buf: buffer containing message to update + * + */ +static inline void xs_encode_tcp_record_marker(struct xdr_buf *buf) +{ + u32 reclen = buf->len - sizeof(rpc_fraghdr); + rpc_fraghdr *base = buf->head[0].iov_base; + *base = htonl(RPC_LAST_STREAM_FRAGMENT | reclen); +} + +/** * xs_tcp_send_request - write an RPC request to a TCP socket * @task: address of RPC task that manages the state of an RPC request * @@ -295,14 +307,12 @@ static int xs_tcp_send_request(struct rp struct rpc_xprt *xprt = req->rq_xprt; struct socket *sock = xprt->sock; struct xdr_buf *xdr = &req->rq_snd_buf; - u32 *marker = req->rq_svec[0].iov_base; int status, retry = 0; if (!sock) return -ENOTCONN; - /* Write the record marker */ - *marker = htonl(0x80000000|(req->rq_slen-sizeof(*marker))); + xs_encode_tcp_record_marker(&req->rq_snd_buf); xs_pktdump("packet data:", req->rq_svec->iov_base, @@ -517,16 +527,19 @@ static 
inline void xs_tcp_read_fraghdr(s xprt->tcp_offset += used; if (used != len) return; + xprt->tcp_reclen = ntohl(xprt->tcp_recm); - if (xprt->tcp_reclen & 0x80000000) + if (xprt->tcp_reclen & RPC_LAST_STREAM_FRAGMENT) xprt->tcp_flags |= XPRT_LAST_FRAG; else xprt->tcp_flags &= ~XPRT_LAST_FRAG; - xprt->tcp_reclen &= 0x7fffffff; + xprt->tcp_reclen &= RPC_FRAGMENT_SIZE_MASK; + xprt->tcp_flags &= ~XPRT_COPY_RECM; xprt->tcp_offset = 0; + /* Sanity check of the record length */ - if (xprt->tcp_reclen < 4) { + if (unlikely(xprt->tcp_reclen < 4)) { printk(KERN_ERR "RPC: Invalid TCP record fragment length\n"); xprt_disconnect(xprt); } @@ -1114,6 +1127,7 @@ int xs_setup_udp(struct rpc_xprt *xprt, xprt->prot = IPPROTO_UDP; xprt->port = XS_MAX_RESVPORT; + xprt->tsh_size = 0; xprt->stream = 0; xprt->nocong = 0; xprt->cwnd = RPC_INITCWND; @@ -1154,11 +1168,12 @@ int xs_setup_tcp(struct rpc_xprt *xprt, xprt->prot = IPPROTO_TCP; xprt->port = XS_MAX_RESVPORT; + xprt->tsh_size = sizeof(rpc_fraghdr); xprt->stream = 1; xprt->nocong = 1; xprt->cwnd = RPC_MAXCWND(xprt); xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; - xprt->max_payload = (1U << 31) - 1; + xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; INIT_WORK(&xprt->connect_worker, xs_tcp_connect_worker, xprt);