Subject: [PATCH] RPC: skip over transport-specific heads automatically Category: Code re-organization, maintainability Description: Add a mechanism for skipping over transport-specific headers when constructing an RPC request. This removes another "xprt->stream" dependency, and gets rid of a conditional branch in the RPC send hot path. Test-plan: Write-intensive workload on a single mount point. include/linux/sunrpc/xprt.h | 9 +++++++ net/sunrpc/auth_gss/auth_gss.c | 4 --- net/sunrpc/clnt.c | 4 --- net/sunrpc/ipv4_sock.c | 38 ++++++++++++++++++++++++++++++--- 4 files changed, 46 insertions(+), 9 deletions(-) Signed-off-by: Chuck Lever Applies-to: 2.6.8.1 Created: Tue, 17 Aug 2004 23:49:42 -0400 System-ID: Linux climax.citi.umich.edu 2.6.8.1 #1 SMP Tue Aug 17 15:53:50 EDT 2004 i386 diff -X /home/cel/src/linux/dont-diff -Naurp 45-send_request-tcp-udp/include/linux/sunrpc/xprt.h 46-tsh_size/include/linux/sunrpc/xprt.h --- 45-send_request-tcp-udp/include/linux/sunrpc/xprt.h 2004-08-17 13:54:57.475396000 -0400 +++ 46-tsh_size/include/linux/sunrpc/xprt.h 2004-08-17 14:03:32.454169000 -0400 @@ -155,6 +155,9 @@ struct rpc_xprt { unsigned int rcvsize, /* socket receive buffer size */ sndsize; /* socket send buffer size */ + unsigned int tsh_size; /* size of transport specific + header, in bytes */ + struct rpc_wait_queue sending; /* requests waiting to send */ struct rpc_wait_queue resend; /* requests waiting to resend */ struct rpc_wait_queue pending; /* requests in flight */ @@ -236,6 +239,12 @@ int xprt_sock_setup_tcp(struct rpc_xpr struct sockaddr_in *, void *); +static __inline__ unsigned int +xprt_tsh_size(struct rpc_xprt *xprt) +{ + return xprt->tsh_size; +} + /* * Reserved bit positions in xprt->state diff -X /home/cel/src/linux/dont-diff -Naurp 45-send_request-tcp-udp/net/sunrpc/auth_gss/auth_gss.c 46-tsh_size/net/sunrpc/auth_gss/auth_gss.c --- 45-send_request-tcp-udp/net/sunrpc/auth_gss/auth_gss.c 2004-08-17 13:52:58.142904000 -0400 +++ 
46-tsh_size/net/sunrpc/auth_gss/auth_gss.c 2004-08-17 14:03:32.468151000 -0400 @@ -725,9 +725,7 @@ gss_marshal(struct rpc_task *task, u32 * /* We compute the checksum for the verifier over the xdr-encoded bytes * starting with the xid and ending at the end of the credential: */ iov.iov_base = req->rq_snd_buf.head[0].iov_base; - if (task->tk_client->cl_xprt->stream) - /* See clnt.c:call_header() */ - iov.iov_base += 4; + iov.iov_base += xprt_tsh_size(task->tk_client->cl_xprt); iov.iov_len = (u8 *)p - (u8 *)iov.iov_base; xdr_buf_from_iov(&iov, &verf_buf); diff -X /home/cel/src/linux/dont-diff -Naurp 45-send_request-tcp-udp/net/sunrpc/clnt.c 46-tsh_size/net/sunrpc/clnt.c --- 45-send_request-tcp-udp/net/sunrpc/clnt.c 2004-08-17 15:14:26.331514000 -0400 +++ 46-tsh_size/net/sunrpc/clnt.c 2004-08-17 15:14:31.503895000 -0400 @@ -936,13 +936,11 @@ static u32 * call_header(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; - struct rpc_xprt *xprt = clnt->cl_xprt; struct rpc_rqst *req = task->tk_rqstp; u32 *p = req->rq_svec[0].iov_base; /* FIXME: check buffer size? */ - if (xprt->stream) - *p++ = 0; /* fill in later */ + (u8 *) p += xprt_tsh_size(clnt->cl_xprt); /* transport specific header */ *p++ = req->rq_xid; /* XID */ *p++ = htonl(RPC_CALL); /* CALL */ *p++ = htonl(RPC_VERSION); /* RPC version */ diff -X /home/cel/src/linux/dont-diff -Naurp 45-send_request-tcp-udp/net/sunrpc/ipv4_sock.c 46-tsh_size/net/sunrpc/ipv4_sock.c --- 45-send_request-tcp-udp/net/sunrpc/ipv4_sock.c 2004-08-17 14:01:50.560624000 -0400 +++ 46-tsh_size/net/sunrpc/ipv4_sock.c 2004-08-17 14:03:32.474151000 -0400 @@ -85,6 +85,38 @@ xprt_sock_pktdump(char *msg, u32 *packet } #endif +/* + * From RFC 1831: + * + * "A record is composed of one or more record fragments. A record + * fragment is a four-byte header followed by 0 to (2**31) - 1 bytes of + * fragment data. The bytes encode an unsigned binary number; as with + * XDR integers, the byte order is from highest to lowest. 
The number + * encodes two values -- a boolean which indicates whether the fragment + * is the last fragment of the record (bit value 1 implies the fragment + * is the last fragment) and a 31-bit unsigned binary value which is the + * length in bytes of the fragment's data. The boolean value is the + * highest-order bit of the header; the length is the 31 low-order bits. + * (Note that this record specification is NOT in XDR standard form!)" + * + * We don't expect to have to send more than 2GB of data in one RPC + * request, so the Linux RPC client always sends its requests in a single + * record fragment. + */ + +typedef __u32 rpc_stream_record_marker_t; +#define RPC_LAST_STREAM_FRAGMENT (1U << 31) + +static inline void +xprt_sock_encode_tcp_record_marker(struct xdr_buf *buf) +{ + u32 reclen = buf->len - sizeof(rpc_stream_record_marker_t); + rpc_stream_record_marker_t *base = buf->head[0].iov_base; + + /* encode the marker in the appointed location */ + *base = htonl(RPC_LAST_STREAM_FRAGMENT | reclen); +} + /** * xprt_sock_sendpages - write pages directly to a socket * @sock: socket to send on @@ -306,14 +338,12 @@ xprt_sock_tcp_send_request(struct rpc_ta struct rpc_xprt *xprt = req->rq_xprt; struct socket *sock = xprt->sock; struct xdr_buf *xdr = &req->rq_snd_buf; - u32 *marker = req->rq_svec[0].iov_base; int status, retry = 0; if (!sock) return -ENOTCONN; - /* Write the record marker */ - *marker = htonl(0x80000000|(req->rq_slen-sizeof(*marker))); + xprt_sock_encode_tcp_record_marker(&req->rq_snd_buf); xprt_sock_pktdump("packet data:", req->rq_svec->iov_base, @@ -1216,6 +1246,7 @@ xprt_sock_setup_udp(struct rpc_xprt *xpr xprt->addr = *addr; xprt->prot = IPPROTO_UDP; xprt->port = XPRT_SOCK_MAX_RESVPORT; + xprt->tsh_size = 0; xprt->stream = 0; xprt->nocong = 0; xprt->cwnd = RPC_INITCWND; @@ -1258,6 +1289,7 @@ xprt_sock_setup_tcp(struct rpc_xpr xprt->addr = *addr; xprt->prot = IPPROTO_TCP; xprt->port = XPRT_SOCK_MAX_RESVPORT; + xprt->tsh_size = 
sizeof(rpc_stream_record_marker_t); xprt->stream = 1; xprt->nocong = 1; xprt->cwnd = RPC_MAXCWND(xprt);