[rds-devel] An example of how to use RDS v3 zero copy extensions
Richard Frank
richard.frank at oracle.com
Wed Nov 14 12:18:16 PST 2007
An example of how to set up, request an rdma to/from a local buffer
from a remote rdma server, and clean up.
1) Read the RDS domain, sol_rds, etc. values set up by the RDS driver after loading.
#include "rds_api.h"
fd = open("/proc/sys/net/rds/pf_rds", O_RDONLY);
if (fd < 0)
die_errno("open(/proc/sys/net/rds/pf_rds) failed");
read(fd, str, sizeof(str));
sscanf(str, "%d", &rds_domain);
close(fd);
2) Create RDS socket.
type = SOCK_SEQPACKET;
protocol = 0;
fd = socket(rds_domain, type, protocol);
if (fd < 0)
die_errno("socket(%d, %d, %d) failed", domain, type,
protocol);
if (bind(fd, (struct sockaddr *)sin, sizeof(struct sockaddr_in)))
die_errno("bind() failed");
3) Create a memory key for the local buffer which is to be used as the source
or target (or both) of the rdma operation. Here you must specify the
destination IP which will be initiating the rdma operation. Note that if
the transport underpinning the destination does not support rdma operations,
then this call will fail with enotsupported.
uint64_t rkey = 0;
struct rds_get_mr_args mr_args;
mr_args.vec.addr = destination_addr;
mr_args.vec.bytes = size; /* size of buffer */
mr_args.key_addr = (uint64_t)&rkey; /* rdma key */
if (setsockopt(fd, SOL_RDS, RDS_GET_MR, (char*)&mr_args, sizeof(mr_args)))
die_errno("setsockopt(RDS_GET_MR) failed");
4) Send the key, along with the request to perform the rdma, to the rdma server via
normal sendto/sendmsg, and then
wait for the response message from the rdma server indicating that the request is complete.
...
sendmsg(request)
...
/* wait for response message indicating rdma is complete */
...
poll(socket,timeout)
...
5) On the rdma server - receive the request to rdma data to / from the remote
buffer, together with the rdma key, and issue the rdma.
request_msg = recvmsg(socket,...);
When issuing an RDMA, a normal message termed "immediate data" is
sent along with the rdma request. The rdma request is passed to the RDS
driver by the send call via cmsg ancillary data (man 3 cmsg). Both the immediate
data and the rdma are treated as one atomic send operation - either both will
arrive in order -> 1) rdma 2) immediate data - or neither will arrive.
struct rds_iovec localva[MSG_MAXIOVLEN];
struct iovec immdva[MSG_MAXIOVLEN];
struct cmsghdr *cmsg;
char ctlbuf[CMSG_SPACE(sizeof(struct rds_rdma_args))];
struct rds_rdma_args *rdmap;
struct msghdr msg;
/* cmsg setup for rdma */
msg.msg_control = ctlbuf;
msg.msg_controllen = sizeof(ctlbuf);
cmsg = CMSG_FIRSTHDR(&msg);
cmsg->cmsg_level = sol_rds; /* read from
"/proc/sys/net/rds/sol_rds" */
cmsg->cmsg_type = RDS_CMSG_RDMA_ARGS;
cmsg->cmsg_len = CMSG_LEN(sizeof(struct rds_rdma_args));
msg.msg_controllen = cmsg->cmsg_len;
/* describe local buffer to rdma in to / out of */
localva[0].addr = local_buf;
localva[0].bytes = local_size;
/* set up rdma args */
rdmap = (struct rds_rdma_args *)CMSG_DATA(cmsg);
rdmap->local_vec_addr = localva;
rdmap->nr_local = 1;
/* when a rdma operation is initiated an identifier for the
operation is returned */
/* The rdma_id can be used in an rds_barrier operation to detect
completion */
/* of an rdma */
rdmap->rdma_id_addr = &rdma_id; /* returned rdma_id of operation */
rdmap->remote_vec.addr = request_msg.remote_addr;
rdmap->remote_vec.bytes = request_msg.rdma_size;
rdmap->r_key = request_msg.rdma_key;
/* rdma is read or write */
if (RDMA_WRITE_REQUEST == request_msg.rdma_op)
rdmap->flags = RDS_RDMA_ARGS_WRITE;
else
rdmap->flags = 0;
/* immediate data (ack message) to go as response to inform client
that rdma is complete */
immdva[0].iov_base = ack_buf;
immdva[0].iov_len = sizeof(ack_buf);
/* setup send immediate data header */
msg.msg_iovlen = 1;
msg.msg_iov = immdva;
msg.msg_name = (struct sockaddr *)&sin;
msg.msg_namelen = sizeof(sin);
/*issue rdma with immediate data following */
ret = sendmsg(fd, &msg, 0);
if (ret != opts->ack_size)
die_errno("sendto() returned %zd", ret);
/* now wait for rdma operation to complete */
struct rds_barrier_args args;
uint64_t done_rdma_id = 0
args.daddr = (uint32_t)saddr->sin_addr.s_addr; /* destination of
rdma op */
args.rdma_id_addr = (uint64_t)&done_rdma_id; /* current complete rdma
id returned */
args.wait_rdma_id = rdma_id; /* returned from rdma send */
while (rdma_id > done_rdma_id) {
/* ask rds if rdma is complete. returns eagain if not, else
success or error */
/* this is a non-blocking call */
status = setsockopt(fd, sol_rds, RDS_BARRIER, (char*)&args,
sizeof(args)))
if (!eagain && !bad_status)
continue;
/* operation is not complete - sleep waiting for rdma operation to
complete */
poll(fd,timeout);
}
6) On the client requesting the rdma - receive the immediate data message indicating
that the rdma is complete, and release the rdma key.
...
poll()
...
response_msg = recvmsg(socket,,,,);
...
...
/* free rdma key for operation */
struct rds_free_mr_args mr_args;
mr_args.key = rdma_key;
mr_args.flags = 0;
if (setsockopt(fd, sol_rds, RDS_FREE_MR, (char*)&mr_args,
sizeof(mr_args)))
die_errno("setsockopt(RDS_GET_MR) failed");
More information about the rds-devel
mailing list