[rds-devel] An example of how to use RDS v3 zero copy extensions

Richard Frank richard.frank at oracle.com
Wed Nov 14 12:18:16 PST 2007

An example of how to set up and request an rdma to/from a local buffer 
from a remote rdma server, and then clean up.

1) Read the RDS domain, sol_rds, etc. values set up by the RDS driver after loading.

#include "rds_api.h"

      fd = open("/proc/sys/net/rds/pf_rds", O_RDONLY);
       if (fd < 0)
               die_errno("open(/proc/sys/net/rds/pf_rds) failed");
       read(fd, str, sizeof(str));
       sscanf(str, "%d", &rds_domain);

2) Create RDS socket.

   /*
    * Create a socket in the address family read from pf_rds and bind it
    * to the local address pointed to by sin.
    */
   protocol = 0;
   fd = socket(rds_domain, type, protocol);
   if (fd < 0)
               die_errno("socket(%d, %d, %d) failed", rds_domain, type, protocol);
   if (bind(fd, (struct sockaddr *)sin, sizeof(struct sockaddr_in)))
              die_errno("bind() failed");

3)  Create a memory key for the local buffer which is to be used as source 
or target (or both) for the rdma operation. Here you must specify the 
destination IP which will be initiating the rdma operation. Note that if 
the transport underpinning the destination does not support rdma operations, 
then this call will fail with enotsupported.

 uint64_t     rkey = 0;
 struct rds_get_mr_args mr_args;

 mr_args.vec.addr = destination_addr;
 mr_args.vec.bytes = size;    /* size of buffer */
 mr_args.key_addr = (uint64_t)&rkey;   /* rdma key */

 if (setsockopt(fd, SOL_RDS, RDS_GET_MR, (char*)&mr_args, sizeof(mr_args)))
   die_errno("setsockopt(RDS_GET_MR) failed");

4) Send the key along with the request to perform the rdma to the rdma 
server via normal sendto/sendmsg, and then wait for the response message 
from the rdma server indicating that the request is complete.

/* wait for response message indicating rdma is complete */

5) The rdma server receives the request to rdma data to / from the remote 
buffer, together with the rdma key, and issues the rdma.

request_msg = recvmsg(socket,...);

When issuing an RDMA, a normal message termed "immediate data" is 
sent along with the rdma request, which is passed via send to the RDS 
driver in the cmsg ancillary data (man 3 cmsg).  Both the immediate 
data and the rdma are treated as one atomic send operation - either both 
will arrive in order -> 1) rdma 2) immediate data - or neither will arrive.

     struct rds_iovec   localva[MSG_MAXIOVLEN];
     struct iovec   immdva[MSG_MAXIOVLEN];
     struct cmsghdr *cmsg;
     char ctlbuf[CMSG_SPACE(sizeof(struct rds_rdma_args))];
     struct rds_rdma_args *rdmap;
     struct msghdr msg;

     /* cmsg setup for rdma */
     msg.msg_control = ctlbuf;
     msg.msg_controllen = sizeof(ctlbuf);
     cmsg = CMSG_FIRSTHDR(&msg);
     cmsg->cmsg_level = sol_rds; /* read from 
"/proc/sys/net/rds/sol_rds" */
     cmsg->cmsg_type = RDS_CMSG_RDMA_ARGS;
     cmsg->cmsg_len = CMSG_LEN(sizeof(struct rds_rdma_args));
     msg.msg_controllen = cmsg->cmsg_len;

     /* describe local buffer to rdma in to / out of */
     localva[0].addr = local_buf;
     localva[0].bytes = local_size;

     /* set up rdma args */
     rdmap = (struct rds_rdma_args *)CMSG_DATA(cmsg);
     rdmap->local_vec_addr = localva;
     rdmap->nr_local = 1;
     /* when a rdma operation is initiated an identifier for the 
operation is returned */
     /* The rdma_id can be used in an rds_barrier operation to detect 
completion */
     /* of an rdma */
     rdmap->rdma_id_addr = &rdma_id;  /* returned rdma_id of operation */
     rdmap->remote_vec.addr =  request_msg.remote_addr;     
     rdmap->remote_vec.bytes = request_msg.rdma_size;
     rdmap->r_key = request_msg.rdma_key;
     /* rdma is read or write */
     if (RDMA_WRITE_REQUEST == request_msg.rdma_op)
        rdmap->flags = RDS_RDMA_ARGS_WRITE;
        rdmap->flags = 0;

      /* immediate data (ack message) to go as response to inform client 
that rdma is complete */
     immdva[0].iov_base = ack_buf;
     immdva[0].iov_len = sizeof(ack_buf);
     /* setup send immediate data header */
     msg.msg_iovlen = 1;
     msg.msg_iov =  immdva;
     msg.msg_name  = (struct sockaddr *)&sin;
     msg.msg_namelen  = sizeof(sin);

     /*issue rdma with immediate data following */
     ret = sendmsg(fd, &msg, 0);
     if (ret != opts->ack_size)
       die_errno("sendto() returned %zd", ret);

  /* now wait for rdma operation to complete */
   struct rds_barrier_args args;
   uint64_t done_rdma_id = 0;
   args.daddr = (uint32_t)saddr->sin_addr.s_addr;  /* destination of rdma op */
   args.rdma_id_addr = (uint64_t)&done_rdma_id; /* current complete rdma id returned */
   args.wait_rdma_id = rdma_id; /* returned from rdma send */

   while (rdma_id > done_rdma_id) {
       /* ask rds if rdma is complete. returns eagain if not, else
        * success or error */
       /* this is a non-blocking call */
      status = setsockopt(fd, sol_rds, RDS_BARRIER, (char*)&args, sizeof(args));
      /* NOTE(review): eagain and bad_status are placeholders from the
       * original post, presumably "status failed with EAGAIN" and "status
       * failed with any other error"; a real caller must derive them from
       * status/errno and handle the error case. The break is reconstructed:
       * the original statement body was lost - confirm. */
      if (!eagain && !bad_status)
              break;
      /* operation is not complete - sleep waiting for rdma operation to
       * complete */
   }

6) The client that requested the rdma receives the immediate data message 
indicating that the rdma is complete, and releases the rdma key.
 response_msg = recvmsg(socket,,,,);
/* free rdma key for operation */

 /* Release the rdma key with RDS_FREE_MR once the operation is finished.
  * NOTE(review): rdma_key is presumably the key returned by RDS_GET_MR
  * (stored in rkey in step 3) - confirm. */
 struct rds_free_mr_args mr_args;
 mr_args.key = rdma_key;
 mr_args.flags = 0;

 if (setsockopt(fd, sol_rds, RDS_FREE_MR, (char*)&mr_args, sizeof(mr_args)))
   die_errno("setsockopt(RDS_FREE_MR) failed");

More information about the rds-devel mailing list