[rds-devel] what is rdma immediate data and how is it used ?

Olaf Kirch olaf.kirch at oracle.com
Tue Jan 8 05:43:01 PST 2008


Hi Or,

On Tuesday 08 January 2008 13:01, Or Gerlitz wrote:
> For the cases you mentioned here, where the immediate data send is 
> posted to the HCA immediately after issuing the rdma read, the fence 
> indicator bit (IBV_SEND_FENCE) must be set in the send wqe flags, else 

I noticed that in the spec as well, when trying to understand some other
aspects of RDMA.

Isn't that something the application layer should take care of by default?
As far as I can tell the spec says "bad things can happen if you're careless;
to be on the safe side use the Fence flag". But we lose performance that way,
right? So we could make this a sendmsg flag, as in the patch below.

Olaf
-- 
Olaf Kirch  |  --- o --- Nous sommes du soleil we love when we play
okir at lst.de |    / | \   sol.dhoop.naytheet.ah kin.ir.samse.qurax
------------
From: Olaf Kirch <olaf.kirch at oracle.com>
Subject: [RDS] Properly fence off RDMA reads

The IB specification warns that a SEND call can interfere with a
preceding RDMA read if they both access the same memory region. To
prevent this, you can set the FENCE flag on the SEND call (or fix
your application to prevent that from happening).

This patch adds a flag to the RDMA sendmsg parameters that lets the
application request that the subsequent SEND should use the FENCE flag.

Signed-off-by: Olaf Kirch <olaf.kirch at oracle.com>
---
 net/rds/ib_send.c |   19 +++++++++++++++----
 net/rds/rdma.c    |    4 +++-
 net/rds/rdma.h    |    3 ++-
 3 files changed, 20 insertions(+), 6 deletions(-)

Index: ofa-kernel-1.3/net/rds/ib_send.c
===================================================================
--- ofa-kernel-1.3.orig/net/rds/ib_send.c
+++ ofa-kernel-1.3/net/rds/ib_send.c
@@ -316,6 +316,7 @@ int rds_ib_xmit(struct rds_connection *c
 	u32 pos;
 	u32 i;
 	u32 work_alloc;
+	int send_flags = 0;
 	int sent;
 	int ret;
 
@@ -372,6 +373,15 @@ int rds_ib_xmit(struct rds_connection *c
 	sent = 0;
 	i = 0;
 
+	/* Sometimes you want to put a fence between an RDMA
+	 * READ and the following SEND.
+	 * We could either do this all the time
+	 * or when requested by the user. Right now, we let
+	 * the application choose.
+	 */
+	if (rm->m_rdma_op && rm->m_rdma_op->r_fence)
+		send_flags = IB_SEND_FENCE;
+
 	/*
 	 * We could be copying the header into the unused tail of the page.
 	 * That would need to be changed in the future when those pages might
@@ -385,13 +395,14 @@ int rds_ib_xmit(struct rds_connection *c
 	if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) {
 		first->s_sge[0].addr = 0;
 		first->s_sge[0].length = 0;
+		first->s_wr.send_flags = send_flags;
 		first->s_wr.next = NULL;
 		goto add_header;
 	}
 
 	/* if there's data reference it with a chain of work reqs */
 	for(; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) {
-		send->s_wr.send_flags = 0;
+		send->s_wr.send_flags = send_flags;
 		send->s_queued = jiffies;
 
                 /* 
@@ -400,7 +411,7 @@ int rds_ib_xmit(struct rds_connection *c
                  */
 		if (ic->i_unsignaled_wrs-- == 0) {
 			ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
-			send->s_wr.send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+			send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
 		}
 
 		send->s_wr.next = NULL;
@@ -423,7 +434,7 @@ int rds_ib_xmit(struct rds_connection *c
 		ic->i_unsignaled_bytes -= len;
 		if (ic->i_unsignaled_bytes <= 0) {
 			ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes;
-			send->s_wr.send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+			send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
 		}
 
 		rdsdebug("send %p wr %p num_sge %u next %p\n", send,
@@ -457,7 +468,7 @@ add_header:
 	/* if we finished the message then send completion owns it */
 	if (scat == &rm->m_sg[rm->m_count]) {
 		prev->s_rm = ic->i_rm;
-		prev->s_wr.send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+		prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
 		ic->i_rm = NULL;
 	}
 
Index: ofa-kernel-1.3/net/rds/rdma.c
===================================================================
--- ofa-kernel-1.3.orig/net/rds/rdma.c
+++ ofa-kernel-1.3/net/rds/rdma.c
@@ -71,7 +71,8 @@ struct rds_free_mr_args {
 	u64		flags;
 };
 
-#define RDS_RDMA_ARGS_WRITE 1  /* read when not set */
+#define RDS_RDMA_ARGS_WRITE	1  /* read when not set */
+#define RDS_RDMA_ARGS_FENCE	2  /* use FENCE for immediate send */
 
 struct rds_rdma_args {
 	struct rds_iovec remote_vec;
@@ -457,6 +458,7 @@ static struct rds_rdma_op *rds_rdma_prep
 	}
 
 	op->r_write = args->flags & RDS_RDMA_ARGS_WRITE ? 1 : 0;
+	op->r_fence = !!(args->flags & RDS_RDMA_ARGS_FENCE);
 
 	/* TODO: if we are using an fmr then we need to zero base the remote address */
 	/* if rkey is not special device (or magic key) then assume it is an fmr */
Index: ofa-kernel-1.3/net/rds/rdma.h
===================================================================
--- ofa-kernel-1.3.orig/net/rds/rdma.h
+++ ofa-kernel-1.3/net/rds/rdma.h
@@ -24,7 +24,8 @@ struct rds_rdma_op {
 	u64			remote_addr;
         u64                     rdma_id_addr;
         u64                     r_rdma_id;
-	unsigned		r_write:1;
+	unsigned		r_write : 1,
+				r_fence : 1;
 	unsigned int		r_nents;
 	unsigned int		r_count;
 	struct scatterlist	r_sg[0];



More information about the rds-devel mailing list