[rds-devel] [PATCH] rds: support for IB_DEVICE_LOCAL_DMA_LKEY (resend)

Jon Mason jon at opengridcomputing.com
Mon Aug 4 16:22:18 PDT 2008


This is a resend of the patch based on the ofed-1.4 2.6.27-rc1 git tree.
Please apply on top of the previous patches sent out today.

For iWARP, there is a limitation where syncs to remote memory need write
permission.  Allowing remote write is a potential security risk, because
all memory becomes available to remote clients.  Using the local_dma_lkey
removes the need for remote write permission on local memory regions.
The patch below converts the usage of dma_mr's to dma_local_lkey and
removes the allocation of dma_mr's (if IB_DEVICE_LOCAL_DMA_LKEY is
supported).

Also, Chelsio has a limitation of not being able to access DMA MR
regions that reside in memory above 4GB.  With this patch, rds bcopy
will work on systems with more than 4GB of RAM.

For IB, using local_dma_lkey removes the need for DMA MR allocations
(presuming that the driver supports IB_DEVICE_LOCAL_DMA_LKEY).

Signed-Off-By: Jon Mason <jon at opengridcomputing.com>

diff --git a/net/rds/ib.c b/net/rds/ib.c
index 6c5328f..775a41e 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -79,6 +79,7 @@ void rds_ib_add_one(struct ib_device *device)
 
 	spin_lock_init(&rds_ibdev->spinlock);
 
+	rds_ibdev->dma_local_lkey = !!(dev_attr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY);
 	rds_ibdev->max_wrs = dev_attr->max_qp_wr;
 	rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
 
@@ -95,18 +96,21 @@ void rds_ib_add_one(struct ib_device *device)
 	if (IS_ERR(rds_ibdev->pd))
 		goto free_dev;
 
-	if (device->node_type != RDMA_NODE_RNIC) {
-		rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd,
-					IB_ACCESS_LOCAL_WRITE);
-	} else {
-		/* Why does it have to have these permissions? */
-		rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd,
-					IB_ACCESS_REMOTE_READ |
-					IB_ACCESS_REMOTE_WRITE |
-					IB_ACCESS_LOCAL_WRITE);
-	}
-	if (IS_ERR(rds_ibdev->mr))
-		goto err_pd;
+	if (!rds_ibdev->dma_local_lkey) {
+		if (device->node_type != RDMA_NODE_RNIC) {
+			rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd,
+						IB_ACCESS_LOCAL_WRITE);
+		} else {
+			/* Why does it have to have these permissions? */
+			rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd,
+						IB_ACCESS_REMOTE_READ |
+						IB_ACCESS_REMOTE_WRITE |
+						IB_ACCESS_LOCAL_WRITE);
+		}
+		if (IS_ERR(rds_ibdev->mr))
+			goto err_pd;
+	} else
+		rds_ibdev->mr = NULL;
 
 	rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev);
 	if (IS_ERR(rds_ibdev->mr_pool)) {
@@ -122,7 +126,8 @@ void rds_ib_add_one(struct ib_device *device)
 	goto free_attr;
 
 err_mr:
-	ib_dereg_mr(rds_ibdev->mr);
+	if (!rds_ibdev->dma_local_lkey)
+		ib_dereg_mr(rds_ibdev->mr);
 err_pd:
 	ib_dealloc_pd(rds_ibdev->pd);
 free_dev:
@@ -148,7 +153,9 @@ void rds_ib_remove_one(struct ib_device *device)
 	if (rds_ibdev->mr_pool)
 		rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
 
-	ib_dereg_mr(rds_ibdev->mr);
+	if (rds_ibdev->mr)
+		ib_dereg_mr(rds_ibdev->mr);
+
 	ib_dealloc_pd(rds_ibdev->pd);
 	
 	list_del(&rds_ibdev->list);
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 2a0682f..d4e19bd 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -125,7 +125,8 @@ struct rds_ib_connection {
 	/* Protocol version specific information */
 	unsigned int		i_flowctl : 1,	/* enable/disable flow ctl */
 				i_iwarp   : 1,	/* this is actually iWARP not IB */
-				i_fastreg : 1;	/* device supports fastreg */
+				i_fastreg : 1,	/* device supports fastreg */
+				i_dma_local_lkey : 1;
 
 	/* Batched completions */
 	unsigned int		i_unsignaled_wrs;
@@ -157,7 +158,8 @@ struct rds_ib_device {
 	unsigned int		max_fmrs;
 	int			max_sge;
 	unsigned int		max_wrs;
-	unsigned int		use_fastreg : 1;
+	unsigned int		use_fastreg : 1,
+				dma_local_lkey : 1;
 	spinlock_t		spinlock;
 };
 
@@ -232,6 +234,10 @@ static void inline rds_ib_dma_sync_sg_for_device(struct ib_device *dev,
 }
 #define ib_dma_sync_sg_for_device	rds_ib_dma_sync_sg_for_device
 
+static inline u32 rds_ib_local_dma_lkey(struct rds_ib_connection *ic)
+{
+	return (ic->i_dma_local_lkey ? ic->i_cm_id->device->local_dma_lkey : ic->i_mr->lkey);
+}
 
 /* ib.c */
 extern struct rds_transport rds_ib_transport;
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 6e9db6a..c413b6d 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -458,6 +458,7 @@ static int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
 	/* Remember whether this is IB or iWARP */
 	ic->i_iwarp = (cm_id->device->node_type == RDMA_NODE_RNIC);
 	ic->i_fastreg = rds_ibdev->use_fastreg;
+	ic->i_dma_local_lkey = rds_ibdev->dma_local_lkey;
 
  	/* We got halfway through setting up the ib_connection, if we
  	 * fail now, we have to take the long route out of this mess. */
@@ -613,6 +614,7 @@ out:
 int rds_ib_conn_connect(struct rds_connection *conn)
 {
 	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct rds_ib_device *rds_ibdev;
 	struct sockaddr_in src, dest;
 	int ret;
 
@@ -640,8 +642,12 @@ int rds_ib_conn_connect(struct rds_connection *conn)
 		goto out;
 	}
 
+	rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
+
 	/* Now check the device type and set i_iwarp */
 	ic->i_iwarp = (ic->i_cm_id->device->node_type == RDMA_NODE_RNIC);
+	ic->i_fastreg = rds_ibdev->use_fastreg;
+	ic->i_dma_local_lkey = rds_ibdev->dma_local_lkey;
 
 	dest.sin_family = AF_INET;
 	dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 6b3b476..5c69d92 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -97,12 +97,12 @@ void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
 		sge = rds_ib_data_sge(ic, recv->r_sge);
 		sge->addr = 0;
 		sge->length = RDS_FRAG_SIZE;
-		sge->lkey = ic->i_mr->lkey;
+		sge->lkey = rds_ib_local_dma_lkey(ic);
 
 		sge = rds_ib_header_sge(ic, recv->r_sge);
 		sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
 		sge->length = sizeof(struct rds_header);
-		sge->lkey = ic->i_mr->lkey;
+		sge->lkey = rds_ib_local_dma_lkey(ic);
 	}
 }
 
@@ -364,7 +364,7 @@ void rds_ib_recv_init_ack(struct rds_ib_connection *ic)
 
 	sge->addr = ic->i_ack_dma;
 	sge->length = sizeof(struct rds_header);
-	sge->lkey = ic->i_mr->lkey;
+	sge->lkey = rds_ib_local_dma_lkey(ic);
 
 	wr->sg_list = sge;
 	wr->num_sge = 1;
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 865301a..1b51526 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -144,12 +144,12 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic)
 		send->s_wr.ex.imm_data = 0;
 
 		sge = rds_ib_data_sge(ic, send->s_sge);
-		sge->lkey = ic->i_mr->lkey;
+		sge->lkey = rds_ib_local_dma_lkey(ic);
 
 		sge = rds_ib_header_sge(ic, send->s_sge);
 		sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
 		sge->length = sizeof(struct rds_header);
-		sge->lkey = ic->i_mr->lkey;
+		sge->lkey = rds_ib_local_dma_lkey(ic);
 	}
 }
 
@@ -425,7 +425,7 @@ rds_ib_xmit_populate_wr(struct rds_ib_connection *ic,
 		sge = rds_ib_data_sge(ic, send->s_sge);
 		sge->addr = buffer;
 		sge->length = length;
-		sge->lkey = ic->i_mr->lkey;
+		sge->lkey = rds_ib_local_dma_lkey(ic);
 
 		sge = rds_ib_header_sge(ic, send->s_sge);
 	} else {
@@ -437,7 +437,7 @@ rds_ib_xmit_populate_wr(struct rds_ib_connection *ic,
 
 	sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header));
 	sge->length = sizeof(struct rds_header);
-	sge->lkey = ic->i_mr->lkey;
+	sge->lkey = rds_ib_local_dma_lkey(ic);
 }
 
 /*
@@ -791,7 +791,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 			len = sg_dma_len(scat);
 			send->s_sge[j].addr = sg_dma_address(scat);
 			send->s_sge[j].length = len;
-			send->s_sge[j].lkey = ic->i_mr->lkey;
+			send->s_sge[j].lkey = rds_ib_local_dma_lkey(ic);
 
 			sent += len;
 			rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr);



More information about the rds-devel mailing list