[rds-devel] [RFC] rds iwarp: use send wr queue for fastreg and inv

Jon Mason jon at opengridcomputing.com
Tue Dec 9 16:42:04 PST 2008


Hey Andy,
This patch plugs a potential hole but, as it stands, introduces a
lockup (so please do not commit it until the cause can be found and
fixed).  Perhaps a couple of extra eyes can help me see where the
problem is.

The point of this patch is to remove the possibility of overflowing the
send wr queue (something I have never actually seen happen).  The
overflow is possible because the fastreg and invalidate work requests
are currently built from locally created ib_send_wr structs, bypassing
rds_ib_ring_alloc.  Allocating through rds_ib_ring_alloc closes the
hole: it refuses to hand out an entry when there is not enough room on
the send wr queue, the ring being sized slightly smaller than the max
send wr queue size.  The pattern is sketched just below.

In practice, this code causes a soft lockup (trace below).

BUG: soft lockup detected on CPU#0!

Call Trace:
 <IRQ>  [<ffffffff800b50fa>] softlockup_tick+0xd5/0xe7
 [<ffffffff800930e2>] update_process_times+0x42/0x68
 [<ffffffff800746e3>] smp_local_timer_interrupt+0x23/0x47
 [<ffffffff80074da5>] smp_apic_timer_interrupt+0x41/0x47
 [<ffffffff8005bc8e>] apic_timer_interrupt+0x66/0x6c
 <EOI>  [<ffffffff885e9b71>] :ib_rds:rds_send_xmit+0x2e/0x49a
 [<ffffffff885ea53f>] :ib_rds:rds_send_worker+0x0/0x79
 [<ffffffff885ea557>] :ib_rds:rds_send_worker+0x18/0x79
 [<ffffffff8004b2ab>] run_workqueue+0x94/0xe5
 [<ffffffff80047c09>] worker_thread+0x0/0x122
 [<ffffffff8009b283>] keventd_create_kthread+0x0/0x61
 [<ffffffff80047cf9>] worker_thread+0xf0/0x122
 [<ffffffff800884ac>] default_wake_function+0x0/0xe
 [<ffffffff8009b283>] keventd_create_kthread+0x0/0x61
 [<ffffffff8009b283>] keventd_create_kthread+0x0/0x61
 [<ffffffff800321d8>] kthread+0xfe/0x132
 [<ffffffff8005bfb1>] child_rip+0xa/0x11
 [<ffffffff8009b283>] keventd_create_kthread+0x0/0x61
 [<ffffffff800320da>] kthread+0x0/0x132
 [<ffffffff8005bfa7>] child_rip+0x0/0x11

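(The interrupted context is rds_send_xmit+0x2e, entered from
rds_send_worker, so the send path appears to be spinning without ever
returning to the workqueue.)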

Thanks,
Jon

Signed-off-by: Jon Mason <jon at opengridcomputing.com>


diff --git a/drivers/infiniband/ulp/rds/iw_rdma.c b/drivers/infiniband/ulp/rds/iw_rdma.c
index 0898bf8..a22a4a4 100644
--- a/drivers/infiniband/ulp/rds/iw_rdma.c
+++ b/drivers/infiniband/ulp/rds/iw_rdma.c
@@ -952,9 +952,14 @@ static int rds_ib_init_fastreg(struct rds_ib_mr_pool *pool,
 static int rds_ib_rdma_build_fastreg(struct rds_ib_mapping *mapping)
 {
 	struct rds_ib_mr *ibmr = mapping->m_mr;
-	struct ib_send_wr f_wr, *failed_wr;
+	struct rds_ib_send_work *send;
+	struct ib_send_wr *failed_wr;
+	struct rds_ib_connection *ic;
+	struct rds_connection *conn;
+	u32 pos, work_alloc;
 	int ret;
 
+	BUG_ON(mapping->m_sg.dma_npages > ibmr->fr_page_list->max_page_list_len);
 	/*
 	 * Perform a WR for the fast_reg_mr. Each individual page
 	 * in the sg list is added to the fast reg page list and placed
@@ -964,49 +969,84 @@ static int rds_ib_rdma_build_fastreg(struct rds_ib_mapping *mapping)
 	ib_update_fast_reg_key(ibmr->fr_mr, ibmr->remap_count++);
 	mapping->m_rkey = ibmr->fr_mr->rkey;
 
-	memset(&f_wr, 0, sizeof(f_wr));
-	f_wr.wr_id = ~0;
-	f_wr.opcode = IB_WR_FAST_REG_MR;
-	f_wr.wr.fast_reg.length = mapping->m_sg.bytes;
-	f_wr.wr.fast_reg.rkey = mapping->m_rkey;
-	f_wr.wr.fast_reg.page_list = ibmr->fr_page_list;
-	f_wr.wr.fast_reg.page_list_len = mapping->m_sg.dma_len;
-	f_wr.wr.fast_reg.page_shift = ibmr->device->fmr_page_shift;
-	f_wr.wr.fast_reg.access_flags = IB_ACCESS_LOCAL_WRITE |
+	conn = ibmr->cm_id->context;
+	ic = conn->c_transport_data;
+
+	work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos);
+	if (work_alloc != 1) {
+		printk("%s line %d: ENOMEM\n", __func__, __LINE__);
+		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+		rds_ib_stats_inc(s_ib_tx_ring_full);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	send = &ic->i_sends[pos];
+	send->s_wr.next = NULL;
+	send->s_wr.num_sge = 0;
+	send->s_wr.opcode = IB_WR_FAST_REG_MR;
+	send->s_wr.wr.fast_reg.length = mapping->m_sg.bytes;
+	send->s_wr.wr.fast_reg.rkey = mapping->m_rkey;
+	send->s_wr.wr.fast_reg.page_list = ibmr->fr_page_list;
+	send->s_wr.wr.fast_reg.page_list_len = mapping->m_sg.dma_npages;
+	send->s_wr.wr.fast_reg.page_shift = ibmr->device->fmr_page_shift;
+	send->s_wr.wr.fast_reg.access_flags = IB_ACCESS_LOCAL_WRITE |
 				IB_ACCESS_REMOTE_READ |
 				IB_ACCESS_REMOTE_WRITE;
-	f_wr.wr.fast_reg.iova_start = 0;
-	f_wr.send_flags = IB_SEND_SIGNALED;
+	send->s_wr.wr.fast_reg.iova_start = 0;
 
-	failed_wr = &f_wr;
-	ret = ib_post_send(ibmr->cm_id->qp, &f_wr, &failed_wr);
-	BUG_ON(failed_wr != &f_wr);
+	failed_wr = &send->s_wr;
+	ret = ib_post_send(ibmr->cm_id->qp, &send->s_wr, &failed_wr);
+	BUG_ON(failed_wr != &send->s_wr);
 	if (ret) {
 		printk(KERN_WARNING "RDS/IB: %s %d ib_post_send returned %d\n",
 			__func__, __LINE__, ret);
 		goto out;
 	}
 
+	ic->i_fastreg_posted = 1;
 out:
 	return ret;
 }
 
 static int rds_ib_rdma_fastreg_inv(struct rds_ib_mr *ibmr)
 {
-	struct ib_send_wr s_wr, *failed_wr;
-	int ret = 0;
+	struct rds_ib_send_work *send;
+	struct ib_send_wr *failed_wr;
+	struct rds_ib_connection *ic;
+	struct rds_connection *conn;
+	u32 pos, work_alloc;
+	int ret;
 
 	if (!ibmr->cm_id->qp || !ibmr->fr_mr)
 		goto out;
 
-	memset(&s_wr, 0, sizeof(s_wr));
-	s_wr.wr_id = ~0;
-	s_wr.opcode = IB_WR_LOCAL_INV;
-	s_wr.ex.invalidate_rkey = ibmr->fr_mr->rkey;
-	s_wr.send_flags = IB_SEND_SIGNALED;
+	conn = ibmr->cm_id->context;
+	ic = conn->c_transport_data;
+
+	work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos);
+	if (work_alloc != 1) {
+		printk("%s line %d: ENOMEM\n", __func__, __LINE__);
+		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+		rds_ib_stats_inc(s_ib_tx_ring_full);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	send = &ic->i_sends[pos];
+
+	send->s_wr.next = NULL;
+	send->s_wr.num_sge = 0;
+	send->s_wr.opcode = IB_WR_LOCAL_INV;
+	send->s_wr.send_flags = 0;
+	send->s_wr.ex.imm_data = 0;
+	send->s_wr.ex.invalidate_rkey = ibmr->fr_mr->rkey;
+	send->s_wr.wr.rdma.remote_addr = 0;
+	send->s_wr.wr.rdma.rkey = 0;
 
-	failed_wr = &s_wr;
-	ret = ib_post_send(ibmr->cm_id->qp, &s_wr, &failed_wr);
+	failed_wr = &send->s_wr;
+	ret = ib_post_send(ibmr->cm_id->qp, &send->s_wr, &failed_wr);
+	BUG_ON(failed_wr != &send->s_wr);
 	if (ret) {
 		printk(KERN_WARNING "RDS/IB: %s %d ib_post_send returned %d\n",
 			__func__, __LINE__, ret);
diff --git a/drivers/infiniband/ulp/rds/iw_send.c b/drivers/infiniband/ulp/rds/iw_send.c
index 13d01df..2186e4f 100644
--- a/drivers/infiniband/ulp/rds/iw_send.c
+++ b/drivers/infiniband/ulp/rds/iw_send.c
@@ -227,16 +227,6 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
 			break;
 		}
 
-		if (wc.opcode == IB_WC_LOCAL_INV && wc.wr_id == ~0) {
-			ic->i_fastreg_posted = 0;
-			continue;
-		}
-
-		if (wc.opcode == IB_WC_FAST_REG_MR && wc.wr_id == ~0) {
-			ic->i_fastreg_posted = 1;
-			continue;
-		}
-
 		if (wc.wr_id == RDS_IB_ACK_WR_ID) {
 			if (ic->i_ack_queued + HZ/2 < jiffies)
 				rds_ib_stats_inc(s_ib_tx_stalled);
@@ -254,9 +244,12 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
 			/* In the error case, wc.opcode sometimes contains garbage */
 			switch (send->s_wr.opcode) {
 			case IB_WR_SEND:
-				if (send->s_rm)
+				if (send->s_rm && ic->i_cm_id)
 					rds_ib_send_unmap_rm(ic, send, wc.status);
 				break;
+			case IB_WR_LOCAL_INV:
+				ic->i_fastreg_posted = 0;
+				break;
 			case IB_WR_FAST_REG_MR:
 			case IB_WR_RDMA_WRITE:
 			case IB_WR_RDMA_READ:


