[rds-devel] [PATCH future/20090207 2/3] RDS/IB+IW: Yet Another Stall (YAS :)

Steve Wise swise at opengridcomputing.com
Tue Feb 10 12:07:38 PST 2009


Found another flow control stall.  I can reproduce it by running tkmivrpc
on 3 nodes, 4 separate jobs of 4 senders and 4 receivers in each job
(32 processes on each node).

Intermittently, it will hit this stall between two of the three nodes:

One node, the sender, looks like this:
- pending RDS messages to send
- transport send ring empty (due to no send credits)
- 0 send credits
- sq empty and no pending completions
- no pending rq completions

The other node, the receiver, looks like this:
- No RDS messages to send
- In the middle of a fragmented receive
- No "IB_ACK_REQUESTED" flag set
- lots of posted credits to advertise
- sq empty and no pending completions
- no pending rq completions

So the connection is stalled until the receiver decides to send something.

BTW: If you rds-ping from the receiver to the sender, it unstalls the
sender because the rds-ping messages advertise credits and get things
going again.

Here's how the stall can happen:

Time 	Event
-----------------
t1	rds_iw_xmit() grabs send and posted credits, and the posted
	count at this point is 0.

t2	recv thread refills the recv ring, bumping up posted credits
	and sets IB_ACK_REQUESTED

t3	rds_iw_xmit() resets ACK_REQUESTED because it will piggy back
	the ACK on outgoing data.

t4	rds_iw_xmit posts send with 0 adv_credits from the original
	grab done at t1

t5	send completion doesn't do anything special.  Namely it doesn't
	attempt to advertise credits

At this point, we can stall the peer because no more credits are
advertised to the peer unless more outgoing data is transmitted.

The solution is to refetch the posted credits when the ack is going to
be piggy backed at time t3.

Signed-off-by: Steve Wise <swise at opengridcomputing.com>
---

 net/rds/ib.h      |    2 +-
 net/rds/ib_recv.c |    2 +-
 net/rds/ib_send.c |   20 +++++++++++++++++---
 net/rds/iw.h      |    2 +-
 net/rds/iw_recv.c |    2 +-
 net/rds/iw_send.c |   20 +++++++++++++++++---
 net/rds/rds.h     |    2 +-
 7 files changed, 39 insertions(+), 11 deletions(-)

diff --git a/net/rds/ib.h b/net/rds/ib.h
index eff70f0..8edb99f 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -320,7 +320,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op);
 void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits);
 void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted);
 int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted,
-			     u32 *adv_credits);
+			     u32 *adv_credits, int need_posted);
 
 /* ib_stats.c */
 DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats);
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 537b2bf..45b569d 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -524,7 +524,7 @@ void rds_ib_attempt_ack(struct rds_ib_connection *ic)
 	}
 
 	/* Can we get a send credit? */
-	if (!rds_ib_send_grab_credits(ic, 1, &adv_credits)) {
+	if (!rds_ib_send_grab_credits(ic, 1, &adv_credits, 0)) {
 		rds_ib_stats_inc(s_ib_tx_throttle);
 		clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
 		return;
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 3b41647..87f6616 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -311,7 +311,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
  * and using atomic_cmpxchg when updating the two counters.
  */
 int rds_ib_send_grab_credits(struct rds_ib_connection *ic,
-			     u32 wanted, u32 *adv_credits)
+			     u32 wanted, u32 *adv_credits, int need_posted)
 {
 	unsigned int avail, posted, got = 0, advertise;
 	long oldval, newval;
@@ -345,7 +345,12 @@ try_again:
 	}
 	newval -= IB_SET_SEND_CREDITS(got);
 
-	if (got && posted) {
+	/*
+	 * If need_posted is non-zero, then the caller wants
+	 * the posted regardless of whether any send credits are
+	 * available.
+	 */
+	if (posted && (got || need_posted)) {
 		advertise = min_t(unsigned int, posted, RDS_MAX_ADV_CREDIT);
 		newval -= IB_SET_POST_CREDITS(advertise);
 	}
@@ -467,6 +472,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 	u32 i;
 	u32 work_alloc;
 	u32 credit_alloc;
+	u32 posted;
 	u32 adv_credits = 0;
 	int send_flags = 0;
 	int sent;
@@ -492,7 +498,8 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 
 	credit_alloc = work_alloc;
 	if (ic->i_flowctl) {
-		credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &adv_credits);
+		credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0);
+		adv_credits += posted;
 		if (credit_alloc < work_alloc) {
 			rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
 			work_alloc = credit_alloc;
@@ -560,6 +567,13 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 		 * should call rds_ib_ring_alloc first. */
 		rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_ib_piggyb_ack(ic));
 		rds_message_make_checksum(&rm->m_inc.i_hdr);
+
+		/*
+		 * Update adv_credits since we reset the ACK_REQUIRED bit.
+		 */
+		rds_ib_send_grab_credits(ic, 0, &posted, 1);
+		adv_credits += posted;
+		BUG_ON(adv_credits > 255);
 	} else if (ic->i_rm != rm)
 		BUG();
 
diff --git a/net/rds/iw.h b/net/rds/iw.h
index 600d57f..8b7f01b 100644
--- a/net/rds/iw.h
+++ b/net/rds/iw.h
@@ -349,7 +349,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op);
 void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits);
 void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted);
 int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted,
-			     u32 *adv_credits);
+			     u32 *adv_credits, int need_posted);
 
 /* ib_stats.c */
 DECLARE_PER_CPU(struct rds_iw_statistics, rds_iw_stats);
diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c
index 385f239..05bccc5 100644
--- a/net/rds/iw_recv.c
+++ b/net/rds/iw_recv.c
@@ -524,7 +524,7 @@ void rds_iw_attempt_ack(struct rds_iw_connection *ic)
 	}
 
 	/* Can we get a send credit? */
-	if (!rds_iw_send_grab_credits(ic, 1, &adv_credits)) {
+	if (!rds_iw_send_grab_credits(ic, 1, &adv_credits, 0)) {
 		rds_iw_stats_inc(s_iw_tx_throttle);
 		clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
 		return;
diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c
index 104662e..7b642cb 100644
--- a/net/rds/iw_send.c
+++ b/net/rds/iw_send.c
@@ -351,7 +351,7 @@ void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context)
  * and using atomic_cmpxchg when updating the two counters.
  */
 int rds_iw_send_grab_credits(struct rds_iw_connection *ic,
-			     u32 wanted, u32 *adv_credits)
+			     u32 wanted, u32 *adv_credits, int need_posted)
 {
 	unsigned int avail, posted, got = 0, advertise;
 	long oldval, newval;
@@ -385,7 +385,12 @@ try_again:
 	}
 	newval -= IB_SET_SEND_CREDITS(got);
 
-	if (got && posted) {
+	/*
+	 * If need_posted is non-zero, then the caller wants
+	 * the posted regardless of whether any send credits are
+	 * available.
+	 */
+	if (posted && (got || need_posted)) {
 		advertise = min_t(unsigned int, posted, RDS_MAX_ADV_CREDIT);
 		newval -= IB_SET_POST_CREDITS(advertise);
 	}
@@ -507,6 +512,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
 	u32 i;
 	u32 work_alloc;
 	u32 credit_alloc;
+	u32 posted;
 	u32 adv_credits = 0;
 	int send_flags = 0;
 	int sent;
@@ -539,7 +545,8 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
 
 	credit_alloc = work_alloc;
 	if (ic->i_flowctl) {
-		credit_alloc = rds_iw_send_grab_credits(ic, work_alloc, &adv_credits);
+		credit_alloc = rds_iw_send_grab_credits(ic, work_alloc, &posted, 0);
+		adv_credits += posted;
 		if (credit_alloc < work_alloc) {
 			rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
 			work_alloc = credit_alloc;
@@ -607,6 +614,13 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
 		 * should call rds_iw_ring_alloc first. */
 		rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_iw_piggyb_ack(ic));
 		rds_message_make_checksum(&rm->m_inc.i_hdr);
+	
+		/*
+		 * Update adv_credits since we reset the ACK_REQUIRED bit.
+		 */
+		rds_iw_send_grab_credits(ic, 0, &posted, 1);
+		adv_credits += posted;
+		BUG_ON(adv_credits > 255);
 	} else if (ic->i_rm != rm)
 		BUG();
 
diff --git a/net/rds/rds.h b/net/rds/rds.h
index ea5b128..0a88b8e 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -195,7 +195,7 @@ struct rds_connection {
 #define RDS_FLAG_CONG_BITMAP	0x01
 #define RDS_FLAG_ACK_REQUIRED	0x02
 #define RDS_FLAG_RETRANSMITTED	0x04
-#define RDS_MAX_ADV_CREDIT	255
+#define RDS_MAX_ADV_CREDIT	127
 
 /*
  * Maximum space available for extension headers.




More information about the rds-devel mailing list