[rds-devel] [PATCH future/20090207 2/3] RDS/IB+IW: Yet Another Stall (YAS :)
Steve Wise
swise at opengridcomputing.com
Tue Feb 10 14:18:15 PST 2009
I did send it.
Its patch 2 of 3 in my original series sent just before I replied to this..
Andy Grover wrote:
> Thanks. Can you resend without the ">" starting each line?
>
> -- Andy
>
> Steve Wise wrote:
>
>> This is the patch submitted earlier, but with the IB transport fix added.
>>
>>
>> Steve Wise wrote:
>>
>>> Found another flow control stall. I can reproduce it by running tkmivrpc
>>> on 3 nodes, 4 separate jobs of 4 senders and 4 receivers in each job
>>> (32 processes on each node).
>>>
>>> Intermittently, it will hit this stall between two of the three nodes:
>>>
>>> One node, the sender, looks like this:
>>> - pending RDS messages to send
>>> - transport send ring empty (due to no send credits)
>>> - 0 send credits
>>> - sq empty and no pending completions
>>> - no pending rq completions
>>>
>>> The other node, the receiver, looks like this:
>>> - No RDS messages to send
>>> - In the middle of a fragmented receive
>>> - No "IB_ACK_REQUESTED" flag set
>>> - lots of posted credits to advertise
>>> - sq empty and no pending completions
>>> - no pending rq completions
>>>
>>> So the connection is stalled until the receiver decides to send
>>> something.
>>>
>>> BTW: If you rds-ping from the receiver to the sender, it unstalls the
>>> sender because the rds-ping messages advertise credits and get things
>>> going again.
>>>
>>> Here's how the stall can happen:
>>>
>>> Time Event
>>> -----------------
>>> t1 rds_iw_xmit() grabs send and posted credits, and the posted
>>> count at this point is 0.
>>>
>>> t2 recv thread refills the recv ring, bumping up posted credits
>>> and sets IB_ACK_REQUESTED
>>>
>>> t3 rds_iw_xmit() resets ACK_REQUESTED because it will piggy back
>>> the ACK on outgoing data.
>>>
>>> t4 rds_iw_xmit posts send with 0 adv_credits from the original
>>> grab done at t1
>>>
>>> t5 send completion doesn't do anything special. Namely it doesn't
>>> attempt to advertise credits
>>>
>>> At this point, we can stall the peer because no more credits are
>>> advertised to the peer unless more outgoing data is transmitted.
>>>
>>> The solution is to refetch the posted credits when the ack is going to
>>> be piggy backed at time t3.
>>>
>>> Signed-off-by: Steve Wise <swise at opengridcomputing.com>
>>> ---
>>>
>>> net/rds/ib.h | 2 +-
>>> net/rds/ib_recv.c | 2 +-
>>> net/rds/ib_send.c | 20 +++++++++++++++++---
>>> net/rds/iw.h | 2 +-
>>> net/rds/iw_recv.c | 2 +-
>>> net/rds/iw_send.c | 20 +++++++++++++++++---
>>> net/rds/rds.h | 2 +-
>>> 7 files changed, 39 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/net/rds/ib.h b/net/rds/ib.h
>>> index eff70f0..8edb99f 100644
>>> --- a/net/rds/ib.h
>>> +++ b/net/rds/ib.h
>>> @@ -320,7 +320,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn,
>>> struct rds_rdma_op *op);
>>> void rds_ib_send_add_credits(struct rds_connection *conn, unsigned
>>> int credits);
>>> void rds_ib_advertise_credits(struct rds_connection *conn, unsigned
>>> int posted);
>>> int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted,
>>> - u32 *adv_credits);
>>> + u32 *adv_credits, int need_posted);
>>>
>>> /* ib_stats.c */
>>> DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats);
>>> diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
>>> index 537b2bf..45b569d 100644
>>> --- a/net/rds/ib_recv.c
>>> +++ b/net/rds/ib_recv.c
>>> @@ -524,7 +524,7 @@ void rds_ib_attempt_ack(struct rds_ib_connection *ic)
>>> }
>>>
>>> /* Can we get a send credit? */
>>> - if (!rds_ib_send_grab_credits(ic, 1, &adv_credits)) {
>>> + if (!rds_ib_send_grab_credits(ic, 1, &adv_credits, 0)) {
>>> rds_ib_stats_inc(s_ib_tx_throttle);
>>> clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
>>> return;
>>> diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
>>> index 3b41647..87f6616 100644
>>> --- a/net/rds/ib_send.c
>>> +++ b/net/rds/ib_send.c
>>> @@ -311,7 +311,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq,
>>> void *context)
>>> * and using atomic_cmpxchg when updating the two counters.
>>> */
>>> int rds_ib_send_grab_credits(struct rds_ib_connection *ic,
>>> - u32 wanted, u32 *adv_credits)
>>> + u32 wanted, u32 *adv_credits, int need_posted)
>>> {
>>> unsigned int avail, posted, got = 0, advertise;
>>> long oldval, newval;
>>> @@ -345,7 +345,12 @@ try_again:
>>> }
>>> newval -= IB_SET_SEND_CREDITS(got);
>>>
>>> - if (got && posted) {
>>> + /*
>>> + * If need_posted is non-zero, then the caller wants
>>> + * the posted regardless of whether any send credits are
>>> + * available.
>>> + */
>>> + if (posted && (got || need_posted)) {
>>> advertise = min_t(unsigned int, posted, RDS_MAX_ADV_CREDIT);
>>> newval -= IB_SET_POST_CREDITS(advertise);
>>> }
>>> @@ -467,6 +472,7 @@ int rds_ib_xmit(struct rds_connection *conn,
>>> struct rds_message *rm,
>>> u32 i;
>>> u32 work_alloc;
>>> u32 credit_alloc;
>>> + u32 posted;
>>> u32 adv_credits = 0;
>>> int send_flags = 0;
>>> int sent;
>>> @@ -492,7 +498,8 @@ int rds_ib_xmit(struct rds_connection *conn,
>>> struct rds_message *rm,
>>>
>>> credit_alloc = work_alloc;
>>> if (ic->i_flowctl) {
>>> - credit_alloc = rds_ib_send_grab_credits(ic, work_alloc,
>>> &adv_credits);
>>> + credit_alloc = rds_ib_send_grab_credits(ic, work_alloc,
>>> &posted, 0);
>>> + adv_credits += posted;
>>> if (credit_alloc < work_alloc) {
>>> rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc -
>>> credit_alloc);
>>> work_alloc = credit_alloc;
>>> @@ -560,6 +567,13 @@ int rds_ib_xmit(struct rds_connection *conn,
>>> struct rds_message *rm,
>>> * should call rds_ib_ring_alloc first. */
>>> rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_ib_piggyb_ack(ic));
>>> rds_message_make_checksum(&rm->m_inc.i_hdr);
>>> +
>>> + /*
>>> + * Update adv_credits since we reset the ACK_REQUIRED bit.
>>> + */
>>> + rds_ib_send_grab_credits(ic, 0, &posted, 1);
>>> + adv_credits += posted;
>>> + BUG_ON(adv_credits > 255);
>>> } else if (ic->i_rm != rm)
>>> BUG();
>>>
>>> diff --git a/net/rds/iw.h b/net/rds/iw.h
>>> index 600d57f..8b7f01b 100644
>>> --- a/net/rds/iw.h
>>> +++ b/net/rds/iw.h
>>> @@ -349,7 +349,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn,
>>> struct rds_rdma_op *op);
>>> void rds_iw_send_add_credits(struct rds_connection *conn, unsigned
>>> int credits);
>>> void rds_iw_advertise_credits(struct rds_connection *conn, unsigned
>>> int posted);
>>> int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted,
>>> - u32 *adv_credits);
>>> + u32 *adv_credits, int need_posted);
>>>
>>> /* ib_stats.c */
>>> DECLARE_PER_CPU(struct rds_iw_statistics, rds_iw_stats);
>>> diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c
>>> index 385f239..05bccc5 100644
>>> --- a/net/rds/iw_recv.c
>>> +++ b/net/rds/iw_recv.c
>>> @@ -524,7 +524,7 @@ void rds_iw_attempt_ack(struct rds_iw_connection *ic)
>>> }
>>>
>>> /* Can we get a send credit? */
>>> - if (!rds_iw_send_grab_credits(ic, 1, &adv_credits)) {
>>> + if (!rds_iw_send_grab_credits(ic, 1, &adv_credits, 0)) {
>>> rds_iw_stats_inc(s_iw_tx_throttle);
>>> clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
>>> return;
>>> diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c
>>> index 104662e..7b642cb 100644
>>> --- a/net/rds/iw_send.c
>>> +++ b/net/rds/iw_send.c
>>> @@ -351,7 +351,7 @@ void rds_iw_send_cq_comp_handler(struct ib_cq *cq,
>>> void *context)
>>> * and using atomic_cmpxchg when updating the two counters.
>>> */
>>> int rds_iw_send_grab_credits(struct rds_iw_connection *ic,
>>> - u32 wanted, u32 *adv_credits)
>>> + u32 wanted, u32 *adv_credits, int need_posted)
>>> {
>>> unsigned int avail, posted, got = 0, advertise;
>>> long oldval, newval;
>>> @@ -385,7 +385,12 @@ try_again:
>>> }
>>> newval -= IB_SET_SEND_CREDITS(got);
>>>
>>> - if (got && posted) {
>>> + /*
>>> + * If need_posted is non-zero, then the caller wants
>>> + * the posted regardless of whether any send credits are
>>> + * available.
>>> + */
>>> + if (posted && (got || need_posted)) {
>>> advertise = min_t(unsigned int, posted, RDS_MAX_ADV_CREDIT);
>>> newval -= IB_SET_POST_CREDITS(advertise);
>>> }
>>> @@ -507,6 +512,7 @@ int rds_iw_xmit(struct rds_connection *conn,
>>> struct rds_message *rm,
>>> u32 i;
>>> u32 work_alloc;
>>> u32 credit_alloc;
>>> + u32 posted;
>>> u32 adv_credits = 0;
>>> int send_flags = 0;
>>> int sent;
>>> @@ -539,7 +545,8 @@ int rds_iw_xmit(struct rds_connection *conn,
>>> struct rds_message *rm,
>>>
>>> credit_alloc = work_alloc;
>>> if (ic->i_flowctl) {
>>> - credit_alloc = rds_iw_send_grab_credits(ic, work_alloc,
>>> &adv_credits);
>>> + credit_alloc = rds_iw_send_grab_credits(ic, work_alloc,
>>> &posted, 0);
>>> + adv_credits += posted;
>>> if (credit_alloc < work_alloc) {
>>> rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc -
>>> credit_alloc);
>>> work_alloc = credit_alloc;
>>> @@ -607,6 +614,13 @@ int rds_iw_xmit(struct rds_connection *conn,
>>> struct rds_message *rm,
>>> * should call rds_iw_ring_alloc first. */
>>> rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_iw_piggyb_ack(ic));
>>> rds_message_make_checksum(&rm->m_inc.i_hdr);
>>> +
>>> + /*
>>> + * Update adv_credits since we reset the ACK_REQUIRED bit.
>>> + */
>>> + rds_iw_send_grab_credits(ic, 0, &posted, 1);
>>> + adv_credits += posted;
>>> + BUG_ON(adv_credits > 255);
>>> } else if (ic->i_rm != rm)
>>> BUG();
>>>
>>> diff --git a/net/rds/rds.h b/net/rds/rds.h
>>> index ea5b128..0a88b8e 100644
>>> --- a/net/rds/rds.h
>>> +++ b/net/rds/rds.h
>>> @@ -195,7 +195,7 @@ struct rds_connection {
>>> #define RDS_FLAG_CONG_BITMAP 0x01
>>> #define RDS_FLAG_ACK_REQUIRED 0x02
>>> #define RDS_FLAG_RETRANSMITTED 0x04
>>> -#define RDS_MAX_ADV_CREDIT 255
>>> +#define RDS_MAX_ADV_CREDIT 127
>>>
>>> /*
>>> * Maximum space available for extension headers.
>>>
>>>
>>> _______________________________________________
>>> rds-devel mailing list
>>> rds-devel at oss.oracle.com
>>> http://oss.oracle.com/mailman/listinfo/rds-devel
>>>
>>>
More information about the rds-devel
mailing list