[rds-devel] [PATCH future/20090207 2/3] RDS/IB+IW: Yet Another Stall (YAS :)

Andy Grover andy.grover at oracle.com
Tue Feb 10 13:35:15 PST 2009


Thanks. Can you resend without the ">" starting each line?

-- Andy

Steve Wise wrote:
> This is the patch submitted earlier, but with the IB transport fix added.
> 
> 
> Steve Wise wrote:
>> Found another flow control stall.  I can reproduce it by running tkmivrpc
>> on 3 nodes, 4 separate jobs of 4 senders and 4 receivers in each job
>> (32 processes on each node).
>>
>> Intermittently, it will hit this stall between two of the three nodes:
>>
>> One node, the sender, looks like this:
>> - pending RDS messages to send
>> - transport send ring empty (due to no send credits)
>> - 0 send credits
>> - sq empty and no pending completions
>> - no pending rq completions
>>
>> The other node, the receiver, looks like this:
>> - No RDS messages to send
>> - In the middle of a fragmented receive
>> - No "IB_ACK_REQUESTED" flag set
>> - lots of posted credits to advertise
>> - sq empty and no pending completions
>> - no pending rq completions
>>
>> So the connection is stalled until the receiver decides to send
>> something.
>>
>> BTW: If you rds-ping from the receiver to the sender, it unstalls the
>> sender because the rds-ping messages advertise credits and get things
>> going again.
>>
>> Here's how the stall can happen:
>>
>> Time     Event
>> -----------------
>> t1    rds_iw_xmit() grabs send and posted credits, and the posted
>>     count at this point is 0.
>>
>> t2    recv thread refills the recv ring, bumping up posted credits
>>     and sets IB_ACK_REQUESTED
>>
>> t3    rds_iw_xmit() resets ACK_REQUESTED because it will piggy back
>>     the ACK on outgoing data.
>>
>> t4    rds_iw_xmit posts send with 0 adv_credits from the original
>>     grab done at t1
>>
>> t5    send completion doesn't do anything special.  Namely it doesn't
>>     attempt to advertise credits
>>
>> At this point, we can stall the peer because no more credits are
>> advertised to the peer unless more outgoing data is transmitted.
>>
>> The solution is to refetch the posted credits when the ack is going to
>> be piggy backed at time t3.
>>
>> Signed-off-by: Steve Wise <swise at opengridcomputing.com>
>> ---
>>
>>  net/rds/ib.h      |    2 +-
>>  net/rds/ib_recv.c |    2 +-
>>  net/rds/ib_send.c |   20 +++++++++++++++++---
>>  net/rds/iw.h      |    2 +-
>>  net/rds/iw_recv.c |    2 +-
>>  net/rds/iw_send.c |   20 +++++++++++++++++---
>>  net/rds/rds.h     |    2 +-
>>  7 files changed, 39 insertions(+), 11 deletions(-)
>>
>> diff --git a/net/rds/ib.h b/net/rds/ib.h
>> index eff70f0..8edb99f 100644
>> --- a/net/rds/ib.h
>> +++ b/net/rds/ib.h
>> @@ -320,7 +320,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn,
>> struct rds_rdma_op *op);
>>  void rds_ib_send_add_credits(struct rds_connection *conn, unsigned
>> int credits);
>>  void rds_ib_advertise_credits(struct rds_connection *conn, unsigned
>> int posted);
>>  int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted,
>> -                 u32 *adv_credits);
>> +                 u32 *adv_credits, int need_posted);
>>  
>>  /* ib_stats.c */
>>  DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats);
>> diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
>> index 537b2bf..45b569d 100644
>> --- a/net/rds/ib_recv.c
>> +++ b/net/rds/ib_recv.c
>> @@ -524,7 +524,7 @@ void rds_ib_attempt_ack(struct rds_ib_connection *ic)
>>      }
>>  
>>      /* Can we get a send credit? */
>> -    if (!rds_ib_send_grab_credits(ic, 1, &adv_credits)) {
>> +    if (!rds_ib_send_grab_credits(ic, 1, &adv_credits, 0)) {
>>          rds_ib_stats_inc(s_ib_tx_throttle);
>>          clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
>>          return;
>> diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
>> index 3b41647..87f6616 100644
>> --- a/net/rds/ib_send.c
>> +++ b/net/rds/ib_send.c
>> @@ -311,7 +311,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq,
>> void *context)
>>   * and using atomic_cmpxchg when updating the two counters.
>>   */
>>  int rds_ib_send_grab_credits(struct rds_ib_connection *ic,
>> -                 u32 wanted, u32 *adv_credits)
>> +                 u32 wanted, u32 *adv_credits, int need_posted)
>>  {
>>      unsigned int avail, posted, got = 0, advertise;
>>      long oldval, newval;
>> @@ -345,7 +345,12 @@ try_again:
>>      }
>>      newval -= IB_SET_SEND_CREDITS(got);
>>  
>> -    if (got && posted) {
>> +    /*
>> +     * If need_posted is non-zero, then the caller wants
>> +     * the posted regardless of whether any send credits are
>> +     * available.
>> +     */
>> +    if (posted && (got || need_posted)) {
>>          advertise = min_t(unsigned int, posted, RDS_MAX_ADV_CREDIT);
>>          newval -= IB_SET_POST_CREDITS(advertise);
>>      }
>> @@ -467,6 +472,7 @@ int rds_ib_xmit(struct rds_connection *conn,
>> struct rds_message *rm,
>>      u32 i;
>>      u32 work_alloc;
>>      u32 credit_alloc;
>> +    u32 posted;
>>      u32 adv_credits = 0;
>>      int send_flags = 0;
>>      int sent;
>> @@ -492,7 +498,8 @@ int rds_ib_xmit(struct rds_connection *conn,
>> struct rds_message *rm,
>>  
>>      credit_alloc = work_alloc;
>>      if (ic->i_flowctl) {
>> -        credit_alloc = rds_ib_send_grab_credits(ic, work_alloc,
>> &adv_credits);
>> +        credit_alloc = rds_ib_send_grab_credits(ic, work_alloc,
>> &posted, 0);
>> +        adv_credits += posted;
>>          if (credit_alloc < work_alloc) {
>>              rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc -
>> credit_alloc);
>>              work_alloc = credit_alloc;
>> @@ -560,6 +567,13 @@ int rds_ib_xmit(struct rds_connection *conn,
>> struct rds_message *rm,
>>           * should call rds_ib_ring_alloc first. */
>>          rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_ib_piggyb_ack(ic));
>>          rds_message_make_checksum(&rm->m_inc.i_hdr);
>> +
>> +        /*
>> +         * Update adv_credits since we reset the ACK_REQUIRED bit.
>> +         */
>> +        rds_ib_send_grab_credits(ic, 0, &posted, 1);
>> +        adv_credits += posted;
>> +        BUG_ON(adv_credits > 255);
>>      } else if (ic->i_rm != rm)
>>          BUG();
>>  
>> diff --git a/net/rds/iw.h b/net/rds/iw.h
>> index 600d57f..8b7f01b 100644
>> --- a/net/rds/iw.h
>> +++ b/net/rds/iw.h
>> @@ -349,7 +349,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn,
>> struct rds_rdma_op *op);
>>  void rds_iw_send_add_credits(struct rds_connection *conn, unsigned
>> int credits);
>>  void rds_iw_advertise_credits(struct rds_connection *conn, unsigned
>> int posted);
>>  int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted,
>> -                 u32 *adv_credits);
>> +                 u32 *adv_credits, int need_posted);
>>  
>>  /* ib_stats.c */
>>  DECLARE_PER_CPU(struct rds_iw_statistics, rds_iw_stats);
>> diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c
>> index 385f239..05bccc5 100644
>> --- a/net/rds/iw_recv.c
>> +++ b/net/rds/iw_recv.c
>> @@ -524,7 +524,7 @@ void rds_iw_attempt_ack(struct rds_iw_connection *ic)
>>      }
>>  
>>      /* Can we get a send credit? */
>> -    if (!rds_iw_send_grab_credits(ic, 1, &adv_credits)) {
>> +    if (!rds_iw_send_grab_credits(ic, 1, &adv_credits, 0)) {
>>          rds_iw_stats_inc(s_iw_tx_throttle);
>>          clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
>>          return;
>> diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c
>> index 104662e..7b642cb 100644
>> --- a/net/rds/iw_send.c
>> +++ b/net/rds/iw_send.c
>> @@ -351,7 +351,7 @@ void rds_iw_send_cq_comp_handler(struct ib_cq *cq,
>> void *context)
>>   * and using atomic_cmpxchg when updating the two counters.
>>   */
>>  int rds_iw_send_grab_credits(struct rds_iw_connection *ic,
>> -                 u32 wanted, u32 *adv_credits)
>> +                 u32 wanted, u32 *adv_credits, int need_posted)
>>  {
>>      unsigned int avail, posted, got = 0, advertise;
>>      long oldval, newval;
>> @@ -385,7 +385,12 @@ try_again:
>>      }
>>      newval -= IB_SET_SEND_CREDITS(got);
>>  
>> -    if (got && posted) {
>> +    /*
>> +     * If need_posted is non-zero, then the caller wants
>> +     * the posted regardless of whether any send credits are
>> +     * available.
>> +     */
>> +    if (posted && (got || need_posted)) {
>>          advertise = min_t(unsigned int, posted, RDS_MAX_ADV_CREDIT);
>>          newval -= IB_SET_POST_CREDITS(advertise);
>>      }
>> @@ -507,6 +512,7 @@ int rds_iw_xmit(struct rds_connection *conn,
>> struct rds_message *rm,
>>      u32 i;
>>      u32 work_alloc;
>>      u32 credit_alloc;
>> +    u32 posted;
>>      u32 adv_credits = 0;
>>      int send_flags = 0;
>>      int sent;
>> @@ -539,7 +545,8 @@ int rds_iw_xmit(struct rds_connection *conn,
>> struct rds_message *rm,
>>  
>>      credit_alloc = work_alloc;
>>      if (ic->i_flowctl) {
>> -        credit_alloc = rds_iw_send_grab_credits(ic, work_alloc,
>> &adv_credits);
>> +        credit_alloc = rds_iw_send_grab_credits(ic, work_alloc,
>> &posted, 0);
>> +        adv_credits += posted;
>>          if (credit_alloc < work_alloc) {
>>              rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc -
>> credit_alloc);
>>              work_alloc = credit_alloc;
>> @@ -607,6 +614,13 @@ int rds_iw_xmit(struct rds_connection *conn,
>> struct rds_message *rm,
>>           * should call rds_iw_ring_alloc first. */
>>          rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_iw_piggyb_ack(ic));
>>          rds_message_make_checksum(&rm->m_inc.i_hdr);
>> +   
>> +        /*
>> +         * Update adv_credits since we reset the ACK_REQUIRED bit.
>> +         */
>> +        rds_iw_send_grab_credits(ic, 0, &posted, 1);
>> +        adv_credits += posted;
>> +        BUG_ON(adv_credits > 255);
>>      } else if (ic->i_rm != rm)
>>          BUG();
>>  
>> diff --git a/net/rds/rds.h b/net/rds/rds.h
>> index ea5b128..0a88b8e 100644
>> --- a/net/rds/rds.h
>> +++ b/net/rds/rds.h
>> @@ -195,7 +195,7 @@ struct rds_connection {
>>  #define RDS_FLAG_CONG_BITMAP    0x01
>>  #define RDS_FLAG_ACK_REQUIRED    0x02
>>  #define RDS_FLAG_RETRANSMITTED    0x04
>> -#define RDS_MAX_ADV_CREDIT    255
>> +#define RDS_MAX_ADV_CREDIT    127
>>  
>>  /*
>>   * Maximum space available for extension headers.
>>
>>
>> _______________________________________________
>> rds-devel mailing list
>> rds-devel at oss.oracle.com
>> http://oss.oracle.com/mailman/listinfo/rds-devel
>>   
> 




More information about the rds-devel mailing list