[rds-devel] Re: Transmit RDS protocol version in first packet

Or Gerlitz ogerlitz at voltaire.com
Sun Jan 13 06:53:51 PST 2008


On Sun, 13 Jan 2008, Or Gerlitz wrote:

Hi Olaf,

> From: Olaf Kirch <olaf.kirch at oracle.com>
> Date: Thu, 10 Jan 2008 10:01:41 +0100
> Subject: [PATCH 14/14] Transmit RDS protocol version in first packet
> To: rds-devel at oracle.com
> Cc: vlad at mellanox.co.il
>
> We need to prepare the code for future version changes, and
> add the ability to do rolling upgrades of a cluster.
>
> This patch adds an extension header to every congestion update
> that is being sent - the first packet that happens to be sent
> is always a cong update. This header includes the RDS protocol
> version we implement.

IB has a very useful builtin mechanism for exchanging information during
the connection esablishment - i.e the private date which you can provide
rdma_connect/rdma_accept with. So instead of initiating the version
send in rds_ib_xmit_cong_map you could do that before rdma_connect and
verify it where the connection request is processed and not on every call
to rds_ib_cong_recv, etc.

Or

> In rds_ib_cong_recv, we compare the received version number
> with what we support.
>
> The current code supports only one version number (3.0).
> Future changes need to add the relevant plumbing to enable
> backward compatibility if they discover that the peer they're
> talking to only understands an older protocol rev.
>
> Signed-off-by: Olaf Kirch <olaf.kirch at oracle.com>
> ---
>  net/rds/connection.c |    1 +
>  net/rds/ib_recv.c    |   23 +++++++++++++++++++++++
>  net/rds/ib_send.c    |    6 +++++-
>  net/rds/message.c    |   21 +++++++++++++++++++++
>  net/rds/rds.h        |   25 +++++++++++++++++++++++++
>  net/rds/recv.c       |    1 +
>  6 files changed, 76 insertions(+), 1 deletions(-)
>
> diff --git a/net/rds/connection.c b/net/rds/connection.c
> index b5b6cd9..885e547 100644
> --- a/net/rds/connection.c
> +++ b/net/rds/connection.c
> @@ -168,6 +168,7 @@ struct rds_connection *rds_conn_create(_
>  	conn->c_map_queued = 0;
>  	conn->c_map_offset = 0;
>  	conn->c_map_bytes = 0;
> +	conn->c_version = 0;
>
>  	atomic_set(&conn->c_barrier_hwm, 0);
>
> diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
> index b8ef9af..d311e12 100644
> --- a/net/rds/ib_recv.c
> +++ b/net/rds/ib_recv.c
> @@ -588,12 +588,35 @@ static void rds_ib_cong_recv(struct rds_
>   	unsigned long frag_off;
>  	unsigned long to_copy;
>  	unsigned long copied;
> +	unsigned int version;
>  	void *addr;
>
>  	/* catch completely corrupt packets */
>  	if (be32_to_cpu(ibinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
>  		return;
>
> +	/* Check if the message has a version extension.
> +	 * This is where future changes in the protocol need to
> +	 * enable/disable backward compatibility hacks
> +	 */
> +	if (rds_message_get_version_extension(&ibinc->ii_inc.i_hdr, &version)
> +	 && conn->c_version == 0) {
> +		if (version > RDS_PROTOCOL_VERSION) {
> +			printk(KERN_NOTICE "RDS/IB: Pretty amazing new stuff detected! "
> +					"Peer advertises protocol ver%u.%u\n",
> +					RDS_PROTOCOL_MAJOR(version),
> +					RDS_PROTOCOL_MINOR(version));
> +		} else if (version < RDS_PROTOCOL_VERSION) {
> +			printk(KERN_NOTICE "RDS/IB: Peer advertises older protocol ver%u.%u, "
> +					"Enabling backward compatibility.\n",
> +					RDS_PROTOCOL_MAJOR(version),
> +					RDS_PROTOCOL_MINOR(version));
> +
> +			/* enable backward compat for this conn */
> +		}
> +		conn->c_version = version;
> +	}
> +
>  	map = conn->c_fcong;
>  	map_page = 0;
>  	map_off = 0;
> diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
> index 8ce5cb0..ec7ef54 100644
> --- a/net/rds/ib_send.c
> +++ b/net/rds/ib_send.c
> @@ -256,12 +256,16 @@ int rds_ib_xmit_cong_map(struct rds_conn
>  			off = 0;
>  		}
>
> -		/* build the header and include it in the wr */
> +		/* build the header and include it in the wr.
> +		 * Include the protocol version so that the peer knows
> +		 * which version to deal with (this is for rolling updates
> +		 * in the future) */
>  		hdr = &ic->i_send_hdrs[pos];
>  		memset(hdr, 0, sizeof(struct rds_header));
>  		hdr->h_flags = RDS_FLAG_CONG_BITMAP;
>  		hdr->h_len = cpu_to_be32(RDS_CONG_MAP_BYTES);
>  		hdr->h_ack = cpu_to_be64(rds_ib_piggyb_ack(ic));
> +		rds_message_add_version_extension(hdr, RDS_PROTOCOL_VERSION);
>  		rds_message_make_checksum(hdr);
>  		send->s_wr.num_sge = 2;
>
> diff --git a/net/rds/message.c b/net/rds/message.c
> index d5a6d62..e4e0532 100644
> --- a/net/rds/message.c
> +++ b/net/rds/message.c
> @@ -37,6 +37,7 @@
>
>  static unsigned int	rds_exthdr_size[__RDS_EXTHDR_MAX] = {
>  [RDS_EXTHDR_NONE]	= 0,
> +[RDS_EXTHDR_VERSION]	= sizeof(struct rds_ext_header_version),
>  [RDS_EXTHDR_RDMA]	= sizeof(struct rds_ext_header_rdma),
>  };
>
> @@ -173,6 +174,26 @@ none:
>  	return RDS_EXTHDR_NONE;
>  }
>
> +int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version)
> +{
> +	struct rds_ext_header_version ext_hdr;
> +
> +	ext_hdr.h_version = cpu_to_be32(version);
> +	return rds_message_add_extension(hdr, RDS_EXTHDR_VERSION, &ext_hdr, sizeof(ext_hdr));
> +}
> +
> +int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version)
> +{
> +	struct rds_ext_header_version ext_hdr;
> +	unsigned int pos = 0, len = sizeof(ext_hdr);
> +
> +	/* We assume the version extension is the only one present */
> +	if (rds_message_next_extension(hdr, &pos, &ext_hdr, &len) != RDS_EXTHDR_VERSION)
> +		return 0;
> +	*version = be32_to_cpu(ext_hdr.h_version);
> +	return 1;
> +}
> +
>  struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp)
>  {
>  	struct rds_message *rm;
> diff --git a/net/rds/rds.h b/net/rds/rds.h
> index 065294d..d0e73a7 100644
> --- a/net/rds/rds.h
> +++ b/net/rds/rds.h
> @@ -8,6 +8,14 @@
>  #include <linux/mutex.h>
>
>  /*
> + * RDS Network protocol version
> + */
> +#define RDS_PROTOCOL_3_0	0x0300
> +#define RDS_PROTOCOL_VERSION	RDS_PROTOCOL_3_0
> +#define RDS_PROTOCOL_MAJOR(v)	((v) >> 8)
> +#define RDS_PROTOCOL_MINOR(v)	((v) & 255)
> +
> +/*
>   * XXX randomly chosen, but at least seems to be unused:
>   * #               18464-18768 Unassigned
>   * We should do better.  We want a reserved port to discourage unpriv'ed
> @@ -157,6 +165,9 @@ struct rds_connection {
>  	unsigned int		c_unacked_bytes;
>
>  	atomic_t		c_barrier_hwm;
> +
> +	/* Protocol version */
> +	unsigned int		c_version;
>  };
>
>  #define RDS_FLAG_CONG_BITMAP	0x01
> @@ -187,6 +198,18 @@ struct rds_header {
>  #define RDS_EXTHDR_NONE		0
>
>  /*
> + * This extension header is included in the very
> + * first message that is sent on a new connection,
> + * and identifies the protocol level. This will help
> + * rolling updates if a future change requires breaking
> + * the protocol.
> + */
> +#define RDS_EXTHDR_VERSION	1
> +struct rds_ext_header_version {
> +	__be32			h_version;
> +};
> +
> +/*
>   * This extension header is included in the RDS message
>   * chasing an RDMA operation.
>   */
> @@ -517,6 +540,8 @@ int rds_message_add_extension(struct rds
>  		                unsigned int type, const void *data, unsigned int len);
>  int rds_message_next_extension(struct rds_header *hdr,
>  		                unsigned int *pos, void *buf, unsigned int *buflen);
> +int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version);
> +int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version);
>  int rds_message_inc_copy_to_user(struct rds_incoming *inc,
>  				 struct iovec *first_iov, size_t size);
>  void rds_message_inc_purge(struct rds_incoming *inc);
> diff --git a/net/rds/recv.c b/net/rds/recv.c
> index a8ccc4b..c2fae54 100644
> --- a/net/rds/recv.c
> +++ b/net/rds/recv.c
> @@ -102,6 +102,7 @@ static void rds_recv_incoming_exthdrs(st
>  	struct rds_header *hdr = &inc->i_hdr;
>  	unsigned int pos = 0, type, len;
>  	union {
> +		struct rds_ext_header_version version;
>  		struct rds_ext_header_rdma rdma;
>  	} buffer;
>
> --
> 1.4.4
>
>



More information about the rds-devel mailing list