[rds-devel] Re: Transmit RDS protocol version in first packet
Or Gerlitz
ogerlitz at voltaire.com
Sun Jan 13 06:53:51 PST 2008
On Sun, 13 Jan 2008, Or Gerlitz wrote:
Hi Olaf,
> From: Olaf Kirch <olaf.kirch at oracle.com>
> Date: Thu, 10 Jan 2008 10:01:41 +0100
> Subject: [PATCH 14/14] Transmit RDS protocol version in first packet
> To: rds-devel at oracle.com
> Cc: vlad at mellanox.co.il
>
> We need to prepare the code for future version changes, and
> add the ability to do rolling upgrades of a cluster.
>
> This patch adds an extension header to every congestion update
> that is being sent - the first packet that happens to be sent
> is always a cong update. This header includes the RDS protocol
> version we implement.
IB has a very useful builtin mechanism for exchanging information during
the connection esablishment - i.e the private date which you can provide
rdma_connect/rdma_accept with. So instead of initiating the version
send in rds_ib_xmit_cong_map you could do that before rdma_connect and
verify it where the connection request is processed and not on every call
to rds_ib_cong_recv, etc.
Or
> In rds_ib_cong_recv, we compare the received version number
> with what we support.
>
> The current code supports only one version number (3.0).
> Future changes need to add the relevant plumbing to enable
> backward compatibility if they discover that the peer they're
> talking to only understands an older protocol rev.
>
> Signed-off-by: Olaf Kirch <olaf.kirch at oracle.com>
> ---
> net/rds/connection.c | 1 +
> net/rds/ib_recv.c | 23 +++++++++++++++++++++++
> net/rds/ib_send.c | 6 +++++-
> net/rds/message.c | 21 +++++++++++++++++++++
> net/rds/rds.h | 25 +++++++++++++++++++++++++
> net/rds/recv.c | 1 +
> 6 files changed, 76 insertions(+), 1 deletions(-)
>
> diff --git a/net/rds/connection.c b/net/rds/connection.c
> index b5b6cd9..885e547 100644
> --- a/net/rds/connection.c
> +++ b/net/rds/connection.c
> @@ -168,6 +168,7 @@ struct rds_connection *rds_conn_create(_
> conn->c_map_queued = 0;
> conn->c_map_offset = 0;
> conn->c_map_bytes = 0;
> + conn->c_version = 0;
>
> atomic_set(&conn->c_barrier_hwm, 0);
>
> diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
> index b8ef9af..d311e12 100644
> --- a/net/rds/ib_recv.c
> +++ b/net/rds/ib_recv.c
> @@ -588,12 +588,35 @@ static void rds_ib_cong_recv(struct rds_
> unsigned long frag_off;
> unsigned long to_copy;
> unsigned long copied;
> + unsigned int version;
> void *addr;
>
> /* catch completely corrupt packets */
> if (be32_to_cpu(ibinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
> return;
>
> + /* Check if the message has a version extension.
> + * This is where future changes in the protocol need to
> + * enable/disable backward compatibility hacks
> + */
> + if (rds_message_get_version_extension(&ibinc->ii_inc.i_hdr, &version)
> + && conn->c_version == 0) {
> + if (version > RDS_PROTOCOL_VERSION) {
> + printk(KERN_NOTICE "RDS/IB: Pretty amazing new stuff detected! "
> + "Peer advertises protocol ver%u.%u\n",
> + RDS_PROTOCOL_MAJOR(version),
> + RDS_PROTOCOL_MINOR(version));
> + } else if (version < RDS_PROTOCOL_VERSION) {
> + printk(KERN_NOTICE "RDS/IB: Peer advertises older protocol ver%u.%u, "
> + "Enabling backward compatibility.\n",
> + RDS_PROTOCOL_MAJOR(version),
> + RDS_PROTOCOL_MINOR(version));
> +
> + /* enable backward compat for this conn */
> + }
> + conn->c_version = version;
> + }
> +
> map = conn->c_fcong;
> map_page = 0;
> map_off = 0;
> diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
> index 8ce5cb0..ec7ef54 100644
> --- a/net/rds/ib_send.c
> +++ b/net/rds/ib_send.c
> @@ -256,12 +256,16 @@ int rds_ib_xmit_cong_map(struct rds_conn
> off = 0;
> }
>
> - /* build the header and include it in the wr */
> + /* build the header and include it in the wr.
> + * Include the protocol version so that the peer knows
> + * which version to deal with (this is for rolling updates
> + * in the future) */
> hdr = &ic->i_send_hdrs[pos];
> memset(hdr, 0, sizeof(struct rds_header));
> hdr->h_flags = RDS_FLAG_CONG_BITMAP;
> hdr->h_len = cpu_to_be32(RDS_CONG_MAP_BYTES);
> hdr->h_ack = cpu_to_be64(rds_ib_piggyb_ack(ic));
> + rds_message_add_version_extension(hdr, RDS_PROTOCOL_VERSION);
> rds_message_make_checksum(hdr);
> send->s_wr.num_sge = 2;
>
> diff --git a/net/rds/message.c b/net/rds/message.c
> index d5a6d62..e4e0532 100644
> --- a/net/rds/message.c
> +++ b/net/rds/message.c
> @@ -37,6 +37,7 @@
>
> static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = {
> [RDS_EXTHDR_NONE] = 0,
> +[RDS_EXTHDR_VERSION] = sizeof(struct rds_ext_header_version),
> [RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma),
> };
>
> @@ -173,6 +174,26 @@ none:
> return RDS_EXTHDR_NONE;
> }
>
> +int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version)
> +{
> + struct rds_ext_header_version ext_hdr;
> +
> + ext_hdr.h_version = cpu_to_be32(version);
> + return rds_message_add_extension(hdr, RDS_EXTHDR_VERSION, &ext_hdr, sizeof(ext_hdr));
> +}
> +
> +int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version)
> +{
> + struct rds_ext_header_version ext_hdr;
> + unsigned int pos = 0, len = sizeof(ext_hdr);
> +
> + /* We assume the version extension is the only one present */
> + if (rds_message_next_extension(hdr, &pos, &ext_hdr, &len) != RDS_EXTHDR_VERSION)
> + return 0;
> + *version = be32_to_cpu(ext_hdr.h_version);
> + return 1;
> +}
> +
> struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp)
> {
> struct rds_message *rm;
> diff --git a/net/rds/rds.h b/net/rds/rds.h
> index 065294d..d0e73a7 100644
> --- a/net/rds/rds.h
> +++ b/net/rds/rds.h
> @@ -8,6 +8,14 @@
> #include <linux/mutex.h>
>
> /*
> + * RDS Network protocol version
> + */
> +#define RDS_PROTOCOL_3_0 0x0300
> +#define RDS_PROTOCOL_VERSION RDS_PROTOCOL_3_0
> +#define RDS_PROTOCOL_MAJOR(v) ((v) >> 8)
> +#define RDS_PROTOCOL_MINOR(v) ((v) & 255)
> +
> +/*
> * XXX randomly chosen, but at least seems to be unused:
> * # 18464-18768 Unassigned
> * We should do better. We want a reserved port to discourage unpriv'ed
> @@ -157,6 +165,9 @@ struct rds_connection {
> unsigned int c_unacked_bytes;
>
> atomic_t c_barrier_hwm;
> +
> + /* Protocol version */
> + unsigned int c_version;
> };
>
> #define RDS_FLAG_CONG_BITMAP 0x01
> @@ -187,6 +198,18 @@ struct rds_header {
> #define RDS_EXTHDR_NONE 0
>
> /*
> + * This extension header is included in the very
> + * first message that is sent on a new connection,
> + * and identifies the protocol level. This will help
> + * rolling updates if a future change requires breaking
> + * the protocol.
> + */
> +#define RDS_EXTHDR_VERSION 1
> +struct rds_ext_header_version {
> + __be32 h_version;
> +};
> +
> +/*
> * This extension header is included in the RDS message
> * chasing an RDMA operation.
> */
> @@ -517,6 +540,8 @@ int rds_message_add_extension(struct rds
> unsigned int type, const void *data, unsigned int len);
> int rds_message_next_extension(struct rds_header *hdr,
> unsigned int *pos, void *buf, unsigned int *buflen);
> +int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version);
> +int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version);
> int rds_message_inc_copy_to_user(struct rds_incoming *inc,
> struct iovec *first_iov, size_t size);
> void rds_message_inc_purge(struct rds_incoming *inc);
> diff --git a/net/rds/recv.c b/net/rds/recv.c
> index a8ccc4b..c2fae54 100644
> --- a/net/rds/recv.c
> +++ b/net/rds/recv.c
> @@ -102,6 +102,7 @@ static void rds_recv_incoming_exthdrs(st
> struct rds_header *hdr = &inc->i_hdr;
> unsigned int pos = 0, type, len;
> union {
> + struct rds_ext_header_version version;
> struct rds_ext_header_rdma rdma;
> } buffer;
>
> --
> 1.4.4
>
>
More information about the rds-devel
mailing list