[rds-devel] [External] : Re: [PATCH net-next v2 3/3] rds: convert to getsockopt_iter

Allison Henderson achender at kernel.org
Fri Jun 5 22:31:06 UTC 2026


On Fri, 2026-06-05 at 03:31 -0700, Breno Leitao wrote:
> Convert RDS socket's getsockopt implementation to use the new
> getsockopt_iter callback with sockopt_t.
> 
> Key changes:
> - Replace (char __user *optval, int __user *optlen) with sockopt_t *opt
> - Use opt->optlen for buffer length (input) and returned size (output)
> - Use copy_to_iter() instead of put_user()/copy_to_user()
> 
> The RDS_INFO_* snapshot path in rds_info_getsockopt() used to pin the
> userspace buffer with pin_user_pages_fast() on the raw optval address;
> the info producers then memcpy into those pages under a spinlock via
> kmap_atomic() and so must not fault. Obtain the same page array and
> starting offset from opt->iter_out with iov_iter_extract_pages(), which
> pins for write because iter_out is ITER_DEST.
> 
> The page array is preallocated here (sized with iov_iter_npages()) and
> passed in, so iov_iter_extract_pages() fills it in place rather than
> allocating one for us; RDS therefore keeps ownership of the array on
> every return path and frees it itself. The rds_info_iterator /
> rds_info_copy machinery and all producer callbacks are unchanged.
> 
> Kernel buffers (ITER_KVEC) are not page-backed in a way the info
> producers can use, so the RDS_INFO path returns -EOPNOTSUPP for them;
> this matches the previous behaviour, where a kernel-buffer getsockopt
> hit the WARN_ONCE() path in do_sock_getsockopt() and returned
> -EOPNOTSUPP. The simple RDS_RECVERR and SO_RDS_TRANSPORT options keep
> working for kernel buffers via copy_to_iter().
> 
> Signed-off-by: Breno Leitao <leitao at debian.org>

Looks good, thanks for the quick turnaround.
Reviewed-by: Allison Henderson <achender at kernel.org>
> ---
>  net/rds/af_rds.c | 36 +++++++++++++++------------
>  net/rds/info.c   | 76 ++++++++++++++++++++++++++++++++------------------------
>  net/rds/info.h   |  3 +--
>  3 files changed, 65 insertions(+), 50 deletions(-)
> 
> diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
> index 6f4f9cf352bd..d5defe9172e3 100644
> --- a/net/rds/af_rds.c
> +++ b/net/rds/af_rds.c
> @@ -37,6 +37,7 @@
>  #include <linux/in.h>
>  #include <linux/ipv6.h>
>  #include <linux/poll.h>
> +#include <linux/uio.h>
>  #include <net/sock.h>
>  
>  #include "rds.h"
> @@ -485,35 +486,36 @@ static int rds_setsockopt(struct socket *sock, int level, int optname,
>  }
>  
>  static int rds_getsockopt(struct socket *sock, int level, int optname,
> -			  char __user *optval, int __user *optlen)
> +			  sockopt_t *opt)
>  {
>  	struct rds_sock *rs = rds_sk_to_rs(sock->sk);
>  	int ret = -ENOPROTOOPT, len;
>  	int trans;
> +	int val;
>  
>  	if (level != SOL_RDS)
>  		goto out;
>  
> -	if (get_user(len, optlen)) {
> -		ret = -EFAULT;
> -		goto out;
> -	}
> +	len = opt->optlen;
>  
>  	switch (optname) {
>  	case RDS_INFO_FIRST ... RDS_INFO_LAST:
> -		ret = rds_info_getsockopt(sock, optname, optval,
> -					  optlen);
> +		ret = rds_info_getsockopt(sock, optname, opt);
>  		break;
>  
>  	case RDS_RECVERR:
> -		if (len < sizeof(int))
> +		if (len < sizeof(int)) {
>  			ret = -EINVAL;
> -		else
> -		if (put_user(rs->rs_recverr, (int __user *) optval) ||
> -		    put_user(sizeof(int), optlen))
> +			break;
> +		}
> +		val = rs->rs_recverr;
> +		if (copy_to_iter(&val, sizeof(int), &opt->iter_out) !=
> +		    sizeof(int)) {
>  			ret = -EFAULT;
> -		else
> +		} else {
> +			opt->optlen = sizeof(int);
>  			ret = 0;
> +		}
>  		break;
>  	case SO_RDS_TRANSPORT:
>  		if (len < sizeof(int)) {
> @@ -522,11 +524,13 @@ static int rds_getsockopt(struct socket *sock, int level, int optname,
>  		}
>  		trans = (rs->rs_transport ? rs->rs_transport->t_type :
>  			 RDS_TRANS_NONE); /* unbound */
> -		if (put_user(trans, (int __user *)optval) ||
> -		    put_user(sizeof(int), optlen))
> +		if (copy_to_iter(&trans, sizeof(int), &opt->iter_out) !=
> +		    sizeof(int)) {
>  			ret = -EFAULT;
> -		else
> +		} else {
> +			opt->optlen = sizeof(int);
>  			ret = 0;
> +		}
>  		break;
>  	default:
>  		break;
> @@ -653,7 +657,7 @@ static const struct proto_ops rds_proto_ops = {
>  	.listen =	sock_no_listen,
>  	.shutdown =	sock_no_shutdown,
>  	.setsockopt =	rds_setsockopt,
> -	.getsockopt =	rds_getsockopt,
> +	.getsockopt_iter =	rds_getsockopt,
>  	.sendmsg =	rds_sendmsg,
>  	.recvmsg =	rds_recvmsg,
>  	.mmap =		sock_no_mmap,
> diff --git a/net/rds/info.c b/net/rds/info.c
> index 17061f6ff74e..21b32eb16559 100644
> --- a/net/rds/info.c
> +++ b/net/rds/info.c
> @@ -35,6 +35,7 @@
>  #include <linux/slab.h>
>  #include <linux/proc_fs.h>
>  #include <linux/export.h>
> +#include <linux/uio.h>
>  
>  #include "rds.h"
>  
> @@ -144,60 +145,68 @@ void rds_info_copy(struct rds_info_iterator *iter, void *data,
>  EXPORT_SYMBOL_GPL(rds_info_copy);
>  
>  /*
> - * @optval points to the userspace buffer that the information snapshot
> - * will be copied into.
> - *
> - * @optlen on input is the size of the buffer in userspace.  @optlen
> - * on output is the size of the requested snapshot in bytes.
> + * @opt->iter_out describes the buffer that the information snapshot will be
> + * copied into, and @opt->optlen is the size of that buffer on input.  On
> + * output @opt->optlen is set to the size of the requested snapshot in bytes.
>   *
>   * This function returns -errno if there is a failure, particularly -ENOSPC
> - * if the given userspace buffer was not large enough to fit the snapshot.
> - * On success it returns the positive number of bytes of each array element
> - * in the snapshot.
> + * if the given buffer was not large enough to fit the snapshot.  On success
> + * it returns the positive number of bytes of each array element in the
> + * snapshot.
>   */
> -int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
> -			int __user *optlen)
> +int rds_info_getsockopt(struct socket *sock, int optname, sockopt_t *opt)
>  {
>  	struct rds_info_iterator iter;
>  	struct rds_info_lengths lens;
>  	unsigned long nr_pages = 0;
> -	unsigned long start;
>  	rds_info_func func;
>  	struct page **pages = NULL;
> +	size_t offset0 = 0;
> +	int npages = 0;
>  	int ret;
>  	int len;
>  	int total;
>  
> -	if (get_user(len, optlen)) {
> -		ret = -EFAULT;
> -		goto out;
> -	}
> +	len = opt->optlen;
>  
>  	/* check for all kinds of wrapping and the like */
> -	start = (unsigned long)optval;
> -	if (len < 0 || len > INT_MAX - PAGE_SIZE + 1 || start + len < start) {
> +	if (len < 0 || len > INT_MAX - PAGE_SIZE + 1) {
>  		ret = -EINVAL;
>  		goto out;
>  	}
>  
> +	/* The info producers write into the pages with kmap_atomic() while
> +	 * holding a spinlock, so they need a genuine page-backed user buffer.
> +	 */
> +	if (!user_backed_iter(&opt->iter_out)) {
> +		ret = -EOPNOTSUPP;
> +		goto out;
> +	}
> +
>  	/* a 0 len call is just trying to probe its length */
>  	if (len == 0)
>  		goto call_func;
>  
> -	nr_pages = (PAGE_ALIGN(start + len) - (start & PAGE_MASK))
> -			>> PAGE_SHIFT;
> -
> -	pages = kmalloc_objs(struct page *, nr_pages);
> +	/*
> +	 * Preallocate the page array and pass it in so that
> +	 * iov_iter_extract_pages() fills it in place rather than allocating
> +	 * one for us.  Handing it a non-NULL array keeps ownership of the
> +	 * array with us on every return path, instead of depending on the
> +	 * iterator code to allocate and hand it back.
> +	 */
> +	npages = iov_iter_npages(&opt->iter_out, INT_MAX);
> +	pages = kvmalloc_array(npages, sizeof(*pages), GFP_KERNEL);
>  	if (!pages) {
>  		ret = -ENOMEM;
>  		goto out;
>  	}
> -	ret = pin_user_pages_fast(start, nr_pages, FOLL_WRITE, pages);
> -	if (ret != nr_pages) {
> -		if (ret > 0)
> -			nr_pages = ret;
> -		else
> -			nr_pages = 0;
> +
> +	ret = iov_iter_extract_pages(&opt->iter_out, &pages, len, npages,
> +				     0, &offset0);
> +	if (ret < 0)
> +		goto out;
> +	nr_pages = DIV_ROUND_UP(offset0 + ret, PAGE_SIZE);
> +	if (ret != len) {
>  		ret = -EAGAIN; /* XXX ? */
>  		goto out;
>  	}
> @@ -213,7 +222,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
>  
>  	iter.pages = pages;
>  	iter.addr = NULL;
> -	iter.offset = start & (PAGE_SIZE - 1);
> +	iter.offset = offset0;
>  
>  	func(sock, len, &iter, &lens);
>  	BUG_ON(lens.each == 0);
> @@ -230,13 +239,16 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
>  		ret = lens.each;
>  	}
>  
> -	if (put_user(len, optlen))
> -		ret = -EFAULT;
> +	opt->optlen = len;
>  
>  out:
> -	if (pages)
> +	/*
> +	 * iov_iter_extract_pages() pins only user-backed (ubuf) iters;
> +	 * iov_iter_extract_will_pin() reports whether an unpin is owed here.
> +	 */
> +	if (pages && iov_iter_extract_will_pin(&opt->iter_out))
>  		unpin_user_pages_dirty_lock(pages, nr_pages, true);
> -	kfree(pages);
> +	kvfree(pages);
>  
>  	return ret;
>  }
> diff --git a/net/rds/info.h b/net/rds/info.h
> index a069b51c4679..1aab62ab6d00 100644
> --- a/net/rds/info.h
> +++ b/net/rds/info.h
> @@ -21,8 +21,7 @@ typedef void (*rds_info_func)(struct socket *sock, unsigned int len,
>  
>  void rds_info_register_func(int optname, rds_info_func func);
>  void rds_info_deregister_func(int optname, rds_info_func func);
> -int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
> -			int __user *optlen);
> +int rds_info_getsockopt(struct socket *sock, int optname, sockopt_t *opt);
>  void rds_info_copy(struct rds_info_iterator *iter, void *data,
>  		   unsigned long bytes);
>  void rds_info_iter_unmap(struct rds_info_iterator *iter);
> 




More information about the rds-devel mailing list