[DTrace-devel] [PATCH 03/20] libproc: dynamically search for elements of the rtld_global structure

Kris Van Hees kris.van.hees at oracle.com
Fri May 20 19:29:21 UTC 2022


On Wed, May 11, 2022 at 10:12:45PM +0100, Nick Alcock via DTrace-devel wrote:
> On glibc < 2.35 (with r_version 1), there is no rtld support for
> multiple lmids, so rtld_db needs to fish the necessary data directly out
> of glibc internal data structures.  However, as with link_map (see
> commit d1d38404d44ff000), this is not at all ABI-stable.  After many
> years of total stability, the struct link_namespaces at the head of
> this structure changed repeatedly in the 2.31 -- 2.34 timeframe, mostly
> to fix race conditions; and it is an array[DL_NNS], so even small
> changes have large consequences for the offsets of elements beyond it.
> We therefore have to hunt for everything we need in this structure
> dynamically: even elements at the start, like _ns_nloaded, have the
> offsets of their copies in subsequent array elements move around when
> struct link_namespaces grows, and elements after that array in
> rtld_global move even more.
> 
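(As an aside, for readers without the glibc sources to hand, the layout in
question is roughly the following.  This is only an illustrative sketch, not
the real glibc definition -- the exact set of fields inside link_namespaces is
precisely what keeps changing across releases:

	#include <link.h>		/* struct link_map, struct r_debug */

	#define DL_NNS	16		/* fixed in all glibcs of interest */

	struct link_namespaces {	/* grew repeatedly across 2.31 -- 2.34 */
		struct link_map *_ns_loaded;	/* head of this lmid's link map */
		unsigned int _ns_nloaded;	/* number of objects in that list */
		/* ... fields that come and go between glibc versions ... */
		struct r_debug _ns_debug;	/* last member in the versions of interest */
	};

	struct rtld_global {
		struct link_namespaces _dl_ns[DL_NNS];	/* growth here shifts... */
		size_t _dl_nns;				/* ...this and everything below */
		/* _dl_load_lock and many more fields follow */
	};

so even a one-word change inside struct link_namespaces moves _dl_nns and
everything after it by DL_NNS words.)
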
> This is quite hard to scan for because (unlike find_l_searchlist) we can
> rely on hardly anything for verification: the first lmid tells us almost
> nothing, since most of the elements we need lie above the point of
> change, and subsequent lmids are almost (but not entirely) certain to be
> uninitialized and thus all-zero.
> 
> But we can cheat!  As the code comments note, the field we are
> interested in (dl_nns) immediately follows a pointer in all versions of
> glibc we care about; and pointers in struct link_namespace are either
> 0 in uninitialized lmids or huge (definitely greater than the value of
> DL_NNS, which is a fixed 16 in all supported glibc versions); and the
> integral fields therein are also nonzero only when the lmid is
> initialized.  If the last lmid is initialized, we can't really do
> anything: there are lots of fields with a pointer followed by a small
> integral value.  But this is massively unlikely: glibc < 2.35 cannot
> even start that many lmids without having some of them entirely detached
> from libc because of in-glibc TLS consumption overflowing the space
> allocated for it, and there is no supported way to start an lmid
> detached from glibc. So we can rely on looking for a pair of
> pointer-aligned fields satisfying (0, 0 < dl_nns < DL_NNS): the second
> such field is the dl_nns.  We can then figure out both the offsets of
> all the other fields after it, by offsetting them from the computed dl_nns
> offset, and the size of struct link_namespace (which has no padding
> following the structure in any supported glibc version, and glibc relies
> on this property); and thus determine both the offsets of fields late in
> struct link_namespace and the offsets of fields in nonzero lmids.
> 
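(To make the search concrete, here is a stand-alone sketch of the heuristic,
operating on a local copy of the structure rather than Pread()ing the victim
as the real find_dl_nns() in the patch does, and with illustrative names:

	#include <stddef.h>
	#include <stdint.h>

	#define DL_NNS	16	/* fixed in all glibcs of interest */

	/*
	 * Scan pointer-sized slots, starting at the compile-time guess of the
	 * _dl_nns offset and moving only forwards.  Returns the guessed byte
	 * offset of _dl_nns, or 0 if nothing plausible is found.
	 */
	static size_t
	guess_dl_nns_offset(const uintptr_t *global, size_t start_slot,
	    size_t nslots)
	{
		size_t i;

		for (i = start_slot; i + 1 < nslots; i++) {
			/*
			 * A zero (an uninitialized pointer in a never-used
			 * lmid) immediately followed by a small nonzero count:
			 * the second slot is almost certainly _dl_nns.
			 */
			if (global[i] == 0 &&
			    global[i + 1] > 0 && global[i + 1] < DL_NNS)
				return (i + 1) * sizeof(uintptr_t);
		}
		return 0;
	}

Once that offset is known, the rest is arithmetic, because _dl_ns[DL_NNS] is
the first member of rtld_global and _ns_debug its last field:
link_namespaces_size = dl_nns_offset / DL_NNS (rounded to a pointer),
g_debug_offset = link_namespaces_size - sizeof(struct r_debug), and lmid N's
copy of any per-namespace field lives at rtld_global + N * link_namespaces_size
plus that field's offset within one namespace.)
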
> Unfortunately, because most of these fields (other than dl_nns itself,
> which is 1 even then) only go nonzero when multiple lmids are in use
> (which is rare) or when dlopen() is being called in the child (also
> rare), we can't really
> verify our guesses the way we could for l_searchlist.  The existing
> verification that when we look up a symbol in a nonzero lmid it actually
> comes from a nonzero lmid will have to do.  This will almost certainly
> be good enough, particularly given that all this machinery will be
> disabled in favour of documented facilities when the victim is a binary
> running against glibc 2.35+.
> 
> Orabug: 32856318
> Signed-off-by: Nick Alcock <nick.alcock at oracle.com>

Reviewed-by: Kris Van Hees <kris.van.hees at oracle.com>

... modulo fixing several cases where you have 8 leading spaces in place of a
tab.  I fixed those, and will put this on dev.
> ---
>  libproc/Pcontrol.h   |   8 ++
>  libproc/mkoffsets.sh |  10 +-
>  libproc/rtld_db.c    | 232 +++++++++++++++++++++++++++++++++++++------
>  3 files changed, 216 insertions(+), 34 deletions(-)
> 
> diff --git a/libproc/Pcontrol.h b/libproc/Pcontrol.h
> index dab482a50882..347963571e41 100644
> --- a/libproc/Pcontrol.h
> +++ b/libproc/Pcontrol.h
> @@ -172,9 +172,17 @@ struct rd_agent {
>  	int released;			/* 1 if released */
>  	size_t l_searchlist_offset;	/* Offset of the l_searchlist in the
>  					   link map structure. */
> +	int r_version;			/* the version of the r_debug interface */
>  	uintptr_t r_brk_addr;		/* if nonzero, the address of r_brk */
>  	uintptr_t rtld_global_addr;	/* if nonzero, the address of
>  					   _rtld_global */
> +	size_t	dl_nns_offset;		/* Offset of the dl_nns from rtld_global.  */
> +	size_t	dl_load_lock_offset;	/* Offset of the dl_load_lock from
> +					 * rtld_global.  */
> +	size_t	g_debug_offset;		/* Offset of the _ns_debug element
> +					 * within a struct link_namespaces.  */
> +	size_t	link_namespaces_size;	/* Apparent size of struct
> +					   link_namespaces in this glibc. */
>  	int	rd_monitoring;		/* 1 whenever rtld_db has a breakpoint
>  					   set on the dynamic linker. */
>  	int	rd_monitor_suppressed;	/* 1 if rd monitoring is off forever */
> diff --git a/libproc/mkoffsets.sh b/libproc/mkoffsets.sh
> index 96c1ed18cd67..42fda011a012 100755
> --- a/libproc/mkoffsets.sh
> +++ b/libproc/mkoffsets.sh
> @@ -68,10 +68,15 @@ for BITNESS in 32 64; do
>  int main(void)
>  {
>  	printf("#define UINT_ at BITNESS@_SIZE\t%li\n", sizeof(unsigned int));
> +	printf("#define PTR_ at BITNESS@_SIZE\t%li\n", sizeof(void *));
>  	printf("#ifndef DL_NNS\n");
>  	printf("#define DL_NNS\t%li\n", DL_NNS);
>  	printf("#endif\n");
>  
> +        printf("#define R_DEBUG_ at BITNESS@_SIZE\t%li\n", sizeof (struct r_debug));
> +        printf("#define LINK_NAMESPACES_ at BITNESS@_SIZE\t%li\n",
> +	       sizeof (struct link_namespaces));
> +
>  	BITNESS_OFFSET(R_VERSION, r_debug, r_version);
>  	BITNESS_OFFSET(R_MAP, r_debug, r_map);
>  	BITNESS_OFFSET(R_BRK, r_debug, r_brk);
> @@ -85,12 +90,9 @@ int main(void)
>  	BITNESS_OFFSET(L_PREV, internal_link_map, l_prev);
>  	BITNESS_OFFSET(L_SEARCHLIST, internal_link_map, l_searchlist.r_list);
>  
> -	BITNESS_OFFSET(G_DEBUG, rtld_global, _dl_ns[0]._ns_debug);
> -	BITNESS_OFFSET(G_DEBUG_SUBSEQUENT, rtld_global, _dl_ns[1]._ns_debug);
> +        BITNESS_OFFSET(G_DEBUG, rtld_global, _dl_ns[0]._ns_debug);
>  	BITNESS_OFFSET(G_NLOADED, rtld_global, _dl_ns[0]._ns_nloaded);
> -	BITNESS_OFFSET(G_NLOADED_SUBSEQUENT, rtld_global, _dl_ns[1]._ns_nloaded);
>  	BITNESS_OFFSET(G_NS_LOADED, rtld_global, _dl_ns[0]._ns_loaded);
> -	BITNESS_OFFSET(G_NS_LOADED_SUBSEQUENT, rtld_global, _dl_ns[1]._ns_loaded);
>  	BITNESS_OFFSET(G_DL_NNS, rtld_global, _dl_nns);
>  	BITNESS_OFFSET(G_DL_LOAD_LOCK, rtld_global, _dl_load_lock.mutex.__data.__count);
>  }
> diff --git a/libproc/rtld_db.c b/libproc/rtld_db.c
> index 77f67b05b679..43ffb7c916e9 100644
> --- a/libproc/rtld_db.c
> +++ b/libproc/rtld_db.c
> @@ -22,6 +22,7 @@
>  #include <signal.h>
>  #include <rtld_db.h>
>  #include <rtld_offsets.h>
> +#include <assert.h>
>  
>  #include "libproc.h"
>  #include "Pcontrol.h"
> @@ -87,6 +88,160 @@ sane_nanosleep(long long timeout_nsec)
>  	nanosleep(&timeout, NULL);
>  }
>  
> +/*
> + * Find the offset of the dl_nns and the size and offsets of fields preceding it
> + * in the rtld_global structure.
> + *
> + * Returns -1 if nothing resembling the namespace count can be found.  (May
> + * return -1 spuriously in obscure cases, such as processes with no dynamic
> + * linker initialized yet, as well as if an exec() strikes while scanning.  In
> + * such cases the caller falls back to single-lmid operation.)
> + *
> + * This structure is part of the guts of glibc and is not ABI-guaranteed, but
> + * changes are limited by the fact that in glibc 2.35+ we will not use this
> + * code at all due to the new r_debug protocol version, and that while distros
> + * may make changes here, they are likely only to be backports from glibc <
> + * 2.35.  We know that all such changes grew the structure, so searches only
> + * need to be done in one direction (forwards), and even those are tightly
> + * bounded.
> + *
> + * Must be called under rd_ldso_consistent_begin() or at least Ptrace(), to
> + * prevent longjmps on exec from causing memory leaks.
> + */
> +static int
> +find_dl_nns(rd_agent_t *rd)
> +{
> +	uintptr_t start;
> +	uintptr_t scan;
> +	uintptr_t scan_next;
> +
> +	_dprintf("%i: Finding dl_nns\n", rd->P->pid);
> +
> +	/*
> +	 * This process has several stages: finding dl_nns, and then finding
> +	 * everything else given what finding dl_nns lets us know.
> +	 *
> +	 * Finding dl_nns is simple enough: search forward from our previous
> +	 * best-guess address and hunt for a pair of pointer-aligned addresses
> +	 * the first of which is zero and the second of which is between 0 and
> +	 * DL_NNS (exclusive).  We assume that pointers and size_ts are the same
> +	 * size and have the same alignment, which is true for all platforms we
> +	 * run on.
> +	 *
> +	 * This works because almost all fields in struct link_namespaces are
> +	 * either pointers (almost certain to be either NULL when a namespace is
> +	 * uninitialized, i.e. 0 on all platforms we support, or a high value >
> +	 * DL_NNS) or integral values which are either zero when the ns is
> +	 * uninitialized or nonzero otherwise: so fields we're not interested in
> +	 * are either two zeroes or large values (for pointers) and nonzero
> +	 * values (for integral values).  The last field in struct
> +	 * link_namespaces is the last field of struct r_debug, which is a
> +	 * pointer with the same semantics as above in both struct r_debug and
> +	 * struct r_debug_extended (used in glibc 2.35+).  We specifically look
> +	 * for the zero-pointer uninitialized case because if we find that it
> +	 * means that every other integral field in this link_namespace is zero,
> +	 * which is not a valid value for dl_nns.  (If initialized, quite a few
> +	 * of them might in theory have a value overlapping with the set of
> +	 * values valid for dl_nns.)
> +	 *
> +	 * This heuristic will fail if every single lmid is initialized, because
> +	 * the last pointer will be nonzero, but given that until glibc 2.35 the
> +	 * TLS allocation of libc itself prevented the use of all 16 lmids and
> +	 * even now it is incredibly rare (and even less likely near program
> +	 * startup time), we can ignore this possibility.
> +	 */
> +
> +	size_t ptr_size;
> +
> +	ptr_size = (rd->P->elf64 ? PTR_64_SIZE : PTR_32_SIZE);
> +	start = rtld_global(rd) + (rd->P->elf64 ? G_DL_NNS_64_OFFSET
> +	    : G_DL_NNS_32_OFFSET);
> +	scan_next = start + ptr_size;
> +
> +	for (scan = start;; scan = scan_next, scan_next += ptr_size) {
> +		uintptr_t poss_preceding;
> +		uintptr_t poss_l_nns;
> +
> +		/*
> +		 * Give up eventually.
> +		 */
> +		if (scan > start + 65535)
> +			return -1;
> +
> +		if (Pread_scalar_quietly(rd->P, &poss_preceding, ptr_size,
> +			sizeof(uintptr_t), scan, 1) < 0)
> +			return -1;
> +
> +		if (Pread_scalar_quietly(rd->P, &poss_l_nns, ptr_size,
> +			sizeof(uintptr_t), scan_next, 1) < 0)
> +			return -1;
> +
> +		/*
> +		 * Found it.  We know the struct link_namespace size too, as a
> +		 * direct consequence: the distance between the rtld_global
> +		 * address and dl_nns address, divided by DL_NNS, rounded to the
> +		 * size of a pointer, since the last element is always
> +		 * pointer-sized in all known variants.  (This would break if
> +		 * DL_NNS changed, but it hasn't as of glibc 2.35, so we are
> +		 * probably safe.  If we get it wrong it is hard to know without
> +		 * checking against a multi-lmid testcase: there is little we
> +		 * can check against otherwise, and at this point we are
> +		 * unlikely to have more than one lmid to check.  If this turns
> +		 * out to be a problem in practice we can add more validation
> +		 * code that kicks in only when find_dl_nns was needed and
> +		 * dl_nns > 1.)
> +		 *
> +		 * Offset dl_load_lock by a corresponding amount (the relative
> +		 * positions of dl_nns and dl_load_lock have never changed).
> +		 */
> +		if (poss_preceding == 0 &&
> +		    poss_l_nns > 0 && poss_l_nns <= DL_NNS) {
> +
> +			rd->dl_nns_offset = scan_next - rtld_global(rd);
> +			rd->dl_load_lock_offset = rd->dl_nns_offset +
> +			    (rd->P->elf64 ? G_DL_LOAD_LOCK_64_OFFSET - G_DL_NNS_64_OFFSET
> +			    : G_DL_LOAD_LOCK_32_OFFSET - G_DL_NNS_32_OFFSET);
> +
> +			rd->link_namespaces_size = (((scan_next - rtld_global(rd)) / DL_NNS)
> +			    / ptr_size) * ptr_size;
> +			break;
> +		}
> +	}
> +
> +	if (rd->link_namespaces_size == 0)
> +		return -1;
> +
> +	/*
> +	 * We now know (or hope we know) dl_nns's offset and the size of a struct
> +	 * link_namespace.  It's time to figure out the offsets of other things
> +	 * in struct link_namespace, by reference to the first namespace, which
> +	 * is always populated.
> +	 *
> +	 * We consult several fields in rtld_global.  dl_nns we have already
> +	 * found.  dl_load_lock is right after dl_nns in all glibcs of interest and
> +	 * is in any case hard to validate because it is usually zero.  _ns_loaded
> +	 * is at the start of struct link_namespaces in all supported glibc
> +	 * variants.  _ns_nloaded is right after it.  But _ns_debug is at a potentially
> +	 * varying offset.  We can exploit the fact that it is always at the end
> +	 * of struct link_namespace and that it's always the same size when
> +	 * r_version is 1.  The r_debug for namespace zero is not found in this
> +	 * list at all, so we can't validate any of this in any useful fashion,
> +	 * but we can at least compute it.
> +	 */
> +
> +	assert(rd->r_version < 2);
> +
> +	rd->g_debug_offset = rd->link_namespaces_size -
> +	    (rd->P->elf64 ? R_DEBUG_64_SIZE : R_DEBUG_32_SIZE);
> +
> +	_dprintf("dl_nns_offset is %zi\n", rd->dl_nns_offset);
> +	_dprintf("g_debug_offset is %zi\n", rd->g_debug_offset);
> +	_dprintf("sizeof (struct link_namespaces) is %zi\n", rd->link_namespaces_size);
> +
> +	return 0;
> +}
> +
>  /*
>   * Determine the number of currently-valid namespaces.
>   */
> @@ -94,6 +249,7 @@ static size_t
>  dl_nns(rd_agent_t *rd)
>  {
>  	size_t buf;
> +	int tried = 0;
>  
>  	/*
>  	 * Non-shared processes always have one and only one namespace, as do
> @@ -106,26 +262,54 @@ dl_nns(rd_agent_t *rd)
>  	    rtld_global(rd) == 0)
>  		return 1;
>  
> +	/*
> +	 * Set up various offsets.  This is only a compile-time guesstimate and
> +	 * may be recomputed by find_dl_nns, below.
> +	 */
> +	if (rd->P->elf64) {
> +		rd->dl_nns_offset = G_DL_NNS_64_OFFSET;
> +		rd->dl_load_lock_offset = G_DL_LOAD_LOCK_64_OFFSET;
> +		rd->g_debug_offset = G_DEBUG_64_OFFSET;
> +		rd->link_namespaces_size = LINK_NAMESPACES_64_SIZE;
> +	} else {
> +		rd->dl_nns_offset = G_DL_NNS_32_OFFSET;
> +		rd->dl_load_lock_offset = G_DL_LOAD_LOCK_32_OFFSET;
> +		rd->g_debug_offset = G_DEBUG_32_OFFSET;
> +		rd->link_namespaces_size = LINK_NAMESPACES_32_SIZE;
> +	}
> +
> + retry:
>  	/*
>  	 * Because this has no corresponding publically-visible header, we must
>  	 * use offsets directly.  If the read fails, assume 1 (almost always
>  	 * true anyway).
>  	 */
>  	if (Pread_scalar(rd->P, &buf, rd->P->elf64 ? G_DL_NNS_64_SIZE :
> -		G_DL_NNS_32_SIZE, sizeof(size_t), rtld_global(rd) +
> -		(rd->P->elf64 ? G_DL_NNS_64_OFFSET : G_DL_NNS_32_OFFSET)) < 0) {
> +		G_DL_NNS_32_SIZE, sizeof(size_t),
> +		rtld_global(rd) + rd->dl_nns_offset) < 0) {
>  		_dprintf("%i: Cannot read namespace count\n", rd->P->pid);
>  		return 1;
>  	}
>  
> -	if ((buf == 0) || (buf > DL_NNS)) {
> +	if ((buf > 0) && (buf < DL_NNS))
> +		return buf;
> +
> +	/*
> +	 * Whatever we're looking at, it can't be dl_nns (or DL_NNS has been
> +	 * bumped in this glibc version and a quite implausible number of lmids
> +	 * are active).  Search for the dl_nns value.  This will also tell us
> +	 * how large each struct link_namespace is.  If we can't find it,
> +	 * suppress multiple-lmid support.
> +	 */
> +
> +	if (tried || find_dl_nns(rd) < 0) {
>  		_dprintf("%i: %li namespaces is not valid: "
>  		    "probably incompatible glibc\n", rd->P->pid, buf);
>  		rd->lmid_incompatible_glibc = 1;
>  		return 1;
>  	}
> -
> -	return buf;
> +	tried = 1;
> +	goto retry;
>  }
>  
>  /*
> @@ -201,11 +385,7 @@ ns_debug_addr(rd_agent_t *rd, Lmid_t lmid)
>  	 * Because this structure is not visible in the systemwide <link.h>, we
>  	 * cannot use offsetof tricks, but must resort to raw offset computation.
>  	 */
> -	return global + (rd->P->elf64 ? G_DEBUG_64_OFFSET : G_DEBUG_32_OFFSET) +
> -	    (((rd->P->elf64 ? G_DEBUG_SUBSEQUENT_64_OFFSET :
> -		    G_DEBUG_SUBSEQUENT_32_OFFSET) -
> -		(rd->P->elf64 ? G_DEBUG_64_OFFSET : G_DEBUG_32_OFFSET)) *
> -	    lmid);
> +	return global + (rd->link_namespaces_size * lmid) + rd->g_debug_offset;
>  }
>  
>  /*
> @@ -244,12 +424,8 @@ first_link_map(rd_agent_t *rd, Lmid_t lmid)
>  	 * Fish the link map straight out of _ns_loaded.
>  	 */
>  
> -	link_map_ptr_addr = global + (rd->P->elf64 ? G_NS_LOADED_64_OFFSET :
> -	    G_NS_LOADED_32_OFFSET) +
> -	    (((rd->P->elf64 ? G_NS_LOADED_SUBSEQUENT_64_OFFSET :
> -		    G_NS_LOADED_SUBSEQUENT_32_OFFSET) -
> -		(rd->P->elf64 ? G_NS_LOADED_64_OFFSET : G_NS_LOADED_32_OFFSET)) *
> -		lmid);
> +	link_map_ptr_addr = global + (rd->link_namespaces_size * lmid) +
> +	    (rd->P->elf64 ? G_NS_LOADED_64_OFFSET : G_NS_LOADED_32_OFFSET);
>  
>  	if (Pread_scalar(rd->P, &link_map_addr, rd->P->elf64 ? G_NS_LOADED_64_SIZE :
>  		G_NS_LOADED_32_SIZE, sizeof(struct link_map *), link_map_ptr_addr) < 0) {
> @@ -645,12 +821,8 @@ ns_nloaded(rd_agent_t *rd, Lmid_t lmid)
>  	unsigned int buf;
>  	uintptr_t addr;
>  
> -	addr = rtld_global(rd) +
> -	    (rd->P->elf64 ? G_NLOADED_64_OFFSET : G_NLOADED_32_OFFSET) +
> -	    (((rd->P->elf64 ? G_NLOADED_SUBSEQUENT_64_OFFSET :
> -		    G_NLOADED_SUBSEQUENT_32_OFFSET) -
> -		(rd->P->elf64 ? G_NLOADED_64_OFFSET : G_NLOADED_32_OFFSET)) *
> -		lmid);
> +	addr = rtld_global(rd) + (rd->link_namespaces_size * lmid) +
> +	    (rd->P->elf64 ? G_NLOADED_64_OFFSET : G_NLOADED_32_OFFSET);
>  
>  	/*
>  	 * If the read fails, assume 1 (almost always true anyway).
> @@ -675,6 +847,8 @@ load_lock(rd_agent_t *rd)
>  
>  	/*
>  	 * This should never happen: if it does, let's not read garbage.
> +	 * (Always called after dl_nns is called, so we don't need to worry
> +	 * about the dl_load_lock offsets not being set.)
>  	 */
>  
>  	if (rtld_global(rd) == 0)
> @@ -682,9 +856,8 @@ load_lock(rd_agent_t *rd)
>  
>  	if (Pread_scalar(rd->P, &lock_count, rd->P->elf64 ?
>  		G_DL_LOAD_LOCK_64_SIZE : G_DL_LOAD_LOCK_32_SIZE,
> -		sizeof(unsigned int), rtld_global(rd) +
> -		(rd->P->elf64 ? G_DL_LOAD_LOCK_64_OFFSET :
> -		    G_DL_LOAD_LOCK_32_OFFSET)) < 0)
> +		sizeof(unsigned int),
> +		rtld_global(rd) + rd->dl_load_lock_offset) < 0)
>  		return -1;
>  
>  	return lock_count;
> @@ -1210,7 +1383,6 @@ r_brk(rd_agent_t *rd)
>  {
>  	static int warned = 0;
>  	uintptr_t r_debug_addr;
> -	int r_version;
>  
>  	if (rd->released)
>  		return 0;
> @@ -1231,17 +1403,17 @@ r_brk(rd_agent_t *rd)
>  	/*
>  	 * Check its version.
>  	 */
> -	if ((read_scalar_child(rd->P, &r_version, r_debug_addr,
> +	if ((read_scalar_child(rd->P, &rd->r_version, r_debug_addr,
>  		    r_debug_offsets, r_debug, r_version) <= 0) ||
> -	    (r_version > 1)) {
> +	    (rd->r_version > 1)) {
>  		if (!warned)
>  			_dprintf("%i: r_version %i unsupported.\n",
> -			    rd->P->pid, r_version);
> +			    rd->P->pid, rd->r_version);
>  		warned = 1;
>  		return 0;
>  	}
>  
> -	if (r_version == 0)
> +	if (rd->r_version == 0)
>  		return 0;
>  
>  	/*
> -- 
> 2.36.1.263.g194b774378.dirty
> 
> 
> _______________________________________________
> DTrace-devel mailing list
> DTrace-devel at oss.oracle.com
> https://oss.oracle.com/mailman/listinfo/dtrace-devel


