[DTrace-devel] [PATCH v2 07/19] Create the BPF usdt_prids map

Kris Van Hees kris.van.hees at oracle.com
Thu Sep 19 14:51:53 UTC 2024


Just a head's up that I am taking a look at this right now from the perspective
of whether it is safe to perform a BPF map update on a map that might be in use
at the same time. So, the updating of the strtab map is a likely dangerous
situation.

More once I figure out a few things.

On Sun, Sep 08, 2024 at 12:40:15PM -0400, eugene.loh at oracle.com wrote:
> From: Eugene Loh <eugene.loh at oracle.com>
> 
> As USDT processes come and go, the overlying probes for an underlying
> probe will change.  Hence, we will move to a scheme in which an
> underlying probe will walk all possible clauses it could call,
> deciding at run time (using a bit mask for the overlying USDT probe)
> which of the clauses to call.
> 
> In this patch, we create and update the BPF "usdt_prids" map.  This
> is a hash map, where:
> 
>   *)  the key (size: dtp->dt_usdt_pridsmap_ksz) comprises
>         - the PID of the firing process
>         - the PRID of the underlying probe
> 
>   *)  the value (size: dtp->dt_usdt_pridsmap_vsz) comprises
>         - the PRID over the overlying USDT probe
>         - a bit mask indicating which clauses should be called
> 
> As processes start up, we also add new overlying USDT probes,
> requiring updates of the BPF "probes" and "strtab" maps and of
> the list of enablings.
> 
> The underlying-probes "update" function is called sporadically.
> The focus of this patch is the function.  When it is called can
> be tuned in future patches.
> 
> This patch guesses certain sizes very crudely.  These sizes should
> be handled more robustly in future patches:
> 
>   *)  The allocated size of the string table.  For example,
>       new provider names have to be added as new processes
>       start.
> 
>   *)  The number of entries in the BPF "usdt_prids" map.
>       There is a little relief here in that, as processes
>       terminate, they can be removed from the map.
> 
>   *)  The size of the bit mask -- that is, the greatest
>       number of clauses an underlying probe might call.
>       This is relatively easy to extend;  nevertheless,
>       that work is left for a future patch.
> 
> Signed-off-by: Eugene Loh <eugene.loh at oracle.com>
> ---
>  libdtrace/dt_bpf.c         |  42 +++++-
>  libdtrace/dt_cc.c          |   2 +-
>  libdtrace/dt_dlibs.c       |   1 +
>  libdtrace/dt_impl.h        |   6 +
>  libdtrace/dt_prov_uprobe.c | 266 +++++++++++++++++++++++++++++++++++++
>  libdtrace/dt_work.c        |   2 +
>  6 files changed, 317 insertions(+), 2 deletions(-)
> 
> diff --git a/libdtrace/dt_bpf.c b/libdtrace/dt_bpf.c
> index 3f9c42ea..70803e3c 100644
> --- a/libdtrace/dt_bpf.c
> +++ b/libdtrace/dt_bpf.c
> @@ -841,7 +841,8 @@ gmap_create_strtab(dtrace_hdl_t *dtp)
>  	int		fd, rc, err;
>  
>  	dtp->dt_strlen = dt_strtab_size(dtp->dt_ccstab);
> -	dtp->dt_rooffset = P2ROUNDUP(dtp->dt_strlen, 8);
> +	dtp->dt_strmax = dtp->dt_strlen + 1000;            // FIXME pad some arbitrary amount to account for new USDT probes as new processes start
> +	dtp->dt_rooffset = P2ROUNDUP(dtp->dt_strmax, 8);
>  	dtp->dt_rosize = dt_rodata_size(dtp->dt_rodata);
>  	dtp->dt_zerooffset = P2ROUNDUP(dtp->dt_rooffset + dtp->dt_rosize, 8);
>  	dtp->dt_zerosize = 0;
> @@ -899,6 +900,7 @@ gmap_create_strtab(dtrace_hdl_t *dtp)
>  	if (rc == -1)
>  		return dt_bpf_error(dtp, "cannot update BPF map 'strtab': %s\n",
>  				    strerror(err));
> +	dtp->dt_strtabmap_fd = fd;
>  
>  	return 0;
>  }
> @@ -936,6 +938,43 @@ gmap_create_probes(dtrace_hdl_t *dtp)
>  				dtp, "cannot update BPF map 'probes': %s\n",
>  				strerror(errno));
>  	}
> +	dtp->dt_probesmap_fd = fd;
> +
> +	return 0;
> +}
> +
> +/*
> + * Create the 'usdt_prids' BPF map.
> + *
> + * USDT-PRID information map.  This is a global hash map for use
> + * with USDT probes.  It is indexed by (pid, underlying probe ID).
> + * The value is a probe ID for the overlying USDT probe and a bit
> + * mask indicating which clauses to execute for this pid.
> + *
> + * WIP.  Just make up some sizes for now.
> + *
> + *     How big is a probe ID?  Sometimes, it's a dtrace_id_t.
> + *     And uts/common/sys/dtrace_types.h gives us "typedef uint32_t dtrace_id_t".
> + *     Meanwhile, libdtrace/dt_impl.h has "uint32_t dt_probe_id".
> + *     So either uint32_t or dtrace_id_t is fine.
> + *
> + *     How big should our bit mask be?  Start with 8*sizeof(long long) bits.
> + *
> + *     How many entries should we allow?  Start with 1000.
> + *
> + * Note that for a given (pid, PRID) key, there can be at most one
> + * overlying USDT probe.
> + */
> +static int
> +gmap_create_usdt_prids(dtrace_hdl_t *dtp)
> +{
> +	dtp->dt_usdt_pridsmap_fd = create_gmap(dtp, "usdt_prids", BPF_MAP_TYPE_HASH,
> +			 dtp->dt_usdt_pridsmap_ksz, dtp->dt_usdt_pridsmap_vsz, 1000);
> +	if (dtp->dt_usdt_pridsmap_fd == -1)
> +		return -1;
> +
> +	/* Populate the newly created map.  FIXME:  this is probably not the right place for this. */
> +	dt_uprobe.update(dtp, NULL);
>  
>  	return 0;
>  }
> @@ -1045,6 +1084,7 @@ dt_bpf_gmap_create(dtrace_hdl_t *dtp)
>  	CREATE_MAP(scratchmem)
>  	CREATE_MAP(strtab)
>  	CREATE_MAP(probes)
> +	CREATE_MAP(usdt_prids)
>  	CREATE_MAP(gvars)
>  	CREATE_MAP(lvars)
>  	CREATE_MAP(dvars)
> diff --git a/libdtrace/dt_cc.c b/libdtrace/dt_cc.c
> index b28388e9..e7f22206 100644
> --- a/libdtrace/dt_cc.c
> +++ b/libdtrace/dt_cc.c
> @@ -1046,7 +1046,7 @@ dt_link_construct(dtrace_hdl_t *dtp, const dt_probe_t *prp, dtrace_difo_t *dp,
>  				nrp->dofr_data = 0;	/* FIXME */
>  				continue;
>  			case DT_CONST_STBSZ:
> -				nrp->dofr_data = dtp->dt_strlen;
> +				nrp->dofr_data = dtp->dt_strmax;
>  				continue;
>  			case DT_CONST_STRSZ:
>  				nrp->dofr_data =
> diff --git a/libdtrace/dt_dlibs.c b/libdtrace/dt_dlibs.c
> index ba4d4abe..924cf11e 100644
> --- a/libdtrace/dt_dlibs.c
> +++ b/libdtrace/dt_dlibs.c
> @@ -66,6 +66,7 @@ static const dt_ident_t		dt_bpf_symbols[] = {
>  	DT_BPF_SYMBOL(lvars, DT_IDENT_PTR),
>  	DT_BPF_SYMBOL(mem, DT_IDENT_PTR),
>  	DT_BPF_SYMBOL(probes, DT_IDENT_PTR),
> +	DT_BPF_SYMBOL(usdt_prids, DT_IDENT_PTR),
>  	DT_BPF_SYMBOL(scratchmem, DT_IDENT_PTR),
>  	DT_BPF_SYMBOL(specs, DT_IDENT_PTR),
>  	DT_BPF_SYMBOL(state, DT_IDENT_PTR),
> diff --git a/libdtrace/dt_impl.h b/libdtrace/dt_impl.h
> index c0399a06..6704cfc7 100644
> --- a/libdtrace/dt_impl.h
> +++ b/libdtrace/dt_impl.h
> @@ -287,6 +287,7 @@ struct dtrace_hdl {
>  	dt_strtab_t *dt_ccstab;	/* global string table (during compilation) */
>  	dt_rodata_t *dt_rodata;	/* global read-only data */
>  	uint_t dt_strlen;	/* global string table (runtime) size */
> +	uint_t dt_strmax;	/* global string table (runtime) size, allocated */
>  	uint_t dt_rooffset;	/* read-only data offset */
>  	uint_t dt_rosize;	/* read-only data size */
>  	uint_t dt_zerooffset;	/* zero region, offset */
> @@ -394,6 +395,11 @@ struct dtrace_hdl {
>  	int dt_aggmap_fd;	/* file descriptor for the 'aggs' BPF map */
>  	int dt_genmap_fd;	/* file descriptor for the 'agggen' BPF map */
>  	int dt_cpumap_fd;	/* file descriptor for the 'cpuinfo' BPF map */
> +	int dt_strtabmap_fd;	/* file descriptor for the 'strtab' BPF map */
> +	int dt_probesmap_fd;	/* file descriptor for the 'probes' BPF map */
> +	int dt_usdt_pridsmap_fd; /* file descriptor for the 'usdt_prids' BPF map */
> +	int dt_usdt_pridsmap_ksz; /* 'usdt_prids' BPF map key size */
> +	int dt_usdt_pridsmap_vsz; /* 'usdt_prids' BPF map value size */
>  	dtrace_handle_err_f *dt_errhdlr; /* error handler, if any */
>  	void *dt_errarg;	/* error handler argument */
>  	dtrace_handle_drop_f *dt_drophdlr; /* drop handler, if any */
> diff --git a/libdtrace/dt_prov_uprobe.c b/libdtrace/dt_prov_uprobe.c
> index cbde6a48..d53a1c8d 100644
> --- a/libdtrace/dt_prov_uprobe.c
> +++ b/libdtrace/dt_prov_uprobe.c
> @@ -37,8 +37,10 @@
>  #include "dt_list.h"
>  #include "dt_provider_tp.h"
>  #include "dt_probe.h"
> +#include "dt_program.h"
>  #include "dt_pid.h"
>  #include "dt_string.h"
> +#include "port.h"
>  
>  /* Provider name for the underlying probes. */
>  static const char	prvname[] = "uprobe";
> @@ -63,6 +65,15 @@ typedef struct list_probe {
>  	dt_probe_t	*probe;
>  } list_probe_t;
>  
> +typedef struct usdt_prids_map_key {
> +	pid_t		pid;
> +	dtrace_id_t	uprid;
> +} usdt_prids_map_key_t;
> +typedef struct usdt_prids_map_val {
> +	dtrace_id_t	prid;
> +	long long	mask;
> +} usdt_prids_map_val_t;
> +
>  static const dtrace_pattr_t	pattr = {
>  { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA },
>  { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
> @@ -77,6 +88,9 @@ dt_provimpl_t	dt_usdt;
>  
>  static int populate(dtrace_hdl_t *dtp)
>  {
> +	dtp->dt_usdt_pridsmap_ksz = sizeof(usdt_prids_map_key_t);
> +	dtp->dt_usdt_pridsmap_vsz = sizeof(usdt_prids_map_val_t);
> +
>  	if (dt_provider_create(dtp, dt_uprobe.name, &dt_uprobe, &pattr,
>  			       NULL) == NULL ||
>  	    dt_provider_create(dtp, dt_uprobe_is_enabled.name,
> @@ -123,6 +137,257 @@ static void probe_destroy(dtrace_hdl_t *dtp, void *datap)
>  	free_probe_list(dtp, datap);
>  }
>  
> +/*
> + * Clean up the BPF "usdt_prids" map.
> + */
> +static int
> +purge_BPFmap(dtrace_hdl_t *dtp)
> +{
> +	int			fd = dtp->dt_usdt_pridsmap_fd;
> +	usdt_prids_map_key_t	key, nxt;
> +	usdt_prids_map_val_t	val;
> +
> +	/* Initialize key to a pid/uprid that cannot be found. */
> +	key.pid = 0;
> +	key.uprid = 0;
> +
> +	/* Loop over all entries. */
> +	while (dt_bpf_map_next_key(fd, &key, &nxt) == 0) {
> +		memcpy(&key, &nxt, sizeof(usdt_prids_map_key_t));
> +
> +		if (dt_bpf_map_lookup(fd, &key, &val) == -1)
> +			return dt_set_errno(dtp, EDT_BPF);
> +
> +		/* Check if the process is still running. */
> +		if (!Pexists(key.pid)) {
> +			dt_bpf_map_delete(fd, &key);          // FIXME:  bpf_map_next_key() iteration restarts each time we delete an elem!!!
> +			continue;
> +		}
> +
> +		/*
> +		 * FIXME.  There might be another case, where the process
> +		 * is still running, but some of its USDT probes are gone?
> +		 * So maybe we have to check for the existence of one of
> +		 *     dtrace_probedesc_t *pdp = dtp->dt_probes[val.prid]->desc;
> +		 *     char *prv = ...pdp->prv minus the numerial part;
> +		 *
> +		 *     /run/dtrace/probes/$pid/$pdp->prv/$pdp->mod/$pdp->fun/$pdp->prb
> +		 *     /run/dtrace/stash/dof-pid/$pid/0/parsed/$prv:$pdp->mod:$pdp->fun:$pdp->prb
> +		 *     /run/dtrace/stash/dof-pid/$pid/.../parsed/$prv:$pdp->mod:$pdp->fun:$pdp->prb
> +		 */
> +	}
> +
> +	return 0;
> +}
> +
> +/*
> + * Grow the string table map.  It includes the string table (which might
> + * grow due to new USDT probes).  It also includes the read-only data and
> + * the block of zeros, but these remain fixed.
> + *
> + * FIXME Is there a danger of the BPF map update colliding with map reads?
> + */
> +static int
> +grow_strtab(dtrace_hdl_t *dtp)
> +{
> +	size_t		sz = dtp->dt_zerooffset + dtp->dt_zerosize;
> +	char		*strtab;
> +	uint8_t		*buf, *end;
> +	size_t		strsize = dtp->dt_options[DTRACEOPT_STRSIZE];
> +	uint32_t	key = 0;
> +	int		rc, err;
> +
> +	strtab = dt_zalloc(dtp, sz);
> +#if 0
> +	// FIXME do something
> +	if (strtab == NULL)
> +		return dt_set_errno(dtp, EDT_NOMEM);
> +#endif
> +
> +	/* Copy the string table data. */
> +	dt_strtab_write(dtp->dt_ccstab, (dt_strtab_write_f *)dt_strtab_copystr, strtab);
> +
> +	/* Loop over the string table and truncate strings that are too long. */
> +	buf = (uint8_t *)strtab;
> +	end = buf + dtp->dt_strlen;
> +	while (buf < end) {
> +		uint_t	len = strlen((char *)buf);
> +
> +		if (len > strsize)
> +			buf[strsize] = '\0';
> +
> +		buf += len + 1;
> +	}
> +
> +	/* Copy the read-only data. */
> +	dt_rodata_write(dtp->dt_rodata, (dt_rodata_copy_f *)dt_rodata_copy, strtab + dtp->dt_rooffset);
> +
> +	rc = dt_bpf_map_update(dtp->dt_strtabmap_fd, &key, strtab);
> +	err = errno;
> +	dt_free(dtp, strtab);
> +
> +	// FIXME do something
> +	if (rc == -1) {
> +		strerror(err);
> +		return -1;
> +		// return dt_bpf_error(dtp, "cannot update BPF map 'strtab': %s\n", strerror(err));
> +	}
> +
> +	return 0;
> +}
> +
> +/*
> + * Update the uprobe provider.
> + */
> +static void update_uprobe(dtrace_hdl_t *dtp, void *datap)
> +{
> +	dt_probe_t	*prp;
> +	dt_probe_t	*prp_next;
> +	int		i, prid = dtp->dt_probe_id;
> +	uint_t		old_strlen = dtp->dt_strlen;
> +	dt_pcb_t	pcb;
> +
> +	/* Clear stale pids. */
> +	purge_BPFmap(dtp);
> +
> +	/* Update dt_probes[] and dt_enablings. */
> +	/*
> +	 * pcb is only used inside of dt_pid_error() to get:
> +	 *     pcb->pcb_region
> +	 *     pcb->pcb_filetag
> +	 *     pcb->pcb_fileptr
> +	 * While pcb cannot be NULL, these other things apparently can be.
> +	 */
> +	memset(&pcb, 0, sizeof(dt_pcb_t));
> +	for (i = 0; i < dtp->dt_stmt_nextid; i++)
> +		dt_pid_create_probes(&dtp->dt_stmts[i]->dtsd_ecbdesc->dted_probe, dtp, &pcb, 1);
> +
> +	while (prid < dtp->dt_probe_id) {
> +		dt_bpf_probe_t pinfo;
> +		const dtrace_probedesc_t *pdp = dtp->dt_probes[prid]->desc;
> +		int fd = dtp->dt_probesmap_fd;
> +
> +		dt_probe_enable(dtp, dtp->dt_probes[prid]);
> +
> +		pinfo.prv = dt_strtab_index(dtp->dt_ccstab, pdp->prv);
> +		pinfo.mod = dt_strtab_index(dtp->dt_ccstab, pdp->mod);
> +		pinfo.fun = dt_strtab_index(dtp->dt_ccstab, pdp->fun);
> +		pinfo.prb = dt_strtab_index(dtp->dt_ccstab, pdp->prb);
> +
> +		if (dt_bpf_map_update(fd, &pdp->id, &pinfo) == -1)
> +			assert(0);   // FIXME do something here
> +
> +		prid++;
> +	}
> +
> +	/* Grow the strtab if it has gotten bigger. */
> +	dtp->dt_strlen = dt_strtab_size(dtp->dt_ccstab);
> +
> +	if (dtp->dt_strlen > dtp->dt_strmax)
> +		assert(0);               // FIXME handle this more gracefully
> +	if (dtp->dt_strlen > old_strlen)
> +		grow_strtab(dtp);
> +
> +	/* Review enablings. */
> +	for (prp = dt_list_next(&dtp->dt_enablings); prp != NULL; prp = prp_next) {
> +		pid_t		pid;
> +		list_probe_t	*pup;
> +
> +		prp_next = dt_list_next(prp);
> +
> +		/* Make sure it is an overlying USDT probe. */
> +		if (prp->prov->impl != &dt_usdt)
> +			continue;
> +
> +		/* FIXME passing in NULL pcb and dpr wreaks havoc on error reporting? */
> +		/*
> +		 * Nick writes:
> +		 * This is a general problem with running compiler-adjacent things outside
> +		 * compile time. I think we should adjust dt_pid_error() so that it works
> +		 * with NULL pcb and dpr at once, probably by using the code path for
> +		 * pcb != NULL and augmenting it so that it passes in NULL for the region and
> +		 * filename args and 0 for the lineno if pcb is NULL. (dt_set_errmsg can
> +		 * already handle this case.)
> +		 */
> +		pid = dt_pid_get_pid(prp->desc, dtp, NULL, NULL);
> +
> +		if (!Pexists(pid)) {
> +			/* Remove from enablings. */
> +			dt_list_delete(&dtp->dt_enablings, prp);
> +
> +			/* Make it evident from the probe that it is not in enablings. */
> +			((dt_list_t *)prp)->dl_prev = NULL;
> +			((dt_list_t *)prp)->dl_next = NULL;
> +
> +			/* Free up its list of underlying probes. */
> +			while ((pup = dt_list_next(prp->prv_data)) != NULL) {
> +				dt_list_delete(prp->prv_data, pup);
> +				dt_free(dtp, pup);
> +			}
> +			dt_free(dtp, prp->prv_data);
> +			prp->prv_data = NULL;
> +
> +			/* Free up BPF "probes" map entry. */
> +			dt_bpf_map_delete(dtp->dt_probesmap_fd, &prp->desc->id);
> +
> +			continue;
> +		}
> +
> +		for (pup = prp->prv_data; pup != NULL; pup = dt_list_next(pup)) {
> +			dt_probe_t		*uprp = pup->probe;
> +			long long		mask = 0, bit = 1;
> +			usdt_prids_map_key_t	key;
> +			usdt_prids_map_val_t	val, oval;
> +			dt_uprobe_t		*upp = uprp->prv_data;
> +
> +			/*
> +			 * For is-enabled probes, the bit mask does not matter.
> +			 * It is possible that we have this underlying probe due to
> +			 * an overlying pid-offset probe and that we will not know
> +			 * until later, when some new pid is created, that we also
> +			 * have an overlying USDT is-enabled probe, but missing this
> +			 * optimization opportunity is okay.
> +			 */
> +			if (uprp->prov->impl == &dt_uprobe && !(upp->flags & PP_IS_ENABLED)) {
> +				int n;
> +
> +				for (n = 0; n < dtp->dt_stmt_nextid; n++) {
> +					if (dt_gmatch(prp->desc->prv, dtp->dt_stmts[n]->dtsd_ecbdesc->dted_probe.prv) &&
> +					    dt_gmatch(prp->desc->mod, dtp->dt_stmts[n]->dtsd_ecbdesc->dted_probe.mod) &&
> +					    dt_gmatch(prp->desc->fun, dtp->dt_stmts[n]->dtsd_ecbdesc->dted_probe.fun) &&
> +					    dt_gmatch(prp->desc->prb, dtp->dt_stmts[n]->dtsd_ecbdesc->dted_probe.prb))
> +						mask |= bit;
> +
> +					bit <<= 1;
> +				}
> +			}
> +
> +			key.pid = pid;
> +			key.uprid = uprp->desc->id;
> +
> +			val.prid = prp->desc->id;
> +			val.mask = mask;
> +
> +			/* linux/bpf.h says error is -1, but it could be -2 */
> +			if (dt_bpf_map_lookup(dtp->dt_usdt_pridsmap_fd, &key, &oval) < 0) {
> +				/*
> +				 * Set the map entry.
> +				 */
> +				// FIXME Check return value, but how should errors be handled?
> +				dt_bpf_map_update(dtp->dt_usdt_pridsmap_fd, &key, &val);
> +			} else if (val.prid != oval.prid || val.mask != oval.mask) {
> +				/*
> +				 * This can happen when two overlying probes map to the same underlying probe for the same pid.
> +				 * E.g., pid:::entry and pid:::0, or pid:::$offset and usdt:::.
> +				 */
> +			} else {
> +				/*
> +				 * Nothing to do, it already is in the map.
> +				 */
> +			}
> +		}
> +	}
> +}
>  
>  /*
>   * Look up or create an underlying (real) probe, corresponding directly to a
> @@ -782,6 +1047,7 @@ dt_provimpl_t	dt_uprobe = {
>  	.probe_info	= &probe_info,
>  	.detach		= &detach,
>  	.probe_destroy	= &probe_destroy_underlying,
> +	.update		= &update_uprobe,
>  };
>  
>  /*
> diff --git a/libdtrace/dt_work.c b/libdtrace/dt_work.c
> index 599e2ae3..79da46fa 100644
> --- a/libdtrace/dt_work.c
> +++ b/libdtrace/dt_work.c
> @@ -373,6 +373,8 @@ dtrace_work(dtrace_hdl_t *dtp, FILE *fp, dtrace_consume_probe_f *pfunc,
>  	dtrace_workstatus_t	rval;
>  	int			gen;
>  
> +dt_uprobe.update(dtp, NULL);
> +
>  	switch (dtrace_status(dtp)) {
>  	case DTRACE_STATUS_EXITED:
>  	case DTRACE_STATUS_STOPPED:
> -- 
> 2.43.5
> 



More information about the DTrace-devel mailing list