[DTrace-devel] [PATCH v4 2/5] support stapsdt ELF-note-defined static probes

Kris Van Hees kris.van.hees at oracle.com
Tue Jul 1 19:42:09 UTC 2025


I am looking a bit deeper into this patch.  See my other email concerning the
args test that is not passing because it fails to get the function name.  I
believe the problem is that the code here does not handle PIE-compiled code
(which is default for e.g. Debian, but not OL).

Also, I am trying to see whether we can integrate the parsing of the note
format in usdt_parser_notes.c so that we can centralize all ELF notes parsing
related to USDT in a single location.

Maybe it would be better, maybe not - I'm evaluating.

Another thing...  you are performing addr-to-map lookups for every address
(i.e. for every note) even though you are processing notes for a single
mapping in the loop, so the map should be the same for all the addresses,
right?  I don't think that the ELF notes for mapping A can refer to probes
(by address) that belong in mapping B - so that can be optimized I think.

Do you think the semaphore can be implemented as well, since that is somewhat
similar to is-enabled probes I think?

On Mon, Jun 23, 2025 at 11:13:07AM +0100, Alan Maguire wrote:
> As well as using dtrace -G to generate USDT probes, programs and
> libraries may have added static probes via stapsdt ELF notes.
> 
> Read ELF notes from binaries from /proc/ maps associated with processes
> and parse them to retrieve uprobe address and argument-related info
> to create the associated uprobe.
> 
> Probe arguments can be either constants, register values or dereferences
> or dereferences from register values (plus offset), identical to the
> updated USDT ELF note handling.
> 
> A new provider - stapsdt - implements this support, as stapsdt probes do
> not dynamically register themselves with DTrace.  This makes them less
> powerful than DTrace-based USDT probes, but they do exist in programs and
> libraries so should be supported.
> 
> As well as supporting ELF-note stapsdt defined probes in programs and
> libraries, this patch supports dynamically-created probes that
> are created via libstapsdt [1].  libstapsdt allows dynamic languages
> like python to declare and fire probes by dynamically creating
> a memfd-based shared library containing ELF notes for the probes.
> With these changes we can also trace these probes.  This is very
> useful since libstapsdt has python, NodeJS, go and luaJIT bindings.
> 
> [1] https://github.com/linux-usdt/libstapsdt
> 
> Signed-off-by: Alan Maguire <alan.maguire at oracle.com>
> ---
>  include/dtrace/pid.h       |   1 +
>  libdtrace/dt_pid.c         | 288 +++++++++++++++++++++++++++++++++++++
>  libdtrace/dt_prov_uprobe.c |  43 +++++-
>  3 files changed, 328 insertions(+), 4 deletions(-)
> 
> diff --git a/include/dtrace/pid.h b/include/dtrace/pid.h
> index 8d4b6432..99093bc9 100644
> --- a/include/dtrace/pid.h
> +++ b/include/dtrace/pid.h
> @@ -24,6 +24,7 @@ typedef enum pid_probetype {
>  	DTPPT_OFFSETS,
>  	DTPPT_ABSOFFSETS,
>  	DTPPT_USDT,
> +	DTPPT_STAPSDT,
>  	DTPPT_IS_ENABLED
>  } pid_probetype_t;
>  
> diff --git a/libdtrace/dt_pid.c b/libdtrace/dt_pid.c
> index d12b7919..6581b087 100644
> --- a/libdtrace/dt_pid.c
> +++ b/libdtrace/dt_pid.c
> @@ -38,6 +38,9 @@
>  #include <dt_pid.h>
>  #include <dt_string.h>
>  
> +#define SEC_STAPSDT_NOTE	".note.stapsdt"
> +#define NAME_STAPSDT_NOTE	"stapsdt"
> +
>  /*
>   * Information on a PID probe.
>   */
> @@ -1262,6 +1265,288 @@ dt_pid_create_pid_probes(dtrace_probedesc_t *pdp, dtrace_hdl_t *dtp, dt_pcb_t *p
>  	return err;
>  }
>  
> +static int
> +dt_stapsdt_parse(dtrace_hdl_t *dtp, dt_proc_t *dpr, dtrace_probedesc_t *pdp,
> +		 dt_pcb_t *pcb, const dt_provider_t *pvp, char *path,
> +		 unsigned long base_addr)
> +{
> +	Elf *elf = NULL;
> +	Elf_Scn *scn = NULL;
> +	GElf_Shdr shdr;
> +	GElf_Nhdr nhdr;
> +	size_t shstrndx, noff, doff, off, n;
> +	Elf_Data *data;
> +	GElf_Ehdr ehdr;
> +	int i, err = 0;
> +	int fd = -1;
> +	char *mod;
> +
> +	fd = open(path, O_RDONLY);
> +	if (fd < 0) {
> +		dt_pid_error(dtp, pcb, dpr, D_PROC_USDT,
> +			     "Cannot open %s: %s\n",
> +			     path, strerror(errno));
> +		return -1;
> +	}
> +	mod = strrchr(path, '/');
> +	if (mod)
> +		mod++;
> +	else
> +		mod = path;
> +
> +	elf = elf_begin(fd, ELF_C_READ_MMAP, NULL);   // ELF_C_READ ?
> +	assert(elf_kind(elf) == ELF_K_ELF);
> +	elf_getshdrstrndx(elf, &shstrndx);
> +
> +	if (gelf_getehdr(elf, &ehdr)) {
> +		switch (ehdr.e_type) {
> +		case ET_EXEC:
> +			/* binary does not require base addr adjustment */
> +			base_addr = 0;
> +			break;
> +		case ET_DYN:
> +			break;
> +		default:
> +			dt_dprintf("unexpected ELF hdr type 0x%x for '%s'\n",
> +				   ehdr.e_type, path);
> +			err = -1;
> +			goto out;
> +		}
> +	}
> +
> +	while ((scn = elf_nextscn(elf, scn)) != NULL) {
> +		char *secname;
> +
> +		assert(gelf_getshdr(scn, &shdr) != NULL);
> +
> +		secname = elf_strptr(elf, shstrndx, shdr.sh_name);
> +		if (strcmp(secname, SEC_STAPSDT_NOTE) == 0 &&
> +		    shdr.sh_type == SHT_NOTE)
> +			break;
> +	}
> +	/* No ELF notes, just bail. */
> +	if (scn == NULL)
> +		goto out;
> +	data = elf_getdata(scn, 0);
> +	for (off = 0;
> +	     (off = gelf_getnote(data, off, &nhdr, &noff, &doff)) > 0;) {
> +		pid_probespec_t psp = {0};
> +		char *prv, *prb;
> +		const char *fun;
> +		char *dbuf = (char *)data->d_buf;
> +		long *addrs = data->d_buf + doff; /* 3 addrs are loc/base/semaphore */
> +		GElf_Sym sym;
> +		const prmap_t *pmp;
> +
> +		if (strncmp(dbuf + noff, NAME_STAPSDT_NOTE, nhdr.n_namesz) != 0)
> +			continue;
> +		prv = dbuf + doff + (3*sizeof(long));
> +		/* ensure prv/prb is null-terminated */
> +		if (strlen(prv) >= nhdr.n_descsz)
> +			continue;
> +		prb = prv + strlen(prv) + 1;
> +		if (strlen(prb) >= nhdr.n_descsz)
> +			continue;
> +		if (strncmp(pdp->prv, prv, strlen(prv)) != 0)
> +			continue;
> +		/* skip unmatched, non-wildcarded probes */
> +		if (strcmp(pdp->prb, "*") != 0 &&
> +		    (strlen(pdp->prb) > 0 && strcmp(pdp->prb, prb) != 0))
> +			continue;
> +		if (prb + strlen(prb) + 1 < dbuf + doff + nhdr.n_descsz)
> +			psp.pps_sargv = prb + strlen(prb) + 1;
> +
> +		psp.pps_type = DTPPT_STAPSDT;
> +		psp.pps_prv = prv;
> +		psp.pps_mod = mod;
> +		psp.pps_prb = prb;
> +		if (elf_getphdrnum(elf, &n))
> +			continue;
> +		for (i = 0; i < n; i++) {
> +			GElf_Phdr phdr;
> +
> +			if (!gelf_getphdr(elf, i, &phdr))
> +				break;
> +
> +			if (addrs[0] < phdr.p_vaddr ||
> +			    addrs[0] > phdr.p_vaddr + phdr.p_memsz)
> +				continue;
> +			if (base_addr)
> +				psp.pps_off = addrs[0];
> +			else
> +				psp.pps_off = addrs[0] - phdr.p_vaddr + phdr.p_offset;
> +			break;
> +		}
> +		if (!psp.pps_off)
> +			continue;
> +		psp.pps_nameoff = 0;
> +
> +		pmp = Paddr_to_map(dpr->dpr_proc, base_addr + addrs[0]);
> +		if (!pmp) {
> +			dt_dprintf("%i: cannot determine 0x%lx's mapping\n",
> +				   Pgetpid(dpr->dpr_proc), psp.pps_off);
> +			continue;
> +		}
> +		psp.pps_fn = Pmap_mapfile_name(dpr->dpr_proc, pmp);
> +		if (psp.pps_fn == NULL) {
> +			dt_pid_error(dtp, pcb, dpr, D_PROC_USDT,
> +				     "Cannot get name of mapping containing probe %s for pid %d\n",
> +				     psp.pps_prb, dpr->dpr_pid);
> +			err = -1;
> +			break;
> +		}
> +		if (dt_Plookup_by_addr(dtp, dpr->dpr_pid, base_addr + addrs[0],
> +				       &fun, &sym) == 0)
> +			psp.pps_fun = (char *)fun;
> +		else
> +			psp.pps_fun = "";
> +		psp.pps_dev = pmp->pr_dev;
> +		psp.pps_inum = pmp->pr_inum;
> +		psp.pps_pid = dpr->dpr_pid;
> +		psp.pps_nameoff = 0;
> +
> +		if (pvp->impl->provide_probe(dtp, &psp) < 0) {
> +			dt_pid_error(dtp, pcb, dpr, D_PROC_USDT,
> +				     "failed to instantiate probe %s for pid %d: %s",
> +				     psp.pps_prb, psp.pps_pid,
> +			dtrace_errmsg(dtp, dtrace_errno(dtp)));
> +			err = -1;
> +		}
> +		free(psp.pps_fn);
> +		if (err == -1)
> +			break;
> +	}
> +
> +out:
> +	elf_end(elf);
> +	close(fd);
> +	return err;
> +}
> +
> +static void
> +dt_pid_create_stapsdt_probes_proc(dtrace_probedesc_t *pdp, dtrace_hdl_t *dtp,
> +				  dt_pcb_t *pcb, const dt_provider_t *pvp,
> +				  dt_proc_t *dpr, const char *proc_map)
> +{
> +	char line[1024];
> +	FILE *fp = NULL;
> +	pid_t pid;
> +
> +	assert(dpr != NULL);
> +
> +	pid = dpr->dpr_pid;
> +	fp = fopen(proc_map, "r");
> +	if (!fp)
> +		return;
> +
> +	while (fgets(line, sizeof(line) - 1, fp) != NULL) {
> +		long addr_start, addr_end, file_offset;
> +		long dev_major, dev_minor;
> +		unsigned long inode;
> +		char name[PATH_MAX + 1];
> +		char path[PATH_MAX + 1];
> +		char perm[5];
> +		int ret;
> +
> +		ret = sscanf(line,
> +			     "%lx-%lx %4s %lx %lx:%lx %lu %[^\n]",
> +			     &addr_start, &addr_end, perm, &file_offset,
> +			     &dev_major, &dev_minor, &inode, name);
> +		if (ret != 8 || !strchr(perm, 'x') || strchr(name, '[') != NULL)
> +			continue;
> +
> +		/* libstapsdt uses an memfd-based library to dynamically create
> +		 * stapsdt notes for dynamic languages like python; we need
> +		 * the associated /proc/<pid>/fds/ fd to read these notes.
> +		 */
> +		if (strncmp(name, "/memfd:", strlen("/memfd:")) == 0) {
> +			DIR *d;
> +			struct dirent *dirent;
> +			char *deleted;
> +
> +			deleted = strstr(name, " (deleted)");
> +			if (deleted)
> +				*deleted = '\0';
> +			snprintf(path, sizeof(path), "/proc/%d/fd", pid);
> +			d = opendir(path);
> +			if (d == NULL)
> +				continue;
> +			while ((dirent = readdir(d)) != NULL) {
> +				struct stat s;
> +
> +				snprintf(path, sizeof(path), "/proc/%d/fd/%s",
> +					 pid, dirent->d_name);
> +				if (stat(path, &s) != 0 || s.st_ino != inode)
> +					continue;
> +				if (dt_stapsdt_parse(dtp, dpr, pdp, pcb, pvp,
> +						     path, addr_start) != 0)
> +					break;
> +			}
> +		} else {
> +			if (dt_stapsdt_parse(dtp, dpr, pdp, pcb, pvp, name,
> +					     addr_start) != 0)
> +				break;
> +		}
> +	}
> +	fclose(fp);
> +}
> +
> +static int
> +dt_pid_create_stapsdt_probes(dtrace_probedesc_t *pdp, dtrace_hdl_t *dtp, dt_pcb_t *pcb)
> +{
> +	int i, nmatches = 0, err = 0;
> +	const dt_provider_t *pvp;
> +	char *globpat = NULL;
> +	const char *pidstr;
> +	glob_t globbuf;
> +	bool wildcard;
> +	pid_t pid;
> +
> +	assert(pcb != NULL);
> +
> +	pidstr = &pdp->prv[strlen(pdp->prv)];
> +
> +	while (isdigit(*(pidstr - 1)) || *(pidstr - 1) == '*')
> +		pidstr--;
> +	if (strlen(pidstr) == 0)
> +		return 0;
> +	wildcard = strchr(pidstr, '*');
> +	asprintf(&globpat, "/proc/%s/maps", pidstr);
> +	nmatches = glob(globpat, 0, NULL, &globbuf) ? 0 : globbuf.gl_pathc;
> +	pvp = dt_provider_lookup(dtp, "stapsdt");
> +	assert(pvp != NULL);
> +
> +	for (i = 0; i < nmatches; i++) {
> +		dt_proc_t *dpr = NULL;
> +
> +		pidstr = globbuf.gl_pathv[i] + strlen("/proc/");
> +		pid = atoll(pidstr);
> +		if (pid <= 0)
> +			continue;
> +		if (dt_proc_grab_lock(dtp, pid, DTRACE_PROC_WAITING |
> +				      DTRACE_PROC_SHORTLIVED) < 0) {
> +			if (wildcard)
> +				continue;
> +			dt_pid_error(dtp, pcb, NULL, D_PROC_GRAB,
> +				     "failed to grab process %d",
> +				     (int)pid);
> +			err = 1;
> +			break;
> +		}
> +		dpr = dt_proc_lookup(dtp, pid);
> +		if (dpr) {
> +			dt_pid_create_stapsdt_probes_proc(pdp, dtp, pcb,
> +							  pvp, dpr,
> +							  globbuf.gl_pathv[i]);
> +			dt_proc_release_unlock(dtp, pid);
> +		}
> +	}
> +	free(globpat);
> +	globfree(&globbuf);
> +
> +	return err;
> +}
> +
>  int
>  dt_pid_create_usdt_probes(dtrace_probedesc_t *pdp, dtrace_hdl_t *dtp, dt_pcb_t *pcb)
>  {
> @@ -1319,6 +1604,9 @@ dt_pid_create_usdt_probes(dtrace_probedesc_t *pdp, dtrace_hdl_t *dtp, dt_pcb_t *
>  	free(globpat);
>  	globfree(&globbuf);
>  
> +	if (err == 0)
> +		err = dt_pid_create_stapsdt_probes(pdp, dtp, pcb);
> +
>  	/* If no errors, report success. */
>  	if (err == 0)
>  		return 0;
> diff --git a/libdtrace/dt_prov_uprobe.c b/libdtrace/dt_prov_uprobe.c
> index 2cbd8910..b91cf810 100644
> --- a/libdtrace/dt_prov_uprobe.c
> +++ b/libdtrace/dt_prov_uprobe.c
> @@ -313,12 +313,15 @@ static const dtrace_pattr_t	pattr = {
>  
>  dt_provimpl_t	dt_pid;
>  dt_provimpl_t	dt_usdt;
> +dt_provimpl_t	dt_stapsdt;
>  
>  static int populate(dtrace_hdl_t *dtp)
>  {
>  	if (dt_provider_create(dtp, dt_uprobe.name, &dt_uprobe, &pattr,
>  			       NULL) == NULL ||
>  	    dt_provider_create(dtp, dt_pid.name, &dt_pid, &pattr,
> +			       NULL) == NULL ||
> +	    dt_provider_create(dtp, dt_stapsdt.name, &dt_stapsdt, &pattr,
>  			       NULL) == NULL)
>  		return -1;			/* errno already set */
>  
> @@ -477,8 +480,8 @@ clean_usdt_probes(dtrace_hdl_t *dtp)
>  
>  		prp_next = dt_list_next(prp);
>  
> -		/* Make sure it is an overlying USDT probe. */
> -		if (prp->prov->impl != &dt_usdt)
> +		/* Make sure it is an overlying USDT, stapsdt probe. */
> +		if (prp->prov->impl != &dt_usdt && prp->prov->impl != &dt_stapsdt)
>  			continue;
>  
>  		/* FIXME passing in NULL pcb and dpr wreaks havoc on error reporting? */
> @@ -637,6 +640,7 @@ static int add_probe_uprobe(dtrace_hdl_t *dtp, dt_probe_t *prp)
>  	return 0;
>  }
>  
> +/* shared between usdt, stapsdt probes */
>  static int add_probe_usdt(dtrace_hdl_t *dtp, dt_probe_t *prp)
>  {
>  	char				probnam[DTRACE_FULLNAMELEN], *p;
> @@ -890,6 +894,7 @@ static dt_probe_t *create_underlying(dtrace_hdl_t *dtp,
>  	case DTPPT_OFFSETS:
>  	case DTPPT_ABSOFFSETS:
>  	case DTPPT_USDT:
> +	case DTPPT_STAPSDT:
>  		snprintf(prb, sizeof(prb), "%lx", psp->pps_off);
>  		break;
>  	default:
> @@ -904,7 +909,7 @@ static dt_probe_t *create_underlying(dtrace_hdl_t *dtp,
>  	pd.prb = prb;
>  
>  	dt_dprintf("Providing underlying probe %s:%s:%s:%s @ %lx\n", psp->pps_prv,
> -		   psp->pps_mod, psp->pps_fn, psp->pps_prb, psp->pps_off);
> +		   psp->pps_mod, psp->pps_fun, psp->pps_prb, psp->pps_off);
>  	uprp = dt_probe_lookup(dtp, &pd);
>  	if (uprp == NULL) {
>  		dt_provider_t	*pvp;
> @@ -1108,11 +1113,24 @@ static int provide_usdt_probe(dtrace_hdl_t *dtp, const pid_probespec_t *psp)
>  	return provide_probe(dtp, psp, psp->pps_prb, &dt_usdt, PP_IS_FUNCALL);
>  }
>  
> +static int provide_stapsdt_probe(dtrace_hdl_t *dtp, const pid_probespec_t *psp)
> +{
> +	if (psp->pps_type != DTPPT_STAPSDT &&
> +	    psp->pps_type != DTPPT_IS_ENABLED) {
> +		dt_dprintf("pid: unknown stapsdt probe type %i\n", psp->pps_type);
> +		return -1;
> +	}
> +
> +	return provide_probe(dtp, psp, psp->pps_prb, &dt_stapsdt, PP_IS_FUNCALL);
> +}
> +
> +
>  static void enable(dtrace_hdl_t *dtp, dt_probe_t *prp, int is_usdt)
>  {
>  	const list_probe_t	*pup;
>  
> -	assert(prp->prov->impl == &dt_pid || prp->prov->impl == &dt_usdt);
> +	assert(prp->prov->impl == &dt_pid || prp->prov->impl == &dt_usdt ||
> +	       prp->prov->impl == &dt_stapsdt);
>  
>  	/*
>  	 * We need to enable the underlying probes (if not enabled yet).
> @@ -1144,6 +1162,11 @@ static void enable_usdt(dtrace_hdl_t *dtp, dt_probe_t *prp)
>  	enable(dtp, prp, 1);
>  }
>  
> +static void enable_stapsdt(dtrace_hdl_t *dtp, dt_probe_t *prp)
> +{
> +	enable(dtp, prp, 1);
> +}
> +
>  /*
>   * Generate code that populates, counts the probe arguments.
>   */
> @@ -1875,3 +1898,15 @@ dt_provimpl_t	dt_usdt = {
>  	.discover	= &discover,
>  	.add_probe	= &add_probe_usdt,
>  };
> +
> +/*
> + * Used for stapsdt probes.
> + */
> +dt_provimpl_t	dt_stapsdt = {
> +	.name		= "stapsdt",
> +	.prog_type	= BPF_PROG_TYPE_UNSPEC,
> +	.provide_probe	= &provide_stapsdt_probe,
> +	.enable		= &enable_stapsdt,
> +	.probe_destroy	= &probe_destroy,
> +	.add_probe	= &add_probe_usdt,
> +};
> -- 
> 2.43.5
> 



More information about the DTrace-devel mailing list