[DTrace-devel] [PATCH v4 2/5] support stapsdt ELF-note-defined static probes

Alan Maguire alan.maguire at oracle.com
Mon Jun 23 10:13:07 UTC 2025


As well as using dtrace -G to generate USDT probes, programs and
libraries may have added static probes via stapsdt ELF notes.

Read ELF notes from binaries from /proc/ maps associated with processes
and parse them to retrieve uprobe address and argument-related info
to create the associated uprobe.

Probe arguments can be either constants, register values or dereferences
or dereferences from register values (plus offset), identical to the
updated USDT ELF note handling.

A new provider - stapsdt - implements this support, as stapsdt probes do
not dynamically register themselves with DTrace.  This makes them less
powerful than DTrace-based USDT probes, but they do exist in programs and
libraries so should be supported.

As well as supporting ELF-note stapsdt defined probes in programs and
libraries, this patch supports dynamically-created probes that
are created via libstapsdt [1].  libstapsdt allows dynamic languages
like python to declare and fire probes by dynamically creating
a memfd-based shared library containing ELF notes for the probes.
With these changes we can also trace these probes.  This is very
useful since libstapsdt has python, NodeJS, go and luaJIT bindings.

[1] https://github.com/linux-usdt/libstapsdt

Signed-off-by: Alan Maguire <alan.maguire at oracle.com>
---
 include/dtrace/pid.h       |   1 +
 libdtrace/dt_pid.c         | 288 +++++++++++++++++++++++++++++++++++++
 libdtrace/dt_prov_uprobe.c |  43 +++++-
 3 files changed, 328 insertions(+), 4 deletions(-)

diff --git a/include/dtrace/pid.h b/include/dtrace/pid.h
index 8d4b6432..99093bc9 100644
--- a/include/dtrace/pid.h
+++ b/include/dtrace/pid.h
@@ -24,6 +24,7 @@ typedef enum pid_probetype {
 	DTPPT_OFFSETS,
 	DTPPT_ABSOFFSETS,
 	DTPPT_USDT,
+	DTPPT_STAPSDT,
 	DTPPT_IS_ENABLED
 } pid_probetype_t;
 
diff --git a/libdtrace/dt_pid.c b/libdtrace/dt_pid.c
index d12b7919..6581b087 100644
--- a/libdtrace/dt_pid.c
+++ b/libdtrace/dt_pid.c
@@ -38,6 +38,9 @@
 #include <dt_pid.h>
 #include <dt_string.h>
 
+#define SEC_STAPSDT_NOTE	".note.stapsdt"
+#define NAME_STAPSDT_NOTE	"stapsdt"
+
 /*
  * Information on a PID probe.
  */
@@ -1262,6 +1265,288 @@ dt_pid_create_pid_probes(dtrace_probedesc_t *pdp, dtrace_hdl_t *dtp, dt_pcb_t *p
 	return err;
 }
 
+static int
+dt_stapsdt_parse(dtrace_hdl_t *dtp, dt_proc_t *dpr, dtrace_probedesc_t *pdp,
+		 dt_pcb_t *pcb, const dt_provider_t *pvp, char *path,
+		 unsigned long base_addr)
+{
+	Elf *elf = NULL;
+	Elf_Scn *scn = NULL;
+	GElf_Shdr shdr;
+	GElf_Nhdr nhdr;
+	size_t shstrndx, noff, doff, off, n;
+	Elf_Data *data;
+	GElf_Ehdr ehdr;
+	int i, err = 0;
+	int fd = -1;
+	char *mod;
+
+	fd = open(path, O_RDONLY);
+	if (fd < 0) {
+		dt_pid_error(dtp, pcb, dpr, D_PROC_USDT,
+			     "Cannot open %s: %s\n",
+			     path, strerror(errno));
+		return -1;
+	}
+	mod = strrchr(path, '/');
+	if (mod)
+		mod++;
+	else
+		mod = path;
+
+	elf = elf_begin(fd, ELF_C_READ_MMAP, NULL);   // ELF_C_READ ?
+	assert(elf_kind(elf) == ELF_K_ELF);
+	elf_getshdrstrndx(elf, &shstrndx);
+
+	if (gelf_getehdr(elf, &ehdr)) {
+		switch (ehdr.e_type) {
+		case ET_EXEC:
+			/* binary does not require base addr adjustment */
+			base_addr = 0;
+			break;
+		case ET_DYN:
+			break;
+		default:
+			dt_dprintf("unexpected ELF hdr type 0x%x for '%s'\n",
+				   ehdr.e_type, path);
+			err = -1;
+			goto out;
+		}
+	}
+
+	while ((scn = elf_nextscn(elf, scn)) != NULL) {
+		char *secname;
+
+		assert(gelf_getshdr(scn, &shdr) != NULL);
+
+		secname = elf_strptr(elf, shstrndx, shdr.sh_name);
+		if (strcmp(secname, SEC_STAPSDT_NOTE) == 0 &&
+		    shdr.sh_type == SHT_NOTE)
+			break;
+	}
+	/* No ELF notes, just bail. */
+	if (scn == NULL)
+		goto out;
+	data = elf_getdata(scn, 0);
+	for (off = 0;
+	     (off = gelf_getnote(data, off, &nhdr, &noff, &doff)) > 0;) {
+		pid_probespec_t psp = {0};
+		char *prv, *prb;
+		const char *fun;
+		char *dbuf = (char *)data->d_buf;
+		long *addrs = data->d_buf + doff; /* 3 addrs are loc/base/semaphore */
+		GElf_Sym sym;
+		const prmap_t *pmp;
+
+		if (strncmp(dbuf + noff, NAME_STAPSDT_NOTE, nhdr.n_namesz) != 0)
+			continue;
+		prv = dbuf + doff + (3*sizeof(long));
+		/* ensure prv/prb is null-terminated */
+		if (strlen(prv) >= nhdr.n_descsz)
+			continue;
+		prb = prv + strlen(prv) + 1;
+		if (strlen(prb) >= nhdr.n_descsz)
+			continue;
+		if (strncmp(pdp->prv, prv, strlen(prv)) != 0)
+			continue;
+		/* skip unmatched, non-wildcarded probes */
+		if (strcmp(pdp->prb, "*") != 0 &&
+		    (strlen(pdp->prb) > 0 && strcmp(pdp->prb, prb) != 0))
+			continue;
+		if (prb + strlen(prb) + 1 < dbuf + doff + nhdr.n_descsz)
+			psp.pps_sargv = prb + strlen(prb) + 1;
+
+		psp.pps_type = DTPPT_STAPSDT;
+		psp.pps_prv = prv;
+		psp.pps_mod = mod;
+		psp.pps_prb = prb;
+		if (elf_getphdrnum(elf, &n))
+			continue;
+		for (i = 0; i < n; i++) {
+			GElf_Phdr phdr;
+
+			if (!gelf_getphdr(elf, i, &phdr))
+				break;
+
+			if (addrs[0] < phdr.p_vaddr ||
+			    addrs[0] > phdr.p_vaddr + phdr.p_memsz)
+				continue;
+			if (base_addr)
+				psp.pps_off = addrs[0];
+			else
+				psp.pps_off = addrs[0] - phdr.p_vaddr + phdr.p_offset;
+			break;
+		}
+		if (!psp.pps_off)
+			continue;
+		psp.pps_nameoff = 0;
+
+		pmp = Paddr_to_map(dpr->dpr_proc, base_addr + addrs[0]);
+		if (!pmp) {
+			dt_dprintf("%i: cannot determine 0x%lx's mapping\n",
+				   Pgetpid(dpr->dpr_proc), psp.pps_off);
+			continue;
+		}
+		psp.pps_fn = Pmap_mapfile_name(dpr->dpr_proc, pmp);
+		if (psp.pps_fn == NULL) {
+			dt_pid_error(dtp, pcb, dpr, D_PROC_USDT,
+				     "Cannot get name of mapping containing probe %s for pid %d\n",
+				     psp.pps_prb, dpr->dpr_pid);
+			err = -1;
+			break;
+		}
+		if (dt_Plookup_by_addr(dtp, dpr->dpr_pid, base_addr + addrs[0],
+				       &fun, &sym) == 0)
+			psp.pps_fun = (char *)fun;
+		else
+			psp.pps_fun = "";
+		psp.pps_dev = pmp->pr_dev;
+		psp.pps_inum = pmp->pr_inum;
+		psp.pps_pid = dpr->dpr_pid;
+		psp.pps_nameoff = 0;
+
+		if (pvp->impl->provide_probe(dtp, &psp) < 0) {
+			dt_pid_error(dtp, pcb, dpr, D_PROC_USDT,
+				     "failed to instantiate probe %s for pid %d: %s",
+				     psp.pps_prb, psp.pps_pid,
+			dtrace_errmsg(dtp, dtrace_errno(dtp)));
+			err = -1;
+		}
+		free(psp.pps_fn);
+		if (err == -1)
+			break;
+	}
+
+out:
+	elf_end(elf);
+	close(fd);
+	return err;
+}
+
+static void
+dt_pid_create_stapsdt_probes_proc(dtrace_probedesc_t *pdp, dtrace_hdl_t *dtp,
+				  dt_pcb_t *pcb, const dt_provider_t *pvp,
+				  dt_proc_t *dpr, const char *proc_map)
+{
+	char line[1024];
+	FILE *fp = NULL;
+	pid_t pid;
+
+	assert(dpr != NULL);
+
+	pid = dpr->dpr_pid;
+	fp = fopen(proc_map, "r");
+	if (!fp)
+		return;
+
+	while (fgets(line, sizeof(line) - 1, fp) != NULL) {
+		long addr_start, addr_end, file_offset;
+		long dev_major, dev_minor;
+		unsigned long inode;
+		char name[PATH_MAX + 1];
+		char path[PATH_MAX + 1];
+		char perm[5];
+		int ret;
+
+		ret = sscanf(line,
+			     "%lx-%lx %4s %lx %lx:%lx %lu %[^\n]",
+			     &addr_start, &addr_end, perm, &file_offset,
+			     &dev_major, &dev_minor, &inode, name);
+		if (ret != 8 || !strchr(perm, 'x') || strchr(name, '[') != NULL)
+			continue;
+
+		/* libstapsdt uses an memfd-based library to dynamically create
+		 * stapsdt notes for dynamic languages like python; we need
+		 * the associated /proc/<pid>/fds/ fd to read these notes.
+		 */
+		if (strncmp(name, "/memfd:", strlen("/memfd:")) == 0) {
+			DIR *d;
+			struct dirent *dirent;
+			char *deleted;
+
+			deleted = strstr(name, " (deleted)");
+			if (deleted)
+				*deleted = '\0';
+			snprintf(path, sizeof(path), "/proc/%d/fd", pid);
+			d = opendir(path);
+			if (d == NULL)
+				continue;
+			while ((dirent = readdir(d)) != NULL) {
+				struct stat s;
+
+				snprintf(path, sizeof(path), "/proc/%d/fd/%s",
+					 pid, dirent->d_name);
+				if (stat(path, &s) != 0 || s.st_ino != inode)
+					continue;
+				if (dt_stapsdt_parse(dtp, dpr, pdp, pcb, pvp,
+						     path, addr_start) != 0)
+					break;
+			}
+		} else {
+			if (dt_stapsdt_parse(dtp, dpr, pdp, pcb, pvp, name,
+					     addr_start) != 0)
+				break;
+		}
+	}
+	fclose(fp);
+}
+
+static int
+dt_pid_create_stapsdt_probes(dtrace_probedesc_t *pdp, dtrace_hdl_t *dtp, dt_pcb_t *pcb)
+{
+	int i, nmatches = 0, err = 0;
+	const dt_provider_t *pvp;
+	char *globpat = NULL;
+	const char *pidstr;
+	glob_t globbuf;
+	bool wildcard;
+	pid_t pid;
+
+	assert(pcb != NULL);
+
+	pidstr = &pdp->prv[strlen(pdp->prv)];
+
+	while (isdigit(*(pidstr - 1)) || *(pidstr - 1) == '*')
+		pidstr--;
+	if (strlen(pidstr) == 0)
+		return 0;
+	wildcard = strchr(pidstr, '*');
+	asprintf(&globpat, "/proc/%s/maps", pidstr);
+	nmatches = glob(globpat, 0, NULL, &globbuf) ? 0 : globbuf.gl_pathc;
+	pvp = dt_provider_lookup(dtp, "stapsdt");
+	assert(pvp != NULL);
+
+	for (i = 0; i < nmatches; i++) {
+		dt_proc_t *dpr = NULL;
+
+		pidstr = globbuf.gl_pathv[i] + strlen("/proc/");
+		pid = atoll(pidstr);
+		if (pid <= 0)
+			continue;
+		if (dt_proc_grab_lock(dtp, pid, DTRACE_PROC_WAITING |
+				      DTRACE_PROC_SHORTLIVED) < 0) {
+			if (wildcard)
+				continue;
+			dt_pid_error(dtp, pcb, NULL, D_PROC_GRAB,
+				     "failed to grab process %d",
+				     (int)pid);
+			err = 1;
+			break;
+		}
+		dpr = dt_proc_lookup(dtp, pid);
+		if (dpr) {
+			dt_pid_create_stapsdt_probes_proc(pdp, dtp, pcb,
+							  pvp, dpr,
+							  globbuf.gl_pathv[i]);
+			dt_proc_release_unlock(dtp, pid);
+		}
+	}
+	free(globpat);
+	globfree(&globbuf);
+
+	return err;
+}
+
 int
 dt_pid_create_usdt_probes(dtrace_probedesc_t *pdp, dtrace_hdl_t *dtp, dt_pcb_t *pcb)
 {
@@ -1319,6 +1604,9 @@ dt_pid_create_usdt_probes(dtrace_probedesc_t *pdp, dtrace_hdl_t *dtp, dt_pcb_t *
 	free(globpat);
 	globfree(&globbuf);
 
+	if (err == 0)
+		err = dt_pid_create_stapsdt_probes(pdp, dtp, pcb);
+
 	/* If no errors, report success. */
 	if (err == 0)
 		return 0;
diff --git a/libdtrace/dt_prov_uprobe.c b/libdtrace/dt_prov_uprobe.c
index 2cbd8910..b91cf810 100644
--- a/libdtrace/dt_prov_uprobe.c
+++ b/libdtrace/dt_prov_uprobe.c
@@ -313,12 +313,15 @@ static const dtrace_pattr_t	pattr = {
 
 dt_provimpl_t	dt_pid;
 dt_provimpl_t	dt_usdt;
+dt_provimpl_t	dt_stapsdt;
 
 static int populate(dtrace_hdl_t *dtp)
 {
 	if (dt_provider_create(dtp, dt_uprobe.name, &dt_uprobe, &pattr,
 			       NULL) == NULL ||
 	    dt_provider_create(dtp, dt_pid.name, &dt_pid, &pattr,
+			       NULL) == NULL ||
+	    dt_provider_create(dtp, dt_stapsdt.name, &dt_stapsdt, &pattr,
 			       NULL) == NULL)
 		return -1;			/* errno already set */
 
@@ -477,8 +480,8 @@ clean_usdt_probes(dtrace_hdl_t *dtp)
 
 		prp_next = dt_list_next(prp);
 
-		/* Make sure it is an overlying USDT probe. */
-		if (prp->prov->impl != &dt_usdt)
+		/* Make sure it is an overlying USDT, stapsdt probe. */
+		if (prp->prov->impl != &dt_usdt && prp->prov->impl != &dt_stapsdt)
 			continue;
 
 		/* FIXME passing in NULL pcb and dpr wreaks havoc on error reporting? */
@@ -637,6 +640,7 @@ static int add_probe_uprobe(dtrace_hdl_t *dtp, dt_probe_t *prp)
 	return 0;
 }
 
+/* shared between usdt, stapsdt probes */
 static int add_probe_usdt(dtrace_hdl_t *dtp, dt_probe_t *prp)
 {
 	char				probnam[DTRACE_FULLNAMELEN], *p;
@@ -890,6 +894,7 @@ static dt_probe_t *create_underlying(dtrace_hdl_t *dtp,
 	case DTPPT_OFFSETS:
 	case DTPPT_ABSOFFSETS:
 	case DTPPT_USDT:
+	case DTPPT_STAPSDT:
 		snprintf(prb, sizeof(prb), "%lx", psp->pps_off);
 		break;
 	default:
@@ -904,7 +909,7 @@ static dt_probe_t *create_underlying(dtrace_hdl_t *dtp,
 	pd.prb = prb;
 
 	dt_dprintf("Providing underlying probe %s:%s:%s:%s @ %lx\n", psp->pps_prv,
-		   psp->pps_mod, psp->pps_fn, psp->pps_prb, psp->pps_off);
+		   psp->pps_mod, psp->pps_fun, psp->pps_prb, psp->pps_off);
 	uprp = dt_probe_lookup(dtp, &pd);
 	if (uprp == NULL) {
 		dt_provider_t	*pvp;
@@ -1108,11 +1113,24 @@ static int provide_usdt_probe(dtrace_hdl_t *dtp, const pid_probespec_t *psp)
 	return provide_probe(dtp, psp, psp->pps_prb, &dt_usdt, PP_IS_FUNCALL);
 }
 
+static int provide_stapsdt_probe(dtrace_hdl_t *dtp, const pid_probespec_t *psp)
+{
+	if (psp->pps_type != DTPPT_STAPSDT &&
+	    psp->pps_type != DTPPT_IS_ENABLED) {
+		dt_dprintf("pid: unknown stapsdt probe type %i\n", psp->pps_type);
+		return -1;
+	}
+
+	return provide_probe(dtp, psp, psp->pps_prb, &dt_stapsdt, PP_IS_FUNCALL);
+}
+
+
 static void enable(dtrace_hdl_t *dtp, dt_probe_t *prp, int is_usdt)
 {
 	const list_probe_t	*pup;
 
-	assert(prp->prov->impl == &dt_pid || prp->prov->impl == &dt_usdt);
+	assert(prp->prov->impl == &dt_pid || prp->prov->impl == &dt_usdt ||
+	       prp->prov->impl == &dt_stapsdt);
 
 	/*
 	 * We need to enable the underlying probes (if not enabled yet).
@@ -1144,6 +1162,11 @@ static void enable_usdt(dtrace_hdl_t *dtp, dt_probe_t *prp)
 	enable(dtp, prp, 1);
 }
 
+static void enable_stapsdt(dtrace_hdl_t *dtp, dt_probe_t *prp)
+{
+	enable(dtp, prp, 1);
+}
+
 /*
  * Generate code that populates, counts the probe arguments.
  */
@@ -1875,3 +1898,15 @@ dt_provimpl_t	dt_usdt = {
 	.discover	= &discover,
 	.add_probe	= &add_probe_usdt,
 };
+
+/*
+ * Used for stapsdt probes.
+ */
+dt_provimpl_t	dt_stapsdt = {
+	.name		= "stapsdt",
+	.prog_type	= BPF_PROG_TYPE_UNSPEC,
+	.provide_probe	= &provide_stapsdt_probe,
+	.enable		= &enable_stapsdt,
+	.probe_destroy	= &probe_destroy,
+	.add_probe	= &add_probe_usdt,
+};
-- 
2.43.5




More information about the DTrace-devel mailing list