[DTrace-devel] [PATCH v4 2/5] support stapsdt ELF-note-defined static probes
Kris Van Hees
kris.van.hees at oracle.com
Tue Jul 1 19:42:09 UTC 2025
I am looking a bit deeper into this patch. See my other email concerning the
args test that is not passing because it fails to get the function name. I
believe the problem is that the code here does not handle PIE-compiled code
(which is default for e.g. Debian, but not OL).
Also, I am trying to see whether we can integrate the parsing of the note
format in usdt_parser_notes.c so that we can centralize all ELF notes parsing
related to USDT in a single location.
Maybe it would be better, maybe not - I'm evaluating.
Another thing... you are performing addr-to-map lookups for every address
(i.e. for every note) even though you are processing notes for a single
mapping in the loop, so the map should be the same for all the addresses,
right? I don't think that the ELF notes for mapping A can refer to probes
(by address) that belong in mapping B - so that cam be optimized I think.
Do you think the semaphore can be implemented as well, since that is somewhat
similar to is-enabled probes I think?
On Mon, Jun 23, 2025 at 11:13:07AM +0100, Alan Maguire wrote:
> As well as using dtrace -G to generate USDT probes, programs and
> libraries may have added static probes via stapsdt ELF notes.
>
> Read ELF notes from binaries from /proc/ maps associated with processes
> and parse them to retrieve uprobe address and argument-related info
> to create the associated uprobe.
>
> Probe arguments can be either constants, register values or dereferences
> or dereferences from register values (plus offset), identical to the
> updated USDT ELF note handling.
>
> A new provider - stapsdt - implements this support, as stapsdt probes do
> not dynamically register themselves with DTrace. This makes them less
> powerful than DTrace-based USDT probes, but they do exist in programs and
> libraries so should be supported.
>
> As well as supporting ELF-note stapsdt defined probes in programs and
> libraries, this patch supports dynamically-created probes that
> are created via libstapsdt [1]. libstapsdt allows dynamic languages
> like python to declare and fire probes by dynamically creating
> a memfd-based shared library containing ELF notes for the probes.
> With these changes we can also trace these probes. This is very
> useful since libstapsdt has python, NodeJS, go and luaJIT bindings.
>
> [1] https://github.com/linux-usdt/libstapsdt
>
> Signed-off-by: Alan Maguire <alan.maguire at oracle.com>
> ---
> include/dtrace/pid.h | 1 +
> libdtrace/dt_pid.c | 288 +++++++++++++++++++++++++++++++++++++
> libdtrace/dt_prov_uprobe.c | 43 +++++-
> 3 files changed, 328 insertions(+), 4 deletions(-)
>
> diff --git a/include/dtrace/pid.h b/include/dtrace/pid.h
> index 8d4b6432..99093bc9 100644
> --- a/include/dtrace/pid.h
> +++ b/include/dtrace/pid.h
> @@ -24,6 +24,7 @@ typedef enum pid_probetype {
> DTPPT_OFFSETS,
> DTPPT_ABSOFFSETS,
> DTPPT_USDT,
> + DTPPT_STAPSDT,
> DTPPT_IS_ENABLED
> } pid_probetype_t;
>
> diff --git a/libdtrace/dt_pid.c b/libdtrace/dt_pid.c
> index d12b7919..6581b087 100644
> --- a/libdtrace/dt_pid.c
> +++ b/libdtrace/dt_pid.c
> @@ -38,6 +38,9 @@
> #include <dt_pid.h>
> #include <dt_string.h>
>
> +#define SEC_STAPSDT_NOTE ".note.stapsdt"
> +#define NAME_STAPSDT_NOTE "stapsdt"
> +
> /*
> * Information on a PID probe.
> */
> @@ -1262,6 +1265,288 @@ dt_pid_create_pid_probes(dtrace_probedesc_t *pdp, dtrace_hdl_t *dtp, dt_pcb_t *p
> return err;
> }
>
> +static int
> +dt_stapsdt_parse(dtrace_hdl_t *dtp, dt_proc_t *dpr, dtrace_probedesc_t *pdp,
> + dt_pcb_t *pcb, const dt_provider_t *pvp, char *path,
> + unsigned long base_addr)
> +{
> + Elf *elf = NULL;
> + Elf_Scn *scn = NULL;
> + GElf_Shdr shdr;
> + GElf_Nhdr nhdr;
> + size_t shstrndx, noff, doff, off, n;
> + Elf_Data *data;
> + GElf_Ehdr ehdr;
> + int i, err = 0;
> + int fd = -1;
> + char *mod;
> +
> + fd = open(path, O_RDONLY);
> + if (fd < 0) {
> + dt_pid_error(dtp, pcb, dpr, D_PROC_USDT,
> + "Cannot open %s: %s\n",
> + path, strerror(errno));
> + return -1;
> + }
> + mod = strrchr(path, '/');
> + if (mod)
> + mod++;
> + else
> + mod = path;
> +
> + elf = elf_begin(fd, ELF_C_READ_MMAP, NULL); // ELF_C_READ ?
> + assert(elf_kind(elf) == ELF_K_ELF);
> + elf_getshdrstrndx(elf, &shstrndx);
> +
> + if (gelf_getehdr(elf, &ehdr)) {
> + switch (ehdr.e_type) {
> + case ET_EXEC:
> + /* binary does not require base addr adjustment */
> + base_addr = 0;
> + break;
> + case ET_DYN:
> + break;
> + default:
> + dt_dprintf("unexpected ELF hdr type 0x%x for '%s'\n",
> + ehdr.e_type, path);
> + err = -1;
> + goto out;
> + }
> + }
> +
> + while ((scn = elf_nextscn(elf, scn)) != NULL) {
> + char *secname;
> +
> + assert(gelf_getshdr(scn, &shdr) != NULL);
> +
> + secname = elf_strptr(elf, shstrndx, shdr.sh_name);
> + if (strcmp(secname, SEC_STAPSDT_NOTE) == 0 &&
> + shdr.sh_type == SHT_NOTE)
> + break;
> + }
> + /* No ELF notes, just bail. */
> + if (scn == NULL)
> + goto out;
> + data = elf_getdata(scn, 0);
> + for (off = 0;
> + (off = gelf_getnote(data, off, &nhdr, &noff, &doff)) > 0;) {
> + pid_probespec_t psp = {0};
> + char *prv, *prb;
> + const char *fun;
> + char *dbuf = (char *)data->d_buf;
> + long *addrs = data->d_buf + doff; /* 3 addrs are loc/base/semaphore */
> + GElf_Sym sym;
> + const prmap_t *pmp;
> +
> + if (strncmp(dbuf + noff, NAME_STAPSDT_NOTE, nhdr.n_namesz) != 0)
> + continue;
> + prv = dbuf + doff + (3*sizeof(long));
> + /* ensure prv/prb is null-terminated */
> + if (strlen(prv) >= nhdr.n_descsz)
> + continue;
> + prb = prv + strlen(prv) + 1;
> + if (strlen(prb) >= nhdr.n_descsz)
> + continue;
> + if (strncmp(pdp->prv, prv, strlen(prv)) != 0)
> + continue;
> + /* skip unmatched, non-wildcarded probes */
> + if (strcmp(pdp->prb, "*") != 0 &&
> + (strlen(pdp->prb) > 0 && strcmp(pdp->prb, prb) != 0))
> + continue;
> + if (prb + strlen(prb) + 1 < dbuf + doff + nhdr.n_descsz)
> + psp.pps_sargv = prb + strlen(prb) + 1;
> +
> + psp.pps_type = DTPPT_STAPSDT;
> + psp.pps_prv = prv;
> + psp.pps_mod = mod;
> + psp.pps_prb = prb;
> + if (elf_getphdrnum(elf, &n))
> + continue;
> + for (i = 0; i < n; i++) {
> + GElf_Phdr phdr;
> +
> + if (!gelf_getphdr(elf, i, &phdr))
> + break;
> +
> + if (addrs[0] < phdr.p_vaddr ||
> + addrs[0] > phdr.p_vaddr + phdr.p_memsz)
> + continue;
> + if (base_addr)
> + psp.pps_off = addrs[0];
> + else
> + psp.pps_off = addrs[0] - phdr.p_vaddr + phdr.p_offset;
> + break;
> + }
> + if (!psp.pps_off)
> + continue;
> + psp.pps_nameoff = 0;
> +
> + pmp = Paddr_to_map(dpr->dpr_proc, base_addr + addrs[0]);
> + if (!pmp) {
> + dt_dprintf("%i: cannot determine 0x%lx's mapping\n",
> + Pgetpid(dpr->dpr_proc), psp.pps_off);
> + continue;
> + }
> + psp.pps_fn = Pmap_mapfile_name(dpr->dpr_proc, pmp);
> + if (psp.pps_fn == NULL) {
> + dt_pid_error(dtp, pcb, dpr, D_PROC_USDT,
> + "Cannot get name of mapping containing probe %s for pid %d\n",
> + psp.pps_prb, dpr->dpr_pid);
> + err = -1;
> + break;
> + }
> + if (dt_Plookup_by_addr(dtp, dpr->dpr_pid, base_addr + addrs[0],
> + &fun, &sym) == 0)
> + psp.pps_fun = (char *)fun;
> + else
> + psp.pps_fun = "";
> + psp.pps_dev = pmp->pr_dev;
> + psp.pps_inum = pmp->pr_inum;
> + psp.pps_pid = dpr->dpr_pid;
> + psp.pps_nameoff = 0;
> +
> + if (pvp->impl->provide_probe(dtp, &psp) < 0) {
> + dt_pid_error(dtp, pcb, dpr, D_PROC_USDT,
> + "failed to instantiate probe %s for pid %d: %s",
> + psp.pps_prb, psp.pps_pid,
> + dtrace_errmsg(dtp, dtrace_errno(dtp)));
> + err = -1;
> + }
> + free(psp.pps_fn);
> + if (err == -1)
> + break;
> + }
> +
> +out:
> + elf_end(elf);
> + close(fd);
> + return err;
> +}
> +
> +static void
> +dt_pid_create_stapsdt_probes_proc(dtrace_probedesc_t *pdp, dtrace_hdl_t *dtp,
> + dt_pcb_t *pcb, const dt_provider_t *pvp,
> + dt_proc_t *dpr, const char *proc_map)
> +{
> + char line[1024];
> + FILE *fp = NULL;
> + pid_t pid;
> +
> + assert(dpr != NULL);
> +
> + pid = dpr->dpr_pid;
> + fp = fopen(proc_map, "r");
> + if (!fp)
> + return;
> +
> + while (fgets(line, sizeof(line) - 1, fp) != NULL) {
> + long addr_start, addr_end, file_offset;
> + long dev_major, dev_minor;
> + unsigned long inode;
> + char name[PATH_MAX + 1];
> + char path[PATH_MAX + 1];
> + char perm[5];
> + int ret;
> +
> + ret = sscanf(line,
> + "%lx-%lx %4s %lx %lx:%lx %lu %[^\n]",
> + &addr_start, &addr_end, perm, &file_offset,
> + &dev_major, &dev_minor, &inode, name);
> + if (ret != 8 || !strchr(perm, 'x') || strchr(name, '[') != NULL)
> + continue;
> +
> + /* libstapsdt uses an memfd-based library to dynamically create
> + * stapsdt notes for dynamic languages like python; we need
> + * the associated /proc/<pid>/fds/ fd to read these notes.
> + */
> + if (strncmp(name, "/memfd:", strlen("/memfd:")) == 0) {
> + DIR *d;
> + struct dirent *dirent;
> + char *deleted;
> +
> + deleted = strstr(name, " (deleted)");
> + if (deleted)
> + *deleted = '\0';
> + snprintf(path, sizeof(path), "/proc/%d/fd", pid);
> + d = opendir(path);
> + if (d == NULL)
> + continue;
> + while ((dirent = readdir(d)) != NULL) {
> + struct stat s;
> +
> + snprintf(path, sizeof(path), "/proc/%d/fd/%s",
> + pid, dirent->d_name);
> + if (stat(path, &s) != 0 || s.st_ino != inode)
> + continue;
> + if (dt_stapsdt_parse(dtp, dpr, pdp, pcb, pvp,
> + path, addr_start) != 0)
> + break;
> + }
> + } else {
> + if (dt_stapsdt_parse(dtp, dpr, pdp, pcb, pvp, name,
> + addr_start) != 0)
> + break;
> + }
> + }
> + fclose(fp);
> +}
> +
> +static int
> +dt_pid_create_stapsdt_probes(dtrace_probedesc_t *pdp, dtrace_hdl_t *dtp, dt_pcb_t *pcb)
> +{
> + int i, nmatches = 0, err = 0;
> + const dt_provider_t *pvp;
> + char *globpat = NULL;
> + const char *pidstr;
> + glob_t globbuf;
> + bool wildcard;
> + pid_t pid;
> +
> + assert(pcb != NULL);
> +
> + pidstr = &pdp->prv[strlen(pdp->prv)];
> +
> + while (isdigit(*(pidstr - 1)) || *(pidstr - 1) == '*')
> + pidstr--;
> + if (strlen(pidstr) == 0)
> + return 0;
> + wildcard = strchr(pidstr, '*');
> + asprintf(&globpat, "/proc/%s/maps", pidstr);
> + nmatches = glob(globpat, 0, NULL, &globbuf) ? 0 : globbuf.gl_pathc;
> + pvp = dt_provider_lookup(dtp, "stapsdt");
> + assert(pvp != NULL);
> +
> + for (i = 0; i < nmatches; i++) {
> + dt_proc_t *dpr = NULL;
> +
> + pidstr = globbuf.gl_pathv[i] + strlen("/proc/");
> + pid = atoll(pidstr);
> + if (pid <= 0)
> + continue;
> + if (dt_proc_grab_lock(dtp, pid, DTRACE_PROC_WAITING |
> + DTRACE_PROC_SHORTLIVED) < 0) {
> + if (wildcard)
> + continue;
> + dt_pid_error(dtp, pcb, NULL, D_PROC_GRAB,
> + "failed to grab process %d",
> + (int)pid);
> + err = 1;
> + break;
> + }
> + dpr = dt_proc_lookup(dtp, pid);
> + if (dpr) {
> + dt_pid_create_stapsdt_probes_proc(pdp, dtp, pcb,
> + pvp, dpr,
> + globbuf.gl_pathv[i]);
> + dt_proc_release_unlock(dtp, pid);
> + }
> + }
> + free(globpat);
> + globfree(&globbuf);
> +
> + return err;
> +}
> +
> int
> dt_pid_create_usdt_probes(dtrace_probedesc_t *pdp, dtrace_hdl_t *dtp, dt_pcb_t *pcb)
> {
> @@ -1319,6 +1604,9 @@ dt_pid_create_usdt_probes(dtrace_probedesc_t *pdp, dtrace_hdl_t *dtp, dt_pcb_t *
> free(globpat);
> globfree(&globbuf);
>
> + if (err == 0)
> + err = dt_pid_create_stapsdt_probes(pdp, dtp, pcb);
> +
> /* If no errors, report success. */
> if (err == 0)
> return 0;
> diff --git a/libdtrace/dt_prov_uprobe.c b/libdtrace/dt_prov_uprobe.c
> index 2cbd8910..b91cf810 100644
> --- a/libdtrace/dt_prov_uprobe.c
> +++ b/libdtrace/dt_prov_uprobe.c
> @@ -313,12 +313,15 @@ static const dtrace_pattr_t pattr = {
>
> dt_provimpl_t dt_pid;
> dt_provimpl_t dt_usdt;
> +dt_provimpl_t dt_stapsdt;
>
> static int populate(dtrace_hdl_t *dtp)
> {
> if (dt_provider_create(dtp, dt_uprobe.name, &dt_uprobe, &pattr,
> NULL) == NULL ||
> dt_provider_create(dtp, dt_pid.name, &dt_pid, &pattr,
> + NULL) == NULL ||
> + dt_provider_create(dtp, dt_stapsdt.name, &dt_stapsdt, &pattr,
> NULL) == NULL)
> return -1; /* errno already set */
>
> @@ -477,8 +480,8 @@ clean_usdt_probes(dtrace_hdl_t *dtp)
>
> prp_next = dt_list_next(prp);
>
> - /* Make sure it is an overlying USDT probe. */
> - if (prp->prov->impl != &dt_usdt)
> + /* Make sure it is an overlying USDT, stapsdt probe. */
> + if (prp->prov->impl != &dt_usdt && prp->prov->impl != &dt_stapsdt)
> continue;
>
> /* FIXME passing in NULL pcb and dpr wreaks havoc on error reporting? */
> @@ -637,6 +640,7 @@ static int add_probe_uprobe(dtrace_hdl_t *dtp, dt_probe_t *prp)
> return 0;
> }
>
> +/* shared between usdt, stapsdt probes */
> static int add_probe_usdt(dtrace_hdl_t *dtp, dt_probe_t *prp)
> {
> char probnam[DTRACE_FULLNAMELEN], *p;
> @@ -890,6 +894,7 @@ static dt_probe_t *create_underlying(dtrace_hdl_t *dtp,
> case DTPPT_OFFSETS:
> case DTPPT_ABSOFFSETS:
> case DTPPT_USDT:
> + case DTPPT_STAPSDT:
> snprintf(prb, sizeof(prb), "%lx", psp->pps_off);
> break;
> default:
> @@ -904,7 +909,7 @@ static dt_probe_t *create_underlying(dtrace_hdl_t *dtp,
> pd.prb = prb;
>
> dt_dprintf("Providing underlying probe %s:%s:%s:%s @ %lx\n", psp->pps_prv,
> - psp->pps_mod, psp->pps_fn, psp->pps_prb, psp->pps_off);
> + psp->pps_mod, psp->pps_fun, psp->pps_prb, psp->pps_off);
> uprp = dt_probe_lookup(dtp, &pd);
> if (uprp == NULL) {
> dt_provider_t *pvp;
> @@ -1108,11 +1113,24 @@ static int provide_usdt_probe(dtrace_hdl_t *dtp, const pid_probespec_t *psp)
> return provide_probe(dtp, psp, psp->pps_prb, &dt_usdt, PP_IS_FUNCALL);
> }
>
> +static int provide_stapsdt_probe(dtrace_hdl_t *dtp, const pid_probespec_t *psp)
> +{
> + if (psp->pps_type != DTPPT_STAPSDT &&
> + psp->pps_type != DTPPT_IS_ENABLED) {
> + dt_dprintf("pid: unknown stapsdt probe type %i\n", psp->pps_type);
> + return -1;
> + }
> +
> + return provide_probe(dtp, psp, psp->pps_prb, &dt_stapsdt, PP_IS_FUNCALL);
> +}
> +
> +
> static void enable(dtrace_hdl_t *dtp, dt_probe_t *prp, int is_usdt)
> {
> const list_probe_t *pup;
>
> - assert(prp->prov->impl == &dt_pid || prp->prov->impl == &dt_usdt);
> + assert(prp->prov->impl == &dt_pid || prp->prov->impl == &dt_usdt ||
> + prp->prov->impl == &dt_stapsdt);
>
> /*
> * We need to enable the underlying probes (if not enabled yet).
> @@ -1144,6 +1162,11 @@ static void enable_usdt(dtrace_hdl_t *dtp, dt_probe_t *prp)
> enable(dtp, prp, 1);
> }
>
> +static void enable_stapsdt(dtrace_hdl_t *dtp, dt_probe_t *prp)
> +{
> + enable(dtp, prp, 1);
> +}
> +
> /*
> * Generate code that populates, counts the probe arguments.
> */
> @@ -1875,3 +1898,15 @@ dt_provimpl_t dt_usdt = {
> .discover = &discover,
> .add_probe = &add_probe_usdt,
> };
> +
> +/*
> + * Used for stapsdt probes.
> + */
> +dt_provimpl_t dt_stapsdt = {
> + .name = "stapsdt",
> + .prog_type = BPF_PROG_TYPE_UNSPEC,
> + .provide_probe = &provide_stapsdt_probe,
> + .enable = &enable_stapsdt,
> + .probe_destroy = &probe_destroy,
> + .add_probe = &add_probe_usdt,
> +};
> --
> 2.43.5
>
More information about the DTrace-devel
mailing list