[DTrace-devel] [PATCH] rawfbt: new provider

Kris Van Hees kris.van.hees at oracle.com
Fri Dec 6 16:33:52 UTC 2024


On Fri, Dec 06, 2024 at 10:35:50AM +0000, Alan Maguire wrote:
> On 05/12/2024 19:42, Kris Van Hees wrote:
> > This provider provides access to all kprobe-based probes that are
> > available on the system.  This includes any compiler-generated
> > optimized variants of functions, named <func>.<suffix>.
> > 
> 
> This is great! Having a rawfbt allows users to still trace cases
> where fprobe can't currently handle.
> 
> On the .suffix; I get that this is intended to be a raw provider, but
> would it be better for stability to expose these functions as "func"
> rather than "func.suffix"? It becomes difficult to write portable
> scripts when suffixes are included, because wildcarding
> 
> rawfbt::func*:entry
> 
> ...to catch both suffixed and non-suffixed variants may end up tracing
> func, func.suffix but also func2 etc.

The trouble is that exposing <func>.<suffix> as <func> will mean that you
automatically end up trying to probe *any* <func>.<suffix> whenever you
place a probe on <func> using rawfbt, which is not necessarily what you may
want.  And there would not be any way to avoid that.

But by not collapsing them, you have the ability to catch them all anyway,
if you want, by specifying <func>, <func>.* instead of <func>.

> > Signed-off-by: Kris Van Hees <kris.van.hees at oracle.com>
> > ---
> >  libdtrace/Build            |   2 +
> >  libdtrace/dt_prov_rawfbt.c | 330 +++++++++++++++++++++++++++++++++++++
> >  libdtrace/dt_provider.c    |   1 +
> >  libdtrace/dt_provider.h    |   1 +
> >  test/utils/clean_probes.sh |   2 +-
> >  5 files changed, 335 insertions(+), 1 deletion(-)
> >  create mode 100644 libdtrace/dt_prov_rawfbt.c
> > 
> > diff --git a/libdtrace/Build b/libdtrace/Build
> > index 8d398221..72235159 100644
> > --- a/libdtrace/Build
> > +++ b/libdtrace/Build
> > @@ -55,6 +55,7 @@ libdtrace-build_SOURCES = dt_aggregate.c \
> >  			  dt_prov_lockstat.c \
> >  			  dt_prov_proc.c \
> >  			  dt_prov_profile.c \
> > +			  dt_prov_rawfbt.c \
> >  			  dt_prov_rawtp.c \
> >  			  dt_prov_sched.c \
> >  			  dt_prov_sdt.c \
> > @@ -112,6 +113,7 @@ dt_prov_ip.c_CFLAGS := -Wno-pedantic
> >  dt_prov_lockstat.c_CFLAGS := -Wno-pedantic
> >  dt_prov_proc.c_CFLAGS := -Wno-pedantic
> >  dt_prov_profile.c_CFLAGS := -Wno-pedantic
> > +dt_prov_rawfbt.c_CFLAGS := -Wno-pedantic
> >  dt_prov_rawtp.c_CFLAGS := -Wno-pedantic
> >  dt_prov_sched.c_CFLAGS := -Wno-pedantic
> >  dt_prov_sdt.c_CFLAGS := -Wno-pedantic
> > diff --git a/libdtrace/dt_prov_rawfbt.c b/libdtrace/dt_prov_rawfbt.c
> > new file mode 100644
> > index 00000000..edfd36b4
> > --- /dev/null
> > +++ b/libdtrace/dt_prov_rawfbt.c
> > @@ -0,0 +1,330 @@
> > +/*
> > + * Oracle Linux DTrace.
> > + * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
> > + * Licensed under the Universal Permissive License v 1.0 as shown at
> > + * http://oss.oracle.com/licenses/upl.
> > + *
> > + * The Raw Function Boundary Tracing provider for DTrace.
> > + *
> > + * The kernel provides kprobes to trace specific symbols.  They are listed in
> > + * the TRACEFS/available_filter_functions file.  Kprobes may be associated with
> > + * a symbol in the core kernel or with a symbol in a specific kernel module.
> > + * Whereas the fbt provider supports tracing regular symbols only, the rawfbt
> > + * provider also provides access to synthetic symbols, i.e. symbols created by
> > + * compiler optimizations.
> > + *
> > + * Mapping from event name to DTrace probe name:
> > + *
> > + *	<name>					rawfbt:vmlinux:<name>:entry
> > + *						rawfbt:vmlinux:<name>:return
> > + *   or
> > + *	<name> [<modname>]			rawfbt:<modname>:<name>:entry
> > + *						rawfbt:<modname>:<name>:return
> > + */
> > +#include <assert.h>
> > +#include <errno.h>
> > +#include <fcntl.h>
> > +#include <stdio.h>
> > +#include <stdlib.h>
> > +#include <string.h>
> > +#include <unistd.h>
> > +#include <linux/bpf.h>
> > +#include <linux/btf.h>
> > +#include <sys/stat.h>
> > +#include <sys/types.h>
> > +
> > +#include <bpf_asm.h>
> > +
> > +#include "dt_btf.h"
> > +#include "dt_dctx.h"
> > +#include "dt_cg.h"
> > +#include "dt_module.h"
> > +#include "dt_provider_tp.h"
> > +#include "dt_probe.h"
> > +#include "dt_pt_regs.h"
> > +
> > +static const char		prvname[] = "rawfbt";
> > +static const char		modname[] = "vmlinux";
> > +
> > +#define KPROBE_EVENTS		TRACEFS "kprobe_events"
> > +#define PROBE_LIST		TRACEFS "available_filter_functions"
> > +
> > +#define FBT_GROUP_FMT		GROUP_FMT "_%s"
> > +#define FBT_GROUP_DATA		GROUP_DATA, prp->desc->prb
> > +
> > +static const dtrace_pattr_t	pattr = {
> > +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
> > +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
> > +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
> > +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
> > +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
> > +};
> > +
> > +/*
> > + * Scan the PROBE_LIST file and add entry and return probes for every function
> > + * that is listed.
> > + */
> > +static int populate(dtrace_hdl_t *dtp)
> > +{
> > +	dt_provider_t		*prv;
> > +	FILE			*f;
> > +	char			*buf = NULL;
> > +	size_t			len  = 0;
> > +	size_t			n = 0;
> > +	dtrace_syminfo_t	sip;
> > +	dtrace_probedesc_t	pd;
> > +
> > +	prv = dt_provider_create(dtp, prvname, &dt_rawfbt, &pattr, NULL);
> > +	if (prv == NULL)
> > +		return -1;			/* errno already set */
> > +
> > +	f = fopen(PROBE_LIST, "r");
> > +	if (f == NULL)
> > +		return 0;
> > +
> > +	while (getline(&buf, &len, f) >= 0) {
> > +		char		*p, *q;
> > +		const char	*mod = modname;
> > +		dt_probe_t	*prp;
> > +
> > +		/*
> > +		 * Here buf is either "funcname\n" or "funcname [modname]\n".
> > +		 * The last line may not have a linefeed.
> > +		 */
> > +		p = strchr(buf, '\n');
> > +		if (p) {
> > +			*p = '\0';
> > +			if (p > buf && *(--p) == ']')
> > +				*p = '\0';
> > +		}
> > +
> > +		/*
> > +		 * Now buf is either "funcname" or "funcname [modname".  If
> > +		 * there is no module name provided, we will use the default.
> > +		 */
> > +		p = strchr(buf, ' ');
> > +		if (p) {
> > +			*p++ = '\0';
> > +			if (*p == '[')
> > +				p++;
> > +		}
> > +
> > +#define strstarts(var, x) (strncmp(var, x, strlen (x)) == 0)
> > +		/* Weed out __ftrace_invalid_address___* entries. */
> > +		if (strstarts(buf, "__ftrace_invalid_address__") ||
> > +		    strstarts(buf, "__probestub_") ||
> > +		    strstarts(buf, "__traceiter_"))
> > +			continue;
> > +#undef strstarts
> > +
> > +		/*
> > +		 * If we did not see a module name, perform a symbol lookup to
> > +		 * try to determine the module name.
> > +		 */
> > +		if (!p) {
> > +			/*
> > +			 * For synthetic symbol names (those containing '.'),
> > +			 * we need to use the base name (before the '.') for
> > +			 * module name lookup, because the synthetic forms are
> > +			 * not recorded in kallsyms information.
> > +			 *
> > +			 * We replace the first '.' with a 0 to terminate the
> > +			 * string, and after the lookup, we put it back.
> > +			 */
> > +			q = strchr(buf, '.');
> > +			if (q != NULL)
> > +				*q = '\0';
> > +
> > +			if (dtrace_lookup_by_name(dtp, DTRACE_OBJ_KMODS, buf,
> > +						  NULL, &sip) == 0)
> > +				mod = sip.object;
> > +
> > +			if (q != NULL)
> > +				*q = '.';
> > +		} else
> > +			mod = p;
> > +
> > +		/*
> > +		 * Due to the lack of module names in
> > +		 * TRACEFS/available_filter_functions, there are some duplicate
> > +		 * function names.  The kernel does not let us trace functions
> > +		 * that have duplicates, so we need to remove the existing one.
> > +		 */
> > +		pd.id = DTRACE_IDNONE;
> > +		pd.prv = prvname;
> > +		pd.mod = mod;
> > +		pd.fun = buf;
> > +		pd.prb = "entry";
> > +		prp = dt_probe_lookup(dtp, &pd);
> > +		if (prp != NULL) {
> > +			dt_probe_destroy(prp);
> > +			continue;
> > +		}
> > +
> > +		if (dt_tp_probe_insert(dtp, prv, prvname, mod, buf, "entry"))
> > +			n++;
> > +		if (dt_tp_probe_insert(dtp, prv, prvname, mod, buf, "return"))
> > +			n++;
> > +	}
> > +
> > +	free(buf);
> > +	fclose(f);
> > +
> > +	return n;
> > +}
> > +
> > +/*
> > + * Generate a BPF trampoline for a FBT probe.
> > + *
> > + * The trampoline function is called when a FBT probe triggers, and it must
> > + * satisfy the following prototype:
> > + *
> > + *	int dt_fbt(dt_pt_regs *regs)
> > + *
> > + * The trampoline will populate a dt_dctx_t struct and then call the function
> > + * that implements the compiled D clause.  It returns 0 to the caller.
> > + */
> > +static int trampoline(dt_pcb_t *pcb, uint_t exitlbl)
> > +{
> > +	dt_cg_tramp_prologue(pcb);
> > +
> > +	/*
> > +	 * After the dt_cg_tramp_prologue() call, we have:
> > +	 *				//     (%r7 = dctx->mst)
> > +	 *				//     (%r8 = dctx->ctx)
> > +	 */
> > +	dt_cg_tramp_copy_regs(pcb);
> > +	if (strcmp(pcb->pcb_probe->desc->prb, "return") == 0) {
> > +		dt_irlist_t	*dlp = &pcb->pcb_ir;
> > +
> > +		dt_cg_tramp_copy_rval_from_regs(pcb);
> > +
> > +		/*
> > +		 * fbt:::return arg0 should be the function offset for
> > +		 * return instruction.  Since we use kretprobes, however,
> > +		 * which do not fire until the function has returned to
> > +		 * its caller, information about the returning instruction
> > +		 * in the callee has been lost.
> > +		 *
> > +		 * Set arg0=-1 to indicate that we do not know the value.
> > +		 */
> > +		dt_cg_xsetx(dlp, NULL, DT_LBL_NONE, BPF_REG_0, -1);
> > +		emit(dlp,  BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(0), BPF_REG_0));
> > +	} else
> > +		dt_cg_tramp_copy_args_from_regs(pcb, 1);
> > +	dt_cg_tramp_epilogue(pcb);
> > +
> > +	return 0;
> > +}
> > +
> > +static int attach(dtrace_hdl_t *dtp, const dt_probe_t *prp, int bpf_fd)
> > +{
> > +	if (!dt_tp_probe_has_info(prp)) {
> > +		char	*fn, *prb, *p;
> > +		FILE	*f;
> > +		size_t	len;
> > +		int	fd, rc = -1;
> > +
> > +		/*
> > +		 * The tracepoint event we will be creating needs to have a
> > +		 * valid name.  We use a copy of the probe name, with . -> _
> > +		 * conversion.
> > +		 */
> > +		prb = strdup(prp->desc->fun);
> > +		for (p = prb; *p; p++) {
> > +			if (*p == '.')
> > +				*p = '_';
> > +		}
> > +
> > +		/*
> > +		 * Register the kprobe with the tracing subsystem.  This will
> > +		 * create a tracepoint event.
> > +		 */
> > +		fd = open(KPROBE_EVENTS, O_WRONLY | O_APPEND);
> > +		if (fd == -1)
> > +			return -ENOENT;
> > +
> > +		rc = dprintf(fd, "%c:" FBT_GROUP_FMT "/%s %s\n",
> > +			     prp->desc->prb[0] == 'e' ? 'p' : 'r',
> > +			     FBT_GROUP_DATA, prb, prp->desc->fun);
> > +		close(fd);
> > +		if (rc == -1)
> > +			return -ENOENT;
> > +
> > +		/* create format file name */
> > +		len = snprintf(NULL, 0, "%s" FBT_GROUP_FMT "/%s/format",
> > +			       EVENTSFS, FBT_GROUP_DATA, prb) + 1;
> > +		fn = dt_alloc(dtp, len);
> > +		if (fn == NULL)
> > +			return -ENOENT;
> > +
> > +		snprintf(fn, len, "%s" FBT_GROUP_FMT "/%s/format", EVENTSFS,
> > +			 FBT_GROUP_DATA, prb);
> > +
> > +		/* open format file */
> > +		f = fopen(fn, "r");
> > +		dt_free(dtp, fn);
> > +		if (f == NULL)
> > +			return -ENOENT;
> > +
> > +		/* read event id from format file */
> > +		rc = dt_tp_probe_info(dtp, f, 0, prp, NULL, NULL);
> > +		fclose(f);
> > +
> > +		if (rc < 0)
> > +			return -ENOENT;
> > +	}
> > +
> > +	/* attach BPF program to the probe */
> > +	return dt_tp_probe_attach(dtp, prp, bpf_fd);
> > +}
> > +
> > +static int probe_info(dtrace_hdl_t *dtp, const dt_probe_t *prp, int *argcp,
> > +		      dt_argdesc_t **argvp)
> > +{
> > +	*argcp = 0;			/* no arguments by default */
> > +	*argvp = NULL;
> > +
> > +	return 0;
> > +}
> > +
> > +/*
> > + * Try to clean up system resources that may have been allocated for this
> > + * probe.
> > + *
> > + * If there is an event FD, we close it.
> > + *
> > + * We also try to remove any kprobe that may have been created for the probe.
> > + * This is harmless for probes that didn't get created.  If the removal fails
> > + * for some reason we are out of luck - fortunately it is not harmful to the
> > + * system as a whole.
> > + */
> > +static void detach(dtrace_hdl_t *dtp, const dt_probe_t *prp)
> > +{
> > +	int	fd;
> > +
> > +	if (!dt_tp_probe_has_info(prp))
> > +		return;
> > +
> > +	dt_tp_probe_detach(dtp, prp);
> > +
> > +	fd = open(KPROBE_EVENTS, O_WRONLY | O_APPEND);
> > +	if (fd == -1)
> > +		return;
> > +
> > +	dprintf(fd, "-:" FBT_GROUP_FMT "/%s\n", FBT_GROUP_DATA,
> > +		prp->desc->fun);
> > +	close(fd);
> > +}
> > +
> > +dt_provimpl_t	dt_rawfbt = {
> > +	.name		= "rawfbt",
> > +	.prog_type	= BPF_PROG_TYPE_KPROBE,
> > +	.populate	= &populate,
> > +	.load_prog	= &dt_bpf_prog_load,
> > +	.trampoline	= &trampoline,
> > +	.attach		= &attach,
> > +	.probe_info	= &probe_info,
> > +	.detach		= &detach,
> > +	.probe_destroy	= &dt_tp_probe_destroy,
> > +};
> > diff --git a/libdtrace/dt_provider.c b/libdtrace/dt_provider.c
> > index 1e2e844e..0c621197 100644
> > --- a/libdtrace/dt_provider.c
> > +++ b/libdtrace/dt_provider.c
> > @@ -36,6 +36,7 @@ const dt_provimpl_t *dt_providers[] = {
> >  	&dt_lockstat,
> >  	&dt_proc,
> >  	&dt_profile,
> > +	&dt_rawfbt,
> >  	&dt_rawtp,
> >  	&dt_sched,
> >  	&dt_sdt,
> > diff --git a/libdtrace/dt_provider.h b/libdtrace/dt_provider.h
> > index f62137de..59a8d62e 100644
> > --- a/libdtrace/dt_provider.h
> > +++ b/libdtrace/dt_provider.h
> > @@ -82,6 +82,7 @@ extern dt_provimpl_t dt_ip;
> >  extern dt_provimpl_t dt_lockstat;
> >  extern dt_provimpl_t dt_proc;
> >  extern dt_provimpl_t dt_profile;
> > +extern dt_provimpl_t dt_rawfbt;
> >  extern dt_provimpl_t dt_rawtp;
> >  extern dt_provimpl_t dt_sched;
> >  extern dt_provimpl_t dt_sdt;
> > diff --git a/test/utils/clean_probes.sh b/test/utils/clean_probes.sh
> > index 8292b309..16052cbc 100755
> > --- a/test/utils/clean_probes.sh
> > +++ b/test/utils/clean_probes.sh
> > @@ -70,7 +70,7 @@ fi
> >  	     { sub(/:/, "/"); }
> >  
> >  	     {
> > -		 if (/_fbt_/)
> > +		 if (/_(raw)?fbt_/)
> >  		     kpv[kpc++] = $0;
> >  		 else
> >  		     upv[upc++] = $0;



More information about the DTrace-devel mailing list