[DTrace-devel] [PATCH 11/13] Add a profile provider

Kris Van Hees kris.van.hees at oracle.com
Tue Jul 7 00:07:36 PDT 2020


On Wed, Jul 01, 2020 at 10:41:16PM -0400, eugene.loh at oracle.com wrote:
> From: Eugene Loh <eugene.loh at oracle.com>
> 
> The probes use perf_events with PERF_TYPE_SOFTWARE and
> PERF_COUNT_SW_CPU_CLOCK.
> 
> Signed-off-by: Eugene Loh <eugene.loh at oracle.com>
> ---
>  BPF-DESIGN                  |   1 +
>  libdtrace/Build             |   2 +-
>  libdtrace/dt_open.c         |   1 +
>  libdtrace/dt_prov_profile.c | 422 ++++++++++++++++++++++++++++++++++++
>  libdtrace/dt_provider.h     |   1 +
>  5 files changed, 426 insertions(+), 1 deletion(-)
>  create mode 100644 libdtrace/dt_prov_profile.c
> 
> diff --git a/BPF-DESIGN b/BPF-DESIGN
> index 919a6d9d..1d1c3a2a 100644
> --- a/BPF-DESIGN
> +++ b/BPF-DESIGN
> @@ -221,6 +221,7 @@ DTRACE BPF PROGRAM CONVENTIONS
>  
>      BPF trampoline (probe specific)
>      -------------------------------
> +	FIXME: add something for dt_dtrace and dt_profile.

No need for this - this document needs updating in various places anyway.
We'll tackle that later (or more likely, the file will go away in favour of
actual documentation).

>  	Function Boundary Tracing (based on kprobe)
>  	-------------------------------------------
>  	The C equivalent implementation of the FBT trampoline program is:
> diff --git a/libdtrace/Build b/libdtrace/Build
> index 1fe41d53..cf0718ff 100644
> --- a/libdtrace/Build
> +++ b/libdtrace/Build
> @@ -21,7 +21,7 @@ libdtrace-build_SOURCES = dt_lex.c dt_aggregate.c dt_as.c dt_bpf.c \
>  			  dt_proc.c dt_program.c dt_provider.c dt_regset.c \
>  			  dt_string.c dt_strtab.c dt_subr.c dt_symtab.c \
>  			  dt_work.c dt_xlator.c dt_peb.c dt_prov_dtrace.c \
> -			  dt_prov_fbt.c dt_prov_sdt.c dt_prov_syscall.c
> +			  dt_prov_fbt.c dt_prov_profile.c dt_prov_sdt.c dt_prov_syscall.c

Please keep the lines <= 80 characters, so in this case:

			  dt_prov_fbt.c dt_prov_profile.c dt_prov_sdt.c \
			  dt_prov_syscall.c

>  libdtrace-build_SRCDEPS := dt_grammar.h $(objdir)/dt_git_version.h
>  
> diff --git a/libdtrace/dt_open.c b/libdtrace/dt_open.c
> index 6a1efbfb..f6e82954 100644
> --- a/libdtrace/dt_open.c
> +++ b/libdtrace/dt_open.c
> @@ -64,6 +64,7 @@ const dt_version_t _dtrace_versions[] = {
>  static const dt_provimpl_t *dt_providers[] = {
>  	&dt_dtrace,
>  	&dt_fbt,
> +	&dt_profile,
>  	&dt_sdt,
>  	&dt_syscall,
>  };
> diff --git a/libdtrace/dt_prov_profile.c b/libdtrace/dt_prov_profile.c
> new file mode 100644
> index 00000000..24a4a315
> --- /dev/null
> +++ b/libdtrace/dt_prov_profile.c
> @@ -0,0 +1,422 @@
> +/*
> + * Oracle Linux DTrace.
> + * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
> + * Licensed under the Universal Permissive License v 1.0 as shown at
> + * http://oss.oracle.com/licenses/upl.
> + *
> + * The profile provider for DTrace.
> + */
> +/* FIXME: clean up unneeded include files */

Please do the cleanup before submitting the patch for review.  If certain
failes are not needed to compile this code, they shouldn't be listed here.

> +#include <assert.h>
> +// #include <errno.h>
> +// #include <fcntl.h>
> +// #include <stdio.h>
> +// #include <stdlib.h>
> +// #include <string.h>
> +// #include <unistd.h>
> +#include <sys/ioctl.h>
> +// #include <linux/bpf.h>
> +// #include <linux/perf_event.h>     // included via dt_bpf.h
> +// #include <sys/stat.h>
> +// #include <sys/types.h>
> +
> +#include <bpf_asm.h>
> +
> +// #include "dt_impl.h"              // included via dt_bpf.h and dt_probe.h
> +#include "dt_bpf.h"
> +// #include "dt_bpf_builtins.h"
> +// #include "dt_provider.h"          // included via dt_probe.h
> +#include "dt_probe.h"
> +// #include "dt_pt_regs.h"
> +
> +typedef struct prv_data {
> +	int	ncpus;		/* number of CPUs on which a probe fires */
> +	int	*fds;		/* fds for the CPUs on which a probe fires */
> +} prv_data_t;

Better name would be struct profile_probe / profile_probe_t.

I also think it would be more clear if the members are fds and fds_cnt or
something similar.  The fds is an array of file descriptors, and fds_cnt would
tell you how many there are.  The fact that this corresponds to the number of
CPUs that are online is (after the data has been initialized) not relevant
anymore.

Of course, since there are two types of profile provider probes, perhaps it
would suffice to just store a kind (profile or tick) and the fds array.  If it
is a kind == TICK probe, fds will contain just a singleton element, and if
it is kind == PROFILE, fds will contain as many elements as there are CPUs on
the system (online).  Since that number does not change during the execution
of DTrace for a specific session, there is no need to store it here.

> +static const char		prvname[] = "profile";
> +static const char		modname[] = "";
> +static const char		funname[] = "";
> +
> +#define PREFIX_PROFILE	"profile-"
> +#define PREFIX_TICK	"tick-"
> +
> +static const dtrace_pattr_t	pattr = {
> +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
> +{ DTRACE_STABILITY_UNSTABLE, DTRACE_STABILITY_UNSTABLE, DTRACE_CLASS_UNKNOWN },
> +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
> +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
> +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
> +};
> +
> +static dt_probe_t *profile_probe_insert(dtrace_hdl_t *dtp, const char *prb)
> +{
> +	prv_data_t *datap;
> +	dt_probe_t *prp;
> +	dt_provider_t *prv;
> +	int i;
> +
> +	/* FIXME: shall we cache prv since this function is called multiple times? */
> +	prv = dt_provider_lookup(dtp, prvname);
> +	if (!prv)
> +		return NULL;

Seems to me that 'prv' should be passed in by the caller.

> +	datap = dt_zalloc(dtp, sizeof (prv_data_t));
> +	if (datap == NULL)
> +		return NULL;
> +
> +	/* the probe name starts with "profile-" or "tick-" */
> +	if (prb[0] == 'p')
> +		datap->ncpus = dtp->dt_conf.num_online_cpus;
> +	else
> +		datap->ncpus = 1;
> +
> +	datap->fds = dt_calloc(dtp, datap->ncpus, sizeof (int));
> +	if (datap->fds == NULL)
> +		goto err;
> +
> +	for (i = 0; i < datap->ncpus; i++)
> +		datap->fds[i] = -1;
> +
> +	prp = dt_probe_insert(dtp, prv, datap, prvname, modname, funname, prb);
> +	if (prp)
> +		return prp;
> +
> +	dt_free(dtp, datap->fds);
> +err:
> +	dt_free(dtp, datap);
> +	return NULL;
> +}
> +
> +static int populate(dtrace_hdl_t *dtp)
> +{
> +	dt_provider_t	*prv;
> +	char		buf[32];
> +	int		i, n = 0;
> +	int		profile_n[] = { 97, 199, 499, 997, 1999, 4001, 4999 };
> +	int		tick_n[] = { 1, 10, 100, 500, 1000, 5000 };
> +
> +	prv = dt_provider_create(dtp, prvname, &dt_profile, &pattr);
> +	if (prv == NULL)
> +		return 0;
> +
> +	for (i = 0; i < sizeof(profile_n) / sizeof(*profile_n); i++) {
> +		snprintf(buf, sizeof(buf), "%s%d", PREFIX_PROFILE, profile_n[i]);
> +		if (profile_probe_insert(dtp, buf))
> +			n++;
> +	}
> +
> +	for (i = 0; i < sizeof(tick_n) / sizeof(*tick_n); i++) {
> +		snprintf(buf, sizeof(buf), "%s%d", PREFIX_TICK, tick_n[i]);
> +		if (profile_probe_insert(dtp, buf))
> +			n++;
> +	}
> +
> +	return n;
> +}
> +
> +/*
> + * Generate a BPF trampoline for a profile probe.
> + *
> + * The trampoline function is called when a profile probe triggers, and it must
> + * satisfy the following prototype:
> + *
> + *	int dt_profile(struct bpf_perf_event_data *ctx)
> + *
> + * The trampoline will populate a dt_bpf_context struct and then call the
> + * function that implements the compiled D clause.  It returns the value that
> + * it gets back from that function.
> + */
> +static void trampoline(dt_pcb_t *pcb)
> +{
> +	int		i;
> +	dt_irlist_t	*dlp = &pcb->pcb_ir;
> +	struct bpf_insn	instr;
> +	uint_t		lbl_exit = dt_irlist_label(dlp);
> +	dt_ident_t	*idp;
> +
> +#define DCTX_FP(off)	(-(ushort_t)DCTX_SIZE + (ushort_t)(off))
> +
> +	/*
> +	 * int dt_profile(struct bpf_perf_event_data *ctx)
> +	 * {
> +	 *     struct dt_bpf_context	dctx;
> +	 *
> +	 *     memset(&dctx, 0, sizeof(dctx));
> +	 *
> +	 *     dctx.epid = EPID;
> +	 *     (we clear dctx.pad and dctx.fault because of the memset above)
> +	 */
> +	idp = dt_dlib_get_var(pcb->pcb_hdl, "EPID");
> +	assert(idp != NULL);
> +	instr = BPF_STORE_IMM(BPF_W, BPF_REG_FP, DCTX_FP(DCTX_EPID), -1);
> +	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
> +	dlp->dl_last->di_extern = idp;
> +	instr = BPF_STORE_IMM(BPF_W, BPF_REG_FP, DCTX_FP(DCTX_PAD), 0);
> +	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
> +	instr = BPF_STORE_IMM(BPF_DW, BPF_REG_FP, DCTX_FP(DCTX_FAULT), 0);
> +	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
> +
> +#if 0
> +	/*
> +	 *     dctx.regs = *regs;
> +	 */
> +	/*
> +	 * FIXME:
> +	 * %r1 points to a context given by /usr/include/linux/bpf_perf_event.h:
> +	 *     struct bpf_perf_event_data {
> +	 *         bpf_user_pt_regs_t regs;
> +	 *         __u64 sample_period;
> +	 *         __u64 addr;
> +	 *     };
> +	 * Since regs lives at the same base address as the context and
> +	 * since bpf_user_pt_regs_t might be the same as dt_pt_regs, I think
> +	 * we can just use the PT_REGS_ARGn definitions from libdtrace/dt_pt_regs.h.
> +	 * But we should revisit all that and fix as necessary.
> +	 */
> +	for (i = 0; i < sizeof(dt_pt_regs); i += 8) {
> +		instr = BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_1, i);
> +		dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
> +		instr = BPF_STORE(BPF_DW, BPF_REG_FP, DCTX_FP(DCTX_REGS) + i,
> +				  BPF_REG_0);
> +		dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
> +	}
> +#endif
> +
> +	/*
> +	 * FIXME:
> +	 * We need to set the args here.  They are:
> +	 *     dctx.argv[0] =     pc;  kernel   PC
> +	 *     dctx.argv[1] =    upc; userspace PC
> +	 *     dctx.argv[2] =  nsecs; elapsed nsec (profile-n only, not tick-n)
> +	 * Maybe look at what samples/bpf/[sampleip|trace_event]_kern.c
> +	 * does with PT_REGS_IP(&ctx->regs).
> +	 */
> +
> +	/*
> +	 *     (we clear dctx.argv[0] and on because of the memset above)
> +	 */
> +	/*
> +	 * FIXME: Once we set the args, I think we want to start this
> +	 * loop at pcb->pcb_pinfo.dtp_argc;  see dt_prov_syscall.c.
> +	 * Actually, dtp_argc might only be for the typed args, of which
> +	 * the profile provider has none.
> +	 */
> +	for (i = 0; i < sizeof(((struct dt_bpf_context *)0)->argv) / 8; i++) {
> +		instr = BPF_STORE_IMM(BPF_DW, BPF_REG_FP, DCTX_FP(DCTX_ARG(i)),
> +				      0);
> +		dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
> +	}
> +
> +	/*
> +	 * We know the BPF context (ctx) is in %r1.  Since we will be passing
> +	 * the DTrace context (dctx) as 2nd argument to dt_program, we need it
> +	 * in %r2.
> +	 */
> +	instr = BPF_MOV_REG(BPF_REG_2, BPF_REG_FP);
> +	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
> +	instr = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, DCTX_FP(0));
> +	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
> +
> +	/*
> +	 *     rc = dt_program(ctx, dctx);
> +	 */
> +	idp = dt_dlib_get_func(pcb->pcb_hdl, "dt_program");
> +	assert(idp != NULL);
> +	instr = BPF_CALL_FUNC(idp->di_id);
> +	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
> +	dlp->dl_last->di_extern = idp;
> +
> +	/*
> +	 * exit:
> +	 *     return rc;
> +	 * }
> +	 */
> +	instr = BPF_RETURN();
> +	dt_irlist_append(dlp, dt_cg_node_alloc(lbl_exit, instr));
> +}
> +
> +/*
> + * Given a profile-provider probe name, set sample_period, or else freq
> + * and sample_freq, in the perf_event_attr.
> + */
> +static int set_period_freq(const char *name, struct perf_event_attr *ap)
> +{
> +	int ndigits;
> +	char *p;
> +	unsigned long long val;
> +
> +	if (strncmp(name, PREFIX_PROFILE, strlen(PREFIX_PROFILE)) != 0 &&
> +	    strncmp(name, PREFIX_TICK, strlen(PREFIX_TICK)) != 0)
> +		return -1;
> +
> +	p = strchr(name, '-') + 1;
> +	ndigits = strspn(p, "0123456789");
> +
> +	sscanf(p, "%llud", &val);
> +	if (val == 0)
> +		return -1;
> +	p += ndigits;
> +
> +	ap->freq = 0;
> +	if (p[0] == '\0' || strncasecmp(p, "hz", 3) == 0) {
> +		ap->sample_freq = val;
> +		ap->freq = 1;
> +	} else if (strncasecmp(p, "d", 2) == 0 || strncasecmp(p, "day", 4) == 0)
> +		ap->sample_period = val * 24 * 60 * 60 * 1000000000;
> +	else if (strncasecmp(p, "h", 2) == 0 || strncasecmp(p, "hour", 5) == 0)

Once you start using curly brackets, use it for all cases in the else if list.

Alternatively, you could use:

	if (p[0] == '\0' || strncasecmp(p, "hz", 3) == 0) {
		ap->freq = 1;
		ap->sample_freq = val;
	} else {
		ap->freq = 0;
		if (strncasecmp(p, "d", 2) == 0 ||
		    strncasecmp(p, "day", 4) == 0)
			ap->sample_period = val * 24 * 60 * 60 * 1000000000;
		else if (...)

	}

> +		ap->sample_period = val * 60 * 60 * 1000000000;
> +	else if (strncasecmp(p, "m", 2) == 0 || strncasecmp(p, "min", 4) == 0)
> +		ap->sample_period = val * 60 * 1000000000;
> +	else if (strncasecmp(p, "s", 2) == 0 || strncasecmp(p, "sec", 4) == 0)
> +		ap->sample_period = val * 1000000000;
> +	else if (strncasecmp(p, "ms", 3) == 0 || strncasecmp(p, "msec", 5) == 0)
> +		ap->sample_period = val * 1000000;
> +	else if (strncasecmp(p, "us", 3) == 0 || strncasecmp(p, "usec", 5) == 0)
> +		ap->sample_period = val * 1000;
> +	else if (strncasecmp(p, "ns", 3) == 0 || strncasecmp(p, "nsec", 5) == 0)
> +		ap->sample_period = val;
> +	else
> +		return -1;
> +
> +	return 0;
> +}
> +
> +static int probe_info(dtrace_hdl_t *dtp, const dt_probe_t *prp,
> +		      int *argcp, dt_argdesc_t **argvp)
> +{
> +	/* FIXME: This should be unneeded? */

Correct.  Not needed.

> +	if (strcmp(prp->desc->prv, prvname) ||
> +	    strcmp(prp->desc->mod, modname) ||
> +	    strcmp(prp->desc->fun, funname))
> +		return -1;
> +
> +	/* profile-provider probe arguments are not typed */
> +	*argcp = 0;
> +	*argvp = NULL;
> +
> +	return 0;
> +}
> +
> +static int provide(dtrace_hdl_t *dtp, const dtrace_probedesc_t *pdp)
> +{
> +	struct perf_event_attr attr;
> +
> +	/* make sure we have IDNONE and a legal name */
> +	if (pdp->id != DTRACE_IDNONE ||
> +	    strcmp(pdp->prv, prvname) ||
> +	    strcmp(pdp->mod, modname) ||
> +	    strcmp(pdp->fun, funname))
> +		return 0;
> +	if (strncmp(pdp->prb, PREFIX_TICK, strlen(PREFIX_TICK)) &&
> +	    strncmp(pdp->prb, PREFIX_PROFILE, strlen(PREFIX_PROFILE)))
> +		return 0;
> +
> +	/* return if we already have this probe */
> +	if (dt_probe_lookup(dtp, pdp))
> +		return 0;
> +
> +	/* enforce the 200-usec limit */
> +	if (set_period_freq(pdp->prb, &attr) < 0 ||
> +	    (attr.freq == 1 && attr.sample_freq > 5000) ||
> +	    (attr.freq == 0 && attr.sample_period < 200000))
> +		return 0;

Since you did all the work here already to determine the sample_freq and
sample_period, why not store that in profile_probe_t so that you don't have
to repeat the parsing when the time comes to attach the probe?  That is the
kind of data that would make sense to store in the provider-specific data
struct.

> +	/* insert this probe */
> +	profile_probe_insert(dtp, pdp->prb);
> +
> +	return 0;
> +}
> +
> +static int attach(dtrace_hdl_t *dtp, const dt_probe_t *prp, int bpf_fd)
> +{
> +	prv_data_t *datap = prp->prv_data;
> +	struct perf_event_attr attr;
> +	int i, nattach = 0;;
> +
> +	memset(&attr, 0, sizeof(attr));
> +	attr.type = PERF_TYPE_SOFTWARE;
> +	attr.config = PERF_COUNT_SW_CPU_CLOCK;
> +	attr.sample_type = PERF_SAMPLE_RAW;
> +	attr.size = sizeof(struct perf_event_attr);   // FIXME: why have we not been doing this in other providers?
> +	attr.wakeup_events = 1;
> +
> +	if (set_period_freq(prp->desc->prb, &attr) < 0)
> +		return -1;

See comment above about sample_freq and sample_period belonging in the
profile_probe_t struct.

> +	/* FIXME: Use a different CPU for tick-n? (datap->ncpus==1) */

It would probably be a good idea to attach tick probes to a random CPU, to
spread out the load across multiple CPUs if there are multiple tick probes.

> +	for (i = 0; i < datap->ncpus; i++) {
> +		int fd;
> +		fd = perf_event_open(&attr, -1, dtp->dt_conf.cpus[i].cpu_id, -1, 0);
> +		if (fd < 0)
> +			continue;
> +		if (ioctl(fd, PERF_EVENT_IOC_SET_BPF, bpf_fd) < 0) {
> +			close(fd);
> +			continue;
> +		}
> +		datap->fds[i] = fd;
> +		nattach++;
> +	}
> +
> +	return nattach > 0 ? 0 : -1;
> +}
> +
> +static int probe_fini(dtrace_hdl_t *dtp, dt_probe_t *prp)
> +{
> +	prv_data_t *datap = prp->prv_data;
> +	int i;
> +
> +	for (i = 0; i < datap->ncpus; i++)
> +		if (datap->fds[i] != -1)
> +			close(datap->fds[i]);
> +	dt_free(dtp, datap->fds);
> +	datap->ncpus = 0;
> +
> +	return 0;
> +}
> +
> +dt_provimpl_t	dt_profile = {
> +	.name		= prvname,
> +	.prog_type	= BPF_PROG_TYPE_PERF_EVENT,
> +	.populate	= &populate,
> +	.trampoline	= &trampoline,
> +	.probe_info	= &probe_info,
> +	.provide	= &provide,
> +	.attach		= &attach,
> +	.probe_fini	= &probe_fini,
> +};
> +#if 0
> +#include <linux/fs.h>
> +#include <linux/ktime.h>
> +#include <linux/miscdevice.h>
> +#include <linux/slab.h>
> +#include <asm/irq_regs.h>
> +#include <asm/ptrace.h>
> +
> +#include <linux/hardirq.h>
> +#include <linux/profile.h>
> +
> +static void profile_[tick|prof]_fn(uintptr_t arg)
> +{
> +	unsigned long	pc = 0, upc = 0;
> +	struct pt_regs	*regs = get_irq_regs();
> +
> +	/*
> +	 * If regs == NULL, then we were called from from softirq context which
> +	 * also means that we didn't actually interrupt any processing (kernel
> +	 * or user space).
> +	 * If regs != NULL, then we did actually get called from hardirq
> +	 * because the timer interrupt did really interrupt something that was
> +	 * going on on the CPU (could be user mode or kernel mode).
> +	 */
> +	if (regs == NULL) {
> +		uint64_t	stack[8];
> +
> +		dtrace_getpcstack(stack, 8, 0, NULL);
> +		pc = stack[7];
> +	} else if (user_mode(regs))
> +		upc = instruction_pointer(regs);
> +	else
> +		pc = instruction_pointer(regs);
> +}
> +#endif

Remove.

> diff --git a/libdtrace/dt_provider.h b/libdtrace/dt_provider.h
> index 9ea89898..99ba0c13 100644
> --- a/libdtrace/dt_provider.h
> +++ b/libdtrace/dt_provider.h
> @@ -77,6 +77,7 @@ extern int tp_event_attach(dtrace_hdl_t *dtp, const struct dt_probe *prp, int bp
>  
>  extern dt_provimpl_t dt_dtrace;
>  extern dt_provimpl_t dt_fbt;
> +extern dt_provimpl_t dt_profile;
>  extern dt_provimpl_t dt_sdt;
>  extern dt_provimpl_t dt_syscall;
>  
> -- 
> 2.18.2
> 
> 
> _______________________________________________
> DTrace-devel mailing list
> DTrace-devel at oss.oracle.com
> https://oss.oracle.com/mailman/listinfo/dtrace-devel



More information about the DTrace-devel mailing list