[DTrace-devel] [PATCH 09/11] Add a profile provider

eugene.loh at oracle.com eugene.loh at oracle.com
Sun Jun 28 23:23:28 PDT 2020


From: Eugene Loh <eugene.loh at oracle.com>

The probes use perf_events with PERF_TYPE_SOFTWARE and
PERF_COUNT_SW_CPU_CLOCK.

Signed-off-by: Eugene Loh <eugene.loh at oracle.com>
---
 BPF-DESIGN                  |   1 +
 libdtrace/Build             |   2 +-
 libdtrace/dt_open.c         |   1 +
 libdtrace/dt_prov_profile.c | 422 ++++++++++++++++++++++++++++++++++++
 libdtrace/dt_provider.h     |   1 +
 5 files changed, 426 insertions(+), 1 deletion(-)
 create mode 100644 libdtrace/dt_prov_profile.c

diff --git a/BPF-DESIGN b/BPF-DESIGN
index 919a6d9d..1d1c3a2a 100644
--- a/BPF-DESIGN
+++ b/BPF-DESIGN
@@ -221,6 +221,7 @@ DTRACE BPF PROGRAM CONVENTIONS
 
     BPF trampoline (probe specific)
     -------------------------------
+	FIXME: add something for dt_dtrace and dt_profile.
 	Function Boundary Tracing (based on kprobe)
 	-------------------------------------------
 	The C equivalent implementation of the FBT trampoline program is:
diff --git a/libdtrace/Build b/libdtrace/Build
index 1fe41d53..cf0718ff 100644
--- a/libdtrace/Build
+++ b/libdtrace/Build
@@ -21,7 +21,7 @@ libdtrace-build_SOURCES = dt_lex.c dt_aggregate.c dt_as.c dt_bpf.c \
 			  dt_proc.c dt_program.c dt_provider.c dt_regset.c \
 			  dt_string.c dt_strtab.c dt_subr.c dt_symtab.c \
 			  dt_work.c dt_xlator.c dt_peb.c dt_prov_dtrace.c \
-			  dt_prov_fbt.c dt_prov_sdt.c dt_prov_syscall.c
+			  dt_prov_fbt.c dt_prov_profile.c dt_prov_sdt.c dt_prov_syscall.c
 
 libdtrace-build_SRCDEPS := dt_grammar.h $(objdir)/dt_git_version.h
 
diff --git a/libdtrace/dt_open.c b/libdtrace/dt_open.c
index 6a1efbfb..f6e82954 100644
--- a/libdtrace/dt_open.c
+++ b/libdtrace/dt_open.c
@@ -64,6 +64,7 @@ const dt_version_t _dtrace_versions[] = {
 static const dt_provimpl_t *dt_providers[] = {
 	&dt_dtrace,
 	&dt_fbt,
+	&dt_profile,
 	&dt_sdt,
 	&dt_syscall,
 };
diff --git a/libdtrace/dt_prov_profile.c b/libdtrace/dt_prov_profile.c
new file mode 100644
index 00000000..24a4a315
--- /dev/null
+++ b/libdtrace/dt_prov_profile.c
@@ -0,0 +1,422 @@
+/*
+ * Oracle Linux DTrace.
+ * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
+ * Licensed under the Universal Permissive License v 1.0 as shown at
+ * http://oss.oracle.com/licenses/upl.
+ *
+ * The profile provider for DTrace.
+ */
+/* FIXME: clean up unneeded include files */
+#include <assert.h>
+// #include <errno.h>
+// #include <fcntl.h>
+// #include <stdio.h>
+// #include <stdlib.h>
+// #include <string.h>
+// #include <unistd.h>
+#include <sys/ioctl.h>
+// #include <linux/bpf.h>
+// #include <linux/perf_event.h>     // included via dt_bpf.h
+// #include <sys/stat.h>
+// #include <sys/types.h>
+
+#include <bpf_asm.h>
+
+// #include "dt_impl.h"              // included via dt_bpf.h and dt_probe.h
+#include "dt_bpf.h"
+// #include "dt_bpf_builtins.h"
+// #include "dt_provider.h"          // included via dt_probe.h
+#include "dt_probe.h"
+// #include "dt_pt_regs.h"
+
+typedef struct prv_data {
+	int	ncpus;		/* number of CPUs on which a probe fires */
+	int	*fds;		/* fds for the CPUs on which a probe fires */
+} prv_data_t;
+
+static const char		prvname[] = "profile";
+static const char		modname[] = "";
+static const char		funname[] = "";
+
+#define PREFIX_PROFILE	"profile-"
+#define PREFIX_TICK	"tick-"
+
+static const dtrace_pattr_t	pattr = {
+{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
+{ DTRACE_STABILITY_UNSTABLE, DTRACE_STABILITY_UNSTABLE, DTRACE_CLASS_UNKNOWN },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
+{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
+{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
+};
+
+static dt_probe_t *profile_probe_insert(dtrace_hdl_t *dtp, const char *prb)
+{
+	prv_data_t *datap;
+	dt_probe_t *prp;
+	dt_provider_t *prv;
+	int i;
+
+	/* FIXME: shall we cache prv since this function is called multiple times? */
+	prv = dt_provider_lookup(dtp, prvname);
+	if (!prv)
+		return NULL;
+
+	datap = dt_zalloc(dtp, sizeof (prv_data_t));
+	if (datap == NULL)
+		return NULL;
+
+	/* the probe name starts with "profile-" or "tick-" */
+	if (prb[0] == 'p')
+		datap->ncpus = dtp->dt_conf.num_online_cpus;
+	else
+		datap->ncpus = 1;
+
+	datap->fds = dt_calloc(dtp, datap->ncpus, sizeof (int));
+	if (datap->fds == NULL)
+		goto err;
+
+	for (i = 0; i < datap->ncpus; i++)
+		datap->fds[i] = -1;
+
+	prp = dt_probe_insert(dtp, prv, datap, prvname, modname, funname, prb);
+	if (prp)
+		return prp;
+
+	dt_free(dtp, datap->fds);
+err:
+	dt_free(dtp, datap);
+	return NULL;
+}
+
+static int populate(dtrace_hdl_t *dtp)
+{
+	dt_provider_t	*prv;
+	char		buf[32];
+	int		i, n = 0;
+	int		profile_n[] = { 97, 199, 499, 997, 1999, 4001, 4999 };
+	int		tick_n[] = { 1, 10, 100, 500, 1000, 5000 };
+
+	prv = dt_provider_create(dtp, prvname, &dt_profile, &pattr);
+	if (prv == NULL)
+		return 0;
+
+	for (i = 0; i < sizeof(profile_n) / sizeof(*profile_n); i++) {
+		snprintf(buf, sizeof(buf), "%s%d", PREFIX_PROFILE, profile_n[i]);
+		if (profile_probe_insert(dtp, buf))
+			n++;
+	}
+
+	for (i = 0; i < sizeof(tick_n) / sizeof(*tick_n); i++) {
+		snprintf(buf, sizeof(buf), "%s%d", PREFIX_TICK, tick_n[i]);
+		if (profile_probe_insert(dtp, buf))
+			n++;
+	}
+
+	return n;
+}
+
+/*
+ * Generate a BPF trampoline for a profile probe.
+ *
+ * The trampoline function is called when a profile probe triggers, and it must
+ * satisfy the following prototype:
+ *
+ *	int dt_profile(struct bpf_perf_event_data *ctx)
+ *
+ * The trampoline will populate a dt_bpf_context struct and then call the
+ * function that implements the compiled D clause.  It returns the value that
+ * it gets back from that function.
+ */
+static void trampoline(dt_pcb_t *pcb)
+{
+	int		i;
+	dt_irlist_t	*dlp = &pcb->pcb_ir;
+	struct bpf_insn	instr;
+	uint_t		lbl_exit = dt_irlist_label(dlp);
+	dt_ident_t	*idp;
+
+#define DCTX_FP(off)	(-(ushort_t)DCTX_SIZE + (ushort_t)(off))
+
+	/*
+	 * int dt_profile(struct bpf_perf_event_data *ctx)
+	 * {
+	 *     struct dt_bpf_context	dctx;
+	 *
+	 *     memset(&dctx, 0, sizeof(dctx));
+	 *
+	 *     dctx.epid = EPID;
+	 *     (we clear dctx.pad and dctx.fault because of the memset above)
+	 */
+	idp = dt_dlib_get_var(pcb->pcb_hdl, "EPID");
+	assert(idp != NULL);
+	instr = BPF_STORE_IMM(BPF_W, BPF_REG_FP, DCTX_FP(DCTX_EPID), -1);
+	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+	dlp->dl_last->di_extern = idp;
+	instr = BPF_STORE_IMM(BPF_W, BPF_REG_FP, DCTX_FP(DCTX_PAD), 0);
+	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+	instr = BPF_STORE_IMM(BPF_DW, BPF_REG_FP, DCTX_FP(DCTX_FAULT), 0);
+	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+
+#if 0
+	/*
+	 *     dctx.regs = *regs;
+	 */
+	/*
+	 * FIXME:
+	 * %r1 points to a context given by /usr/include/linux/bpf_perf_event.h:
+	 *     struct bpf_perf_event_data {
+	 *         bpf_user_pt_regs_t regs;
+	 *         __u64 sample_period;
+	 *         __u64 addr;
+	 *     };
+	 * Since regs lives at the same base address as the context and
+	 * since bpf_user_pt_regs_t might be the same as dt_pt_regs, I think
+	 * we can just use the PT_REGS_ARGn definitions from libdtrace/dt_pt_regs.h.
+	 * But we should revisit all that and fix as necessary.
+	 */
+	for (i = 0; i < sizeof(dt_pt_regs); i += 8) {
+		instr = BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_1, i);
+		dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+		instr = BPF_STORE(BPF_DW, BPF_REG_FP, DCTX_FP(DCTX_REGS) + i,
+				  BPF_REG_0);
+		dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+	}
+#endif
+
+	/*
+	 * FIXME:
+	 * We need to set the args here.  They are:
+	 *     dctx.argv[0] =     pc;  kernel   PC
+	 *     dctx.argv[1] =    upc; userspace PC
+	 *     dctx.argv[2] =  nsecs; elapsed nsec (profile-n only, not tick-n)
+	 * Maybe look at what samples/bpf/[sampleip|trace_event]_kern.c
+	 * does with PT_REGS_IP(&ctx->regs).
+	 */
+
+	/*
+	 *     (we clear dctx.argv[0] and on because of the memset above)
+	 */
+	/*
+	 * FIXME: Once we set the args, I think we want to start this
+	 * loop at pcb->pcb_pinfo.dtp_argc;  see dt_prov_syscall.c.
+	 * Actually, dtp_argc might only be for the typed args, of which
+	 * the profile provider has none.
+	 */
+	for (i = 0; i < sizeof(((struct dt_bpf_context *)0)->argv) / 8; i++) {
+		instr = BPF_STORE_IMM(BPF_DW, BPF_REG_FP, DCTX_FP(DCTX_ARG(i)),
+				      0);
+		dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+	}
+
+	/*
+	 * We know the BPF context (ctx) is in %r1.  Since we will be passing
+	 * the DTrace context (dctx) as 2nd argument to dt_program, we need it
+	 * in %r2.
+	 */
+	instr = BPF_MOV_REG(BPF_REG_2, BPF_REG_FP);
+	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+	instr = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, DCTX_FP(0));
+	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+
+	/*
+	 *     rc = dt_program(ctx, dctx);
+	 */
+	idp = dt_dlib_get_func(pcb->pcb_hdl, "dt_program");
+	assert(idp != NULL);
+	instr = BPF_CALL_FUNC(idp->di_id);
+	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+	dlp->dl_last->di_extern = idp;
+
+	/*
+	 * exit:
+	 *     return rc;
+	 * }
+	 */
+	instr = BPF_RETURN();
+	dt_irlist_append(dlp, dt_cg_node_alloc(lbl_exit, instr));
+}
+
+/*
+ * Given a profile-provider probe name, set sample_period, or else freq
+ * and sample_freq, in the perf_event_attr.
+ */
+static int set_period_freq(const char *name, struct perf_event_attr *ap)
+{
+	int ndigits;
+	char *p;
+	unsigned long long val;
+
+	if (strncmp(name, PREFIX_PROFILE, strlen(PREFIX_PROFILE)) != 0 &&
+	    strncmp(name, PREFIX_TICK, strlen(PREFIX_TICK)) != 0)
+		return -1;
+
+	p = strchr(name, '-') + 1;
+	ndigits = strspn(p, "0123456789");
+
+	sscanf(p, "%llud", &val);
+	if (val == 0)
+		return -1;
+	p += ndigits;
+
+	ap->freq = 0;
+	if (p[0] == '\0' || strncasecmp(p, "hz", 3) == 0) {
+		ap->sample_freq = val;
+		ap->freq = 1;
+	} else if (strncasecmp(p, "d", 2) == 0 || strncasecmp(p, "day", 4) == 0)
+		ap->sample_period = val * 24 * 60 * 60 * 1000000000;
+	else if (strncasecmp(p, "h", 2) == 0 || strncasecmp(p, "hour", 5) == 0)
+		ap->sample_period = val * 60 * 60 * 1000000000;
+	else if (strncasecmp(p, "m", 2) == 0 || strncasecmp(p, "min", 4) == 0)
+		ap->sample_period = val * 60 * 1000000000;
+	else if (strncasecmp(p, "s", 2) == 0 || strncasecmp(p, "sec", 4) == 0)
+		ap->sample_period = val * 1000000000;
+	else if (strncasecmp(p, "ms", 3) == 0 || strncasecmp(p, "msec", 5) == 0)
+		ap->sample_period = val * 1000000;
+	else if (strncasecmp(p, "us", 3) == 0 || strncasecmp(p, "usec", 5) == 0)
+		ap->sample_period = val * 1000;
+	else if (strncasecmp(p, "ns", 3) == 0 || strncasecmp(p, "nsec", 5) == 0)
+		ap->sample_period = val;
+	else
+		return -1;
+
+	return 0;
+}
+
+static int probe_info(dtrace_hdl_t *dtp, const dt_probe_t *prp,
+		      int *argcp, dt_argdesc_t **argvp)
+{
+	/* FIXME: This should be unneeded? */
+	if (strcmp(prp->desc->prv, prvname) ||
+	    strcmp(prp->desc->mod, modname) ||
+	    strcmp(prp->desc->fun, funname))
+		return -1;
+
+	/* profile-provider probe arguments are not typed */
+	*argcp = 0;
+	*argvp = NULL;
+
+	return 0;
+}
+
+static int provide(dtrace_hdl_t *dtp, const dtrace_probedesc_t *pdp)
+{
+	struct perf_event_attr attr;
+
+	/* make sure we have IDNONE and a legal name */
+	if (pdp->id != DTRACE_IDNONE ||
+	    strcmp(pdp->prv, prvname) ||
+	    strcmp(pdp->mod, modname) ||
+	    strcmp(pdp->fun, funname))
+		return 0;
+	if (strncmp(pdp->prb, PREFIX_TICK, strlen(PREFIX_TICK)) &&
+	    strncmp(pdp->prb, PREFIX_PROFILE, strlen(PREFIX_PROFILE)))
+		return 0;
+
+	/* return if we already have this probe */
+	if (dt_probe_lookup(dtp, pdp))
+		return 0;
+
+	/* enforce the 200-usec limit */
+	if (set_period_freq(pdp->prb, &attr) < 0 ||
+	    (attr.freq == 1 && attr.sample_freq > 5000) ||
+	    (attr.freq == 0 && attr.sample_period < 200000))
+		return 0;
+
+	/* insert this probe */
+	profile_probe_insert(dtp, pdp->prb);
+
+	return 0;
+}
+
+static int attach(dtrace_hdl_t *dtp, const dt_probe_t *prp, int bpf_fd)
+{
+	prv_data_t *datap = prp->prv_data;
+	struct perf_event_attr attr;
+	int i, nattach = 0;;
+
+	memset(&attr, 0, sizeof(attr));
+	attr.type = PERF_TYPE_SOFTWARE;
+	attr.config = PERF_COUNT_SW_CPU_CLOCK;
+	attr.sample_type = PERF_SAMPLE_RAW;
+	attr.size = sizeof(struct perf_event_attr);   // FIXME: why have we not been doing this in other providers?
+	attr.wakeup_events = 1;
+
+	if (set_period_freq(prp->desc->prb, &attr) < 0)
+		return -1;
+
+	/* FIXME: Use a different CPU for tick-n? (datap->ncpus==1) */
+	for (i = 0; i < datap->ncpus; i++) {
+		int fd;
+		fd = perf_event_open(&attr, -1, dtp->dt_conf.cpus[i].cpu_id, -1, 0);
+		if (fd < 0)
+			continue;
+		if (ioctl(fd, PERF_EVENT_IOC_SET_BPF, bpf_fd) < 0) {
+			close(fd);
+			continue;
+		}
+		datap->fds[i] = fd;
+		nattach++;
+	}
+
+	return nattach > 0 ? 0 : -1;
+}
+
+static int probe_fini(dtrace_hdl_t *dtp, dt_probe_t *prp)
+{
+	prv_data_t *datap = prp->prv_data;
+	int i;
+
+	for (i = 0; i < datap->ncpus; i++)
+		if (datap->fds[i] != -1)
+			close(datap->fds[i]);
+	dt_free(dtp, datap->fds);
+	datap->ncpus = 0;
+
+	return 0;
+}
+
+dt_provimpl_t	dt_profile = {
+	.name		= prvname,
+	.prog_type	= BPF_PROG_TYPE_PERF_EVENT,
+	.populate	= &populate,
+	.trampoline	= &trampoline,
+	.probe_info	= &probe_info,
+	.provide	= &provide,
+	.attach		= &attach,
+	.probe_fini	= &probe_fini,
+};
+#if 0
+#include <linux/fs.h>
+#include <linux/ktime.h>
+#include <linux/miscdevice.h>
+#include <linux/slab.h>
+#include <asm/irq_regs.h>
+#include <asm/ptrace.h>
+
+#include <linux/hardirq.h>
+#include <linux/profile.h>
+
+static void profile_[tick|prof]_fn(uintptr_t arg)
+{
+	unsigned long	pc = 0, upc = 0;
+	struct pt_regs	*regs = get_irq_regs();
+
+	/*
+	 * If regs == NULL, then we were called from from softirq context which
+	 * also means that we didn't actually interrupt any processing (kernel
+	 * or user space).
+	 * If regs != NULL, then we did actually get called from hardirq
+	 * because the timer interrupt did really interrupt something that was
+	 * going on on the CPU (could be user mode or kernel mode).
+	 */
+	if (regs == NULL) {
+		uint64_t	stack[8];
+
+		dtrace_getpcstack(stack, 8, 0, NULL);
+		pc = stack[7];
+	} else if (user_mode(regs))
+		upc = instruction_pointer(regs);
+	else
+		pc = instruction_pointer(regs);
+}
+#endif
diff --git a/libdtrace/dt_provider.h b/libdtrace/dt_provider.h
index 9ea89898..99ba0c13 100644
--- a/libdtrace/dt_provider.h
+++ b/libdtrace/dt_provider.h
@@ -77,6 +77,7 @@ extern int tp_event_attach(dtrace_hdl_t *dtp, const struct dt_probe *prp, int bp
 
 extern dt_provimpl_t dt_dtrace;
 extern dt_provimpl_t dt_fbt;
+extern dt_provimpl_t dt_profile;
 extern dt_provimpl_t dt_sdt;
 extern dt_provimpl_t dt_syscall;
 
-- 
2.18.2




More information about the DTrace-devel mailing list