[DTrace-devel] [PATCH 5/5 v2] Add a profile provider

eugene.loh at oracle.com eugene.loh at oracle.com
Mon Jun 15 11:18:05 PDT 2020


From: Eugene Loh <eugene.loh at oracle.com>

The probes use perf_events with PERF_TYPE_SOFTWARE and
PERF_COUNT_SW_CPU_CLOCK.

There are many FIXMEs.  Among them, the trampoline does not yet
populate the entire DTrace context; in particular, the DTrace
probe arguments are not set.  Also, profile-n is not yet implemented.

For other providers, the work of setting a probe's event_fd is
done in bpf_attach(), but the profile provider is sufficiently
different that it makes more sense to do so in the provider's
probe_info() function.  That is, we would like to set both the
probe's event_id and its event_fd in probe_info().  However,
probe_info() was declared to take a "const" probe.  That
declaration did not make much sense anyway, since the function's
only caller, dt_probe_args_info(), retrieves the information
specifically so that it can set fields like event_id.  So,
this commit removes the "const" from the probe in probe_info()'s
declaration.

The basic approach should work.  It borrows from the kernel's
samples/bpf/ examples (sampleip_user.c and trace_event_user.c),
but I am worried about this statement in the perf_event_open()
man page regarding attaching a BPF program to these events:
    PERF_EVENT_IOC_SET_BPF (since Linux 4.1)
    This allows attaching a Berkeley Packet Filter (BPF)
    program to an existing kprobe tracepoint event.
Are they serious about that kprobe tracepoint limitation?

We also need to choose a CPU.  I guess we need one event per
CPU for profile-n but only one (explicitly chosen?) CPU for
tick-n.

Signed-off-by: Eugene Loh <eugene.loh at oracle.com>
---
 BPF-DESIGN                  |   1 +
 libdtrace/Build             |   2 +-
 libdtrace/dt_open.c         |   1 +
 libdtrace/dt_prov_dtrace.c  |   2 +-
 libdtrace/dt_prov_fbt.c     |   2 +-
 libdtrace/dt_prov_profile.c | 355 ++++++++++++++++++++++++++++++++++++
 libdtrace/dt_prov_sdt.c     |   2 +-
 libdtrace/dt_prov_syscall.c |   2 +-
 libdtrace/dt_provider.h     |   3 +-
 9 files changed, 364 insertions(+), 6 deletions(-)
 create mode 100644 libdtrace/dt_prov_profile.c

diff --git a/BPF-DESIGN b/BPF-DESIGN
index 919a6d9d..1d1c3a2a 100644
--- a/BPF-DESIGN
+++ b/BPF-DESIGN
@@ -221,6 +221,7 @@ DTRACE BPF PROGRAM CONVENTIONS
 
     BPF trampoline (probe specific)
     -------------------------------
+	FIXME: add something for dt_dtrace and dt_profile.
 	Function Boundary Tracing (based on kprobe)
 	-------------------------------------------
 	The C equivalent implementation of the FBT trampoline program is:
diff --git a/libdtrace/Build b/libdtrace/Build
index 1fe41d53..cf0718ff 100644
--- a/libdtrace/Build
+++ b/libdtrace/Build
@@ -21,7 +21,7 @@ libdtrace-build_SOURCES = dt_lex.c dt_aggregate.c dt_as.c dt_bpf.c \
 			  dt_proc.c dt_program.c dt_provider.c dt_regset.c \
 			  dt_string.c dt_strtab.c dt_subr.c dt_symtab.c \
 			  dt_work.c dt_xlator.c dt_peb.c dt_prov_dtrace.c \
-			  dt_prov_fbt.c dt_prov_sdt.c dt_prov_syscall.c
+			  dt_prov_fbt.c dt_prov_profile.c dt_prov_sdt.c dt_prov_syscall.c
 
 libdtrace-build_SRCDEPS := dt_grammar.h $(objdir)/dt_git_version.h
 
diff --git a/libdtrace/dt_open.c b/libdtrace/dt_open.c
index 6a1efbfb..f6e82954 100644
--- a/libdtrace/dt_open.c
+++ b/libdtrace/dt_open.c
@@ -64,6 +64,7 @@ const dt_version_t _dtrace_versions[] = {
 static const dt_provimpl_t *dt_providers[] = {
 	&dt_dtrace,
 	&dt_fbt,
+	&dt_profile,
 	&dt_sdt,
 	&dt_syscall,
 };
diff --git a/libdtrace/dt_prov_dtrace.c b/libdtrace/dt_prov_dtrace.c
index fe9759de..085d8bc4 100644
--- a/libdtrace/dt_prov_dtrace.c
+++ b/libdtrace/dt_prov_dtrace.c
@@ -229,7 +229,7 @@ out:
 	return spec;
 }
 
-static int probe_info(dtrace_hdl_t *dtp, const dt_probe_t *prp,
+static int probe_info(dtrace_hdl_t *dtp, dt_probe_t *prp,
 		      int *idp, int *argcp, dt_argdesc_t **argvp)
 {
 	char	*spec;
diff --git a/libdtrace/dt_prov_fbt.c b/libdtrace/dt_prov_fbt.c
index 1aa3ca3c..bea9536f 100644
--- a/libdtrace/dt_prov_fbt.c
+++ b/libdtrace/dt_prov_fbt.c
@@ -272,7 +272,7 @@ static void trampoline(dt_pcb_t *pcb)
 	dt_irlist_append(dlp, dt_cg_node_alloc(lbl_exit, instr));
 }
 
-static int probe_info(dtrace_hdl_t *dtp, const dt_probe_t *prp,
+static int probe_info(dtrace_hdl_t *dtp, dt_probe_t *prp,
 		      int *idp, int *argcp, dt_argdesc_t **argvp)
 {
 	int	fd;
diff --git a/libdtrace/dt_prov_profile.c b/libdtrace/dt_prov_profile.c
new file mode 100644
index 00000000..15986971
--- /dev/null
+++ b/libdtrace/dt_prov_profile.c
@@ -0,0 +1,355 @@
+/*
+ * Oracle Linux DTrace.
+ * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
+ * Licensed under the Universal Permissive License v 1.0 as shown at
+ * http://oss.oracle.com/licenses/upl.
+ *
+ * The profile provider for DTrace.
+ */
+#include <assert.h>
+// #include <errno.h>
+// #include <fcntl.h>
+// #include <stdio.h>
+// #include <stdlib.h>
+// #include <string.h>
+// #include <unistd.h>
+// #include <linux/perf_event.h>
+// #include <linux/bpf.h>
+// #include <sys/stat.h>
+// #include <sys/types.h>
+
+#include <bpf_asm.h>
+
+// #include "dt_impl.h"
+#include "dt_bpf.h"
+// #include "dt_bpf_builtins.h"
+// #include "dt_provider.h"
+#include "dt_probe.h"
+// #include "dt_pt_regs.h"
+
+static const char		prvname[] = "profile";	/* provider name */
+static const char		modname[] = "";		/* probes have no module part */
+static const char		funname[] = "";		/* probes have no function part */
+
+#define KIND_PROFILE	0		/* probe name starts with PREFIX_PROFILE */
+#define KIND_TICK	1		/* probe name starts with PREFIX_TICK */
+#define PREFIX_PROFILE	"profile-"	/* probe-name prefix of profile-n probes */
+#define PREFIX_TICK	"tick-"		/* probe-name prefix of tick-n probes */
+
+static const dtrace_pattr_t	pattr = {	/* stability attributes given to dt_provider_create() */
+{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
+{ DTRACE_STABILITY_UNSTABLE, DTRACE_STABILITY_UNSTABLE, DTRACE_CLASS_UNKNOWN },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
+{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
+{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
+};	/* NOTE(review): rows presumably cover provider, module, function, name, args -- confirm */
+
+/* Create the "profile" provider and insert its default profile-n/tick-n probes. */
+static int populate(dtrace_hdl_t *dtp)
+{
+	static const int	profile_rates[] = { 97, 199, 499, 997, 1999, 4001, 4999 };
+	static const int	tick_rates[] = { 1, 10, 100, 500, 1000, 5000 };
+	const int		nprofile = sizeof(profile_rates) / sizeof(profile_rates[0]);
+	const int		ntick = sizeof(tick_rates) / sizeof(tick_rates[0]);
+	dt_provider_t		*prv;
+	char			name[32];
+	int			added = 0, idx;
+
+	prv = dt_provider_create(dtp, prvname, &dt_profile, &pattr);
+	if (prv == NULL)
+		return 0;
+
+	/* One pass over both tables: profile-n names first, then tick-n names. */
+	for (idx = 0; idx < nprofile + ntick; idx++) {
+		const char	*pfx = idx < nprofile ? PREFIX_PROFILE : PREFIX_TICK;
+		int		rate = idx < nprofile ? profile_rates[idx]
+						      : tick_rates[idx - nprofile];
+
+		snprintf(name, sizeof(name), "%s%d", pfx, rate);
+		if (dt_probe_insert(dtp, prv, prvname, modname, funname, name))
+			added++;
+	}
+	return added;	/* number of probes actually inserted */
+}
+
+/*
+ * Generate a BPF trampoline for a profile probe.
+ *
+ * The trampoline function is called when a profile probe triggers, and it must
+ * satisfy the following prototype:
+ *
+ *	int dt_profile(struct bpf_perf_event_data *ctx)
+ *
+ * The trampoline fills in a dt_bpf_context on the BPF stack (for now only the
+ * EPID; the registers are not copied and all arguments are zero), calls the
+ * compiled D clause (dt_program), and returns that function's return value.
+ */
+static void trampoline(dt_pcb_t *pcb)
+{
+	int		i;
+	dt_irlist_t	*dlp = &pcb->pcb_ir;	/* instruction list being built */
+	struct bpf_insn	instr;
+	uint_t		lbl_exit = dt_irlist_label(dlp);	/* label placed on the final return */
+	dt_ident_t	*idp;
+
+#define DCTX_FP(off)	(-(ushort_t)DCTX_SIZE + (ushort_t)(off))	/* %fp-relative offset of a dctx member */
+
+	/*
+	 * int dt_profile(struct bpf_perf_event_data *ctx)
+	 * {
+	 *     struct dt_bpf_context	dctx;
+	 *
+	 *     memset(&dctx, 0, sizeof(dctx));
+	 *
+	 *     dctx.epid = EPID;
+	 *     (dctx.pad and dctx.fault are zeroed explicitly, standing in for the memset above)
+	 */
+	idp = dt_dlib_get_var(pcb->pcb_hdl, "EPID");
+	assert(idp != NULL);
+	instr = BPF_STORE_IMM(BPF_W, BPF_REG_FP, DCTX_FP(DCTX_EPID), -1);
+	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+	dlp->dl_last->di_extern = idp;	/* NOTE(review): the -1 looks like a placeholder resolved via EPID -- confirm */
+	instr = BPF_STORE_IMM(BPF_W, BPF_REG_FP, DCTX_FP(DCTX_PAD), 0);
+	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+	instr = BPF_STORE_IMM(BPF_DW, BPF_REG_FP, DCTX_FP(DCTX_FAULT), 0);
+	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+
+#if 0
+	/*
+	 *     dctx.regs = *regs;
+	 */
+	/*
+	 * FIXME:
+	 * %r1 points to a context given by /usr/include/linux/bpf_perf_event.h:
+	 *     struct bpf_perf_event_data {
+	 *         bpf_user_pt_regs_t regs;
+	 *         __u64 sample_period;
+	 *         __u64 addr;
+	 *     };
+	 * Since regs lives at the same base address as the context and
+	 * since bpf_user_pt_regs_t might be the same as dt_pt_regs, I think
+	 * we can just use the PT_REGS_ARGn definitions from libdtrace/dt_pt_regs.h.
+	 * But we should revisit all that and fix as necessary.
+	 */
+	for (i = 0; i < sizeof(dt_pt_regs); i += 8) {
+		instr = BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_1, i);
+		dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+		instr = BPF_STORE(BPF_DW, BPF_REG_FP, DCTX_FP(DCTX_REGS) + i,
+				  BPF_REG_0);
+		dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+	}
+#endif
+
+	/*
+	 * FIXME:
+	 * We need to set the args here.  They are:
+	 *     dctx.argv[0] =     pc;  kernel   PC
+	 *     dctx.argv[1] =    upc; userspace PC
+	 *     dctx.argv[2] =  nsecs; elapsed nsec (profile-n only, not tick-n)
+	 * Maybe look at what samples/bpf/[sampleip|trace_event]_kern.c
+	 * does with PT_REGS_IP(&ctx->regs).
+	 */
+
+	/*
+	 *     (dctx.argv[0] onward are zeroed explicitly, standing in for the memset above)
+	 */
+	/*
+	 * FIXME: Once we set the args, I think we want to start this
+	 * loop at pcb->pcb_pinfo.dtp_argc;  see dt_prov_syscall.c.
+	 */
+	for (i = 0; i < sizeof(((struct dt_bpf_context *)0)->argv) / 8; i++) {
+		instr = BPF_STORE_IMM(BPF_DW, BPF_REG_FP, DCTX_FP(DCTX_ARG(i)),
+				      0);
+		dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+	}
+
+	/*
+	 * We know the BPF context (ctx) is in %r1.  Since we will be passing
+	 * the DTrace context (dctx) as 2nd argument to dt_program, we need it
+	 * in %r2.
+	 */
+	instr = BPF_MOV_REG(BPF_REG_2, BPF_REG_FP);
+	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+	instr = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, DCTX_FP(0));
+	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+
+	/*
+	 *     rc = dt_program(ctx, dctx);
+	 */
+	idp = dt_dlib_get_func(pcb->pcb_hdl, "dt_program");
+	assert(idp != NULL);
+	instr = BPF_CALL_FUNC(idp->di_id);
+	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+	dlp->dl_last->di_extern = idp;	/* ties the call instruction to the dt_program identifier */
+
+	/*
+	 * exit:
+	 *     return rc;
+	 * }
+	 */
+	instr = BPF_RETURN();
+	dt_irlist_append(dlp, dt_cg_node_alloc(lbl_exit, instr));
+}
+
+/* Parse the "<n>[unit]" tail of a "profile-"/"tick-" probe name into *ap (no
+ * unit or "hz": frequency; time unit: period in ns).  Returns KIND_PROFILE or
+ * KIND_TICK, or -1 for a bad prefix, a bad unit, or <n> missing/0/>19 digits.
+ */
+static int set_period_freq(const char *name, struct perf_event_attr *ap)
+{
+	int kind, ndigits;
+	const char *p;
+	unsigned long long val = 0;
+
+	if (strncmp(name, PREFIX_PROFILE, strlen(PREFIX_PROFILE)) == 0)
+		kind = KIND_PROFILE;
+	else if (strncmp(name, PREFIX_TICK, strlen(PREFIX_TICK)) == 0)
+		kind = KIND_TICK;
+	else
+		return -1;
+
+	p = strchr(name, '-') + 1;		/* both prefixes end in '-' */
+	ndigits = strspn(p, "0123456789");	/* <= 19 digits always fit in val */
+	if (ndigits < 1 || ndigits > 19 || sscanf(p, "%llu", &val) != 1 || !val)
+		return -1;
+	p += ndigits;
+	if (p[0] == '\0' || strncasecmp(p, "hz", 3) == 0) {
+		ap->sample_freq = val;
+		ap->freq = 1;
+	} else if (strncasecmp(p, "d", 2) == 0 || strncasecmp(p, "day", 4) == 0)
+		ap->sample_period = val * 24 * 60 * 60 * 1000000000;
+	else if (strncasecmp(p, "h", 2) == 0 || strncasecmp(p, "hour", 5) == 0)
+		ap->sample_period = val * 60 * 60 * 1000000000;
+	else if (strncasecmp(p, "m", 2) == 0 || strncasecmp(p, "min", 4) == 0)
+		ap->sample_period = val * 60 * 1000000000;
+	else if (strncasecmp(p, "s", 2) == 0 || strncasecmp(p, "sec", 4) == 0)
+		ap->sample_period = val * 1000000000;
+	else if (strncasecmp(p, "ms", 3) == 0 || strncasecmp(p, "msec", 5) == 0)
+		ap->sample_period = val * 1000000;
+	else if (strncasecmp(p, "us", 3) == 0 || strncasecmp(p, "usec", 5) == 0)
+		ap->sample_period = val * 1000;
+	else if (strncasecmp(p, "ns", 3) == 0 || strncasecmp(p, "nsec", 5) == 0)
+		ap->sample_period = val;
+	else
+		return -1;
+	return kind;
+}
+
+/*
+ * Open the CPU-clock perf event behind a profile-provider probe and record its
+ * id and fd in prp (and the id in *idp).  Nothing is recorded unless the event
+ * was opened.  Returns 0 on success, -1 on failure; reports no arguments yet.
+ */
+static int probe_info(dtrace_hdl_t *dtp, dt_probe_t *prp,
+		      int *idp, int *argcp, dt_argdesc_t **argvp)
+{
+	int fd, kind;
+	struct perf_event_attr attr;
+
+	if (strcmp(prp->desc->prv, prvname) ||
+	    strcmp(prp->desc->mod, modname) ||
+	    strcmp(prp->desc->fun, funname))
+		return -1;
+
+	memset(&attr, 0, sizeof(attr));
+	attr.type = PERF_TYPE_SOFTWARE;
+	attr.config = PERF_COUNT_SW_CPU_CLOCK;
+	attr.sample_type = PERF_SAMPLE_RAW;
+	attr.size = sizeof(attr);   // FIXME: why have we not been doing this in other providers?
+	attr.wakeup_events = 1;
+	attr.inherit = 1; // FIXME: needed for a pid == -1 event?  (causes bpf_perf_prog_read_time helper failure)
+
+	kind = set_period_freq(prp->desc->prb, &attr);
+	if (kind != KIND_TICK)	/* bad name, or profile-n.  FIXME: support profile-n */
+		return -1;
+
+	/*
+	 * FIXME: what should cpu be?  0 is what kprobes does; maybe one event
+	 * per CPU, and maybe it differs between profile-n and tick-n.  The
+	 * perf_event_open() man page says pid == -1 with cpu == -1 is not legal.
+	 */
+	fd = perf_event_open(&attr, -1, 0 /* cpu */, -1, 0);
+	if (fd < 0)
+		return -1;
+	*idp = prp->event_id = PERF_COUNT_SW_CPU_CLOCK;
+	prp->event_fd = fd;
+
+	/* FIXME: should *argcp be 3 for profile-n and 2 for tick-n?
+	 * also need to set argvp
+	 */
+	*argcp = 0;
+	*argvp = NULL;
+
+	return 0;
+}
+
+/*
+ * Insert a probe for a "profile:::tick-*" or "profile:::profile-*" description
+ * that has no id yet and is not already known.  Always returns 0.
+ */
+static int provide(dtrace_hdl_t *dtp, const dtrace_probedesc_t *pdp)
+{
+	dt_provider_t	*prv;
+	int		ours;
+
+	/* Must be an unassigned description naming our provider and a known prefix. */
+	ours = pdp->id == DTRACE_IDNONE && !strcmp(pdp->prv, prvname) &&
+	       !strcmp(pdp->mod, modname) && !strcmp(pdp->fun, funname);
+	if (!ours)
+		return 0;
+
+	ours = !strncmp(pdp->prb, PREFIX_TICK, strlen(PREFIX_TICK)) ||
+	       !strncmp(pdp->prb, PREFIX_PROFILE, strlen(PREFIX_PROFILE));
+	if (!ours || dt_probe_lookup(dtp, pdp))	/* bad name, or already present */
+		return 0;
+
+	prv = dt_provider_lookup(dtp, prvname);
+	if (prv != NULL)
+		dt_probe_insert(dtp, prv, prvname, modname, funname, pdp->prb);
+
+	return 0;
+}
+
+dt_provimpl_t	dt_profile = {		/* implementation of the "profile" provider */
+	.name		= prvname,
+	.prog_type	= BPF_PROG_TYPE_PERF_EVENT,	/* probes are backed by perf events */
+	.populate	= &populate,	/* inserts the default probes */
+	.trampoline	= &trampoline,	/* emits the BPF trampoline */
+	.probe_info	= &probe_info,	/* opens the event, sets event_id/event_fd */
+	.provide	= &provide,	/* inserts tick-/profile- probes on demand */
+};
+#if 0
+#include <linux/fs.h>
+#include <linux/ktime.h>
+#include <linux/miscdevice.h>
+#include <linux/slab.h>
+#include <asm/irq_regs.h>
+#include <asm/ptrace.h>
+
+#include <linux/hardirq.h>
+#include <linux/profile.h>
+
+static void profile_[tick|prof]_fn(uintptr_t arg)
+{
+	unsigned long	pc = 0, upc = 0;
+	struct pt_regs	*regs = get_irq_regs();
+
+	/*
+	 * If regs == NULL, then we were called from softirq context which
+	 * also means that we didn't actually interrupt any processing (kernel
+	 * or user space).
+	 * If regs != NULL, then we did actually get called from hardirq
+	 * because the timer interrupt did really interrupt something that was
+	 * going on on the CPU (could be user mode or kernel mode).
+	 */
+	if (regs == NULL) {
+		uint64_t	stack[8];
+
+		dtrace_getpcstack(stack, 8, 0, NULL);
+		pc = stack[7];
+	} else if (user_mode(regs))
+		upc = instruction_pointer(regs);
+	else
+		pc = instruction_pointer(regs);
+}
+#endif
diff --git a/libdtrace/dt_prov_sdt.c b/libdtrace/dt_prov_sdt.c
index 6d99653a..beba66c1 100644
--- a/libdtrace/dt_prov_sdt.c
+++ b/libdtrace/dt_prov_sdt.c
@@ -379,7 +379,7 @@ static void trampoline(dt_pcb_t *pcb)
 	dt_irlist_append(dlp, dt_cg_node_alloc(lbl_exit, instr));
 }
 
-static int probe_info(dtrace_hdl_t *dtp, const dt_probe_t *prp,
+static int probe_info(dtrace_hdl_t *dtp, dt_probe_t *prp,
 		      int *idp, int *argcp, dt_argdesc_t **argvp)
 {
 	FILE	*f;
diff --git a/libdtrace/dt_prov_syscall.c b/libdtrace/dt_prov_syscall.c
index 0b0ca954..9d8f22b4 100644
--- a/libdtrace/dt_prov_syscall.c
+++ b/libdtrace/dt_prov_syscall.c
@@ -234,7 +234,7 @@ static void trampoline(dt_pcb_t *pcb)
 	dt_irlist_append(dlp, dt_cg_node_alloc(lbl_exit, instr));
 }
 
-static int probe_info(dtrace_hdl_t *dtp, const dt_probe_t *prp,
+static int probe_info(dtrace_hdl_t *dtp, dt_probe_t *prp,
 		      int *idp, int *argcp, dt_argdesc_t **argvp)
 {
 	FILE	*f;
diff --git a/libdtrace/dt_provider.h b/libdtrace/dt_provider.h
index 50f663f1..51d282f7 100644
--- a/libdtrace/dt_provider.h
+++ b/libdtrace/dt_provider.h
@@ -60,7 +60,7 @@ typedef struct dt_provimpl {
 	int prog_type;				/* BPF program type */
 	int (*populate)(dtrace_hdl_t *dtp);	/* register probes */
 	int (*probe_info)(dtrace_hdl_t *dtp,	/* get probe info */
-			  const struct dt_probe *prp,
+			  struct dt_probe *prp,
 			  int *idp, int *argcp, dt_argdesc_t **argvp);
 	int (*provide)(dtrace_hdl_t *dtp,	/* provide probes */
 		       const dtrace_probedesc_t *pdp);
@@ -74,6 +74,7 @@ extern int tp_event_info(dtrace_hdl_t *dtp, FILE *f, int skip, int *idp,
 
 extern dt_provimpl_t dt_dtrace;
 extern dt_provimpl_t dt_fbt;
+extern dt_provimpl_t dt_profile;
 extern dt_provimpl_t dt_sdt;
 extern dt_provimpl_t dt_syscall;
 
-- 
2.18.2




More information about the DTrace-devel mailing list