[DTrace-devel] [PATCH 11/12] Add a profile provider

eugene.loh at oracle.com eugene.loh at oracle.com
Sat Jul 11 17:38:35 PDT 2020


From: Eugene Loh <eugene.loh at oracle.com>

The probes use perf_events with PERF_TYPE_SOFTWARE and
PERF_COUNT_SW_CPU_CLOCK.

Signed-off-by: Eugene Loh <eugene.loh at oracle.com>
---
 BPF-DESIGN                  |   3 +
 libdtrace/Build             |   3 +-
 libdtrace/dt_open.c         |   1 +
 libdtrace/dt_prov_profile.c | 376 ++++++++++++++++++++++++++++++++++++
 libdtrace/dt_provider.h     |   1 +
 5 files changed, 383 insertions(+), 1 deletion(-)
 create mode 100644 libdtrace/dt_prov_profile.c

diff --git a/BPF-DESIGN b/BPF-DESIGN
index 919a6d9d..c8acda38 100644
--- a/BPF-DESIGN
+++ b/BPF-DESIGN
@@ -1,3 +1,6 @@
+
+WARNING: This file is no longer accurate; it is basically obsolete.
+
 BPF MAPS
 --------
     Global
diff --git a/libdtrace/Build b/libdtrace/Build
index 1fe41d53..e453a84e 100644
--- a/libdtrace/Build
+++ b/libdtrace/Build
@@ -21,7 +21,8 @@ libdtrace-build_SOURCES = dt_lex.c dt_aggregate.c dt_as.c dt_bpf.c \
 			  dt_proc.c dt_program.c dt_provider.c dt_regset.c \
 			  dt_string.c dt_strtab.c dt_subr.c dt_symtab.c \
 			  dt_work.c dt_xlator.c dt_peb.c dt_prov_dtrace.c \
-			  dt_prov_fbt.c dt_prov_sdt.c dt_prov_syscall.c
+			  dt_prov_fbt.c dt_prov_profile.c dt_prov_sdt.c \
+			  dt_prov_syscall.c
 
 libdtrace-build_SRCDEPS := dt_grammar.h $(objdir)/dt_git_version.h
 
diff --git a/libdtrace/dt_open.c b/libdtrace/dt_open.c
index 6a1efbfb..f6e82954 100644
--- a/libdtrace/dt_open.c
+++ b/libdtrace/dt_open.c
@@ -64,6 +64,7 @@ const dt_version_t _dtrace_versions[] = {
 static const dt_provimpl_t *dt_providers[] = {
 	&dt_dtrace,
 	&dt_fbt,
+	&dt_profile,
 	&dt_sdt,
 	&dt_syscall,
 };
diff --git a/libdtrace/dt_prov_profile.c b/libdtrace/dt_prov_profile.c
new file mode 100644
index 00000000..a5b5ddae
--- /dev/null
+++ b/libdtrace/dt_prov_profile.c
@@ -0,0 +1,376 @@
+/*
+ * Oracle Linux DTrace.
+ * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
+ * Licensed under the Universal Permissive License v 1.0 as shown at
+ * http://oss.oracle.com/licenses/upl.
+ *
+ * The profile provider for DTrace.
+ */
+#include <assert.h>
+#include <sys/ioctl.h>
+
+#include <bpf_asm.h>
+
+#include "dt_bpf.h"
+#include "dt_probe.h"
+
+#define FDS_CNT (datap->kind == KIND_TICK ? 1 : dtp->dt_conf.num_online_cpus)
+typedef struct profile_probe {
+	int	kind;		/* KIND_PROFILE or KIND_TICK */
+	int	*fds;		/* FDS_CNT perf event fds; -1 if not open */
+	__u64	period;		/* sampling period in nanoseconds */
+} profile_probe_t;
+
+static const char		prvname[] = "profile";
+static const char		modname[] = "";	/* empty module name */
+static const char		funname[] = "";	/* empty function name */
+
+#define KIND_PROFILE	0	/* probe name starts with PREFIX_PROFILE */
+#define KIND_TICK	1	/* probe name starts with PREFIX_TICK */
+#define PREFIX_PROFILE	"profile-"
+#define PREFIX_TICK	"tick-"
+
+static const dtrace_pattr_t	pattr = {
+{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
+{ DTRACE_STABILITY_UNSTABLE, DTRACE_STABILITY_UNSTABLE, DTRACE_CLASS_UNKNOWN },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
+{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
+{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
+};
+
+/* Set datap->period (nsec) from a probe name; return 0, or -1 if invalid. */
+static int set_period(const char *name, profile_probe_t *datap)
+{
+	int ndigits;
+	char *p;
+	unsigned long long val = 0;
+
+	if (strncmp(name, PREFIX_PROFILE, strlen(PREFIX_PROFILE)) != 0 &&
+	    strncmp(name, PREFIX_TICK, strlen(PREFIX_TICK)) != 0)
+		return -1;
+
+	p = strchr(name, '-') + 1;
+	ndigits = strspn(p, "0123456789");
+	/* require at least one digit so val is never read uninitialized */
+	if (ndigits == 0 || sscanf(p, "%llu", &val) != 1 || val == 0)
+		return -1;
+	p += ndigits;
+
+	if (p[0] == '\0' || strncasecmp(p, "hz", 3) == 0)
+		datap->period = 1000000000ull / val;	/* val is a rate */
+	else if (strncasecmp(p, "d", 2) == 0 || strncasecmp(p, "day", 4) == 0)
+		datap->period = val * 24 * 60 * 60 * 1000000000;
+	else if (strncasecmp(p, "h", 2) == 0 || strncasecmp(p, "hour", 5) == 0)
+		datap->period = val * 60 * 60 * 1000000000;
+	else if (strncasecmp(p, "m", 2) == 0 || strncasecmp(p, "min", 4) == 0)
+		datap->period = val * 60 * 1000000000;
+	else if (strncasecmp(p, "s", 2) == 0 || strncasecmp(p, "sec", 4) == 0)
+		datap->period = val * 1000000000;
+	else if (strncasecmp(p, "ms", 3) == 0 || strncasecmp(p, "msec", 5) == 0)
+		datap->period = val * 1000000;
+	else if (strncasecmp(p, "us", 3) == 0 || strncasecmp(p, "usec", 5) == 0)
+		datap->period = val * 1000;
+	else if (strncasecmp(p, "ns", 3) == 0 || strncasecmp(p, "nsec", 5) == 0)
+		datap->period = val;
+	else
+		return -1;
+
+	/* enforce the 200-usec limit */
+	if (datap->period < 200000)
+		return -1;
+
+	return 0;
+}
+
+/* Allocate per-probe data for name prb and insert the probe; NULL on error. */
+static dt_probe_t *profile_probe_insert(dtrace_hdl_t *dtp, dt_provider_t *prv,
+				        const char *prb)
+{
+	profile_probe_t *datap = dt_zalloc(dtp, sizeof (profile_probe_t));
+	int slot;
+
+	if (datap == NULL)
+		return NULL;
+
+	if (set_period(prb, datap) < 0)
+		goto err;
+
+	/* a valid name begins with PREFIX_PROFILE or else PREFIX_TICK */
+	datap->kind = (prb[0] == 'p') ? KIND_PROFILE : KIND_TICK;
+
+	/* FDS_CNT reads datap->kind, so it is usable only from here on */
+	datap->fds = dt_calloc(dtp, FDS_CNT, sizeof (int));
+	if (datap->fds == NULL)
+		goto err;
+
+	/* -1 marks a slot without an open file descriptor */
+	for (slot = 0; slot < FDS_CNT; slot++)
+		datap->fds[slot] = -1;
+
+	return dt_probe_insert(dtp, prv, prvname, modname, funname, prb, datap);
+
+err:
+	/* no fds array exists on either path that jumps here */
+	dt_free(dtp, datap);
+	return NULL;
+}
+
+/* Create the provider and its default profile-<n> and tick-<n> probes. */
+static int populate(dtrace_hdl_t *dtp)
+{
+	const int	profile_n[] = { 97, 199, 499, 997, 1999, 4001, 4999 };
+	const int	tick_n[] = { 1, 10, 100, 500, 1000, 5000 };
+	dt_provider_t	*prv;
+	char		name[32];
+	int		k, nprobes = 0;
+
+	prv = dt_provider_create(dtp, prvname, &dt_profile, &pattr);
+	if (prv == NULL)
+		return 0;
+
+	/* the return value counts only the probes that were inserted */
+	for (k = 0; k < sizeof(profile_n) / sizeof(profile_n[0]); k++) {
+		snprintf(name, sizeof(name), "%s%d", PREFIX_PROFILE, profile_n[k]);
+		nprobes += (profile_probe_insert(dtp, prv, name) != NULL);
+	}
+
+	for (k = 0; k < sizeof(tick_n) / sizeof(tick_n[0]); k++) {
+		snprintf(name, sizeof(name), "%s%d", PREFIX_TICK, tick_n[k]);
+		nprobes += (profile_probe_insert(dtp, prv, name) != NULL);
+	}
+
+	return nprobes;
+}
+
+/*
+ * Generate a BPF trampoline for a profile probe.
+ *
+ * The trampoline function is called when a profile probe triggers, and it must
+ * satisfy the following prototype:
+ *
+ *	int dt_profile(struct bpf_perf_event_data *ctx)
+ *
+ * The trampoline will populate a dt_bpf_context struct and then call the
+ * function that implements the compiled D clause.  It returns the value that
+ * it gets back from that function.
+ */
+static void trampoline(dt_pcb_t *pcb)
+{
+	int		i;
+	dt_irlist_t	*dlp = &pcb->pcb_ir;
+	struct bpf_insn	instr;
+	uint_t		lbl_exit = dt_irlist_label(dlp);	/* on final return */
+	dt_ident_t	*idp;
+	/* DCTX_FP(off): %fp-relative offset of byte 'off' within dctx */
+#define DCTX_FP(off)	(-(ushort_t)DCTX_SIZE + (ushort_t)(off))
+
+	/*
+	 * int dt_profile(struct bpf_perf_event_data *ctx)
+	 * {
+	 *     struct dt_bpf_context	dctx;
+	 *
+	 *     memset(&dctx, 0, sizeof(dctx));
+	 *
+	 *     dctx.epid = EPID;
+	 *     (dctx.pad and dctx.fault are cleared here as part of that memset)
+	 */
+	idp = dt_dlib_get_var(pcb->pcb_hdl, "EPID");
+	assert(idp != NULL);
+	instr = BPF_STORE_IMM(BPF_W, BPF_REG_FP, DCTX_FP(DCTX_EPID), -1);
+	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+	dlp->dl_last->di_extern = idp;	/* presumably relocates the -1 above */
+	instr = BPF_STORE_IMM(BPF_W, BPF_REG_FP, DCTX_FP(DCTX_PAD), 0);
+	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+	instr = BPF_STORE_IMM(BPF_DW, BPF_REG_FP, DCTX_FP(DCTX_FAULT), 0);
+	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+
+#if 0
+	/*
+	 *     dctx.regs = *regs;
+	 */
+	/*
+	 * FIXME:
+	 * %r1 points to a context given by /usr/include/linux/bpf_perf_event.h:
+	 *     struct bpf_perf_event_data {
+	 *         bpf_user_pt_regs_t regs;
+	 *         __u64 sample_period;
+	 *         __u64 addr;
+	 *     };
+	 * Since regs lives at the same base address as the context and
+	 * since bpf_user_pt_regs_t might be the same as dt_pt_regs, I think
+	 * we can just use the PT_REGS_ARGn definitions from libdtrace/dt_pt_regs.h.
+	 * But we should revisit all that and fix as necessary.
+	 */
+	for (i = 0; i < sizeof(dt_pt_regs); i += 8) {
+		instr = BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_1, i);
+		dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+		instr = BPF_STORE(BPF_DW, BPF_REG_FP, DCTX_FP(DCTX_REGS) + i,
+				  BPF_REG_0);
+		dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+	}
+#endif
+
+	/*
+	 * FIXME:
+	 * We need to set the args here.  They are:
+	 *     dctx.argv[0] =     pc;  kernel   PC
+	 *     dctx.argv[1] =    upc; userspace PC
+	 *     dctx.argv[2] =  nsecs; elapsed nsec (profile-n only, not tick-n)
+	 * Maybe look at what samples/bpf/[sampleip|trace_event]_kern.c
+	 * does with PT_REGS_IP(&ctx->regs).
+	 */
+
+	/*
+	 *     (dctx.argv[0] and on are cleared here as part of that memset)
+	 */
+	/*
+	 * FIXME: Once we set the args, I think we want to start this
+	 * loop at pcb->pcb_pinfo.dtp_argc;  see dt_prov_syscall.c.
+	 * Actually, dtp_argc might only be for the typed args, of which
+	 * the profile provider has none.
+	 */
+	for (i = 0; i < sizeof(((struct dt_bpf_context *)0)->argv) / 8; i++) {
+		instr = BPF_STORE_IMM(BPF_DW, BPF_REG_FP, DCTX_FP(DCTX_ARG(i)),
+				      0);
+		dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+	}
+
+	/*
+	 * We know the BPF context (ctx) is in %r1.  Since we will be passing
+	 * the DTrace context (dctx) as 2nd argument to dt_program, we need it
+	 * in %r2.
+	 */
+	instr = BPF_MOV_REG(BPF_REG_2, BPF_REG_FP);
+	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+	instr = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, DCTX_FP(0));
+	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+
+	/*
+	 *     rc = dt_program(ctx, dctx);
+	 */
+	idp = dt_dlib_get_func(pcb->pcb_hdl, "dt_program");
+	assert(idp != NULL);
+	instr = BPF_CALL_FUNC(idp->di_id);
+	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+	dlp->dl_last->di_extern = idp;
+
+	/*
+	 * exit:
+	 *     return rc;
+	 * }
+	 */
+	instr = BPF_RETURN();
+	dt_irlist_append(dlp, dt_cg_node_alloc(lbl_exit, instr));
+}
+
+static int probe_info(dtrace_hdl_t *dtp, const dt_probe_t *prp,
+		      int *argcp, dt_argdesc_t **argvp)
+{
+	/* no typed arguments: hand back an empty argument list */
+	*argvp = NULL;
+	*argcp = 0;
+
+	return 0;
+}
+
+static int provide(dtrace_hdl_t *dtp, const dtrace_probedesc_t *pdp)
+{
+	dt_provider_t *prov;
+	int is_tick, is_profile;
+
+	/* only an unassigned ID with our provider/module/function qualifies */
+	if (pdp->id != DTRACE_IDNONE || strcmp(pdp->prv, prvname) != 0)
+		return 0;
+	if (strcmp(pdp->mod, modname) != 0 || strcmp(pdp->fun, funname) != 0)
+		return 0;
+
+	is_tick = strncmp(pdp->prb, PREFIX_TICK, strlen(PREFIX_TICK)) == 0;
+	is_profile = strncmp(pdp->prb, PREFIX_PROFILE,
+			     strlen(PREFIX_PROFILE)) == 0;
+	if (!is_tick && !is_profile)
+		return 0;
+
+	/* nothing to do if the probe is already known */
+	if (dt_probe_lookup(dtp, pdp))
+		return 0;
+
+	/* insertion failures are deliberately not reported to the caller */
+	prov = dt_provider_lookup(dtp, prvname);
+	if (prov != NULL)
+		profile_probe_insert(dtp, prov, pdp->prb);
+
+	return 0;
+}
+
+/* Attach bpf_fd to the probe's perf event(s); 0 if any attached, else -1. */
+static int attach(dtrace_hdl_t *dtp, const dt_probe_t *prp, int bpf_fd)
+{
+	profile_probe_t *datap = prp->prv_data;
+	struct perf_event_attr attr;
+	int i, nattach = 0;
+
+	memset(&attr, 0, sizeof(attr));
+	attr.type = PERF_TYPE_SOFTWARE;
+	attr.config = PERF_COUNT_SW_CPU_CLOCK;
+	attr.sample_type = PERF_SAMPLE_RAW;
+	attr.size = sizeof(attr);	/* FIXME: why do other providers not? */
+	attr.wakeup_events = 1;
+	attr.freq = 0;		/* sample_period is a period, not a rate */
+	attr.sample_period = datap->period;
+
+	for (i = 0; i < FDS_CNT; i++) {
+		int j = i, fd;
+
+		/* if there is only one fd, place it at random */
+		if (FDS_CNT == 1)
+			j = rand() % dtp->dt_conf.num_online_cpus;
+		fd = perf_event_open(&attr, -1, dtp->dt_conf.cpus[j].cpu_id, -1, 0);
+		if (fd < 0)
+			continue;
+		if (ioctl(fd, PERF_EVENT_IOC_SET_BPF, bpf_fd) < 0) {
+			close(fd);
+			continue;
+		}
+		datap->fds[i] = fd;
+		nattach++;
+	}
+
+	return nattach > 0 ? 0 : -1;
+}
+
+/* Free the per-probe data: the fds array, then the structure itself. */
+static int probe_destroy(dtrace_hdl_t *dtp, void *arg)
+{
+	profile_probe_t *datap = arg;
+
+	dt_free(dtp, datap->fds);
+	dt_free(dtp, datap);
+
+	return 0;
+}
+
+static int probe_fini(dtrace_hdl_t *dtp, const dt_probe_t *prp)
+{
+	profile_probe_t *datap = prp->prv_data;
+	int slot;
+
+	/* close the perf event fds that are open; -1 marks an unused slot */
+	for (slot = 0; slot < FDS_CNT; slot++)
+		if (datap->fds[slot] != -1)
+			close(datap->fds[slot]);
+
+	probe_destroy(dtp, datap);
+	return 0;
+}
+
+dt_provimpl_t	dt_profile = {
+	.name		= prvname,
+	.prog_type	= BPF_PROG_TYPE_PERF_EVENT,
+	.populate	= &populate,		/* creates default probes */
+	.trampoline	= &trampoline,		/* emits the BPF entry code */
+	.probe_info	= &probe_info,		/* reports no typed args */
+	.provide	= &provide,		/* inserts probes on demand */
+	.attach		= &attach,		/* opens perf events */
+	.probe_destroy	= &probe_destroy,	/* frees per-probe data */
+	.probe_fini	= &probe_fini,		/* closes fds, frees data */
+};
diff --git a/libdtrace/dt_provider.h b/libdtrace/dt_provider.h
index 1fb616a0..d25f8e3f 100644
--- a/libdtrace/dt_provider.h
+++ b/libdtrace/dt_provider.h
@@ -80,6 +80,7 @@ typedef struct tp_probe {
 
 extern dt_provimpl_t dt_dtrace;
 extern dt_provimpl_t dt_fbt;
+extern dt_provimpl_t dt_profile;
 extern dt_provimpl_t dt_sdt;
 extern dt_provimpl_t dt_syscall;
 
-- 
2.18.2




More information about the DTrace-devel mailing list