[DTrace-devel] [PATCH 11/12] Add a profile provider
eugene.loh at oracle.com
eugene.loh at oracle.com
Sat Jul 11 17:38:35 PDT 2020
From: Eugene Loh <eugene.loh at oracle.com>
The probes use perf_events with PERF_TYPE_SOFTWARE and
PERF_COUNT_SW_CPU_CLOCK.
Signed-off-by: Eugene Loh <eugene.loh at oracle.com>
---
BPF-DESIGN | 3 +
libdtrace/Build | 3 +-
libdtrace/dt_open.c | 1 +
libdtrace/dt_prov_profile.c | 376 ++++++++++++++++++++++++++++++++++++
libdtrace/dt_provider.h | 1 +
5 files changed, 383 insertions(+), 1 deletion(-)
create mode 100644 libdtrace/dt_prov_profile.c
diff --git a/BPF-DESIGN b/BPF-DESIGN
index 919a6d9d..c8acda38 100644
--- a/BPF-DESIGN
+++ b/BPF-DESIGN
@@ -1,3 +1,6 @@
+
+WARNING: This file is no longer accurate; it is basically obsolete.
+
BPF MAPS
--------
Global
diff --git a/libdtrace/Build b/libdtrace/Build
index 1fe41d53..e453a84e 100644
--- a/libdtrace/Build
+++ b/libdtrace/Build
@@ -21,7 +21,8 @@ libdtrace-build_SOURCES = dt_lex.c dt_aggregate.c dt_as.c dt_bpf.c \
dt_proc.c dt_program.c dt_provider.c dt_regset.c \
dt_string.c dt_strtab.c dt_subr.c dt_symtab.c \
dt_work.c dt_xlator.c dt_peb.c dt_prov_dtrace.c \
- dt_prov_fbt.c dt_prov_sdt.c dt_prov_syscall.c
+ dt_prov_fbt.c dt_prov_profile.c dt_prov_sdt.c \
+ dt_prov_syscall.c
libdtrace-build_SRCDEPS := dt_grammar.h $(objdir)/dt_git_version.h
diff --git a/libdtrace/dt_open.c b/libdtrace/dt_open.c
index 6a1efbfb..f6e82954 100644
--- a/libdtrace/dt_open.c
+++ b/libdtrace/dt_open.c
@@ -64,6 +64,7 @@ const dt_version_t _dtrace_versions[] = {
static const dt_provimpl_t *dt_providers[] = {
&dt_dtrace,
&dt_fbt,
+ &dt_profile,
&dt_sdt,
&dt_syscall,
};
diff --git a/libdtrace/dt_prov_profile.c b/libdtrace/dt_prov_profile.c
new file mode 100644
index 00000000..a5b5ddae
--- /dev/null
+++ b/libdtrace/dt_prov_profile.c
@@ -0,0 +1,376 @@
+/*
+ * Oracle Linux DTrace.
+ * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
+ * Licensed under the Universal Permissive License v 1.0 as shown at
+ * http://oss.oracle.com/licenses/upl.
+ *
+ * The profile provider for DTrace.
+ */
+#include <assert.h>
+#include <sys/ioctl.h>
+
+#include <bpf_asm.h>
+
+#include "dt_bpf.h"
+#include "dt_probe.h"
+
+/*
+ * Number of perf event file descriptors a probe uses: one for a tick-n probe,
+ * one per online CPU otherwise.  The expansion refers to variables named
+ * 'datap' and 'dtp', so both must be in scope wherever this macro is used.
+ */
+#define FDS_CNT (datap->kind == KIND_TICK ? 1 : dtp->dt_conf.num_online_cpus)
+/* Provider-private data attached to each profile-n / tick-n probe. */
+typedef struct profile_probe {
+ int kind; /* KIND_PROFILE or KIND_TICK */
+ int *fds; /* FDS_CNT perf event fds; -1 marks an unused slot */
+ __u64 period; /* sampling period in nanoseconds */
+} profile_probe_t;
+
+/* Probe description components: provider name, empty module and function. */
+static const char prvname[] = "profile";
+static const char modname[] = "";
+static const char funname[] = "";
+
+/* Probe kinds and the probe-name prefixes that select them. */
+#define KIND_PROFILE 0
+#define KIND_TICK 1
+#define PREFIX_PROFILE "profile-"
+#define PREFIX_TICK "tick-"
+
+/* Stability attributes handed to dt_provider_create() for this provider. */
+static const dtrace_pattr_t pattr = {
+{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
+{ DTRACE_STABILITY_UNSTABLE, DTRACE_STABILITY_UNSTABLE, DTRACE_CLASS_UNKNOWN },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
+{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
+{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
+};
+
+/*
+ * Derive the sampling period of a profile-n or tick-n probe from its name and
+ * store it, in nanoseconds, in datap->period.
+ *
+ * The name must start with "profile-" or "tick-", followed by a positive
+ * decimal number and an optional, case-insensitive unit suffix.  A bare number
+ * or the suffix "hz" is a frequency in Hz; the suffixes d/day, h/hour, m/min,
+ * s/sec, ms/msec, us/usec and ns/nsec make the number an interval.
+ *
+ * Returns 0 on success.  Returns -1 if the prefix or suffix is not recognized,
+ * the number is missing, zero or too large, the interval does not fit in 64
+ * bits, or the resulting period is shorter than 200 usec.
+ */
+static int set_period(const char *name, profile_probe_t *datap)
+{
+	size_t			ndigits;
+	const char		*p;
+	unsigned long long	val = 0, mult;
+
+	if (strncmp(name, PREFIX_PROFILE, strlen(PREFIX_PROFILE)) != 0 &&
+	    strncmp(name, PREFIX_TICK, strlen(PREFIX_TICK)) != 0)
+		return -1;
+
+	/* both prefixes end in '-', so the number starts right after it */
+	p = strchr(name, '-') + 1;
+	ndigits = strspn(p, "0123456789");
+
+	/*
+	 * Insist on at least one digit so that val is never read without
+	 * having been set, and on at most 19 digits so that the value is
+	 * guaranteed to be representable as an unsigned long long.
+	 */
+	if (ndigits == 0 || ndigits > 19)
+		return -1;
+	if (sscanf(p, "%llu", &val) != 1 || val == 0)
+		return -1;
+	p += ndigits;
+
+	/*
+	 * The length passed to strncasecmp() includes the terminating NUL, so
+	 * each comparison below only matches the complete suffix.
+	 */
+	if (p[0] == '\0' || strncasecmp(p, "hz", 3) == 0) {
+		/* a frequency: convert to the time between firings */
+		datap->period = 1000000000ull / val;
+	} else {
+		if (strncasecmp(p, "d", 2) == 0 ||
+		    strncasecmp(p, "day", 4) == 0)
+			mult = 24ull * 60 * 60 * 1000000000;
+		else if (strncasecmp(p, "h", 2) == 0 ||
+			 strncasecmp(p, "hour", 5) == 0)
+			mult = 60ull * 60 * 1000000000;
+		else if (strncasecmp(p, "m", 2) == 0 ||
+			 strncasecmp(p, "min", 4) == 0)
+			mult = 60ull * 1000000000;
+		else if (strncasecmp(p, "s", 2) == 0 ||
+			 strncasecmp(p, "sec", 4) == 0)
+			mult = 1000000000ull;
+		else if (strncasecmp(p, "ms", 3) == 0 ||
+			 strncasecmp(p, "msec", 5) == 0)
+			mult = 1000000ull;
+		else if (strncasecmp(p, "us", 3) == 0 ||
+			 strncasecmp(p, "usec", 5) == 0)
+			mult = 1000ull;
+		else if (strncasecmp(p, "ns", 3) == 0 ||
+			 strncasecmp(p, "nsec", 5) == 0)
+			mult = 1ull;
+		else
+			return -1;
+
+		/* reject intervals that would wrap around in 64 bits */
+		if (val > ~0ull / mult)
+			return -1;
+		datap->period = val * mult;
+	}
+
+	/* enforce the 200-usec limit */
+	if (datap->period < 200000)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Create the provider-private data for the probe named 'prb' and register the
+ * probe with provider 'prv'.
+ *
+ * The period is parsed from the name by set_period(), the kind is taken from
+ * the first letter of the name, and the fd array is allocated with every slot
+ * set to -1.  Returns whatever dt_probe_insert() returns, or NULL if the name
+ * is rejected or an allocation fails.
+ */
+static dt_probe_t *profile_probe_insert(dtrace_hdl_t *dtp, dt_provider_t *prv,
+					const char *prb)
+{
+	profile_probe_t	*datap = dt_zalloc(dtp, sizeof (profile_probe_t));
+	int		slot;
+
+	if (datap == NULL)
+		return NULL;
+
+	if (set_period(prb, datap) < 0) {
+		dt_free(dtp, datap);
+		return NULL;
+	}
+
+	/* the probe name starts with "profile-" or else "tick-" */
+	datap->kind = (prb[0] == 'p') ? KIND_PROFILE : KIND_TICK;
+
+	datap->fds = dt_calloc(dtp, FDS_CNT, sizeof (int));
+	if (datap->fds == NULL) {
+		dt_free(dtp, datap);
+		return NULL;
+	}
+
+	/* no perf event has been opened yet */
+	for (slot = 0; slot < FDS_CNT; slot++)
+		datap->fds[slot] = -1;
+
+	return dt_probe_insert(dtp, prv, prvname, modname, funname, prb, datap);
+}
+
+/*
+ * Create the profile provider and register its default set of profile-n and
+ * tick-n probes.  Returns the number of probes that were inserted, or 0 if
+ * the provider could not be created.
+ */
+static int populate(dtrace_hdl_t *dtp)
+{
+	static const int	profile_rates[] = {
+		97, 199, 499, 997, 1999, 4001, 4999
+	};
+	static const int	tick_rates[] = { 1, 10, 100, 500, 1000, 5000 };
+	dt_provider_t		*prv;
+	char			name[32];
+	size_t			k;
+	int			count = 0;
+
+	prv = dt_provider_create(dtp, prvname, &dt_profile, &pattr);
+	if (prv == NULL)
+		return 0;
+
+	for (k = 0; k < sizeof(profile_rates) / sizeof(profile_rates[0]); k++) {
+		snprintf(name, sizeof(name), "%s%d", PREFIX_PROFILE,
+			 profile_rates[k]);
+		if (profile_probe_insert(dtp, prv, name) != NULL)
+			count++;
+	}
+
+	for (k = 0; k < sizeof(tick_rates) / sizeof(tick_rates[0]); k++) {
+		snprintf(name, sizeof(name), "%s%d", PREFIX_TICK,
+			 tick_rates[k]);
+		if (profile_probe_insert(dtp, prv, name) != NULL)
+			count++;
+	}
+
+	return count;
+}
+
+/*
+ * Generate a BPF trampoline for a profile probe.
+ *
+ * The trampoline function is called when a profile probe triggers, and it must
+ * satisfy the following prototype:
+ *
+ * int dt_profile(struct bpf_perf_event_data *ctx)
+ *
+ * The trampoline will populate a dt_bpf_context struct and then call the
+ * function that implements the compiled D clause. It returns the value that
+ * it gets back from that function.
+ *
+ * The instructions are appended to the IR list of 'pcb'.  Copying the
+ * registers and setting the probe arguments are not implemented yet (see the
+ * FIXME notes below); every dctx.argv[] slot is stored as 0.
+ */
+static void trampoline(dt_pcb_t *pcb)
+{
+ int i;
+ dt_irlist_t *dlp = &pcb->pcb_ir;
+ struct bpf_insn instr;
+ uint_t lbl_exit = dt_irlist_label(dlp);
+ dt_ident_t *idp;
+
+/*
+ * Offset, relative to the frame pointer, of the dctx member at offset 'off':
+ * dctx occupies the DCTX_SIZE bytes immediately below the frame pointer.
+ */
+#define DCTX_FP(off) (-(ushort_t)DCTX_SIZE + (ushort_t)(off))
+
+ /*
+ * int dt_profile(struct bpf_perf_event_data *ctx)
+ * {
+ * struct dt_bpf_context dctx;
+ *
+ * memset(&dctx, 0, sizeof(dctx));
+ *
+ * dctx.epid = EPID;
+ * (the memset in this pseudo-code is realized as explicit stores of 0;
+ * that is why dctx.pad and dctx.fault are cleared here)
+ */
+ idp = dt_dlib_get_var(pcb->pcb_hdl, "EPID");
+ assert(idp != NULL);
+ /*
+ * The -1 immediate is a placeholder; presumably it is replaced by the
+ * value of EPID through the di_extern reference set below -- TODO confirm.
+ */
+ instr = BPF_STORE_IMM(BPF_W, BPF_REG_FP, DCTX_FP(DCTX_EPID), -1);
+ dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+ dlp->dl_last->di_extern = idp;
+ instr = BPF_STORE_IMM(BPF_W, BPF_REG_FP, DCTX_FP(DCTX_PAD), 0);
+ dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+ instr = BPF_STORE_IMM(BPF_DW, BPF_REG_FP, DCTX_FP(DCTX_FAULT), 0);
+ dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+
+#if 0
+ /*
+ * dctx.regs = *regs;
+ */
+ /*
+ * FIXME:
+ * %r1 points to a context given by /usr/include/linux/bpf_perf_event.h:
+ * struct bpf_perf_event_data {
+ * bpf_user_pt_regs_t regs;
+ * __u64 sample_period;
+ * __u64 addr;
+ * };
+ * Since regs lives at the same base address as the context and
+ * since bpf_user_pt_regs_t might be the same as dt_pt_regs, I think
+ * we can just use the PT_REGS_ARGn definitions from libdtrace/dt_pt_regs.h.
+ * But we should revisit all that and fix as necessary.
+ */
+ for (i = 0; i < sizeof(dt_pt_regs); i += 8) {
+ instr = BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_1, i);
+ dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+ instr = BPF_STORE(BPF_DW, BPF_REG_FP, DCTX_FP(DCTX_REGS) + i,
+ BPF_REG_0);
+ dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+ }
+#endif
+
+ /*
+ * FIXME:
+ * We need to set the args here. They are:
+ * dctx.argv[0] = pc; kernel PC
+ * dctx.argv[1] = upc; userspace PC
+ * dctx.argv[2] = nsecs; elapsed nsec (profile-n only, not tick-n)
+ * Maybe look at what samples/bpf/[sampleip|trace_event]_kern.c
+ * does with PT_REGS_IP(&ctx->regs).
+ */
+
+ /*
+ * (dctx.argv[0] and on are cleared here, again standing in for the
+ * memset in the pseudo-code above)
+ */
+ /*
+ * FIXME: Once we set the args, I think we want to start this
+ * loop at pcb->pcb_pinfo.dtp_argc; see dt_prov_syscall.c.
+ * Actually, dtp_argc might only be for the typed args, of which
+ * the profile provider has none.
+ */
+ for (i = 0; i < sizeof(((struct dt_bpf_context *)0)->argv) / 8; i++) {
+ instr = BPF_STORE_IMM(BPF_DW, BPF_REG_FP, DCTX_FP(DCTX_ARG(i)),
+ 0);
+ dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+ }
+
+ /*
+ * We know the BPF context (ctx) is in %r1. Since we will be passing
+ * the DTrace context (dctx) as 2nd argument to dt_program, we need it
+ * in %r2.
+ */
+ instr = BPF_MOV_REG(BPF_REG_2, BPF_REG_FP);
+ dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+ instr = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, DCTX_FP(0));
+ dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+
+ /*
+ * rc = dt_program(ctx, dctx);
+ */
+ idp = dt_dlib_get_func(pcb->pcb_hdl, "dt_program");
+ assert(idp != NULL);
+ instr = BPF_CALL_FUNC(idp->di_id);
+ dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+ dlp->dl_last->di_extern = idp;
+
+ /*
+ * exit:
+ * return rc;
+ * }
+ *
+ * The return instruction carries the lbl_exit label; nothing in this
+ * function jumps to it.
+ */
+ instr = BPF_RETURN();
+ dt_irlist_append(dlp, dt_cg_node_alloc(lbl_exit, instr));
+}
+
+/*
+ * Report the typed arguments of a probe.  Probes of the profile provider have
+ * none, so the count is set to 0 and the descriptor array to NULL.
+ * Always returns 0.
+ */
+static int probe_info(dtrace_hdl_t *dtp, const dt_probe_t *prp,
+		      int *argcp, dt_argdesc_t **argvp)
+{
+	/* profile-provider probe arguments are not typed */
+	*argvp = NULL;
+	*argcp = 0;
+
+	return 0;
+}
+
+/*
+ * Create a profile-n or tick-n probe on demand for the probe description
+ * 'pdp'.
+ *
+ * Nothing is done unless the description has no probe id, names this provider
+ * with the empty module and function, has a probe name starting with "tick-"
+ * or "profile-", and does not match a probe that exists already.  The result
+ * of the insertion is not checked; the function always returns 0.
+ */
+static int provide(dtrace_hdl_t *dtp, const dtrace_probedesc_t *pdp)
+{
+	dt_provider_t	*prv;
+	int		is_tick, is_profile;
+
+	/* make sure we have IDNONE and a legal name */
+	if (pdp->id != DTRACE_IDNONE)
+		return 0;
+	if (strcmp(pdp->prv, prvname) != 0)
+		return 0;
+	if (strcmp(pdp->mod, modname) != 0)
+		return 0;
+	if (strcmp(pdp->fun, funname) != 0)
+		return 0;
+
+	is_tick = strncmp(pdp->prb, PREFIX_TICK, strlen(PREFIX_TICK)) == 0;
+	is_profile = strncmp(pdp->prb, PREFIX_PROFILE,
+			     strlen(PREFIX_PROFILE)) == 0;
+	if (!(is_tick || is_profile))
+		return 0;
+
+	/* return if we already have this probe */
+	if (dt_probe_lookup(dtp, pdp))
+		return 0;
+
+	/* get the provider */
+	prv = dt_provider_lookup(dtp, prvname);
+	if (prv == NULL)
+		return 0;
+
+	/* insert this probe */
+	profile_probe_insert(dtp, prv, pdp->prb);
+
+	return 0;
+}
+
+/*
+ * Attach the BPF program 'bpf_fd' to the probe 'prp'.
+ *
+ * A CPU-clock software perf event with the probe's sampling period is opened
+ * for each of the probe's fd slots and the BPF program is set on it.  A probe
+ * with a single slot gets its event on a randomly chosen online CPU; otherwise
+ * slot i uses online CPU i.  Slots whose event cannot be opened, or cannot
+ * take the program, are skipped.
+ *
+ * Returns 0 if at least one event was attached, -1 otherwise.
+ */
+static int attach(dtrace_hdl_t *dtp, const dt_probe_t *prp, int bpf_fd)
+{
+	profile_probe_t		*datap = prp->prv_data;
+	struct perf_event_attr	attr;
+	int			slot;
+	int			attached = 0;
+
+	memset(&attr, 0, sizeof(attr));
+	attr.type = PERF_TYPE_SOFTWARE;
+	attr.config = PERF_COUNT_SW_CPU_CLOCK;
+	attr.sample_type = PERF_SAMPLE_RAW;
+	/* FIXME: why have we not been doing this in other providers? */
+	attr.size = sizeof(struct perf_event_attr);
+	attr.wakeup_events = 1;
+	attr.freq = 0;
+	attr.sample_period = datap->period;
+
+	for (slot = 0; slot < FDS_CNT; slot++) {
+		int	cpu = slot;
+		int	fd;
+
+		/* if there is only one fd, place it at random */
+		if (FDS_CNT == 1)
+			cpu = rand() % dtp->dt_conf.num_online_cpus;
+
+		fd = perf_event_open(&attr, -1, dtp->dt_conf.cpus[cpu].cpu_id,
+				     -1, 0);
+		if (fd < 0)
+			continue;
+
+		if (ioctl(fd, PERF_EVENT_IOC_SET_BPF, bpf_fd) >= 0) {
+			datap->fds[slot] = fd;
+			attached++;
+		} else {
+			close(fd);
+		}
+	}
+
+	if (attached > 0)
+		return 0;
+
+	return -1;
+}
+
+/*
+ * Release the provider-private data of a probe: the fd array and the
+ * profile_probe_t itself.  The file descriptors are not closed here.
+ *
+ * 'arg' is the profile_probe_t that was handed to dt_probe_insert(); a NULL
+ * 'arg' is accepted and ignored.  The caller's pointer is passed by value and
+ * cannot be reset from here, so the caller must not use it afterwards.
+ * Always returns 0.
+ */
+static int probe_destroy(dtrace_hdl_t *dtp, void *arg)
+{
+	profile_probe_t	*datap = arg;
+
+	if (datap == NULL)
+		return 0;
+
+	dt_free(dtp, datap->fds);
+	dt_free(dtp, datap);
+
+	return 0;
+}
+
+/*
+ * Tear down a probe: close every perf event fd that is still open (any slot
+ * not holding -1) and then release the probe's private data through
+ * probe_destroy().  Always returns 0.
+ */
+static int probe_fini(dtrace_hdl_t *dtp, const dt_probe_t *prp)
+{
+	profile_probe_t	*datap = prp->prv_data;
+	int		slot;
+
+	for (slot = 0; slot < FDS_CNT; slot++) {
+		if (datap->fds[slot] == -1)
+			continue;
+		close(datap->fds[slot]);
+	}
+
+	probe_destroy(dtp, datap);
+
+	return 0;
+}
+
+/*
+ * Implementation of the profile provider: its name, the BPF program type used
+ * for its probes, and the callbacks defined in this file.
+ */
+dt_provimpl_t dt_profile = {
+ .name = prvname,
+ .prog_type = BPF_PROG_TYPE_PERF_EVENT,
+ .populate = &populate,
+ .trampoline = &trampoline,
+ .probe_info = &probe_info,
+ .provide = &provide,
+ .attach = &attach,
+ .probe_destroy = &probe_destroy,
+ .probe_fini = &probe_fini,
+};
diff --git a/libdtrace/dt_provider.h b/libdtrace/dt_provider.h
index 1fb616a0..d25f8e3f 100644
--- a/libdtrace/dt_provider.h
+++ b/libdtrace/dt_provider.h
@@ -80,6 +80,7 @@ typedef struct tp_probe {
extern dt_provimpl_t dt_dtrace;
extern dt_provimpl_t dt_fbt;
+extern dt_provimpl_t dt_profile;
extern dt_provimpl_t dt_sdt;
extern dt_provimpl_t dt_syscall;
--
2.18.2
More information about the DTrace-devel
mailing list