[DTrace-devel] [PATCH 5/5] Add a profile provider
eugene.loh at oracle.com
eugene.loh at oracle.com
Thu Jun 11 14:29:41 PDT 2020
From: Eugene Loh <eugene.loh at oracle.com>
The probes use perf_events with PERF_TYPE_SOFTWARE and
PERF_COUNT_SW_CPU_CLOCK.
There are many FIXMEs. Among them, the trampoline does not
yet write the entire DTrace context... at least not the DTrace
probe arguments. Also, profile-n is not yet implemented.
For other providers, the work of setting a probe's event_fd is
done in bpf_attach(), but the profile provider is sufficiently
different that it makes more sense to do so in the provider's
probe_info() function. That is, we would like to set the
probe's event_id and event_fd in probe_info(). On the other
hand, the function was declared to take a "const" probe.
This didn't even make sense, however, since the function's
only caller, dt_probe_args_info(), retrieved information
specifically so that it could set fields like event_id. So,
this commit removes the "const" from the probe in probe_info()'s
declaration.
The basic approach should work. It borrows from the kernel's
samples/bpf/ examples (sampleip_user.c and trace_event_user.c),
but I am worried about this statement in the perf_event_open()
man page regarding attaching a BPF program to these events:
PERF_EVENT_IOC_SET_BPF (since Linux 4.1)
This allows attaching a Berkeley Packet Filter (BPF)
program to an existing kprobe tracepoint event.
Are they serious about that kprobe tracepoint limitation?
We also need to choose a CPU. I guess we need one event per
CPU for profile-n but only one (explicitly chosen?) CPU for
tick-n.
Signed-off-by: Eugene Loh <eugene.loh at oracle.com>
---
BPF-DESIGN | 1 +
libdtrace/Build | 2 +-
libdtrace/dt_open.c | 1 +
libdtrace/dt_prov_dtrace.c | 2 +-
libdtrace/dt_prov_fbt.c | 2 +-
libdtrace/dt_prov_profile.c | 355 ++++++++++++++++++++++++++++++++++++
libdtrace/dt_prov_sdt.c | 2 +-
libdtrace/dt_prov_syscall.c | 2 +-
libdtrace/dt_provider.h | 3 +-
9 files changed, 364 insertions(+), 6 deletions(-)
create mode 100644 libdtrace/dt_prov_profile.c
diff --git a/BPF-DESIGN b/BPF-DESIGN
index 919a6d9d..1d1c3a2a 100644
--- a/BPF-DESIGN
+++ b/BPF-DESIGN
@@ -221,6 +221,7 @@ DTRACE BPF PROGRAM CONVENTIONS
BPF trampoline (probe specific)
-------------------------------
+ FIXME: add something for dt_dtrace and dt_profile.
Function Boundary Tracing (based on kprobe)
-------------------------------------------
The C equivalent implementation of the FBT trampoline program is:
diff --git a/libdtrace/Build b/libdtrace/Build
index 1fe41d53..cf0718ff 100644
--- a/libdtrace/Build
+++ b/libdtrace/Build
@@ -21,7 +21,7 @@ libdtrace-build_SOURCES = dt_lex.c dt_aggregate.c dt_as.c dt_bpf.c \
dt_proc.c dt_program.c dt_provider.c dt_regset.c \
dt_string.c dt_strtab.c dt_subr.c dt_symtab.c \
dt_work.c dt_xlator.c dt_peb.c dt_prov_dtrace.c \
- dt_prov_fbt.c dt_prov_sdt.c dt_prov_syscall.c
+ dt_prov_fbt.c dt_prov_profile.c dt_prov_sdt.c dt_prov_syscall.c
libdtrace-build_SRCDEPS := dt_grammar.h $(objdir)/dt_git_version.h
diff --git a/libdtrace/dt_open.c b/libdtrace/dt_open.c
index 6a1efbfb..f6e82954 100644
--- a/libdtrace/dt_open.c
+++ b/libdtrace/dt_open.c
@@ -64,6 +64,7 @@ const dt_version_t _dtrace_versions[] = {
static const dt_provimpl_t *dt_providers[] = {
&dt_dtrace,
&dt_fbt,
+ &dt_profile,
&dt_sdt,
&dt_syscall,
};
diff --git a/libdtrace/dt_prov_dtrace.c b/libdtrace/dt_prov_dtrace.c
index fe9759de..085d8bc4 100644
--- a/libdtrace/dt_prov_dtrace.c
+++ b/libdtrace/dt_prov_dtrace.c
@@ -229,7 +229,7 @@ out:
return spec;
}
-static int probe_info(dtrace_hdl_t *dtp, const dt_probe_t *prp,
+static int probe_info(dtrace_hdl_t *dtp, dt_probe_t *prp,
int *idp, int *argcp, dt_argdesc_t **argvp)
{
char *spec;
diff --git a/libdtrace/dt_prov_fbt.c b/libdtrace/dt_prov_fbt.c
index 1aa3ca3c..bea9536f 100644
--- a/libdtrace/dt_prov_fbt.c
+++ b/libdtrace/dt_prov_fbt.c
@@ -272,7 +272,7 @@ static void trampoline(dt_pcb_t *pcb)
dt_irlist_append(dlp, dt_cg_node_alloc(lbl_exit, instr));
}
-static int probe_info(dtrace_hdl_t *dtp, const dt_probe_t *prp,
+static int probe_info(dtrace_hdl_t *dtp, dt_probe_t *prp,
int *idp, int *argcp, dt_argdesc_t **argvp)
{
int fd;
diff --git a/libdtrace/dt_prov_profile.c b/libdtrace/dt_prov_profile.c
new file mode 100644
index 00000000..821269a6
--- /dev/null
+++ b/libdtrace/dt_prov_profile.c
@@ -0,0 +1,355 @@
+/*
+ * Oracle Linux DTrace.
+ * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
+ * Licensed under the Universal Permissive License v 1.0 as shown at
+ * http://oss.oracle.com/licenses/upl.
+ *
+ * The profile provider for DTrace.
+ */
+#include <assert.h>
+// #include <errno.h>
+// #include <fcntl.h>
+// #include <stdio.h>
+// #include <stdlib.h>
+// #include <string.h>
+// #include <unistd.h>
+// #include <linux/perf_event.h>
+// #include <linux/bpf.h>
+// #include <sys/stat.h>
+// #include <sys/types.h>
+
+#include <bpf_asm.h>
+
+// #include "dt_impl.h"
+#include "dt_bpf.h"
+// #include "dt_bpf_builtins.h"
+// #include "dt_provider.h"
+#include "dt_probe.h"
+// #include "dt_pt_regs.h"
+
+static const char prvname[] = "profile"; /* provider name used for every probe registered here */
+static const char modname[] = ""; /* profile probes are registered with an empty module name */
+static const char funname[] = ""; /* profile probes are registered with an empty function name */
+
+#define KIND_PROFILE 0 /* set_period_freq() result for "profile-" names */
+#define KIND_TICK 1 /* set_period_freq() result for "tick-" names */
+#define PREFIX_PROFILE "profile-" /* probe-name prefix of profile-n probes */
+#define PREFIX_TICK "tick-" /* probe-name prefix of tick-n probes */
+
+static const dtrace_pattr_t pattr = { /* passed to dt_provider_create(); presumably one row each for provider, module, function, name, args -- confirm against dtrace_pattr_t */
+{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
+{ DTRACE_STABILITY_UNSTABLE, DTRACE_STABILITY_UNSTABLE, DTRACE_CLASS_UNKNOWN },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
+{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
+{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
+};
+
+/*
+ * Insert one probe named "<prefix><rate>" for each entry of rates[] and return
+ * how many of those insertions succeeded.
+ */
+static int insert_rate_probes(dtrace_hdl_t *dtp, dt_provider_t *provider,
+			      const char *prefix, const int *rates,
+			      size_t nrates)
+{
+	char	name[32];
+	size_t	k;
+	int	inserted = 0;
+
+	for (k = 0; k < nrates; k++) {
+		snprintf(name, sizeof(name), "%s%d", prefix, rates[k]);
+		if (dt_probe_insert(dtp, provider, prvname, modname, funname,
+				    name))
+			inserted++;
+	}
+
+	return inserted;
+}
+
+/*
+ * Create the profile provider and register its built-in probes: a fixed set
+ * of profile-n rates followed by a fixed set of tick-n rates.
+ *
+ * Returns the number of probes that were inserted, or 0 if the provider could
+ * not be created.
+ */
+static int populate(dtrace_hdl_t *dtp)
+{
+	const int	profile_rates[] = { 97, 199, 499, 997, 1999, 4001, 4999 };
+	const int	tick_rates[] = { 1, 10, 100, 500, 1000, 5000 };
+	dt_provider_t	*provider;
+	int		total;
+
+	provider = dt_provider_create(dtp, prvname, &dt_profile, &pattr);
+	if (provider == NULL)
+		return 0;
+
+	total = insert_rate_probes(dtp, provider, PREFIX_PROFILE,
+				   profile_rates,
+				   sizeof(profile_rates) /
+				   sizeof(profile_rates[0]));
+	total += insert_rate_probes(dtp, provider, PREFIX_TICK, tick_rates,
+				    sizeof(tick_rates) / sizeof(tick_rates[0]));
+
+	return total;
+}
+
+/*
+ * Generate a BPF trampoline for a profile probe.
+ *
+ * The trampoline function is called when a profile probe triggers, and it must
+ * satisfy the following prototype:
+ *
+ * int dt_profile(struct bpf_perf_event_data *ctx)
+ *
+ * The trampoline will populate a dt_bpf_context struct and then call the
+ * function that implements the compiled D clause. It returns the value that
+ * it gets back from that function.
+ *
+ * The instructions are appended, in order, to the IR list of the given pcb
+ * (pcb->pcb_ir).
+ *
+ * NOTE: as written, only the epid, pad, fault, and argv[] members of the
+ * DTrace context are stored. Copying the registers is compiled out (#if 0)
+ * and the probe arguments are all stored as 0; see the FIXMEs below.
+ */
+static void trampoline(dt_pcb_t *pcb)
+{
+ int i;
+ dt_irlist_t *dlp = &pcb->pcb_ir;
+ struct bpf_insn instr;
+ uint_t lbl_exit = dt_irlist_label(dlp);
+ dt_ident_t *idp;
+
+#define DCTX_FP(off) (-(ushort_t)DCTX_SIZE + (ushort_t)(off))
+
+ /*
+ * int dt_profile(struct bpf_perf_event_data *ctx)
+ * {
+ * struct dt_bpf_context dctx;
+ *
+ * memset(&dctx, 0, sizeof(dctx));
+ *
+ * dctx.epid = EPID;
+ * (we clear dctx.pad and dctx.fault because of the memset above)
+ *
+ * No memset is actually emitted: each member is cleared with its own
+ * store instruction. The EPID store carries a placeholder immediate
+ * of -1 and is tagged with the "EPID" identifier through di_extern;
+ * presumably the real value is patched in when the program is linked
+ * (TODO confirm).
+ */
+ idp = dt_dlib_get_var(pcb->pcb_hdl, "EPID");
+ assert(idp != NULL);
+ instr = BPF_STORE_IMM(BPF_W, BPF_REG_FP, DCTX_FP(DCTX_EPID), -1);
+ dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+ dlp->dl_last->di_extern = idp;
+ instr = BPF_STORE_IMM(BPF_W, BPF_REG_FP, DCTX_FP(DCTX_PAD), 0);
+ dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+ instr = BPF_STORE_IMM(BPF_DW, BPF_REG_FP, DCTX_FP(DCTX_FAULT), 0);
+ dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+
+#if 0
+ /*
+ * dctx.regs = *regs;
+ */
+ /*
+ * FIXME:
+ * %r1 points to a context given by /usr/include/linux/bpf_perf_event.h:
+ * struct bpf_perf_event_data {
+ * bpf_user_pt_regs_t regs;
+ * __u64 sample_period;
+ * __u64 addr;
+ * };
+ * Since regs lives at the same base address as the context and
+ * since bpf_user_pt_regs_t might be the same as dt_pt_regs, I think
+ * we can just use the PT_REGS_ARGn definitions from libdtrace/dt_pt_regs.h.
+ * But we should revisit all that and fix as necessary.
+ */
+ for (i = 0; i < sizeof(dt_pt_regs); i += 8) {
+ instr = BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_1, i);
+ dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+ instr = BPF_STORE(BPF_DW, BPF_REG_FP, DCTX_FP(DCTX_REGS) + i,
+ BPF_REG_0);
+ dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+ }
+#endif
+
+ /*
+ * FIXME:
+ * We need to set the args here. They are:
+ * dctx.argv[0] = pc; kernel PC
+ * dctx.argv[1] = upc; userspace PC
+ * dctx.argv[2] = nsecs; elapsed nsec (profile-n only, not tick-n)
+ * Maybe look at what samples/bpf/[sampleip|trace_event]_kern.c
+ * does with PT_REGS_IP(&ctx->regs).
+ */
+
+ /*
+ * (we clear dctx.argv[0] and on because of the memset above)
+ */
+ /*
+ * FIXME: Once we set the args, I think we want to start this
+ * loop at pcb->pcb_pinfo.dtp_argc; see dt_prov_syscall.c.
+ *
+ * The loop emits one 8-byte store of 0 per argv[] slot; the slot
+ * count is the size of the argv member divided by 8.
+ */
+ for (i = 0; i < sizeof(((struct dt_bpf_context *)0)->argv) / 8; i++) {
+ instr = BPF_STORE_IMM(BPF_DW, BPF_REG_FP, DCTX_FP(DCTX_ARG(i)),
+ 0);
+ dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+ }
+
+ /*
+ * We know the BPF context (ctx) is in %r1. Since we will be passing
+ * the DTrace context (dctx) as 2nd argument to dt_program, we need it
+ * in %r2.
+ *
+ * DCTX_FP(0) is the frame-pointer-relative offset of the start of
+ * dctx (-DCTX_SIZE), so %r2 = %fp + DCTX_FP(0) is &dctx.
+ */
+ instr = BPF_MOV_REG(BPF_REG_2, BPF_REG_FP);
+ dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+ instr = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, DCTX_FP(0));
+ dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+
+ /*
+ * rc = dt_program(ctx, dctx);
+ *
+ * As with EPID above, the call instruction is tagged with the
+ * dt_program identifier through di_extern.
+ */
+ idp = dt_dlib_get_func(pcb->pcb_hdl, "dt_program");
+ assert(idp != NULL);
+ instr = BPF_CALL_FUNC(idp->di_id);
+ dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+ dlp->dl_last->di_extern = idp;
+
+ /*
+ * exit:
+ * return rc;
+ * }
+ *
+ * lbl_exit labels the return instruction; nothing emitted by this
+ * function currently branches to it.
+ */
+ instr = BPF_RETURN();
+ dt_irlist_append(dlp, dt_cg_node_alloc(lbl_exit, instr));
+}
+
+/*
+ * Parse a profile-provider probe name and store the requested sampling rate
+ * in a perf_event attribute structure.
+ *
+ * The name must have the form "profile-<n>[unit]" or "tick-<n>[unit]", where
+ * <n> is a nonzero decimal number.  With no unit, or with the unit "hz", <n>
+ * is a frequency: it is stored in ap->sample_freq and ap->freq is set.  Any
+ * other unit must be one of d/day, h/hour, m/min, s/sec, ms/msec, us/usec or
+ * ns/nsec (compared without regard to case); <n> is then converted to
+ * nanoseconds and stored in ap->sample_period, with ap->freq cleared.
+ *
+ * Returns KIND_PROFILE or KIND_TICK on success.  Returns -1, leaving *ap
+ * untouched, if the prefix is unknown, the number is missing, zero or too
+ * large, the unit is unknown, or the period does not fit in 64 bits.
+ */
+static int set_period_freq(const char *name, struct perf_event_attr *ap)
+{
+	static const struct {
+		const char		*suffix;
+		unsigned long long	nsecs;	/* nanoseconds per unit */
+	} units[] = {
+		{ "d",		24ULL * 60 * 60 * 1000000000ULL },
+		{ "day",	24ULL * 60 * 60 * 1000000000ULL },
+		{ "h",		60ULL * 60 * 1000000000ULL },
+		{ "hour",	60ULL * 60 * 1000000000ULL },
+		{ "m",		60ULL * 1000000000ULL },
+		{ "min",	60ULL * 1000000000ULL },
+		{ "s",		1000000000ULL },
+		{ "sec",	1000000000ULL },
+		{ "ms",		1000000ULL },
+		{ "msec",	1000000ULL },
+		{ "us",		1000ULL },
+		{ "usec",	1000ULL },
+		{ "ns",		1ULL },
+		{ "nsec",	1ULL },
+	};
+	const unsigned long long	ullmax = ~0ULL;
+	unsigned long long		val = 0;
+	const char			*p;
+	size_t				i;
+	int				kind;
+
+	if (strncmp(name, PREFIX_PROFILE, strlen(PREFIX_PROFILE)) == 0) {
+		kind = KIND_PROFILE;
+		p = name + strlen(PREFIX_PROFILE);
+	} else if (strncmp(name, PREFIX_TICK, strlen(PREFIX_TICK)) == 0) {
+		kind = KIND_TICK;
+		p = name + strlen(PREFIX_TICK);
+	} else
+		return -1;
+
+	/*
+	 * Accumulate the digits by hand so that a missing number leaves val
+	 * at 0 (rejected below) rather than uninitialized, and so that a
+	 * number too large for 64 bits is rejected instead of wrapping.
+	 */
+	for (; *p >= '0' && *p <= '9'; p++) {
+		unsigned int	digit = *p - '0';
+
+		if (val > (ullmax - digit) / 10)
+			return -1;
+		val = val * 10 + digit;
+	}
+	if (val == 0)
+		return -1;
+
+	/* No unit, or "hz": the number is a frequency. */
+	if (p[0] == '\0' || strncasecmp(p, "hz", 3) == 0) {
+		ap->sample_freq = val;
+		ap->freq = 1;
+		return kind;
+	}
+
+	/*
+	 * Otherwise the unit must match a table entry exactly; comparing
+	 * strlen + 1 bytes includes the terminating NUL in the comparison.
+	 */
+	for (i = 0; i < sizeof(units) / sizeof(units[0]); i++) {
+		if (strncasecmp(p, units[i].suffix,
+				strlen(units[i].suffix) + 1) != 0)
+			continue;
+
+		if (val > ullmax / units[i].nsecs)
+			return -1;
+
+		ap->sample_period = val * units[i].nsecs;
+		ap->freq = 0;
+		return kind;
+	}
+
+	return -1;
+}
+
+/*
+ * Provide probe information for a profile-provider probe and open the perf
+ * event that backs it.
+ *
+ * The probe must belong to this provider (matching provider, module and
+ * function names) and its name must be accepted by set_period_freq().  A
+ * software CPU-clock perf event with the requested rate is opened on CPU 0
+ * for all processes.
+ *
+ * On success, returns 0 after storing the event id in *idp and
+ * prp->event_id, the event file descriptor in prp->event_fd, and an empty
+ * argument list in *argcp / *argvp.  On failure, returns -1 and leaves prp
+ * and all output parameters unmodified.
+ *
+ * dtp is not used.
+ */
+static int probe_info(dtrace_hdl_t *dtp, dt_probe_t *prp,
+		      int *idp, int *argcp, dt_argdesc_t **argvp)
+{
+	struct perf_event_attr	attr;
+	int			fd, kind;
+
+	if (strcmp(prp->desc->prv, prvname) ||
+	    strcmp(prp->desc->mod, modname) ||
+	    strcmp(prp->desc->fun, funname))
+		return -1;
+
+	memset(&attr, 0, sizeof(attr));
+	attr.type = PERF_TYPE_SOFTWARE;
+	attr.config = PERF_COUNT_SW_CPU_CLOCK;
+	attr.sample_type = PERF_SAMPLE_RAW;
+	attr.size = sizeof(attr);	/* FIXME: why have we not been doing this in other providers? */
+	attr.wakeup_events = 1;
+	/*
+	 * NOTE(review): inherit was first set to 0 and then immediately
+	 * overwritten with 1; only the effective value is kept here.  Confirm
+	 * that inheritance is really wanted for an event opened with pid -1.
+	 */
+	attr.inherit = 1;
+
+	kind = set_period_freq(prp->desc->prb, &attr);
+	if (kind < 0)
+		return -1;
+	if (kind == KIND_PROFILE)	/* FIXME: still need to support this */
+		return -1;
+
+	/*
+	 * FIXME:
+	 * what should cpu be? 0? (that's what kprobes does) one case for
+	 * each cpu?  The perf_event_open() man page says pid -1 with cpu -1
+	 * is not legal.  Is this a profile-n vs tick-n thing?
+	 */
+	fd = perf_event_open(&attr, -1, 0 /* cpu */, -1, 0);
+	if (fd < 0)
+		return -1;
+
+	/*
+	 * Publish the results only once the event is open, so that a failed
+	 * call does not leave a stale event id behind in the probe.
+	 */
+	*idp = prp->event_id = PERF_COUNT_SW_CPU_CLOCK;
+	prp->event_fd = fd;
+
+	/*
+	 * FIXME: should *argcp be 3 for profile-n and 2 for tick-n?
+	 * also need to set argvp
+	 */
+	*argcp = 0;
+	*argvp = NULL;
+
+	return 0;
+}
+
+/*
+ * Register a profile-provider probe on demand.
+ *
+ * The description must carry no probe id, must name this provider with empty
+ * module and function names, and must have a probe name that starts with the
+ * tick or profile prefix.  If no such probe is known yet, it is inserted.
+ *
+ * Always returns 0, whether or not a probe was inserted.
+ */
+static int provide(dtrace_hdl_t *dtp, const dtrace_probedesc_t *pdp)
+{
+	dt_provider_t	*provider;
+	int		is_tick, is_profile;
+
+	/* Only descriptions without an id that name this provider qualify. */
+	if (pdp->id != DTRACE_IDNONE)
+		return 0;
+	if (strcmp(pdp->prv, prvname) != 0)
+		return 0;
+	if (strcmp(pdp->mod, modname) != 0)
+		return 0;
+	if (strcmp(pdp->fun, funname) != 0)
+		return 0;
+
+	/* The probe name must start with one of the two known prefixes. */
+	is_tick = strncmp(pdp->prb, PREFIX_TICK, strlen(PREFIX_TICK)) == 0;
+	is_profile = strncmp(pdp->prb, PREFIX_PROFILE,
+			     strlen(PREFIX_PROFILE)) == 0;
+	if (!is_tick && !is_profile)
+		return 0;
+
+	/* Nothing to do if the probe is already known. */
+	if (dt_probe_lookup(dtp, pdp))
+		return 0;
+
+	provider = dt_provider_lookup(dtp, prvname);
+	if (provider)
+		dt_probe_insert(dtp, provider, prvname, modname, funname,
+				pdp->prb);
+
+	return 0;
+}
+
+/*
+ * Provider implementation record for the profile provider: its name, the BPF
+ * program type used for its probes, and its callback functions.
+ */
+dt_provimpl_t dt_profile = {
+	.name		= prvname,
+	.prog_type	= BPF_PROG_TYPE_PERF_EVENT,
+	.populate	= populate,
+	.trampoline	= trampoline,
+	.probe_info	= probe_info,
+	.provide	= provide,
+};
+#if 0
+#include <linux/fs.h>
+#include <linux/ktime.h>
+#include <linux/miscdevice.h>
+#include <linux/slab.h>
+#include <asm/irq_regs.h>
+#include <asm/ptrace.h>
+
+#include <linux/hardirq.h>
+#include <linux/profile.h>
+
+static void profile_[tick|prof]_fn(uintptr_t arg)
+{
+ unsigned long pc = 0, upc = 0;
+ struct pt_regs *regs = get_irq_regs();
+
+ /*
+ * NOTE(review): this whole section is compiled out and is not valid C
+ * as written (see the function name). It looks like kernel-side
+ * reference code kept to show how the kernel PC (pc) and userspace PC
+ * (upc) are determined -- confirm and remove once the trampoline sets
+ * the probe arguments.
+ *
+ * If regs == NULL, then we were called from softirq context which
+ * also means that we didn't actually interrupt any processing (kernel
+ * or user space).
+ * If regs != NULL, then we did actually get called from hardirq
+ * because the timer interrupt did really interrupt something that was
+ * going on on the CPU (could be user mode or kernel mode).
+ */
+ if (regs == NULL) {
+ uint64_t stack[8];
+
+ dtrace_getpcstack(stack, 8, 0, NULL);
+ pc = stack[7];
+ } else if (user_mode(regs))
+ upc = instruction_pointer(regs);
+ else
+ pc = instruction_pointer(regs);
+}
+#endif
diff --git a/libdtrace/dt_prov_sdt.c b/libdtrace/dt_prov_sdt.c
index 6d99653a..beba66c1 100644
--- a/libdtrace/dt_prov_sdt.c
+++ b/libdtrace/dt_prov_sdt.c
@@ -379,7 +379,7 @@ static void trampoline(dt_pcb_t *pcb)
dt_irlist_append(dlp, dt_cg_node_alloc(lbl_exit, instr));
}
-static int probe_info(dtrace_hdl_t *dtp, const dt_probe_t *prp,
+static int probe_info(dtrace_hdl_t *dtp, dt_probe_t *prp,
int *idp, int *argcp, dt_argdesc_t **argvp)
{
FILE *f;
diff --git a/libdtrace/dt_prov_syscall.c b/libdtrace/dt_prov_syscall.c
index 0b0ca954..9d8f22b4 100644
--- a/libdtrace/dt_prov_syscall.c
+++ b/libdtrace/dt_prov_syscall.c
@@ -234,7 +234,7 @@ static void trampoline(dt_pcb_t *pcb)
dt_irlist_append(dlp, dt_cg_node_alloc(lbl_exit, instr));
}
-static int probe_info(dtrace_hdl_t *dtp, const dt_probe_t *prp,
+static int probe_info(dtrace_hdl_t *dtp, dt_probe_t *prp,
int *idp, int *argcp, dt_argdesc_t **argvp)
{
FILE *f;
diff --git a/libdtrace/dt_provider.h b/libdtrace/dt_provider.h
index 50f663f1..51d282f7 100644
--- a/libdtrace/dt_provider.h
+++ b/libdtrace/dt_provider.h
@@ -60,7 +60,7 @@ typedef struct dt_provimpl {
int prog_type; /* BPF program type */
int (*populate)(dtrace_hdl_t *dtp); /* register probes */
int (*probe_info)(dtrace_hdl_t *dtp, /* get probe info */
- const struct dt_probe *prp,
+ struct dt_probe *prp,
int *idp, int *argcp, dt_argdesc_t **argvp);
int (*provide)(dtrace_hdl_t *dtp, /* provide probes */
const dtrace_probedesc_t *pdp);
@@ -74,6 +74,7 @@ extern int tp_event_info(dtrace_hdl_t *dtp, FILE *f, int skip, int *idp,
extern dt_provimpl_t dt_dtrace;
extern dt_provimpl_t dt_fbt;
+extern dt_provimpl_t dt_profile;
extern dt_provimpl_t dt_sdt;
extern dt_provimpl_t dt_syscall;
--
2.18.2
More information about the DTrace-devel
mailing list