[DTrace-devel] [PATCH v2 01/16] Add a CPC provider
Kris Van Hees
kris.van.hees at oracle.com
Fri Feb 24 00:49:34 UTC 2023
Reviewed-by: Kris Van Hees <kris.van.hees at oracle.com>
... with a small name change (prvdata -> prv_data)
On Wed, Feb 22, 2023 at 08:04:13PM -0500, eugene.loh--- via DTrace-devel wrote:
> From: Eugene Loh <eugene.loh at oracle.com>
>
> Use libpfm to populate CPC probes.
>
> The PC is placed into arg0 (if kernel) or arg1 (if user-space),
> the other arg being 0. This initialization is in a new function
> dt_cg_tramp_copy_pc_from_regs() so that it can be used by any other
> provider (like profile) that needs it.
>
> Current limitations include:
>
> *) no support for optional attributes
>
> *) no support for raw events
>
> Signed-off-by: Eugene Loh <eugene.loh at oracle.com>
> ---
> dtrace.spec | 6 +-
> libdtrace/Build | 8 +-
> libdtrace/dt_cg.c | 74 ++++++
> libdtrace/dt_cg.h | 1 +
> libdtrace/dt_open.c | 10 +-
> libdtrace/dt_prov_cpc.c | 482 ++++++++++++++++++++++++++++++++++++++++
> libdtrace/dt_provider.h | 5 +-
> 7 files changed, 577 insertions(+), 9 deletions(-)
> create mode 100644 libdtrace/dt_prov_cpc.c
>
> diff --git a/dtrace.spec b/dtrace.spec
> index 1049738d..d12a2a82 100644
> --- a/dtrace.spec
> +++ b/dtrace.spec
> @@ -1,7 +1,7 @@
> # spec file for package dtrace
> #
> # Oracle Linux DTrace.
> -# Copyright (c) 2011, 2022, Oracle and/or its affiliates. All rights reserved.
> +# Copyright (c) 2011, 2023, Oracle and/or its affiliates. All rights reserved.
> # Licensed under the Universal Permissive License v 1.0 as shown at
> # http://oss.oracle.com/licenses/upl.
>
> @@ -55,9 +55,9 @@ BuildRequires: rpm
> Name: dtrace
> License: Universal Permissive License (UPL), Version 1.0
> Group: Development/Tools
> -Requires: cpp elfutils-libelf zlib libpcap
> +Requires: cpp elfutils-libelf zlib libpcap libpfm
> BuildRequires: glibc-headers bison flex zlib-devel elfutils-libelf-devel systemd systemd-devel
> -BuildRequires: glibc-static %{glibc32} wireshark libpcap-devel valgrind-devel
> +BuildRequires: glibc-static %{glibc32} wireshark libpcap-devel valgrind-devel libpfm-devel
> %if "%{?dist}" == ".el7"
> Requires: fuse
> BuildRequires: fuse-devel
> diff --git a/libdtrace/Build b/libdtrace/Build
> index c1ef05ad..f9d11876 100644
> --- a/libdtrace/Build
> +++ b/libdtrace/Build
> @@ -1,5 +1,5 @@
> # Oracle Linux DTrace.
> -# Copyright (c) 2011, 2022, Oracle and/or its affiliates. All rights reserved.
> +# Copyright (c) 2011, 2023, Oracle and/or its affiliates. All rights reserved.
> # Licensed under the Universal Permissive License v 1.0 as shown at
> # http://oss.oracle.com/licenses/upl.
>
> @@ -46,6 +46,7 @@ libdtrace-build_SOURCES = dt_aggregate.c \
> dt_probe.c \
> dt_proc.c \
> dt_program.c \
> + dt_prov_cpc.c \
> dt_prov_dtrace.c \
> dt_prov_fbt.c \
> dt_prov_profile.c \
> @@ -69,9 +70,9 @@ SHLIBS += libdtrace
> libdtrace_DIR := $(current-dir)
> libdtrace_TARGET = libdtrace
> ifdef HAVE_LIBCTF
> -libdtrace_LIBS := -lctf -lelf -lz -lrt -lpcap -lpthread -ldl -lm
> +libdtrace_LIBS := -lctf -lelf -lz -lrt -lpcap -lpthread -ldl -lm -lpfm
> else
> -libdtrace_LIBS := -ldtrace-ctf -lelf -lz -lrt -lpcap -lpthread -ldl -lm
> +libdtrace_LIBS := -ldtrace-ctf -lelf -lz -lrt -lpcap -lpthread -ldl -lm -lpfm
> endif
> libdtrace_VERSION := 2.0.0
> libdtrace_SONAME := libdtrace.so.2
> @@ -86,6 +87,7 @@ dt_cg.c_CFLAGS := -Wno-pedantic
> dt_dis.c_CFLAGS := -Wno-pedantic
> dt_pid.c_CFLAGS := -Wno-pedantic
> dt_proc.c_CFLAGS := -Wno-pedantic
> +dt_prov_cpc.c_CFLAGS := -Wno-pedantic
> dt_prov_dtrace.c_CFLAGS := -Wno-pedantic
> dt_prov_fbt.c_CFLAGS := -Wno-pedantic
> dt_prov_profile.c_CFLAGS := -Wno-pedantic
> diff --git a/libdtrace/dt_cg.c b/libdtrace/dt_cg.c
> index cb284f07..b1626a9c 100644
> --- a/libdtrace/dt_cg.c
> +++ b/libdtrace/dt_cg.c
> @@ -400,6 +400,80 @@ dt_cg_tramp_copy_args_from_regs(dt_pcb_t *pcb, int called)
> }
> }
>
> +/*
> + * For some providers, we have
> + * - arg0 = PC if kernel (0 otherwise)
> + * - arg1 = PC if user space (0 otherwise)
> + *
> + * So put the PC in both arg0 and arg1, test the PC, and then zero out
> + * either arg0 or arg1, as appropriate.
> + *
> + * The caller must ensure that %r7 and %r8 contain the values set by
> + * the dt_cg_tramp_prologue*() functions.
> + */
> +void
> +dt_cg_tramp_copy_pc_from_regs(dt_pcb_t *pcb)
> +{
> + dtrace_hdl_t *dtp = pcb->pcb_hdl;
> + dt_regset_t *drp = pcb->pcb_regs;
> + dt_irlist_t *dlp = &pcb->pcb_ir;
> + uint_t Luser = dt_irlist_label(dlp);
> + uint_t Ldone = dt_irlist_label(dlp);
> +
> + if (dt_regset_xalloc_args(drp) == -1)
> + longjmp(yypcb->pcb_jmpbuf, EDT_NOREG);
> +
> + /* place the PC in %r3, arg0, and arg1 */
> + emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_3, BPF_REG_8, PT_REGS_IP));
> + emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(0), BPF_REG_3));
> + emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(1), BPF_REG_3));
> +
> + /* check if the PC is kernel or user space */
> + if (dtp->dt_bpfhelper[BPF_FUNC_probe_read_kernel] == BPF_FUNC_probe_read_kernel) {
> + /*
> + * Use probe_read_kernel() if it really is probe_read_kernel().
> + * On older kernels, it does not exist and is aliased to something else.
> + */
> +
> + /* test just a single byte */
> + emit(dlp, BPF_MOV_IMM(BPF_REG_2, 1));
> +
> + /* safe to write to FP+DT_STK_SP_BASE, which becomes the clause stack */
> + emit(dlp, BPF_MOV_REG(BPF_REG_1, BPF_REG_FP));
> + emit(dlp, BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, DT_STK_SP_BASE));
> +
> + /* bpf_probe_read_kernel(%fp + DT_STK_SP_BASE, 1, PC) */
> + dt_regset_xalloc(drp, BPF_REG_0);
> + emit(dlp, BPF_CALL_HELPER(BPF_FUNC_probe_read_kernel));
> +
> + /* if there was a problem, assume it was user space */
> + emit(dlp, BPF_BRANCH_IMM(BPF_JNE, BPF_REG_0, 0, Luser));
> + dt_regset_free(drp, BPF_REG_0);
> + } else {
> + /*
> + * If no real probe_read_kernel() exists, just test the highest bit.
> + * This is not as robust, but probably works just fine for us.
> + */
> +
> + /* if the highest bit is 0, assume it was user space */
> + emit(dlp, BPF_ALU64_IMM(BPF_RSH, BPF_REG_3, 63));
> + emit(dlp, BPF_BRANCH_IMM(BPF_JEQ, BPF_REG_3, 0, Luser));
> + }
> +
> + /* PC is kernel space (zero out arg1) */
> + emit(dlp, BPF_STORE_IMM(BPF_DW, BPF_REG_7, DMST_ARG(1), 0));
> + emit(dlp, BPF_JUMP(Ldone));
> +
> + /* PC is user space (zero out arg0) */
> + emitl(dlp, Luser,
> + BPF_STORE_IMM(BPF_DW, BPF_REG_7, DMST_ARG(0), 0));
> +
> + /* done */
> + emitl(dlp, Ldone,
> + BPF_NOP());
> + dt_regset_free_args(drp);
> +}
> +
> /*
> * Copy return value from a dt_pt_regs structure referenced by %r8 to
> * mst->arg[1]. Zero the other args.
> diff --git a/libdtrace/dt_cg.h b/libdtrace/dt_cg.h
> index 3742bc6a..b4f46e28 100644
> --- a/libdtrace/dt_cg.h
> +++ b/libdtrace/dt_cg.h
> @@ -25,6 +25,7 @@ extern void dt_cg_tramp_prologue(dt_pcb_t *pcb);
> extern void dt_cg_tramp_clear_regs(dt_pcb_t *pcb);
> extern void dt_cg_tramp_copy_regs(dt_pcb_t *pcb);
> extern void dt_cg_tramp_copy_args_from_regs(dt_pcb_t *pcb, int called);
> +extern void dt_cg_tramp_copy_pc_from_regs(dt_pcb_t *pcb);
> extern void dt_cg_tramp_copy_rval_from_regs(dt_pcb_t *pcb);
> extern void dt_cg_tramp_call_clauses(dt_pcb_t *pcb, const dt_probe_t *prp,
> dt_activity_t act);
> diff --git a/libdtrace/dt_open.c b/libdtrace/dt_open.c
> index a3754a32..7f8f680a 100644
> --- a/libdtrace/dt_open.c
> +++ b/libdtrace/dt_open.c
> @@ -1,6 +1,6 @@
> /*
> * Oracle Linux DTrace.
> - * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
> + * Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
> * Licensed under the Universal Permissive License v 1.0 as shown at
> * http://oss.oracle.com/licenses/upl.
> */
> @@ -64,7 +64,8 @@ const dt_version_t _dtrace_versions[] = {
> * provider module may create multiple providers.
> */
> static const dt_provimpl_t *dt_providers[] = {
> - &dt_dtrace,
> + &dt_dtrace, /* list dt_dtrace first */
> + &dt_cpc,
> &dt_fbt,
> &dt_profile,
> &dt_sdt,
> @@ -1261,6 +1262,11 @@ dtrace_close(dtrace_hdl_t *dtp)
> dt_pfdict_destroy(dtp);
> dt_dof_fini(dtp);
> dt_probe_fini(dtp);
> + /*
> + * FIXME:
> + * add some dt_prov_fini() to iterate over providers and call provider-specific fini()'s
> + * CPC will call pfm_terminate()
> + */
>
> dt_htab_destroy(dtp, dtp->dt_provs);
>
> diff --git a/libdtrace/dt_prov_cpc.c b/libdtrace/dt_prov_cpc.c
> new file mode 100644
> index 00000000..59ff0a2d
> --- /dev/null
> +++ b/libdtrace/dt_prov_cpc.c
> @@ -0,0 +1,482 @@
> +/*
> + * Oracle Linux DTrace.
> + * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
> + * Licensed under the Universal Permissive License v 1.0 as shown at
> + * http://oss.oracle.com/licenses/upl.
> + *
> + * The CPU Performance Counter (CPC) provider for DTrace.
> + */
> +#include <assert.h>
> +#include <dt_impl.h>
> +#include <sys/ioctl.h>
> +#include <ctype.h> /* tolower() */
> +
> +#include <bpf_asm.h>
> +#include <linux/perf_event.h>
> +#include <perfmon/pfmlib_perf_event.h>
> +
> +#include "dt_dctx.h"
> +#include "dt_cg.h"
> +#include "dt_bpf.h"
> +#include "dt_probe.h"
> +
> +static const char prvname[] = "cpc";
> +static const char modname[] = "";
> +static const char funname[] = "";
> +
> +static const dtrace_pattr_t pattr = {
> +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
> +{ DTRACE_STABILITY_UNSTABLE, DTRACE_STABILITY_UNSTABLE, DTRACE_CLASS_UNKNOWN },
> +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
> +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
> +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
> +};
> +
> +typedef struct cpc_probe {
> + char *name;
> + int *fds;
> +} cpc_probe_t;
> +
> +/*
> + * Probe name mappings. As we discover which events can actually be used
> + * on the system, we put them in a linked list that maps from names
> + * we present to the D/CPC user to names used by the underlying system.
> + * Importantly, CPC wants no '-' in probe names. And, stylistically, we
> + * prefer lower-case probe names.
> + */
> +typedef struct cpc_probe_map {
> + dt_list_t list;
> + char *Dname;
> + char *pfmname;
> +} cpc_probe_map_t;
> +
> +static dt_probe_t *cpc_probe_insert(dtrace_hdl_t *dtp, const char *prb)
> +{
> + dt_provider_t *prv;
> + cpc_probe_t *datap;
> + int i, cnt = dtp->dt_conf.num_online_cpus;
> +
> + prv = dt_provider_lookup(dtp, prvname);
> + if (!prv)
> + return 0;
> +
> + datap = dt_zalloc(dtp, sizeof(cpc_probe_t));
> + if (datap == NULL)
> + return NULL;
> +
> + datap->name = strdup(prb);
> + datap->fds = dt_calloc(dtp, cnt, sizeof(int));
> + if (datap->fds == NULL)
> + goto err;
> +
> + for (i = 0; i < cnt; i++)
> + datap->fds[i] = -1;
> +
> + return dt_probe_insert(dtp, prv, prvname, modname, funname, prb, datap);
> +
> +err:
> + dt_free(dtp, datap);
> + return NULL;
> +}
> +
> +static int populate(dtrace_hdl_t *dtp)
> +{
> + int n = 0;
> +
> + dt_provider_create(dtp, prvname, &dt_cpc, &pattr);
> + dt_cpc.prvdata = dt_zalloc(dtp, sizeof(dt_list_t));
> +
> + /* incidentally, pfm_strerror(pfm_initialize()) describes the error */
> + if (pfm_initialize() != PFM_SUCCESS)
> + return 0;
> +
> + /* loop over PMUs (FWIW, ipmu=PFM_PMU_PERF_EVENT is among them) */
> + for (pfm_pmu_t ipmu = PFM_PMU_NONE; ipmu < PFM_PMU_MAX; ipmu ++) {
> + pfm_pmu_info_t pmuinfo;
> +
> + memset(&pmuinfo, 0, sizeof(pmuinfo));
> + pmuinfo.size = sizeof(pfm_pmu_info_t);
> + if (pfm_get_pmu_info(ipmu , &pmuinfo) != PFM_SUCCESS || pmuinfo.is_present == 0)
> + continue;
> +
> + /*
> + * At this point, we have interesting information like:
> + * - pmuinfo.nevents
> + * - pmuinfo.name
> + * - pmuinfo.desc
> + * - pmuinfo.type = PFM_PMU_TYPE_[UNKNOWN|CORE|UNCORE]
> + * - pmuinfo.num_cntrs
> + * - pmuinfo.fixed_num_cntrs
> + * - pmuinfo.max_encoding - number of event codes returned by pfm_get_event_encoding()
> + */
> +
> + /* loop over events */
> + for (int ievt = pmuinfo.first_event; ievt != -1; ievt = pfm_get_event_next(ievt)) {
> + pfm_event_info_t evtinfo;
> +
> + pfm_perf_encode_arg_t encoding;
> + struct perf_event_attr attr;
> + char *fstr = NULL;
> +
> + int fd;
> +
> + cpc_probe_map_t *next_probe_map;
> +
> + char *suffix = "-all-1000000000";
> + char *s;
> +
> + dtrace_probedesc_t pd;
> +
> + /*
> + * Convert opaque integer index ievt into a name evtinfo.name.
> + */
> +
> + memset(&evtinfo, 0, sizeof(evtinfo));
> + evtinfo.size = sizeof(evtinfo);
> +
> + /* PFM_OS_[NONE|PERF_EVENT|PERF_EVENT_EXT] */
> + if (pfm_get_event_info(ievt, PFM_OS_PERF_EVENT, &evtinfo) != PFM_SUCCESS)
> + continue;
> +
> + /*
> + * At this point, we have interesting information like:
> + * - evtinfo.name
> + * - evtinfo.desc - a little verbose and does not say that much
> + * - evtinfo.nattrs
> + * - evtinfo.dtype - should be PFM_DTYPE_UINT64
> + * - evtinfo.idx - should be ievt
> + * - evtinfo.equiv - some equivalent name, or "(null)"
> + */
> +
> + /*
> + * Convert the event name into perf_event attr.
> + */
> + memset(&encoding, 0, sizeof(encoding));
> + memset(&attr, 0, sizeof(attr));
> + encoding.size = sizeof(encoding);
> + encoding.attr = &attr;
> + encoding.fstr = &fstr;
> +
> + /*
> + * os = [PFM_OS_PERF_EVENT | PFM_OS_PERF_EVENT_EXT]
> + * Note that pfm_strerror(pfm_get_os_event_encoding(...)) describes any error.
> + */
> + if (pfm_get_os_event_encoding(evtinfo.name, PFM_PLM0 | PFM_PLM3, PFM_OS_PERF_EVENT, &encoding) != PFM_SUCCESS) {
> + if (fstr)
> + free(fstr); /* is this necessary if we errored out? */
> + continue;
> + }
> +
> + /*
> + * At this point, ievt is what we requested, while encoding.idx corresponds to fstr.
> + * Meanwhile, fstr will have some ":u=1:k=1" that we would otherwise want to modify.
> + */
> + if (fstr)
> + free(fstr);
> +
> + /*
> + * Now attr is largely set up. Note:
> + * - attr.size is still 0, which is okay
> + * - attr.freq is 0, which is okay
> + * - attr.wakeup_events is 0, which we can change
> + */
> + attr.wakeup_events = 1;
> +
> + /*
> + * Check attr with perf_event_open().
> + */
> + fd = perf_event_open(&attr, -1, 0 /* FIXME: cpu */, -1, 0);
> + if (fd < 0)
> + continue;
> + close(fd);
> +
> + /*
> + * We convert '-' to '_' to conform to CPC practices
> + * and convert to lower-case characters (for stylistic reasons).
> + *
> + * FIXME: If we run out of memory (which is unlikely?), we can:
> + * - just proceed with the NULL pointers (causing later drastic failure)
> + * - silently skip over this probe (causing later more controlled failure)
> + * - somehow emit a diagnostic message
> + * For now, we just choose the middle option.
> + *
> + * FIXME: Memory pointed to by next_probe_map, pfmname, and Dname
> + * should ideally be freed explicitly during some probe_destroy(),
> + * but this is a low priority since all such memory will be freed
> + * anyhow when the DTrace session ends.
> + */
> + next_probe_map = dt_zalloc(dtp, sizeof(cpc_probe_map_t));
> + if (next_probe_map == NULL)
> + continue;
> + next_probe_map->pfmname = strdup(evtinfo.name);
> + next_probe_map->Dname = strdup(evtinfo.name);
> + if (next_probe_map->pfmname == NULL ||
> + next_probe_map->Dname == NULL)
> + continue;
> + for (unsigned char *p = next_probe_map->Dname; *p; p++)
> + *p = (*p == '-') ? '_' : tolower(*p);
> + dt_list_append(dt_cpc.prvdata, next_probe_map);
> +
> + /*
> + * Compose a CPC probe name by adding mode "all" and a sample period
> + * big enough that even the fastest firing probe will not be unreasonable.
> + */
> + s = dt_zalloc(dtp, strlen(next_probe_map->Dname) + strlen(suffix) + 1);
> + if (s == NULL)
> + continue;
> + sprintf(s, "%s%s", next_probe_map->Dname, suffix);
> +
> + /*
> + * If this probe is not yet there (likely!), add it.
> + */
> + pd.id = DTRACE_IDNONE;
> + pd.prv = prvname;
> + pd.mod = modname;
> + pd.fun = funname;
> + pd.prb = s;
> + if (dt_probe_lookup(dtp, &pd) == NULL && cpc_probe_insert(dtp, s))
> + n++;
> +
> + dt_free(dtp, s);
> + }
> + }
> +
> + return n;
> +}
> +
> +static int decode_event(struct perf_event_attr *ap, const char *name) {
> + cpc_probe_map_t *probe_map;
> + pfm_perf_encode_arg_t encoding;
> +
> + /* find the probe name mapping for this D name */
> + for (probe_map = dt_list_next(dt_cpc.prvdata);
> + probe_map; probe_map = dt_list_next(probe_map))
> + if (strcmp(name, probe_map->Dname) == 0)
> + break;
> + if (probe_map == NULL)
> + return -1;
> +
> + /* fill in the attr for this pfm name */
> + char *fstr = NULL;
> + int ret;
> +
> + memset(&encoding, 0, sizeof(encoding));
> + encoding.size = sizeof(encoding);
> + encoding.attr = ap;
> + encoding.fstr = &fstr;
> +
> + /*
> + * os = [PFM_OS_PERF_EVENT | PFM_OS_PERF_EVENT_EXT]
> + * Note that pfm_strerror(pfm_get_os_event_encoding(...)) describes any error.
> + */
> + ret = pfm_get_os_event_encoding(probe_map->pfmname, PFM_PLM0 | PFM_PLM3, PFM_OS_PERF_EVENT, &encoding);
> + if (fstr)
> + free(fstr); /* FIXME: is this necessary if we errored out? if not, we do not need to define ret? */
> + return (ret == PFM_SUCCESS) ? 0 : -1;
> +}
> +
> +static int decode_mode(struct perf_event_attr *ap, const char *name) {
> + if (strcmp(name, "user") == 0) {
> + ap->exclude_kernel = 1;
> + return 0;
> + } else if (strcmp(name, "kernel") == 0) {
> + ap->exclude_user = 1;
> + return 0;
> + } else if (strcmp(name, "all") == 0)
> + return 0;
> +
> + return -1;
> +}
> +
> +static int decode_attributes(struct perf_event_attr *ap, const char *name) {
> + /* FIXME: need to implement this */
> + return -1;
> +}
> +
> +static int decode_probename(struct perf_event_attr *ap, const char *name) {
> + char buf[DTRACE_NAMELEN];
> + char *pend;
> +
> + /* work in a temporary space */
> + strcpy(buf, name);
> +
> + /* "event" substring */
> + name = buf;
> + pend = strchr(name, '-');
> + if (pend == NULL)
> + return -1;
> + *pend = '\0';
> + pend++;
> + if (decode_event(ap, name) < 0)
> + return -1;
> +
> + /* "mode" substring */
> + name = pend;
> + pend = strchr(name, '-');
> + if (pend == NULL)
> + return -1;
> + *pend = '\0';
> + pend++;
> + if (decode_mode(ap, name) < 0)
> + return -1;
> +
> + /* optional "attributes" substring */
> + name = pend;
> + pend = strchr(name, '-');
> + if (pend) {
> + *pend = '\0';
> + pend++;
> + if (decode_attributes(ap, name) < 0)
> + return -1;
> + name = pend;
> + }
> +
> + /* "count" substring must be all digits 0-9 */
> + if (strspn(name, "0123456789") < strlen(name))
> + return -1;
> + if (sscanf(name, "%llu", &ap->sample_period) != 1)
> + return -1;
> +
> + return 0;
> +}
> +
> +static int provide(dtrace_hdl_t *dtp, const dtrace_probedesc_t *pdp)
> +{
> + struct perf_event_attr attr;
> +
> + /* make sure we have IDNONE and a legal name */
> + if (pdp->id != DTRACE_IDNONE || strcmp(pdp->prv, prvname) ||
> + strcmp(pdp->mod, modname) || strcmp(pdp->fun, funname))
> + return 0;
> +
> + /* return if we already have this probe */
> + if (dt_probe_lookup(dtp, pdp))
> + return 0;
> +
> + /* check if the probe name can be decoded */
> + if (decode_probename(&attr, pdp->prb) == -1)
> + return 0;
> +
> + /* try to add this probe */
> + if (cpc_probe_insert(dtp, pdp->prb) == NULL)
> + return 0;
> +
> + return 1;
> +}
> +
> +/*
> + * Generate a BPF trampoline for a cpc probe.
> + *
> + * The trampoline function is called when a cpc probe triggers, and it must
> + * satisfy the following prototype:
> + *
> + * int dt_cpc(struct bpf_perf_event_data *ctx)
> + *
> + * The trampoline will populate a dt_bpf_context struct and then call the
> + * function that implements the compiled D clause. It returns the value that
> + * it gets back from that function.
> + *
> + * The context that is passed to the trampoline is:
> + * struct bpf_perf_event_data {
> + * bpf_user_pt_regs_t regs;
> + * __u64 sample_period;
> + * __u64 addr;
> + * }
> + */
> +static void trampoline(dt_pcb_t *pcb)
> +{
> + int i;
> + dt_irlist_t *dlp = &pcb->pcb_ir;
> +
> + dt_cg_tramp_prologue(pcb);
> +
> + /*
> + * After the dt_cg_tramp_prologue() call, we have:
> + * // (%r7 = dctx->mst)
> + * // (%r8 = dctx->ctx)
> + */
> +
> + dt_cg_tramp_copy_regs(pcb);
> +
> + /*
> + * Use the PC to set arg0 and arg1, then clear the other args.
> + */
> + dt_cg_tramp_copy_pc_from_regs(pcb);
> + for (i = 2; i < ARRAY_SIZE(((dt_mstate_t *)0)->argv); i++)
> + emit(dlp, BPF_STORE_IMM(BPF_DW, BPF_REG_7, DMST_ARG(i), 0));
> +
> + dt_cg_tramp_epilogue(pcb);
> +}
> +
> +static int attach(dtrace_hdl_t *dtp, const dt_probe_t *prp, int bpf_fd)
> +{
> + cpc_probe_t *datap = prp->prv_data;
> + struct perf_event_attr attr;
> + int i, nattach = 0;;
> + int cnt = dtp->dt_conf.num_online_cpus;
> + char *name = datap->name; /* same as prp->desc->prb */
> +
> + memset(&attr, 0, sizeof(attr));
> + if (decode_probename(&attr, name) < 0)
> + return -1;
> + attr.wakeup_events = 1;
> +
> + for (i = 0; i < cnt; i++) {
> + int fd;
> +
> + fd = perf_event_open(&attr, -1, dtp->dt_conf.cpus[i].cpu_id,
> + -1, 0);
> + if (fd < 0)
> + continue;
> + if (ioctl(fd, PERF_EVENT_IOC_SET_BPF, bpf_fd) < 0) {
> + close(fd);
> + continue;
> + }
> + datap->fds[i] = fd;
> + nattach++;
> + }
> +
> + return nattach > 0 ? 0 : -1;
> +}
> +
> +static int probe_info(dtrace_hdl_t *dtp, const dt_probe_t *prp,
> + int *argcp, dt_argdesc_t **argvp)
> +{
> + /* cpc-provider probe arguments are not typed */
> + *argcp = 0;
> + *argvp = NULL;
> +
> + return 0;
> +}
> +
> +static void detach(dtrace_hdl_t *dtp, const dt_probe_t *prp)
> +{
> + cpc_probe_t *datap = prp->prv_data;
> + int i, cnt = dtp->dt_conf.num_online_cpus;
> +
> + for (i = 0; i < cnt; i++) {
> + if (datap->fds[i] != -1)
> + close(datap->fds[i]);
> + }
> +}
> +
> +static void probe_destroy(dtrace_hdl_t *dtp, void *arg)
> +{
> + cpc_probe_t *datap = arg;
> +
> + dt_free(dtp, datap->fds);
> + dt_free(dtp, datap->name);
> + dt_free(dtp, datap);
> +}
> +
> +dt_provimpl_t dt_cpc = {
> + .name = prvname,
> + .prog_type = BPF_PROG_TYPE_PERF_EVENT,
> + .populate = &populate,
> + .provide = &provide,
> + .trampoline = &trampoline,
> + .attach = &attach,
> + .probe_info = &probe_info,
> + .detach = &detach,
> + .probe_destroy = &probe_destroy,
> +};
> diff --git a/libdtrace/dt_provider.h b/libdtrace/dt_provider.h
> index a7f6c95b..d1611739 100644
> --- a/libdtrace/dt_provider.h
> +++ b/libdtrace/dt_provider.h
> @@ -1,6 +1,6 @@
> /*
> * Oracle Linux DTrace.
> - * Copyright (c) 2006, 2022, Oracle and/or its affiliates. All rights reserved.
> + * Copyright (c) 2006, 2023, Oracle and/or its affiliates. All rights reserved.
> * Licensed under the Universal Permissive License v 1.0 as shown at
> * http://oss.oracle.com/licenses/upl.
> */
> @@ -76,9 +76,12 @@ typedef struct dt_provimpl {
> const struct dt_probe *prb);
> void (*probe_destroy)(dtrace_hdl_t *dtp, /* free provider data */
> void *datap);
> + void *prvdata; /* provider-specific data */
> } dt_provimpl_t;
>
> +/* list dt_dtrace first */
> extern dt_provimpl_t dt_dtrace;
> +extern dt_provimpl_t dt_cpc;
> extern dt_provimpl_t dt_fbt;
> extern dt_provimpl_t dt_profile;
> extern dt_provimpl_t dt_sdt;
> --
> 2.18.4
>
>
> _______________________________________________
> DTrace-devel mailing list
> DTrace-devel at oss.oracle.com
> https://oss.oracle.com/mailman/listinfo/dtrace-devel
More information about the DTrace-devel
mailing list