[DTrace-devel] [PATCH v2 01/16] Add a CPC provider

Kris Van Hees kris.van.hees at oracle.com
Fri Feb 24 00:49:34 UTC 2023


Reviewed-by: Kris Van Hees <kris.van.hees at oracle.com>

... with a small name change (prvdata -> prv_data)

On Wed, Feb 22, 2023 at 08:04:13PM -0500, eugene.loh--- via DTrace-devel wrote:
> From: Eugene Loh <eugene.loh at oracle.com>
> 
> Use libpfm to populate CPC probes.
> 
> The PC is placed into arg0 (if kernel) or arg1 (if user-space),
> the other arg being 0.  This initialization is in a new function
> dt_cg_tramp_copy_pc_from_regs() so that it can be used by any other
> provider (like profile) that needs it.
> 
> Current limitations include:
> 
> *)  no support for optional attributes
> 
> *)  no support for raw events
> 
> Signed-off-by: Eugene Loh <eugene.loh at oracle.com>
> ---
>  dtrace.spec             |   6 +-
>  libdtrace/Build         |   8 +-
>  libdtrace/dt_cg.c       |  74 ++++++
>  libdtrace/dt_cg.h       |   1 +
>  libdtrace/dt_open.c     |  10 +-
>  libdtrace/dt_prov_cpc.c | 482 ++++++++++++++++++++++++++++++++++++++++
>  libdtrace/dt_provider.h |   5 +-
>  7 files changed, 577 insertions(+), 9 deletions(-)
>  create mode 100644 libdtrace/dt_prov_cpc.c
> 
> diff --git a/dtrace.spec b/dtrace.spec
> index 1049738d..d12a2a82 100644
> --- a/dtrace.spec
> +++ b/dtrace.spec
> @@ -1,7 +1,7 @@
>  # spec file for package dtrace
>  #
>  # Oracle Linux DTrace.
> -# Copyright (c) 2011, 2022, Oracle and/or its affiliates. All rights reserved.
> +# Copyright (c) 2011, 2023, Oracle and/or its affiliates. All rights reserved.
>  # Licensed under the Universal Permissive License v 1.0 as shown at
>  # http://oss.oracle.com/licenses/upl.
>  
> @@ -55,9 +55,9 @@ BuildRequires: rpm
>  Name:         dtrace
>  License:      Universal Permissive License (UPL), Version 1.0
>  Group:        Development/Tools
> -Requires:     cpp elfutils-libelf zlib libpcap
> +Requires:     cpp elfutils-libelf zlib libpcap libpfm
>  BuildRequires: glibc-headers bison flex zlib-devel elfutils-libelf-devel systemd systemd-devel
> -BuildRequires: glibc-static %{glibc32} wireshark libpcap-devel valgrind-devel
> +BuildRequires: glibc-static %{glibc32} wireshark libpcap-devel valgrind-devel libpfm-devel
>  %if "%{?dist}" == ".el7"
>  Requires:     fuse
>  BuildRequires: fuse-devel
> diff --git a/libdtrace/Build b/libdtrace/Build
> index c1ef05ad..f9d11876 100644
> --- a/libdtrace/Build
> +++ b/libdtrace/Build
> @@ -1,5 +1,5 @@
>  # Oracle Linux DTrace.
> -# Copyright (c) 2011, 2022, Oracle and/or its affiliates. All rights reserved.
> +# Copyright (c) 2011, 2023, Oracle and/or its affiliates. All rights reserved.
>  # Licensed under the Universal Permissive License v 1.0 as shown at
>  # http://oss.oracle.com/licenses/upl.
>  
> @@ -46,6 +46,7 @@ libdtrace-build_SOURCES = dt_aggregate.c \
>  			  dt_probe.c \
>  			  dt_proc.c \
>  			  dt_program.c \
> +			  dt_prov_cpc.c \
>  			  dt_prov_dtrace.c \
>  			  dt_prov_fbt.c \
>  			  dt_prov_profile.c \
> @@ -69,9 +70,9 @@ SHLIBS += libdtrace
>  libdtrace_DIR := $(current-dir)
>  libdtrace_TARGET = libdtrace
>  ifdef HAVE_LIBCTF
> -libdtrace_LIBS := -lctf -lelf -lz -lrt -lpcap -lpthread -ldl -lm
> +libdtrace_LIBS := -lctf -lelf -lz -lrt -lpcap -lpthread -ldl -lm -lpfm
>  else
> -libdtrace_LIBS := -ldtrace-ctf -lelf -lz -lrt -lpcap -lpthread -ldl -lm
> +libdtrace_LIBS := -ldtrace-ctf -lelf -lz -lrt -lpcap -lpthread -ldl -lm -lpfm
>  endif
>  libdtrace_VERSION := 2.0.0
>  libdtrace_SONAME := libdtrace.so.2
> @@ -86,6 +87,7 @@ dt_cg.c_CFLAGS := -Wno-pedantic
>  dt_dis.c_CFLAGS := -Wno-pedantic
>  dt_pid.c_CFLAGS := -Wno-pedantic
>  dt_proc.c_CFLAGS := -Wno-pedantic
> +dt_prov_cpc.c_CFLAGS := -Wno-pedantic
>  dt_prov_dtrace.c_CFLAGS := -Wno-pedantic
>  dt_prov_fbt.c_CFLAGS := -Wno-pedantic
>  dt_prov_profile.c_CFLAGS := -Wno-pedantic
> diff --git a/libdtrace/dt_cg.c b/libdtrace/dt_cg.c
> index cb284f07..b1626a9c 100644
> --- a/libdtrace/dt_cg.c
> +++ b/libdtrace/dt_cg.c
> @@ -400,6 +400,80 @@ dt_cg_tramp_copy_args_from_regs(dt_pcb_t *pcb, int called)
>  	}
>  }
>  
> +/*
> + * For some providers, we have
> + *   - arg0 = PC if kernel (0 otherwise)
> + *   - arg1 = PC if user space (0 otherwise)
> + *
> + * So put the PC in both arg0 and arg1, test the PC, and then zero out
> + * either arg0 or arg1, as appropriate.
> + *
> + * The caller must ensure that %r7 and %r8 contain the values set by
> + * the dt_cg_tramp_prologue*() functions.
> + *
> + * Only arg0 and arg1 are written here; callers that want the remaining
> + * arguments cleared must do so themselves.
> + */
> +void
> +dt_cg_tramp_copy_pc_from_regs(dt_pcb_t *pcb)
> +{
> +	dtrace_hdl_t	*dtp = pcb->pcb_hdl;
> +	dt_regset_t	*drp = pcb->pcb_regs;
> +	dt_irlist_t	*dlp = &pcb->pcb_ir;
> +	uint_t		Luser = dt_irlist_label(dlp);
> +	uint_t		Ldone = dt_irlist_label(dlp);
> +
> +	/* reserve the argument registers (%r1-%r3 are used below); bail out with EDT_NOREG if unavailable */
> +	if (dt_regset_xalloc_args(drp) == -1)
> +		longjmp(yypcb->pcb_jmpbuf, EDT_NOREG);
> +
> +	/* place the PC in %r3, arg0, and arg1 */
> +        emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_3, BPF_REG_8, PT_REGS_IP));
> +        emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(0), BPF_REG_3));
> +        emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(1), BPF_REG_3));
> +
> +	/* check if the PC is kernel or user space */
> +	if (dtp->dt_bpfhelper[BPF_FUNC_probe_read_kernel] == BPF_FUNC_probe_read_kernel) {
> +		/*
> +		 * Use probe_read_kernel() if it really is probe_read_kernel().
> +		 * On older kernels, it does not exist and is aliased to something else.
> +		 */
> +
> +		/* test just a single byte */
> +		emit(dlp,  BPF_MOV_IMM(BPF_REG_2, 1));
> +
> +		/* safe to write to FP+DT_STK_SP_BASE, which becomes the clause stack */
> +		emit(dlp,  BPF_MOV_REG(BPF_REG_1, BPF_REG_FP));
> +		emit(dlp,  BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, DT_STK_SP_BASE));
> +
> +		/* bpf_probe_read_kernel(%fp + DT_STK_SP_BASE, 1, PC) */
> +		dt_regset_xalloc(drp, BPF_REG_0);
> +		emit(dlp,  BPF_CALL_HELPER(BPF_FUNC_probe_read_kernel));
> +
> +		/* if there was a problem, assume it was user space */
> +		emit(dlp,  BPF_BRANCH_IMM(BPF_JNE, BPF_REG_0, 0, Luser));
> +		dt_regset_free(drp, BPF_REG_0);
> +	} else {
> +		/*
> +		 * If no real probe_read_kernel() exists, just test the highest bit.
> +		 * This is not as robust, but probably works just fine for us.
> +		 */
> +
> +		/* if the highest bit is 0, assume it was user space */
> +		emit(dlp, BPF_ALU64_IMM(BPF_RSH, BPF_REG_3, 63));
> +		emit(dlp,  BPF_BRANCH_IMM(BPF_JEQ, BPF_REG_3, 0, Luser));
> +	}
> +
> +	/* PC is kernel space (zero out arg1) */
> +	emit(dlp,  BPF_STORE_IMM(BPF_DW, BPF_REG_7, DMST_ARG(1), 0));
> +	emit(dlp,  BPF_JUMP(Ldone));
> +
> +	/* PC is user space (zero out arg0) */
> +	emitl(dlp, Luser,
> +		   BPF_STORE_IMM(BPF_DW, BPF_REG_7, DMST_ARG(0), 0));
> +
> +	/* done */
> +	emitl(dlp, Ldone,
> +	      BPF_NOP());
> +	dt_regset_free_args(drp);
> +}
> +
>  /*
>   * Copy return value from a dt_pt_regs structure referenced by %r8 to
>   * mst->arg[1].  Zero the other args.
> diff --git a/libdtrace/dt_cg.h b/libdtrace/dt_cg.h
> index 3742bc6a..b4f46e28 100644
> --- a/libdtrace/dt_cg.h
> +++ b/libdtrace/dt_cg.h
> @@ -25,6 +25,7 @@ extern void dt_cg_tramp_prologue(dt_pcb_t *pcb);
>  extern void dt_cg_tramp_clear_regs(dt_pcb_t *pcb);
>  extern void dt_cg_tramp_copy_regs(dt_pcb_t *pcb);
>  extern void dt_cg_tramp_copy_args_from_regs(dt_pcb_t *pcb, int called);
> +extern void dt_cg_tramp_copy_pc_from_regs(dt_pcb_t *pcb);
>  extern void dt_cg_tramp_copy_rval_from_regs(dt_pcb_t *pcb);
>  extern void dt_cg_tramp_call_clauses(dt_pcb_t *pcb, const dt_probe_t *prp,
>  				     dt_activity_t act);
> diff --git a/libdtrace/dt_open.c b/libdtrace/dt_open.c
> index a3754a32..7f8f680a 100644
> --- a/libdtrace/dt_open.c
> +++ b/libdtrace/dt_open.c
> @@ -1,6 +1,6 @@
>  /*
>   * Oracle Linux DTrace.
> - * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
> + * Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
>   * Licensed under the Universal Permissive License v 1.0 as shown at
>   * http://oss.oracle.com/licenses/upl.
>   */
> @@ -64,7 +64,8 @@ const dt_version_t _dtrace_versions[] = {
>   * provider module may create multiple providers.
>   */
>  static const dt_provimpl_t *dt_providers[] = {
> -	&dt_dtrace,
> +	&dt_dtrace,		/* list dt_dtrace first */
> +	&dt_cpc,
>  	&dt_fbt,
>  	&dt_profile,
>  	&dt_sdt,
> @@ -1261,6 +1262,11 @@ dtrace_close(dtrace_hdl_t *dtp)
>  	dt_pfdict_destroy(dtp);
>  	dt_dof_fini(dtp);
>  	dt_probe_fini(dtp);
> +	/*
> +	 * FIXME:
> +	 * add some dt_prov_fini() to iterate over providers and call provider-specific fini()'s
> +	 * CPC will call pfm_terminate()
> +	 */
>  
>  	dt_htab_destroy(dtp, dtp->dt_provs);
>  
> diff --git a/libdtrace/dt_prov_cpc.c b/libdtrace/dt_prov_cpc.c
> new file mode 100644
> index 00000000..59ff0a2d
> --- /dev/null
> +++ b/libdtrace/dt_prov_cpc.c
> @@ -0,0 +1,482 @@
> +/*
> + * Oracle Linux DTrace.
> + * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
> + * Licensed under the Universal Permissive License v 1.0 as shown at
> + * http://oss.oracle.com/licenses/upl.
> + *
> + * The CPU Performance Counter (CPC) provider for DTrace.
> + */
> +#include <assert.h>
> +#include <dt_impl.h>
> +#include <sys/ioctl.h>
> +#include <ctype.h>				/* tolower() */
> +
> +#include <bpf_asm.h>
> +#include <linux/perf_event.h>
> +#include <perfmon/pfmlib_perf_event.h>
> +
> +#include "dt_dctx.h"
> +#include "dt_cg.h"
> +#include "dt_bpf.h"
> +#include "dt_probe.h"
> +
> +/*
> + * Probe description components shared by every cpc probe: the provider is
> + * "cpc", while the module and function components are empty.
> + */
> +static const char		prvname[] = "cpc";
> +static const char		modname[] = "";
> +static const char		funname[] = "";
> +
> +/*
> + * Stability attributes handed to dt_provider_create() by populate().
> + * NOTE(review): the five rows presumably describe the provider, module,
> + * function, name, and args in that order -- confirm against dtrace_pattr_t.
> + */
> +static const dtrace_pattr_t	pattr = {
> +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
> +{ DTRACE_STABILITY_UNSTABLE, DTRACE_STABILITY_UNSTABLE, DTRACE_CLASS_UNKNOWN },
> +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
> +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
> +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
> +};
> +
> +/*
> + * Per-probe private data: created by cpc_probe_insert(), handed to
> + * dt_probe_insert(), and released by probe_destroy().
> + */
> +typedef struct cpc_probe {
> +	char	*name;		/* private copy of the probe name */
> +	int	*fds;		/* one perf event fd slot per online CPU; -1 when not open */
> +} cpc_probe_t;
> +
> +/*
> + * Probe name mappings.  As we discover which events can actually be used
> + * on the system, we put them in a linked list that maps from names
> + * we present to the D/CPC user to names used by the underlying system.
> + * Importantly, CPC wants no '-' in probe names.  And, stylistically, we
> + * prefer lower-case probe names.
> + *
> + * The list is built by populate() and searched by decode_event().
> + */
> +typedef struct cpc_probe_map {
> +	dt_list_t		list;		/* list linkage */
> +	char			*Dname;		/* name shown to the D user: lower case, '-' replaced by '_' */
> +	char			*pfmname;	/* event name as reported by libpfm */
> +} cpc_probe_map_t;
> +
> +/*
> + * Create a cpc probe named 'prb', together with its private cpc_probe_t data:
> + * a copy of the name plus one perf event fd slot per online CPU, each slot
> + * initialized to -1 (not open).
> + *
> + * Returns whatever dt_probe_insert() returns, or NULL if the cpc provider
> + * cannot be found or memory allocation fails.
> + *
> + * NOTE(review): if dt_probe_insert() itself fails, it is not visible here
> + * whether it releases 'datap' -- confirm, otherwise that path leaks.
> + */
> +static dt_probe_t *cpc_probe_insert(dtrace_hdl_t *dtp, const char *prb)
> +{
> +	dt_provider_t	*prv;
> +	cpc_probe_t	*datap;
> +	int		i, cnt = dtp->dt_conf.num_online_cpus;
> +
> +	prv = dt_provider_lookup(dtp, prvname);
> +	if (prv == NULL)
> +		return NULL;
> +
> +	datap = dt_zalloc(dtp, sizeof(cpc_probe_t));
> +	if (datap == NULL)
> +		return NULL;
> +
> +	/* attach() decodes this copy later, so a failed strdup() must not go unnoticed */
> +	datap->name = strdup(prb);
> +	if (datap->name == NULL)
> +		goto err;
> +
> +	datap->fds = dt_calloc(dtp, cnt, sizeof(int));
> +	if (datap->fds == NULL)
> +		goto err_name;
> +
> +	for (i = 0; i < cnt; i++)
> +		datap->fds[i] = -1;
> +
> +	return dt_probe_insert(dtp, prv, prvname, modname, funname, prb, datap);
> +
> +err_name:
> +	/* released the same way probe_destroy() releases it */
> +	dt_free(dtp, datap->name);
> +err:
> +	dt_free(dtp, datap);
> +	return NULL;
> +}
> +
> +/*
> + * Discover the performance-counter events usable on this system and create
> + * a default probe for each of them.
> + *
> + * For every PMU that libpfm reports as present, each event is encoded into a
> + * perf_event_attr and test-opened with perf_event_open().  Events that pass
> + * are recorded in the dt_cpc.prvdata name-mapping list, and a probe named
> + * "<event>-all-1000000000" is inserted unless it already exists.
> + *
> + * Returns the number of probes created; 0 if the mapping list cannot be
> + * allocated or libpfm cannot be initialized.
> + */
> +static int populate(dtrace_hdl_t *dtp)
> +{
> +	static const char	suffix[] = "-all-1000000000";
> +	int			n = 0;
> +
> +	dt_provider_create(dtp, prvname, &dt_cpc, &pattr);
> +
> +	/* head of the name-mapping list; no event can be recorded without it */
> +	dt_cpc.prvdata = dt_zalloc(dtp, sizeof(dt_list_t));
> +	if (dt_cpc.prvdata == NULL)
> +		return 0;
> +
> +	/* incidentally, pfm_strerror(pfm_initialize()) describes the error */
> +	if (pfm_initialize() != PFM_SUCCESS)
> +		return 0;
> +
> +	/* loop over PMUs (FWIW, ipmu=PFM_PMU_PERF_EVENT is among them) */
> +	for (pfm_pmu_t ipmu = PFM_PMU_NONE; ipmu < PFM_PMU_MAX; ipmu++) {
> +		pfm_pmu_info_t pmuinfo;
> +
> +		memset(&pmuinfo, 0, sizeof(pmuinfo));
> +		pmuinfo.size = sizeof(pfm_pmu_info_t);
> +		if (pfm_get_pmu_info(ipmu, &pmuinfo) != PFM_SUCCESS || pmuinfo.is_present == 0)
> +			continue;
> +
> +		/*
> +		 * At this point, we have interesting information like:
> +		 *     - pmuinfo.nevents
> +		 *     - pmuinfo.name
> +		 *     - pmuinfo.desc
> +		 *     - pmuinfo.type = PFM_PMU_TYPE_[UNKNOWN|CORE|UNCORE]
> +		 *     - pmuinfo.num_cntrs
> +		 *     - pmuinfo.fixed_num_cntrs
> +		 *     - pmuinfo.max_encoding - number of event codes returned by pfm_get_event_encoding()
> +		 */
> +
> +		/* loop over events */
> +		for (int ievt = pmuinfo.first_event; ievt != -1; ievt = pfm_get_event_next(ievt)) {
> +			pfm_event_info_t	evtinfo;
> +			pfm_perf_encode_arg_t	encoding;
> +			struct perf_event_attr	attr;
> +			char			*fstr = NULL;
> +			int			fd;
> +			cpc_probe_map_t		*next_probe_map;
> +			size_t			len;
> +			char			*s;
> +			dtrace_probedesc_t	pd;
> +
> +			/*
> +			 * Convert opaque integer index ievt into a name evt.name.
> +			 */
> +			memset(&evtinfo, 0, sizeof(evtinfo));
> +			evtinfo.size = sizeof(evtinfo);
> +
> +			/* PFM_OS_[NONE|PERF_EVENT|PERF_EVENT_EXT] */
> +			if (pfm_get_event_info(ievt, PFM_OS_PERF_EVENT, &evtinfo) != PFM_SUCCESS)
> +				continue;
> +
> +			/*
> +			 * At this point, we have interesting information like:
> +			 *     - evtinfo.name
> +			 *     - evtinfo.desc   - a little verbose and does not say that much
> +			 *     - evtinfo.nattrs
> +			 *     - evtinfo.dtype  - should be PFM_DTYPE_UINT64
> +			 *     - evtinfo.idx    - should be ievt
> +			 *     - evtinfo.equiv  - some equivalent name, or "(null)"
> +			 */
> +
> +			/*
> +			 * Convert the event name into perf_event attr.
> +			 */
> +			memset(&encoding, 0, sizeof(encoding));
> +			memset(&attr, 0, sizeof(attr));
> +			encoding.size = sizeof(encoding);
> +			encoding.attr = &attr;
> +			encoding.fstr = &fstr;
> +
> +			/*
> +			 * os = [PFM_OS_PERF_EVENT | PFM_OS_PERF_EVENT_EXT]
> +			 * Note that pfm_strerror(pfm_get_os_event_encoding(...)) describes any error.
> +			 */
> +			if (pfm_get_os_event_encoding(evtinfo.name, PFM_PLM0 | PFM_PLM3, PFM_OS_PERF_EVENT, &encoding) != PFM_SUCCESS) {
> +				free(fstr);    /* harmless if nothing was allocated */
> +				continue;
> +			}
> +
> +			/*
> +			 * At this point, ievt is what we requested, while encoding.idx corresponds to fstr.
> +			 * Meanwhile, fstr will have some ":u=1:k=1" that we would otherwise want to modify.
> +			 */
> +			free(fstr);
> +
> +			/*
> +			 * Now attr is largely set up.  Note:
> +			 *     - attr.size is still 0, which is okay
> +			 *     - attr.freq is 0, which is okay
> +			 *     - attr.wakeup_events is 0, which we can change
> +			 */
> +			attr.wakeup_events = 1;
> +
> +			/*
> +			 * Check attr with perf_event_open().
> +			 */
> +			fd = perf_event_open(&attr, -1, 0 /* FIXME: cpu */, -1, 0);
> +			if (fd < 0)
> +				continue;
> +			close(fd);
> +
> +			/*
> +			 * We convert '-' to '_' to conform to CPC practices
> +			 * and convert to lower-case characters (for stylistic reasons).
> +			 *
> +			 * If we run out of memory (which is unlikely?), we silently
> +			 * skip over this probe, releasing whatever part of the
> +			 * mapping entry had already been allocated.
> +			 *
> +			 * FIXME: Memory pointed to by next_probe_map, pfmname, and Dname
> +			 * should ideally be freed explicitly during some probe_destroy(),
> +			 * but this is a low priority since all such memory will be freed
> +			 * anyhow when the DTrace session ends.
> +			 */
> +			next_probe_map = dt_zalloc(dtp, sizeof(cpc_probe_map_t));
> +			if (next_probe_map == NULL)
> +				continue;
> +			next_probe_map->pfmname = strdup(evtinfo.name);
> +			next_probe_map->Dname = strdup(evtinfo.name);
> +			if (next_probe_map->pfmname == NULL ||
> +			    next_probe_map->Dname == NULL) {
> +				/* one of the two copies may have succeeded */
> +				free(next_probe_map->pfmname);
> +				free(next_probe_map->Dname);
> +				dt_free(dtp, next_probe_map);
> +				continue;
> +			}
> +			/* <ctype.h> functions need the character value as an unsigned char */
> +			for (char *p = next_probe_map->Dname; *p; p++)
> +				*p = (*p == '-') ? '_' : tolower((unsigned char)*p);
> +			dt_list_append(dt_cpc.prvdata, next_probe_map);
> +
> +			/*
> +			 * Compose a CPC probe name by adding mode "all" and a sample period
> +			 * big enough that even the fastest firing probe will not be unreasonable.
> +			 * sizeof(suffix) accounts for the terminating NUL.
> +			 */
> +			len = strlen(next_probe_map->Dname) + sizeof(suffix);
> +			s = dt_zalloc(dtp, len);
> +			if (s == NULL)
> +				continue;
> +			snprintf(s, len, "%s%s", next_probe_map->Dname, suffix);
> +
> +			/*
> +			 * If this probe is not yet there (likely!), add it.
> +			 */
> +			pd.id = DTRACE_IDNONE;
> +			pd.prv = prvname;
> +			pd.mod = modname;
> +			pd.fun = funname;
> +			pd.prb = s;
> +			if (dt_probe_lookup(dtp, &pd) == NULL && cpc_probe_insert(dtp, s))
> +				n++;
> +
> +			dt_free(dtp, s);
> +		}
> +	}
> +
> +	return n;
> +}
> +
> +/*
> + * Fill in *ap for the event whose D-facing name is 'name'.
> + *
> + * The name is looked up in the mapping list built by populate() to find the
> + * corresponding libpfm event name, which libpfm then encodes into *ap for
> + * both privilege levels (PFM_PLM0 | PFM_PLM3).
> + *
> + * Returns 0 on success; -1 if the name is unknown or cannot be encoded.
> + */
> +static int decode_event(struct perf_event_attr *ap, const char *name) {
> +	cpc_probe_map_t		*probe_map;
> +	pfm_perf_encode_arg_t	encoding;
> +	char			*fstr = NULL;
> +	int			ret;
> +
> +	/* without a list head (e.g. its allocation failed) no event is known */
> +	if (dt_cpc.prvdata == NULL)
> +		return -1;
> +
> +	/* find the probe name mapping for this D name */
> +	for (probe_map = dt_list_next(dt_cpc.prvdata);
> +	    probe_map; probe_map = dt_list_next(probe_map))
> +		if (strcmp(name, probe_map->Dname) == 0)
> +			break;
> +	if (probe_map == NULL)
> +		return -1;
> +
> +	/* fill in the attr for this pfm name */
> +	memset(&encoding, 0, sizeof(encoding));
> +	encoding.size = sizeof(encoding);
> +	encoding.attr = ap;
> +	encoding.fstr = &fstr;
> +
> +	/*
> +	 * os = [PFM_OS_PERF_EVENT | PFM_OS_PERF_EVENT_EXT]
> +	 * Note that pfm_strerror(pfm_get_os_event_encoding(...)) describes any error.
> +	 */
> +	ret = pfm_get_os_event_encoding(probe_map->pfmname, PFM_PLM0 | PFM_PLM3, PFM_OS_PERF_EVENT, &encoding);
> +
> +	/* we only asked for fstr so that we could release it; harmless if still NULL */
> +	free(fstr);
> +
> +	return (ret == PFM_SUCCESS) ? 0 : -1;
> +}
> +
> +/*
> + * Apply the "mode" component of a cpc probe name to *ap.
> + *
> + *   "user"   - exclude kernel events
> + *   "kernel" - exclude user events
> + *   "all"    - leave *ap untouched
> + *
> + * Returns 0 for a recognized mode, -1 otherwise.
> + */
> +static int decode_mode(struct perf_event_attr *ap, const char *name) {
> +	if (strcmp(name, "all") == 0)
> +		return 0;
> +
> +	if (strcmp(name, "user") == 0)
> +		ap->exclude_kernel = 1;
> +	else if (strcmp(name, "kernel") == 0)
> +		ap->exclude_user = 1;
> +	else
> +		return -1;
> +
> +	return 0;
> +}
> +
> +/*
> + * Decode the optional "attributes" component of a cpc probe name.
> + *
> + * Not implemented yet: neither argument is examined and -1 is always
> + * returned, so any probe name carrying an attributes component is rejected
> + * by decode_probename().
> + */
> +static int decode_attributes(struct perf_event_attr *ap, const char *name) {
> +	/* FIXME: need to implement this */
> +	return -1;
> +}
> +
> +/*
> + * Decode a cpc probe name of the form
> + *
> + *	<event>-<mode>[-<attributes>]-<count>
> + *
> + * into *ap.  The event is resolved by decode_event(), the mode by
> + * decode_mode(), the optional attributes by decode_attributes(), and the
> + * count (decimal digits only) becomes ap->sample_period.
> + *
> + * Returns 0 on success, -1 if the name is too long for the scratch buffer
> + * or any component is missing or invalid.  *ap may have been partially
> + * filled in when -1 is returned.
> + */
> +static int decode_probename(struct perf_event_attr *ap, const char *name) {
> +	char			buf[DTRACE_NAMELEN];
> +	char			*pend;
> +	unsigned long long	period;
> +
> +	/* work in a temporary space; refuse names that would overflow it */
> +	if (strlen(name) >= sizeof(buf))
> +		return -1;
> +	strcpy(buf, name);
> +
> +	/* "event" substring */
> +	name = buf;
> +	pend = strchr(name, '-');
> +	if (pend == NULL)
> +		return -1;
> +	*pend = '\0';
> +	pend++;
> +	if (decode_event(ap, name) < 0)
> +		return -1;
> +
> +	/* "mode" substring */
> +	name = pend;
> +	pend = strchr(name, '-');
> +	if (pend == NULL)
> +		return -1;
> +	*pend = '\0';
> +	pend++;
> +	if (decode_mode(ap, name) < 0)
> +		return -1;
> +
> +	/* optional "attributes" substring */
> +	name = pend;
> +	pend = strchr(name, '-');
> +	if (pend) {
> +		*pend = '\0';
> +		pend++;
> +		if (decode_attributes(ap, name) < 0)
> +			return -1;
> +		name = pend;
> +	}
> +
> +	/* "count" substring must be all digits 0-9 */
> +	if (strspn(name, "0123456789") < strlen(name))
> +		return -1;
> +
> +	/*
> +	 * Scan into a local of exactly the type "%llu" expects, then assign:
> +	 * the type of sample_period need not be unsigned long long everywhere.
> +	 */
> +	if (sscanf(name, "%llu", &period) != 1)
> +		return -1;
> +	ap->sample_period = period;
> +
> +	return 0;
> +}
> +
> +/*
> + * Create a cpc probe on demand for the probe description 'pdp'.
> + *
> + * The description must have no probe id, provider "cpc", and empty module
> + * and function components; its probe name must be decodable by
> + * decode_probename().  Nothing is done if the probe already exists.
> + *
> + * Returns 1 if a probe was created, 0 otherwise.
> + */
> +static int provide(dtrace_hdl_t *dtp, const dtrace_probedesc_t *pdp)
> +{
> +	struct perf_event_attr attr;
> +
> +	/* make sure we have IDNONE and a legal name */
> +	if (pdp->id != DTRACE_IDNONE || strcmp(pdp->prv, prvname) ||
> +	    strcmp(pdp->mod, modname) || strcmp(pdp->fun, funname))
> +		return 0;
> +
> +	/* return if we already have this probe */
> +	if (dt_probe_lookup(dtp, pdp))
> +		return 0;
> +
> +	/*
> +	 * Check if the probe name can be decoded.  The attr is only scratch
> +	 * space here, but start from zeroes (as attach() does) so the
> +	 * decoders never work on indeterminate contents.
> +	 */
> +	memset(&attr, 0, sizeof(attr));
> +	if (decode_probename(&attr, pdp->prb) == -1)
> +		return 0;
> +
> +	/* try to add this probe */
> +	if (cpc_probe_insert(dtp, pdp->prb) == NULL)
> +		return 0;
> +
> +	return 1;
> +}
> +
> +/*
> + * Generate a BPF trampoline for a cpc probe.
> + *
> + * The trampoline function is called when a cpc probe triggers, and it must
> + * satisfy the following prototype:
> + *
> + *	int dt_cpc(struct bpf_perf_event_data *ctx)
> + *
> + * The trampoline will populate a dt_bpf_context struct and then call the
> + * function that implements the compiled D clause.  It returns the value that
> + * it gets back from that function.
> + *
> + * The context that is passed to the trampoline is:
> + *     struct bpf_perf_event_data {
> + *         bpf_user_pt_regs_t regs;
> + *         __u64 sample_period;
> + *         __u64 addr;
> + *     }
> + */
> +static void trampoline(dt_pcb_t *pcb)
> +{
> +	dt_irlist_t	*dlp = &pcb->pcb_ir;
> +	int		argno = 2;
> +
> +	/*
> +	 * Once the prologue has been generated, we have:
> +	 *				//     (%r7 = dctx->mst)
> +	 *				//     (%r8 = dctx->ctx)
> +	 */
> +	dt_cg_tramp_prologue(pcb);
> +
> +	dt_cg_tramp_copy_regs(pcb);
> +
> +	/* arg0 and arg1 are derived from the PC ... */
> +	dt_cg_tramp_copy_pc_from_regs(pcb);
> +
> +	/* ... and every remaining argument is cleared */
> +	while (argno < ARRAY_SIZE(((dt_mstate_t *)0)->argv)) {
> +		emit(dlp, BPF_STORE_IMM(BPF_DW, BPF_REG_7, DMST_ARG(argno), 0));
> +		argno++;
> +	}
> +
> +	dt_cg_tramp_epilogue(pcb);
> +}
> +
> +/*
> + * Attach the BPF program 'bpf_fd' to the cpc probe 'prp'.
> + *
> + * The probe name is decoded into a perf_event_attr, and a perf event is then
> + * opened for each online CPU and bound to the BPF program.  CPUs on which
> + * either step fails are skipped; the fds of the others are remembered in the
> + * probe's private data so that detach() can close them.
> + *
> + * Returns 0 if the program was attached on at least one CPU, -1 otherwise
> + * (including when the probe name cannot be decoded).
> + */
> +static int attach(dtrace_hdl_t *dtp, const dt_probe_t *prp, int bpf_fd)
> +{
> +	cpc_probe_t		*datap = prp->prv_data;
> +	struct perf_event_attr	attr;
> +	int			ncpus = dtp->dt_conf.num_online_cpus;
> +	int			attached = 0;
> +	int			slot;
> +
> +	memset(&attr, 0, sizeof(attr));
> +	/* datap->name is the same as prp->desc->prb */
> +	if (decode_probename(&attr, datap->name) < 0)
> +		return -1;
> +	attr.wakeup_events = 1;
> +
> +	for (slot = 0; slot < ncpus; slot++) {
> +		int fd = perf_event_open(&attr, -1,
> +					 dtp->dt_conf.cpus[slot].cpu_id, -1, 0);
> +
> +		if (fd < 0)
> +			continue;
> +
> +		if (ioctl(fd, PERF_EVENT_IOC_SET_BPF, bpf_fd) >= 0) {
> +			datap->fds[slot] = fd;
> +			attached++;
> +		} else
> +			close(fd);
> +	}
> +
> +	return attached == 0 ? -1 : 0;
> +}
> +
> +/*
> + * Report the argument descriptions of a cpc probe: there are none, so the
> + * count is set to 0 and the description array to NULL.  Always returns 0.
> + */
> +static int probe_info(dtrace_hdl_t *dtp, const dt_probe_t *prp,
> +		      int *argcp, dt_argdesc_t **argvp)
> +{
> +	/* cpc-provider probe arguments are not typed */
> +	*argcp = 0;
> +	*argvp = NULL;
> +
> +	return 0;
> +}
> +
> +/*
> + * Detach a cpc probe: close every perf event fd that attach() opened for it.
> + *
> + * Each slot is reset to -1 once closed, so that detaching again cannot close
> + * a descriptor number that has since been reused for something else.
> + */
> +static void detach(dtrace_hdl_t *dtp, const dt_probe_t *prp)
> +{
> +	cpc_probe_t	*datap = prp->prv_data;
> +	int		i, cnt = dtp->dt_conf.num_online_cpus;
> +
> +	for (i = 0; i < cnt; i++) {
> +		if (datap->fds[i] == -1)
> +			continue;
> +
> +		close(datap->fds[i]);
> +		datap->fds[i] = -1;
> +	}
> +}
> +
> +/*
> + * Release the private data of a cpc probe: the fd array, the copy of the
> + * probe name, and the cpc_probe_t itself.  'arg' is the cpc_probe_t that
> + * cpc_probe_insert() handed to dt_probe_insert().
> + *
> + * The fds themselves are not closed here; that is detach()'s job.
> + */
> +static void probe_destroy(dtrace_hdl_t *dtp, void *arg)
> +{
> +	cpc_probe_t	*datap = arg;
> +
> +	dt_free(dtp, datap->fds);
> +	dt_free(dtp, datap->name);
> +	dt_free(dtp, datap);
> +}
> +
> +/*
> + * The cpc provider implementation: its probes are attached as
> + * BPF_PROG_TYPE_PERF_EVENT programs, and the callbacks are the static
> + * functions defined in this file.
> + */
> +dt_provimpl_t	dt_cpc = {
> +	.name		= prvname,
> +	.prog_type	= BPF_PROG_TYPE_PERF_EVENT,
> +	.populate	= &populate,
> +	.provide	= &provide,
> +	.trampoline	= &trampoline,
> +	.attach		= &attach,
> +	.probe_info	= &probe_info,
> +	.detach		= &detach,
> +	.probe_destroy	= &probe_destroy,
> +};
> diff --git a/libdtrace/dt_provider.h b/libdtrace/dt_provider.h
> index a7f6c95b..d1611739 100644
> --- a/libdtrace/dt_provider.h
> +++ b/libdtrace/dt_provider.h
> @@ -1,6 +1,6 @@
>  /*
>   * Oracle Linux DTrace.
> - * Copyright (c) 2006, 2022, Oracle and/or its affiliates. All rights reserved.
> + * Copyright (c) 2006, 2023, Oracle and/or its affiliates. All rights reserved.
>   * Licensed under the Universal Permissive License v 1.0 as shown at
>   * http://oss.oracle.com/licenses/upl.
>   */
> @@ -76,9 +76,12 @@ typedef struct dt_provimpl {
>  		       const struct dt_probe *prb);
>  	void (*probe_destroy)(dtrace_hdl_t *dtp, /* free provider data */
>  			      void *datap);
> +	void *prvdata;				/* provider-specific data */
>  } dt_provimpl_t;
>  
> +/* list dt_dtrace first */
>  extern dt_provimpl_t dt_dtrace;
> +extern dt_provimpl_t dt_cpc;
>  extern dt_provimpl_t dt_fbt;
>  extern dt_provimpl_t dt_profile;
>  extern dt_provimpl_t dt_sdt;
> -- 
> 2.18.4
> 
> 
> _______________________________________________
> DTrace-devel mailing list
> DTrace-devel at oss.oracle.com
> https://oss.oracle.com/mailman/listinfo/dtrace-devel



More information about the DTrace-devel mailing list