[DTrace-devel] [PATCH 01/16] Add a CPC provider

Kris Van Hees kris.van.hees at oracle.com
Tue Feb 21 21:53:40 UTC 2023


On Thu, Jan 26, 2023 at 09:23:14PM -0500, eugene.loh--- via DTrace-devel wrote:
> From: Eugene Loh <eugene.loh at oracle.com>
> 
> Use libpfm to populate CPC probes.
> 
> The PC is placed into arg0 (if kernel) or arg1 (if user-space),
> the other arg being 0.  This initialization is in a new function
> dt_cg_tramp_copy_PC_from_regs() so that it can be used by any other
> provider (like profile) that needs it.
> 
> Current limitations include:
> 
> *)  no support for optional attributes
> 
> *)  no support for raw events
> 
> Signed-off-by: Eugene Loh <eugene.loh at oracle.com>
> ---
>  dtrace.spec             |   6 +-
>  libdtrace/Build         |   8 +-
>  libdtrace/dt_cg.c       |  74 +++++++
>  libdtrace/dt_cg.h       |   1 +
>  libdtrace/dt_open.c     |   6 +-
>  libdtrace/dt_prov_cpc.c | 463 ++++++++++++++++++++++++++++++++++++++++
>  libdtrace/dt_provider.h |   3 +-
>  7 files changed, 552 insertions(+), 9 deletions(-)
>  create mode 100644 libdtrace/dt_prov_cpc.c
> 
> diff --git a/dtrace.spec b/dtrace.spec
> index 1049738d..d12a2a82 100644
> --- a/dtrace.spec
> +++ b/dtrace.spec
> @@ -1,7 +1,7 @@
>  # spec file for package dtrace
>  #
>  # Oracle Linux DTrace.
> -# Copyright (c) 2011, 2022, Oracle and/or its affiliates. All rights reserved.
> +# Copyright (c) 2011, 2023, Oracle and/or its affiliates. All rights reserved.
>  # Licensed under the Universal Permissive License v 1.0 as shown at
>  # http://oss.oracle.com/licenses/upl.
>  
> @@ -55,9 +55,9 @@ BuildRequires: rpm
>  Name:         dtrace
>  License:      Universal Permissive License (UPL), Version 1.0
>  Group:        Development/Tools
> -Requires:     cpp elfutils-libelf zlib libpcap
> +Requires:     cpp elfutils-libelf zlib libpcap libpfm
>  BuildRequires: glibc-headers bison flex zlib-devel elfutils-libelf-devel systemd systemd-devel
> -BuildRequires: glibc-static %{glibc32} wireshark libpcap-devel valgrind-devel
> +BuildRequires: glibc-static %{glibc32} wireshark libpcap-devel valgrind-devel libpfm-devel
>  %if "%{?dist}" == ".el7"
>  Requires:     fuse
>  BuildRequires: fuse-devel
> diff --git a/libdtrace/Build b/libdtrace/Build
> index c1ef05ad..f9d11876 100644
> --- a/libdtrace/Build
> +++ b/libdtrace/Build
> @@ -1,5 +1,5 @@
>  # Oracle Linux DTrace.
> -# Copyright (c) 2011, 2022, Oracle and/or its affiliates. All rights reserved.
> +# Copyright (c) 2011, 2023, Oracle and/or its affiliates. All rights reserved.
>  # Licensed under the Universal Permissive License v 1.0 as shown at
>  # http://oss.oracle.com/licenses/upl.
>  
> @@ -46,6 +46,7 @@ libdtrace-build_SOURCES = dt_aggregate.c \
>  			  dt_probe.c \
>  			  dt_proc.c \
>  			  dt_program.c \
> +			  dt_prov_cpc.c \
>  			  dt_prov_dtrace.c \
>  			  dt_prov_fbt.c \
>  			  dt_prov_profile.c \
> @@ -69,9 +70,9 @@ SHLIBS += libdtrace
>  libdtrace_DIR := $(current-dir)
>  libdtrace_TARGET = libdtrace
>  ifdef HAVE_LIBCTF
> -libdtrace_LIBS := -lctf -lelf -lz -lrt -lpcap -lpthread -ldl -lm
> +libdtrace_LIBS := -lctf -lelf -lz -lrt -lpcap -lpthread -ldl -lm -lpfm
>  else
> -libdtrace_LIBS := -ldtrace-ctf -lelf -lz -lrt -lpcap -lpthread -ldl -lm
> +libdtrace_LIBS := -ldtrace-ctf -lelf -lz -lrt -lpcap -lpthread -ldl -lm -lpfm
>  endif
>  libdtrace_VERSION := 2.0.0
>  libdtrace_SONAME := libdtrace.so.2
> @@ -86,6 +87,7 @@ dt_cg.c_CFLAGS := -Wno-pedantic
>  dt_dis.c_CFLAGS := -Wno-pedantic
>  dt_pid.c_CFLAGS := -Wno-pedantic
>  dt_proc.c_CFLAGS := -Wno-pedantic
> +dt_prov_cpc.c_CFLAGS := -Wno-pedantic
>  dt_prov_dtrace.c_CFLAGS := -Wno-pedantic
>  dt_prov_fbt.c_CFLAGS := -Wno-pedantic
>  dt_prov_profile.c_CFLAGS := -Wno-pedantic
> diff --git a/libdtrace/dt_cg.c b/libdtrace/dt_cg.c
> index cb284f07..a0ca8d50 100644
> --- a/libdtrace/dt_cg.c
> +++ b/libdtrace/dt_cg.c
> @@ -400,6 +400,80 @@ dt_cg_tramp_copy_args_from_regs(dt_pcb_t *pcb, int called)
>  	}
>  }
>  
> +/*
> + * For some providers, we have
> + *   - arg0 = PC if kernel (0 otherwise)
> + *   - arg1 = PC if user space (0 otherwise)
> + *
> + * So put the PC in both arg0 and arg1, test the PC, and then zero out
> + * either arg0 or arg1, as apropriate.
> + *
> + * The caller must ensure that %r7 and %r8 contain the values set by
> + * the dt_cg_tramp_prologue*() functions.
> + */
> +void
> +dt_cg_tramp_copy_PC_from_regs(dt_pcb_t *pcb)

Mind if I rename this to be: dt_cg_tramp_copy_pc_from_regs()
since most function names are all lowercase.

> +{
> +	dtrace_hdl_t	*dtp = pcb->pcb_hdl;
> +	dt_regset_t	*drp = pcb->pcb_regs;
> +	dt_irlist_t	*dlp = &pcb->pcb_ir;
> +	uint_t		Luser = dt_irlist_label(dlp);
> +	uint_t		Ldone = dt_irlist_label(dlp);
> +
> +	if (dt_regset_xalloc_args(drp) == -1)
> +		longjmp(yypcb->pcb_jmpbuf, EDT_NOREG);
> +
> +	/* place the PC in %r3, arg0, and arg1 */
> +        emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_3, BPF_REG_8, PT_REGS_IP));
> +        emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(0), BPF_REG_3));
> +        emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(1), BPF_REG_3));
> +
> +	/* check if the PC is kernel or user space */
> +	if (dtp->dt_bpfhelper[BPF_FUNC_probe_read_kernel] == BPF_FUNC_probe_read_kernel) {
> +		/*
> +		 * Use probe_read_kernel() if it is really is probe_read_kernel().
> +		 * On older kernels, it does not exist and is aliased to something else.
> +		 */

Are you sure this is valid?  I thought that e.g. x86_64 allowed reading/writing
fromto userspace addresses using probe_read_kernel().  Did you check that?

> +
> +		/* test just a single byte */
> +		emit(dlp,  BPF_MOV_IMM(BPF_REG_2, 1));
> +
> +		/* safe to write to FP+DT_STK_SP_BASE, which becomes the clause stack */
> +		emit(dlp,  BPF_MOV_REG(BPF_REG_1, BPF_REG_FP));
> +		emit(dlp,  BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, DT_STK_SP_BASE));
> +
> +		/* bpf_probe_read_kernel(%fp + DT_STK_SP, 1, PC) */
> +		dt_regset_xalloc(drp, BPF_REG_0);
> +		emit(dlp,  BPF_CALL_HELPER(BPF_FUNC_probe_read_kernel));
> +
> +		/* if there was a problem, assume it was user space */
> +		emit(dlp,  BPF_BRANCH_IMM(BPF_JNE, BPF_REG_0, 0, Luser));
> +		dt_regset_free(drp, BPF_REG_0);
> +	} else {
> +		/*
> +		 * If no real probe_read_kernel() exists, just test the highest bit.
> +		 * This is not as robust, but probably works just fine for us.
> +		 */
> +
> +		/* if the highest bit is 0, assume it was user space */
> +		emit(dlp, BPF_ALU64_IMM(BPF_RSH, BPF_REG_3, 63));
> +		emit(dlp,  BPF_BRANCH_IMM(BPF_JEQ, BPF_REG_3, 0, Luser));
> +	}
> +
> +	/* PC is kernel space (zero out arg1) */
> +	emit(dlp,  BPF_STORE_IMM(BPF_DW, BPF_REG_7, DMST_ARG(1), 0));
> +	emit(dlp,  BPF_JUMP(Ldone));
> +
> +	/* PC is user space (zero out arg0) */
> +	emitl(dlp, Luser,
> +		   BPF_STORE_IMM(BPF_DW, BPF_REG_7, DMST_ARG(0), 0));
> +
> +	/* done */
> +	emitl(dlp, Ldone,
> +	      BPF_NOP());
> +	dt_regset_free_args(drp);
> +}
> +
>  /*
>   * Copy return value from a dt_pt_regs structure referenced by %r8 to
>   * mst->arg[1].  Zero the other args.
> diff --git a/libdtrace/dt_cg.h b/libdtrace/dt_cg.h
> index 3742bc6a..78628b4a 100644
> --- a/libdtrace/dt_cg.h
> +++ b/libdtrace/dt_cg.h
> @@ -25,6 +25,7 @@ extern void dt_cg_tramp_prologue(dt_pcb_t *pcb);
>  extern void dt_cg_tramp_clear_regs(dt_pcb_t *pcb);
>  extern void dt_cg_tramp_copy_regs(dt_pcb_t *pcb);
>  extern void dt_cg_tramp_copy_args_from_regs(dt_pcb_t *pcb, int called);
> +extern void dt_cg_tramp_copy_PC_from_regs(dt_pcb_t *pcb);
>  extern void dt_cg_tramp_copy_rval_from_regs(dt_pcb_t *pcb);
>  extern void dt_cg_tramp_call_clauses(dt_pcb_t *pcb, const dt_probe_t *prp,
>  				     dt_activity_t act);
> diff --git a/libdtrace/dt_open.c b/libdtrace/dt_open.c
> index a3754a32..12944748 100644
> --- a/libdtrace/dt_open.c
> +++ b/libdtrace/dt_open.c
> @@ -1,6 +1,6 @@
>  /*
>   * Oracle Linux DTrace.
> - * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
> + * Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
>   * Licensed under the Universal Permissive License v 1.0 as shown at
>   * http://oss.oracle.com/licenses/upl.
>   */
> @@ -64,7 +64,8 @@ const dt_version_t _dtrace_versions[] = {
>   * provider module may create multiple providers.
>   */
>  static const dt_provimpl_t *dt_providers[] = {
> -	&dt_dtrace,
> +	&dt_dtrace,		/* list dt_dtrace first */
> +	&dt_cpc,
>  	&dt_fbt,
>  	&dt_profile,
>  	&dt_sdt,
> @@ -1261,6 +1262,7 @@ dtrace_close(dtrace_hdl_t *dtp)
>  	dt_pfdict_destroy(dtp);
>  	dt_dof_fini(dtp);
>  	dt_probe_fini(dtp);
> +	// FIXME: add some dt_prov_fini() that will iterate over providers and call provider-specific fini()'s; CPC will call pfm_terminate()

I would stick with /* */ comment style since that is what we do pretty much
everywhere else.  What is the impact of not having that yet at this point?

>  
>  	dt_htab_destroy(dtp, dtp->dt_provs);
>  
> diff --git a/libdtrace/dt_prov_cpc.c b/libdtrace/dt_prov_cpc.c
> new file mode 100644
> index 00000000..fba1fb5d
> --- /dev/null
> +++ b/libdtrace/dt_prov_cpc.c
> @@ -0,0 +1,463 @@
> +/*
> + * Oracle Linux DTrace.
> + * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
> + * Licensed under the Universal Permissive License v 1.0 as shown at
> + * http://oss.oracle.com/licenses/upl.
> + *
> + * The CPU Performance Counter (CPC) provider for DTrace.
> + */
> +#include <assert.h>
> +#include <dt_impl.h>
> +#include <sys/ioctl.h>
> +#include <ctype.h>				// tolower()

Please stick to /* */ comments.

> +
> +#include <bpf_asm.h>
> +#include <linux/perf_event.h>
> +#include <perfmon/pfmlib_perf_event.h>
> +
> +#include "dt_dctx.h"
> +#include "dt_cg.h"
> +#include "dt_bpf.h"
> +#include "dt_probe.h"
> +
> +static const char		prvname[] = "cpc";
> +static const char		modname[] = "";
> +static const char		funname[] = "";
> +
> +static const dtrace_pattr_t	pattr = {
> +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
> +{ DTRACE_STABILITY_UNSTABLE, DTRACE_STABILITY_UNSTABLE, DTRACE_CLASS_UNKNOWN },
> +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
> +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
> +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
> +};
> +
> +typedef struct cpc_probe_data {
> +	char	*name;
> +	int	*fds;
> +} cpc_probe_data_t;

Can we rename this to be cpc_probe_t (similar to profile_probe_t, etc)?

> +
> +/*
> + * Probe translators.  As we discover which events can actually be used
> + * on the system, we put them in a linked list that translates from names
> + * we present to the D/CPC user to names used by the underlying system.
> + * Importantly, CPC wants no '-' in probe names.  And, stylistically, we
> + * prefer lower-case probe names.
> + */

I would not refer to this as 'translators' because DTrace has a specific
meaning when talking about translators.  How about "Probe name mapping" ?

> +typedef struct cpc_probex {
> +	char			*Dname;
> +	char			*pfmname;
> +	struct cpc_probex	*next;
> +} cpc_probex_t;
> +/*
> + * FIXME: Handle this differently?  E.g., add a member to dt_provimpl_t that
> + * points to provider-specific data?
> + */
> +static cpc_probex_t	*first_probex;

Why not use a dt_list since that is the typical way we implement lists in this
codebase?  This would indeed be something to implement using an opaque pointer
in dt_provimpl_t to some private provider struct.

> +
> +static dt_probe_t *cpc_probe_insert(dtrace_hdl_t *dtp, const char *prb)
> +{
> +	dt_provider_t		*prv;
> +	cpc_probe_data_t	*datap;
> +	int			i, cnt = dtp->dt_conf.num_online_cpus;
> +
> +	prv = dt_provider_lookup(dtp, prvname);
> +	if (!prv)
> +		return 0;
> +
> +	datap = dt_zalloc(dtp, sizeof(cpc_probe_data_t));
> +	if (datap == NULL)
> +		return NULL;
> +
> +	datap->name = strdup(prb);
> +	datap->fds = dt_calloc(dtp, cnt, sizeof(int));
> +	if (datap->fds == NULL)
> +		goto err;
> +
> +	for (i = 0; i < cnt; i++)
> +		datap->fds[i] = -1;
> +
> +	return dt_probe_insert(dtp, prv, prvname, modname, funname, prb, datap);
> +
> +err:
> +	dt_free(dtp, datap);
> +	return NULL;
> +}
> +
> +static int populate(dtrace_hdl_t *dtp)
> +{
> +	int		n = 0;
> +
> +	dt_provider_create(dtp, prvname, &dt_cpc, &pattr);
> +	first_probex = NULL;
> +
> +	/* incidentally, pfm_strerror(pfm_initialize()) describes the error */
> +	if (pfm_initialize() != PFM_SUCCESS)
> +		return 0;
> +
> +	/* loop over PMUs (FWIW, ipmu=PFM_PMU_PERF_EVENT is among them) */
> +	for (pfm_pmu_t ipmu = PFM_PMU_NONE; ipmu < PFM_PMU_MAX; ipmu ++) {
> +		pfm_pmu_info_t pmuinfo;
> +
> +		memset(&pmuinfo, 0, sizeof(pmuinfo));
> +		pmuinfo.size = sizeof(pfm_pmu_info_t);
> +		if (pfm_get_pmu_info(ipmu , &pmuinfo) != PFM_SUCCESS || pmuinfo.is_present == 0)
> +			continue;
> +
> +		/*
> +		 * At this point, we have interesting information like:
> +		 *     - pmuinfo.nevents
> +		 *     - pmuinfo.name
> +		 *     - pmuinfo.desc
> +		 *     - pmuinfo.type = PFM_PMU_TYPE_[UNKNOWN|CORE|UNCORE]
> +		 *     - pmuinfo.num_cntrs
> +		 *     - pmuinfo.fixed_num_cntrs
> +		 *     - pmuinfo.max_encoding // number of event codes returned by pfm_get_event_encoding()

Comment within comment?  How about replacing // with -?
> +		 */
> +
> +		/* loop over events */
> +		for (int ievt = pmuinfo.first_event; ievt != -1; ievt = pfm_get_event_next(ievt)) {
> +
> +			/*
> +			 * Convert opaque integer index ievt into a name evt.name.
> +			 */
> +
> +			pfm_event_info_t evtinfo;
> +
> +			memset(&evtinfo, 0, sizeof(evtinfo));
> +			evtinfo.size = sizeof(evtinfo);
> +
> +			/* PFM_OS_[NONE|PERF_EVENT|PERF_EVENT_EXT] */
> +			if (pfm_get_event_info(ievt, PFM_OS_PERF_EVENT, &evtinfo) != PFM_SUCCESS)
> +				continue;
> +
> +			/*
> +			 * At this point, we have interesting information like:
> +			 *     - evtinfo.name
> +			 *     - evtinfo.desc   // a little verbose and does not say that much
> +			 *     - evtinfo.nattrs
> +			 *     - evtinfo.dtype  // should be PFM_DTYPE_UINT64
> +			 *     - evtinfo.idx    // should be ievt
> +			 *     - evtinfo.equiv  // some equivalent name, or "(null)"

See above.  Replace // with -.
> +			 */
> +
> +			/*
> +			 * Convert the event name into perf_event attr.
> +			 */
> +			pfm_perf_encode_arg_t encoding;
> +			struct perf_event_attr attr;
> +			char *fstr = NULL;
> +
> +			memset(&encoding, 0, sizeof(encoding));
> +			memset(&attr    , 0, sizeof(  attr  ));

This looks a bit odd.  I'd drop the extra spacing - it doesn't really add much
in terms of readability.

> +			encoding.size = sizeof(encoding);
> +			encoding.attr = &attr;
> +			encoding.fstr = &fstr;
> +
> +			// os = [PFM_OS_PERF_EVENT | PFM_OS_PERF_EVENT_EXT]
> +			// Note that pfm_strerror(pfm_get_os_event_encoding(...) describes any error.

Please use /* */ comments.

> +			if (pfm_get_os_event_encoding(evtinfo.name, PFM_PLM0 | PFM_PLM3, PFM_OS_PERF_EVENT, &encoding) != PFM_SUCCESS) {
> +				if (fstr)
> +					free(fstr);    // is this necessary if we errored out?

Please use /* */ comments.  Also...  it this needed or not?  I would assume
that testing can indicate whether it is needed?

> +				continue;
> +			}
> +
> +			// At this point, ievt is what we requested, while encoding.idx corresponds to fstr.
> +			// Meanwhile, fstr will have some ":u=1:k=1" that we would otherwise want to modify.

Please use /* */ comments.  Also, I do not actually understand the relevance
of this comment.  Or what it means.  Can you explain?

> +			if (fstr)
> +				free(fstr);
> +
> +			/*
> +			 * Now attr is largely set up.  Note:
> +			 *     - attr.size is still 0, which is okay
> +			 *     - attr.freq is 0, which is okay
> +			 *     - attr.wakeup_events is 0, which we can change
> +			 */
> +			attr.wakeup_events = 1;
> +
> +			/*
> +			 * Check attr with perf_event_open().
> +			 */
> +			int fd = perf_event_open(&attr, -1, 0 /* FIXME: cpu */, -1, 0);
> +			if (fd < 0)
> +				continue;
> +			close(fd);
> +
> +			/*
> +			 * We convert '-' to '_' to conform to CPC practices
> +			 * and convert to lower-case characters (for stylistic reasons).
> +			 */
> +			cpc_probex_t *next_probex;
> +			next_probex = dt_zalloc(dtp, sizeof(cpc_probe_data_t));   // FIXME: should free in some probe_destroy() or something/??
> +			if (next_probex == NULL)
> +				continue;
> +			next_probex->pfmname = strdup(evtinfo.name);              // FIXME: check nonNULL; free somewhere
> +			next_probex->Dname = strdup(evtinfo.name);                // FIXME: check nonNULL; free somewhere

Please use /* */ comments.  Also...  These FIXMEs seem like things that should
be resolved in this patch.

> +			for (unsigned char *p = next_probex->Dname; *p; p++)
> +				*p = (*p == '-') ? '_' : tolower(*p);
> +			next_probex->next = first_probex;
> +			first_probex = next_probex;
> +
> +			/*
> +			 * Compose a CPC probe name by adding mode "all" and a sample period
> +			 * big enough that even the fastest firing probe will not be unreasonable.
> +			 */
> +			char *suffix = "-all-1000000000";
> +			char *s = dt_zalloc(dtp, strlen(next_probex->Dname) + strlen(suffix) + 1);  // FIXME: what if s==NULL?

Please use /* */ comments.  And yes, what if s == NULL?  That should be
handled.

> +			sprintf(s, "%s%s", next_probex->Dname, suffix);
> +
> +			/*
> +			 * If this probe is not yet there (likely!), add it.
> +			 */
> +			dtrace_probedesc_t pd;
> +			pd.id = DTRACE_IDNONE;
> +			pd.prv = prvname;
> +			pd.mod = modname;
> +			pd.fun = funname;
> +			pd.prb = s;
> +			if (dt_probe_lookup(dtp, &pd) == NULL && cpc_probe_insert(dtp, s))
> +				n++;
> +
> +			dt_free(dtp, s);
> +		}
> +	}
> +
> +	return n;
> +}
> +
> +static int decode_event(struct perf_event_attr *ap, const char *name) {
> +	cpc_probex_t *probex;
> +
> +	// find the probe translator for this D name

/* */ comments.
Do not call it translator.

> +	for (probex = first_probex; probex; probex = probex->next)
> +		if (strcmp(name, probex->Dname) == 0)
> +			break;
> +	if (probex == NULL)
> +		return -1;
> +
> +	// fill in the attr for this pfm name

/* */ comments.

> +	pfm_perf_encode_arg_t encoding;
> +	char *fstr = NULL;
> +	int ret;

All variable declarations should be at the beginning of the code block (or
function).

> +
> +	memset(&encoding, 0, sizeof(encoding));
> +	encoding.size = sizeof(encoding);
> +	encoding.attr = ap;
> +	encoding.fstr = &fstr;
> +
> +	// os = [PFM_OS_PERF_EVENT | PFM_OS_PERF_EVENT_EXT]
> +	// Note that pfm_strerror(pfm_get_os_event_encoding(...) describes any error.

/* */ comments.

> +	ret = pfm_get_os_event_encoding(probex->pfmname, PFM_PLM0 | PFM_PLM3, PFM_OS_PERF_EVENT, &encoding);
> +	if (fstr)
> +		free(fstr);    // FIXME: is this necessary if we errored out?  if not, we do not need to define ret?

/* */ comments.  And you should know whether you need this or not, right?

> +	return (ret == PFM_SUCCESS) ? 0 : -1;
> +}
> +
> +static int decode_mode(struct perf_event_attr *ap, const char *name) {
> +	if (strcmp(name, "user") == 0) {
> +		ap->exclude_kernel = 1;
> +		return 0;
> +	} else if (strcmp(name, "kernel") == 0) {
> +		ap->exclude_user = 1;
> +		return 0;
> +	} else if (strcmp(name, "all") == 0)
> +		return 0;
> +
> +	return -1;
> +}
> +
> +static int decode_probename(struct perf_event_attr *ap, const char *name) {
> +	char buf[256];  // FIXME: fixed length???

Isn't there a symbolic name for the name length limit?  Is there a known limit?

> +	char *pend;
> +
> +	/* work in a temporary space */
> +	strcpy(buf, name);
> +
> +	/* "event" substring */
> +	name = buf;
> +	pend = strchr(name, '-');
> +	if (pend == NULL)
> +		return -1;
> +	*pend = '\0';
> +	pend++;
> +	if (decode_event(ap, name) < 0)
> +		return -1;
> +
> +	/* "mode" substring */
> +	name = pend;
> +	pend = strchr(name, '-');
> +	if (pend == NULL)
> +		return -1;
> +	*pend = '\0';
> +	pend++;
> +	if (decode_mode(ap, name) < 0)
> +		return -1;
> +
> +	/* optional "attributes" substring */
> +	name = pend;
> +	pend = strchr(name, '-');
> +	if (pend) {
> +		*pend = '\0';
> +		pend++;
> +/*
> +ignore [optional] attributes for now
> +  how about cpu=
> +		if (decode_mode(ap, name) < 0)
> +			return -1;
> +*/

Put this in standard comment block style, indented as needed, with a FIXME?

> +
> +		name = pend;
> +	}
> +
> +	/* "count" substring must be all digits 0-9 */
> +	if (strspn(name, "0123456789") < strlen(name))
> +		return -1;
> +	if (sscanf(name, "%llu", &ap->sample_period) != 1)
> +		return -1;
> +
> +	return 0;
> +}
> +
> +static int provide(dtrace_hdl_t *dtp, const dtrace_probedesc_t *pdp)
> +{
> +	/* make sure we have IDNONE and a legal name */
> +	if (pdp->id != DTRACE_IDNONE || strcmp(pdp->prv, prvname) ||
> +	    strcmp(pdp->mod, modname) || strcmp(pdp->fun, funname))
> +		return 0;
> +
> +	/* return if we already have this probe */
> +	if (dt_probe_lookup(dtp, pdp))
> +		return 0;
> +
> +	/* check if the probe name can be decoded */
> +	struct perf_event_attr attr;

No variable declarations mixed in with statements please.

> +	if (decode_probename(&attr, pdp->prb) == -1)
> +		return 0;
> +
> +	/* try to add this probe */
> +	if (cpc_probe_insert(dtp, pdp->prb) == NULL)
> +		return 0;
> +
> +	return 1;
> +}
> +
> +/*
> + * Generate a BPF trampoline for a cpc probe.
> + *
> + * The trampoline function is called when a cpc probe triggers, and it must
> + * satisfy the following prototype:
> + *
> + *	int dt_cpc(struct bpf_perf_event_data *ctx)
> + *
> + * The trampoline will populate a dt_bpf_context struct and then call the
> + * function that implements the compiled D clause.  It returns the value that
> + * it gets back from that function.
> + *
> + * The context that is passed to the trampoline is:
> + *     struct bpf_perf_event_data {
> + *         bpf_user_pt_regs_t regs;
> + *         __u64 sample_period;
> + *         __u64 addr;
> + *     }
> + */
> +static void trampoline(dt_pcb_t *pcb)
> +{
> +	int		i;
> +	dt_irlist_t	*dlp = &pcb->pcb_ir;
> +
> +	dt_cg_tramp_prologue(pcb);
> +
> +	/*
> +	 * After the dt_cg_tramp_prologue() call, we have:
> +	 *				//     (%r7 = dctx->mst)
> +	 *				//     (%r8 = dctx->ctx)
> +	 */
> +
> +	dt_cg_tramp_copy_regs(pcb);
> +
> +	/*
> +	 * Use the PC to set arg0 and arg1, then clear the other args.
> +	 */
> +	dt_cg_tramp_copy_PC_from_regs(pcb);
> +	for (i = 2; i < ARRAY_SIZE(((dt_mstate_t *)0)->argv); i++)
> +		emit(dlp, BPF_STORE_IMM(BPF_DW, BPF_REG_7, DMST_ARG(i), 0));
> +
> +	dt_cg_tramp_epilogue(pcb);
> +}
> +
> +static int attach(dtrace_hdl_t *dtp, const dt_probe_t *prp, int bpf_fd)
> +{
> +	cpc_probe_data_t	*datap = prp->prv_data;
> +	struct perf_event_attr	attr;
> +	int			i, nattach = 0;;
> +	int			cnt = dtp->dt_conf.num_online_cpus;
> +	char			*name = datap->name;  /* prp->desc->prb; */
> +
> +	/* this debugging check (that either name works) can be eliminated */
> +	if (strcmp(prp->desc->prb, datap->name))
> +		assert(0);

So... why is it still here?

> +
> +	memset(&attr, 0, sizeof(attr));
> +	if (decode_probename(&attr, name) < 0)
> +		return -1;
> +	attr.wakeup_events = 1;
> +
> +	for (i = 0; i < cnt; i++) {
> +		int fd;
> +
> +		fd = perf_event_open(&attr, -1, dtp->dt_conf.cpus[i].cpu_id,
> +				     -1, 0);
> +		if (fd < 0)
> +			continue;
> +		if (ioctl(fd, PERF_EVENT_IOC_SET_BPF, bpf_fd) < 0) {
> +			close(fd);
> +			continue;
> +		}
> +		datap->fds[i] = fd;
> +		nattach++;
> +	}
> +
> +	return nattach > 0 ? 0 : -1;
> +}
> +
> +static int probe_info(dtrace_hdl_t *dtp, const dt_probe_t *prp,
> +		      int *argcp, dt_argdesc_t **argvp)
> +{
> +	/* cpc-provider probe arguments are not typed */
> +	*argcp = 0;
> +	*argvp = NULL;
> +
> +	return 0;
> +}
> +
> +static void detach(dtrace_hdl_t *dtp, const dt_probe_t *prp)
> +{
> +	cpc_probe_data_t	*datap = prp->prv_data;
> +	int			i, cnt = dtp->dt_conf.num_online_cpus;
> +
> +	for (i = 0; i < cnt; i++) {
> +		if (datap->fds[i] != -1)
> +			close(datap->fds[i]);
> +	}
> +}
> +
> +static void probe_destroy(dtrace_hdl_t *dtp, void *arg)
> +{
> +	cpc_probe_data_t	*datap = arg;
> +
> +	dt_free(dtp, datap->fds);
> +	dt_free(dtp, datap->name);
> +	dt_free(dtp, datap);
> +}
> +
> +dt_provimpl_t	dt_cpc = {
> +	.name		= prvname,
> +	.prog_type	= BPF_PROG_TYPE_PERF_EVENT,
> +	.populate	= &populate,
> +	.provide	= &provide,
> +	.trampoline	= &trampoline,
> +	.attach		= &attach,
> +	.probe_info	= &probe_info,
> +	.detach		= &detach,
> +	.probe_destroy	= &probe_destroy,
> +};
> diff --git a/libdtrace/dt_provider.h b/libdtrace/dt_provider.h
> index a7f6c95b..9d86e6b2 100644
> --- a/libdtrace/dt_provider.h
> +++ b/libdtrace/dt_provider.h
> @@ -1,6 +1,6 @@
>  /*
>   * Oracle Linux DTrace.
> - * Copyright (c) 2006, 2022, Oracle and/or its affiliates. All rights reserved.
> + * Copyright (c) 2006, 2023, Oracle and/or its affiliates. All rights reserved.
>   * Licensed under the Universal Permissive License v 1.0 as shown at
>   * http://oss.oracle.com/licenses/upl.
>   */
> @@ -78,6 +78,7 @@ typedef struct dt_provimpl {
>  			      void *datap);
>  } dt_provimpl_t;
>  
> +extern dt_provimpl_t dt_cpc;

Just to be pedantic, I would put this after dtrace because that is where it is
in other places.

>  extern dt_provimpl_t dt_dtrace;
>  extern dt_provimpl_t dt_fbt;
>  extern dt_provimpl_t dt_profile;
> -- 
> 2.18.4
> 
> 
> _______________________________________________
> DTrace-devel mailing list
> DTrace-devel at oss.oracle.com
> https://oss.oracle.com/mailman/listinfo/dtrace-devel



More information about the DTrace-devel mailing list