[DTrace-devel] [PATCH 01/16] Add a CPC provider

eugene.loh at oracle.com eugene.loh at oracle.com
Fri Jan 27 02:23:14 UTC 2023


From: Eugene Loh <eugene.loh at oracle.com>

Use libpfm to populate CPC probes.

The PC is placed into arg0 (if kernel) or arg1 (if user-space),
the other arg being 0.  This initialization is in a new function
dt_cg_tramp_copy_PC_from_regs() so that it can be used by any other
provider (like profile) that needs it.

Current limitations include:

*)  no support for optional attributes

*)  no support for raw events

Signed-off-by: Eugene Loh <eugene.loh at oracle.com>
---
 dtrace.spec             |   6 +-
 libdtrace/Build         |   8 +-
 libdtrace/dt_cg.c       |  74 +++++++
 libdtrace/dt_cg.h       |   1 +
 libdtrace/dt_open.c     |   6 +-
 libdtrace/dt_prov_cpc.c | 463 ++++++++++++++++++++++++++++++++++++++++
 libdtrace/dt_provider.h |   3 +-
 7 files changed, 552 insertions(+), 9 deletions(-)
 create mode 100644 libdtrace/dt_prov_cpc.c

diff --git a/dtrace.spec b/dtrace.spec
index 1049738d..d12a2a82 100644
--- a/dtrace.spec
+++ b/dtrace.spec
@@ -1,7 +1,7 @@
 # spec file for package dtrace
 #
 # Oracle Linux DTrace.
-# Copyright (c) 2011, 2022, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2011, 2023, Oracle and/or its affiliates. All rights reserved.
 # Licensed under the Universal Permissive License v 1.0 as shown at
 # http://oss.oracle.com/licenses/upl.
 
@@ -55,9 +55,9 @@ BuildRequires: rpm
 Name:         dtrace
 License:      Universal Permissive License (UPL), Version 1.0
 Group:        Development/Tools
-Requires:     cpp elfutils-libelf zlib libpcap
+Requires:     cpp elfutils-libelf zlib libpcap libpfm
 BuildRequires: glibc-headers bison flex zlib-devel elfutils-libelf-devel systemd systemd-devel
-BuildRequires: glibc-static %{glibc32} wireshark libpcap-devel valgrind-devel
+BuildRequires: glibc-static %{glibc32} wireshark libpcap-devel valgrind-devel libpfm-devel
 %if "%{?dist}" == ".el7"
 Requires:     fuse
 BuildRequires: fuse-devel
diff --git a/libdtrace/Build b/libdtrace/Build
index c1ef05ad..f9d11876 100644
--- a/libdtrace/Build
+++ b/libdtrace/Build
@@ -1,5 +1,5 @@
 # Oracle Linux DTrace.
-# Copyright (c) 2011, 2022, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2011, 2023, Oracle and/or its affiliates. All rights reserved.
 # Licensed under the Universal Permissive License v 1.0 as shown at
 # http://oss.oracle.com/licenses/upl.
 
@@ -46,6 +46,7 @@ libdtrace-build_SOURCES = dt_aggregate.c \
 			  dt_probe.c \
 			  dt_proc.c \
 			  dt_program.c \
+			  dt_prov_cpc.c \
 			  dt_prov_dtrace.c \
 			  dt_prov_fbt.c \
 			  dt_prov_profile.c \
@@ -69,9 +70,9 @@ SHLIBS += libdtrace
 libdtrace_DIR := $(current-dir)
 libdtrace_TARGET = libdtrace
 ifdef HAVE_LIBCTF
-libdtrace_LIBS := -lctf -lelf -lz -lrt -lpcap -lpthread -ldl -lm
+libdtrace_LIBS := -lctf -lelf -lz -lrt -lpcap -lpthread -ldl -lm -lpfm
 else
-libdtrace_LIBS := -ldtrace-ctf -lelf -lz -lrt -lpcap -lpthread -ldl -lm
+libdtrace_LIBS := -ldtrace-ctf -lelf -lz -lrt -lpcap -lpthread -ldl -lm -lpfm
 endif
 libdtrace_VERSION := 2.0.0
 libdtrace_SONAME := libdtrace.so.2
@@ -86,6 +87,7 @@ dt_cg.c_CFLAGS := -Wno-pedantic
 dt_dis.c_CFLAGS := -Wno-pedantic
 dt_pid.c_CFLAGS := -Wno-pedantic
 dt_proc.c_CFLAGS := -Wno-pedantic
+dt_prov_cpc.c_CFLAGS := -Wno-pedantic
 dt_prov_dtrace.c_CFLAGS := -Wno-pedantic
 dt_prov_fbt.c_CFLAGS := -Wno-pedantic
 dt_prov_profile.c_CFLAGS := -Wno-pedantic
diff --git a/libdtrace/dt_cg.c b/libdtrace/dt_cg.c
index cb284f07..a0ca8d50 100644
--- a/libdtrace/dt_cg.c
+++ b/libdtrace/dt_cg.c
@@ -400,6 +400,80 @@ dt_cg_tramp_copy_args_from_regs(dt_pcb_t *pcb, int called)
 	}
 }
 
+/*
+ * For some providers, we have
+ *   - arg0 = PC if kernel (0 otherwise)
+ *   - arg1 = PC if user space (0 otherwise)
+ *
+ * So put the PC in both arg0 and arg1, test the PC, and then zero out
+ * either arg0 or arg1, as apropriate.
+ *
+ * The caller must ensure that %r7 and %r8 contain the values set by
+ * the dt_cg_tramp_prologue*() functions.
+ */
+void
+dt_cg_tramp_copy_PC_from_regs(dt_pcb_t *pcb)
+{
+	dtrace_hdl_t	*dtp = pcb->pcb_hdl;
+	dt_regset_t	*drp = pcb->pcb_regs;
+	dt_irlist_t	*dlp = &pcb->pcb_ir;
+	uint_t		Luser = dt_irlist_label(dlp);
+	uint_t		Ldone = dt_irlist_label(dlp);
+
+	if (dt_regset_xalloc_args(drp) == -1)
+		longjmp(yypcb->pcb_jmpbuf, EDT_NOREG);
+
+	/* place the PC in %r3, arg0, and arg1 */
+        emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_3, BPF_REG_8, PT_REGS_IP));
+        emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(0), BPF_REG_3));
+        emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(1), BPF_REG_3));
+
+	/* check if the PC is kernel or user space */
+	if (dtp->dt_bpfhelper[BPF_FUNC_probe_read_kernel] == BPF_FUNC_probe_read_kernel) {
+		/*
+		 * Use probe_read_kernel() if it is really is probe_read_kernel().
+		 * On older kernels, it does not exist and is aliased to something else.
+		 */
+
+		/* test just a single byte */
+		emit(dlp,  BPF_MOV_IMM(BPF_REG_2, 1));
+
+		/* safe to write to FP+DT_STK_SP_BASE, which becomes the clause stack */
+		emit(dlp,  BPF_MOV_REG(BPF_REG_1, BPF_REG_FP));
+		emit(dlp,  BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, DT_STK_SP_BASE));
+
+		/* bpf_probe_read_kernel(%fp + DT_STK_SP, 1, PC) */
+		dt_regset_xalloc(drp, BPF_REG_0);
+		emit(dlp,  BPF_CALL_HELPER(BPF_FUNC_probe_read_kernel));
+
+		/* if there was a problem, assume it was user space */
+		emit(dlp,  BPF_BRANCH_IMM(BPF_JNE, BPF_REG_0, 0, Luser));
+		dt_regset_free(drp, BPF_REG_0);
+	} else {
+		/*
+		 * If no real probe_read_kernel() exists, just test the highest bit.
+		 * This is not as robust, but probably works just fine for us.
+		 */
+
+		/* if the highest bit is 0, assume it was user space */
+		emit(dlp, BPF_ALU64_IMM(BPF_RSH, BPF_REG_3, 63));
+		emit(dlp,  BPF_BRANCH_IMM(BPF_JEQ, BPF_REG_3, 0, Luser));
+	}
+
+	/* PC is kernel space (zero out arg1) */
+	emit(dlp,  BPF_STORE_IMM(BPF_DW, BPF_REG_7, DMST_ARG(1), 0));
+	emit(dlp,  BPF_JUMP(Ldone));
+
+	/* PC is user space (zero out arg0) */
+	emitl(dlp, Luser,
+		   BPF_STORE_IMM(BPF_DW, BPF_REG_7, DMST_ARG(0), 0));
+
+	/* done */
+	emitl(dlp, Ldone,
+	      BPF_NOP());
+	dt_regset_free_args(drp);
+}
+
 /*
  * Copy return value from a dt_pt_regs structure referenced by %r8 to
  * mst->arg[1].  Zero the other args.
diff --git a/libdtrace/dt_cg.h b/libdtrace/dt_cg.h
index 3742bc6a..78628b4a 100644
--- a/libdtrace/dt_cg.h
+++ b/libdtrace/dt_cg.h
@@ -25,6 +25,7 @@ extern void dt_cg_tramp_prologue(dt_pcb_t *pcb);
 extern void dt_cg_tramp_clear_regs(dt_pcb_t *pcb);
 extern void dt_cg_tramp_copy_regs(dt_pcb_t *pcb);
 extern void dt_cg_tramp_copy_args_from_regs(dt_pcb_t *pcb, int called);
+extern void dt_cg_tramp_copy_PC_from_regs(dt_pcb_t *pcb);
 extern void dt_cg_tramp_copy_rval_from_regs(dt_pcb_t *pcb);
 extern void dt_cg_tramp_call_clauses(dt_pcb_t *pcb, const dt_probe_t *prp,
 				     dt_activity_t act);
diff --git a/libdtrace/dt_open.c b/libdtrace/dt_open.c
index a3754a32..12944748 100644
--- a/libdtrace/dt_open.c
+++ b/libdtrace/dt_open.c
@@ -1,6 +1,6 @@
 /*
  * Oracle Linux DTrace.
- * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
  * Licensed under the Universal Permissive License v 1.0 as shown at
  * http://oss.oracle.com/licenses/upl.
  */
@@ -64,7 +64,8 @@ const dt_version_t _dtrace_versions[] = {
  * provider module may create multiple providers.
  */
 static const dt_provimpl_t *dt_providers[] = {
-	&dt_dtrace,
+	&dt_dtrace,		/* list dt_dtrace first */
+	&dt_cpc,
 	&dt_fbt,
 	&dt_profile,
 	&dt_sdt,
@@ -1261,6 +1262,7 @@ dtrace_close(dtrace_hdl_t *dtp)
 	dt_pfdict_destroy(dtp);
 	dt_dof_fini(dtp);
 	dt_probe_fini(dtp);
+	// FIXME: add some dt_prov_fini() that will iterate over providers and call provider-specific fini()'s; CPC will call pfm_terminate()
 
 	dt_htab_destroy(dtp, dtp->dt_provs);
 
diff --git a/libdtrace/dt_prov_cpc.c b/libdtrace/dt_prov_cpc.c
new file mode 100644
index 00000000..fba1fb5d
--- /dev/null
+++ b/libdtrace/dt_prov_cpc.c
@@ -0,0 +1,463 @@
+/*
+ * Oracle Linux DTrace.
+ * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
+ * Licensed under the Universal Permissive License v 1.0 as shown at
+ * http://oss.oracle.com/licenses/upl.
+ *
+ * The CPU Performance Counter (CPC) provider for DTrace.
+ */
+#include <assert.h>
+#include <dt_impl.h>
+#include <sys/ioctl.h>
+#include <ctype.h>				// tolower()
+
+#include <bpf_asm.h>
+#include <linux/perf_event.h>
+#include <perfmon/pfmlib_perf_event.h>
+
+#include "dt_dctx.h"
+#include "dt_cg.h"
+#include "dt_bpf.h"
+#include "dt_probe.h"
+
+static const char		prvname[] = "cpc";
+static const char		modname[] = "";
+static const char		funname[] = "";
+
+static const dtrace_pattr_t	pattr = {
+{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
+{ DTRACE_STABILITY_UNSTABLE, DTRACE_STABILITY_UNSTABLE, DTRACE_CLASS_UNKNOWN },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
+{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
+{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
+};
+
+typedef struct cpc_probe_data {
+	char	*name;
+	int	*fds;
+} cpc_probe_data_t;
+
+/*
+ * Probe translators.  As we discover which events can actually be used
+ * on the system, we put them in a linked list that translates from names
+ * we present to the D/CPC user to names used by the underlying system.
+ * Importantly, CPC wants no '-' in probe names.  And, stylistically, we
+ * prefer lower-case probe names.
+ */
+typedef struct cpc_probex {
+	char			*Dname;
+	char			*pfmname;
+	struct cpc_probex	*next;
+} cpc_probex_t;
+/*
+ * FIXME: Handle this differently?  E.g., add a member to dt_provimpl_t that
+ * points to provider-specific data?
+ */
+static cpc_probex_t	*first_probex;
+
+static dt_probe_t *cpc_probe_insert(dtrace_hdl_t *dtp, const char *prb)
+{
+	dt_provider_t		*prv;
+	cpc_probe_data_t	*datap;
+	int			i, cnt = dtp->dt_conf.num_online_cpus;
+
+	prv = dt_provider_lookup(dtp, prvname);
+	if (!prv)
+		return 0;
+
+	datap = dt_zalloc(dtp, sizeof(cpc_probe_data_t));
+	if (datap == NULL)
+		return NULL;
+
+	datap->name = strdup(prb);
+	datap->fds = dt_calloc(dtp, cnt, sizeof(int));
+	if (datap->fds == NULL)
+		goto err;
+
+	for (i = 0; i < cnt; i++)
+		datap->fds[i] = -1;
+
+	return dt_probe_insert(dtp, prv, prvname, modname, funname, prb, datap);
+
+err:
+	dt_free(dtp, datap);
+	return NULL;
+}
+
+static int populate(dtrace_hdl_t *dtp)
+{
+	int		n = 0;
+
+	dt_provider_create(dtp, prvname, &dt_cpc, &pattr);
+	first_probex = NULL;
+
+	/* incidentally, pfm_strerror(pfm_initialize()) describes the error */
+	if (pfm_initialize() != PFM_SUCCESS)
+		return 0;
+
+	/* loop over PMUs (FWIW, ipmu=PFM_PMU_PERF_EVENT is among them) */
+	for (pfm_pmu_t ipmu = PFM_PMU_NONE; ipmu < PFM_PMU_MAX; ipmu ++) {
+		pfm_pmu_info_t pmuinfo;
+
+		memset(&pmuinfo, 0, sizeof(pmuinfo));
+		pmuinfo.size = sizeof(pfm_pmu_info_t);
+		if (pfm_get_pmu_info(ipmu , &pmuinfo) != PFM_SUCCESS || pmuinfo.is_present == 0)
+			continue;
+
+		/*
+		 * At this point, we have interesting information like:
+		 *     - pmuinfo.nevents
+		 *     - pmuinfo.name
+		 *     - pmuinfo.desc
+		 *     - pmuinfo.type = PFM_PMU_TYPE_[UNKNOWN|CORE|UNCORE]
+		 *     - pmuinfo.num_cntrs
+		 *     - pmuinfo.fixed_num_cntrs
+		 *     - pmuinfo.max_encoding // number of event codes returned by pfm_get_event_encoding()
+		 */
+
+		/* loop over events */
+		for (int ievt = pmuinfo.first_event; ievt != -1; ievt = pfm_get_event_next(ievt)) {
+
+			/*
+			 * Convert opaque integer index ievt into a name evt.name.
+			 */
+
+			pfm_event_info_t evtinfo;
+
+			memset(&evtinfo, 0, sizeof(evtinfo));
+			evtinfo.size = sizeof(evtinfo);
+
+			/* PFM_OS_[NONE|PERF_EVENT|PERF_EVENT_EXT] */
+			if (pfm_get_event_info(ievt, PFM_OS_PERF_EVENT, &evtinfo) != PFM_SUCCESS)
+				continue;
+
+			/*
+			 * At this point, we have interesting information like:
+			 *     - evtinfo.name
+			 *     - evtinfo.desc   // a little verbose and does not say that much
+			 *     - evtinfo.nattrs
+			 *     - evtinfo.dtype  // should be PFM_DTYPE_UINT64
+			 *     - evtinfo.idx    // should be ievt
+			 *     - evtinfo.equiv  // some equivalent name, or "(null)"
+			 */
+
+			/*
+			 * Convert the event name into perf_event attr.
+			 */
+			pfm_perf_encode_arg_t encoding;
+			struct perf_event_attr attr;
+			char *fstr = NULL;
+
+			memset(&encoding, 0, sizeof(encoding));
+			memset(&attr    , 0, sizeof(  attr  ));
+			encoding.size = sizeof(encoding);
+			encoding.attr = &attr;
+			encoding.fstr = &fstr;
+
+			// os = [PFM_OS_PERF_EVENT | PFM_OS_PERF_EVENT_EXT]
+			// Note that pfm_strerror(pfm_get_os_event_encoding(...) describes any error.
+			if (pfm_get_os_event_encoding(evtinfo.name, PFM_PLM0 | PFM_PLM3, PFM_OS_PERF_EVENT, &encoding) != PFM_SUCCESS) {
+				if (fstr)
+					free(fstr);    // is this necessary if we errored out?
+				continue;
+			}
+
+			// At this point, ievt is what we requested, while encoding.idx corresponds to fstr.
+			// Meanwhile, fstr will have some ":u=1:k=1" that we would otherwise want to modify.
+			if (fstr)
+				free(fstr);
+
+			/*
+			 * Now attr is largely set up.  Note:
+			 *     - attr.size is still 0, which is okay
+			 *     - attr.freq is 0, which is okay
+			 *     - attr.wakeup_events is 0, which we can change
+			 */
+			attr.wakeup_events = 1;
+
+			/*
+			 * Check attr with perf_event_open().
+			 */
+			int fd = perf_event_open(&attr, -1, 0 /* FIXME: cpu */, -1, 0);
+			if (fd < 0)
+				continue;
+			close(fd);
+
+			/*
+			 * We convert '-' to '_' to conform to CPC practices
+			 * and convert to lower-case characters (for stylistic reasons).
+			 */
+			cpc_probex_t *next_probex;
+			next_probex = dt_zalloc(dtp, sizeof(cpc_probe_data_t));   // FIXME: should free in some probe_destroy() or something/??
+			if (next_probex == NULL)
+				continue;
+			next_probex->pfmname = strdup(evtinfo.name);              // FIXME: check nonNULL; free somewhere
+			next_probex->Dname = strdup(evtinfo.name);                // FIXME: check nonNULL; free somewhere
+			for (unsigned char *p = next_probex->Dname; *p; p++)
+				*p = (*p == '-') ? '_' : tolower(*p);
+			next_probex->next = first_probex;
+			first_probex = next_probex;
+
+			/*
+			 * Compose a CPC probe name by adding mode "all" and a sample period
+			 * big enough that even the fastest firing probe will not be unreasonable.
+			 */
+			char *suffix = "-all-1000000000";
+			char *s = dt_zalloc(dtp, strlen(next_probex->Dname) + strlen(suffix) + 1);  // FIXME: what if s==NULL?
+			sprintf(s, "%s%s", next_probex->Dname, suffix);
+
+			/*
+			 * If this probe is not yet there (likely!), add it.
+			 */
+			dtrace_probedesc_t pd;
+			pd.id = DTRACE_IDNONE;
+			pd.prv = prvname;
+			pd.mod = modname;
+			pd.fun = funname;
+			pd.prb = s;
+			if (dt_probe_lookup(dtp, &pd) == NULL && cpc_probe_insert(dtp, s))
+				n++;
+
+			dt_free(dtp, s);
+		}
+	}
+
+	return n;
+}
+
+static int decode_event(struct perf_event_attr *ap, const char *name) {
+	cpc_probex_t *probex;
+
+	// find the probe translator for this D name
+	for (probex = first_probex; probex; probex = probex->next)
+		if (strcmp(name, probex->Dname) == 0)
+			break;
+	if (probex == NULL)
+		return -1;
+
+	// fill in the attr for this pfm name
+	pfm_perf_encode_arg_t encoding;
+	char *fstr = NULL;
+	int ret;
+
+	memset(&encoding, 0, sizeof(encoding));
+	encoding.size = sizeof(encoding);
+	encoding.attr = ap;
+	encoding.fstr = &fstr;
+
+	// os = [PFM_OS_PERF_EVENT | PFM_OS_PERF_EVENT_EXT]
+	// Note that pfm_strerror(pfm_get_os_event_encoding(...) describes any error.
+	ret = pfm_get_os_event_encoding(probex->pfmname, PFM_PLM0 | PFM_PLM3, PFM_OS_PERF_EVENT, &encoding);
+	if (fstr)
+		free(fstr);    // FIXME: is this necessary if we errored out?  if not, we do not need to define ret?
+	return (ret == PFM_SUCCESS) ? 0 : -1;
+}
+
+static int decode_mode(struct perf_event_attr *ap, const char *name) {
+	if (strcmp(name, "user") == 0) {
+		ap->exclude_kernel = 1;
+		return 0;
+	} else if (strcmp(name, "kernel") == 0) {
+		ap->exclude_user = 1;
+		return 0;
+	} else if (strcmp(name, "all") == 0)
+		return 0;
+
+	return -1;
+}
+
+static int decode_probename(struct perf_event_attr *ap, const char *name) {
+	char buf[256];  // FIXME: fixed length???
+	char *pend;
+
+	/* work in a temporary space */
+	strcpy(buf, name);
+
+	/* "event" substring */
+	name = buf;
+	pend = strchr(name, '-');
+	if (pend == NULL)
+		return -1;
+	*pend = '\0';
+	pend++;
+	if (decode_event(ap, name) < 0)
+		return -1;
+
+	/* "mode" substring */
+	name = pend;
+	pend = strchr(name, '-');
+	if (pend == NULL)
+		return -1;
+	*pend = '\0';
+	pend++;
+	if (decode_mode(ap, name) < 0)
+		return -1;
+
+	/* optional "attributes" substring */
+	name = pend;
+	pend = strchr(name, '-');
+	if (pend) {
+		*pend = '\0';
+		pend++;
+/*
+ignore [optional] attributes for now
+  how about cpu=
+		if (decode_mode(ap, name) < 0)
+			return -1;
+*/
+
+		name = pend;
+	}
+
+	/* "count" substring must be all digits 0-9 */
+	if (strspn(name, "0123456789") < strlen(name))
+		return -1;
+	if (sscanf(name, "%llu", &ap->sample_period) != 1)
+		return -1;
+
+	return 0;
+}
+
+static int provide(dtrace_hdl_t *dtp, const dtrace_probedesc_t *pdp)
+{
+	/* make sure we have IDNONE and a legal name */
+	if (pdp->id != DTRACE_IDNONE || strcmp(pdp->prv, prvname) ||
+	    strcmp(pdp->mod, modname) || strcmp(pdp->fun, funname))
+		return 0;
+
+	/* return if we already have this probe */
+	if (dt_probe_lookup(dtp, pdp))
+		return 0;
+
+	/* check if the probe name can be decoded */
+	struct perf_event_attr attr;
+	if (decode_probename(&attr, pdp->prb) == -1)
+		return 0;
+
+	/* try to add this probe */
+	if (cpc_probe_insert(dtp, pdp->prb) == NULL)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Generate a BPF trampoline for a cpc probe.
+ *
+ * The trampoline function is called when a cpc probe triggers, and it must
+ * satisfy the following prototype:
+ *
+ *	int dt_cpc(struct bpf_perf_event_data *ctx)
+ *
+ * The trampoline will populate a dt_bpf_context struct and then call the
+ * function that implements the compiled D clause.  It returns the value that
+ * it gets back from that function.
+ *
+ * The context that is passed to the trampoline is:
+ *     struct bpf_perf_event_data {
+ *         bpf_user_pt_regs_t regs;
+ *         __u64 sample_period;
+ *         __u64 addr;
+ *     }
+ */
+static void trampoline(dt_pcb_t *pcb)
+{
+	int		i;
+	dt_irlist_t	*dlp = &pcb->pcb_ir;
+
+	dt_cg_tramp_prologue(pcb);
+
+	/*
+	 * After the dt_cg_tramp_prologue() call, we have:
+	 *				//     (%r7 = dctx->mst)
+	 *				//     (%r8 = dctx->ctx)
+	 */
+
+	dt_cg_tramp_copy_regs(pcb);
+
+	/*
+	 * Use the PC to set arg0 and arg1, then clear the other args.
+	 */
+	dt_cg_tramp_copy_PC_from_regs(pcb);
+	for (i = 2; i < ARRAY_SIZE(((dt_mstate_t *)0)->argv); i++)
+		emit(dlp, BPF_STORE_IMM(BPF_DW, BPF_REG_7, DMST_ARG(i), 0));
+
+	dt_cg_tramp_epilogue(pcb);
+}
+
+static int attach(dtrace_hdl_t *dtp, const dt_probe_t *prp, int bpf_fd)
+{
+	cpc_probe_data_t	*datap = prp->prv_data;
+	struct perf_event_attr	attr;
+	int			i, nattach = 0;;
+	int			cnt = dtp->dt_conf.num_online_cpus;
+	char			*name = datap->name;  /* prp->desc->prb; */
+
+	/* this debugging check (that either name works) can be eliminated */
+	if (strcmp(prp->desc->prb, datap->name))
+		assert(0);
+
+	memset(&attr, 0, sizeof(attr));
+	if (decode_probename(&attr, name) < 0)
+		return -1;
+	attr.wakeup_events = 1;
+
+	for (i = 0; i < cnt; i++) {
+		int fd;
+
+		fd = perf_event_open(&attr, -1, dtp->dt_conf.cpus[i].cpu_id,
+				     -1, 0);
+		if (fd < 0)
+			continue;
+		if (ioctl(fd, PERF_EVENT_IOC_SET_BPF, bpf_fd) < 0) {
+			close(fd);
+			continue;
+		}
+		datap->fds[i] = fd;
+		nattach++;
+	}
+
+	return nattach > 0 ? 0 : -1;
+}
+
+static int probe_info(dtrace_hdl_t *dtp, const dt_probe_t *prp,
+		      int *argcp, dt_argdesc_t **argvp)
+{
+	/* cpc-provider probe arguments are not typed */
+	*argcp = 0;
+	*argvp = NULL;
+
+	return 0;
+}
+
+static void detach(dtrace_hdl_t *dtp, const dt_probe_t *prp)
+{
+	cpc_probe_data_t	*datap = prp->prv_data;
+	int			i, cnt = dtp->dt_conf.num_online_cpus;
+
+	for (i = 0; i < cnt; i++) {
+		if (datap->fds[i] != -1)
+			close(datap->fds[i]);
+	}
+}
+
+static void probe_destroy(dtrace_hdl_t *dtp, void *arg)
+{
+	cpc_probe_data_t	*datap = arg;
+
+	dt_free(dtp, datap->fds);
+	dt_free(dtp, datap->name);
+	dt_free(dtp, datap);
+}
+
+dt_provimpl_t	dt_cpc = {
+	.name		= prvname,
+	.prog_type	= BPF_PROG_TYPE_PERF_EVENT,
+	.populate	= &populate,
+	.provide	= &provide,
+	.trampoline	= &trampoline,
+	.attach		= &attach,
+	.probe_info	= &probe_info,
+	.detach		= &detach,
+	.probe_destroy	= &probe_destroy,
+};
diff --git a/libdtrace/dt_provider.h b/libdtrace/dt_provider.h
index a7f6c95b..9d86e6b2 100644
--- a/libdtrace/dt_provider.h
+++ b/libdtrace/dt_provider.h
@@ -1,6 +1,6 @@
 /*
  * Oracle Linux DTrace.
- * Copyright (c) 2006, 2022, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2023, Oracle and/or its affiliates. All rights reserved.
  * Licensed under the Universal Permissive License v 1.0 as shown at
  * http://oss.oracle.com/licenses/upl.
  */
@@ -78,6 +78,7 @@ typedef struct dt_provimpl {
 			      void *datap);
 } dt_provimpl_t;
 
+extern dt_provimpl_t dt_cpc;
 extern dt_provimpl_t dt_dtrace;
 extern dt_provimpl_t dt_fbt;
 extern dt_provimpl_t dt_profile;
-- 
2.18.4




More information about the DTrace-devel mailing list