[DTrace-devel] [PATCH 2/7] Implement the sched provider

Kris Van Hees kris.van.hees at oracle.com
Tue May 9 22:29:30 UTC 2023


Signed-off-by: Kris Van Hees <kris.van.hees at oracle.com>
---
 libdtrace/Build           |   2 +
 libdtrace/dt_open.c       |   1 +
 libdtrace/dt_prov_sched.c | 250 ++++++++++++++++++++++++++++++++++++++
 libdtrace/dt_provider.h   |   1 +
 4 files changed, 254 insertions(+)
 create mode 100644 libdtrace/dt_prov_sched.c

diff --git a/libdtrace/Build b/libdtrace/Build
index 483b2f02..299cb128 100644
--- a/libdtrace/Build
+++ b/libdtrace/Build
@@ -52,6 +52,7 @@ libdtrace-build_SOURCES = dt_aggregate.c \
 			  dt_prov_proc.c \
 			  dt_prov_profile.c \
 			  dt_prov_rawtp.c \
+			  dt_prov_sched.c \
 			  dt_prov_sdt.c \
 			  dt_prov_syscall.c \
 			  dt_prov_uprobe.c \
@@ -95,6 +96,7 @@ dt_prov_fbt.c_CFLAGS := -Wno-pedantic
 dt_prov_proc.c_CFLAGS := -Wno-pedantic
 dt_prov_profile.c_CFLAGS := -Wno-pedantic
 dt_prov_rawtp.c_CFLAGS := -Wno-pedantic
+dt_prov_sched.c_CFLAGS := -Wno-pedantic
 dt_prov_sdt.c_CFLAGS := -Wno-pedantic
 dt_prov_syscall.c_CFLAGS := -Wno-pedantic
 dt_prov_uprobe.c_CFLAGS := -Wno-pedantic
diff --git a/libdtrace/dt_open.c b/libdtrace/dt_open.c
index 52e23fe5..04364807 100644
--- a/libdtrace/dt_open.c
+++ b/libdtrace/dt_open.c
@@ -70,6 +70,7 @@ static const dt_provimpl_t *dt_providers[] = {
 	&dt_proc,
 	&dt_profile,
 	&dt_rawtp,
+	&dt_sched,
 	&dt_sdt,
 	&dt_syscall,
 	&dt_uprobe,
diff --git a/libdtrace/dt_prov_sched.c b/libdtrace/dt_prov_sched.c
new file mode 100644
index 00000000..0308e212
--- /dev/null
+++ b/libdtrace/dt_prov_sched.c
@@ -0,0 +1,250 @@
+/*
+ * Oracle Linux DTrace.
+ * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
+ * Licensed under the Universal Permissive License v 1.0 as shown at
+ * http://oss.oracle.com/licenses/upl.
+ *
+ * The 'sched' SDT provider for DTrace-specific probes.
+ */
+#include <assert.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+#include <linux/bpf.h>
+#include <linux/perf_event.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include <bpf_asm.h>
+
+#include "dt_dctx.h"
+#include "dt_cg.h"
+#include "dt_bpf.h"
+#include "dt_provider.h"
+#include "dt_probe.h"
+#include "dt_pt_regs.h"
+
+static const char		prvname[] = "sched";
+static const char		modname[] = "vmlinux";
+
+/*
+ * Implement sched probes as dependent probes on other probes.  E.g. for the
+ * on-cpu probe, call dt_probe_add_dependent() to add the on-cpu probe to the
+ * underlying fbt::schedule_tail:entry probe.  When the program for probe
+ * fbt::schedule_tail:entry is generated, loop through all dependent probes,
+ * and call the trampoline callback for each dependent probe to generate a
+ * pseudo-trampoline that converts the underlying probe into the dependent
+ * probe.  Finally, code is generated to call each of the clauses of the
+ * dependent probe.
+ *
+ * The process of converting the main probe to the dependent probe needs to be
+ * done in a way that preserves the original probe data so that multiple
+ * dependent probes are possible (and each needs to be able to convert the
+ * main probe data into the proper dependent data using a pseudo-trampoline).
+ *
+ * The conversion needs to be able to implement predicate-style conditions that
+ * determine whether the dependent probe is to fire when the main probe does.
+ * The sched:::preempt and sched:::sleep probes are an example of dependent
+ * probes that need a conditional.  If the condition fails, the main probe
+ * processing should move on to the next dependent probe (if any).  If there
+ * are no more dependent probes, it will end up skipping to the exit label.
+ *
+ * Dependent probes need priorities for cases where two probes are dependent on
+ * the same underlying probe, but one needs to 'fire' before the other.
+ */
+
+typedef struct probe_arg {
+	const char	*name;			/* probe name (prb component) */
+	int		argno;			/* index of this argument */
+	dt_argdesc_t	argdesc;		/* description of this argument */
+} probe_arg_t;
+
+static probe_arg_t probe_args[] = {	/* a probe's entries must be adjacent */
+#if 0	/* compiled out: these probes are not created by populate() */
+	{ "change-pri", 0, { 0, "struct task_struct *", "lwpsinfo_t *" } },
+	{ "change-pri", 1, { 0, "struct task_struct *", "psinfo_t *" } },
+	{ "change-pri", 2, { 1, "int", } },
+	{ "dequeue", 0, { 0, "struct task_struct *", "lwpsinfo_t *" } },
+	{ "dequeue", 1, { 0, "struct task_struct *", "psinfo_t *" } },
+	{ "dequeue", 2, { 1, "cpuinfo_t *", } },
+	{ "dequeue", 3, { 2, "int", } },
+	{ "enqueue", 0, { 0, "struct task_struct *", "lwpsinfo_t *" } },
+	{ "enqueue", 1, { 0, "struct task_struct *", "psinfo_t *" } },
+	{ "enqueue", 2, { 1, "cpuinfo_t *", } },
+	{ "off-cpu", 0, { 0, "struct task_struct *", "lwpsinfo_t *" } },
+	{ "off-cpu", 1, { 0, "struct task_struct *", "psinfo_t *" } },
+#endif
+	{ "on-cpu", },
+	{ "preempt", },
+	{ "remain-cpu", },
+	{ "sleep", },
+#if 0	/* compiled out: these probes are not created by populate() */
+	{ "surrender", 0, { 0, "struct task_struct *", "lwpsinfo_t *" } },
+	{ "surrender", 1, { 0, "struct task_struct *", "psinfo_t *" } },
+	{ "tick", 0, { 0, "struct task_struct *", "lwpsinfo_t *" } },
+	{ "tick", 1, { 0, "struct task_struct *", "psinfo_t *" } },
+	{ "wakeup", 0, { 0, "struct task_struct *", "lwpsinfo_t *" } },
+	{ "wakeup", 1, { 0, "struct task_struct *", "psinfo_t *" } },
+#endif
+};
+
+static const dtrace_pattr_t	pattr = { /* passed to dt_provider_create() */
+{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
+{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA },
+{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA },
+};
+
+/*
+ * Provide all the "sched" SDT probes.
+ */
+static int populate(dtrace_hdl_t *dtp)
+{
+	dt_provider_t	*prv;
+	int		count = 0;
+	int		i;
+
+	prv = dt_provider_create(dtp, prvname, &dt_sched, &pattr);
+	if (prv == NULL)
+		return 0;
+
+	/*
+	 * Walk probe_args and insert one probe per probe name.  Every probe
+	 * has an entry with argno == 0, so only those entries are used to
+	 * create probes; entries for further arguments are passed over.
+	 */
+	for (i = 0; i < ARRAY_SIZE(probe_args); i++) {
+		const char	*name = probe_args[i].name;
+
+		if (probe_args[i].argno != 0)
+			continue;
+		if (dt_probe_insert(dtp, prv, prvname, modname, "", name, NULL))
+			count++;
+	}
+
+	return count;
+}
+
+/* Enable a sched probe, attaching it to its underlying probe (if any). */
+static void enable(dtrace_hdl_t *dtp, dt_probe_t *prp)
+{
+	const char		*name = prp->desc->prb;
+	dtrace_probedesc_t	pd;
+	dt_probe_t		*uprp;
+	int			dependent = 1;
+
+	/* Describe the underlying probe that this probe depends on. */
+	pd.id = DTRACE_IDNONE;
+	if (strcmp(name, "on-cpu") == 0) {
+		pd.prv = "fbt";
+		pd.mod = "";
+		pd.fun = "schedule_tail";
+		pd.prb = "entry";
+	} else if (strcmp(name, "preempt") == 0 ||
+		   strcmp(name, "sleep") == 0) {
+		pd.prv = "sdt";
+		pd.mod = "sched";
+		pd.fun = "";
+		pd.prb = "sched_switch";
+	} else {
+		dependent = 0;		/* e.g. remain-cpu */
+	}
+
+	/* Add this probe as a dependent and enable the underlying probe. */
+	if (dependent) {
+		uprp = dt_probe_lookup(dtp, &pd);
+		assert(uprp != NULL);
+		dt_probe_add_dependent(dtp, uprp, prp);
+		dt_probe_enable(dtp, uprp);
+	}
+
+	/*
+	 * Make sure this probe is on the list of enablings as well, so that
+	 * (among other things) the probes map gains entries for it.
+	 */
+	if (!dt_in_list(&dtp->dt_enablings, prp))
+		dt_list_append(&dtp->dt_enablings, prp);
+}
+
+/*
+ * Generate the pseudo-trampoline for a sched probe.
+ *
+ * The preempt and sleep probes are both dependents of sdt:sched::sched_switch
+ * (see enable()) and must fire conditionally.  Each loads DMST_ARG(3) and
+ * compares it against (1 << 8): preempt is skipped when the value is below
+ * that, sleep when it is at or above it.  No code is emitted for other probes.
+ *
+ * A failed test branches to exitlbl, the label passed in by the caller, so
+ * that processing can continue with the next dependent probe (if any).
+ * NOTE(review): (1 << 8) is presumably TASK_REPORT_MAX; confirm.
+ */
+static void trampoline(dt_pcb_t *pcb, uint_t exitlbl)
+{
+	dt_irlist_t	*dlp = &pcb->pcb_ir;
+	dt_probe_t	*prp = pcb->pcb_probe;
+
+	/* R0 = DMST_ARG(3); branch to exitlbl if this probe must not fire. */
+	if (strcmp(prp->desc->prb, "preempt") == 0) {
+		emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_7, DMST_ARG(3)));
+		emit(dlp, BPF_BRANCH_IMM(BPF_JLT, BPF_REG_0, 1 << 8, exitlbl));
+	} else if (strcmp(prp->desc->prb, "sleep") == 0) {
+		emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_7, DMST_ARG(3)));
+		emit(dlp, BPF_BRANCH_IMM(BPF_JGE, BPF_REG_0, 1 << 8, exitlbl));
+	}
+}
+
+/* Report the argument count and argument descriptions of a sched probe. */
+static int probe_info(dtrace_hdl_t *dtp, const dt_probe_t *prp,
+		      int *argcp, dt_argdesc_t **argvp)
+{
+	const char	*name = prp->desc->prb;
+	dt_argdesc_t	*argv = NULL;
+	int		first = -1;
+	int		argc = 0;
+	int		i;
+
+	/*
+	 * Count the probe_args entries for this probe.  If the first entry
+	 * found has a NULL argdesc.native, the probe has no arguments.
+	 */
+	for (i = 0; i < ARRAY_SIZE(probe_args); i++) {
+		if (strcmp(probe_args[i].name, name) != 0)
+			continue;
+
+		if (first == -1) {
+			first = i;
+			if (probe_args[i].argdesc.native == NULL)
+				break;
+		}
+
+		argc++;
+	}
+
+	if (argc > 0) {
+		argv = dt_zalloc(dtp, argc * sizeof(dt_argdesc_t));
+		if (argv == NULL)
+			return -ENOMEM;
+
+		/* Entries are taken to be adjacent; place each by argno. */
+		for (i = first; i < first + argc; i++)
+			argv[probe_args[i].argno] = probe_args[i].argdesc;
+	}
+
+	*argcp = argc;
+	*argvp = argv;
+
+	return 0;
+}
+
+dt_provimpl_t	dt_sched = {
+	.name		= prvname,		/* "sched" */
+	.prog_type	= BPF_PROG_TYPE_UNSPEC,
+	.populate	= &populate,		/* create the sched probes */
+	.enable		= &enable,		/* attach to underlying probe */
+	.trampoline	= &trampoline,		/* emit preempt/sleep tests */
+	.probe_info	= &probe_info,		/* argument descriptions */
+};
diff --git a/libdtrace/dt_provider.h b/libdtrace/dt_provider.h
index da1b4e57..f3b21f55 100644
--- a/libdtrace/dt_provider.h
+++ b/libdtrace/dt_provider.h
@@ -84,6 +84,7 @@ extern dt_provimpl_t dt_fbt;
 extern dt_provimpl_t dt_proc;
 extern dt_provimpl_t dt_profile;
 extern dt_provimpl_t dt_rawtp;
+extern dt_provimpl_t dt_sched;
 extern dt_provimpl_t dt_sdt;
 extern dt_provimpl_t dt_syscall;
 extern dt_provimpl_t dt_uprobe;
-- 
2.40.1




More information about the DTrace-devel mailing list