[DTrace-devel] [PATCH v2 1/2] sched: fix firing of sched:::on-cpu

Alan Maguire alan.maguire at oracle.com
Fri Jun 28 17:16:33 UTC 2024


sched:::on-cpu fires far less often than sched:::off-cpu.  It appears
that - for recent kernels at least - fbt::schedule_tail:entry is the
wrong place to attach.  The only way to efficiently ensure firing in
the right place - when the new task has been just scheduled in -
is to use fbt::__perf_event_task_sched_in:entry as it

- fires at the right time
- is not static, so not subject to inlining or other optimizations
- is stable across kernel versions.

However, the downside is that it will not be called unless context
switch perf events are enabled.  So the most efficient method is to
perf_event_open() such an event but not attach anything to it.
Attaching to cpc:::sched_switch-all-1 and weeding out off-cpu events
was also explored, but that required copying in task state, comparing
it, etc., so in such a hot codepath a more precise attach point
is preferable.

With this in place we get sensible on/off cpu numbers:

$ dtrace -n 'sched:::*-cpu { @c[probename] = count();}'
dtrace: description 'sched:::*-cpu ' matched 2 probes
^C

  off-cpu                                                        1454
  on-cpu                                                         1454

Signed-off-by: Alan Maguire <alan.maguire at oracle.com>
---
 libdtrace/dt_prov_sched.c | 48 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 46 insertions(+), 2 deletions(-)

diff --git a/libdtrace/dt_prov_sched.c b/libdtrace/dt_prov_sched.c
index 2749385a..3e9d4f6b 100644
--- a/libdtrace/dt_prov_sched.c
+++ b/libdtrace/dt_prov_sched.c
@@ -9,6 +9,9 @@
 #include <assert.h>
 #include <errno.h>
 
+#include <linux/perf_event.h>
+#include <perfmon/pfmlib_perf_event.h>
+
 #include "dt_dctx.h"
 #include "dt_cg.h"
 #include "dt_provider_sdt.h"
@@ -25,7 +28,7 @@ static probe_dep_t	probes[] = {
 	{ "off-cpu",
 	  DTRACE_PROBESPEC_NAME,	"rawtp:sched::sched_switch" },
 	{ "on-cpu",
-	  DTRACE_PROBESPEC_NAME,	"fbt::schedule_tail:entry" },
+	  DTRACE_PROBESPEC_NAME,	"fbt::__perf_event_task_sched_in:entry" },
 	{ "surrender",
 	  DTRACE_PROBESPEC_NAME,	"fbt::do_sched_yield:entry" },
 	{ "tick",
@@ -141,13 +144,54 @@ static int trampoline(dt_pcb_t *pcb, uint_t exitlbl)
 	return 0;
 }
 
+/*
+ * Enable a sched probe.  on-cpu depends on fbt::__perf_event_task_sched_in,
+ * which is not static (so not subject to inlining) but is only called while
+ * context switch perf events are enabled.  So for on-cpu we first open a
+ * context switch software perf event, attaching nothing to it to minimize
+ * overhead; attaching to cpc:::context_switches-all-1 and weeding out on-
+ * versus off-cpu events in a trampoline is too expensive.  On success the
+ * event fd is stored in prp->prv_data for detach() to close; on failure we
+ * log the error and still enable the underlying probe.
+ */
+static void enable(dtrace_hdl_t *dtp, dt_probe_t *prp)
+{
+	struct perf_event_attr attr;
+	int swfd;
+
+	if (strcmp(prp->desc->prb, "on-cpu") == 0) {
+		memset(&attr, 0, sizeof(attr));
+		attr.size = sizeof(attr);
+		attr.type = PERF_TYPE_SOFTWARE;
+		attr.config = PERF_COUNT_SW_CONTEXT_SWITCHES;
+		attr.freq = 1;
+		attr.sample_freq = 1000;
+		attr.context_switch = 1;
+		swfd = dt_perf_event_open(&attr, -1, 0, -1, 0);
+		if (swfd < 0)
+			dt_dprintf("context_switch perf event open failed: %d\n",
+				   errno);
+		else
+			prp->prv_data = (void *)(long)swfd;
+	}
+
+	dt_sdt_enable(dtp, prp);
+}
+
+static void detach(dtrace_hdl_t *dtp, const dt_probe_t *prp)
+{
+	if (prp->prv_data != NULL)	/* perf event fd saved by enable() */
+		close((int)(long)prp->prv_data);
+}
+
 dt_provimpl_t	dt_sched = {
 	.name		= prvname,
 	.prog_type	= BPF_PROG_TYPE_UNSPEC,
 	.populate	= &populate,
-	.enable		= &dt_sdt_enable,
+	.enable		= &enable,	/* opens a perf event for on-cpu */
 	.load_prog	= &dt_bpf_prog_load,
 	.trampoline	= &trampoline,
 	.probe_info	= &dt_sdt_probe_info,
+	.detach		= &detach,	/* closes the on-cpu perf event fd */
 	.destroy	= &dt_sdt_destroy,
 };
-- 
2.43.5




More information about the DTrace-devel mailing list