[DTrace-devel] [PATCH v2 1/2] sched: fix firing of sched:::on-cpu
Alan Maguire
alan.maguire at oracle.com
Fri Jun 28 17:16:33 UTC 2024
sched:::on-cpu is not firing very often versus off-cpu. It appears
that - for recent kernels at least - fbt::schedule_tail:entry
placement is wrong. The only way to efficiently ensure firing in
the right place - when the new task has been just scheduled in -
is to use fbt::__perf_event_task_sched_in:entry as it
- fires at the right time
- is not static, so not subject to inlining or other optimizations
- is stable across kernel versions.
However, the downside is that it will not be called unless context
switch perf events are enabled. So the most efficient method is to
perf_event_open() such an event but not attach anything to it.
Also explored was attaching to cpc:::sched_switch-all-1 and weeding
out off-cpu events, but that required a copy-in of task state, a
comparison, etc., so in such a hot codepath a more precise attach
is preferable.
With this in place we get sensible on/off cpu numbers:
$ dtrace -n 'sched:::*-cpu { @c[probename] = count();}'
dtrace: description 'sched:::*-cpu ' matched 2 probes
^C
off-cpu 1454
on-cpu 1454
Signed-off-by: Alan Maguire <alan.maguire at oracle.com>
---
libdtrace/dt_prov_sched.c | 48 +++++++++++++++++++++++++++++++++++++--
1 file changed, 46 insertions(+), 2 deletions(-)
diff --git a/libdtrace/dt_prov_sched.c b/libdtrace/dt_prov_sched.c
index 2749385a..3e9d4f6b 100644
--- a/libdtrace/dt_prov_sched.c
+++ b/libdtrace/dt_prov_sched.c
@@ -9,6 +9,9 @@
#include <assert.h>
#include <errno.h>
+#include <linux/perf_event.h>
+#include <perfmon/pfmlib_perf_event.h>
+
#include "dt_dctx.h"
#include "dt_cg.h"
#include "dt_provider_sdt.h"
@@ -25,7 +28,7 @@ static probe_dep_t probes[] = {
{ "off-cpu",
DTRACE_PROBESPEC_NAME, "rawtp:sched::sched_switch" },
{ "on-cpu",
- DTRACE_PROBESPEC_NAME, "fbt::schedule_tail:entry" },
+ DTRACE_PROBESPEC_NAME, "fbt::__perf_event_task_sched_in:entry" },
{ "surrender",
DTRACE_PROBESPEC_NAME, "fbt::do_sched_yield:entry" },
{ "tick",
@@ -141,13 +144,54 @@ static int trampoline(dt_pcb_t *pcb, uint_t exitlbl)
return 0;
}
+/* Enable a sched probe; all but on-cpu go straight to dt_sdt_enable().
+ * on-cpu relies on fbt::__perf_event_task_sched_in:entry, but the kernel only
+ * calls that function once context switch perf events are enabled, so open a
+ * context-switch counting event and attach nothing to it to minimize overhead.
+ * Its fd is stashed in prp->prv_data for detach() to close; if the open fails
+ * this is only logged, and on-cpu may then not fire.  The alternative -
+ * filtering cpc:::context_switches-all-1 events in a trampoline - is too
+ * expensive.  This works stably across kernels: __perf_event_task_sched_in()
+ * is not static, so it is not subject to inlining or similar optimizations.
+ */
+static void enable(dtrace_hdl_t *dtp, dt_probe_t *prp)
+{
+	struct perf_event_attr attr;
+	int swfd;
+
+	if (strcmp(prp->desc->prb, "on-cpu") == 0) {
+		memset(&attr, 0, sizeof(attr));
+		attr.size = sizeof(attr);
+		attr.type = PERF_TYPE_SOFTWARE;
+		attr.config = PERF_COUNT_SW_CONTEXT_SWITCHES;
+		attr.freq = 1;
+		attr.sample_freq = 1000;
+		attr.context_switch = 1;
+		/* Best effort: a failed open is logged, the probe is still enabled. */
+		swfd = dt_perf_event_open(&attr, -1, 0, -1, 0);
+		if (swfd < 0)
+			dt_dprintf("open of context_switch perf event failed: %d\n", errno);
+		else
+			prp->prv_data = (void *)(long)swfd;
+	}
+
+	dt_sdt_enable(dtp, prp);
+}
+
+static void detach(dtrace_hdl_t *dtp, const dt_probe_t *prp)
+{
+	if (prp->prv_data != NULL)	/* perf event fd stashed by enable() */
+		close((int)(long)prp->prv_data);
+}
+
dt_provimpl_t dt_sched = {
.name = prvname,
.prog_type = BPF_PROG_TYPE_UNSPEC,
.populate = &populate,
- .enable = &dt_sdt_enable,
+ .enable = &enable,
.load_prog = &dt_bpf_prog_load,
.trampoline = &trampoline,
.probe_info = &dt_sdt_probe_info,
+ .detach = &detach,
.destroy = &dt_sdt_destroy,
};
--
2.43.5
More information about the DTrace-devel
mailing list