[DTrace-devel] [PATCH v2 1/2] sched: fix firing of sched:::on-cpu
Kris Van Hees
kris.van.hees at oracle.com
Fri Aug 2 21:36:22 UTC 2024
Reviewed-by: Kris Van Hees <kris.van.hees at oracle.com>
On Fri, Jun 28, 2024 at 06:16:33PM +0100, Alan Maguire via DTrace-devel wrote:
> sched:::on-cpu fires much less often than sched:::off-cpu. It appears
> that - for recent kernels at least - fbt::schedule_tail:entry is the
> wrong place to attach. The only efficient way to ensure firing in
> the right place - when the new task has just been scheduled in -
> is to use fbt::__perf_event_task_sched_in:entry, as it
>
> - fires at the right time
> - is not static, so not subject to inlining or other optimizations
> - is stable across kernel versions.
>
> However, the downside is that this function is not called unless
> context switch perf events are enabled. So the most efficient method
> is to perf_event_open() such an event but not attach anything to it.
> Attaching to cpc:::sched_switch-all-1 and weeding out off-cpu events
> was also explored, but that required a copy-in of task state, a
> comparison, etc., so in such a hot codepath a more precise attach
> is preferable.
>
> With this in place we get sensible on/off cpu numbers:
>
> $ dtrace -n 'sched:::*-cpu { @c[probename] = count();}'
> dtrace: description 'sched:::*-cpu ' matched 2 probes
> ^C
>
> off-cpu 1454
> on-cpu 1454
>
> Signed-off-by: Alan Maguire <alan.maguire at oracle.com>
> ---
> libdtrace/dt_prov_sched.c | 48 +++++++++++++++++++++++++++++++++++++--
> 1 file changed, 46 insertions(+), 2 deletions(-)
>
> diff --git a/libdtrace/dt_prov_sched.c b/libdtrace/dt_prov_sched.c
> index 2749385a..3e9d4f6b 100644
> --- a/libdtrace/dt_prov_sched.c
> +++ b/libdtrace/dt_prov_sched.c
> @@ -9,6 +9,9 @@
> #include <assert.h>
> #include <errno.h>
>
> +#include <linux/perf_event.h>
> +#include <perfmon/pfmlib_perf_event.h>
> +
> #include "dt_dctx.h"
> #include "dt_cg.h"
> #include "dt_provider_sdt.h"
> @@ -25,7 +28,7 @@ static probe_dep_t probes[] = {
> { "off-cpu",
> DTRACE_PROBESPEC_NAME, "rawtp:sched::sched_switch" },
> { "on-cpu",
> - DTRACE_PROBESPEC_NAME, "fbt::schedule_tail:entry" },
> + DTRACE_PROBESPEC_NAME, "fbt::__perf_event_task_sched_in:entry" },
> { "surrender",
> DTRACE_PROBESPEC_NAME, "fbt::do_sched_yield:entry" },
> { "tick",
> @@ -141,13 +144,54 @@ static int trampoline(dt_pcb_t *pcb, uint_t exitlbl)
> return 0;
> }
>
> +/* We need a custom enabling for on-cpu probes to ensure that the fbt function
> + * __perf_event_task_sched_in is called. __perf_event_task_sched_in
> + * will not be called unless context switch perf events have been enabled,
> + * so we do that here by opening a context switch count perf event but not
> + * attaching anything to it to minimize overhead. The alternative - attaching
> + * to cpc:::context_switches-all-1 and weeding out on- versus off-cpu events
> + * via a trampoline is too expensive. This approach works stably across
> + * kernels because __perf_event_task_sched_in() is not static, so not potentially
> + * subject to inlining or other optimizations.
> + */
> +static void enable(dtrace_hdl_t *dtp, dt_probe_t *prp)
> +{
> + struct perf_event_attr attr = {};
> + int swfd;
> +
> + if (strcmp(prp->desc->prb, "on-cpu") != 0)
> + return dt_sdt_enable(dtp, prp);
> +
> + memset(&attr, 0, sizeof(attr));
> + attr.size = sizeof(attr);
> + attr.type = PERF_TYPE_SOFTWARE;
> + attr.config = PERF_COUNT_SW_CONTEXT_SWITCHES;
> + attr.freq = 1;
> + attr.sample_freq = 1000;
> + attr.context_switch = 1;
> +
> + swfd = dt_perf_event_open(&attr, -1, 0, -1, 0);
> + if (swfd < 0)
> + dt_dprintf("open of context_switch perf event open failed: %d\n", errno);
> + else
> + prp->prv_data = (void *)(long)swfd;
> + dt_sdt_enable(dtp, prp);
> +}
> +
> +static void detach(dtrace_hdl_t *dtp, const dt_probe_t *prp)
> +{
> + if (prp->prv_data)
> + close((int)(long)prp->prv_data);
> +}
> +
> dt_provimpl_t dt_sched = {
> .name = prvname,
> .prog_type = BPF_PROG_TYPE_UNSPEC,
> .populate = &populate,
> - .enable = &dt_sdt_enable,
> + .enable = &enable,
> .load_prog = &dt_bpf_prog_load,
> .trampoline = &trampoline,
> .probe_info = &dt_sdt_probe_info,
> + .detach = &detach,
> .destroy = &dt_sdt_destroy,
> };
> --
> 2.43.5
>
>
> _______________________________________________
> DTrace-devel mailing list
> DTrace-devel at oss.oracle.com
> https://oss.oracle.com/mailman/listinfo/dtrace-devel
More information about the DTrace-devel mailing list