[DTrace-devel] [PATCH v2 1/2] sched: fix firing of sched:::on-cpu

Kris Van Hees kris.van.hees at oracle.com
Fri Aug 2 21:36:22 UTC 2024


Reviewed-by: Kris Van Hees <kris.van.hees at oracle.com>

On Fri, Jun 28, 2024 at 06:16:33PM +0100, Alan Maguire via DTrace-devel wrote:
> sched:::on-cpu fires far less often than sched:::off-cpu.  It appears
> that - for recent kernels at least - the fbt::schedule_tail:entry
> placement is wrong.  The only way to efficiently ensure firing in
> the right place - when the new task has just been scheduled in -
> is to use fbt::__perf_event_task_sched_in:entry, as it
> 
> - fires at the right time
> - is not static, so not subject to inlining or other optimizations
> - is stable across kernel versions.
> 
> However, the downside is that it will not be called unless context
> switch perf events are enabled.  So the most efficient method is to
> perf_event_open() such an event but not attach anything to it.
> Attaching to cpc:::sched_switch-all-1 and weeding out off-cpu events
> was also explored, but that required a copy-in of task state, a
> comparison, etc., so in such a hot codepath a more precise attach
> is preferable.
> 
> With this in place we get sensible on/off cpu numbers:
> 
> $ dtrace -n 'sched:::*-cpu { @c[probename] = count();}'
> dtrace: description 'sched:::*-cpu ' matched 2 probes
> ^C
> 
>   off-cpu                                                        1454
>   on-cpu                                                         1454
> 
> Signed-off-by: Alan Maguire <alan.maguire at oracle.com>
> ---
>  libdtrace/dt_prov_sched.c | 48 +++++++++++++++++++++++++++++++++++++--
>  1 file changed, 46 insertions(+), 2 deletions(-)
> 
> diff --git a/libdtrace/dt_prov_sched.c b/libdtrace/dt_prov_sched.c
> index 2749385a..3e9d4f6b 100644
> --- a/libdtrace/dt_prov_sched.c
> +++ b/libdtrace/dt_prov_sched.c
> @@ -9,6 +9,9 @@
>  #include <assert.h>
>  #include <errno.h>
>  
> +#include <linux/perf_event.h>
> +#include <perfmon/pfmlib_perf_event.h>
> +
>  #include "dt_dctx.h"
>  #include "dt_cg.h"
>  #include "dt_provider_sdt.h"
> @@ -25,7 +28,7 @@ static probe_dep_t	probes[] = {
>  	{ "off-cpu",
>  	  DTRACE_PROBESPEC_NAME,	"rawtp:sched::sched_switch" },
>  	{ "on-cpu",
> -	  DTRACE_PROBESPEC_NAME,	"fbt::schedule_tail:entry" },
> +	  DTRACE_PROBESPEC_NAME,	"fbt::__perf_event_task_sched_in:entry" },
>  	{ "surrender",
>  	  DTRACE_PROBESPEC_NAME,	"fbt::do_sched_yield:entry" },
>  	{ "tick",
> @@ -141,13 +144,54 @@ static int trampoline(dt_pcb_t *pcb, uint_t exitlbl)
>  	return 0;
>  }
>  
> +/* We need a custom enabling for on-cpu probes to ensure that the fbt function
> + * __perf_event_task_sched_in is called.  __perf_event_task_sched_in
> + * will not be called unless context switch perf events have been enabled,
> + * so we do that here by opening a context switch count perf event but not
> + * attaching anything to it to minimize overhead.  The alternative - attaching
> + * to cpc:::context_switches-all-1 and weeding out on- versus off-cpu events
> + * via a trampoline is too expensive.  This approach works stably across
> + * kernels because __perf_event_task_sched_in() is not static, so not potentially
> + * subject to inlining or other optimizations.
> + */
> +static void enable(dtrace_hdl_t *dtp, dt_probe_t *prp)
> +{
> +	struct perf_event_attr attr = {};
> +	int swfd;
> +
> +	if (strcmp(prp->desc->prb, "on-cpu") != 0)
> +		return dt_sdt_enable(dtp, prp);
> +
> +	memset(&attr, 0, sizeof(attr));
> +	attr.size = sizeof(attr);
> +	attr.type = PERF_TYPE_SOFTWARE;
> +	attr.config = PERF_COUNT_SW_CONTEXT_SWITCHES;
> +	attr.freq = 1;
> +	attr.sample_freq = 1000;
> +	attr.context_switch = 1;
> +
> +	swfd = dt_perf_event_open(&attr, -1, 0, -1, 0);
> +	if (swfd < 0)
> +		dt_dprintf("open of context_switch perf event open failed: %d\n", errno);
> +	else
> +		prp->prv_data = (void *)(long)swfd;
> +	dt_sdt_enable(dtp, prp);
> +}
> +		
> +static void detach(dtrace_hdl_t *dtp, const dt_probe_t *prp)
> +{
> +	if (prp->prv_data)
> +		close((int)(long)prp->prv_data);
> +}
> +
>  dt_provimpl_t	dt_sched = {
>  	.name		= prvname,
>  	.prog_type	= BPF_PROG_TYPE_UNSPEC,
>  	.populate	= &populate,
> -	.enable		= &dt_sdt_enable,
> +	.enable		= &enable,
>  	.load_prog	= &dt_bpf_prog_load,
>  	.trampoline	= &trampoline,
>  	.probe_info	= &dt_sdt_probe_info,
> +	.detach		= &detach,
>  	.destroy	= &dt_sdt_destroy,
>  };
> -- 
> 2.43.5
> 
> 
> _______________________________________________
> DTrace-devel mailing list
> DTrace-devel at oss.oracle.com
> https://oss.oracle.com/mailman/listinfo/dtrace-devel



More information about the DTrace-devel mailing list