[DTrace-devel] [PATCH v3 2/2] Clean up sched provider trampoline FIXMEs

Kris Van Hees kris.van.hees at oracle.com
Tue Apr 15 11:59:06 UTC 2025


On Fri, Apr 11, 2025 at 05:20:00PM -0400, Eugene Loh wrote:
> On 4/11/25 16:48, Kris Van Hees wrote:
> 
> > Partial comments below (still looking at the provider changes)...
> > 
> > On Thu, Apr 03, 2025 at 01:02:52AM -0400, eugene.loh at oracle.com wrote:
> > > From: Eugene Loh <eugene.loh at oracle.com>
> > > 
> > > The sched provider trampoline for enqueue and dequeue probes had
> > > pending FIXMEs for providing a cpuinfo_t* for the cpu associated
> > > with the run queue.  Implement the missing code.
> > > 
> > > Since the cpu associated with the run queue might be different from
> > > the cpu where we are running, it becomes necessary to access the
> > > cpuinfo for some random cpu.  With Linux 5.18, there is a BPF
> > > helper function map_lookup_percpu_elem() that allows such lookups
> > > on per-cpu arrays.  To support older kernels, however, we change
> > > the cpuinfo BPF map from per-cpu to global.  Also, it is a hash
> > > table rather than an array in case cpus are not numbered consecutively.
> > I agree with all the above.  Good solution.
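
To spell out the BPF-side consequence for anyone following along: since the
map is no longer a per-cpu array with a single element, a lookup now has to
pass the cpu id of interest as the key instead of 0.  Roughly (a sketch only,
not tree code; some_other_cpu_id is just a placeholder):

        uint32_t                key;
        dt_bpf_cpuinfo_t        *ci;

        /* Current cpu (what dt_bvar_curcpu() and dt_no_agg() now do): */
        key = bpf_get_smp_processor_id();
        ci = bpf_map_lookup_elem(&cpuinfo, &key);

        /* Any other cpu, e.g. the cpu that owns a runqueue: */
        key = some_other_cpu_id;        /* placeholder, e.g. read from rq */
        ci = bpf_map_lookup_elem(&cpuinfo, &key);

With the per-cpu array that second lookup was not possible without the 5.18+
map_lookup_percpu_elem() helper; with a global hash keyed by cpu id it is.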
> > 
> > > Signed-off-by: Eugene Loh <eugene.loh at oracle.com>
> > > ---
> > >   bpf/get_agg.c                     |  2 +-
> > >   bpf/get_bvar.c                    |  2 +-
> > >   libdtrace/dt_bpf.c                | 34 ++++++--------
> > >   libdtrace/dt_cg.c                 |  5 ++-
> > >   libdtrace/dt_prov_lockstat.c      |  4 +-
> > >   libdtrace/dt_prov_sched.c         | 74 +++++++++++++++++++++++++------
> > >   libdtrace/dt_work.c               | 20 +++------
> > >   test/unittest/sched/tst.enqueue.d |  1 -
> > >   8 files changed, 89 insertions(+), 53 deletions(-)
> > > 
> > > diff --git a/bpf/get_agg.c b/bpf/get_agg.c
> > > index c0eb825f0..e70caa6ef 100644
> > > --- a/bpf/get_agg.c
> > > +++ b/bpf/get_agg.c
> > > @@ -21,7 +21,7 @@ extern struct bpf_map_def cpuinfo;
> > >    */
> > >   noinline uint64_t *dt_no_agg(void)
> > >   {
> > > -	uint32_t		key = 0;
> > > +	uint32_t		key = bpf_get_smp_processor_id();
> > >   	dt_bpf_cpuinfo_t	*ci;
> > >   	ci = bpf_map_lookup_elem(&cpuinfo, &key);
> > > diff --git a/bpf/get_bvar.c b/bpf/get_bvar.c
> > > index d372b3445..d81c3605f 100644
> > > --- a/bpf/get_bvar.c
> > > +++ b/bpf/get_bvar.c
> > > @@ -67,7 +67,7 @@ noinline uint64_t dt_bvar_caller(const dt_dctx_t *dctx)
> > >   noinline uint64_t dt_bvar_curcpu(const dt_dctx_t *dctx)
> > >   {
> > > -	uint32_t	key = 0;
> > > +	uint32_t	key = bpf_get_smp_processor_id();
> > >   	void		*val = bpf_map_lookup_elem(&cpuinfo, &key);
> > >   	if (val == NULL) {
> > > diff --git a/libdtrace/dt_bpf.c b/libdtrace/dt_bpf.c
> > > index 6d42a96c7..d6722cbd1 100644
> > > --- a/libdtrace/dt_bpf.c
> > > +++ b/libdtrace/dt_bpf.c
> > > @@ -761,37 +761,29 @@ gmap_create_buffers(dtrace_hdl_t *dtp)
> > >   static int
> > >   gmap_create_cpuinfo(dtrace_hdl_t *dtp)
> > >   {
> > > -	int			i, rc;
> > > +	int			i;
> > >   	uint32_t		key = 0;
> > >   	dtrace_conf_t		*conf = &dtp->dt_conf;
> > >   	size_t			ncpus = conf->num_online_cpus;
> > > -	dt_bpf_cpuinfo_t	*data;
> > > +	dt_bpf_cpuinfo_t	data;
> > Not sure about this, because (see below)...
> > 
> > >   	cpuinfo_t		*ci;
> > > -	/*
> > > -	 * num_possible_cpus <= num_online_cpus: see dt_conf_init.
> > > -	 */
> > > -	data = dt_calloc(dtp, dtp->dt_conf.num_possible_cpus,
> > > -			 sizeof(dt_bpf_cpuinfo_t));
> > > -	if (data == NULL)
> > > -		return dt_set_errno(dtp, EDT_NOMEM);
> > > -
> > > -	for (i = 0, ci = &conf->cpus[0]; i < ncpus; i++, ci++)
> > > -		memcpy(&data[ci->cpu_id].ci, ci, sizeof(cpuinfo_t));
> > > -
> > >   	dtp->dt_cpumap_fd = create_gmap(dtp, "cpuinfo",
> > > -					BPF_MAP_TYPE_PERCPU_ARRAY,
> > > +					BPF_MAP_TYPE_HASH,
> > >   					sizeof(uint32_t),
> > > -					sizeof(dt_bpf_cpuinfo_t), 1);
> > > +					sizeof(dt_bpf_cpuinfo_t), ncpus);
> > >   	if (dtp->dt_cpumap_fd == -1)
> > >   		return -1;
> > > -	rc = dt_bpf_map_update(dtp->dt_cpumap_fd, &key, data);
> > > -	dt_free(dtp, data);
> > > -	if (rc == -1)
> > > -		return dt_bpf_error(dtp,
> > > -				    "cannot update BPF map 'cpuinfo': %s\n",
> > > -				    strerror(errno));
> > > +	memset(&data, 0, sizeof(data));
> > Do we need this, because (see below)....
> > 
> > > +	for (i = 0, ci = &conf->cpus[0]; i < ncpus; i++, ci++) {
> > > +		memcpy(&data.ci, ci, sizeof(cpuinfo_t));
> > Do we need this, because (see below)....
> > 
> > > +		key = ci->cpu_id;
> > > +		if (dt_bpf_map_update(dtp->dt_cpumap_fd, &key, &data) == -1)
> > Why can't we simply do:
> > 
> > 		if (dt_bpf_map_update(dtp->dt_cpumap_fd, &key, ci) == -1)
> 
> I think the problem is that the BPF map has elements of size
> sizeof(dt_bpf_cpuinfo_t), while ci only has size sizeof(cpuinfo_t), which
> is smaller.  So if we do an update like that, the tail of the map element
> will contain garbage where we want it to be initialized to 0.

Yes, but I am 99% certain that BPF maps are allocated and initialized with
zeros, because doing otherwise would be a major security risk for the kernel:
it could leak kernel data, which is a big no-no.  So you should be able to
count on that (we should verify it first to make certain, but honestly it
has to be the case).
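
Concretely, the simplification I have in mind is just (untested, and it does
assume both the zero-fill behaviour above and that dt_bpf_map_update() is
fine with a source buffer of sizeof(cpuinfo_t) rather than the full value
size -- that is exactly the part to verify):

        for (i = 0, ci = &conf->cpus[0]; i < ncpus; i++, ci++) {
                key = ci->cpu_id;
                if (dt_bpf_map_update(dtp->dt_cpumap_fd, &key, ci) == -1)
                        return dt_bpf_error(dtp,
                                            "cannot update BPF map 'cpuinfo': %s\n",
                                            strerror(errno));
        }

which drops the local dt_bpf_cpuinfo_t copy and the memset()/memcpy() dance.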

> > > +			return dt_bpf_error(dtp,
> > > +					    "cannot update BPF map 'cpuinfo': %s\n",
> > > +					    strerror(errno));
> > > +	}
> > >   	return 0;
> > >   }
> > > diff --git a/libdtrace/dt_cg.c b/libdtrace/dt_cg.c
> > > index 6dcf4cd3d..d83b1c2ce 100644
> > > --- a/libdtrace/dt_cg.c
> > > +++ b/libdtrace/dt_cg.c
> > > @@ -1243,9 +1243,12 @@ dt_cg_epilogue(dt_pcb_t *pcb)
> > >   		} else {
> > >   			idp = dt_dlib_get_map(dtp, "cpuinfo");
> > >   			assert(idp != NULL);
> > > +
> > > +			emit(dlp, BPF_CALL_HELPER(BPF_FUNC_get_smp_processor_id));
> > > +
> > >   			dt_cg_xsetx(dlp, idp, DT_LBL_NONE, BPF_REG_1, idp->di_id);
> > >   			emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_2, BPF_REG_FP, DT_STK_SP));
> > > -			emit(dlp, BPF_STORE_IMM(BPF_DW, BPF_REG_2, 0, 0));
> > > +			emit(dlp, BPF_STORE(BPF_DW, BPF_REG_2, 0, BPF_REG_0));
> > >   			emit(dlp, BPF_CALL_HELPER(BPF_FUNC_map_lookup_elem));
> > >   			emit(dlp, BPF_BRANCH_IMM(BPF_JEQ, BPF_REG_0, 0, pcb->pcb_exitlbl));
> > >   			emit(dlp, BPF_MOV_IMM(BPF_REG_1, 1));
> > > diff --git a/libdtrace/dt_prov_lockstat.c b/libdtrace/dt_prov_lockstat.c
> > > index c73edf9be..8b2cf4da2 100644
> > > --- a/libdtrace/dt_prov_lockstat.c
> > > +++ b/libdtrace/dt_prov_lockstat.c
> > > @@ -121,11 +121,13 @@ static void get_cpuinfo(dtrace_hdl_t *dtp, dt_irlist_t *dlp, uint_t exitlbl)
> > >   {
> > >   	dt_ident_t	*idp = dt_dlib_get_map(dtp, "cpuinfo");
> > > +	emit(dlp,  BPF_CALL_HELPER(BPF_FUNC_get_smp_processor_id));
> > > +
> > >   	assert(idp != NULL);
> > >   	dt_cg_xsetx(dlp, idp, DT_LBL_NONE, BPF_REG_1, idp->di_id);
> > >   	emit(dlp, BPF_MOV_REG(BPF_REG_2, BPF_REG_FP));
> > >   	emit(dlp, BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, DT_TRAMP_SP_BASE));
> > > -	emit(dlp, BPF_STORE_IMM(BPF_DW, BPF_REG_2, 0, 0));
> > > +	emit(dlp, BPF_STORE(BPF_DW, BPF_REG_2, 0, BPF_REG_0));
> > >   	emit(dlp, BPF_CALL_HELPER(BPF_FUNC_map_lookup_elem));
> > >   	emit(dlp, BPF_BRANCH_IMM(BPF_JEQ, BPF_REG_0, 0, exitlbl));
> > >   	emit(dlp, BPF_MOV_REG(BPF_REG_6, BPF_REG_0));
> > > diff --git a/libdtrace/dt_prov_sched.c b/libdtrace/dt_prov_sched.c
> > > index 3a218f3cb..a548e679f 100644
> > > --- a/libdtrace/dt_prov_sched.c
> > > +++ b/libdtrace/dt_prov_sched.c
> > > @@ -84,6 +84,40 @@ static int populate(dtrace_hdl_t *dtp)
> > >   			       probe_args, probes);
> > >   }
> > > +/*
> > > + * Get a pointer to the cpuinfo_t structure for the CPU associated
> > > + * with the runqueue that is in arg0.
> > > + *
> > > + * Clobbers %r1 through %r5
> > > + * Stores pointer to cpuinfo_t struct in %r0
> > > + */
> > > +static void get_cpuinfo(dtrace_hdl_t *dtp, dt_irlist_t *dlp, uint_t exitlbl)
> > > +{
> > > +	dt_ident_t	*idp = dt_dlib_get_map(dtp, "cpuinfo");
> > > +
> > > +	assert(idp != NULL);
> > > +
> > > +	/* Put the runqueue pointer from mst->arg0 into %r3. */
> > > +	emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_3, BPF_REG_7, DMST_ARG(0)));
> > > +
> > > +	/* Turn it into a pointer to its cpu member. */
> > > +	emit(dlp, BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, dt_cg_ctf_offsetof("struct rq", "cpu", NULL, 1)));
> > > +
> > > +	/* Call bpf_probe_read_kernel(%fp + DT_TRAMP_SP_SLOT[0], sizeof(int), %r3) */
> > > +	emit(dlp, BPF_MOV_IMM(BPF_REG_2, (int) sizeof(int)));
> > > +	emit(dlp, BPF_MOV_REG(BPF_REG_1, BPF_REG_FP));
> > > +	emit(dlp, BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, DT_TRAMP_SP_SLOT(0)));
> > > +	emit(dlp, BPF_CALL_HELPER(BPF_FUNC_probe_read_kernel));
> > > +	emit(dlp, BPF_BRANCH_IMM(BPF_JNE, BPF_REG_0, 0, exitlbl));
> > > +
> > > +	/* Now look up the corresponding cpuinfo_t. */
> > > +	dt_cg_xsetx(dlp, idp, DT_LBL_NONE, BPF_REG_1, idp->di_id);
> > > +	emit(dlp, BPF_MOV_REG(BPF_REG_2, BPF_REG_FP));
> > > +	emit(dlp, BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, DT_TRAMP_SP_SLOT(0)));
> > > +	emit(dlp, BPF_CALL_HELPER(BPF_FUNC_map_lookup_elem));
> > > +	emit(dlp, BPF_BRANCH_IMM(BPF_JEQ, BPF_REG_0, 0, exitlbl));
> > > +}
> > > +
> > >   /*
> > >    * Generate a BPF trampoline for a SDT probe.
> > >    *
> > > @@ -98,18 +132,39 @@ static int populate(dtrace_hdl_t *dtp)
> > >    */
> > >   static int trampoline(dt_pcb_t *pcb, uint_t exitlbl)
> > >   {
> > > +	dtrace_hdl_t	*dtp = pcb->pcb_hdl;
> > >   	dt_irlist_t	*dlp = &pcb->pcb_ir;
> > >   	dt_probe_t	*prp = pcb->pcb_probe;
> > >   	if (strcmp(prp->desc->prb, "dequeue") == 0) {
> > > -		emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_7, DMST_ARG(1)));
> > > -		emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(0), BPF_REG_0));
> > >   		/*
> > > -		 * FIXME: arg1 should be a pointer to cpuinfo_t for the CPU
> > > -		 *	  associated with the runqueue.
> > > +		 * Get the runqueue from arg0 and place its cpuinfo_t* into %r0.
> > > +		 */
> > > +		get_cpuinfo(dtp, dlp, exitlbl);
> > > +
> > > +		/*
> > > +		 * Copy arg1 into arg0.
> > >   		 */
> > > -		emit(dlp, BPF_STORE_IMM(BPF_DW, BPF_REG_7, DMST_ARG(1), 0));
> > > +		emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_3, BPF_REG_7, DMST_ARG(1)));
> > > +		emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(0), BPF_REG_3));
> > > +
> > > +		/* Store the cpuinfo_t* in %r0 into arg1. */
> > > +		emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(1), BPF_REG_0));
> > >   	} else if (strcmp(prp->desc->prb, "enqueue") == 0) {
> > > +		/*
> > > +		 * Get the runqueue from arg0 and place its cpuinfo_t* into %r0.
> > > +		 */
> > > +		get_cpuinfo(dtp, dlp, exitlbl);
> > > +
> > > +		/*
> > > +		 * Copy arg1 into arg0.
> > > +		 */
> > > +		emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_3, BPF_REG_7, DMST_ARG(1)));
> > > +		emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(0), BPF_REG_3));
> > > +
> > > +		/* Store the cpuinfo_t* in %r0 into arg1. */
> > > +		emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(1), BPF_REG_0));
> > > +
> > >   /*
> > >    * This is ugly but necessary...  enqueue_task() takes a flags argument and the
> > >    * ENQUEUE_HEAD flag is used to indicate that the task is to be placed at the
> > > @@ -120,15 +175,6 @@ static int trampoline(dt_pcb_t *pcb, uint_t exitlbl)
> > >    * outside the kernel source tree.
> > >    */
> > >   #define ENQUEUE_HEAD	0x10
> > > -
> > > -		emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_7, DMST_ARG(1)));
> > > -		emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(0), BPF_REG_0));
> > > -		/*
> > > -		 * FIXME: arg1 should be a pointer to cpuinfo_t for the CPU
> > > -		 *	  associated with the runqueue.
> > > -		 */
> > > -		emit(dlp, BPF_STORE_IMM(BPF_DW, BPF_REG_7, DMST_ARG(1), 0));
> > > -
> > >   		emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_7, DMST_ARG(2)));
> > >   		emit(dlp, BPF_ALU64_IMM(BPF_AND, BPF_REG_0, ENQUEUE_HEAD));
> > >   		emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(2), BPF_REG_0));
> > > diff --git a/libdtrace/dt_work.c b/libdtrace/dt_work.c
> > > index 498d5332a..2167ed299 100644
> > > --- a/libdtrace/dt_work.c
> > > +++ b/libdtrace/dt_work.c
> > > @@ -37,35 +37,29 @@ END_probe(void)
> > >   int
> > >   dt_check_cpudrops(dtrace_hdl_t *dtp, processorid_t cpu, dtrace_dropkind_t what)
> > >   {
> > > -	dt_bpf_cpuinfo_t	*ci;
> > > -	uint32_t		cikey = 0;
> > > +	dt_bpf_cpuinfo_t	ci;
> > > +	uint32_t		cikey = cpu;
> > >   	uint64_t		cnt;
> > >   	int			rval = 0;
> > >   	assert(what == DTRACEDROP_PRINCIPAL || what == DTRACEDROP_AGGREGATION);
> > > -	ci = dt_calloc(dtp, dtp->dt_conf.num_possible_cpus,
> > > -		       sizeof(dt_bpf_cpuinfo_t));
> > > -	if (ci == NULL)
> > > -		return dt_set_errno(dtp, EDT_NOMEM);
> > > -
> > > -	if (dt_bpf_map_lookup(dtp->dt_cpumap_fd, &cikey, ci) == -1) {
> > > +	if (dt_bpf_map_lookup(dtp->dt_cpumap_fd, &cikey, &ci) == -1) {
> > >   		rval = dt_set_errno(dtp, EDT_BPF);
> > >   		goto fail;
> > >   	}
> > >   	if (what == DTRACEDROP_PRINCIPAL) {
> > > -		cnt = ci[cpu].buf_drops - dtp->dt_drops[cpu].buf;
> > > -		dtp->dt_drops[cpu].buf = ci[cpu].buf_drops;
> > > +		cnt = ci.buf_drops - dtp->dt_drops[cpu].buf;
> > > +		dtp->dt_drops[cpu].buf = ci.buf_drops;
> > >   	} else {
> > > -		cnt = ci[cpu].agg_drops - dtp->dt_drops[cpu].agg;
> > > -		dtp->dt_drops[cpu].agg = ci[cpu].agg_drops;
> > > +		cnt = ci.agg_drops - dtp->dt_drops[cpu].agg;
> > > +		dtp->dt_drops[cpu].agg = ci.agg_drops;
> > >   	}
> > >   	rval = dt_handle_cpudrop(dtp, cpu, what, cnt);
> > >   fail:
> > > -	dt_free(dtp, ci);
> > >   	return rval;
> > >   }
> > > diff --git a/test/unittest/sched/tst.enqueue.d b/test/unittest/sched/tst.enqueue.d
> > > index f445ac843..28dcace8c 100644
> > > --- a/test/unittest/sched/tst.enqueue.d
> > > +++ b/test/unittest/sched/tst.enqueue.d
> > > @@ -4,7 +4,6 @@
> > >    * Licensed under the Universal Permissive License v 1.0 as shown at
> > >    * http://oss.oracle.com/licenses/upl.
> > >    */
> > > -/* @@xfail: dtv2 */
> > >   #pragma D option switchrate=100hz
> > >   #pragma D option destructive
> > > -- 
> > > 2.43.5
> > > 
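
As a side note for reviewers less familiar with the trampoline code: the
instruction sequence emitted by the new get_cpuinfo() in dt_prov_sched.c is
roughly equivalent to the following C (a hand-written approximation only;
rq_cpuinfo and cpu_off are my names, with cpu_off standing in for the
CTF-derived offsetof(struct rq, cpu)):

        static dt_bpf_cpuinfo_t *rq_cpuinfo(const char *rq, uint32_t cpu_off)
        {
                int     cpu;

                /* rq is the runqueue pointer the probe saved in arg0. */
                if (bpf_probe_read_kernel(&cpu, sizeof(cpu), rq + cpu_off) != 0)
                        return NULL;            /* trampoline jumps to exitlbl */

                /* Use the cpu id as the key into the global cpuinfo hash. */
                return bpf_map_lookup_elem(&cpuinfo, &cpu);
                                                /* NULL also goes to exitlbl */
        }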


