[DTrace-devel] [PATCH v3 2/2] Clean up sched provider trampoline FIXMEs

Eugene Loh eugene.loh at oracle.com
Fri Apr 11 21:20:00 UTC 2025


On 4/11/25 16:48, Kris Van Hees wrote:

> Partial comments below (still looking at the provider changes)...
>
> On Thu, Apr 03, 2025 at 01:02:52AM -0400, eugene.loh at oracle.com wrote:
>> From: Eugene Loh <eugene.loh at oracle.com>
>>
>> The sched provider trampoline for enqueue and dequeue probes had
>> pending FIXMEs for providing a cpuinfo_t* for the cpu associated
>> with the run queue.  Implement the missing code.
>>
>> Since the cpu associated with the run queue might be different from
>> the cpu where we are running, it becomes necessary to access the
>> cpuinfo for some random cpu.  With Linux 5.18, there is a BPF
>> helper function map_lookup_percpu_elem() that allows such lookups
>> on per-cpu arrays.  To support older kernels, however, we change
>> the cpuinfo BPF map from per-cpu to global.  Also, it is a hash
>> table rather than an array in case cpus are not numbered consecutively.
> I agree with all the above.  Good solution.
>
>> Signed-off-by: Eugene Loh <eugene.loh at oracle.com>
>> ---
>>   bpf/get_agg.c                     |  2 +-
>>   bpf/get_bvar.c                    |  2 +-
>>   libdtrace/dt_bpf.c                | 34 ++++++--------
>>   libdtrace/dt_cg.c                 |  5 ++-
>>   libdtrace/dt_prov_lockstat.c      |  4 +-
>>   libdtrace/dt_prov_sched.c         | 74 +++++++++++++++++++++++++------
>>   libdtrace/dt_work.c               | 20 +++------
>>   test/unittest/sched/tst.enqueue.d |  1 -
>>   8 files changed, 89 insertions(+), 53 deletions(-)
>>
>> diff --git a/bpf/get_agg.c b/bpf/get_agg.c
>> index c0eb825f0..e70caa6ef 100644
>> --- a/bpf/get_agg.c
>> +++ b/bpf/get_agg.c
>> @@ -21,7 +21,7 @@ extern struct bpf_map_def cpuinfo;
>>    */
>>   noinline uint64_t *dt_no_agg(void)
>>   {
>> -	uint32_t		key = 0;
>> +	uint32_t		key = bpf_get_smp_processor_id();
>>   	dt_bpf_cpuinfo_t	*ci;
>>   
>>   	ci = bpf_map_lookup_elem(&cpuinfo, &key);
>> diff --git a/bpf/get_bvar.c b/bpf/get_bvar.c
>> index d372b3445..d81c3605f 100644
>> --- a/bpf/get_bvar.c
>> +++ b/bpf/get_bvar.c
>> @@ -67,7 +67,7 @@ noinline uint64_t dt_bvar_caller(const dt_dctx_t *dctx)
>>   
>>   noinline uint64_t dt_bvar_curcpu(const dt_dctx_t *dctx)
>>   {
>> -	uint32_t	key = 0;
>> +	uint32_t	key = bpf_get_smp_processor_id();
>>   	void		*val = bpf_map_lookup_elem(&cpuinfo, &key);
>>   
>>   	if (val == NULL) {
>> diff --git a/libdtrace/dt_bpf.c b/libdtrace/dt_bpf.c
>> index 6d42a96c7..d6722cbd1 100644
>> --- a/libdtrace/dt_bpf.c
>> +++ b/libdtrace/dt_bpf.c
>> @@ -761,37 +761,29 @@ gmap_create_buffers(dtrace_hdl_t *dtp)
>>   static int
>>   gmap_create_cpuinfo(dtrace_hdl_t *dtp)
>>   {
>> -	int			i, rc;
>> +	int			i;
>>   	uint32_t		key = 0;
>>   	dtrace_conf_t		*conf = &dtp->dt_conf;
>>   	size_t			ncpus = conf->num_online_cpus;
>> -	dt_bpf_cpuinfo_t	*data;
>> +	dt_bpf_cpuinfo_t	data;
> Not sure about this, because (see below)...
>
>>   	cpuinfo_t		*ci;
>>   
>> -	/*
>> -	 * num_possible_cpus <= num_online_cpus: see dt_conf_init.
>> -	 */
>> -	data = dt_calloc(dtp, dtp->dt_conf.num_possible_cpus,
>> -			 sizeof(dt_bpf_cpuinfo_t));
>> -	if (data == NULL)
>> -		return dt_set_errno(dtp, EDT_NOMEM);
>> -
>> -	for (i = 0, ci = &conf->cpus[0]; i < ncpus; i++, ci++)
>> -		memcpy(&data[ci->cpu_id].ci, ci, sizeof(cpuinfo_t));
>> -
>>   	dtp->dt_cpumap_fd = create_gmap(dtp, "cpuinfo",
>> -					BPF_MAP_TYPE_PERCPU_ARRAY,
>> +					BPF_MAP_TYPE_HASH,
>>   					sizeof(uint32_t),
>> -					sizeof(dt_bpf_cpuinfo_t), 1);
>> +					sizeof(dt_bpf_cpuinfo_t), ncpus);
>>   	if (dtp->dt_cpumap_fd == -1)
>>   		return -1;
>>   
>> -	rc = dt_bpf_map_update(dtp->dt_cpumap_fd, &key, data);
>> -	dt_free(dtp, data);
>> -	if (rc == -1)
>> -		return dt_bpf_error(dtp,
>> -				    "cannot update BPF map 'cpuinfo': %s\n",
>> -				    strerror(errno));
>> +	memset(&data, 0, sizeof(data));
> Do we need this, because (see below)....
>
>> +	for (i = 0, ci = &conf->cpus[0]; i < ncpus; i++, ci++) {
>> +		memcpy(&data.ci, ci, sizeof(cpuinfo_t));
> Do we need this, because (see below)....
>
>> +		key = ci->cpu_id;
>> +		if (dt_bpf_map_update(dtp->dt_cpumap_fd, &key, &data) == -1)
> Why can'you we simply do:
>
> 		if (dt_bpf_map_update(dtp->dt_cpumap_fd, &key, ci) == -1)

I think the problem is that the BPF map has elements with size 
sizeof(dt_bpf_cpuinfo_t).  Meanwhile, ci has size sizeof(cpuinfo_t), 
which is smaller.  So if we do an update like that, the map will have 
stuff where we want it to be initialized to 0.

>> +			return dt_bpf_error(dtp,
>> +					    "cannot update BPF map 'cpuinfo': %s\n",
>> +					    strerror(errno));
>> +	}
>>   
>>   	return 0;
>>   }
>> diff --git a/libdtrace/dt_cg.c b/libdtrace/dt_cg.c
>> index 6dcf4cd3d..d83b1c2ce 100644
>> --- a/libdtrace/dt_cg.c
>> +++ b/libdtrace/dt_cg.c
>> @@ -1243,9 +1243,12 @@ dt_cg_epilogue(dt_pcb_t *pcb)
>>   		} else {
>>   			idp = dt_dlib_get_map(dtp, "cpuinfo");
>>   			assert(idp != NULL);
>> +
>> +			emit(dlp, BPF_CALL_HELPER(BPF_FUNC_get_smp_processor_id));
>> +
>>   			dt_cg_xsetx(dlp, idp, DT_LBL_NONE, BPF_REG_1, idp->di_id);
>>   			emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_2, BPF_REG_FP, DT_STK_SP));
>> -			emit(dlp, BPF_STORE_IMM(BPF_DW, BPF_REG_2, 0, 0));
>> +			emit(dlp, BPF_STORE(BPF_DW, BPF_REG_2, 0, BPF_REG_0));
>>   			emit(dlp, BPF_CALL_HELPER(BPF_FUNC_map_lookup_elem));
>>   			emit(dlp, BPF_BRANCH_IMM(BPF_JEQ, BPF_REG_0, 0, pcb->pcb_exitlbl));
>>   			emit(dlp, BPF_MOV_IMM(BPF_REG_1, 1));
>> diff --git a/libdtrace/dt_prov_lockstat.c b/libdtrace/dt_prov_lockstat.c
>> index c73edf9be..8b2cf4da2 100644
>> --- a/libdtrace/dt_prov_lockstat.c
>> +++ b/libdtrace/dt_prov_lockstat.c
>> @@ -121,11 +121,13 @@ static void get_cpuinfo(dtrace_hdl_t *dtp, dt_irlist_t *dlp, uint_t exitlbl)
>>   {
>>   	dt_ident_t	*idp = dt_dlib_get_map(dtp, "cpuinfo");
>>   
>> +	emit(dlp,  BPF_CALL_HELPER(BPF_FUNC_get_smp_processor_id));
>> +
>>   	assert(idp != NULL);
>>   	dt_cg_xsetx(dlp, idp, DT_LBL_NONE, BPF_REG_1, idp->di_id);
>>   	emit(dlp, BPF_MOV_REG(BPF_REG_2, BPF_REG_FP));
>>   	emit(dlp, BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, DT_TRAMP_SP_BASE));
>> -	emit(dlp, BPF_STORE_IMM(BPF_DW, BPF_REG_2, 0, 0));
>> +	emit(dlp, BPF_STORE(BPF_DW, BPF_REG_2, 0, BPF_REG_0));
>>   	emit(dlp, BPF_CALL_HELPER(BPF_FUNC_map_lookup_elem));
>>   	emit(dlp, BPF_BRANCH_IMM(BPF_JEQ, BPF_REG_0, 0, exitlbl));
>>   	emit(dlp, BPF_MOV_REG(BPF_REG_6, BPF_REG_0));
>> diff --git a/libdtrace/dt_prov_sched.c b/libdtrace/dt_prov_sched.c
>> index 3a218f3cb..a548e679f 100644
>> --- a/libdtrace/dt_prov_sched.c
>> +++ b/libdtrace/dt_prov_sched.c
>> @@ -84,6 +84,40 @@ static int populate(dtrace_hdl_t *dtp)
>>   			       probe_args, probes);
>>   }
>>   
>> +/*
>> + * Get a pointer to the cpuinfo_t structure for the CPU associated
>> + * with the runqueue that is in arg0.
>> + *
>> + * Clobbers %r1 through %r5
>> + * Stores pointer to cpuinfo_t struct in %r0
>> + */
>> +static void get_cpuinfo(dtrace_hdl_t *dtp, dt_irlist_t *dlp, uint_t exitlbl)
>> +{
>> +	dt_ident_t	*idp = dt_dlib_get_map(dtp, "cpuinfo");
>> +
>> +	assert(idp != NULL);
>> +
>> +	/* Put the runqueue pointer from mst->arg0 into %r3. */
>> +	emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_3, BPF_REG_7, DMST_ARG(0)));
>> +
>> +	/* Turn it into a pointer to its cpu member. */
>> +	emit(dlp, BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, dt_cg_ctf_offsetof("struct rq", "cpu", NULL, 1)));
>> +
>> +	/* Call bpf_probe_read_kernel(%fp + DT_TRAMP_SP_SLOT[0], sizeof(int), %r3) */
>> +	emit(dlp, BPF_MOV_IMM(BPF_REG_2, (int) sizeof(int)));
>> +	emit(dlp, BPF_MOV_REG(BPF_REG_1, BPF_REG_FP));
>> +	emit(dlp, BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, DT_TRAMP_SP_SLOT(0)));
>> +	emit(dlp, BPF_CALL_HELPER(BPF_FUNC_probe_read_kernel));
>> +	emit(dlp, BPF_BRANCH_IMM(BPF_JNE, BPF_REG_0, 0, exitlbl));
>> +
>> +	/* Now look up the corresponding cpuinfo_t. */
>> +	dt_cg_xsetx(dlp, idp, DT_LBL_NONE, BPF_REG_1, idp->di_id);
>> +	emit(dlp, BPF_MOV_REG(BPF_REG_2, BPF_REG_FP));
>> +	emit(dlp, BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, DT_TRAMP_SP_SLOT(0)));
>> +	emit(dlp, BPF_CALL_HELPER(BPF_FUNC_map_lookup_elem));
>> +	emit(dlp, BPF_BRANCH_IMM(BPF_JEQ, BPF_REG_0, 0, exitlbl));
>> +}
>> +
>>   /*
>>    * Generate a BPF trampoline for a SDT probe.
>>    *
>> @@ -98,18 +132,39 @@ static int populate(dtrace_hdl_t *dtp)
>>    */
>>   static int trampoline(dt_pcb_t *pcb, uint_t exitlbl)
>>   {
>> +	dtrace_hdl_t	*dtp = pcb->pcb_hdl;
>>   	dt_irlist_t	*dlp = &pcb->pcb_ir;
>>   	dt_probe_t	*prp = pcb->pcb_probe;
>>   
>>   	if (strcmp(prp->desc->prb, "dequeue") == 0) {
>> -		emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_7, DMST_ARG(1)));
>> -		emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(0), BPF_REG_0));
>>   		/*
>> -		 * FIXME: arg1 should be a pointer to cpuinfo_t for the CPU
>> -		 *	  associated with the runqueue.
>> +		 * Get the runqueue from arg0 and place its cpuinfo_t* into %r0.
>> +		 */
>> +		get_cpuinfo(dtp, dlp, exitlbl);
>> +
>> +		/*
>> +		 * Copy arg1 into arg0.
>>   		 */
>> -		emit(dlp, BPF_STORE_IMM(BPF_DW, BPF_REG_7, DMST_ARG(1), 0));
>> +		emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_3, BPF_REG_7, DMST_ARG(1)));
>> +		emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(0), BPF_REG_3));
>> +
>> +		/* Store the cpuinfo_t* in %r0 into arg1. */
>> +		emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(1), BPF_REG_0));
>>   	} else if (strcmp(prp->desc->prb, "enqueue") == 0) {
>> +		/*
>> +		 * Get the runqueue from arg0 and place its cpuinfo_t* into %r0.
>> +		 */
>> +		get_cpuinfo(dtp, dlp, exitlbl);
>> +
>> +		/*
>> +		 * Copy arg1 into arg0.
>> +		 */
>> +		emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_3, BPF_REG_7, DMST_ARG(1)));
>> +		emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(0), BPF_REG_3));
>> +
>> +		/* Store the cpuinfo_t* in %r0 into arg1. */
>> +		emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(1), BPF_REG_0));
>> +
>>   /*
>>    * This is ugly but necessary...  enqueue_task() takes a flags argument and the
>>    * ENQUEUE_HEAD flag is used to indicate that the task is to be placed at the
>> @@ -120,15 +175,6 @@ static int trampoline(dt_pcb_t *pcb, uint_t exitlbl)
>>    * outside the kernel source tree.
>>    */
>>   #define ENQUEUE_HEAD	0x10
>> -
>> -		emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_7, DMST_ARG(1)));
>> -		emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(0), BPF_REG_0));
>> -		/*
>> -		 * FIXME: arg1 should be a pointer to cpuinfo_t for the CPU
>> -		 *	  associated with the runqueue.
>> -		 */
>> -		emit(dlp, BPF_STORE_IMM(BPF_DW, BPF_REG_7, DMST_ARG(1), 0));
>> -
>>   		emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_7, DMST_ARG(2)));
>>   		emit(dlp, BPF_ALU64_IMM(BPF_AND, BPF_REG_0, ENQUEUE_HEAD));
>>   		emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(2), BPF_REG_0));
>> diff --git a/libdtrace/dt_work.c b/libdtrace/dt_work.c
>> index 498d5332a..2167ed299 100644
>> --- a/libdtrace/dt_work.c
>> +++ b/libdtrace/dt_work.c
>> @@ -37,35 +37,29 @@ END_probe(void)
>>   int
>>   dt_check_cpudrops(dtrace_hdl_t *dtp, processorid_t cpu, dtrace_dropkind_t what)
>>   {
>> -	dt_bpf_cpuinfo_t	*ci;
>> -	uint32_t		cikey = 0;
>> +	dt_bpf_cpuinfo_t	ci;
>> +	uint32_t		cikey = cpu;
>>   	uint64_t		cnt;
>>   	int			rval = 0;
>>   
>>   	assert(what == DTRACEDROP_PRINCIPAL || what == DTRACEDROP_AGGREGATION);
>>   
>> -	ci = dt_calloc(dtp, dtp->dt_conf.num_possible_cpus,
>> -		       sizeof(dt_bpf_cpuinfo_t));
>> -	if (ci == NULL)
>> -		return dt_set_errno(dtp, EDT_NOMEM);
>> -
>> -	if (dt_bpf_map_lookup(dtp->dt_cpumap_fd, &cikey, ci) == -1) {
>> +	if (dt_bpf_map_lookup(dtp->dt_cpumap_fd, &cikey, &ci) == -1) {
>>   		rval = dt_set_errno(dtp, EDT_BPF);
>>   		goto fail;
>>   	}
>>   
>>   	if (what == DTRACEDROP_PRINCIPAL) {
>> -		cnt = ci[cpu].buf_drops - dtp->dt_drops[cpu].buf;
>> -		dtp->dt_drops[cpu].buf = ci[cpu].buf_drops;
>> +		cnt = ci.buf_drops - dtp->dt_drops[cpu].buf;
>> +		dtp->dt_drops[cpu].buf = ci.buf_drops;
>>   	} else {
>> -		cnt = ci[cpu].agg_drops - dtp->dt_drops[cpu].agg;
>> -		dtp->dt_drops[cpu].agg = ci[cpu].agg_drops;
>> +		cnt = ci.agg_drops - dtp->dt_drops[cpu].agg;
>> +		dtp->dt_drops[cpu].agg = ci.agg_drops;
>>   	}
>>   
>>   	rval = dt_handle_cpudrop(dtp, cpu, what, cnt);
>>   
>>   fail:
>> -	dt_free(dtp, ci);
>>   	return rval;
>>   }
>>   
>> diff --git a/test/unittest/sched/tst.enqueue.d b/test/unittest/sched/tst.enqueue.d
>> index f445ac843..28dcace8c 100644
>> --- a/test/unittest/sched/tst.enqueue.d
>> +++ b/test/unittest/sched/tst.enqueue.d
>> @@ -4,7 +4,6 @@
>>    * Licensed under the Universal Permissive License v 1.0 as shown at
>>    * http://oss.oracle.com/licenses/upl.
>>    */
>> -/* @@xfail: dtv2 */
>>   
>>   #pragma D option switchrate=100hz
>>   #pragma D option destructive
>> -- 
>> 2.43.5
>>



More information about the DTrace-devel mailing list