[DTrace-devel] [PATCH v3 2/2] Clean up sched provider trampoline FIXMEs
Kris Van Hees
kris.van.hees at oracle.com
Thu Apr 24 15:32:24 UTC 2025
On Tue, Apr 15, 2025 at 01:11:57PM -0400, Eugene Loh wrote:
> On 4/15/25 07:59, Kris Van Hees wrote:
>
> > On Fri, Apr 11, 2025 at 05:20:00PM -0400, Eugene Loh wrote:
> > > On 4/11/25 16:48, Kris Van Hees wrote:
> > >
> > > > Partial comments below (still looking at the provider changes)...
> > > >
> > > > On Thu, Apr 03, 2025 at 01:02:52AM -0400, eugene.loh at oracle.com wrote:
> > > > > From: Eugene Loh <eugene.loh at oracle.com>
> > > > >
> > > > > The sched provider trampoline for enqueue and dequeue probes had
> > > > > pending FIXMEs for providing a cpuinfo_t* for the cpu associated
> > > > > with the run queue. Implement the missing code.
> > > > >
> > > > > Since the cpu associated with the run queue might be different from
> > > > > the cpu where we are running, it becomes necessary to access the
> > > > > cpuinfo for some random cpu. With Linux 5.18, there is a BPF
> > > > > helper function map_lookup_percpu_elem() that allows such lookups
> > > > > on per-cpu arrays. To support older kernels, however, we change
> > > > > the cpuinfo BPF map from per-cpu to global. Also, it is a hash
> > > > > table rather than an array in case cpus are not numbered consecutively.
> > > > I agree with all the above. Good solution.
> > > >
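For anyone reading along in the archives, the shape of the new lookup on the
BPF side is roughly this (an untested sketch, not the actual dtrace sources;
function names are made up here, and it assumes the usual BPF helper
declarations and dt_bpf_cpuinfo_t are in scope):

    extern struct bpf_map_def cpuinfo;  /* global hash map: cpu id -> dt_bpf_cpuinfo_t */

    /* cpuinfo for the CPU we are currently running on */
    noinline dt_bpf_cpuinfo_t *my_cpuinfo(void)
    {
            uint32_t key = bpf_get_smp_processor_id();

            return bpf_map_lookup_elem(&cpuinfo, &key);
    }

    /* cpuinfo for an arbitrary CPU, e.g. the CPU that owns a runqueue */
    noinline dt_bpf_cpuinfo_t *cpu_cpuinfo(uint32_t cpu)
    {
            return bpf_map_lookup_elem(&cpuinfo, &cpu);
    }

Because the map is keyed by cpu id, the second lookup works on any kernel we
support, without needing the 5.18-only map_lookup_percpu_elem() helper.
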
> > > > > Signed-off-by: Eugene Loh <eugene.loh at oracle.com>
> > > > > ---
> > > > > bpf/get_agg.c | 2 +-
> > > > > bpf/get_bvar.c | 2 +-
> > > > > libdtrace/dt_bpf.c | 34 ++++++--------
> > > > > libdtrace/dt_cg.c | 5 ++-
> > > > > libdtrace/dt_prov_lockstat.c | 4 +-
> > > > > libdtrace/dt_prov_sched.c | 74 +++++++++++++++++++++++++------
> > > > > libdtrace/dt_work.c | 20 +++------
> > > > > test/unittest/sched/tst.enqueue.d | 1 -
> > > > > 8 files changed, 89 insertions(+), 53 deletions(-)
> > > > >
> > > > > diff --git a/bpf/get_agg.c b/bpf/get_agg.c
> > > > > index c0eb825f0..e70caa6ef 100644
> > > > > --- a/bpf/get_agg.c
> > > > > +++ b/bpf/get_agg.c
> > > > > @@ -21,7 +21,7 @@ extern struct bpf_map_def cpuinfo;
> > > > > */
> > > > > noinline uint64_t *dt_no_agg(void)
> > > > > {
> > > > > - uint32_t key = 0;
> > > > > + uint32_t key = bpf_get_smp_processor_id();
> > > > > dt_bpf_cpuinfo_t *ci;
> > > > > ci = bpf_map_lookup_elem(&cpuinfo, &key);
> > > > > diff --git a/bpf/get_bvar.c b/bpf/get_bvar.c
> > > > > index d372b3445..d81c3605f 100644
> > > > > --- a/bpf/get_bvar.c
> > > > > +++ b/bpf/get_bvar.c
> > > > > @@ -67,7 +67,7 @@ noinline uint64_t dt_bvar_caller(const dt_dctx_t *dctx)
> > > > > noinline uint64_t dt_bvar_curcpu(const dt_dctx_t *dctx)
> > > > > {
> > > > > - uint32_t key = 0;
> > > > > + uint32_t key = bpf_get_smp_processor_id();
> > > > > void *val = bpf_map_lookup_elem(&cpuinfo, &key);
> > > > > if (val == NULL) {
> > > > > diff --git a/libdtrace/dt_bpf.c b/libdtrace/dt_bpf.c
> > > > > index 6d42a96c7..d6722cbd1 100644
> > > > > --- a/libdtrace/dt_bpf.c
> > > > > +++ b/libdtrace/dt_bpf.c
> > > > > @@ -761,37 +761,29 @@ gmap_create_buffers(dtrace_hdl_t *dtp)
> > > > > static int
> > > > > gmap_create_cpuinfo(dtrace_hdl_t *dtp)
> > > > > {
> > > > > - int i, rc;
> > > > > + int i;
> > > > > uint32_t key = 0;
> > > > > dtrace_conf_t *conf = &dtp->dt_conf;
> > > > > size_t ncpus = conf->num_online_cpus;
> > > > > - dt_bpf_cpuinfo_t *data;
> > > > > + dt_bpf_cpuinfo_t data;
> > > > Not sure about this, because (see below)...
> > > >
> > > > > cpuinfo_t *ci;
> > > > > - /*
> > > > > - * num_possible_cpus <= num_online_cpus: see dt_conf_init.
> > > > > - */
> > > > > - data = dt_calloc(dtp, dtp->dt_conf.num_possible_cpus,
> > > > > - sizeof(dt_bpf_cpuinfo_t));
> > > > > - if (data == NULL)
> > > > > - return dt_set_errno(dtp, EDT_NOMEM);
> > > > > -
> > > > > - for (i = 0, ci = &conf->cpus[0]; i < ncpus; i++, ci++)
> > > > > - memcpy(&data[ci->cpu_id].ci, ci, sizeof(cpuinfo_t));
> > > > > -
> > > > > dtp->dt_cpumap_fd = create_gmap(dtp, "cpuinfo",
> > > > > - BPF_MAP_TYPE_PERCPU_ARRAY,
> > > > > + BPF_MAP_TYPE_HASH,
> > > > > sizeof(uint32_t),
> > > > > - sizeof(dt_bpf_cpuinfo_t), 1);
> > > > > + sizeof(dt_bpf_cpuinfo_t), ncpus);
> > > > > if (dtp->dt_cpumap_fd == -1)
> > > > > return -1;
> > > > > - rc = dt_bpf_map_update(dtp->dt_cpumap_fd, &key, data);
> > > > > - dt_free(dtp, data);
> > > > > - if (rc == -1)
> > > > > - return dt_bpf_error(dtp,
> > > > > - "cannot update BPF map 'cpuinfo': %s\n",
> > > > > - strerror(errno));
> > > > > + memset(&data, 0, sizeof(data));
> > > > Do we need this, because (see below)....
> > > >
> > > > > + for (i = 0, ci = &conf->cpus[0]; i < ncpus; i++, ci++) {
> > > > > + memcpy(&data.ci, ci, sizeof(cpuinfo_t));
> > > > Do we need this, because (see below)....
> > > >
> > > > > + key = ci->cpu_id;
> > > > > + if (dt_bpf_map_update(dtp->dt_cpumap_fd, &key, &data) == -1)
> > > > Why can't we simply do:
> > > >
> > > > if (dt_bpf_map_update(dtp->dt_cpumap_fd, &key, ci) == -1)
> > > I think the problem is that the BPF map has elements of size
> > > sizeof(dt_bpf_cpuinfo_t), while ci only has size sizeof(cpuinfo_t), which
> > > is smaller. So if we do an update like that, the map element ends up with
> > > stray data where we want it to be initialized to 0.
> > Yes, but I am 99% certain that BPF maps are allocated and initialized with
> > zeros, because doing otherwise would be a major security risk for the kernel.
> > So you can count on that (we should verify first to make certain, but honestly
> > it has to be that way, or it could leak kernel data, which is a big no-no).
>
> I don't think that helps.  We do not feed map_update() a size; we cannot
> say "fill in only the first few bytes of the element."  We just point at
> the new value and the BPF call copies as many bytes as the map element
> holds.  Unless we prepare a zero-padded copy of the data, map_update()
> will read past the end of the cpuinfo_t and copy in bytes that should be
> zero but are actually data belonging to other CPUs.
Ah yes, you are right of course. Nevermind.
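
To spell the point out for the archives, the patch's approach in plain C is
essentially its own loop, with the reason for the staging element annotated
(error reporting trimmed):

    dt_bpf_cpuinfo_t data;          /* zero-padded staging element */
    uint32_t key;

    memset(&data, 0, sizeof(data));
    for (i = 0, ci = &conf->cpus[0]; i < ncpus; i++, ci++) {
            /* Only the leading cpuinfo_t part changes per CPU; the
             * trailing drop counters stay zero. */
            memcpy(&data.ci, ci, sizeof(cpuinfo_t));
            key = ci->cpu_id;

            /* The update copies sizeof(dt_bpf_cpuinfo_t) bytes from the
             * value pointer, so passing ci directly would pull in bytes
             * that belong to the next entries of conf->cpus[]. */
            if (dt_bpf_map_update(dtp->dt_cpumap_fd, &key, &data) == -1)
                    return -1;
    }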
>
> > > > > + return dt_bpf_error(dtp,
> > > > > + "cannot update BPF map 'cpuinfo': %s\n",
> > > > > + strerror(errno));
> > > > > + }
> > > > > return 0;
> > > > > }
> > > > > diff --git a/libdtrace/dt_cg.c b/libdtrace/dt_cg.c
> > > > > index 6dcf4cd3d..d83b1c2ce 100644
> > > > > --- a/libdtrace/dt_cg.c
> > > > > +++ b/libdtrace/dt_cg.c
> > > > > @@ -1243,9 +1243,12 @@ dt_cg_epilogue(dt_pcb_t *pcb)
> > > > > } else {
> > > > > idp = dt_dlib_get_map(dtp, "cpuinfo");
> > > > > assert(idp != NULL);
> > > > > +
> > > > > + emit(dlp, BPF_CALL_HELPER(BPF_FUNC_get_smp_processor_id));
> > > > > +
> > > > > dt_cg_xsetx(dlp, idp, DT_LBL_NONE, BPF_REG_1, idp->di_id);
> > > > > emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_2, BPF_REG_FP, DT_STK_SP));
> > > > > - emit(dlp, BPF_STORE_IMM(BPF_DW, BPF_REG_2, 0, 0));
> > > > > + emit(dlp, BPF_STORE(BPF_DW, BPF_REG_2, 0, BPF_REG_0));
> > > > > emit(dlp, BPF_CALL_HELPER(BPF_FUNC_map_lookup_elem));
> > > > > emit(dlp, BPF_BRANCH_IMM(BPF_JEQ, BPF_REG_0, 0, pcb->pcb_exitlbl));
> > > > > emit(dlp, BPF_MOV_IMM(BPF_REG_1, 1));
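In C terms, the generated epilogue now amounts to something like the
following (illustrative only; the real code is the emitted BPF above, and the
key actually lives in the DT_STK_SP stack slot):

    uint32_t key = bpf_get_smp_processor_id();      /* was a hardcoded 0 */
    dt_bpf_cpuinfo_t *ci = bpf_map_lookup_elem(&cpuinfo, &key);

    if (ci == NULL)
            goto exit;                              /* pcb->pcb_exitlbl */
    /* ... the rest of the epilogue is unchanged ... */
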
> > > > > diff --git a/libdtrace/dt_prov_lockstat.c b/libdtrace/dt_prov_lockstat.c
> > > > > index c73edf9be..8b2cf4da2 100644
> > > > > --- a/libdtrace/dt_prov_lockstat.c
> > > > > +++ b/libdtrace/dt_prov_lockstat.c
> > > > > @@ -121,11 +121,13 @@ static void get_cpuinfo(dtrace_hdl_t *dtp, dt_irlist_t *dlp, uint_t exitlbl)
> > > > > {
> > > > > dt_ident_t *idp = dt_dlib_get_map(dtp, "cpuinfo");
> > > > > + emit(dlp, BPF_CALL_HELPER(BPF_FUNC_get_smp_processor_id));
> > > > > +
> > > > > assert(idp != NULL);
> > > > > dt_cg_xsetx(dlp, idp, DT_LBL_NONE, BPF_REG_1, idp->di_id);
> > > > > emit(dlp, BPF_MOV_REG(BPF_REG_2, BPF_REG_FP));
> > > > > emit(dlp, BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, DT_TRAMP_SP_BASE));
> > > > > - emit(dlp, BPF_STORE_IMM(BPF_DW, BPF_REG_2, 0, 0));
> > > > > + emit(dlp, BPF_STORE(BPF_DW, BPF_REG_2, 0, BPF_REG_0));
> > > > > emit(dlp, BPF_CALL_HELPER(BPF_FUNC_map_lookup_elem));
> > > > > emit(dlp, BPF_BRANCH_IMM(BPF_JEQ, BPF_REG_0, 0, exitlbl));
> > > > > emit(dlp, BPF_MOV_REG(BPF_REG_6, BPF_REG_0));
> > > > > diff --git a/libdtrace/dt_prov_sched.c b/libdtrace/dt_prov_sched.c
> > > > > index 3a218f3cb..a548e679f 100644
> > > > > --- a/libdtrace/dt_prov_sched.c
> > > > > +++ b/libdtrace/dt_prov_sched.c
> > > > > @@ -84,6 +84,40 @@ static int populate(dtrace_hdl_t *dtp)
> > > > > probe_args, probes);
> > > > > }
> > > > > +/*
> > > > > + * Get a pointer to the cpuinfo_t structure for the CPU associated
> > > > > + * with the runqueue that is in arg0.
> > > > > + *
> > > > > + * Clobbers %r1 through %r5
> > > > > + * Stores pointer to cpuinfo_t struct in %r0
> > > > > + */
> > > > > +static void get_cpuinfo(dtrace_hdl_t *dtp, dt_irlist_t *dlp, uint_t exitlbl)
> > > > > +{
> > > > > + dt_ident_t *idp = dt_dlib_get_map(dtp, "cpuinfo");
> > > > > +
> > > > > + assert(idp != NULL);
> > > > > +
> > > > > + /* Put the runqueue pointer from mst->arg0 into %r3. */
> > > > > + emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_3, BPF_REG_7, DMST_ARG(0)));
> > > > > +
> > > > > + /* Turn it into a pointer to its cpu member. */
> > > > > + emit(dlp, BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, dt_cg_ctf_offsetof("struct rq", "cpu", NULL, 1)));
> > > > > +
> > > > > + /* Call bpf_probe_read_kernel(%fp + DT_TRAMP_SP_SLOT[0], sizeof(int), %r3) */
> > > > > + emit(dlp, BPF_MOV_IMM(BPF_REG_2, (int) sizeof(int)));
> > > > > + emit(dlp, BPF_MOV_REG(BPF_REG_1, BPF_REG_FP));
> > > > > + emit(dlp, BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, DT_TRAMP_SP_SLOT(0)));
> > > > > + emit(dlp, BPF_CALL_HELPER(BPF_FUNC_probe_read_kernel));
> > > > > + emit(dlp, BPF_BRANCH_IMM(BPF_JNE, BPF_REG_0, 0, exitlbl));
> > > > > +
> > > > > + /* Now look up the corresponding cpuinfo_t. */
> > > > > + dt_cg_xsetx(dlp, idp, DT_LBL_NONE, BPF_REG_1, idp->di_id);
> > > > > + emit(dlp, BPF_MOV_REG(BPF_REG_2, BPF_REG_FP));
> > > > > + emit(dlp, BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, DT_TRAMP_SP_SLOT(0)));
> > > > > + emit(dlp, BPF_CALL_HELPER(BPF_FUNC_map_lookup_elem));
> > > > > + emit(dlp, BPF_BRANCH_IMM(BPF_JEQ, BPF_REG_0, 0, exitlbl));
> > > > > +}
> > > > > +
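For readability, here is roughly what that emitted sequence does, written as
C (a sketch; the offset of the cpu member comes from CTF in the real code,
and the value is parked in DT_TRAMP_SP_SLOT(0)):

    struct rq *rq = (struct rq *)mst->arg0;
    int cpu;
    dt_bpf_cpuinfo_t *ci;

    /* rq is a kernel pointer, so its cpu member has to be fetched with
     * bpf_probe_read_kernel(). */
    if (bpf_probe_read_kernel(&cpu, sizeof(cpu),
                              (char *)rq + offsetof(struct rq, cpu)) != 0)
            goto exit;

    /* Look up the cpuinfo_t for that CPU in the global hash map. */
    ci = bpf_map_lookup_elem(&cpuinfo, &cpu);
    if (ci == NULL)
            goto exit;

    /* %r0 now holds ci, which the trampolines below store into arg1. */
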
> > > > > /*
> > > > > * Generate a BPF trampoline for a SDT probe.
> > > > > *
> > > > > @@ -98,18 +132,39 @@ static int populate(dtrace_hdl_t *dtp)
> > > > > */
> > > > > static int trampoline(dt_pcb_t *pcb, uint_t exitlbl)
> > > > > {
> > > > > + dtrace_hdl_t *dtp = pcb->pcb_hdl;
> > > > > dt_irlist_t *dlp = &pcb->pcb_ir;
> > > > > dt_probe_t *prp = pcb->pcb_probe;
> > > > > if (strcmp(prp->desc->prb, "dequeue") == 0) {
> > > > > - emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_7, DMST_ARG(1)));
> > > > > - emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(0), BPF_REG_0));
> > > > > /*
> > > > > - * FIXME: arg1 should be a pointer to cpuinfo_t for the CPU
> > > > > - * associated with the runqueue.
> > > > > + * Get the runqueue from arg0 and place its cpuinfo_t* into %r0.
> > > > > + */
> > > > > + get_cpuinfo(dtp, dlp, exitlbl);
> > > > > +
> > > > > + /*
> > > > > + * Copy arg1 into arg0.
> > > > > */
> > > > > - emit(dlp, BPF_STORE_IMM(BPF_DW, BPF_REG_7, DMST_ARG(1), 0));
> > > > > + emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_3, BPF_REG_7, DMST_ARG(1)));
> > > > > + emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(0), BPF_REG_3));
> > > > > +
> > > > > + /* Store the cpuinfo_t* in %r0 into arg1. */
> > > > > + emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(1), BPF_REG_0));
> > > > > } else if (strcmp(prp->desc->prb, "enqueue") == 0) {
> > > > > + /*
> > > > > + * Get the runqueue from arg0 and place its cpuinfo_t* into %r0.
> > > > > + */
> > > > > + get_cpuinfo(dtp, dlp, exitlbl);
> > > > > +
> > > > > + /*
> > > > > + * Copy arg1 into arg0.
> > > > > + */
> > > > > + emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_3, BPF_REG_7, DMST_ARG(1)));
> > > > > + emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(0), BPF_REG_3));
> > > > > +
> > > > > + /* Store the cpuinfo_t* in %r0 into arg1. */
> > > > > + emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(1), BPF_REG_0));
> > > > > +
> > > > > /*
> > > > > * This is ugly but necessary... enqueue_task() takes a flags argument and the
> > > > > * ENQUEUE_HEAD flag is used to indicate that the task is to be placed at the
> > > > > @@ -120,15 +175,6 @@ static int trampoline(dt_pcb_t *pcb, uint_t exitlbl)
> > > > > * outside the kernel source tree.
> > > > > */
> > > > > #define ENQUEUE_HEAD 0x10
> > > > > -
> > > > > - emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_7, DMST_ARG(1)));
> > > > > - emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(0), BPF_REG_0));
> > > > > - /*
> > > > > - * FIXME: arg1 should be a pointer to cpuinfo_t for the CPU
> > > > > - * associated with the runqueue.
> > > > > - */
> > > > > - emit(dlp, BPF_STORE_IMM(BPF_DW, BPF_REG_7, DMST_ARG(1), 0));
> > > > > -
> > > > > emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_7, DMST_ARG(2)));
> > > > > emit(dlp, BPF_ALU64_IMM(BPF_AND, BPF_REG_0, ENQUEUE_HEAD));
> > > > > emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(2), BPF_REG_0));
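So after the trampoline, arg2 is just the masked flag, i.e. roughly:

    /* Sketch: arg2 is nonzero exactly when the task is enqueued at the
     * head of the runqueue. */
    mst->arg2 &= ENQUEUE_HEAD;      /* 0x10, mirrored from the kernel */
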
> > > > > diff --git a/libdtrace/dt_work.c b/libdtrace/dt_work.c
> > > > > index 498d5332a..2167ed299 100644
> > > > > --- a/libdtrace/dt_work.c
> > > > > +++ b/libdtrace/dt_work.c
> > > > > @@ -37,35 +37,29 @@ END_probe(void)
> > > > > int
> > > > > dt_check_cpudrops(dtrace_hdl_t *dtp, processorid_t cpu, dtrace_dropkind_t what)
> > > > > {
> > > > > - dt_bpf_cpuinfo_t *ci;
> > > > > - uint32_t cikey = 0;
> > > > > + dt_bpf_cpuinfo_t ci;
> > > > > + uint32_t cikey = cpu;
> > > > > uint64_t cnt;
> > > > > int rval = 0;
> > > > > assert(what == DTRACEDROP_PRINCIPAL || what == DTRACEDROP_AGGREGATION);
> > > > > - ci = dt_calloc(dtp, dtp->dt_conf.num_possible_cpus,
> > > > > - sizeof(dt_bpf_cpuinfo_t));
> > > > > - if (ci == NULL)
> > > > > - return dt_set_errno(dtp, EDT_NOMEM);
> > > > > -
> > > > > - if (dt_bpf_map_lookup(dtp->dt_cpumap_fd, &cikey, ci) == -1) {
> > > > > + if (dt_bpf_map_lookup(dtp->dt_cpumap_fd, &cikey, &ci) == -1) {
> > > > > rval = dt_set_errno(dtp, EDT_BPF);
> > > > > goto fail;
> > > > > }
> > > > > if (what == DTRACEDROP_PRINCIPAL) {
> > > > > - cnt = ci[cpu].buf_drops - dtp->dt_drops[cpu].buf;
> > > > > - dtp->dt_drops[cpu].buf = ci[cpu].buf_drops;
> > > > > + cnt = ci.buf_drops - dtp->dt_drops[cpu].buf;
> > > > > + dtp->dt_drops[cpu].buf = ci.buf_drops;
> > > > > } else {
> > > > > - cnt = ci[cpu].agg_drops - dtp->dt_drops[cpu].agg;
> > > > > - dtp->dt_drops[cpu].agg = ci[cpu].agg_drops;
> > > > > + cnt = ci.agg_drops - dtp->dt_drops[cpu].agg;
> > > > > + dtp->dt_drops[cpu].agg = ci.agg_drops;
> > > > > }
> > > > > rval = dt_handle_cpudrop(dtp, cpu, what, cnt);
> > > > > fail:
> > > > > - dt_free(dtp, ci);
> > > > > return rval;
> > > > > }
> > > > > diff --git a/test/unittest/sched/tst.enqueue.d b/test/unittest/sched/tst.enqueue.d
> > > > > index f445ac843..28dcace8c 100644
> > > > > --- a/test/unittest/sched/tst.enqueue.d
> > > > > +++ b/test/unittest/sched/tst.enqueue.d
> > > > > @@ -4,7 +4,6 @@
> > > > > * Licensed under the Universal Permissive License v 1.0 as shown at
> > > > > * http://oss.oracle.com/licenses/upl.
> > > > > */
> > > > > -/* @@xfail: dtv2 */
> > > > > #pragma D option switchrate=100hz
> > > > > #pragma D option destructive
> > > > > --
> > > > > 2.43.5
> > > > >