[DTrace-devel] [PATCH v3 2/2] Clean up sched provider trampoline FIXMEs
eugene.loh at oracle.com
eugene.loh at oracle.com
Thu Apr 3 05:02:52 UTC 2025
From: Eugene Loh <eugene.loh at oracle.com>
The sched provider trampoline for enqueue and dequeue probes had
pending FIXMEs for providing a cpuinfo_t* for the cpu associated
with the run queue. Implement the missing code.
Since the cpu associated with the run queue might be different from
the cpu where we are running, it becomes necessary to access the
cpuinfo for some random cpu. With Linux 5.18, there is a BPF
helper function map_lookup_percpu_elem() that allows such lookups
on per-cpu arrays. To support older kernels, however, we change
the cpuinfo BPF map from per-cpu to global. Also, it is a hash
table rather than an array in case cpus are not numbered consecutively.
Signed-off-by: Eugene Loh <eugene.loh at oracle.com>
---
bpf/get_agg.c | 2 +-
bpf/get_bvar.c | 2 +-
libdtrace/dt_bpf.c | 34 ++++++--------
libdtrace/dt_cg.c | 5 ++-
libdtrace/dt_prov_lockstat.c | 4 +-
libdtrace/dt_prov_sched.c | 74 +++++++++++++++++++++++++------
libdtrace/dt_work.c | 20 +++------
test/unittest/sched/tst.enqueue.d | 1 -
8 files changed, 89 insertions(+), 53 deletions(-)
diff --git a/bpf/get_agg.c b/bpf/get_agg.c
index c0eb825f0..e70caa6ef 100644
--- a/bpf/get_agg.c
+++ b/bpf/get_agg.c
@@ -21,7 +21,7 @@ extern struct bpf_map_def cpuinfo;
*/
noinline uint64_t *dt_no_agg(void)
{
- uint32_t key = 0;
+ uint32_t key = bpf_get_smp_processor_id();
dt_bpf_cpuinfo_t *ci;
ci = bpf_map_lookup_elem(&cpuinfo, &key);
diff --git a/bpf/get_bvar.c b/bpf/get_bvar.c
index d372b3445..d81c3605f 100644
--- a/bpf/get_bvar.c
+++ b/bpf/get_bvar.c
@@ -67,7 +67,7 @@ noinline uint64_t dt_bvar_caller(const dt_dctx_t *dctx)
noinline uint64_t dt_bvar_curcpu(const dt_dctx_t *dctx)
{
- uint32_t key = 0;
+ uint32_t key = bpf_get_smp_processor_id();
void *val = bpf_map_lookup_elem(&cpuinfo, &key);
if (val == NULL) {
diff --git a/libdtrace/dt_bpf.c b/libdtrace/dt_bpf.c
index 6d42a96c7..d6722cbd1 100644
--- a/libdtrace/dt_bpf.c
+++ b/libdtrace/dt_bpf.c
@@ -761,37 +761,29 @@ gmap_create_buffers(dtrace_hdl_t *dtp)
static int
gmap_create_cpuinfo(dtrace_hdl_t *dtp)
{
- int i, rc;
+ int i;
uint32_t key = 0;
dtrace_conf_t *conf = &dtp->dt_conf;
size_t ncpus = conf->num_online_cpus;
- dt_bpf_cpuinfo_t *data;
+ dt_bpf_cpuinfo_t data;
cpuinfo_t *ci;
- /*
- * num_possible_cpus <= num_online_cpus: see dt_conf_init.
- */
- data = dt_calloc(dtp, dtp->dt_conf.num_possible_cpus,
- sizeof(dt_bpf_cpuinfo_t));
- if (data == NULL)
- return dt_set_errno(dtp, EDT_NOMEM);
-
- for (i = 0, ci = &conf->cpus[0]; i < ncpus; i++, ci++)
- memcpy(&data[ci->cpu_id].ci, ci, sizeof(cpuinfo_t));
-
dtp->dt_cpumap_fd = create_gmap(dtp, "cpuinfo",
- BPF_MAP_TYPE_PERCPU_ARRAY,
+ BPF_MAP_TYPE_HASH,
sizeof(uint32_t),
- sizeof(dt_bpf_cpuinfo_t), 1);
+ sizeof(dt_bpf_cpuinfo_t), ncpus);
if (dtp->dt_cpumap_fd == -1)
return -1;
- rc = dt_bpf_map_update(dtp->dt_cpumap_fd, &key, data);
- dt_free(dtp, data);
- if (rc == -1)
- return dt_bpf_error(dtp,
- "cannot update BPF map 'cpuinfo': %s\n",
- strerror(errno));
+ memset(&data, 0, sizeof(data));
+ for (i = 0, ci = &conf->cpus[0]; i < ncpus; i++, ci++) {
+ memcpy(&data.ci, ci, sizeof(cpuinfo_t));
+ key = ci->cpu_id;
+ if (dt_bpf_map_update(dtp->dt_cpumap_fd, &key, &data) == -1)
+ return dt_bpf_error(dtp,
+ "cannot update BPF map 'cpuinfo': %s\n",
+ strerror(errno));
+ }
return 0;
}
diff --git a/libdtrace/dt_cg.c b/libdtrace/dt_cg.c
index 6dcf4cd3d..d83b1c2ce 100644
--- a/libdtrace/dt_cg.c
+++ b/libdtrace/dt_cg.c
@@ -1243,9 +1243,12 @@ dt_cg_epilogue(dt_pcb_t *pcb)
} else {
idp = dt_dlib_get_map(dtp, "cpuinfo");
assert(idp != NULL);
+
+ emit(dlp, BPF_CALL_HELPER(BPF_FUNC_get_smp_processor_id));
+
dt_cg_xsetx(dlp, idp, DT_LBL_NONE, BPF_REG_1, idp->di_id);
emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_2, BPF_REG_FP, DT_STK_SP));
- emit(dlp, BPF_STORE_IMM(BPF_DW, BPF_REG_2, 0, 0));
+ emit(dlp, BPF_STORE(BPF_DW, BPF_REG_2, 0, BPF_REG_0));
emit(dlp, BPF_CALL_HELPER(BPF_FUNC_map_lookup_elem));
emit(dlp, BPF_BRANCH_IMM(BPF_JEQ, BPF_REG_0, 0, pcb->pcb_exitlbl));
emit(dlp, BPF_MOV_IMM(BPF_REG_1, 1));
diff --git a/libdtrace/dt_prov_lockstat.c b/libdtrace/dt_prov_lockstat.c
index c73edf9be..8b2cf4da2 100644
--- a/libdtrace/dt_prov_lockstat.c
+++ b/libdtrace/dt_prov_lockstat.c
@@ -121,11 +121,13 @@ static void get_cpuinfo(dtrace_hdl_t *dtp, dt_irlist_t *dlp, uint_t exitlbl)
{
dt_ident_t *idp = dt_dlib_get_map(dtp, "cpuinfo");
+ emit(dlp, BPF_CALL_HELPER(BPF_FUNC_get_smp_processor_id));
+
assert(idp != NULL);
dt_cg_xsetx(dlp, idp, DT_LBL_NONE, BPF_REG_1, idp->di_id);
emit(dlp, BPF_MOV_REG(BPF_REG_2, BPF_REG_FP));
emit(dlp, BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, DT_TRAMP_SP_BASE));
- emit(dlp, BPF_STORE_IMM(BPF_DW, BPF_REG_2, 0, 0));
+ emit(dlp, BPF_STORE(BPF_DW, BPF_REG_2, 0, BPF_REG_0));
emit(dlp, BPF_CALL_HELPER(BPF_FUNC_map_lookup_elem));
emit(dlp, BPF_BRANCH_IMM(BPF_JEQ, BPF_REG_0, 0, exitlbl));
emit(dlp, BPF_MOV_REG(BPF_REG_6, BPF_REG_0));
diff --git a/libdtrace/dt_prov_sched.c b/libdtrace/dt_prov_sched.c
index 3a218f3cb..a548e679f 100644
--- a/libdtrace/dt_prov_sched.c
+++ b/libdtrace/dt_prov_sched.c
@@ -84,6 +84,40 @@ static int populate(dtrace_hdl_t *dtp)
probe_args, probes);
}
+/*
+ * Get a pointer to the cpuinfo_t structure for the CPU associated
+ * with the runqueue that is in arg0.
+ *
+ * Clobbers %r1 through %r5
+ * Stores pointer to cpuinfo_t struct in %r0
+ */
+static void get_cpuinfo(dtrace_hdl_t *dtp, dt_irlist_t *dlp, uint_t exitlbl)
+{
+ dt_ident_t *idp = dt_dlib_get_map(dtp, "cpuinfo");
+
+ assert(idp != NULL);
+
+ /* Put the runqueue pointer from mst->arg0 into %r3. */
+ emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_3, BPF_REG_7, DMST_ARG(0)));
+
+ /* Turn it into a pointer to its cpu member. */
+ emit(dlp, BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, dt_cg_ctf_offsetof("struct rq", "cpu", NULL, 1)));
+
+ /* Call bpf_probe_read_kernel(%fp + DT_TRAMP_SP_SLOT[0], sizeof(int), %r3) */
+ emit(dlp, BPF_MOV_IMM(BPF_REG_2, (int) sizeof(int)));
+ emit(dlp, BPF_MOV_REG(BPF_REG_1, BPF_REG_FP));
+ emit(dlp, BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, DT_TRAMP_SP_SLOT(0)));
+ emit(dlp, BPF_CALL_HELPER(BPF_FUNC_probe_read_kernel));
+ emit(dlp, BPF_BRANCH_IMM(BPF_JNE, BPF_REG_0, 0, exitlbl));
+
+ /* Now look up the corresponding cpuinfo_t. */
+ dt_cg_xsetx(dlp, idp, DT_LBL_NONE, BPF_REG_1, idp->di_id);
+ emit(dlp, BPF_MOV_REG(BPF_REG_2, BPF_REG_FP));
+ emit(dlp, BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, DT_TRAMP_SP_SLOT(0)));
+ emit(dlp, BPF_CALL_HELPER(BPF_FUNC_map_lookup_elem));
+ emit(dlp, BPF_BRANCH_IMM(BPF_JEQ, BPF_REG_0, 0, exitlbl));
+}
+
/*
* Generate a BPF trampoline for a SDT probe.
*
@@ -98,18 +132,39 @@ static int populate(dtrace_hdl_t *dtp)
*/
static int trampoline(dt_pcb_t *pcb, uint_t exitlbl)
{
+ dtrace_hdl_t *dtp = pcb->pcb_hdl;
dt_irlist_t *dlp = &pcb->pcb_ir;
dt_probe_t *prp = pcb->pcb_probe;
if (strcmp(prp->desc->prb, "dequeue") == 0) {
- emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_7, DMST_ARG(1)));
- emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(0), BPF_REG_0));
/*
- * FIXME: arg1 should be a pointer to cpuinfo_t for the CPU
- * associated with the runqueue.
+ * Get the runqueue from arg0 and place its cpuinfo_t* into %r0.
+ */
+ get_cpuinfo(dtp, dlp, exitlbl);
+
+ /*
+ * Copy arg1 into arg0.
*/
- emit(dlp, BPF_STORE_IMM(BPF_DW, BPF_REG_7, DMST_ARG(1), 0));
+ emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_3, BPF_REG_7, DMST_ARG(1)));
+ emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(0), BPF_REG_3));
+
+ /* Store the cpuinfo_t* in %r0 into arg1. */
+ emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(1), BPF_REG_0));
} else if (strcmp(prp->desc->prb, "enqueue") == 0) {
+ /*
+ * Get the runqueue from arg0 and place its cpuinfo_t* into %r0.
+ */
+ get_cpuinfo(dtp, dlp, exitlbl);
+
+ /*
+ * Copy arg1 into arg0.
+ */
+ emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_3, BPF_REG_7, DMST_ARG(1)));
+ emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(0), BPF_REG_3));
+
+ /* Store the cpuinfo_t* in %r0 into arg1. */
+ emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(1), BPF_REG_0));
+
/*
* This is ugly but necessary... enqueue_task() takes a flags argument and the
* ENQUEUE_HEAD flag is used to indicate that the task is to be placed at the
@@ -120,15 +175,6 @@ static int trampoline(dt_pcb_t *pcb, uint_t exitlbl)
* outside the kernel source tree.
*/
#define ENQUEUE_HEAD 0x10
-
- emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_7, DMST_ARG(1)));
- emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(0), BPF_REG_0));
- /*
- * FIXME: arg1 should be a pointer to cpuinfo_t for the CPU
- * associated with the runqueue.
- */
- emit(dlp, BPF_STORE_IMM(BPF_DW, BPF_REG_7, DMST_ARG(1), 0));
-
emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_7, DMST_ARG(2)));
emit(dlp, BPF_ALU64_IMM(BPF_AND, BPF_REG_0, ENQUEUE_HEAD));
emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(2), BPF_REG_0));
diff --git a/libdtrace/dt_work.c b/libdtrace/dt_work.c
index 498d5332a..2167ed299 100644
--- a/libdtrace/dt_work.c
+++ b/libdtrace/dt_work.c
@@ -37,35 +37,29 @@ END_probe(void)
int
dt_check_cpudrops(dtrace_hdl_t *dtp, processorid_t cpu, dtrace_dropkind_t what)
{
- dt_bpf_cpuinfo_t *ci;
- uint32_t cikey = 0;
+ dt_bpf_cpuinfo_t ci;
+ uint32_t cikey = cpu;
uint64_t cnt;
int rval = 0;
assert(what == DTRACEDROP_PRINCIPAL || what == DTRACEDROP_AGGREGATION);
- ci = dt_calloc(dtp, dtp->dt_conf.num_possible_cpus,
- sizeof(dt_bpf_cpuinfo_t));
- if (ci == NULL)
- return dt_set_errno(dtp, EDT_NOMEM);
-
- if (dt_bpf_map_lookup(dtp->dt_cpumap_fd, &cikey, ci) == -1) {
+ if (dt_bpf_map_lookup(dtp->dt_cpumap_fd, &cikey, &ci) == -1) {
rval = dt_set_errno(dtp, EDT_BPF);
goto fail;
}
if (what == DTRACEDROP_PRINCIPAL) {
- cnt = ci[cpu].buf_drops - dtp->dt_drops[cpu].buf;
- dtp->dt_drops[cpu].buf = ci[cpu].buf_drops;
+ cnt = ci.buf_drops - dtp->dt_drops[cpu].buf;
+ dtp->dt_drops[cpu].buf = ci.buf_drops;
} else {
- cnt = ci[cpu].agg_drops - dtp->dt_drops[cpu].agg;
- dtp->dt_drops[cpu].agg = ci[cpu].agg_drops;
+ cnt = ci.agg_drops - dtp->dt_drops[cpu].agg;
+ dtp->dt_drops[cpu].agg = ci.agg_drops;
}
rval = dt_handle_cpudrop(dtp, cpu, what, cnt);
fail:
- dt_free(dtp, ci);
return rval;
}
diff --git a/test/unittest/sched/tst.enqueue.d b/test/unittest/sched/tst.enqueue.d
index f445ac843..28dcace8c 100644
--- a/test/unittest/sched/tst.enqueue.d
+++ b/test/unittest/sched/tst.enqueue.d
@@ -4,7 +4,6 @@
* Licensed under the Universal Permissive License v 1.0 as shown at
* http://oss.oracle.com/licenses/upl.
*/
-/* @@xfail: dtv2 */
#pragma D option switchrate=100hz
#pragma D option destructive
--
2.43.5
More information about the DTrace-devel
mailing list