[DTrace-devel] [PATCH 2/2] Extend the USDT bit mask to multiple words
eugene.loh at oracle.com
eugene.loh at oracle.com
Thu Feb 20 04:43:50 UTC 2025
From: Eugene Loh <eugene.loh at oracle.com>
Currently, USDT is limited to 64 probe descriptions because the
underlying probe uses a single 64-bit mask to decide which clauses to
execute. Change to a multi-word bit mask that can grow to cover however
many probe descriptions there are.
Also, change the mask words to be 32-bit rather than 64-bit. The reason
is that, commonly, there will be fewer than 32 probe descriptions. In
this case, the value of the "USDT prids" BPF map shrinks from 16 bytes
    uint32_t prid;
    long long mask[1];
down to 8 bytes
    uint32_t prid;
    uint32_t mask[1];
(The second member is smaller and no longer needs 4 bytes of alignment
padding in front of it.)
We also add an
    extern int usdt_prids_map_val_extra_bytes;
to denote how many extra bytes, beyond sizeof(usdt_prids_map_val_t),
will be needed for the extended mask.
This value is computed by usdt_prids_map_val_extra_bytes_init().
Currently, this function is awkwardly called in gmap_create_usdt(),
just before the value is needed. Such a call to a provider-specific
function is clumsy, but there are no other calls to the provider
between compilation (where the number of statements is determined)
and this map creation.
Signed-off-by: Eugene Loh <eugene.loh at oracle.com>
---
libdtrace/dt_bpf.c | 6 +-
libdtrace/dt_bpf_maps.h | 3 +-
libdtrace/dt_prov_uprobe.c | 81 ++++++++---
.../usdt/tst.many_probe_descriptions.r | 1 +
.../usdt/tst.many_probe_descriptions.sh | 64 +++++++++
.../usdt/tst.many_probe_descriptions2.r | 1 +
.../usdt/tst.many_probe_descriptions2.sh | 127 ++++++++++++++++++
7 files changed, 260 insertions(+), 23 deletions(-)
create mode 100644 test/unittest/usdt/tst.many_probe_descriptions.r
create mode 100755 test/unittest/usdt/tst.many_probe_descriptions.sh
create mode 100644 test/unittest/usdt/tst.many_probe_descriptions2.r
create mode 100755 test/unittest/usdt/tst.many_probe_descriptions2.sh
diff --git a/libdtrace/dt_bpf.c b/libdtrace/dt_bpf.c
index 662fd81a4..1ed9376ea 100644
--- a/libdtrace/dt_bpf.c
+++ b/libdtrace/dt_bpf.c
@@ -940,6 +940,7 @@ gmap_create_probes(dtrace_hdl_t *dtp)
return 0;
}
+void usdt_prids_map_val_extra_bytes_init(dtrace_hdl_t *dtp);
/*
* Create the 'usdt_names' and 'usdt_prids' BPF maps.
*
@@ -965,8 +966,11 @@ gmap_create_usdt(dtrace_hdl_t *dtp)
if (dtp->dt_usdt_namesmap_fd == -1)
return -1;
+ usdt_prids_map_val_extra_bytes_init(dtp);
+
dtp->dt_usdt_pridsmap_fd = create_gmap(dtp, "usdt_prids", BPF_MAP_TYPE_HASH,
- sizeof(usdt_prids_map_key_t), sizeof(usdt_prids_map_val_t), nusdtprobes);
+ sizeof(usdt_prids_map_key_t),
+ sizeof(usdt_prids_map_val_t) + usdt_prids_map_val_extra_bytes, nusdtprobes);
if (dtp->dt_usdt_pridsmap_fd == -1)
return -1;
diff --git a/libdtrace/dt_bpf_maps.h b/libdtrace/dt_bpf_maps.h
index 884dc3983..ba17d8942 100644
--- a/libdtrace/dt_bpf_maps.h
+++ b/libdtrace/dt_bpf_maps.h
@@ -48,8 +48,9 @@ typedef struct usdt_prids_map_key {
} usdt_prids_map_key_t;
typedef struct usdt_prids_map_val {
uint32_t prid; /* should be dtrace_id_t, sys/dtrace_types.h */
- long long mask;
+ uint32_t mask[1];
} usdt_prids_map_val_t;
+extern int usdt_prids_map_val_extra_bytes;
#ifdef __cplusplus
}
diff --git a/libdtrace/dt_prov_uprobe.c b/libdtrace/dt_prov_uprobe.c
index f1323cc31..2a5b0ce91 100644
--- a/libdtrace/dt_prov_uprobe.c
+++ b/libdtrace/dt_prov_uprobe.c
@@ -76,6 +76,8 @@ typedef struct list_key {
usdt_prids_map_key_t key;
} list_key_t;
+int usdt_prids_map_val_extra_bytes;
+
static const dtrace_pattr_t pattr = {
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
@@ -175,7 +177,7 @@ clean_usdt_probes(dtrace_hdl_t *dtp)
int fdprids = dtp->dt_usdt_pridsmap_fd;
int fdnames = dtp->dt_usdt_namesmap_fd;
usdt_prids_map_key_t key, nxt;
- usdt_prids_map_val_t val;
+ usdt_prids_map_val_t *val = alloca(sizeof(usdt_prids_map_val_t) + usdt_prids_map_val_extra_bytes);
list_key_t keys_to_delete, *elem, *elem_next;
dt_probe_t *prp, *prp_next;
@@ -190,7 +192,7 @@ clean_usdt_probes(dtrace_hdl_t *dtp)
while (dt_bpf_map_next_key(fdprids, &key, &nxt) == 0) {
memcpy(&key, &nxt, sizeof(usdt_prids_map_key_t));
- if (dt_bpf_map_lookup(fdprids, &key, &val) == -1)
+ if (dt_bpf_map_lookup(fdprids, &key, val) == -1)
return dt_set_errno(dtp, EDT_BPF);
/* Check if the process is still running. */
@@ -203,7 +205,7 @@ clean_usdt_probes(dtrace_hdl_t *dtp)
* we might delete the same usdt_names entry
* multiple times. That's okay.
*/
- dt_bpf_map_delete(fdnames, &val.prid);
+ dt_bpf_map_delete(fdnames, &val->prid);
/*
* Delete the usdt_prids entry.
@@ -224,7 +226,7 @@ clean_usdt_probes(dtrace_hdl_t *dtp)
* FIXME. There might be another case, where the process
* is still running, but some of its USDT probes are gone?
* So maybe we have to check for the existence of one of
- * dtrace_probedesc_t *pdp = dtp->dt_probes[val.prid]->desc;
+ * dtrace_probedesc_t *pdp = dtp->dt_probes[val->prid]->desc;
* char *prv = ...pdp->prv minus the numerial part;
*
* /run/dtrace/probes/$pid/$pdp->prv/$pdp->mod/$pdp->fun/$pdp->prb
@@ -346,6 +348,33 @@ ignore_clause(dtrace_hdl_t *dtp, int n, const dt_probe_t *uprp)
return 0;
}
+void usdt_prids_map_val_extra_bytes_init(dtrace_hdl_t *dtp) {
+ int i, n = 0, w = sizeof(((usdt_prids_map_val_t *)0)->mask[0]);
+
+ /*
+  * Set usdt_prids_map_val_extra_bytes: the number of bytes of bit mask
+  * needed beyond the one mask word that usdt_prids_map_val_t already
+  * holds.  n is reused below, first as a count of statements (one mask
+  * bit each), then of bytes, then of mask words; w is the size of one
+  * mask word in bytes.
+  *
+  * Count how many statements cannot be ignored, regardless of uprp.
+  */
+ for (i = 0; i < dtp->dt_stmt_nextid; i++) {
+ dtrace_stmtdesc_t *stp;
+
+ stp = dtp->dt_stmts[i];
+ if (stp == NULL || ignore_clause(dtp, i, NULL))
+ continue;
+
+ n++;
+ }
+
+ /* Determine how many bytes are needed for this many bits (rounding up). */
+ n = (n + CHAR_BIT - 1) / CHAR_BIT;
+
+ /* Determine how many words are needed for this many bytes (rounding up). */
+ n = (n + w - 1) / w;
+
+ /*
+  * Determine how many extra bytes are needed.  The first word, mask[0],
+  * is part of usdt_prids_map_val_t itself, so only words past it count.
+  */
+ if (n > 1)
+ usdt_prids_map_val_extra_bytes = (n - 1) * w;
+ else
+ usdt_prids_map_val_extra_bytes = 0;
+}
+
static int add_probe_uprobe(dtrace_hdl_t *dtp, dt_probe_t *prp)
{
dtrace_difo_t *dp;
@@ -416,6 +445,7 @@ static int add_probe_usdt(dtrace_hdl_t *dtp, dt_probe_t *prp)
int fd = dtp->dt_usdt_namesmap_fd;
pid_t pid;
list_probe_t *pup;
+ usdt_prids_map_val_t *val;
/* Add probe name elements to usdt_names map. */
p = probnam;
@@ -451,11 +481,11 @@ static int add_probe_usdt(dtrace_hdl_t *dtp, dt_probe_t *prp)
}
/* Add prid and bit mask to usdt_prids map. */
+ val = alloca(sizeof(usdt_prids_map_val_t) + usdt_prids_map_val_extra_bytes);
for (pup = prp->prv_data; pup != NULL; pup = dt_list_next(pup)) {
dt_probe_t *uprp = pup->probe;
- long long mask = 0, bit = 1;
+ uint32_t iword = 0, mask = 0, bit = 1;
usdt_prids_map_key_t key;
- usdt_prids_map_val_t val;
dt_uprobe_t *upp = uprp->prv_data;
/*
@@ -473,11 +503,15 @@ static int add_probe_usdt(dtrace_hdl_t *dtp, dt_probe_t *prp)
dtrace_stmtdesc_t *stp;
stp = dtp->dt_stmts[n];
- if (stp == NULL)
+ if (stp == NULL || ignore_clause(dtp, n, uprp))
continue;
- if (ignore_clause(dtp, n, uprp))
- continue;
+ if (bit == 0) {
+ val->mask[iword] = mask;
+ mask = 0;
+ iword++;
+ bit = 1;
+ }
if (dt_gmatch(prp->desc->prv, stp->dtsd_ecbdesc->dted_probe.prv) &&
dt_gmatch(prp->desc->mod, stp->dtsd_ecbdesc->dted_probe.mod) &&
@@ -492,11 +526,11 @@ static int add_probe_usdt(dtrace_hdl_t *dtp, dt_probe_t *prp)
key.pid = pid;
key.uprid = uprp->desc->id;
- val.prid = prp->desc->id;
- val.mask = mask;
+ val->prid = prp->desc->id;
+ val->mask[iword] = mask;
// FIXME Check return value, but how should errors be handled?
- dt_bpf_map_update(dtp->dt_usdt_pridsmap_fd, &key, &val);
+ dt_bpf_map_update(dtp->dt_usdt_pridsmap_fd, &key, val);
}
return 0;
@@ -922,7 +956,7 @@ static int trampoline(dt_pcb_t *pcb, uint_t exitlbl)
const list_probe_t *pop;
uint_t lbl_exit = pcb->pcb_exitlbl;
dt_ident_t *usdt_prids = dt_dlib_get_map(dtp, "usdt_prids");
- int n;
+ int n, ibit, w = CHAR_BIT * sizeof(((usdt_prids_map_val_t *)0)->mask[0]);
assert(usdt_prids != NULL);
@@ -1020,7 +1054,8 @@ static int trampoline(dt_pcb_t *pcb, uint_t exitlbl)
*/
assert(sizeof(usdt_prids_map_key_t) <= DT_STK_SLOT_SZ);
emit(dlp, BPF_STORE(BPF_W, BPF_REG_FP, DT_TRAMP_SP_SLOT(0), BPF_REG_0));
- emit(dlp, BPF_STORE_IMM(BPF_W, BPF_REG_FP, DT_TRAMP_SP_SLOT(0) + sizeof(pid_t), uprp->desc->id));
+ emit(dlp, BPF_STORE_IMM(BPF_W, BPF_REG_FP,
+ DT_TRAMP_SP_SLOT(0) + (int)sizeof(pid_t), uprp->desc->id));
dt_cg_xsetx(dlp, usdt_prids, DT_LBL_NONE, BPF_REG_1, usdt_prids->di_id);
emit(dlp, BPF_MOV_REG(BPF_REG_2, BPF_REG_FP));
emit(dlp, BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, DT_TRAMP_SP_SLOT(0)));
@@ -1054,27 +1089,30 @@ static int trampoline(dt_pcb_t *pcb, uint_t exitlbl)
emit(dlp, BPF_LOAD(BPF_W, BPF_REG_1, BPF_REG_0, 0));
emit(dlp, BPF_STORE(BPF_W, BPF_REG_7, DMST_PRID, BPF_REG_1));
- /* Read the bit mask from the table lookup in %r6. */ // FIXME someday, extend this past 64 bits
- emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_6, BPF_REG_0, offsetof(usdt_prids_map_val_t, mask)));
+ /* Save the map-value pointer (%r0) in DT_TRAMP_SP_SLOT(0) for reuse. */
+ emit(dlp, BPF_STORE(BPF_DW, BPF_REG_FP, DT_TRAMP_SP_SLOT(0), BPF_REG_0));
/*
* Hold the bit mask in %r6 between clause calls.
*/
- for (n = 0; n < dtp->dt_stmt_nextid; n++) {
+ for (ibit = n = 0; n < dtp->dt_stmt_nextid; n++) {
dtrace_stmtdesc_t *stp;
dt_ident_t *idp;
uint_t lbl_next;
stp = dtp->dt_stmts[n];
- if (stp == NULL)
- continue;
-
- if (ignore_clause(dtp, n, uprp))
+ if (stp == NULL || ignore_clause(dtp, n, uprp))
continue;
idp = stp->dtsd_clause;
lbl_next = dt_irlist_label(dlp);
+ /* Load the next word of the bit mask into %r6. */
+ if (ibit % w == 0) {
+ emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_FP, DT_TRAMP_SP_SLOT(0)));
+ emit(dlp, BPF_LOAD(BPF_W, BPF_REG_6, BPF_REG_0, offsetof(usdt_prids_map_val_t, mask[ibit / w])));
+ }
+
/* If the lowest %r6 bit is 0, skip over this clause. */
emit(dlp, BPF_MOV_REG(BPF_REG_1, BPF_REG_6));
emit(dlp, BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 1));
@@ -1102,6 +1140,7 @@ static int trampoline(dt_pcb_t *pcb, uint_t exitlbl)
/* Right-shift %r6. */
emit(dlp, BPF_ALU64_IMM(BPF_RSH, BPF_REG_6, 1));
+ ibit++;
}
out:
diff --git a/test/unittest/usdt/tst.many_probe_descriptions.r b/test/unittest/usdt/tst.many_probe_descriptions.r
new file mode 100644
index 000000000..2e9ba477f
--- /dev/null
+++ b/test/unittest/usdt/tst.many_probe_descriptions.r
@@ -0,0 +1 @@
+success
diff --git a/test/unittest/usdt/tst.many_probe_descriptions.sh b/test/unittest/usdt/tst.many_probe_descriptions.sh
new file mode 100755
index 000000000..92a61d5b7
--- /dev/null
+++ b/test/unittest/usdt/tst.many_probe_descriptions.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+#
+# Oracle Linux DTrace.
+# Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at
+# http://oss.oracle.com/licenses/upl.
+
+dtrace=$1
+TRIGGER=$PWD/test/triggers/usdt-tst-args
+
+DIRNAME="$tmpdir/usdt-many_probe_descriptions.$$.$RANDOM"
+mkdir -p $DIRNAME
+cd $DIRNAME
+
+# Construct the D scripts (D0.d, D1.d, D2.d) and the expected output.
+# Each of the 3 scripts gets 80 clauses on the same probe, so that USDT
+# must handle hundreds of probe descriptions (well past the old limit of 64).
+for d in 0 1 2; do
+for x in 00 01 02 03 04 05 06 07 08 09 \
+ 10 11 12 13 14 15 16 17 18 19 \
+ 20 21 22 23 24 25 26 27 28 29 \
+ 30 31 32 33 34 35 36 37 38 39 \
+ 40 41 42 43 44 45 46 47 48 49 \
+ 50 51 52 53 54 55 56 57 58 59 \
+ 60 61 62 63 64 65 66 67 68 69 \
+ 70 71 72 73 74 75 76 77 78 79 \
+; do
+ echo 'test_prov$target:::place { printf("'$d$x'\n"); }' >> D$d.d
+ echo $d$x >> expect.txt
+done
+done
+echo 'test_prov$target:::place { exit(0); }' >> D$d.d   # d is still 2 here: the exit clause ends D2.d
+echo >> expect.txt   # NOTE(review): presumably matches a final blank line in the dtrace output -- confirm
+
+# Run DTrace on all three scripts; stdout and stderr both go to actual.txt.
+
+$dtrace $dt_flags -c $TRIGGER -q -s D0.d -s D1.d -s D2.d >& actual.txt
+if [ $? -eq 0 ]; then
+ if diff -q expect.txt actual.txt > /dev/null; then
+ echo success
+ exit 0
+ else
+ echo ERROR: did not get expected results
+ echo === expect.txt
+ cat expect.txt
+ echo === actual.txt
+ cat actual.txt
+ echo === diff
+ diff expect.txt actual.txt
+ fi
+else
+ echo ERROR: dtrace error
+ echo ==== output
+ cat actual.txt
+fi
+
+echo ==== script D0.d   # only reached on failure: dump the generated scripts
+cat D0.d
+echo ==== script D1.d
+cat D1.d
+echo ==== script D2.d
+cat D2.d
+
+exit 1
diff --git a/test/unittest/usdt/tst.many_probe_descriptions2.r b/test/unittest/usdt/tst.many_probe_descriptions2.r
new file mode 100644
index 000000000..2e9ba477f
--- /dev/null
+++ b/test/unittest/usdt/tst.many_probe_descriptions2.r
@@ -0,0 +1 @@
+success
diff --git a/test/unittest/usdt/tst.many_probe_descriptions2.sh b/test/unittest/usdt/tst.many_probe_descriptions2.sh
new file mode 100755
index 000000000..cc8821c6e
--- /dev/null
+++ b/test/unittest/usdt/tst.many_probe_descriptions2.sh
@@ -0,0 +1,127 @@
+#!/bin/bash
+#
+# Oracle Linux DTrace.
+# Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at
+# http://oss.oracle.com/licenses/upl.
+
+# This test uses many probes and probe descriptions. Therefore, the
+# number of BPF programs to load into the kernel -- dt_bpf_load_prog()
+# calling prp->prov->impl->load_prog(), which is dt_bpf_prog_load() --
+# and the duration of each load are both increasing.
+# @@timeout: 400
+
+dtrace=$1
+
+DIRNAME="$tmpdir/usdt-many_probe_descriptions2.$$.$RANDOM"
+mkdir -p $DIRNAME
+cd $DIRNAME
+
+# Set the lists.
+# - The probes will be foo$x$y.
+# - The probe descriptions will be foo$x* and foo*$y, for each $d.
+# So if there are nx items in xlist, ny in ylist, and nd in dlist,
+# - there will be roughly nx*ny probes (13*13 = 169 with the lists below)
+# - there will be roughly (nx+ny)*nd probe descriptions ((13+13)*9 = 234 below)
+
+xlist="a b c d e f g h i j k l m"
+ylist="n o p q r s t u v w x y z"
+dlist="0 1 2 3 4 5 6 7 8"
+
+# Make the trigger: Preambles (provider definition prov.d and program main.c).
+
+echo "provider testprov {" > prov.d
+
+echo '#include "prov.h"' > main.c
+echo 'int main(int argc, char **argv) {' >> main.c
+
+# Make the trigger: Loop over the probes, declaring each and firing it once.
+
+for x in $xlist; do
+for y in $ylist; do
+ echo "probe foo$x$y();" >> prov.d
+ echo "TESTPROV_FOO$x$y();" | awk '{ print(toupper($1)) }' >> main.c   # upper-cased; presumably to match the macro names in prov.h
+done
+done
+
+# Make the trigger: Epilogues.
+
+echo "};" >> prov.d
+echo "return 0; }" >> main.c
+
+# Build the trigger: header, object file, DOF object, then the executable.
+
+$dtrace $dt_flags -h -s prov.d
+if [ $? -ne 0 ]; then
+ echo "failed to generate header file" >&2
+ cat prov.d
+ exit 1
+fi
+gcc $test_cppflags -c main.c
+if [ $? -ne 0 ]; then
+ echo "failed to compile test" >&2
+ cat main.c
+ exit 1
+fi
+$dtrace $dt_flags -G -64 -s prov.d main.o
+if [ $? -ne 0 ]; then
+ echo "failed to create DOF" >&2
+ exit 1
+fi
+gcc $test_ldflags -o main main.o prov.o
+if [ $? -ne 0 ]; then
+ echo "failed to link final executable" >&2
+ exit 1
+fi
+
+# Prepare the D script, generating the probe descriptions (grouped by $d).
+
+rm -f D.d
+for d in $dlist; do
+ for x in $xlist; do
+ echo 'testprov$target:::foo'$x'* { printf("'$d' '$x'* %s\n", probename) }' >> D.d
+ done
+ for y in $ylist; do
+ echo 'testprov$target:::foo*'$y' { printf("'$d' *'$y' %s\n", probename) }' >> D.d
+ done
+done
+
+# Prepare the expected output: probes in main.c order, each with its matching clauses in D.d order.
+
+for x in $xlist; do
+for y in $ylist; do
+for d in $dlist; do
+ echo $d $x'*' foo$x$y >> expect.txt
+ echo $d '*'$y foo$x$y >> expect.txt
+done
+done
+done
+echo >> expect.txt   # NOTE(review): presumably matches a final blank line in the dtrace output -- confirm
+
+# Run DTrace; stdout and stderr both go to actual.txt.
+
+$dtrace $dt_flags -c ./main -qs D.d >& actual.txt
+if [ $? -ne 0 ]; then
+ echo ERROR: dtrace error
+ echo "==== D script"
+ cat D.d
+ echo "==== output"
+ cat actual.txt
+ exit 1
+fi
+
+# Check results.
+
+if diff -q expect.txt actual.txt; then
+ echo success
+ exit 0
+else
+ echo ERROR: unexpected results
+ echo "==== expect"
+ cat expect.txt
+ echo "==== actual"
+ cat actual.txt
+ echo "==== diff"
+ diff expect.txt actual.txt
+ exit 1
+fi
--
2.43.5
More information about the DTrace-devel
mailing list