[DTrace-devel] [PATCH 2/2] Extend the USDT bit mask to multiple words

Thu Feb 20 04:43:50 UTC 2025

From: Eugene Loh <eugene.loh at oracle.com>

Currently, USDT is limited to 64 probe descriptions since the
underlying probe uses a 64-bit mask to decide which probes to execute.

Change to a multi-word bit mask that can be extended to however many
probe descriptions there are.

Also, change the mask words to be 32-bit rather than 64-bit.  The reason
is that, commonly, there will be fewer than 32 probe descriptions.  In
this case, we shorten the value of the "usdt_prids" BPF map from 16 bytes:
        uint32_t        prid;
        long long       mask;
down to 8 bytes:
        uint32_t        prid;
        uint32_t        mask[1];
(The second member shrinks to 4 bytes and no longer forces padding after prid.)

We also add an
        extern int usdt_prids_map_val_extra_bytes;
to denote how many extra bytes will be needed for the extended mask.
This value is computed by usdt_prids_map_val_extra_bytes_init().
Currently, this function is awkwardly called in gmap_create_usdt(),
just before the value is needed.  Such a call to a provider-specific
function is clumsy, but there are no other calls to the provider
between compilation (where the number of statements is determined)
and this map creation.

Signed-off-by: Eugene Loh <eugene.loh at oracle.com>
---
 libdtrace/dt_bpf.c                            |   6 +-
 libdtrace/dt_bpf_maps.h                       |   3 +-
 libdtrace/dt_prov_uprobe.c                    |  81 ++++++++---
 .../usdt/tst.many_probe_descriptions.r        |   1 +
 .../usdt/tst.many_probe_descriptions.sh       |  64 +++++++++
 .../usdt/tst.many_probe_descriptions2.r       |   1 +
 .../usdt/tst.many_probe_descriptions2.sh      | 127 ++++++++++++++++++
 7 files changed, 260 insertions(+), 23 deletions(-)
 create mode 100644 test/unittest/usdt/tst.many_probe_descriptions.r
 create mode 100755 test/unittest/usdt/tst.many_probe_descriptions.sh
 create mode 100644 test/unittest/usdt/tst.many_probe_descriptions2.r
 create mode 100755 test/unittest/usdt/tst.many_probe_descriptions2.sh

diff --git a/libdtrace/dt_bpf.c b/libdtrace/dt_bpf.c
index 662fd81a4..1ed9376ea 100644
--- a/libdtrace/dt_bpf.c
+++ b/libdtrace/dt_bpf.c
@@ -940,6 +940,7 @@ gmap_create_probes(dtrace_hdl_t *dtp)
 	return 0;
 }
 
+void usdt_prids_map_val_extra_bytes_init(dtrace_hdl_t *dtp);
 /*
  * Create the 'usdt_names' and 'usdt_prids' BPF maps.
  *
@@ -965,8 +966,11 @@ gmap_create_usdt(dtrace_hdl_t *dtp)
 	if (dtp->dt_usdt_namesmap_fd == -1)
 		return -1;
 
+	usdt_prids_map_val_extra_bytes_init(dtp);
+
 	dtp->dt_usdt_pridsmap_fd = create_gmap(dtp, "usdt_prids", BPF_MAP_TYPE_HASH,
-	    sizeof(usdt_prids_map_key_t), sizeof(usdt_prids_map_val_t), nusdtprobes);
+	    sizeof(usdt_prids_map_key_t),
+	    sizeof(usdt_prids_map_val_t) + usdt_prids_map_val_extra_bytes, nusdtprobes);
 	if (dtp->dt_usdt_pridsmap_fd == -1)
 		return -1;
 
diff --git a/libdtrace/dt_bpf_maps.h b/libdtrace/dt_bpf_maps.h
index 884dc3983..ba17d8942 100644
--- a/libdtrace/dt_bpf_maps.h
+++ b/libdtrace/dt_bpf_maps.h
@@ -48,8 +48,9 @@ typedef struct usdt_prids_map_key {
 } usdt_prids_map_key_t;
 typedef struct usdt_prids_map_val {
 	uint32_t	prid;		/* should be dtrace_id_t, sys/dtrace_types.h */
-	long long	mask;
+	uint32_t	mask[1];
 } usdt_prids_map_val_t;
+extern int usdt_prids_map_val_extra_bytes;
 
 #ifdef  __cplusplus
 }
diff --git a/libdtrace/dt_prov_uprobe.c b/libdtrace/dt_prov_uprobe.c
index f1323cc31..2a5b0ce91 100644
--- a/libdtrace/dt_prov_uprobe.c
+++ b/libdtrace/dt_prov_uprobe.c
@@ -76,6 +76,8 @@ typedef struct list_key {
 	usdt_prids_map_key_t	key;
 } list_key_t;
 
+int usdt_prids_map_val_extra_bytes;
+
 static const dtrace_pattr_t	pattr = {
 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA },
 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
@@ -175,7 +177,7 @@ clean_usdt_probes(dtrace_hdl_t *dtp)
 	int			fdprids = dtp->dt_usdt_pridsmap_fd;
 	int			fdnames = dtp->dt_usdt_namesmap_fd;
 	usdt_prids_map_key_t	key, nxt;
-	usdt_prids_map_val_t	val;
+	usdt_prids_map_val_t	*val = alloca(sizeof(usdt_prids_map_val_t) + usdt_prids_map_val_extra_bytes);
 	list_key_t		keys_to_delete, *elem, *elem_next;
 	dt_probe_t		*prp, *prp_next;
 
@@ -190,7 +192,7 @@ clean_usdt_probes(dtrace_hdl_t *dtp)
 	while (dt_bpf_map_next_key(fdprids, &key, &nxt) == 0) {
 		memcpy(&key, &nxt, sizeof(usdt_prids_map_key_t));
 
-		if (dt_bpf_map_lookup(fdprids, &key, &val) == -1)
+		if (dt_bpf_map_lookup(fdprids, &key, val) == -1)
 			return dt_set_errno(dtp, EDT_BPF);
 
 		/* Check if the process is still running. */
@@ -203,7 +205,7 @@ clean_usdt_probes(dtrace_hdl_t *dtp)
 			 * we might delete the same usdt_names entry
 			 * multiple times.  That's okay.
 			 */
-			dt_bpf_map_delete(fdnames, &val.prid);
+			dt_bpf_map_delete(fdnames, &val->prid);
 
 			/*
 			 * Delete the usdt_prids entry.
@@ -224,7 +226,7 @@ clean_usdt_probes(dtrace_hdl_t *dtp)
 		 * FIXME.  There might be another case, where the process
 		 * is still running, but some of its USDT probes are gone?
 		 * So maybe we have to check for the existence of one of
-		 *     dtrace_probedesc_t *pdp = dtp->dt_probes[val.prid]->desc;
+		 *     dtrace_probedesc_t *pdp = dtp->dt_probes[val->prid]->desc;
 		 *     char *prv = ...pdp->prv minus the numerial part;
 		 *
 		 *     /run/dtrace/probes/$pid/$pdp->prv/$pdp->mod/$pdp->fun/$pdp->prb
@@ -346,6 +348,33 @@ ignore_clause(dtrace_hdl_t *dtp, int n, const dt_probe_t *uprp)
 	return 0;
 }
 
+void usdt_prids_map_val_extra_bytes_init(dtrace_hdl_t *dtp) {
+	int i, n = 0, w = sizeof(((usdt_prids_map_val_t *)0)->mask[0]);
+
+	/* Count the statements ignore_clause() keeps when uprp is NULL; each needs one mask bit. */
+	for (i = 0; i < dtp->dt_stmt_nextid; i++) {
+		dtrace_stmtdesc_t *stp;
+
+		stp = dtp->dt_stmts[i];
+		if (stp == NULL || ignore_clause(dtp, i, NULL))
+			continue;
+
+		n++;
+	}
+
+	/* Round the bit count up to whole bytes. */
+	n = (n + CHAR_BIT - 1) / CHAR_BIT;
+
+	/* Round the byte count up to whole mask words (w is the size of one word in bytes). */
+	n = (n + w - 1) / w;
+
+	/* Only words beyond the first (mask[0]) count as extra; publish that size in bytes. */
+	if (n > 1)
+		usdt_prids_map_val_extra_bytes = (n - 1) * w;
+	else
+		usdt_prids_map_val_extra_bytes = 0;
+}
+
 static int add_probe_uprobe(dtrace_hdl_t *dtp, dt_probe_t *prp)
 {
 	dtrace_difo_t   *dp;
@@ -416,6 +445,7 @@ static int add_probe_usdt(dtrace_hdl_t *dtp, dt_probe_t *prp)
 	int				fd = dtp->dt_usdt_namesmap_fd;
 	pid_t				pid;
 	list_probe_t			*pup;
+	usdt_prids_map_val_t		*val;
 
 	/* Add probe name elements to usdt_names map. */
 	p = probnam;
@@ -451,11 +481,11 @@ static int add_probe_usdt(dtrace_hdl_t *dtp, dt_probe_t *prp)
 	}
 
 	/* Add prid and bit mask to usdt_prids map. */
+	val = alloca(sizeof(usdt_prids_map_val_t) + usdt_prids_map_val_extra_bytes);
 	for (pup = prp->prv_data; pup != NULL; pup = dt_list_next(pup)) {
 		dt_probe_t		*uprp = pup->probe;
-		long long		mask = 0, bit = 1;
+		uint32_t		iword = 0, mask = 0, bit = 1;
 		usdt_prids_map_key_t	key;
-		usdt_prids_map_val_t	val;
 		dt_uprobe_t		*upp = uprp->prv_data;
 
 		/*
@@ -473,11 +503,15 @@ static int add_probe_usdt(dtrace_hdl_t *dtp, dt_probe_t *prp)
 				dtrace_stmtdesc_t *stp;
 
 				stp = dtp->dt_stmts[n];
-				if (stp == NULL)
+				if (stp == NULL || ignore_clause(dtp, n, uprp))
 					continue;
 
-				if (ignore_clause(dtp, n, uprp))
-					continue;
+				if (bit == 0) {
+					val->mask[iword] = mask;
+					mask = 0;
+					iword++;
+					bit = 1;
+				}
 
 				if (dt_gmatch(prp->desc->prv, stp->dtsd_ecbdesc->dted_probe.prv) &&
 				    dt_gmatch(prp->desc->mod, stp->dtsd_ecbdesc->dted_probe.mod) &&
@@ -492,11 +526,11 @@ static int add_probe_usdt(dtrace_hdl_t *dtp, dt_probe_t *prp)
 		key.pid = pid;
 		key.uprid = uprp->desc->id;
 
-		val.prid = prp->desc->id;
-		val.mask = mask;
+		val->prid = prp->desc->id;
+		val->mask[iword] = mask;
 
 		// FIXME Check return value, but how should errors be handled?
-		dt_bpf_map_update(dtp->dt_usdt_pridsmap_fd, &key, &val);
+		dt_bpf_map_update(dtp->dt_usdt_pridsmap_fd, &key, val);
 	}
 
 	return 0;
@@ -922,7 +956,7 @@ static int trampoline(dt_pcb_t *pcb, uint_t exitlbl)
 	const list_probe_t	*pop;
 	uint_t			lbl_exit = pcb->pcb_exitlbl;
 	dt_ident_t		*usdt_prids = dt_dlib_get_map(dtp, "usdt_prids");
-	int			n;
+	int			n, ibit, w = CHAR_BIT * sizeof(((usdt_prids_map_val_t *)0)->mask[0]);
 
 	assert(usdt_prids != NULL);
 
@@ -1020,7 +1054,8 @@ static int trampoline(dt_pcb_t *pcb, uint_t exitlbl)
 	 */
 	assert(sizeof(usdt_prids_map_key_t) <= DT_STK_SLOT_SZ);
 	emit(dlp,  BPF_STORE(BPF_W, BPF_REG_FP, DT_TRAMP_SP_SLOT(0), BPF_REG_0));
-	emit(dlp,  BPF_STORE_IMM(BPF_W, BPF_REG_FP, DT_TRAMP_SP_SLOT(0) + sizeof(pid_t), uprp->desc->id));
+	emit(dlp,  BPF_STORE_IMM(BPF_W, BPF_REG_FP,
+		   DT_TRAMP_SP_SLOT(0) + (int)sizeof(pid_t), uprp->desc->id));
 	dt_cg_xsetx(dlp, usdt_prids, DT_LBL_NONE, BPF_REG_1, usdt_prids->di_id);
 	emit(dlp,  BPF_MOV_REG(BPF_REG_2, BPF_REG_FP));
 	emit(dlp,  BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, DT_TRAMP_SP_SLOT(0)));
@@ -1054,27 +1089,30 @@ static int trampoline(dt_pcb_t *pcb, uint_t exitlbl)
 	emit(dlp,  BPF_LOAD(BPF_W, BPF_REG_1, BPF_REG_0, 0));
 	emit(dlp,  BPF_STORE(BPF_W, BPF_REG_7, DMST_PRID, BPF_REG_1));
 
-	/* Read the bit mask from the table lookup in %r6. */    // FIXME someday, extend this past 64 bits
-	emit(dlp,  BPF_LOAD(BPF_DW, BPF_REG_6, BPF_REG_0, offsetof(usdt_prids_map_val_t, mask)));
+	/* Save the map value pointer (%r0) on the stack for reuse. */
+	emit(dlp,  BPF_STORE(BPF_DW, BPF_REG_FP, DT_TRAMP_SP_SLOT(0), BPF_REG_0));
 
 	/*
 	 * Hold the bit mask in %r6 between clause calls.
 	 */
-	for (n = 0; n < dtp->dt_stmt_nextid; n++) {
+	for (ibit = n = 0; n < dtp->dt_stmt_nextid; n++) {
 		dtrace_stmtdesc_t *stp;
 		dt_ident_t	*idp;
 		uint_t		lbl_next;
 
 		stp = dtp->dt_stmts[n];
-		if (stp == NULL)
-			continue;
-
-		if (ignore_clause(dtp, n, uprp))
+		if (stp == NULL || ignore_clause(dtp, n, uprp))
 			continue;
 
 		idp = stp->dtsd_clause;
 		lbl_next = dt_irlist_label(dlp);
 
+		/* Load the next word of the bit mask into %r6. */
+		if (ibit % w == 0) {
+			emit(dlp,  BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_FP, DT_TRAMP_SP_SLOT(0)));
+			emit(dlp,  BPF_LOAD(BPF_W, BPF_REG_6, BPF_REG_0, offsetof(usdt_prids_map_val_t, mask[ibit / w])));
+		}
+
 		/* If the lowest %r6 bit is 0, skip over this clause. */
 		emit(dlp,  BPF_MOV_REG(BPF_REG_1, BPF_REG_6));
 		emit(dlp,  BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 1));
@@ -1102,6 +1140,7 @@ static int trampoline(dt_pcb_t *pcb, uint_t exitlbl)
 
 		/* Right-shift %r6. */
 		emit(dlp,  BPF_ALU64_IMM(BPF_RSH, BPF_REG_6, 1));
+		ibit++;
 	}
 
 out:
diff --git a/test/unittest/usdt/tst.many_probe_descriptions.r b/test/unittest/usdt/tst.many_probe_descriptions.r
new file mode 100644
index 000000000..2e9ba477f
--- /dev/null
+++ b/test/unittest/usdt/tst.many_probe_descriptions.r
@@ -0,0 +1 @@
+success
diff --git a/test/unittest/usdt/tst.many_probe_descriptions.sh b/test/unittest/usdt/tst.many_probe_descriptions.sh
new file mode 100755
index 000000000..92a61d5b7
--- /dev/null
+++ b/test/unittest/usdt/tst.many_probe_descriptions.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+#
+# Oracle Linux DTrace.
+# Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at
+# http://oss.oracle.com/licenses/upl.
+
+dtrace=$1
+TRIGGER=$PWD/test/triggers/usdt-tst-args
+
+DIRNAME="$tmpdir/usdt-many_probe_descriptions.$$.$RANDOM"
+mkdir -p $DIRNAME
+cd $DIRNAME
+
+# Construct the D scripts and output files.
+# We stick 80 probe descriptions in each of 3 scripts to test
+# USDT's ability to handle hundreds of probe descriptions.
+for d in 0 1 2; do
+for x in 00 01 02 03 04 05 06 07 08 09 \
+         10 11 12 13 14 15 16 17 18 19 \
+         20 21 22 23 24 25 26 27 28 29 \
+         30 31 32 33 34 35 36 37 38 39 \
+         40 41 42 43 44 45 46 47 48 49 \
+         50 51 52 53 54 55 56 57 58 59 \
+         60 61 62 63 64 65 66 67 68 69 \
+         70 71 72 73 74 75 76 77 78 79 \
+; do
+	echo 'test_prov$target:::place { printf("'$d$x'\n"); }' >> D$d.d
+	echo $d$x >> expect.txt
+done
+done
+echo 'test_prov$target:::place { exit(0); }' >> D$d.d
+echo >> expect.txt
+
+# Run DTrace.
+
+$dtrace $dt_flags -c $TRIGGER -q -s D0.d -s D1.d -s D2.d >& actual.txt
+if [ $? -eq 0 ]; then
+	if diff -q expect.txt actual.txt > /dev/null; then
+		echo success
+		exit 0
+	else
+		echo ERROR: did not get expected results
+		echo === expect.txt
+		cat      expect.txt
+		echo === actual.txt
+		cat      actual.txt
+		echo === diff
+		diff expect.txt actual.txt
+	fi
+else
+	echo ERROR: dtrace error
+	echo ==== output
+	cat actual.txt
+fi
+
+echo ==== script D0.d
+cat D0.d
+echo ==== script D1.d
+cat D1.d
+echo ==== script D2.d
+cat D2.d
+
+exit 1
diff --git a/test/unittest/usdt/tst.many_probe_descriptions2.r b/test/unittest/usdt/tst.many_probe_descriptions2.r
new file mode 100644
index 000000000..2e9ba477f
--- /dev/null
+++ b/test/unittest/usdt/tst.many_probe_descriptions2.r
@@ -0,0 +1 @@
+success
diff --git a/test/unittest/usdt/tst.many_probe_descriptions2.sh b/test/unittest/usdt/tst.many_probe_descriptions2.sh
new file mode 100755
index 000000000..cc8821c6e
--- /dev/null
+++ b/test/unittest/usdt/tst.many_probe_descriptions2.sh
@@ -0,0 +1,127 @@
+#!/bin/bash
+#
+# Oracle Linux DTrace.
+# Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at
+# http://oss.oracle.com/licenses/upl.
+
+# This test uses many probes and probe descriptions.  As a result, both
+# the number of BPF programs loaded into the kernel -- dt_bpf_load_prog()
+# calling prp->prov->impl->load_prog(), which is dt_bpf_prog_load() --
+# and the time each load takes go up, so allow a longer timeout.
+# @@timeout: 400
+
+dtrace=$1
+
+DIRNAME="$tmpdir/usdt-many_probe_descriptions2.$$.$RANDOM"
+mkdir -p $DIRNAME
+cd $DIRNAME
+
+# Set the lists.
+# - The probes will be foo$x$y.
+# - The probe descriptions will be foo$x* and foo*$y, for each $d.
+# So if there are nx items in xlist, ny in ylist, and nd in dlist,
+# - there will be roughly nx*ny probes
+# - there will be roughly (nx+ny)*nd probe descriptions
+
+xlist="a b c d e f g h i j k l m"
+ylist="n o p q r s t u v w x y z"
+dlist="0 1 2 3 4 5 6 7 8"
+
+# Make the trigger:  Preambles.
+
+echo "provider testprov {" > prov.d
+
+echo '#include "prov.h"' > main.c
+echo 'int main(int argc, char **argv) {' >> main.c
+
+# Make the trigger:  Loop over the probes.
+
+for x in $xlist; do
+for y in $ylist; do
+    echo "probe foo$x$y();" >> prov.d
+    echo "TESTPROV_FOO$x$y();" | awk '{ print(toupper($1)) }' >> main.c
+done
+done
+
+# Make the trigger:  Epilogues.
+
+echo "};" >> prov.d
+echo "return 0; }" >> main.c
+
+# Build the trigger.
+
+$dtrace $dt_flags -h -s prov.d
+if [ $? -ne 0 ]; then
+	echo "failed to generate header file" >&2
+	cat prov.d
+	exit 1
+fi
+gcc $test_cppflags -c main.c
+if [ $? -ne 0 ]; then
+	echo "failed to compile test" >&2
+	cat main.c
+	exit 1
+fi
+$dtrace $dt_flags -G -64 -s prov.d main.o
+if [ $? -ne 0 ]; then
+	echo "failed to create DOF" >&2
+	exit 1
+fi
+gcc $test_ldflags -o main main.o prov.o
+if [ $? -ne 0 ]; then
+	echo "failed to link final executable" >&2
+	exit 1
+fi
+
+# Prepare the D script, generating the probe descriptions.
+
+rm -f D.d
+for d in $dlist; do
+	for x in $xlist; do
+		echo 'testprov$target:::foo'$x'* { printf("'$d' '$x'* %s\n", probename) }' >> D.d
+	done
+	for y in $ylist; do
+		echo 'testprov$target:::foo*'$y' { printf("'$d' *'$y' %s\n", probename) }' >> D.d
+	done
+done
+
+# Prepare the expected output.
+
+for x in $xlist; do
+for y in $ylist; do
+for d in $dlist; do
+	echo $d $x'*' foo$x$y >> expect.txt
+	echo $d '*'$y foo$x$y >> expect.txt
+done
+done
+done
+echo >> expect.txt
+
+# Run DTrace.
+
+$dtrace $dt_flags -c ./main -qs D.d >& actual.txt
+if [ $? -ne 0 ]; then
+	echo ERROR: dtrace error
+	echo "==== D script"
+	cat D.d
+	echo "==== output"
+	cat actual.txt
+	exit 1
+fi
+
+# Check results.
+
+if diff -q expect.txt actual.txt; then
+	echo success
+	exit 0
+else
+	echo ERROR: unexpected results
+	echo "==== expect"
+	cat expect.txt
+	echo "==== actual"
+	cat actual.txt
+	echo "==== diff"
+	diff expect.txt actual.txt
+	exit 1
+fi
-- 
2.43.5