[DTrace-devel] [PATCH] cg: implement concurrent probe execution protection

Kris Van Hees kris.van.hees at oracle.com
Mon Mar 4 16:25:29 UTC 2024


On kernels >= 5.11, BPF programs execute in preemptive mode, which can
lead to data corruption if a BPF program attached to a probe has its
execution interrupted by another probe firing on the same CPU.

Pending implementation of a mechanism to support preemptive probe program
execution in DTrace, this patch disallows execution of a probe program
if one is already executing on the current CPU.

Signed-off-by: Kris Van Hees <kris.van.hees at oracle.com>
---
 libdtrace/dt_cg.c  | 39 ++++++++++++++++++++++++++-------------
 libdtrace/dt_pcb.h |  1 +
 2 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/libdtrace/dt_cg.c b/libdtrace/dt_cg.c
index 1b2dbb84..25c2f765 100644
--- a/libdtrace/dt_cg.c
+++ b/libdtrace/dt_cg.c
@@ -174,6 +174,7 @@ dt_cg_tramp_prologue_act(dt_pcb_t *pcb, dt_activity_t act)
 	dt_ident_t	*state = dt_dlib_get_map(dtp, "state");
 	dt_ident_t	*prid = dt_dlib_get_var(pcb->pcb_hdl, "PRID");
 	dt_ident_t	*ro_off = dt_dlib_get_var(dtp, "RODATA_OFF");
+	uint_t		lbl_fast = pcb->pcb_fastlbl;
 	uint_t		lbl_exit = pcb->pcb_exitlbl;
 
 	assert(aggs != NULL);
@@ -218,10 +219,10 @@ dt_cg_tramp_prologue_act(dt_pcb_t *pcb, dt_activity_t act)
 	 *				// call bpf_map_lookup_elem
 	 *				//     (%r1 ... %r5 clobbered)
 	 *				//     (%r0 = map value)
-	 *	if (rc == 0)		// jeq %r0, 0, lbl_exit
-	 *		goto exit;
+	 *	if (rc == 0)		// jeq %r0, 0, lbl_fast
+	 *		goto fast;
 	 *	if (*rc != act)		// ldw %r1, [%r0 + 0]
-	 *		goto exit;	// jne %r1, act, lbl_exit
+	 *		goto fast;	// jne %r1, act, lbl_fast
 	 *
 	 *	dctx.act = rc;		// stdw [%r9 + DCTX_ACT], %r0
 	 */
@@ -230,37 +231,42 @@ dt_cg_tramp_prologue_act(dt_pcb_t *pcb, dt_activity_t act)
 	emit(dlp,  BPF_MOV_REG(BPF_REG_2, BPF_REG_9));
 	emit(dlp,  BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, DCTX_ACT));
 	emit(dlp,  BPF_CALL_HELPER(BPF_FUNC_map_lookup_elem));
-	emit(dlp,  BPF_BRANCH_IMM(BPF_JEQ, BPF_REG_0, 0, lbl_exit));
+	emit(dlp,  BPF_BRANCH_IMM(BPF_JEQ, BPF_REG_0, 0, lbl_fast));
 	emit(dlp,  BPF_LOAD(BPF_W, BPF_REG_1, BPF_REG_0, 0));
-	emit(dlp,  BPF_BRANCH_IMM(BPF_JNE, BPF_REG_1, act, lbl_exit));
+	emit(dlp,  BPF_BRANCH_IMM(BPF_JNE, BPF_REG_1, act, lbl_fast));
 	emit(dlp,  BPF_STORE(BPF_DW, BPF_REG_9, DCTX_ACT, BPF_REG_0));
 
 	/*
-	 *	key = 0;		// stw [%r9 + DCTX_MST], 0
+	 *	key = 0;		// stw [%r9 + DCTX_BUF], 0
 	 *	rc = bpf_map_lookup_elem(&mem, &key);
 	 *				// lddw %r1, &mem
 	 *				// mov %r2, %r9
-	 *				// add %r2, DCTX_MST
+	 *				// add %r2, DCTX_BUF
 	 *				// call bpf_map_lookup_elem
 	 *				//     (%r1 ... %r5 clobbered)
 	 *				//     (%r0 = 'mem' BPF map value)
-	 *	if (rc == 0)		// jeq %r0, 0, lbl_exit
-	 *		goto exit;
+	 *	if (rc == 0)		// jeq %r0, 0, lbl_fast
+	 *		goto fast;
 	 *				//     (%r0 = map value)
 	 *				//     (%r7 = pointer to dt_mstate_t)
 	 *				// mov %r7, %r0
+	 *				// ldw %r1, [%r7 + DMST_PRID]
+	 *	if (dctx.mst->prid != 0)// jne %r1, 0, lbl_fast
+	 *		goto fast;
 	 *	dctx.mst = rc;		// stdw [%r9 + DCTX_MST], %r7
 	 *	dctx.mst->prid = PRID;	// stw [%r7 + DMST_PRID], PRID
 	 *	dctx.mst->syscall_errno = 0;
 	 *				// stw [%r7 + DMST_ERRNO], 0
 	 */
-	emit(dlp,  BPF_STORE_IMM(BPF_W, BPF_REG_9, DCTX_MST, 0));
+	emit(dlp,  BPF_STORE_IMM(BPF_W, BPF_REG_9, DCTX_BUF, 0));
 	dt_cg_xsetx(dlp, mem, DT_LBL_NONE, BPF_REG_1, mem->di_id);
 	emit(dlp,  BPF_MOV_REG(BPF_REG_2, BPF_REG_9));
-	emit(dlp,  BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, DCTX_MST));
+	emit(dlp,  BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, DCTX_BUF));
 	emit(dlp,  BPF_CALL_HELPER(BPF_FUNC_map_lookup_elem));
-	emit(dlp,  BPF_BRANCH_IMM(BPF_JEQ, BPF_REG_0, 0, lbl_exit));
+	emit(dlp,  BPF_BRANCH_IMM(BPF_JEQ, BPF_REG_0, 0, lbl_fast));
 	emit(dlp,  BPF_MOV_REG(BPF_REG_7, BPF_REG_0));
+	emit(dlp,  BPF_LOAD(BPF_W, BPF_REG_1, BPF_REG_7, DMST_PRID));
+	emit(dlp,  BPF_BRANCH_IMM(BPF_JNE, BPF_REG_1, 0, lbl_fast));
 	emit(dlp,  BPF_STORE(BPF_DW, BPF_REG_9, DCTX_MST, BPF_REG_7));
 	emite(dlp, BPF_STORE_IMM(BPF_W, BPF_REG_7, DMST_PRID, -1), prid);
 	emit(dlp,  BPF_STORE_IMM(BPF_W, BPF_REG_7, DMST_ERRNO, 0));
@@ -410,7 +416,7 @@ dt_cg_tramp_prologue(dt_pcb_t *pcb)
 		emit(dlp,  BPF_CALL_HELPER(BPF_FUNC_get_smp_processor_id));
 		emit(dlp,  BPF_BRANCH_IMM(BPF_JNE, BPF_REG_0,
 					  dtp->dt_options[DTRACEOPT_CPU],
-					  pcb->pcb_exitlbl));
+					  pcb->pcb_fastlbl));
 
 		emit(dlp,  BPF_MOV_REG(BPF_REG_1, BPF_REG_8));
 	}
@@ -930,11 +936,17 @@ dt_cg_tramp_return(dt_pcb_t *pcb)
 
 	/*
 	 * exit:
+	 *				// lddw %r0, [%fp-DCTX_SIZE+DCTX_MST]
+	 *	dctx.mst->prid = 0;	// stw [%r0 + DMST_PRID], 0
+	 * fast:
 	 *	return 0;		// mov %r0, 0
 	 *				// exit
 	 * }
 	 */
 	emitl(dlp, pcb->pcb_exitlbl,
+		   BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_FP, (ushort_t)-DCTX_SIZE+DCTX_MST));
+	emit(dlp,  BPF_STORE_IMM(BPF_W, BPF_REG_0, DMST_PRID, 0));
+	emitl(dlp, pcb->pcb_fastlbl,
 		   BPF_MOV_IMM(BPF_REG_0, 0));
 	emit(dlp,  BPF_RETURN());
 
@@ -8504,6 +8516,7 @@ dt_cg(dt_pcb_t *pcb, dt_node_t *dnp)
 	dt_irlist_destroy(&pcb->pcb_ir);
 	dt_irlist_create(&pcb->pcb_ir);
 	pcb->pcb_exitlbl = dt_irlist_label(&pcb->pcb_ir);
+	pcb->pcb_fastlbl = dt_irlist_label(&pcb->pcb_ir);
 
 	pcb->pcb_bufoff = 0;
 
diff --git a/libdtrace/dt_pcb.h b/libdtrace/dt_pcb.h
index f261b9dd..7c57f832 100644
--- a/libdtrace/dt_pcb.h
+++ b/libdtrace/dt_pcb.h
@@ -42,6 +42,7 @@ typedef struct dt_pcb {
 	uint32_t pcb_bufoff;	/* output buffer offset (for DFUNCs) */
 	dt_irlist_t pcb_ir;	/* list of unrelocated IR instructions */
 	uint_t pcb_exitlbl;	/* label for exit of program */
+	uint_t pcb_fastlbl;	/* label for fast exit of program */
 	uint_t pcb_asvidx;	/* assembler vartab index (see dt_as.c) */
 	ulong_t **pcb_asxrefs;	/* assembler imported xlators (see dt_as.c) */
 	uint_t pcb_asxreflen;	/* assembler xlator map length (see dt_as.c) */
-- 
2.42.0




More information about the DTrace-devel mailing list