[DTrace-devel] [PATCH 3/3] Ensure that BEGIN is the first probe and END the last one

Eugene Loh eugene.loh at oracle.com
Fri Sep 4 17:51:11 PDT 2020


First, a question.  If we always enable BEGIN and END, why don't I see 
them in dtrace output.  E.g., "dtrace -n tick-1" does not explicitly 
specify BEGIN, but I don't see that probe firing.  I feel like I should 
be able to figure this out but... too lazy or whatever.

A few other comments below.

On 08/27/2020 11:54 AM, Kris Van Hees wrote:

> DTrace semantics require that the BEGIN probe is the first one to be
> recorded in the trace buffers, and that the END probe is tha last one

s/tha/the/

> to be recorded in the trace buffers.  This is implemented using the
> activity member of the DTrace state that is shared using a BPF map.
>
> The default activity state is INACTIVE which means that no probe will
> processed except for the BEGIN probe.  The BEGIN probe will only be
> processed when the state is INACTIVE.
>
> Once the BEGIN probe processing has completed, the activity state is
> advanced to ACTIVE.  All probes other than BEGIN require the activity
> state to be ACTIVE in order for them to be processed.
>
> When the END probe has been processed, the activity state is advanced
> to DRAINING.
>
> The BEGIN and END probes are therefore always added to the list of
> enablings, even if no clauses are specified for them.  In other words,
> the BEGIN and END probes will always be enabled to ensure that the
> activity state is updated correctly.
>
> Signed-off-by: Kris Van Hees <kris.van.hees at oracle.com>
> ---
>   libdtrace/dt_bpf.c                    |   1 -
>   libdtrace/dt_cg.c                     | 122 +++++++++++++++++++++++---
>   libdtrace/dt_impl.h                   |   8 +-
>   libdtrace/dt_prov_dtrace.c            |  41 ++++++++-
>   libdtrace/dt_state.h                  |  12 ++-
>   test/unittest/begin/tst.begin-first.d |  24 +++++
>   6 files changed, 187 insertions(+), 21 deletions(-)
>   create mode 100644 test/unittest/begin/tst.begin-first.d
>
> diff --git a/libdtrace/dt_bpf.c b/libdtrace/dt_bpf.c
> index a381a86e..7adcf70d 100644
> --- a/libdtrace/dt_bpf.c
> +++ b/libdtrace/dt_bpf.c
> @@ -14,7 +14,6 @@
>   #include <dtrace.h>
>   #include <dt_impl.h>
>   #include <dt_probe.h>
> -#include <dt_state.h>
>   #include <dt_bpf.h>
>   #include <port.h>
>   

This change is then reverted in a subsequent patch.  Sigh.  But maybe 
not worth the effort of cleaning that commit history up due to a 
transient need for that change here.

> diff --git a/libdtrace/dt_cg.c b/libdtrace/dt_cg.c
> index 8a7a09db..01e8707f 100644
> --- a/libdtrace/dt_cg.c
> +++ b/libdtrace/dt_cg.c
> @@ -39,10 +39,11 @@ static void dt_cg_node(dt_node_t *, dt_irlist_t *, dt_regset_t *);
>    * the trampiline prologue.

Maybe you regard that as out of scope, but s/trampiline/trampoline/.

>    */
>   void
> -dt_cg_tramp_prologue(dt_pcb_t *pcb, uint_t lbl_exit)
> +dt_cg_tramp_prologue_act(dt_pcb_t *pcb, uint_t lbl_exit, dt_activity_t act)
>   {
>   	dt_irlist_t	*dlp = &pcb->pcb_ir;
>   	dt_ident_t	*mem = dt_dlib_get_map(pcb->pcb_hdl, "mem");
> +	dt_ident_t	*state = dt_dlib_get_map(pcb->pcb_hdl, "state");
>   	struct bpf_insn	instr;
>   
>   	assert(mem != NULL);
> @@ -63,7 +64,39 @@ dt_cg_tramp_prologue(dt_pcb_t *pcb, uint_t lbl_exit)
>   	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
>   
>   	/*
> -	 *	key = 0;                // stw [%fp + DCTX_FP(DCTX_MST)], 0
> +	 *	key = DT_STATE_ACTIVITY;// stw [%fp + DCTX_FP(DCTX_MST)],
> +	 *				//		DT_STATE_ACTIVITY
> +	 *	rc = bpf_map_lookup_elem(&state, &key);
> +	 *				// lddw %r1, &state
> +	 *				// mov %r2, %fp
> +	 *				// add %r2, DCTX_FP(DCTX_MST)
> +	 *				// call bpf_map_lookup_elem
> +	 *				//     (%r1 ... %r5 clobbered)
> +	 *				//     (%r0 = map value)
> +	 *	if (rc == 0)		// jeq %r0, 0, lbl_exit
> +	 *		goto exit;
> +	 *	if (*rc == act)		// ldw %r0, [%r0 + 0]
> +	 *		goto exit;	// jne %r0, act, lbl_exit
> +	 */
> +	instr = BPF_STORE_IMM(BPF_W, BPF_REG_FP, DCTX_FP(DCTX_MST),
> +			      DT_STATE_ACTIVITY);
> +	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
> +	dt_cg_xsetx(dlp, state, DT_LBL_NONE, BPF_REG_1, state->di_id);
> +	instr = BPF_MOV_REG(BPF_REG_2, BPF_REG_FP);
> +	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
> +	instr = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, DCTX_FP(DCTX_MST));
> +	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
> +	instr = BPF_CALL_HELPER(BPF_FUNC_map_lookup_elem);
> +	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
> +	instr = BPF_BRANCH_IMM(BPF_JEQ, BPF_REG_0, 0, lbl_exit);
> +	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
> +	instr = BPF_LOAD(BPF_W, BPF_REG_0, BPF_REG_0, 0);
> +	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
> +	instr = BPF_BRANCH_IMM(BPF_JNE, BPF_REG_0, act, lbl_exit);
> +	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
> +
> +	/*
> +	 *	key = 0;		// stw [%fp + DCTX_FP(DCTX_MST)], 0
>   	 *	rc = bpf_map_lookup_elem(&mem, &key);
>   	 *				// lddw %r1, &mem
>   	 *				// mov %r2, %fp
> @@ -71,10 +104,10 @@ dt_cg_tramp_prologue(dt_pcb_t *pcb, uint_t lbl_exit)
>   	 *				// call bpf_map_lookup_elem
>   	 *				//     (%r1 ... %r5 clobbered)
>   	 *				//     (%r0 = 'mem' BPF map value)
> -	 *	if (rc == 0)            // jeq %r0, 0, lbl_exit
> +	 *	if (rc == 0)		// jeq %r0, 0, lbl_exit
>   	 *		goto exit;
>   	 *				//     (%r0 = pointer to dt_mstate_t)
> -	 *	dctx.mst = rc;          // stdw [%fp + DCTX_FP(DCTX_MST)], %r0
> +	 *	dctx.mst = rc;		// stdw [%fp + DCTX_FP(DCTX_MST)], %r0
>   	 */
>   	instr = BPF_STORE_IMM(BPF_W, BPF_REG_FP, DCTX_FP(DCTX_MST), 0);
>   	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
> @@ -91,14 +124,14 @@ dt_cg_tramp_prologue(dt_pcb_t *pcb, uint_t lbl_exit)
>   	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
>   
>   	/*
> -	 *      buf = rc + roundup(sizeof(dt_mstate_t), 8);
> +	 *	buf = rc + roundup(sizeof(dt_mstate_t), 8);
>   	 *				// add %r0, roundup(
>   	 *						sizeof(dt_mstate_t), 8)
> -	 *      *((uint64_t *)&buf[0]) = 0;
> +	 *	*((uint64_t *)&buf[0]) = 0;
>   	 *				// stdw [%r0 + 0], 0
> -	 *      buf += 8;		// add %r0, 8
> +	 *	buf += 8;		// add %r0, 8
>   	 *				//     (%r0 = pointer to buffer space)
> -	 *      dctx.buf = buf;		// stdw [%fp + DCTX_FP(DCTX_BUF)], %r0
> +	 *	dctx.buf = buf;		// stdw [%fp + DCTX_FP(DCTX_BUF)], %r0
>   	 */
>   	instr = BPF_ALU64_IMM(BPF_ADD, BPF_REG_0,
>   			      roundup(sizeof(dt_mstate_t), 8));
> @@ -111,6 +144,12 @@ dt_cg_tramp_prologue(dt_pcb_t *pcb, uint_t lbl_exit)
>   	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
>   }
>   
> +void
> +dt_cg_tramp_prologue(dt_pcb_t *pcb, uint_t lbl_exit)
> +{
> +	dt_cg_tramp_prologue_act(pcb, lbl_exit, DT_ACTIVITY_ACTIVE);
> +}
> +
>   static int
>   dt_cg_call_clause(dtrace_hdl_t *dtp, dt_ident_t *idp, dt_irlist_t *dlp)
>   {
> @@ -127,11 +166,10 @@ dt_cg_call_clause(dtrace_hdl_t *dtp, dt_ident_t *idp, dt_irlist_t *dlp)
>   	return 0;
>   }
>   
> -void
> -dt_cg_tramp_epilogue(dt_pcb_t *pcb, uint_t lbl_exit)
> +static void
> +dt_cg_tramp_call_clauses(dt_pcb_t *pcb)
>   {
>   	dt_irlist_t	*dlp = &pcb->pcb_ir;
> -	struct bpf_insn	instr;
>   
>   	/*
>   	 *	dt_program(dctx);	// mov %r1, %fp
> @@ -141,6 +179,13 @@ dt_cg_tramp_epilogue(dt_pcb_t *pcb, uint_t lbl_exit)
>   	 */
>   	dt_probe_clause_iter(pcb->pcb_hdl, pcb->pcb_probe,
>   			     (dt_clause_f *)dt_cg_call_clause, dlp);
> +}
> +
> +static void
> +dt_cg_tramp_return(dt_pcb_t *pcb, uint_t lbl_exit)
> +{
> +	dt_irlist_t	*dlp = &pcb->pcb_ir;
> +	struct bpf_insn	instr;
>   
>   	/*
>   	 * exit:
> @@ -154,6 +199,57 @@ dt_cg_tramp_epilogue(dt_pcb_t *pcb, uint_t lbl_exit)
>   	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
>   }
>   
> +void
> +dt_cg_tramp_epilogue(dt_pcb_t *pcb, uint_t lbl_exit)
> +{
> +	dt_cg_tramp_call_clauses(pcb);
> +	dt_cg_tramp_return(pcb, lbl_exit);
> +}
> +
> +void
> +dt_cg_tramp_epilogue_advance(dt_pcb_t *pcb, uint_t lbl_exit)
> +{
> +	dt_irlist_t	*dlp = &pcb->pcb_ir;
> +	dt_ident_t	*state = dt_dlib_get_map(pcb->pcb_hdl, "state");
> +	struct bpf_insn	instr;
> +
> +	dt_cg_tramp_call_clauses(pcb);
> +
> +	/*
> +	 *	key = DT_STATE_ACTIVITY;// stw [%fp + DCTX_FP(DCTX_MST)],
> +	 *				//		DT_STATE_ACTIVITY
> +	 *	rc = bpf_map_lookup_elem(&state, &key);
> +	 *				// lddw %r1, &state
> +	 *				// mov %r2, %fp
> +	 *				// add %r2, DCTX_FP(DCTX_MST)
> +	 *				// call bpf_map_lookup_elem
> +	 *				//     (%r1 ... %r5 clobbered)
> +	 *				//     (%r0 = map value)
> +	 *	if (rc == 0)		// jeq %r0, 0, lbl_exit
> +	 *		goto exit;
> +	 *	(*rc)++;		// mov %r1, 1
> +	 *				// xadd [%r0 + 0], %r1
> +	 */
> +	instr = BPF_STORE_IMM(BPF_W, BPF_REG_FP, DCTX_FP(DCTX_MST),
> +			      DT_STATE_ACTIVITY);
> +	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
> +	dt_cg_xsetx(dlp, state, DT_LBL_NONE, BPF_REG_1, state->di_id);
> +	instr = BPF_MOV_REG(BPF_REG_2, BPF_REG_FP);
> +	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
> +	instr = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, DCTX_FP(DCTX_MST));
> +	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
> +	instr = BPF_CALL_HELPER(BPF_FUNC_map_lookup_elem);
> +	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
> +	instr = BPF_BRANCH_IMM(BPF_JEQ, BPF_REG_0, 0, lbl_exit);
> +	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
> +	instr = BPF_MOV_IMM(BPF_REG_1, 1);
> +	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
> +	instr = BPF_XADD_REG(BPF_W, BPF_REG_0, 0, BPF_REG_1);
> +	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
> +
> +	dt_cg_tramp_return(pcb, lbl_exit);
> +}
> +
>   /*
>    * Generate the function prologue.
>    *
> @@ -483,7 +579,7 @@ dt_cg_act_clear(dt_pcb_t *pcb, dt_node_t *dnp, dtrace_actkind_t kind)
>   	}
>   
>   	aid = anp->dn_ident;
> -        if (aid->di_gen == pcb->pcb_hdl->dt_gen &&
> +	if (aid->di_gen == pcb->pcb_hdl->dt_gen &&
>   	    !(aid->di_flags & DT_IDFLG_MOD)) {
>   		dnerror(dnp, D_CLEAR_AGGBAD,
>   			"undefined aggregation: @%s\n", aid->di_name);
> @@ -539,7 +635,7 @@ dt_cg_act_denormalize(dt_pcb_t *pcb, dt_node_t *dnp, dtrace_actkind_t kind)
>   	}
>   
>   	aid = anp->dn_ident;
> -        if (aid->di_gen == pcb->pcb_hdl->dt_gen &&
> +	if (aid->di_gen == pcb->pcb_hdl->dt_gen &&
>   	    !(aid->di_flags & DT_IDFLG_MOD)) {
>   		dnerror(dnp, D_NORMALIZE_AGGBAD,
>   			"undefined aggregation: @%s\n", aid->di_name);
> diff --git a/libdtrace/dt_impl.h b/libdtrace/dt_impl.h
> index 60095a80..f062aa9b 100644
> --- a/libdtrace/dt_impl.h
> +++ b/libdtrace/dt_impl.h
> @@ -50,6 +50,7 @@ extern "C" {
>   #include <dt_pt_regs.h>
>   #include <dt_printf.h>
>   #include <dt_dctx.h>
> +#include <dt_state.h>
>   #include <dt_debug.h>
>   #include <dt_version.h>
>   

Ditto earlier remark:  This change will be reverted in a later patch, 
but... oh well?  Let it go?  Too clumsy to rearrange history?

> @@ -776,8 +777,11 @@ extern void dt_pragma(dt_node_t *);
>   extern int dt_reduce(dtrace_hdl_t *, dt_version_t);
>   extern void dt_cg(dt_pcb_t *, dt_node_t *);
>   extern dt_irnode_t *dt_cg_node_alloc(uint_t, struct bpf_insn);
> -extern void dt_cg_tramp_prologue(dt_pcb_t *dtp, uint_t lbl_exit);
> -extern void dt_cg_tramp_epilogue(dt_pcb_t *dtp, uint_t lbl_exit);
> +extern void dt_cg_tramp_prologue_act(dt_pcb_t *pcb, uint_t lbl_exit,
> +				     dt_activity_t act);
> +extern void dt_cg_tramp_prologue(dt_pcb_t *pcb, uint_t lbl_exit);
> +extern void dt_cg_tramp_epilogue(dt_pcb_t *pcb, uint_t lbl_exit);
> +extern void dt_cg_tramp_epilogue_advance(dt_pcb_t *pcb, uint_t lbl_exit);
>   extern dtrace_difo_t *dt_as(dt_pcb_t *);
>   extern void dt_dis_program(dtrace_hdl_t *dtp, dtrace_prog_t *pgp, FILE *fp);
>   extern void dt_dis_difo(const dtrace_difo_t *dp, FILE *fp);
> diff --git a/libdtrace/dt_prov_dtrace.c b/libdtrace/dt_prov_dtrace.c
> index a0d0e38a..d9051229 100644
> --- a/libdtrace/dt_prov_dtrace.c
> +++ b/libdtrace/dt_prov_dtrace.c
> @@ -34,16 +34,21 @@ static const dtrace_pattr_t	pattr = {
>   static int populate(dtrace_hdl_t *dtp)
>   {
>   	dt_provider_t	*prv;
> +	dt_probe_t	*prp;
>   	int		n = 0;
>   
>   	prv = dt_provider_create(dtp, prvname, &dt_dtrace, &pattr);
>   	if (prv == NULL)
>   		return 0;
>   
> -	if (tp_probe_insert(dtp, prv, prvname, modname, funname, "BEGIN"))
> +	prp = tp_probe_insert(dtp, prv, prvname, modname, funname, "BEGIN");
> +	if (prp)
>   		n++;
> -	if (tp_probe_insert(dtp, prv, prvname, modname, funname, "END"))
> +	dt_list_append(&dtp->dt_enablings, prp);

If you test prp, apparently it can be NULL, but the test does not apply 
to the dt_list_append() call even though passing NULL will lead to a 
NULL dereference.  So, maybe the dt_list_append() should be part of the 
"if (prp)" clause.

> +	prp = tp_probe_insert(dtp, prv, prvname, modname, funname, "END");
> +	if (prp)
>   		n++;
> +	dt_list_append(&dtp->dt_enablings, prp);

Same comment about not checking prp!=NULL before the dt_list_append() call.

>   	if (tp_probe_insert(dtp, prv, prvname, modname, funname, "ERROR"))
>   		n++;
>   
> @@ -67,8 +72,33 @@ static void trampoline(dt_pcb_t *pcb)
>   	dt_irlist_t	*dlp = &pcb->pcb_ir;
>   	struct bpf_insn	instr;
>   	uint_t		lbl_exit = dt_irlist_label(dlp);
> +	dt_activity_t	act;
> +	int		adv_act;
>   
> -	dt_cg_tramp_prologue(pcb, lbl_exit);
> +	/*
> +	 * The BEGIN probe should only run when the activity state is INACTIVE.
> +	 * At the end of the trampoline (after executing any clauses), the
> +	 * state must be advanced to the next state (INACTIVE -> ACTIVE).
> +	 *
> +	 * The END probe should only run when the activity state is ACTIVE.
> +	 * At the end of the trampoline (after executing any clauses), the
> +	 * state must be advanced to the next state (ACTIVE -> DRAINING).
> +	 *
> +	 * Any other probe requires the state tp be ACTIVE, and does not change

s/tp/to/

> +	 * the state.
> +	 */
> +	if (strcmp(pcb->pcb_probe->desc->prb, "BEGIN") == 0) {
> +		act = DT_ACTIVITY_INACTIVE;
> +		adv_act = 1;
> +	} else if (strcmp(pcb->pcb_probe->desc->prb, "END") == 0) {
> +		act = DT_ACTIVITY_ACTIVE;
> +		adv_act = 1;
> +	} else {
> +		act = DT_ACTIVITY_ACTIVE;
> +		adv_act = 0;
> +	}
> +
> +	dt_cg_tramp_prologue_act(pcb, lbl_exit, act);
>   
>   	/*
>   	 * We cannot assume anything about the state of any registers so set up
> @@ -154,7 +184,10 @@ static void trampoline(dt_pcb_t *pcb)
>   		dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
>   	}
>   
> -	dt_cg_tramp_epilogue(pcb, lbl_exit);
> +	if (adv_act)
> +		dt_cg_tramp_epilogue_advance(pcb, lbl_exit);
> +	else
> +		dt_cg_tramp_epilogue(pcb, lbl_exit);
>   }
>   
>   static char *uprobe_spec(dtrace_hdl_t *dtp, const char *prb)
> diff --git a/libdtrace/dt_state.h b/libdtrace/dt_state.h
> index 9acb66b2..f7bf2a9f 100644
> --- a/libdtrace/dt_state.h
> +++ b/libdtrace/dt_state.h
> @@ -18,10 +18,20 @@
>   #define DT_STATE_VAL_TYPE	uint32_t
>   
>   typedef enum dt_state_elem {
> -	DT_STATE_ACTIVITY	= 0,	/* activity state of the seesion */
> +	DT_STATE_ACTIVITY = 0,		/* activity state of the seesion */

As noted in an earlier review, this change c/should have been made when 
the code was initially introduced.

>   	DT_STATE_BEGANON,		/* cpu BEGIN probe executed on */
>   	DT_STATE_ENDEDON,		/* cpu END probe executed on */
>   	DT_STATE_NUM_ELEMS
>   } dt_state_elem_t;
>   
> +/*
> + * DTrace activity state
> + */
> +typedef enum dt_activity {
> +	DT_ACTIVITY_INACTIVE = 0,	/* tracing not started */
> +	DT_ACTIVITY_ACTIVE,		/* tracing in progress */
> +	DT_ACTIVITY_DRAINING,		/* tracing being stopped */
> +	DT_ACTIVITY_STOPPED		/* tracing stopped */
> +} dt_activity_t;
> +

What is DT_ACTIVITY_STOPPED used for?  Another one of those "future 
patch" things?

>   #endif /* _DT_STATE_H */
> diff --git a/test/unittest/begin/tst.begin-first.d b/test/unittest/begin/tst.begin-first.d
> new file mode 100644
> index 00000000..aef57532
> --- /dev/null
> +++ b/test/unittest/begin/tst.begin-first.d
> @@ -0,0 +1,24 @@
> +/*
> + * Oracle Linux DTrace.
> + * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
> + * Licensed under the Universal Permissive License v 1.0 as shown at
> + * http://oss.oracle.com/licenses/upl.
> + */
> +
> +/*
> + * ASSERTION: BEGIN probe first before all other probes.
> + *
> + * SECTION: dtrace Provider
> + */
> +
> +#pragma D option quiet
> +
> +BEGIN
> +{
> +	exit(0);
> +}
> +
> +tick-200us
> +{
> +	exit(1);
> +}

This does not seem like much of a test.  I mean, I can see how we expect 
it to pass, but it might well have passed even without this patch.  In 
fact, yes, I just tried it without the patch and it passed.  Twice in a 
row.  Maybe stress it more (some test that runs many times or something) 
so that it seems more meaningful.

Also, shouldn't there be a corresponding END test? 
unittest/end/tst.end-last.* or so?



More information about the DTrace-devel mailing list