[DTrace-devel] [PATCH] Record CPU that BEGIN and END probes run on in the 'state' BPF map

Fri Sep 4 22:12:25 PDT 2020

On 09/02/2020 01:56 PM, Kris Van Hees wrote:

> In order to ensure that the BEGIN probe trace data is processed first,
I would think no comma, but do as you like.
> and END probe trace data is processed last, we need to know what CPU
> buffers their data is stored in.  This patch adds code to record the
> CPU on which the BEGIN and END probes run.  The 'state' BPF map has
> entries (DT_STATE_BEGANON and DT_STATE_ENDEDON) to store this data.
>
> Signed-off-by: Kris Van Hees <kris.van.hees at oracle.com>
> ---
>   libdtrace/dt_cg.c          |  3 +--
>   libdtrace/dt_impl.h        |  1 +
>   libdtrace/dt_prov_dtrace.c | 54 +++++++++++++++++++++++++++++++++++++-
>   3 files changed, 55 insertions(+), 3 deletions(-)
>
> diff --git a/libdtrace/dt_cg.c b/libdtrace/dt_cg.c
> index d289e02b..46da8f33 100644
> --- a/libdtrace/dt_cg.c
> +++ b/libdtrace/dt_cg.c
> @@ -21,7 +21,6 @@
>   #include <dt_bpf_builtins.h>
>   #include <bpf_asm.h>
>   
> -static void dt_cg_xsetx(dt_irlist_t *, dt_ident_t *, uint_t, int, uint64_t);
>   static void dt_cg_node(dt_node_t *, dt_irlist_t *, dt_regset_t *);
>   
>   /*
> @@ -1105,7 +1104,7 @@ dt_cg_membinfo(ctf_file_t *fp, ctf_id_t type, const char *s, ctf_membinfo_t *mp)
>   	return (fp);
>   }
>   
> -static void
> +void
>   dt_cg_xsetx(dt_irlist_t *dlp, dt_ident_t *idp, uint_t lbl, int reg, uint64_t x)
>   {
>   	struct bpf_insn instr[2] = { BPF_LDDW(reg, x) };
> diff --git a/libdtrace/dt_impl.h b/libdtrace/dt_impl.h
> index f062aa9b..b7d53dd9 100644
> --- a/libdtrace/dt_impl.h
> +++ b/libdtrace/dt_impl.h
> @@ -776,6 +776,7 @@ extern dtrace_difo_t *dt_program_construct(dtrace_hdl_t *dtp,
>   extern void dt_pragma(dt_node_t *);
>   extern int dt_reduce(dtrace_hdl_t *, dt_version_t);
>   extern void dt_cg(dt_pcb_t *, dt_node_t *);
> +extern void dt_cg_xsetx(dt_irlist_t *, dt_ident_t *, uint_t, int, uint64_t);
>   extern dt_irnode_t *dt_cg_node_alloc(uint_t, struct bpf_insn);
>   extern void dt_cg_tramp_prologue_act(dt_pcb_t *pcb, uint_t lbl_exit,
>   				     dt_activity_t act);
> diff --git a/libdtrace/dt_prov_dtrace.c b/libdtrace/dt_prov_dtrace.c
> index d9051229..af3c4242 100644
> --- a/libdtrace/dt_prov_dtrace.c
> +++ b/libdtrace/dt_prov_dtrace.c
> @@ -74,25 +74,36 @@ static void trampoline(dt_pcb_t *pcb)
>   	uint_t		lbl_exit = dt_irlist_label(dlp);
>   	dt_activity_t	act;
>   	int		adv_act;
> +	uint32_t	key = 0;
>   
>   	/*
>   	 * The BEGIN probe should only run when the activity state is INACTIVE.
>   	 * At the end of the trampoline (after executing any clauses), the
>   	 * state must be advanced to the next state (INACTIVE -> ACTIVE).
>   	 *
> +	 * When the BEGIN probe is triggered, we need to record the CPU it runs
> +	 * on in state[DT_STATE_BEGANON] to ensure that we know which trace
> +	 * data buffer to process first.
> +	 *
>   	 * The END probe should only run when the activity state is ACTIVE.
>   	 * At the end of the trampoline (after executing any clauses), the
>   	 * state must be advanced to the next state (ACTIVE -> DRAINING).
>   	 *
> -	 * Any other probe requires the state tp be ACTIVE, and does not change

I'm guessing this typo is already fixed in an earlier patch? Anyhow, 
guess it doesn't matter and okay to fix it here.

> +	 * When the END probe is triggered, we need to record the CPU it runs
> +	 * on in state[DT_STATE_ENDEDON] to ensure that we know which trace
> +	 * data buffer to process last.

Makes no sense to me to cut and paste this paragraph.  Just duplicate 
text that's taking up space and more stuff to maintain.  I would combine 
the DT_STATE_BEGANON and DT_STATE_ENDEDON paragraphs. But... whatever 
you want.

> +	 *
> +	 * Any other probe requires the state to be ACTIVE, and does not change
>   	 * the state.
>   	 */
>   	if (strcmp(pcb->pcb_probe->desc->prb, "BEGIN") == 0) {
>   		act = DT_ACTIVITY_INACTIVE;
>   		adv_act = 1;
> +		key = DT_STATE_BEGANON;
>   	} else if (strcmp(pcb->pcb_probe->desc->prb, "END") == 0) {
>   		act = DT_ACTIVITY_ACTIVE;
>   		adv_act = 1;
> +		key = DT_STATE_ENDEDON;
>   	} else {
>   		act = DT_ACTIVITY_ACTIVE;
>   		adv_act = 0;
> @@ -100,6 +111,47 @@ static void trampoline(dt_pcb_t *pcb)
>   
>   	dt_cg_tramp_prologue_act(pcb, lbl_exit, act);
>   
> +	if (key) {
> +		/*
> +		 *     key = DT_STATE_BEGANON;
> +		 *			// stw [%fp + DT_STK_SPILL(0)],
> +		 *			//			DT_STATE_BEGANON

The DT_STATE_BEGANON comments are wrong.  We are not setting key; that's 
already been done.  And this code generation is for both BEGAN and 
ENDED.  All we're doing is storing key.

> +		 *     val = bpf_get_smp_processor_id();
> +		 *			// call bpf_get_smp_processor_id
> +		 *			// stw [%fp + DT_STK_SPILL(1)], %r0
> +		 *     bpf_map_update_elem(state, &key, &val, BPF_ANY);
> +		 *			// lddw %r1, &state
> +		 *			// mov %r2, %fp
> +		 *			// add %r2, DT_STK_SPILL(0)
> +		 *			// mov %r3, %fp
> +		 *			// add %r3, DT_STK_SPILL(1)
> +		 *			// mov %r4, BPF_ANY
> +		 *			// call bpf_map_update_elem
> +		 */
> +		dt_ident_t	*state = dt_dlib_get_map(pcb->pcb_hdl, "state");
> +
> +		instr = BPF_STORE_IMM(BPF_W, BPF_REG_FP, DT_STK_SPILL(0), key);
> +		dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
> +		instr = BPF_CALL_HELPER(BPF_FUNC_get_smp_processor_id);
> +		dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
> +		instr = BPF_STORE(BPF_W, BPF_REG_FP, DT_STK_SPILL(1),
> +				  BPF_REG_0);
> +		dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
> +		dt_cg_xsetx(dlp, state, DT_LBL_NONE, BPF_REG_1, state->di_id);
> +		instr = BPF_MOV_REG(BPF_REG_2, BPF_REG_FP);
> +		dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
> +		instr = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, DT_STK_SPILL(0));
> +		dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
> +		instr = BPF_MOV_REG(BPF_REG_3, BPF_REG_FP);
> +		dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
> +		instr = BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, DT_STK_SPILL(1));
> +		dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
> +		instr = BPF_MOV_IMM(BPF_REG_4, BPF_ANY);
> +		dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
> +		instr = BPF_CALL_HELPER(BPF_FUNC_map_update_elem);
> +		dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
> +	}
> +
>   	/*
>   	 * We cannot assume anything about the state of any registers so set up
>   	 * the ones we need:

Should we be checking return values from those functions?  Meh. Life is 
short.

There should be some test here, but that also means pairing this patch 
with something that makes the change observable and testable. So, hold 
off on this patch until it's usable?