[DTrace-devel] [PATCH v3] Add support for strtok() subroutine

Sun Dec 5 01:02:27 UTC 2021

On Sat, Dec 04, 2021 at 12:16:03PM -0500, Eugene Loh via DTrace-devel wrote:
> On 12/3/21 8:41 PM, Kris Van Hees wrote:

<<stuff removed for brevity>>

> > My point is that (after the modification to put strtok_init in the state):
> > +		emit(dlp,  BPF_LOAD(BPF_DW, BPF_REG_1, BPF_REG_FP, DT_STK_DCTX));
> > +		emit(dlp,  BPF_LOAD(BPF_DW, BPF_REG_1, BPF_REG_1, DCTX_MEM));
> > +		emit(dlp,  BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, DMEM_STRTOK));
> > 
> > calculates the pointer to the strtok state, and you use that 3 more times in
> > the dt_cg_subr_strtok() (actually, 5 times, but 2 instances are in the two legs
> > of a comditional).
> I do not understand this counting.  We generate this pointer once (albeit on
> two code paths.... depends on str NULL or not) for "internal state", and
> then once for "function call".  So there is only one "reuse" opportunity. 
> You seem to describe something else.

Err, really?  Let's take a look at the function:

static void
dt_cg_subr_strtok(dt_node_t *dnp, dt_irlist_t *dlp, dt_regset_t *drp)
{
	dtrace_hdl_t	*dtp = yypcb->pcb_hdl;
	dt_node_t	*str = dnp->dn_args;
	dt_node_t	*del = str->dn_list;
	dt_ident_t	*idp = dt_dlib_get_func(yypcb->pcb_hdl, "dt_strtok");
	uint64_t	off;

	assert(idp != NULL);

	TRACE_REGSET("    subr-strtok:Begin");

	/*
	 * Start with a static check for a NULL string.
	 * That is, strtok(NULL, foo) has a special meaning:
	 * continue parsing the previous string.  In contrast,
	 * strtok(str, foo) (even if str==NULL) means to parse str.
	 */
	if (str->dn_op != DT_TOK_INT || str->dn_value != 0) {
		/* string is present:  copy it to internal state */
		dt_cg_node(str, dlp, drp);
		dt_cg_check_notnull(dlp, drp, str->dn_reg);

		if (dt_regset_xalloc_args(drp) == -1)
			longjmp(yypcb->pcb_jmpbuf, EDT_NOREG);

		/* the 8-byte prefix is the offset, which we initialize to 0 */
		emit(dlp,  BPF_LOAD(BPF_DW, BPF_REG_1, BPF_REG_FP, DT_STK_DCTX));
		emit(dlp,  BPF_LOAD(BPF_DW, BPF_REG_1, BPF_REG_1, DCTX_MEM));

# First (A) instance of determining dctx->mem (main branch)

		emit(dlp,  BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, DMEM_STRTOK));
		emit(dlp,  BPF_STORE_IMM(BPF_DW, BPF_REG_1, 0, 0));
		emit(dlp,  BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8));
		emit(dlp,  BPF_MOV_IMM(BPF_REG_2, dtp->dt_options[DTRACEOPT_STRSIZE] + 1));
		emit(dlp,  BPF_MOV_REG(BPF_REG_3, str->dn_reg));
		dt_regset_free(drp, str->dn_reg);
		if (str->dn_tstring)
			dt_cg_tstring_free(yypcb, str);
		emit(dlp,  BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, DT_STRLEN_BYTES));
		dt_regset_xalloc(drp, BPF_REG_0);
		emit(dlp,  BPF_CALL_HELPER(BPF_FUNC_probe_read_str));
		dt_regset_free(drp, BPF_REG_0);
		dt_regset_free_args(drp);
	} else {
		/* NULL string:  error if internal state is uninitialized */
		emit(dlp,  BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_FP, DT_STK_DCTX));
		emit(dlp,  BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_0, DCTX_MEM));

# First (B) instance of determining dctx->mem

		emit(dlp,  BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_0, DMEM_STRTOK));
		emit(dlp,  BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1));
		dt_cg_check_notnull(dlp, drp, BPF_REG_0);
	}

	/* get delimiters */
	dt_cg_node(del, dlp, drp);
	dt_cg_check_notnull(dlp, drp, del->dn_reg);

	/* allocate temporary string for result */
	dnp->dn_reg = dt_regset_alloc(drp);
	if (dnp->dn_reg == -1)
		longjmp(yypcb->pcb_jmpbuf, EDT_NOREG);
	dt_cg_tstring_alloc(yypcb, dnp);
	emit(dlp,  BPF_LOAD(BPF_DW, dnp->dn_reg, BPF_REG_FP, DT_STK_DCTX));
	emit(dlp,  BPF_LOAD(BPF_DW, dnp->dn_reg, dnp->dn_reg, DCTX_MEM));

# Second instance of determining dctx->mem

	emit(dlp,  BPF_ALU64_IMM(BPF_ADD, dnp->dn_reg, dnp->dn_tstring->dn_value));

	/* allocate temporary string for internal purposes */
	off = dt_cg_tstring_xalloc(yypcb);

	/* function call */
	if (dt_regset_xalloc_args(drp) == -1)
		longjmp(yypcb->pcb_jmpbuf, EDT_NOREG);

	emit(dlp,  BPF_MOV_REG(BPF_REG_1, dnp->dn_reg));
	emit(dlp,  BPF_LOAD(BPF_DW, BPF_REG_2, BPF_REG_FP, DT_STK_DCTX));
	emit(dlp,  BPF_LOAD(BPF_DW, BPF_REG_2, BPF_REG_2, DCTX_MEM));

# Third instance of determining dctx->mem

	emit(dlp,  BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, DMEM_STRTOK));
	emit(dlp,  BPF_MOV_REG(BPF_REG_3, del->dn_reg));
	dt_regset_free(drp, del->dn_reg);
	if (del->dn_tstring)
		dt_cg_tstring_free(yypcb, del);

	emit(dlp,  BPF_LOAD(BPF_DW, BPF_REG_4, BPF_REG_FP, DT_STK_DCTX));
	emit(dlp,  BPF_LOAD(BPF_DW, BPF_REG_4, BPF_REG_4, DCTX_MEM));

# Fourth instance of determining dctx->mem

	emit(dlp,  BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, off));

	dt_regset_xalloc(drp, BPF_REG_0);
	emite(dlp, BPF_CALL_FUNC(idp->di_id), idp);
	dt_regset_free_args(drp);
	dt_cg_tstring_xfree(yypcb, off);
	emit(dlp,  BPF_MOV_REG(dnp->dn_reg, BPF_REG_0));
	dt_regset_free(drp, BPF_REG_0);

	TRACE_REGSET("    subr-strtok:End  ");
}

So yes, I count 5 instances of determining dctx->mem, and for every invocation
of dt_cg_subr_strtok() 4 will actually be generated.  How else can you count
this?

> > While not a massive savings, placing this value in a reg
> > and using that (incl. as a base to apply an offset to to get to the temp
> > string data in the state) makes the code easier to read and also makes it
> > easier to maintain if we were to make changes in he dctx / mstate setup.  Since
> > this is all code generated from within a single function, doing this kind of
> > optimization is useful.  Again, not an optimization in terms of performance
> > but it does help with code clarity, maintainability, and (in the end) if quite
> > a few invocations of strtok() are used, it reduces the overall program length.
> The reduction in instructions is infinitesimal at best.  Let's say you need
> the pointer N times.  The current method requires 3*N instructions.  Reuse
> requires 3+N instructions.  In our case, N=2. (Once, albeit on 2 code paths,
> depending on str NULL or not.  Once for the function call.)  So, 6
> instructions versus 5.  And if there is any spill/fill, your proposed
> version costs more instructions. Anyhow, we then call strtok.S, which dwarfs
> all of this.

N = 4, so 3 * 4 = 12 vs 3 + 4 = 7

But as I alrady stated in my previous email, the savings on instruction count
are not the issue here (aside from reducing the program length a little).  The
primary gain is less duplication of instruction sequences in a place where it
can really be reduced easily which makes the code easier to maintain.

> The current approach strikes me as more clear.  The generated instructions
> are local to where the pointer is needed.  To put this in a register would
> mean another register declared, alloced, and freed.
> > > But let me know if you feel strongly about this.
> We disagree on which approach is clearer and how many instructions are at
> stake.  It appears we do not even agree on how many times the pointer is
> used.  Ultimately, though, either approach is possible and our users will
> never notice the difference (unless reuse triggers some unforeseen register
> pressure).  I think the "regeneration" approach is cleaner, but again just
> let me know.  It would be nice if we agreed on the rationale -- or even on
> how many times the pointer is being used -- but we do not need to.

I don't think I have been obscure about the rationale, and I have re-stated it
above just to be absolutely clear.  I am unclear where you did not see the four
instances of this code sequencee in our code, but I hope the annotated code
above clears that up beyond doubt.

	Kris