[DTrace-devel] [PATCH v3] Add support for strtok() subroutine
Kris Van Hees
kris.van.hees at oracle.com
Sun Dec 5 01:02:27 UTC 2021
On Sat, Dec 04, 2021 at 12:16:03PM -0500, Eugene Loh via DTrace-devel wrote:
> On 12/3/21 8:41 PM, Kris Van Hees wrote:
<<stuff removed for brevity>>
> > My point is that (after the modification to put strtok_init in the state):
> > + emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_1, BPF_REG_FP, DT_STK_DCTX));
> > + emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_1, BPF_REG_1, DCTX_MEM));
> > + emit(dlp, BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, DMEM_STRTOK));
> >
> > calculates the pointer to the strtok state, and you use that 3 more times in
> > the dt_cg_subr_strtok() (actually, 5 times, but 2 instances are in the two legs
> > of a comditional).
> I do not understand this counting. We generate this pointer once (albeit on
> two code paths.... depends on str NULL or not) for "internal state", and
> then once for "function call". So there is only one "reuse" opportunity.
> You seem to describe something else.
Err, really? Let's take a look at the function:
static void
dt_cg_subr_strtok(dt_node_t *dnp, dt_irlist_t *dlp, dt_regset_t *drp)
{
dtrace_hdl_t *dtp = yypcb->pcb_hdl;
dt_node_t *str = dnp->dn_args;
dt_node_t *del = str->dn_list;
dt_ident_t *idp = dt_dlib_get_func(yypcb->pcb_hdl, "dt_strtok");
uint64_t off;
assert(idp != NULL);
TRACE_REGSET(" subr-strtok:Begin");
/*
* Start with a static check for a NULL string.
* That is, strtok(NULL, foo) has a special meaning:
* continue parsing the previous string. In contrast,
* strtok(str, foo) (even if str==NULL) means to parse str.
*/
if (str->dn_op != DT_TOK_INT || str->dn_value != 0) {
/* string is present: copy it to internal state */
dt_cg_node(str, dlp, drp);
dt_cg_check_notnull(dlp, drp, str->dn_reg);
if (dt_regset_xalloc_args(drp) == -1)
longjmp(yypcb->pcb_jmpbuf, EDT_NOREG);
/* the 8-byte prefix is the offset, which we initialize to 0 */
emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_1, BPF_REG_FP, DT_STK_DCTX));
emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_1, BPF_REG_1, DCTX_MEM));
# First (A) instance of determining dctx->mem (main branch)
emit(dlp, BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, DMEM_STRTOK));
emit(dlp, BPF_STORE_IMM(BPF_DW, BPF_REG_1, 0, 0));
emit(dlp, BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8));
emit(dlp, BPF_MOV_IMM(BPF_REG_2, dtp->dt_options[DTRACEOPT_STRSIZE] + 1));
emit(dlp, BPF_MOV_REG(BPF_REG_3, str->dn_reg));
dt_regset_free(drp, str->dn_reg);
if (str->dn_tstring)
dt_cg_tstring_free(yypcb, str);
emit(dlp, BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, DT_STRLEN_BYTES));
dt_regset_xalloc(drp, BPF_REG_0);
emit(dlp, BPF_CALL_HELPER(BPF_FUNC_probe_read_str));
dt_regset_free(drp, BPF_REG_0);
dt_regset_free_args(drp);
} else {
/* NULL string: error if internal state is uninitialized */
emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_FP, DT_STK_DCTX));
emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_0, DCTX_MEM));
# First (B) instance of determining dctx->mem
emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_0, DMEM_STRTOK));
emit(dlp, BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1));
dt_cg_check_notnull(dlp, drp, BPF_REG_0);
}
/* get delimiters */
dt_cg_node(del, dlp, drp);
dt_cg_check_notnull(dlp, drp, del->dn_reg);
/* allocate temporary string for result */
dnp->dn_reg = dt_regset_alloc(drp);
if (dnp->dn_reg == -1)
longjmp(yypcb->pcb_jmpbuf, EDT_NOREG);
dt_cg_tstring_alloc(yypcb, dnp);
emit(dlp, BPF_LOAD(BPF_DW, dnp->dn_reg, BPF_REG_FP, DT_STK_DCTX));
emit(dlp, BPF_LOAD(BPF_DW, dnp->dn_reg, dnp->dn_reg, DCTX_MEM));
# Second instance of determining dctx->mem
emit(dlp, BPF_ALU64_IMM(BPF_ADD, dnp->dn_reg, dnp->dn_tstring->dn_value));
/* allocate temporary string for internal purposes */
off = dt_cg_tstring_xalloc(yypcb);
/* function call */
if (dt_regset_xalloc_args(drp) == -1)
longjmp(yypcb->pcb_jmpbuf, EDT_NOREG);
emit(dlp, BPF_MOV_REG(BPF_REG_1, dnp->dn_reg));
emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_2, BPF_REG_FP, DT_STK_DCTX));
emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_2, BPF_REG_2, DCTX_MEM));
# Third instance of determining dctx->mem
emit(dlp, BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, DMEM_STRTOK));
emit(dlp, BPF_MOV_REG(BPF_REG_3, del->dn_reg));
dt_regset_free(drp, del->dn_reg);
if (del->dn_tstring)
dt_cg_tstring_free(yypcb, del);
emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_4, BPF_REG_FP, DT_STK_DCTX));
emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_4, BPF_REG_4, DCTX_MEM));
# Fourth instance of determining dctx->mem
emit(dlp, BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, off));
dt_regset_xalloc(drp, BPF_REG_0);
emite(dlp, BPF_CALL_FUNC(idp->di_id), idp);
dt_regset_free_args(drp);
dt_cg_tstring_xfree(yypcb, off);
emit(dlp, BPF_MOV_REG(dnp->dn_reg, BPF_REG_0));
dt_regset_free(drp, BPF_REG_0);
TRACE_REGSET(" subr-strtok:End ");
}
So yes, I count 5 instances of determining dctx->mem, and for every invocation
of dt_cg_subr_strtok() 4 will actually be generated. How else can you count
this?
> > While not a massive savings, placing this value in a reg
> > and using that (incl. as a base to apply an offset to to get to the temp
> > string data in the state) makes the code easier to read and also makes it
> > easier to maintain if we were to make changes in he dctx / mstate setup. Since
> > this is all code generated from within a single function, doing this kind of
> > optimization is useful. Again, not an optimization in terms of performance
> > but it does help with code clarity, maintainability, and (in the end) if quite
> > a few invocations of strtok() are used, it reduces the overall program length.
> The reduction in instructions is infinitesimal at best. Let's say you need
> the pointer N times. The current method requires 3*N instructions. Reuse
> requires 3+N instructions. In our case, N=2. (Once, albeit on 2 code paths,
> depending on str NULL or not. Once for the function call.) So, 6
> instructions versus 5. And if there is any spill/fill, your proposed
> version costs more instructions. Anyhow, we then call strtok.S, which dwarfs
> all of this.
N = 4, so 3 * 4 = 12 vs 3 + 4 = 7
But as I alrady stated in my previous email, the savings on instruction count
are not the issue here (aside from reducing the program length a little). The
primary gain is less duplication of instruction sequences in a place where it
can really be reduced easily which makes the code easier to maintain.
> The current approach strikes me as more clear. The generated instructions
> are local to where the pointer is needed. To put this in a register would
> mean another register declared, alloced, and freed.
> > > But let me know if you feel strongly about this.
> We disagree on which approach is clearer and how many instructions are at
> stake. It appears we do not even agree on how many times the pointer is
> used. Ultimately, though, either approach is possible and our users will
> never notice the difference (unless reuse triggers some unforeseen register
> pressure). I think the "regeneration" approach is cleaner, but again just
> let me know. It would be nice if we agreed on the rationale -- or even on
> how many times the pointer is being used -- but we do not need to.
I don't think I have been obscure about the rationale, and I have re-stated it
above just to be absolutely clear. I am unclear where you did not see the four
instances of this code sequencee in our code, but I hope the annotated code
above clears that up beyond doubt.
Kris
More information about the DTrace-devel
mailing list