[DTrace-devel] [PATCH 1/2] Add support for the basename() action

Kris Van Hees kris.van.hees at oracle.com
Fri Nov 19 04:51:02 UTC 2021


On Thu, Nov 18, 2021 at 08:45:04PM -0500, Eugene Loh via DTrace-devel wrote:
> On 11/18/21 7:04 PM, Kris Van Hees wrote:
> 
> > On Tue, Nov 16, 2021 at 05:47:59PM -0500, eugene.loh--- via DTrace-devel wrote:
> > > From: Eugene Loh <eugene.loh at oracle.com>
> > > 
> > > Signed-off-by: Eugene Loh <eugene.loh at oracle.com>
> > > ---
> > >   bpf/Build                           |   1 +
> > >   bpf/basename.S                      | 139 ++++++++++++++++++++++++++++
> > >   libdtrace/dt_cg.c                   |  55 ++++++++++-
> > >   libdtrace/dt_dlibs.c                |   1 +
> > >   test/unittest/dif/basename.d        |   1 -
> > >   test/unittest/funcs/tst.basename0.d |  38 ++++++++
> > >   test/unittest/funcs/tst.basename0.r |  21 +++++
> > >   7 files changed, 254 insertions(+), 2 deletions(-)
> > >   create mode 100644 bpf/basename.S
> > >   create mode 100644 test/unittest/funcs/tst.basename0.d
> > >   create mode 100644 test/unittest/funcs/tst.basename0.r
> > > 
> > > diff --git a/bpf/Build b/bpf/Build
> > > index d8279411..e66e8dc9 100644
> > > --- a/bpf/Build
> > > +++ b/bpf/Build
> > > @@ -23,6 +23,7 @@ bpf_dlib_DIR := $(current-dir)
> > >   bpf_dlib_SRCDEPS = $(objdir)/include/.dir.stamp
> > >   bpf_dlib_SOURCES = \
> > >   	agg_lqbin.c agg_qbin.c \
> > > +	basename.S \
> > >   	get_bvar.c \
> > >   	get_tvar.c set_tvar.c \
> > >   	index.S \
> > > diff --git a/bpf/basename.S b/bpf/basename.S
> > > new file mode 100644
> > > index 00000000..983457ad
> > > --- /dev/null
> > > +++ b/bpf/basename.S
> > > @@ -0,0 +1,139 @@
> > > +// SPDX-License-Identifier: GPL-2.0
> > > +/*
> > > + * Copyright (c) 2021, Oracle and/or its affiliates. All rights reserved.
> > > + */
> > > +
> > > +#define DT_STRLEN_BYTES		2
> > > +
> > > +#define BPF_FUNC_probe_read	4
> > > +#define BPF_FUNC_probe_read_str	45
> > > +
> > > +/*
> > > + * void dt_basename(char *src, char *dst);
> > > + */
> > > +	.text
> > > +	.align	4
> > > +	.global	dt_basename
> > > +	.type	dt_basename, @function
> > > +dt_basename :
> > > +#define SRC %r6
> > > +#define DST %r7
> > > +#define BGN %r8
> > > +#define END %r9
> > > +
> > > +	/* store copies of input arguments */
> > > +	mov	SRC, %r1
> > > +	mov	DST, %r2
> > > +
> > > +	/* ignore the string-length prefix */
> > > +	add	SRC, DT_STRLEN_BYTES
> > > +
> > > +	/* r0 = bpf_probe_read_str(dst, STRSZ + 1, src) */
> > > +	mov	%r1, DST
> > > +	lddw	%r2, STRSZ
> > > +	add	%r2, 1
> > > +	mov	%r3, SRC
> > > +	call	BPF_FUNC_probe_read_str
> > > +
> > > +	/* if (r0 s<= 1) goto Ldot */
> > > +	jsle	%r0, 1, .Ldot
> > > +
> > > +	/* end = r0 - 1 */
> > > +	mov	END, %r0
> > > +	sub	END, 1
> > > +
> > > +	/*
> > > +	 * Loop over end, backing it up to find a non-'/' char.
> > > +	 */
> > > +.Lend:
> > > +	/* end-- */
> > > +	sub	END, 1
> > > +	/* if (end s< 0) goto Lslash */
> > > +	jslt	END, 0, .Lslash
> > > +	/* if (src[end] == '/') goto Lend */
> > > +	mov	%r1, SRC
> > > +	add	%r1, END
> > > +	ldxb	%r1, [%r1+0]
> > > +	and	%r1, 0xff
> > > +	jeq	%r1, '/', .Lend
> > > +
> > > +	/*
> > > +	 * Loop over bgn, backing it up to find a '/' char.
> > > +	 */
> > > +	/* bgn = end */
> > > +	mov	BGN, END
> > > +.Lbgn:
> > > +	/* bgn-- */
> > > +	sub	BGN, 1
> > > +	/* if (bgn s< 0) goto Lcopy */
> > > +	jslt	BGN, 0, .Lcopy
> > > +	/* if (src[bgn] != '/') goto Lbgn */
> > > +	mov	%r1, SRC
> > > +	add	%r1, BGN
> > > +	ldxb	%r1, [%r1+0]
> > > +	and	%r1, 0xff
> > > +	jne	%r1, '/', .Lbgn
> > > +
> > > +.Lcopy:
> > > +	/*
> > > +	 * The output string is a copy of the designated substring.
> > > +	 */
> > > +	/* end -= bgn (and help the BPF verifier) */
> > > +	sub	END, BGN
> > Err, this kind of breaks your whole scheme of using defines for register
> > assignments because you just changed the meaning of END here.  It was the
> > index of the end and now it becomes a byte count.
> 
> I see what you're saying, but I don't think this any more severe than saying
> "len means STRSZ+1; oh but now it means strlen."  The pseudocode is short
> enough (a few lines) that I think it's not hard to follow.  Would it seem
> better to you to call it "len"?  So, "len" originally means from the
> beginning of the old string but then it means from the beginning of the new
> string?

Well, no, because now you replace the issue with END meaning two things simply
with having a LEN that means two things.  And if that is OK then I think that
the entire purpose of these defines is lost.  Of course, I find the use of
defines here confusing as is anyway, because you invariably need to know what
the macro is defined as to know what instructions to use, so the value seems to
be a bit lost on me.  E.g. SRC is a register here but in other functions you
have defined it as a [%fp + OFFSET] style stack location.  But the specifics
of those definitions affects whether you can do a MOV on it or need to use LD
to get the value or ST to set the value.  But anyway, that is just that.

Point here is though that if you are going to use symbolic names, they should
mean somrthing and be consistent.  I think you'd be better served then to also
set a define CNT %r9 and when you are done with END and need to start working
with a byte count, start using CNT.  And since you already get to deal in other
code with needing to know what something is defined as, you simply have the
same here where you need to know that END and CNT use the same register.

> > > +	jsge	END, 0, 1
> > > +	mov	END, 0
> > > +	/* bgn++ */
> > > +	add	BGN, 1
> > > +	/* bpf_probe_read(dst + DT_STRLEN_BYTES, end, &src[bgn]) */
> > > +	mov	%r1, DST
> > > +	add	%r1, DT_STRLEN_BYTES
> > > +	mov	%r2, END
> > > +	mov	%r3, SRC
> > > +	add	%r3, BGN
> > > +	call	BPF_FUNC_probe_read
> > Why not use probe_read_str() and let that one place the terminating \0 for you
> > so you don't have to do it here.
> 
> Yup.  Good point.  I think that didn't dawn on me until I saw you doing that
> in your patches.
> 
> > > +	/* dst[DT_STRLEN_BYTES + end] = '\0\ */
> > > +	mov	%r1, DST
> > > +	add	%r1, END
> > > +	stb	[%r1+DT_STRLEN_BYTES], 0
> > > +
> > > +	/* dt_strlen_store(end, dst) */
> > > +	mov	%r1, END
> > > +	mov	%r2, DST
> > > +	call	dt_strlen_store
> > > +
> > > +	/* return */
> > > +	exit
> > > +
> > > +.Ldot:
> > > +	/*
> > > +	 * The output string is simply ".".
> > > +	 */
> > > +	/* dt_strlen_store(1, dst) */
> > > +	mov	%r1, 1
> > > +	mov	%r2, DST
> > > +	call	dt_strlen_store
> > > +	/* dst[DT_STRLEN_BYTES+0] = '.' */
> > > +	stb	[DST+DT_STRLEN_BYTES], '.'
> > > +	/* dst[DT_STRLEN_BYTES+1] = '\0' */
> > > +	stb	[DST+(DT_STRLEN_BYTES+1)], '\0'
> > > +	exit
> > > +
> > > +.Lslash:
> > > +	/*
> > > +	 * The output string is simply "/".
> > > +	 */
> > > +	/* dt_strlen_store(1, dst) */
> > > +	mov	%r1, 1
> > > +	mov	%r2, DST
> > > +	call	dt_strlen_store
> > > +	/* dst[DT_STRLEN_BYTES+0] = '/' */
> > > +	stb	[DST+DT_STRLEN_BYTES], '/'
> > > +	/* dst[DT_STRLEN_BYTES+1] = '\0' */
> > > +	stb	[DST+(DT_STRLEN_BYTES+1)], '\0'
> > > +	exit
> > I would just make these two labels store the '.' or '/' character respectively,
> > and assign 1 to whatever register for the string length, and then jump to a
> > label right before dt_strlen_store() in the main code above.
> 
> Okay.
> 
> > > +#undef SRC
> > > +#undef DST
> > > +#undef BGN
> > > +#undef END
> > > +	.size	dt_basename, .-dt_basename
> > > diff --git a/libdtrace/dt_cg.c b/libdtrace/dt_cg.c
> > > index 537e6706..ac7ea603 100644
> > > --- a/libdtrace/dt_cg.c
> > > +++ b/libdtrace/dt_cg.c
> > > @@ -3294,6 +3294,59 @@ dt_cg_array_op(dt_node_t *dnp, dt_irlist_t *dlp, dt_regset_t *drp)
> > >   	emit(dlp, BPF_ALU64_REG((dnp->dn_flags & DT_NF_SIGNED) ? BPF_ARSH : BPF_RSH, dnp->dn_reg, n));
> > >   }
> > > +/*
> > > + * This function implements dt_cg_subr_foo() that takes a string
> > > + * input and produces a tstring output.
> > This comment doesn't really make sense.  It does not implement st_cg_subr_foo()
> > at all.  I know what you are getting at but I think it makes a lot more sense
> > to just spell it out.  This function is a helper function for subroutines that
> > take a string argument and return the result of applying a given BPF function
> > (passed by name) on it.  Whatever way you want to word that to reflect this.
> 
> Gotcha.
> 
> > > + */
> > > +static void
> > > +dt_cg_subr_str_of_str(dt_node_t *dnp, dt_irlist_t *dlp, dt_regset_t *drp,
> > > +		      const char *fname)
> > Perhaps name it something more specific like dt_cg_subr_path_helper() or
> > something like that, since it is only used for pathname-related subroutines
> > anyway?
> 
> Well, there may be other cases we might want to support.  E.g. strjoin(),
> strstr(), and strtok() are all string(string, string). Okay, bad example
> since they set things up so differently.  Maybe strchr() and strrchr(),
> which are both string(string, char).  I don't know if we will want any more
> of these helper functions, but I was trying to name in an extensible way
> based on type signature. E.g., what if someday, D added a reversestring()
> subroutine? Unlikely, but that's the idea.

Since there aren't any other cases, and there is no plan to have other cases
at this point, I think it is fine to go with a name that covers the need here.
If later other cases are added, a rename may be in order, but I wouldn't want
to speculate on that right now.  Nothing is cast in stone, so you might as well
enjoy the fact that there is a consistency in where it is used and name it
accordingly.

> > > +{
> > > +	dt_ident_t	*idp;
> > > +	dt_node_t	*str = dnp->dn_args;
> > > +
> > > +	TRACE_REGSET("    subr-str_of_str:Begin");
> > > +	dt_cg_node(str, dlp, drp);
> > > +	dt_cg_check_notnull(dlp, drp, str->dn_reg);
> > > +
> > > +	/*
> > > +	 * The result needs be be a temporary string, so we request one.
> > > +	 */
> > > +	dnp->dn_reg = dt_regset_alloc(drp);
> > > +	if (dnp->dn_reg == -1)
> > > +		longjmp(yypcb->pcb_jmpbuf, EDT_NOREG);
> > > +	dt_cg_tstring_alloc(yypcb, dnp);
> > > +
> > > +	/* function call */
> > > +	if (dt_regset_xalloc_args(drp) == -1)
> > > +		longjmp(yypcb->pcb_jmpbuf, EDT_NOREG);
> > > +
> > > +	emit(dlp, BPF_MOV_REG(BPF_REG_1, str->dn_reg));
> > > +	dt_regset_free(drp, str->dn_reg);
> > > +	if (str->dn_tstring)
> > > +		dt_cg_tstring_free(yypcb, str);
> > > +
> > > +	emit(dlp,  BPF_LOAD(BPF_DW, dnp->dn_reg, BPF_REG_FP, DT_STK_DCTX));
> > > +	emit(dlp,  BPF_LOAD(BPF_DW, dnp->dn_reg, dnp->dn_reg, DCTX_MEM));
> > > +	emit(dlp,  BPF_ALU64_IMM(BPF_ADD, dnp->dn_reg, dnp->dn_tstring->dn_value));
> > The above 3 instructions should move to right after the dt_cg_tstring_alloc()
> > above, to keep them together.  That reflects the way it is done with most other
> > cases, and will make it easier when (if) we actually merge all that together
> > to avoid the frequent duplication.
> 
> Yipes.  And even worse, (assuming I'm thinking about this clearly) if
> dnp->dn_reg were, say, %r5, we would spill a bogus value at xalloc_args(),
> compute dnp->dn_reg, fill with the bogus value at free_args(), and get
> garbage.  Good catch.  Thanks.  Will fix.
> 
> > > +	emit(dlp,  BPF_MOV_REG(BPF_REG_2, dnp->dn_reg));
> > > +
> > > +	dt_regset_xalloc(drp, BPF_REG_0);
> > > +	idp = dt_dlib_get_func(yypcb->pcb_hdl, fname);
> > > +	assert(idp != NULL);
> > > +	emite(dlp,  BPF_CALL_FUNC(idp->di_id), idp);
> > > +	dt_regset_free_args(drp);
> > > +	dt_regset_free(drp, BPF_REG_0);
> > > +
> > > +	TRACE_REGSET("    subr-str_of_str:End  ");
> > > +}
> > > +
> > > +static void
> > > +dt_cg_subr_basename(dt_node_t *dnp, dt_irlist_t *dlp, dt_regset_t *drp)
> > > +{
> > > +	dt_cg_subr_str_of_str(dnp, dlp, drp, "dt_basename");
> > > +}
> > > +
> > >   /*
> > >    * Get and return a new speculation ID.  These are unallocated entries in the
> > >    * specs map, obtained by calling dt_speculation().  Return zero if none is
> > > @@ -3986,7 +4039,7 @@ static dt_cg_subr_f *_dt_cg_subr[DIF_SUBR_MAX + 1] = {
> > >   	[DIF_SUBR_DDI_PATHNAME]		= NULL,
> > >   	[DIF_SUBR_STRJOIN]		= dt_cg_subr_strjoin,
> > >   	[DIF_SUBR_LLTOSTR]		= &dt_cg_subr_lltostr,
> > > -	[DIF_SUBR_BASENAME]		= NULL,
> > > +	[DIF_SUBR_BASENAME]		= &dt_cg_subr_basename,
> > >   	[DIF_SUBR_DIRNAME]		= NULL,
> > >   	[DIF_SUBR_CLEANPATH]		= NULL,
> > >   	[DIF_SUBR_STRCHR]		= &dt_cg_subr_strchr,
> > > diff --git a/libdtrace/dt_dlibs.c b/libdtrace/dt_dlibs.c
> > > index 50acc0dc..771eba64 100644
> > > --- a/libdtrace/dt_dlibs.c
> > > +++ b/libdtrace/dt_dlibs.c
> > > @@ -54,6 +54,7 @@ static const dt_ident_t		dt_bpf_symbols[] = {
> > >   	/* BPF library (external) functions */
> > >   	DT_BPF_SYMBOL(dt_agg_lqbin, DT_IDENT_SYMBOL),
> > >   	DT_BPF_SYMBOL(dt_agg_qbin, DT_IDENT_SYMBOL),
> > > +	DT_BPF_SYMBOL(dt_basename, DT_IDENT_SYMBOL),
> > >   	DT_BPF_SYMBOL(dt_error, DT_IDENT_SYMBOL),
> > >   	DT_BPF_SYMBOL(dt_get_bvar, DT_IDENT_SYMBOL),
> > >   	DT_BPF_SYMBOL(dt_get_string, DT_IDENT_SYMBOL),
> > > diff --git a/test/unittest/dif/basename.d b/test/unittest/dif/basename.d
> > > index a4c50d7e..5e60b96d 100644
> > > --- a/test/unittest/dif/basename.d
> > > +++ b/test/unittest/dif/basename.d
> > > @@ -1,4 +1,3 @@
> > > -/* @@xfail: dtv2 */
> > >   BEGIN
> > >   {
> > >   	exit(basename("/foo/bar") == "bar" ? 0 : 1);
> > > diff --git a/test/unittest/funcs/tst.basename0.d b/test/unittest/funcs/tst.basename0.d
> > > new file mode 100644
> > > index 00000000..0f758996
> > > --- /dev/null
> > > +++ b/test/unittest/funcs/tst.basename0.d
> > > @@ -0,0 +1,38 @@
> > > +/*
> > > + * Oracle Linux DTrace.
> > > + * Copyright (c) 2021, Oracle and/or its affiliates. All rights reserved.
> > > + * Licensed under the Universal Permissive License v 1.0 as shown at
> > > + * http://oss.oracle.com/licenses/upl.
> > > + */
> > > +
> > > +#pragma D option quiet
> > > +
> > > +BEGIN
> > > +{
> > > +	printf("|%s|\n", basename("/foo/bar/baz"));
> > > +	printf("|%s|\n", basename("/foo/bar///baz/"));
> > > +	printf("|%s|\n", basename("/foo/bar/baz/"));
> > > +	printf("|%s|\n", basename("/foo/bar/baz//"));
> > > +	printf("|%s|\n", basename("/foo/bar/baz/."));
> > > +	printf("|%s|\n", basename("/foo/bar/baz/./"));
> > > +	printf("|%s|\n", basename("/foo/bar/baz/.//"));
> > > +	printf("|%s|\n", basename("foo/bar/baz/"));
> > > +	printf("|%s|\n", basename("/"));
> > > +	printf("|%s|\n", basename("./"));
> > > +	printf("|%s|\n", basename("//"));
> > > +	printf("|%s|\n", basename("/."));
> > > +	printf("|%s|\n", basename("/./"));
> > > +	printf("|%s|\n", basename("/./."));
> > > +	printf("|%s|\n", basename("/.//"));
> > > +	printf("|%s|\n", basename("."));
> > > +	printf("|%s|\n", basename("f"));
> > > +	printf("|%s|\n", basename("f/"));
> > > +	printf("|%s|\n", basename("/////"));
> > > +	printf("|%s|\n", basename(""));
> > > +	exit(0);
> > > +}
> > > +
> > > +ERROR
> > > +{
> > > +	exit(1);
> > > +}
> > > diff --git a/test/unittest/funcs/tst.basename0.r b/test/unittest/funcs/tst.basename0.r
> > > new file mode 100644
> > > index 00000000..37a96547
> > > --- /dev/null
> > > +++ b/test/unittest/funcs/tst.basename0.r
> > > @@ -0,0 +1,21 @@
> > > +|baz|
> > > +|baz|
> > > +|baz|
> > > +|baz|
> > > +|.|
> > > +|.|
> > > +|.|
> > > +|baz|
> > > +|/|
> > > +|.|
> > > +|/|
> > > +|.|
> > > +|.|
> > > +|.|
> > > +|.|
> > > +|.|
> > > +|f|
> > > +|f|
> > > +|/|
> > > +|.|
> > > +
> > > -- 
> > > 2.18.4
> > > 
> > > 
> > > _______________________________________________
> > > DTrace-devel mailing list
> > > DTrace-devel at oss.oracle.com
> > > https://oss.oracle.com/mailman/listinfo/dtrace-devel
> 
> _______________________________________________
> DTrace-devel mailing list
> DTrace-devel at oss.oracle.com
> https://oss.oracle.com/mailman/listinfo/dtrace-devel



More information about the DTrace-devel mailing list