[DTrace-devel] [PATCH] Implement the io provider

Eugene Loh eugene.loh at oracle.com
Fri Dec 8 19:30:48 UTC 2023


There were already some comments from Nick.  Off-line, I send some test 
tweaks to Kris.  Sounds like Kris is exploring some other issues.  
Meanwhile, just a few other minor comments below:

On 11/30/23 10:59, Kris Van Hees via DTrace-devel wrote:
> diff --git a/libdtrace/dt_prov_io.c b/libdtrace/dt_prov_io.c
> @@ -0,0 +1,669 @@
> +/*
> + * Oracle Linux DTrace.
> + * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
> + * Licensed under the Universal Permissive License v 1.0 as shown at
> + * http://oss.oracle.com/licenses/upl.
> + *
> + * The 'io' SDT provider for DTrace-specific probes.
> + *
> + * These io::: probes mimic the instrumentation in legacy DTrace.
> + * Specifically, all probes have three probe args:
> + *     bufinfo_t *
> + *     devinfo_t *
> + *     fileinfo_t *
> + * but the trampoline really only needs to supply a bio pointer,
> + * from which translators will make the first two args.  The fileinfo_t *
> + * is 0 on Linux.
> + *
> + * The bio pointer is passed into some functions and is easily captured
> + * if we are using an fbt:::entry probe on such a function.  See
> + * DTRACE_IO() sites in the legacy implementation.

Yes, though the comment should perhaps also acknowledge that we are now 
also (heavily) using raw tps.

> + * For fbt:::entry probes on nfs_ and xfs_ functions, however, get only
> + * a hdr arg.  For them, we have a "fake struct bio", which the trampoline
> + * populates from the function's hdr arg.  See DTRACE_IO_NFS() and
> + * DTRACE_IO_XFS() sites in the legacy implementation.

Yes but again perhaps acknowledge rawtp.

> + * In some cases, we have to use fbt:::return probes, for which we no
> + * longer have the function's arguments.  So, these cases rely on the
> + * corresponding entry probe to cache the bio pointer (or populate the
> + * fake bio), which the return probe can then retrieve.

Kind of.  At this point, only one fbt:::return case remains, so might as 
well specify which one.

> + * Unfortunately, more than one function may be active at any time (on a
> + * CPU).  So the return function needs to know which bio pointer or fake
> + * bio to use.  These rules are used:
> + *
> + *   - For nfs_ and xfs_ functions, just use the fake bio.
> + *
> + *   - For most other functions, use the bio pointer for that
> + *     function.
> + */

Obsolete?  Just use TLS for the one remaining fbt:::return case?

> +/*
> + * For NFS events, we have to construct a fake struct bio, which we have to
> + * populate from the nfs_pgio_header argument the underlying probe provides.
> + */
> +static void io_nfs_args(dtrace_hdl_t *dtp, dt_irlist_t *dlp, uint_t exitlbl,
> +			const char *prb, const char *uprb)

The uprb isn't really the uprobe probe name anymore.  Might "funcname" 
or something make more sense?

> +{
> +	[...]
> +}
> +
> +/*
> + * For XFS events, we have to construct a fake struct bio, which we have to
> + * populate from the xfs_buf argument the underlying probe provides.
> + */
> +static void io_xfs_args(dtrace_hdl_t *dtp, dt_irlist_t *dlp, uint_t exitlbl)
> +{
> +	int	off;
> +	size_t	siz;
> +
> +	/*
> +	 * Determine the various sizes and offsets we want.
> +	 *
> +	 *     // Access these fields relative to &bio.
> +	 *     struct bio bio = {
> +	 *         .bi_opf = ...,
> +	 *         .bi_iter.bi_size = ...,      // struct bvec_iter bi_iter
> +	 *         .bi_iter.bi_sector = ...,
> +	 *         .bi_bdev = ...,		// -or- .bi_disk = ...
> +	 *                                      // and  .bi_partno = ...
> +	 *     };

I find that comment kind of confusing.  How about
+     *         .bi_bdev = ...,        // -or- .bi_disk = ...
+     *         .bi_partno = ...
One could also add "// if available" to bi_partno, but that is evident 
from the code.

> +	 *
> +	 *     // Access these fields relative to bp.
> +	 *     struct xfs_buf *bp;
> +	 *     ... = (bp)->b_flags;
> +	 *     ... = xfs_buf_daddr(bp);
> +	 *     ... = (bp)->b_length;
> +	 *     ... = (bp)->b_target->bt_bdev;   // struct xfs_buftarg *b_target;
> +	 */
> +
> +	[...]
> +
> +	/*
> +	 * bio.bi_bdev = (bp)->b_target->bt_bdev
> +	 */

Yes, or bio.bi_disk.

> +	emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_3, BPF_REG_7, DMST_ARG(0)));
> +	off = dt_cg_ctf_offsetof("struct xfs_buf", "b_target", &siz, 0);
> +	assert(siz == sizeof(void *));
> +	deref_reg3(dlp, exitlbl, off, 8, BPF_REG_3);

Okay but how about s/8/siz/?

> +	off = dt_cg_ctf_offsetof("struct xfs_buftarg", "bt_bdev", &siz, 0);
> +	deref_reg3(dlp, exitlbl, off, siz, BPF_REG_3);
> +	off = dt_cg_ctf_offsetof("struct bio", "bi_bdev", &siz, 1);
> +	if (off == -1)
> +		off = dt_cg_ctf_offsetof("struct bio", "bi_disk", &siz, 0);
> +	emit(dlp, BPF_STORE(BPF_width(siz), BPF_REG_6, off, BPF_REG_0));
> +
> +	[...]
> +}
> +
> +static int trampoline(dt_pcb_t *pcb, uint_t exitlbl)
> +{
> +	dtrace_hdl_t	*dtp = pcb->pcb_hdl;
> +	dt_irlist_t	*dlp = &pcb->pcb_ir;
> +	dt_probe_t	*prp = pcb->pcb_probe;
> +	dt_probe_t	*uprp = pcb->pcb_parent_probe;
> +
> +	/*
> +	 * The nfs_* and xfs_* probes do not pass a bio argument, and therefore
> +	 * we need to synthesize one.
> +	 */
> +	if (strcmp(uprp->desc->mod, "nfs") == 0) {
> +		/*
> +		 * If the underlying probe is an FBT probe, we pass function
> +		 * name.  Otherwise, pass probe name.
> +		 */
> +		io_nfs_args(dtp, dlp, exitlbl, prp->desc->prb,
> +			    strcmp(uprp->desc->prb, "entry") == 0
> +				? uprp->desc->fun : uprp->desc->prb);

Isn't it more direct to test strcmp(uprp->desc->prv, "fbt")?

> +		goto done;
> +	} else if (strcmp(uprp->desc->mod, "xfs") == 0) {
> +		io_xfs_args(dtp, dlp, exitlbl);
> +		goto done;
> +	}
> +
> +	[...]
> +
> +	/* Handle the non-XFS wait-done flavor. */
> +	if (strcmp(prp->desc->prb, "wait-done") == 0) {
> +		/*
> +		 * We need instrument submit_bio_wait(struct bio *):

s/need/need to/

> +		 *   - on entry, store the bio pointer into a TLS var
> +		 *   - on return, retrieve the pointer (and clear the TLS var)
> +		 * We use a TLS var to distinguish among possible concurrent
> +		 * submit_bio_wait() on the CPU.
> +		 */
> +		uint_t		varid = get_id_tls(dtp) - DIF_VAR_OTHER_UBASE;
> +		dt_ident_t	*fnp = dt_dlib_get_func(dtp, "dt_get_tvar");
> +		dt_ident_t	*zero_off = dt_dlib_get_var(dtp, "ZERO_OFF");
> +
> +		assert(fnp);
> +		assert(zero_off);
> +
> +		/* Return probe: retrieve the bio pointer from TLS. */
> +		if (strcmp(uprp->desc->prb, "return") == 0) {
> +			uint_t	Lnull = dt_irlist_label(dlp);
> +
> +			/* Call dt_get_tvar() for our private io-provider TLS variable. */
> +			emit(dlp,  BPF_MOV_IMM(BPF_REG_1, varid));
> +			emit(dlp,  BPF_MOV_IMM(BPF_REG_2, 0));
> +			emit(dlp,  BPF_MOV_IMM(BPF_REG_3, 0));
> +			emit(dlp,  BPF_LOAD(BPF_DW, BPF_REG_4, BPF_REG_9, DCTX_STRTAB));
> +			emite(dlp, BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -1), zero_off);
> +			emite(dlp, BPF_CALL_FUNC(fnp->di_id), fnp);
> +
> +			/* If we got a nonzero address, load from it. */
> +			emit(dlp,  BPF_BRANCH_IMM(BPF_JEQ, BPF_REG_0, 0, Lnull));
> +			emit(dlp,  BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_0, 0));
> +			emitl(dlp, Lnull,
> +				   BPF_NOP());
> +
> +			/* Store the retrieved value (bio pointer) as arg0. */
> +			emit(dlp, BPF_STORE(BPF_DW, BPF_REG_7, DMST_ARG(0), BPF_REG_0));
> +		}
> +
> +		/*
> +		 * Store (update) the TLS copy of the bio pointer:
> +		 *   - return: store 0 (clear TLS, freeing storage)
> +		 *   - entry: store arg0 (for the return probe to use later)
> +		 */
> +		emit(dlp,  BPF_MOV_IMM(BPF_REG_1, varid));
> +		emit(dlp,  BPF_MOV_IMM(BPF_REG_2, 1));
> +		if (strcmp(uprp->desc->prb, "return") == 0)
> +			emit(dlp, BPF_MOV_IMM(BPF_REG_3, 0));
> +		else
> +			emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_3, BPF_REG_7, DMST_ARG(0)));
> +		emit(dlp,  BPF_LOAD(BPF_DW, BPF_REG_4, BPF_REG_9, DCTX_STRTAB));
> +		emite(dlp, BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -1), zero_off);
> +		emite(dlp, BPF_CALL_FUNC(fnp->di_id), fnp);
> +
> +		/*
> +		 * At this point, the entry probe only has the TLS variable
> +		 * address.  It has yet actually to store arg0 there,
> +		 * provided the address is nonzero.
> +		 */
> +		if (strcmp(uprp->desc->prb, "entry") == 0) {
> +			uint_t Lnull = dt_irlist_label(dlp);
> +
> +			emit(dlp,  BPF_BRANCH_IMM(BPF_JEQ, BPF_REG_0, 0, Lnull));
> +			emit(dlp,  BPF_LOAD(BPF_DW, BPF_REG_1, BPF_REG_7, DMST_ARG(0)));
> +			emit(dlp,  BPF_STORE(BPF_DW, BPF_REG_0, 0, BPF_REG_1));
> +			emitl(dlp, Lnull,
> +				   BPF_NOP());
> +
> +			return 1;
> +		}
> +	}
> +
> +done:
> +	/*
> +	 * Note: DTrace does not currently support the use of fileinfo_t with
> +	 * io probes.  In Oracle Linux, there is no information about the file
> +	 * where the I/O request originated at the point where the io probes
> +	 * fire.
> +	 */
> +	emit(dlp, BPF_STORE_IMM(BPF_DW, BPF_REG_7, DMST_ARG(1), 0));
> +
> +	return 0;
> +}
> diff --git a/test/unittest/io/tst.local.sh b/test/unittest/io/tst.local.sh
> @@ -25,7 +24,7 @@ tempfile=`mktemp -u -p $iodir`
>   trap "umount $iodir; rmdir $iodir; rm -f $iodir.img" QUIT EXIT
>   
>   # create loopback file system
> -dd if=/dev/zero of=$iodir.img bs=1024 count=$((16*$nblocks)) status=none
> +dd if=/dev/zero of=$iodir.img bs=1024 count=$((300*$nblocks)) status=none
>   mkfs.$fstype $iodir.img > /dev/null
>   mkdir $iodir
>   test/triggers/io-mount-local.sh $iodir $fstype $fsoptions

Not a big deal, just curious:  would s/16/300/ warrant a touch of 
copyright year?

> diff --git a/test/unittest/io/tst.wait.sh b/test/unittest/io/tst.wait.sh
> @@ -22,7 +21,7 @@ tempfile=`mktemp -u -p $iodir`
>   trap "umount $iodir; rmdir $iodir; rm -f $iodir.img" QUIT EXIT
>   
>   # create loopback file system
> -dd if=/dev/zero of=$iodir.img bs=1024 count=$((16*$nblocks)) status=none
> +dd if=/dev/zero of=$iodir.img bs=1024 count=$((300*$nblocks)) status=none
>   mkfs.$fstype $iodir.img > /dev/null
>   mkdir $iodir
>   test/triggers/io-mount-local.sh $iodir $fstype $fsoptions

Ditto.



More information about the DTrace-devel mailing list