[DTrace-devel] [PATCH] Add support for aggpercpu option

Kris Van Hees kris.van.hees at oracle.com
Wed Jan 24 20:42:37 UTC 2024


On Tue, Nov 07, 2023 at 11:40:39PM -0500, eugene.loh at oracle.com wrote:
> 
> The aggpercpu option appears in the documentation.  Further, the
> current implementation has vestigial code that suggests the option
> was once supported.
> 
> On the other hand, its behavior is not described in any detail in
> the documentation.  Further, the option seems to have no effect on
> Solaris or with the legacy Linux implementation.  An easy workaround
> is to add cpu as a key to an aggregation, which makes the feature
> superfluous.
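> 
> For example (illustrative; not part of this patch), the workaround is
> just:
> 
> 	@c[cpu] = count();	/* cpu as an explicit aggregation key */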
> 
> Another challenge is that it's hard to know what behavior makes most
> sense if aggpercpu is combined with other features.  For quantize()
> output, should the same row values be used for all CPUs as for the
> overall aggregation?  What should the output format look like if
> aggpercpu is combined with a printa() that has multiple aggregations?
> 
> Just implement some reasonable version of aggpercpu support and leave
> intricate scenarios for the user to handle.
> 
> Signed-off-by: Eugene Loh <eugene.loh at oracle.com>

Some suggested changes below.

> ---
>  libdtrace/dt_aggregate.c            |  45 ++++++++-
>  libdtrace/dt_consume.c              |   9 ++
>  test/unittest/aggs/tst.aggpercpu.sh | 143 ++++++++++++++++++++++++++++
>  3 files changed, 192 insertions(+), 5 deletions(-)
>  create mode 100755 test/unittest/aggs/tst.aggpercpu.sh
> 
> diff --git a/libdtrace/dt_aggregate.c b/libdtrace/dt_aggregate.c
> index e6275fe3..3d4c9b4a 100644
> --- a/libdtrace/dt_aggregate.c
> +++ b/libdtrace/dt_aggregate.c
> @@ -462,19 +462,19 @@ dt_aggregate_clear_one(const dtrace_aggdata_t *agd, void *arg)
>  		*vals = INT64_MAX;
>  		if (agd->dtada_percpu)
>  			for (i = 0; i < max_cpus; i++)
> -				*((uint64_t*)agd->dtada_percpu[i]) = INT64_MAX;
> +				*((uint64_t*)&agd->dtada_percpu[i][rec->dtrd_offset]) = INT64_MAX;
>  		break;
>  	case DT_AGG_MAX:
>  		*vals = INT64_MIN;
>  		if (agd->dtada_percpu)
>  			for (i = 0; i < max_cpus; i++)
> -				*((uint64_t*)agd->dtada_percpu[i]) = INT64_MIN;
> +				*((uint64_t*)&agd->dtada_percpu[i][rec->dtrd_offset]) = INT64_MIN;
>  		break;
>  	default:
>  		memset(vals, 0, rec->dtrd_size);
>  		if (agd->dtada_percpu)
>  			for (i = 0; i < max_cpus; i++)
> -				memset(agd->dtada_percpu[i], 0, rec->dtrd_size);
> +				memset(&agd->dtada_percpu[i][rec->dtrd_offset], 0, rec->dtrd_size);
>  		break;
>  	}
>  
> @@ -543,6 +543,9 @@ dt_aggregate_snap_one(dtrace_hdl_t *dtp, int aggid, int cpu, const char *key,
>  		assert(aid != NULL);
>  		dt_agg_one_agg(aid, &agg->dtagd_drecs[DT_AGGDATA_RECORD],
>  			       agd->dtada_data, data);
> +		if (agd->dtada_percpu != NULL)
> +			dt_agg_one_agg(aid, &agg->dtagd_drecs[DT_AGGDATA_RECORD],
> +				       agd->dtada_percpu[cpu], data);
>  
>  		return 0;
>  
> @@ -578,8 +581,40 @@ hashnext:
>  
>  	memcpy(ptr, data, size);
>  	agd->dtada_data = ptr;
> +	if (dtp->dt_aggregate.dtat_flags & DTRACE_A_PERCPU) {
> +		int i, max_cpus = dtp->dt_conf.max_cpuid + 1;
> +		dtrace_recdesc_t	*rec = &agg->dtagd_drecs[DT_AGGDATA_RECORD];
> +		dtrace_actkind_t	act = rec->dtrd_action;
> +		uint32_t		siz = rec->dtrd_size;
> +
> +		agd->dtada_percpu = dt_alloc(dtp, max_cpus * sizeof(caddr_t));
> +		if (agd->dtada_percpu == NULL)
> +			return dt_set_errno(dtp, EDT_NOMEM);
> +
> +		for (i = 0; i < max_cpus; i++) {
> +			int64_t	*vals;
> +
> +			agd->dtada_percpu[i] = dt_alloc(dtp, size);
> +			if (agd->dtada_percpu[i] == NULL)
> +				return dt_set_errno(dtp, EDT_NOMEM);
> +
> +			vals = (int64_t *) &agd->dtada_percpu[i][rec->dtrd_offset];
> +			switch (act) {
> +			case DT_AGG_MIN:
> +				*vals = INT64_MAX;
> +				break;
> +			case DT_AGG_MAX:
> +				*vals = INT64_MIN;
> +				break;
> +			default:
> +				memset(vals, 0, siz);
> +				break;
> +			}
> +		}
> +		memcpy(agd->dtada_percpu[cpu], data, size);
> +	}

I wonder whether it might be better to just allocate everything in this loop
and then call dt_aggregate_clear_one() to do the actual initialization?  Or
to split the initialization code out of dt_aggregate_clear_one() and call it
both from there and from here?  The initialization involves enough
indirection and array indexing that I think factoring it out is warranted.
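Roughly something like the following (an untested sketch; dt_agg_init_datum()
is a made-up name):

	/*
	 * Initialize a single aggregation datum according to the
	 * aggregation kind: MIN starts at INT64_MAX, MAX at INT64_MIN,
	 * and everything else is zero-filled.
	 */
	static void
	dt_agg_init_datum(const dtrace_recdesc_t *rec, char *buf)
	{
		int64_t	*vals = (int64_t *)&buf[rec->dtrd_offset];

		switch (rec->dtrd_action) {
		case DT_AGG_MIN:
			*vals = INT64_MAX;
			break;
		case DT_AGG_MAX:
			*vals = INT64_MIN;
			break;
		default:
			memset(vals, 0, rec->dtrd_size);
			break;
		}
	}

Both dt_aggregate_clear_one() and the new per-CPU allocation loop could then
call it, e.g.:

	dt_agg_init_datum(rec, agd->dtada_percpu[i]);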

> -	/* Add the new entru to the hashtable. */
> +	/* Add the new entry to the hashtable. */
>  	if (agh->dtah_hash[ndx] != NULL)
>  		agh->dtah_hash[ndx]->dtahe_prev = h;
>  
> @@ -1564,7 +1599,7 @@ dtrace_aggregate_walk_joined(dtrace_hdl_t *dtp, dtrace_aggid_t *aggvars,
>  
>  		if ((zdata = dt_zalloc(dtp, zsize)) == NULL) {
>  			/*
> -			 * If we failed to allocated some zero-filled data, we
> +			 * If we failed to allocate some zero-filled data, we
>  			 * need to zero out the remaining dtada_data pointers
>  			 * to prevent the wrong data from being freed below.
>  			 */
> diff --git a/libdtrace/dt_consume.c b/libdtrace/dt_consume.c
> index a67c0f7b..1f2f137b 100644
> --- a/libdtrace/dt_consume.c
> +++ b/libdtrace/dt_consume.c
> @@ -1745,6 +1745,15 @@ dt_print_aggs(const dtrace_aggdata_t **aggsdata, int naggvars, void *arg)
>  		if (dt_print_datum(dtp, fp, rec, aggdata->dtada_data, normal,
>  				   agg->dtagd_sig) < 0)
>  			return DTRACE_AGGWALK_ERROR;
> +		if (aggdata->dtada_percpu != NULL) {
> +			int j, max_cpus = aggdata->dtada_hdl->dt_conf.max_cpuid + 1;
> +			for (j = 0; j < max_cpus; j++) {
> +				if (dt_printf(dtp, fp, "\nagg per cpu %d", aggdata->dtada_hdl->dt_conf.cpus[j].cpu_id) < 0)

I think this makes the output look rather weird.  How about:

	if (dt_printf(dtp, fp, "\n    [CPU %d]", aggdata->dtada_hdl->dt_conf.cpus[j].cpu_id) < 0)

I think that makes it clearer that this is the data entry for that specific
CPU, within the aggregation whose overall value was just printed above.  As
far as I can see, that still works OK with indexed aggregations.
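
With a count() aggregation, the output would then look something like this
(values and column widths illustrative):

	               42
	    [CPU 0]               13
	    [CPU 1]               14
	    [CPU 2]               15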

> +					return DTRACE_AGGWALK_ERROR;
> +				if (dt_print_datum(dtp, fp, rec, aggdata->dtada_percpu[j], normal, agg->dtagd_sig) < 0)
> +					return DTRACE_AGGWALK_ERROR;
> +			}
> +		}
>  
>  		if (dt_buffered_flush(dtp, NULL, rec, aggdata,
>  				      DTRACE_BUFDATA_AGGVAL) < 0)
> diff --git a/test/unittest/aggs/tst.aggpercpu.sh b/test/unittest/aggs/tst.aggpercpu.sh
> new file mode 100755
> index 00000000..e78d6103
> --- /dev/null
> +++ b/test/unittest/aggs/tst.aggpercpu.sh
> @@ -0,0 +1,143 @@
> +#!/bin/bash
> +#
> +# Oracle Linux DTrace.
> +# Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
> +# Licensed under the Universal Permissive License v 1.0 as shown at
> +# http://oss.oracle.com/licenses/upl.
> +#
> +
> +dtrace=$1
> +
> +DIRNAME="$tmpdir/aggpercpu.$$.$RANDOM"
> +mkdir -p $DIRNAME
> +cd $DIRNAME
> +
> +#
> +# Run a D script that fires on every CPU,
> +# forcing DTrace to aggregate results over all CPUs.
> +#
> +
> +$dtrace -xaggpercpu -qn '
> +    profile-600ms
> +    {
> +        printf("cpu %d\n", cpu);
> +        @xcnt = count();
> +        @xavg = avg(10 * cpu + 3);
> +        @xstd = stddev(20 * cpu + 8);
> +        @xmin = min(30 * cpu - 10);
> +        @xmax = max(40 * cpu - 15);
> +        @xsum = sum(50 * cpu);
> +    }
> +    tick-900ms
> +    {
> +        exit(0)
> +    }
> +' > dtrace.out
> +if [ $? -ne 0 ]; then
> +    echo DTrace failed
> +    cat dtrace.out
> +    exit 1
> +fi
> +
> +#
> +# Examine the results.
> +#
> +
> +awk '
> +    # The expected value for the aggregation is aggval.
> +    # The expected value on a CPU is (m * cpu + b).
> +    function check(label, aggval, m, b) {
> +        # Check the aggregation over all CPUs.
> +        getline;
> +        print "check:", $0;
> +        if ($1 != aggval) { printf("ERROR: %s, expect %d got %d\n", label, aggval, $1) };
> +
> +        # Check the per-CPU values.
> +        for (i = 1; i <= ncpu; i++) {
> +            getline;
> +            print "check:", $0;
> +            if (match($0, "^agg per cpu ") != 1 ||
> +                strtonum($4) != cpu[i] ||
> +                strtonum($5) != m * cpu[i] + b)
> +                printf("ERROR: %s, agg per cpu %d, line: %s\n", label, cpu[i], $0);
> +        }
> +    }
> +
> +    BEGIN {
> +        xcnt = xavg = xstm = xstd = xsum = 0;
> +        xmin = +1000000000;
> +        xmax = -1000000000;
> +        ncpu = 0;
> +    }
> +
> +    # The first "cpu" lines provide the inputs to the aggregations.
> +    /^cpu [0-9]*$/ {
> +        cpu[++ncpu] = strtonum($NF);
> +
> +        xcnt += 1;
> +
> +        x = 10 * $2 + 3;
> +        xavg += x;
> +
> +        x = 20 * $2 + 8;
> +        xstm += x;
> +        xstd += x * x;
> +
> +        x = 30 * $2 - 10;
> +        if (xmin > x) { xmin = x };
> +
> +        x = 40 * $2 - 15;
> +        if (xmax < x) { xmax = x };
> +
> +        x = 50 * $2;
> +        xsum += x;
> +
> +        next;
> +    }
> +
> +    # The remaining lines are the aggregation results.
> +    {
> +        # First we finish computing our estimates for avg and stddev.
> +        # (The other results require no further action.)
> +
> +        xavg /= xcnt;
> +
> +        xstm /= xcnt;
> +        xstd /= xcnt;
> +        xstd -= xstm * xstm;
> +        xstd = int(sqrt(xstd));
> +
> +        # Sort the cpus.
> +
> +        asort(cpu);
> +
> +        # Now read the results and compare.
> +
> +        check("cnt", xcnt,  0,   1);
> +        check("avg", xavg, 10,   3);
> +        check("std", xstd,  0,   0);
> +        check("min", xmin, 30, -10);
> +        check("max", xmax, 40, -15);
> +        check("sum", xsum, 50,   0);
> +
> +        printf("done\n");
> +    }
> +' dtrace.out > awk.out
> +if [ $? -ne 0 ]; then
> +    echo awk failed
> +    cat dtrace.out
> +    exit 1
> +fi
> +
> +if grep -q ERROR awk.out ; then
> +    echo ERROR found
> +    echo "=================================================="
> +    cat dtrace.out
> +    echo "=================================================="
> +    cat awk.out
> +    echo "=================================================="
> +    exit 1
> +fi
> +
> +echo success
> +exit 0
> -- 
> 2.18.4
> 
> 


