[DTrace-devel] [PATCH 5/6 v3] Implement associative array support

Eugene Loh eugene.loh at oracle.com
Fri Mar 11 21:41:06 UTC 2022


On 3/11/22 11:19 AM, Kris Van Hees via DTrace-devel wrote:

> v4 in the works due to a issue with register spilling
Okay.  Meanwhile here is some other feedback.
> On Fri, Mar 11, 2022 at 01:26:30AM -0500, Kris Van Hees via DTrace-devel wrote:
>> diff --git a/bpf/get_dvar.c b/bpf/get_dvar.c
>> @@ -11,8 +11,24 @@
>>   #endif
>>   
>>   extern struct bpf_map_def dvars;
>> +extern struct bpf_map_def tuples;
>>   extern uint64_t NCPUS;
>>   
>> +/*
>> + * Dynamic variables are identified using a unique 64-bit key.  Three diferent
s/diferent/different/
>> + * categories of dynamic variables are supported in DTrace:
>> + *
>> + * 1. Thread-local storage (TLS) variables:
>> + *	dvar key = TLS key (highest bit = 0)
>> + * 2. Global associative array elements:
>> + *	dvar key = &tuples[var id, tuple, (uint64_t)0] (highest bit = 1)
>> + * 2. TLS associative array elements:
>> + *	dvar key = &tuples[var id, tuple, TLS key] (highest bit = 1)
>> + *
>> + * Given that the TLS key can never be 0, uniqueness of the dvar key is
>> + * guaranteed in this scheme.
>> + */
>> +
>>   noinline uint64_t dt_tlskey(uint32_t id)
>>   {
>>   	uint64_t	key;
>> @@ -25,7 +41,7 @@ noinline uint64_t dt_tlskey(uint32_t id)
>>   		key += (uint32_t)(uint64_t)&NCPUS;
>>   
>>   	key++;
>> -	key = (key << 32) | id;
>> +	key = ((key & 0x7fffffff) << 32) | id;
I do not understand the & 0x7fffffff.  Are you throwing a bit away?  Or 
is this operation unnecessary?  Or is it simply there for the BPF verifier?
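If it is just about the key scheme described in the new comment block,
maybe spell it out here.  My guess (please confirm) is that the mask
keeps bit 63 of the final key clear, so a TLS key can never collide
with a tuple address (highest bit = 1).  If so, a comment would help:

	key = ((key & 0x7fffffff) << 32) | id;	/* bit 63 stays 0 */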
>>   
>>   	return key;
>>   }
>> @@ -81,3 +97,59 @@ noinline void *dt_get_tvar(uint32_t id, uint64_t store, uint64_t nval)
>>   {
>>   	return dt_get_dvar(dt_tlskey(id), store, nval);
>>   }
>> +
>> +noinline void *dt_get_assoc(uint32_t id, const char *tuple,
>> +			    uint64_t store, uint64_t nval)
>> +{
>> +	uint64_t	*valp;
>> +	uint64_t	val;
>> +	uint64_t	dflt_val = 0;
>> +
>> +	/*
>> +	 * Place the variable ID at the beginning of the tuple.
>> +	 */
>> +	*(uint32_t *)tuple = id;
>> +
>> +	/*
>> +	 * Look for the tuple in the tuples map.
>> +	 */
>> +	valp = bpf_map_lookup_elem(&tuples, tuple);
>> +	if (valp == 0) {
>> +		/*
>> +		 * Not found.  If we are not storing a value (i.e. performing a
>> +		 * load), return the default value (0).  If we are trying to
>> +		 * delete an associative array element, we don't have to do
>> +		 * anything because it does not exist anyway.
>> +		 */
>> +		if (!store || !nval)
>> +			return 0;
>> +
>> +		/*
>> +		 * Create the tuple and use the address of the value as the
>> +		 * actual value.
>> +		 */
>> +		if (bpf_map_update_elem(&tuples, tuple, &dflt_val, BPF_ANY) < 0)
>> +			return 0;
>> +		valp = bpf_map_lookup_elem(&tuples, tuple);
>> +		if (valp == 0)
>> +			return 0;
>> +		*valp = (uint64_t)valp;
>> +		if (bpf_map_update_elem(&tuples, tuple, valp, BPF_ANY) < 0)
>> +			return 0;
I do not understand this.  I'm pretty sure the problem is with me and 
not the code.  We get valp.  Then...

*)  We set *valp=valp.  Why?  Can't we simply use the pointer valp as a 
key to the dvars map?  That is, we eventually look a tuple up in the 
tuples map to get the pointer to some value.  Can we not use that 
pointer as a key to the dvars map rather than having to look up the 
value?  In fact, some of the comments suggest just that.

*)  What does that bpf_map_update_elem(.... valp...) call do?  I would 
have thought that it simply updates the value for "tuple" with the value 
pointed to by valp, but at this point valp already points to the 
destination location.  So the function call would seem to be a no-op.  
(But I just tried commenting it out and got BPF verifier failures.  
So I'm missing something.)
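
To make the question concrete, here is the simplification I have in
mind (untested sketch of the dt_get_assoc() body; I may well be
missing a BPF verifier constraint that forces the extra update):

	valp = bpf_map_lookup_elem(&tuples, tuple);
	if (valp == 0) {
		if (!store || !nval)
			return 0;

		if (bpf_map_update_elem(&tuples, tuple, &dflt_val, BPF_ANY) < 0)
			return 0;
		valp = bpf_map_lookup_elem(&tuples, tuple);
		if (valp == 0)
			return 0;
	}

	/* Use the address of the value slot directly as the dvar key. */
	val = (uint64_t)valp;

	if (store && !nval)
		bpf_map_delete_elem(&tuples, tuple);

	return dt_get_dvar(val, store, nval);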
>> +		val = *valp;
>> +	} else {
>> +		/*
>> +		 * Record the value (used as key into the dvars map), and if we
>> +		 * are storing a zero-value (deleting the element), delete the
>> +		 * tuple.  The associated dynamic variable will be delete by
s/delete/deleted/
>> +		 * the dt_get_dvar() call.
>> +		 */
>> +		val = *valp;
>> +
>> +		if (store && !nval)
>> +			bpf_map_delete_elem(&tuples, tuple);
>> +	}
>> +	
>> +	return dt_get_dvar(val, store, nval);
>> +}
>> diff --git a/libdtrace/dt_cg.c b/libdtrace/dt_cg.c
>> index 1cb06ef7..7ccf63c4 100644
>> --- a/libdtrace/dt_cg.c
>> +++ b/libdtrace/dt_cg.c
>> @@ -2325,6 +2325,225 @@ dt_cg_store(dt_node_t *src, dt_irlist_t *dlp, dt_regset_t *drp, dt_node_t *dst)
>>   	}
>>   }
>>   
>> +/*
>> + * Generate code for a typecast or for argument promotion from the type of the
>> + * actual to the type of the formal.  We need to generate code for casts when
>> + * a scalar type is being narrowed or changing signed-ness.  We first shift the
>> + * desired bits high (losing excess bits if narrowing) and then shift them down
>> + * using logical shift (unsigned result) or arithmetic shift (signed result).
>> + */
>> +static void
>> +dt_cg_typecast(const dt_node_t *src, const dt_node_t *dst,
>> +    dt_irlist_t *dlp, dt_regset_t *drp)
>> +{
>> +	size_t srcsize;
>> +	size_t dstsize;
>> +	int n;
>> +
>> +	/* If the destination type is '@' (any type) we need not cast. */
>> +	if (dst->dn_ctfp == NULL && dst->dn_type == CTF_ERR)
>> +		return;
>> +
>> +	srcsize = dt_node_type_size(src);
>> +	dstsize = dt_node_type_size(dst);
>> +
>> +	if (dstsize < srcsize)
>> +		n = sizeof(uint64_t) * NBBY - dstsize * NBBY;
>> +	else
>> +		n = sizeof(uint64_t) * NBBY - srcsize * NBBY;
>> +
>> +	if (dt_node_is_scalar(dst) && n != 0 && (dstsize < srcsize ||
>> +	    (src->dn_flags & DT_NF_SIGNED) ^ (dst->dn_flags & DT_NF_SIGNED))) {
>> +		emit(dlp, BPF_MOV_REG(dst->dn_reg, src->dn_reg));
>> +		emit(dlp, BPF_ALU64_IMM(BPF_LSH, dst->dn_reg, n));
>> +		emit(dlp, BPF_ALU64_IMM((dst->dn_flags & DT_NF_SIGNED) ? BPF_ARSH : BPF_RSH, dst->dn_reg, n));
>> +	}
>> +}
>> +
>> +/*
>> + * Generate code to push the specified argument list on to the tuple stack.
>> + * We use this routine for handling the index tuple for associative arrays.
>> + * We must first generate code for all subexpressions because any subexpression
>> + * could itself require the use of the tuple assembly area and we only provide
>> + * one.
>> + *
>> + * Since the number of tuple components is unknown, we do not want to waste
>> + * registers on storing the subexpression values.  So, instead, we store the
>> + * subexpression values on the stack.
>> + *
>> + * Once code for all subexpressions has been generated, we assemble the tuple.
>> + *
>> + * Note that we leave space at the beginning of the tuple for a uint32_t value,
>> + * and at the end space for a uint64_t value.
>> + */
>> +static void
>> +dt_cg_arglist(dt_ident_t *idp, dt_node_t *args, dt_irlist_t *dlp,
>> +	      dt_regset_t *drp)
>> +{
>> +	dtrace_hdl_t		*dtp = yypcb->pcb_hdl;
>> +	const dt_idsig_t	*isp = idp->di_data;
>> +	dt_node_t		*dnp;
>> +	dt_ident_t		*tupsz = dt_dlib_get_var(dtp, "TUPSZ");
>> +	int			i;
>> +	int			treg, areg;
>> +	uint_t			tuplesize = sizeof(uint32_t);

It'd be nice to have some comments explaining the difference between 
tupsz and tuplesize.  And/or rename the variables so the distinction is 
clearer.  E.g. tupsz_reloc and tupsz_cur or something like that.  
Probably it makes sense first to write the comments and then to choose 
the new names.
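
Concretely, perhaps something like this (comments are guesses; I'm
assuming TUPSZ is a constant patched in at dlib relocation time once
the maximum tuple size is known):

	/* max tuple size; relocated against the TUPSZ dlib constant */
	dt_ident_t	*tupsz_reloc = dt_dlib_get_var(dtp, "TUPSZ");
	/* running byte count of the tuple being assembled */
	uint_t		tupsz_cur = sizeof(uint32_t);	/* var-id prefix */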

>> +	size_t			strsize = dtp->dt_options[DTRACEOPT_STRSIZE];

Declare and define strsize inside the one code block that uses it.
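That is:

	} else if (dt_node_is_string(dnp)) {
		size_t	strsize = dtp->dt_options[DTRACEOPT_STRSIZE];
		uint_t	lbl_valid = dt_irlist_label(dlp);
		...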

>> +	TRACE_REGSET("      arglist: Begin");
A few more comments to break up this big function might be helpful -- at 
least to relieve some eyestrain.  Here, how about:
/* Compute tuple args and store on stack */
>> +	for (dnp = args, i = 0; dnp != NULL; dnp = dnp->dn_list, i++) {
>> +		dt_cg_node(dnp, dlp, drp);
>> +		dt_regset_xalloc(drp, BPF_REG_0);
>> +		emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_FP, DT_STK_SP));
>> +		emit(dlp, BPF_STORE(BPF_DW, BPF_REG_0, 0, dnp->dn_reg));
>> +		dt_regset_free(drp, dnp->dn_reg);
>> +		emit(dlp, BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, -DT_STK_SLOT_SZ));
>> +		emit(dlp, BPF_STORE(BPF_DW, BPF_REG_FP, DT_STK_SP, BPF_REG_0));
>> +		dt_regset_free(drp, BPF_REG_0);
>> +	}
>> +
>> +	if (i > dtp->dt_conf.dtc_diftupregs)
>> +		longjmp(yypcb->pcb_jmpbuf, EDT_NOTUPREG);

Wouldn't it make more sense to check i inside that loop?
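
E.g. (sketch):

	for (dnp = args, i = 0; dnp != NULL; dnp = dnp->dn_list, i++) {
		if (i >= dtp->dt_conf.dtc_diftupregs)
			longjmp(yypcb->pcb_jmpbuf, EDT_NOTUPREG);

		dt_cg_node(dnp, dlp, drp);
		...
	}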

>> +	TRACE_REGSET("      arglist: Stack");

Here maybe "/* Prepare treg */".  Again, not an earth-shattering comment 
but something to give structure to the big function, relieve eyestrain, 
and complement the comments already there.

>> +	if ((treg = dt_regset_alloc(drp)) == -1)
>> +		longjmp(yypcb->pcb_jmpbuf, EDT_NOREG);
>> +
>> +	emit(dlp, BPF_LOAD(BPF_DW, treg, BPF_REG_FP, DT_STK_DCTX));
>> +	emit(dlp, BPF_LOAD(BPF_DW, treg, treg, DCTX_MEM));
>> +
>> +	/*
>> +	 * We need to clear the tuple assembly area in case the previous tuple
>> +	 * was larger than the one we will construct, because otherwise we end
>> +	 * up with trailing garbage.
>> +	 */
Yeah.  Seems a shame to use up BPF map space for this purpose, but it's 
good enough for now.  Another alternative could be to have a block 
of 0s that's only strsize big and only zero out string areas if/when 
they pop up.  Not needed for the current patch, I suppose.
>> +	if (dt_regset_xalloc_args(drp) == -1)
>> +		longjmp(yypcb->pcb_jmpbuf, EDT_NOREG);
>> +
>> +	emit(dlp,  BPF_MOV_REG(BPF_REG_1, treg));
>> +	emit(dlp,  BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, DMEM_TUPLE(dtp)));
>> +	emite(dlp, BPF_MOV_IMM(BPF_REG_2, -1), tupsz);
>> +	emit(dlp,  BPF_MOV_REG(BPF_REG_3, BPF_REG_1));
>> +	emite(dlp, BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -1), tupsz);
>> +	dt_regset_xalloc(drp, BPF_REG_0);
>> +	emit(dlp,  BPF_CALL_HELPER(BPF_FUNC_probe_read));
>> +	dt_regset_free_args(drp);
>> +	dt_regset_free(drp, BPF_REG_0);
>> +
>> +	emit(dlp, BPF_ALU64_IMM(BPF_ADD, treg, DMEM_TUPLE(dtp) + sizeof(uint32_t)));
>> +
>> +	if ((areg = dt_regset_alloc(drp)) == -1)
>> +		longjmp(yypcb->pcb_jmpbuf, EDT_NOREG);
Here could be /* Append args to the tuple assembly area */.  This seems 
to me like a better place for that one comment that's currently inside 
the loop.
>> +	for (dnp = args, i = 0; dnp != NULL; dnp = dnp->dn_list, i++) {
>> +		dtrace_diftype_t	t;
>> +		size_t			size;
>> +
>> +		dt_node_diftype(yypcb->pcb_hdl, dnp, &t);
>> +		size = t.dtdt_size;
>> +
>> +		/* Append the value to the tuple assembly area. */
>> +		if (t.dtdt_size == 0)
>> +			continue;
Instead of t.dtdt_size, you can just use size.
>> +		emit(dlp, BPF_LOAD(BPF_DW, areg, BPF_REG_FP, DT_STK_SP));
>> +		emit(dlp, BPF_ALU64_IMM(BPF_ADD, areg, DT_STK_SLOT_SZ));
>> +		emit(dlp, BPF_STORE(BPF_DW, BPF_REG_FP, DT_STK_SP, areg));
>> +		emit(dlp, BPF_LOAD(BPF_DW, areg, areg, 0));
>> +
>> +		isp->dis_args[i].dn_reg = areg;
>> +		dt_cg_typecast(dnp, &isp->dis_args[i], dlp, drp);
>> +		isp->dis_args[i].dn_reg = -1;
>> +
>> +		if (dt_node_is_scalar(dnp) || dt_node_is_float(dnp)) {
>> +			assert(size > 0 && size <= 8 &&
>> +			       (size & (size - 1)) == 0);
No big deal but I think we're up to about ten instances of this 
assert(...) check in this file now.  At some point maybe it'd make sense 
to define a macro.
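Something like this (the name is just a strawman):

	/* Sizes we can load/store directly: powers of 2, up to 8 bytes. */
	#define ASSERT_LDST_SIZE(size) \
		assert((size) > 0 && (size) <= 8 && \
		       ((size) & ((size) - 1)) == 0)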
>> +			emit(dlp,  BPF_STORE(ldstw[size], treg, 0, areg));
>> +			emit(dlp,  BPF_ALU64_IMM(BPF_ADD, treg, size));
>> +
>> +			tuplesize += size;
>> +		} else if (dt_node_is_string(dnp)) {
>> +			uint_t	lbl_valid = dt_irlist_label(dlp);
>> +
>> +			if (dt_regset_xalloc_args(drp) == -1)
>> +				longjmp(yypcb->pcb_jmpbuf, EDT_NOREG);
>> +
>> +			emit(dlp,  BPF_MOV_REG(BPF_REG_1, treg));
>> +			emit(dlp,  BPF_MOV_IMM(BPF_REG_2, strsize + 1));
>> +			emit(dlp,  BPF_MOV_REG(BPF_REG_3, areg));
>> +			dt_cg_tstring_free(yypcb, dnp);
>> +			dt_regset_xalloc(drp, BPF_REG_0);
>> +			emit(dlp,  BPF_CALL_HELPER(BPF_FUNC_probe_read_str));
>> +			dt_regset_free_args(drp);
>> +			emit(dlp,  BPF_BRANCH_IMM(BPF_JSGE, BPF_REG_0, 0, lbl_valid));
>> +			dt_cg_probe_error(yypcb, -1, DTRACEFLT_BADADDR, 0);
>> +			emitl(dlp, lbl_valid,
>> +				   BPF_ALU64_REG(BPF_ADD, treg, BPF_REG_0));
>> +			dt_regset_free(drp, BPF_REG_0);

There seem to be two choices here for how far to advance in the tuple 
assembly area when a string is being written.  One is the number of 
chars written.  Another is the standard string size. Either choice is 
the same as far as memory usage goes, making sure there are no garbage 
bytes, etc.  But if you advance by the standard string size, you can just 
advance treg and tuplesize together.  Specifically, you would not need to 
advance treg at all but could just store tuple args to [%treg + 
tuplesize].  I think that makes the code a little simpler and, if most 
of your tuple args are scalars, it would mean you're emitting fewer BPF 
instructions. Further, you wouldn't need to reset treg at the end.
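
In other words (untested), the scalar case could become

		emit(dlp,  BPF_STORE(ldstw[size], treg, tuplesize, areg));
		tuplesize += size;

and the string case could pass treg + tuplesize to the helper in
BPF_REG_1 and then simply do

		tuplesize += strsize + 1;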

>> +			tuplesize += size + 1;
>> +		} else if (t.dtdt_flags & DIF_TF_BYREF) {
>> +			uint_t	lbl_valid = dt_irlist_label(dlp);
>> +
>> +			if (dt_regset_xalloc_args(drp) == -1)
>> +				longjmp(yypcb->pcb_jmpbuf, EDT_NOREG);
>> +
>> +			emit(dlp,  BPF_MOV_REG(BPF_REG_1, treg));
>> +			emit(dlp,  BPF_MOV_IMM(BPF_REG_2, size));
>> +			emit(dlp,  BPF_MOV_REG(BPF_REG_3, areg));
>> +			dt_regset_xalloc(drp, BPF_REG_0);
>> +			emit(dlp,  BPF_CALL_HELPER(BPF_FUNC_probe_read));
>> +			dt_regset_free_args(drp);
>> +			emit(dlp,  BPF_BRANCH_IMM(BPF_JEQ, BPF_REG_0, 0, lbl_valid));
>> +			dt_cg_probe_error(yypcb, -1, DTRACEFLT_BADADDR, 0);
>> +			emitl(dlp, lbl_valid,
>> +				   BPF_ALU64_IMM(BPF_ADD, treg, size));
>> +			dt_regset_free(drp, BPF_REG_0);
>> +
>> +			tuplesize += size;
>> +		} else
>> +			assert(0);	/* We shouldn't be able to get here. */
>> +	}
>> +
>> +	dt_regset_free(drp, areg);
>> +
>> +	TRACE_REGSET("      arglist: Tuple");
>> +
>> +	/* Add room for an optional TLS key (or 0). */
>> +	tuplesize += sizeof(uint64_t);
Everywhere above, you first fill in the tuple space and then update 
tuplesize.  For consistency, might as well do the same thing here. That 
is, move the tuplesize+= statement to after the write that it represents.
>> +	if (idp->di_flags & DT_IDFLG_TLS) {
>> +		dt_ident_t	*idp = dt_dlib_get_func(dtp, "dt_tlskey");
>> +		uint_t		varid = idp->di_id - DIF_VAR_OTHER_UBASE;
>> +
>> +		assert(idp != NULL);
>> +
>> +		if (dt_regset_xalloc_args(drp) == -1)
>> +			longjmp(yypcb->pcb_jmpbuf, EDT_NOREG);
>> +
>> +		emit(dlp,  BPF_MOV_IMM(BPF_REG_1, varid));
>> +		dt_regset_xalloc(drp, BPF_REG_0);
>> +		emite(dlp, BPF_CALL_FUNC(idp->di_id), idp);
>> +		dt_regset_free_args(drp);
>> +		emit(dlp,  BPF_STORE(BPF_DW, treg, 0, BPF_REG_0));
>> +		dt_regset_free(drp, BPF_REG_0);
>> +	} else
>> +		emit(dlp,  BPF_STORE_IMM(BPF_DW, treg, 0, 0));
>> +
>> +	if (tuplesize > dtp->dt_maxtuplesize)
>> +		dtp->dt_maxtuplesize = tuplesize;
And here: /* Reset the tuple register */
>> +	emit(dlp, BPF_LOAD(BPF_DW, treg, BPF_REG_FP, DT_STK_DCTX));
>> +	emit(dlp, BPF_LOAD(BPF_DW, treg, treg, DCTX_MEM));
>> +	emit(dlp, BPF_ALU64_IMM(BPF_ADD, treg, DMEM_TUPLE(dtp)));
>> +
>> +	args->dn_reg = treg;
>> +
>> +	TRACE_REGSET("      arglist: End  ");
>> +}
>> +
>>   /*
>>    * dnp = node of the assignment
>>    *   dn_left = identifier node for the destination (idp = identifier)
>> @@ -3174,75 +3350,59 @@ dt_cg_asgn_op(dt_node_t *dnp, dt_irlist_t *dlp, dt_regset_t *drp)
>>   static void
>>   dt_cg_assoc_op(dt_node_t *dnp, dt_irlist_t *dlp, dt_regset_t *drp)
>>   {
>> -	ssize_t base;
>> +	dt_ident_t	*idp = dt_dlib_get_func(yypcb->pcb_hdl, "dt_get_assoc");
>> +	uint_t		varid;
>>   
>> +	TRACE_REGSET("    assoc_op: Begin");
>> +
>> +	assert(idp != NULL);
>>   	assert(dnp->dn_kind == DT_NODE_VAR);
>>   	assert(!(dnp->dn_ident->di_flags & DT_IDFLG_LOCAL));
>>   	assert(dnp->dn_args != NULL);
>>   
>> +	dnp->dn_ident->di_flags |= DT_IDFLG_DIFR;
>> +

Adding just a few comments can clarify what's going on here.  E.g. here 
can add
/* Build tuple */

>>   	dt_cg_arglist(dnp->dn_ident, dnp->dn_args, dlp, drp);

Add /* Get pointer to dyn var */ (or something like that).

>>   
>>   	if ((dnp->dn_reg = dt_regset_alloc(drp)) == -1)
>>   		longjmp(yypcb->pcb_jmpbuf, EDT_NOREG);
>> +	if (dt_regset_xalloc_args(drp) == -1)
>> +		longjmp(yypcb->pcb_jmpbuf, EDT_NOREG);
>>   
>> -	if (dnp->dn_ident->di_flags & DT_IDFLG_TLS)
>> -		base = 0x2000;
>> -	else
>> -		base = 0x3000;
>> +	varid = dnp->dn_ident->di_id - DIF_VAR_OTHER_UBASE;
>>   
>> -	dnp->dn_ident->di_flags |= DT_IDFLG_DIFR;
>> -	emit(dlp, BPF_LOAD(BPF_DW, dnp->dn_reg, BPF_REG_FP, base + dnp->dn_ident->di_id));
>> +	emit(dlp,  BPF_MOV_IMM(BPF_REG_1, varid));
>> +	emit(dlp,  BPF_MOV_REG(BPF_REG_2, dnp->dn_args->dn_reg));
>> +	dt_regset_free(drp, dnp->dn_args->dn_reg);
>> +	emit(dlp,  BPF_MOV_IMM(BPF_REG_3, 0));
>> +	emit(dlp,  BPF_MOV_IMM(BPF_REG_4, 0));
>> +	dt_regset_xalloc(drp, BPF_REG_0);
>> +	emite(dlp, BPF_CALL_FUNC(idp->di_id), idp);
>> +	dt_regset_free_args(drp);

Add /* Read data from the returned pointer */ (or something like that).

>> -	/*
>> -	 * If the associative array is a pass-by-reference type, then we are
>> -	 * loading its value as a pointer to either load or store through it.
>> -	 * The array element in question may not have been faulted in yet, in
>> -	 * which case DIF_OP_LD*AA will return zero.  We append an epilogue
>> -	 * of instructions similar to the following:
>> -	 *
>> -	 *	  ld?aa	 id, %r1	! base ld?aa instruction above
>> -	 *	  tst	 %r1		! start of epilogue
>> -	 *   +--- bne	 label
>> -	 *   |    setx	 size, %r1
>> -	 *   |    allocs %r1, %r1
>> -	 *   |    st?aa	 id, %r1
>> -	 *   |    ld?aa	 id, %r1
>> -	 *   v
>> -	 * label: < rest of code >
>> -	 *
>> -	 * The idea is that we allocs a zero-filled chunk of scratch space and
>> -	 * do a DIF_OP_ST*AA to fault in and initialize the array element, and
>> -	 * then reload it to get the faulted-in address of the new variable
>> -	 * storage.  This isn't cheap, but pass-by-ref associative array values
>> -	 * are (thus far) uncommon and the allocs cost only occurs once.  If
>> -	 * this path becomes important to DTrace users, we can improve things
>> -	 * by adding a new DIF opcode to fault in associative array elements.
>> -	 */
>>   	if (dnp->dn_flags & DT_NF_REF) {
>> -#ifdef FIXME
>> -		uint_t stvop = op == DIF_OP_LDTAA ? DIF_OP_STTAA : DIF_OP_STGAA;
>> -		uint_t label = dt_irlist_label(dlp);
>> -
>> -		emit(dlp, BPF_BRANCH_IMM(BPF_JNE, dnp->dn_reg, 0, label));
>> -
>> -		dt_cg_setx(dlp, dnp->dn_reg, dt_node_type_size(dnp));
>> -		instr = DIF_INSTR_ALLOCS(dnp->dn_reg, dnp->dn_reg);
>> -		dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
>> -
>> -		dnp->dn_ident->di_flags |= DT_IDFLG_DIFW;
>> -		instr = DIF_INSTR_STV(stvop, dnp->dn_ident->di_id, dnp->dn_reg);
>> -		dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
>> -
>> -		instr = DIF_INSTR_LDV(op, dnp->dn_ident->di_id, dnp->dn_reg);
>> -		dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
>> +		emit(dlp,  BPF_MOV_REG(dnp->dn_reg, BPF_REG_0));
>> +		dt_regset_free(drp, BPF_REG_0);
>> +	} else {
>> +		size_t	size = dt_node_type_size(dnp);
>> +		uint_t	lbl_notnull = dt_irlist_label(dlp);
>> +		uint_t	lbl_done = dt_irlist_label(dlp);
>> +
>> +		assert(size > 0 && size <= 8 &&
>> +		       (size & (size - 1)) == 0);
>> +
>> +		emit(dlp,  BPF_BRANCH_IMM(BPF_JNE, BPF_REG_0, 0, lbl_notnull));
>> +		emit(dlp,  BPF_MOV_IMM(dnp->dn_reg, 0));
>> +		emit(dlp,  BPF_JUMP(lbl_done));
>> +		emitl(dlp, lbl_notnull,
>> +			   BPF_LOAD(ldstw[size], dnp->dn_reg, BPF_REG_0, 0));
>> +		dt_regset_free(drp, BPF_REG_0);
>>   
>> -		emitl(dlp, label,
>> +		emitl(dlp, lbl_done,
>>   			   BPF_NOP());
>> -#else
>> -		xyerror(D_UNKNOWN, "internal error -- no support for "
>> -			"associative arrays yet\n");
>> -#endif
>>   	}
Alternatively, pull the common reg_free(%r0) out of both if branches so that the one 
reg_alloc(%r0) is matched by one reg_free(%r0).
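I.e. (sketch):

	if (dnp->dn_flags & DT_NF_REF) {
		emit(dlp,  BPF_MOV_REG(dnp->dn_reg, BPF_REG_0));
	} else {
		...
		emitl(dlp, lbl_done,
			   BPF_NOP());
	}
	dt_regset_free(drp, BPF_REG_0);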
>> +
>> +	TRACE_REGSET("    assoc_op: End  ");
>>   }
>>   
>>   static void
>> diff --git a/libdtrace/dt_dctx.h b/libdtrace/dt_dctx.h
>> index d45720d7..dc2f8e58 100644
>> --- a/libdtrace/dt_dctx.h
>> +++ b/libdtrace/dt_dctx.h
>> @@ -72,9 +72,13 @@ typedef struct dt_dctx {
>>    *                       +----------------+----------------+
>>    *                  0 -> | Stack          :     tstring    | \
>>    *                       |   trace     (shared)   storage  |  |
>> - *                       |     storage    :                |   > DMEM_SIZE
>> + *                       |     storage    :                |  |
>>    *                       +----------------+----------------+  |
>> - *        DMEM_STRTOK -> |     strtok() internal state     | /
>> + *        DMEM_STRTOK -> |     strtok() internal state     |   > DMEM_SIZE
>> + *                       +---------------------------------+  |
>> + *        DMEM_TUPLE  -> |       tuple assembly area       |  |
>> + *                       +---------------------------------+  |
>> + *   DMEM_TUPLE_DFLT  -> |       default empty tuple       | /
>>    *                       +---------------------------------+
>>    */
>>   
>> @@ -88,17 +92,23 @@ typedef struct dt_dctx {
>>   		 P2ROUNDUP((dtp)->dt_options[DTRACEOPT_STRSIZE] + 1, 8))
>>   #define DMEM_STRTOK_SZ(dtp) \
>>   		(sizeof(uint64_t) + (dtp)->dt_options[DTRACEOPT_STRSIZE] + 1)
>> +#define DMEM_TUPLE_SZ(dtp) \
>> +		((dtp)->dt_maxtuplesize)
>>   
>>   /*
>> - * Macro to determine the offset from mem to the strtok internal state.
>> + * Macros to determine the offset of the components of dctx->mem.
>>    */
>>   #define DMEM_STRTOK(dtp) \
>>   		MAX(DMEM_STACK_SZ(dtp), DMEM_TSTR_SZ(dtp))
>> +#define DMEM_TUPLE(dtp) \
>> +		(DMEM_STRTOK(dtp) + DMEM_STRTOK_SZ(dtp))
>> +#define DMEM_TUPLE_DFLT(dtp) \
>> +		(DMEM_TUPLE(dtp) + DMEM_TUPLE_SZ(dtp))
>>   
>>   /*
>>    * Macro to determine the total size of the mem area.
>>    */
>> -#define DMEM_SIZE(dtp)	(DMEM_STRTOK(dtp) + DMEM_STRTOK_SZ(dtp))
>> +#define DMEM_SIZE(dtp)	(DMEM_TUPLE_DFLT(dtp) + DMEM_TUPLE_SZ(dtp))
I'm kind of uncomfortable about defining
         #define DMEM_TUPLE_SZ(dtp)
         #define DMEM_TUPLE_DFLT(dtp)
         #define DMEM_SIZE(dtp)
The problem is that they depend on dtp->dt_maxtuplesize, which changes 
during code generation.  So they should (if defined) be used very 
sparingly.  And they are!  But that raises the question of whether they 
should be defined at all.  This is not currently an issue, but I could 
imagine these definitions inviting future mistakes.  Or, maybe include 
some warning comment by their definitions that they change during code 
generation.

But DMEM_TUPLE_DFLT really does not seem to be needed.  It's not really 
used.  Could just say that the tuple area is twice the size it needs to be.
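
Roughly (keeping the total size the same, and assuming the clearing
code keeps copying TUPSZ bytes from offset TUPSZ into the assembly
area):

	/*
	 * WARNING: dt_maxtuplesize changes during code generation, so
	 * these macros must not be used until codegen has completed.
	 *
	 * The tuple area is twice the maximum tuple size: the second
	 * half is never written and provides the 0s used to re-zero
	 * the assembly area.
	 */
	#define DMEM_TUPLE_SZ(dtp)	(2 * (dtp)->dt_maxtuplesize)

	#define DMEM_SIZE(dtp)	(DMEM_TUPLE(dtp) + DMEM_TUPLE_SZ(dtp))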

>> diff --git a/test/unittest/assocs/tst.init-str.d b/test/unittest/assocs/tst.init-str.d
>> new file mode 100644
>> index 00000000..2894fd68
>> --- /dev/null
>> +++ b/test/unittest/assocs/tst.init-str.d
>> @@ -0,0 +1,29 @@
>> +/*
>> + * Oracle Linux DTrace.
>> + * Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
>> + * Licensed under the Universal Permissive License v 1.0 as shown at
>> + * http://oss.oracle.com/licenses/upl.
>> + */
>> +
>> +/*
>> + * ASSERTION: Default value of unassigned elements in asssociative string
>> + *	      arrays is 0.  This will cause an 'invalid address 0x0' fault
>> + *	      upon dereferencing.
>> + *
>> + * SECTION: Variables/Associative Arrays
>> + */
>> +
>> +#pragma D option quiet
>> +
>> +string s[int];
>> +
>> +BEGIN
>> +{
>> +	trace(s[1]);
>> +	exit(1);
>> +}
>> +
>> +ERROR
>> +{
>> +	exit(arg4 != 1 || arg5 != 0);
>> +}
I follow the rationale, but that's not how things behave on Solaris or 
legacy DTrace.  Intentional change?
>> diff --git a/test/unittest/assocs/tst.store_zero_deletes.d b/test/unittest/assocs/tst.store_zero_deletes.d
>> new file mode 100644
>> index 00000000..2db9c6c0
>> --- /dev/null
>> +++ b/test/unittest/assocs/tst.store_zero_deletes.d
>> @@ -0,0 +1,35 @@
>> +/*
>> + * Oracle Linux DTrace.
>> + * Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
>> + * Licensed under the Universal Permissive License v 1.0 as shown at
>> + * http://oss.oracle.com/licenses/upl.
>> + */
>> +
>> +/*
>> + * ASSERTION: Storing 0 into an associate array element removes it from
>> + *	      storage, making room for another element.
>> + *
>> + * SECTION: Variables/Thread-Local Variables
>> + */
>> +
>> +#pragma D option quiet
>> +#pragma D option dynvarsize=15
>> +
>> +BEGIN
>> +{
>> +	a[1] = 1;
>> +	a[2] = 2;
>> +	a[3] = 3;
>> +	a[1] = 0;
>> +	a[4] = 4;
>> +	trace(a[1]);
>> +	trace(a[2]);
>> +	trace(a[3]);
>> +	trace(a[4]);
>> +	exit(0);
>> +}
>> +
>> +ERROR
>> +{
>> +	exit(1);
>> +}
FWIW this works very differently from the way it works on Solaris or 
legacy DTrace, but I suspect we do not care here.


