[DTrace-devel] [PATCH 2/2] Implement the pcap action

Tue Oct 10 03:54:32 UTC 2023

I am not familiar with pcap, but have some comments below.

First of all, should there be some err.* tests to check too few args, 
too many args, etc., as we have for some other actions? Also...

On 9/23/23 15:14, Kris Van Hees via DTrace-devel wrote:
> diff --git a/libdtrace/dt_cg.c b/libdtrace/dt_cg.c
> @@ -1882,8 +1905,91 @@ dt_cg_act_panic(dt_pcb_t *pcb, dt_node_t *dnp, dtrace_actkind_t kind)
>   static void
>   dt_cg_act_pcap(dt_pcb_t *pcb, dt_node_t *dnp, dtrace_actkind_t kind)
>   {
> -	dnerror(dnp, D_UNKNOWN, "pcap() is not implemented (yet)\n");
> -	/* FIXME: Needs implementation */
> +	dtrace_hdl_t	*dtp = pcb->pcb_hdl;
> +	dt_regset_t	*drp = pcb->pcb_regs;
> +	dt_irlist_t	*dlp = &pcb->pcb_ir;
> +	dt_node_t	*addr = dnp->dn_args;
> +	dt_node_t	*proto = addr->dn_list;
> +	uint_t		time_off, size_off, data_off, off;
> +	uint_t		lbl_lenok = dt_irlist_label(dlp);
> +	uint64_t	pcapsz = dtp->dt_options[DTRACEOPT_PCAPSIZE];
> +	int		lenreg;
> +
> +	TRACE_REGSET("pcap(): Begin ");
> +
> +	/*
> +	 * Create records for timestamp, size, protocol id, and packet data.
> +	 * The protocol id is stored immediately from the passed in argument.
> +	 */
> +	time_off = dt_rec_add(dtp, dt_cg_fill_gap, DTRACEACT_PCAP,
> +			      sizeof(uint64_t), sizeof(uint64_t), NULL, 0);
> +	size_off = dt_rec_add(dtp, dt_cg_fill_gap, DTRACEACT_PCAP,
> +			      sizeof(uint32_t), sizeof(uint32_t), NULL, 0);
> +	dt_cg_store_val(pcb, proto, DTRACEACT_PCAP, NULL, 0);

Might the evaluation of addr and/or proto have a side effect that 
impacts the other?  If so, we have to evaluate them in the correct 
order.  So, I would think we cannot evaluate proto yet. Presumably, the 
test suite could have tests that check arg evaluation order for various 
subroutines and actions.

> +	data_off = dt_rec_add(dtp, dt_cg_fill_gap, DTRACEACT_PCAP,
> +			      dtp->dt_options[DTRACEOPT_PCAPSIZE], 1, NULL, 0);
> +
> +	/* Store timestamp (ns since boot time). */
> +	if (dt_regset_xalloc_args(drp) == -1)
> +		longjmp(yypcb->pcb_jmpbuf, EDT_NOREG);
> +	dt_regset_xalloc(drp, BPF_REG_0);
> +	emit(dlp,  BPF_CALL_HELPER(BPF_FUNC_ktime_get_ns));
> +	dt_regset_free_args(drp);
> +	emit(dlp,  BPF_STORE(BPF_DW, BPF_REG_9, time_off, BPF_REG_0));
> +	dt_regset_free(drp, BPF_REG_0);
> +
> +	/*
> +	 * Determine and store the packet data size.
> +	 * The size of the data to be copied (and reported as size of the
> +	 * packet capture) is the lesser of the data size (skb->len) and the
> +	 * capture size.
> +	 */
> +	dt_cg_node(addr, dlp, drp);
> +	off = dt_cg_ctf_offsetof("struct sk_buff", "len", NULL);
> +
> +	if (dt_regset_xalloc_args(drp) == -1)
> +		longjmp(yypcb->pcb_jmpbuf, EDT_NOREG);
> +	emit(dlp,  BPF_MOV_REG(BPF_REG_3, addr->dn_reg));
> +	emit(dlp,  BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, off));
> +	emit(dlp,  BPF_MOV_IMM(BPF_REG_2, sizeof(uint32_t)));
> +	emit(dlp,  BPF_LOAD(BPF_DW, BPF_REG_1, BPF_REG_FP, DT_STK_SP));
> +	dt_regset_xalloc(drp, BPF_REG_0);
> +	emit(dlp,  BPF_CALL_HELPER(dtp->dt_bpfhelper[BPF_FUNC_probe_read_kernel]));
> +	dt_regset_free_args(drp);
> +	dt_regset_free(drp, BPF_REG_0);
> +
> +	if ((lenreg = dt_regset_alloc(drp)) == -1)
> +		longjmp(yypcb->pcb_jmpbuf, EDT_NOREG);
> +
> +	emit(dlp,  BPF_LOAD(BPF_DW, lenreg, BPF_REG_FP, DT_STK_SP));
> +	emit(dlp,  BPF_LOAD(BPF_W, lenreg, lenreg, 0));

That's "a lot" of instructions simply to read a word from an unverified 
address.  Not too many, but such a read is relatively common.  So, we 
should have a simple way of generating these instructions.  How about 
dt_cg_load_scalar()?  Bottom line is that we should look at the various 
sites that read words in this manner and put the associated code 
generation under a function that can be called from code like this.  
Code reuse.

> +	emit(dlp,  BPF_BRANCH_IMM(BPF_JLE, lenreg, pcapsz, lbl_lenok));
> +	emit(dlp,  BPF_MOV_IMM(lenreg, pcapsz));
> +	emitl(dlp, lbl_lenok,
> +		   BPF_STORE(BPF_W, BPF_REG_9, size_off, lenreg));
> +
> +	/* Copy the packet data to the output buffer. */
> +	off = dt_cg_ctf_offsetof("struct sk_buff", "data", NULL);
> +
> +	if (dt_regset_xalloc_args(drp) == -1)
> +		longjmp(yypcb->pcb_jmpbuf, EDT_NOREG);
> +	emit(dlp,  BPF_MOV_REG(BPF_REG_3, addr->dn_reg));
> +	dt_regset_free(drp, addr->dn_reg);
> +	emit(dlp,  BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, off));
> +	emit(dlp,  BPF_MOV_IMM(BPF_REG_2, sizeof(uint64_t)));
> +	emit(dlp,  BPF_LOAD(BPF_DW, BPF_REG_1, BPF_REG_FP, DT_STK_SP));
> +	dt_regset_xalloc(drp, BPF_REG_0);
> +	emit(dlp,  BPF_CALL_HELPER(dtp->dt_bpfhelper[BPF_FUNC_probe_read_kernel]));
> +	emit(dlp,  BPF_LOAD(BPF_DW, BPF_REG_3, BPF_REG_FP, DT_STK_SP));
> +	emit(dlp,  BPF_LOAD(BPF_DW, BPF_REG_3, BPF_REG_3, 0));

For example, here already is another instance of such a load. Here is 
already an opportunity for code reuse.

> +	emit(dlp,  BPF_MOV_REG(BPF_REG_2, lenreg));

Should lenreg be freed now?  Otherwise, we have a register leak? Come to 
think of it, there could be a test to check for such reg leaks.  Like 
tst.pcap.file.sh but with the pcap action showing up twice, three times, 
four times, many times.  Different problems would be exposed.  E.g., ...

What if lenreg were %r1, %r2, %r4, or %r5?  The BPF verifier would 
complain since the reg was just wiped clean in the last BPF helper 
function call.  The fix is to be disciplined about filling/spilling regs 
between BPF helper calls.

What if lenreg were %r3?  The BPF verifier would not complain in this 
case... but for a bad reason:  we just finished overwriting that reg.  
So the verifier would be fine, but the value would be bad.  In this 
case, we have to fill %r2 first.  Then we can fill %r1 and %r3.

I know you've said that what we really need is robust reg management.  I 
suppose so, but we've had the current setup for a long time and so we 
should live with it for now.

> +	emit(dlp,  BPF_MOV_REG(BPF_REG_1, BPF_REG_9));
> +	emit(dlp,  BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, data_off));
> +	emit(dlp,  BPF_CALL_HELPER(dtp->dt_bpfhelper[BPF_FUNC_probe_read_kernel]));
> +	dt_regset_free_args(drp);
> +	dt_regset_free(drp, BPF_REG_0);
> +
> +	TRACE_REGSET("pcap(): End   ");
>   }
>   
> diff --git a/libdtrace/dt_consume.c b/libdtrace/dt_consume.c
> @@ -1427,28 +1427,26 @@ dt_print_mod(dtrace_hdl_t *dtp, FILE *fp, const char *format, caddr_t addr)
>   }
>   
>   int
> -dt_print_pcap(dtrace_hdl_t *dtp, FILE *fp, dtrace_recdesc_t *rec,
> +dt_print_pcap(dtrace_hdl_t *dtp, FILE *fp, dtrace_recdesc_t *rec, uint_t nrecs,
>   	      const caddr_t buf)
>   {
> -	caddr_t	addr;
> -	uint64_t time, proto, pktlen, maxlen;
> -	const char *filename;
> +	caddr_t			data;
> +	uint64_t		time, proto, pktlen;
> +	uint64_t		maxlen = dtp->dt_options[DTRACEOPT_PCAPSIZE];
> +	const char		*filename;
>   
> -	addr = (caddr_t)buf + rec->dtrd_offset;
> +	if (nrecs < 4)
> +		return dt_set_errno(dtp, EDT_PCAP);
>   
>   	if (dt_read_scalar(buf, rec, &time) < 0 ||
> -	    dt_read_scalar(buf + sizeof(uint64_t), rec, &pktlen) < 0)
> +	    dt_read_scalar(buf, rec + 1, &pktlen) < 0)
>   		return dt_set_errno(dtp, EDT_PCAP);
>   
> -	if (pktlen == 0) {
> -		/*
> -		 * skb must have been NULL, skip without capturing.
> -		 */
> +	/* If pktlen is 0, the skb must have been ULL, so don't capture it. */

s/ULL/NULL/

> +	if (pktlen == 0)
>   		return 0;
> -	}
> -	maxlen = DT_PCAPSIZE(dtp->dt_options[DTRACEOPT_PCAPSIZE]);

So we should remove the DT_PCAPSIZE definition in libdtrace/dt_pcap.h.

> -	if (dt_read_scalar(buf, rec + 1, &proto) < 0)
> +	if (dt_read_scalar(buf, rec + 2, &proto) < 0)
>   		return dt_set_errno(dtp, EDT_PCAP);