[DTrace-devel] [PATCH v2] Implement the pcap action
Kris Van Hees
kris.van.hees at oracle.com
Tue Nov 7 21:28:22 UTC 2023
On Tue, Nov 07, 2023 at 04:20:58PM -0500, Eugene Loh via DTrace-devel wrote:
> I assume Alan's comments were integrated. Meanwhile...
> > diff --git a/libdtrace/dt_cg.c b/libdtrace/dt_cg.c
> > @@ -1882,8 +1905,112 @@ dt_cg_act_panic(dt_pcb_t *pcb, dt_node_t *dnp, dtrace_actkind_t kind)
> > static void
> > dt_cg_act_pcap(dt_pcb_t *pcb, dt_node_t *dnp, dtrace_actkind_t kind)
> > {
> > - dnerror(dnp, D_UNKNOWN, "pcap() is not implemented (yet)\n");
> > - /* FIXME: Needs implementation */
> > + dtrace_hdl_t *dtp = pcb->pcb_hdl;
> > + dt_regset_t *drp = pcb->pcb_regs;
> > + dt_irlist_t *dlp = &pcb->pcb_ir;
> > + dt_node_t *addr = dnp->dn_args;
> > + dt_node_t *proto = addr->dn_list;
> > + uint_t time_off, size_off, data_off, off;
> > + uint_t lbl_lenok = dt_irlist_label(dlp);
> > + uint64_t pcapsz = dtp->dt_options[DTRACEOPT_PCAPSIZE];
> > + int lenreg;
> > +
> > + TRACE_REGSET("pcap(): Begin ");
> > +
> > + /*
> > + * Create records for timestamp, size, protocol id, and packet data.
> > + * The protocol id is stored immediately from the passed in argument.
> > + */
> > + time_off = dt_rec_add(dtp, dt_cg_fill_gap, DTRACEACT_PCAP,
> > + sizeof(uint64_t), sizeof(uint64_t), NULL, 0);
> > + size_off = dt_rec_add(dtp, dt_cg_fill_gap, DTRACEACT_PCAP,
> > + sizeof(uint32_t), sizeof(uint32_t), NULL, 0);
> > + dt_cg_store_val(pcb, proto, DTRACEACT_PCAP, NULL, 0);
> > + data_off = dt_rec_add(dtp, dt_cg_fill_gap, DTRACEACT_PCAP,
> > + dtp->dt_options[DTRACEOPT_PCAPSIZE], 1, NULL, 0);
>
> Use pcapsz in place of dtp->dt_options[DTRACEOPT_PCAPSIZE] here.
Ah yes, darn, missed that one.
> > +
> > + /* Store timestamp (ns since boot time). */
> > + if (dt_regset_xalloc_args(drp) == -1)
> > + longjmp(yypcb->pcb_jmpbuf, EDT_NOREG);
> > + dt_regset_xalloc(drp, BPF_REG_0);
> > + emit(dlp, BPF_CALL_HELPER(BPF_FUNC_ktime_get_ns));
> > + dt_regset_free_args(drp);
> > + emit(dlp, BPF_STORE(BPF_DW, BPF_REG_9, time_off, BPF_REG_0));
> > + dt_regset_free(drp, BPF_REG_0);
> > +
> > + /*
> > + * Determine and store the packet data size.
> > + * The size of the data to be copied (and reported as size of the
> > + * packet capture) is the lesser of the linear data size
> > + * (skb->len - skb->data_len) and the capture size.
> > + */
> > + dt_cg_node(addr, dlp, drp);
> > +
> > + off = dt_cg_ctf_offsetof("struct sk_buff", "len", NULL);
> > +
> > + if (dt_regset_xalloc_args(drp) == -1)
> > + longjmp(yypcb->pcb_jmpbuf, EDT_NOREG);
> > +
> > + emit(dlp, BPF_MOV_REG(BPF_REG_3, addr->dn_reg));
> > + emit(dlp, BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, off));
> > + emit(dlp, BPF_MOV_IMM(BPF_REG_2, sizeof(uint32_t)));
> > + emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_1, BPF_REG_FP, DT_STK_SP));
> > + dt_regset_xalloc(drp, BPF_REG_0);
> > + emit(dlp, BPF_CALL_HELPER(dtp->dt_bpfhelper[BPF_FUNC_probe_read_kernel]));
> > + dt_regset_free(drp, BPF_REG_0);
> > + dt_regset_free_args(drp);
>
> Shouldn't make any difference, but we normally free_args immediately after
> the function call. That makes sense since that's when the BPF verifier
> invalidates those regs. And I thought that's what was done in v1. So why
> have free(%r0) and free_args() been exchanged?
It makes no difference. I think it changed because I had to change the code
a bit to do the subtraction of two values that I need to retrieve. And so I
defaulted this last one to alloc/free in pairs (inner, outer). I tend to do
that, when I remember :)
> > +
> > + if ((lenreg = dt_regset_alloc(drp)) == -1)
> > + longjmp(yypcb->pcb_jmpbuf, EDT_NOREG);
> > +
> > + emit(dlp, BPF_LOAD(BPF_DW, lenreg, BPF_REG_FP, DT_STK_SP));
> > + emit(dlp, BPF_LOAD(BPF_W, lenreg, lenreg, 0));
> > +
> > + off = dt_cg_ctf_offsetof("struct sk_buff", "data_len", NULL);
> > +
> > + if (dt_regset_xalloc_args(drp) == -1)
> > + longjmp(yypcb->pcb_jmpbuf, EDT_NOREG);
> > +
> > + emit(dlp, BPF_MOV_REG(BPF_REG_3, addr->dn_reg));
> > + emit(dlp, BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, off));
> > + emit(dlp, BPF_MOV_IMM(BPF_REG_2, sizeof(uint32_t)));
> > + emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_1, BPF_REG_FP, DT_STK_SP));
> > + dt_regset_xalloc(drp, BPF_REG_0);
> > + emit(dlp, BPF_CALL_HELPER(dtp->dt_bpfhelper[BPF_FUNC_probe_read_kernel]));
> > + emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_0, BPF_REG_FP, DT_STK_SP));
> > + emit(dlp, BPF_LOAD(BPF_W, BPF_REG_0, BPF_REG_0, 0));
> > + dt_regset_free(drp, BPF_REG_0);
> > + dt_regset_free_args(drp);
>
> Same thing about free_args() should be right after the function call.
>
> Also, here you put some result in %r0 and then free(%r0). Typically you'd
> get lucky but the whole reason for the xalloc/free is in case %r0 is
> spilled/filled. If such a thing is really possible, then this code will be
> overwriting %r0 (filling the reg) before the %r0 result is used.
Ah yes, this is a mistake. I overlooked things. It is meant to be free'd
after the next instruction. I'll fix that.
> > +
> > + emit(dlp, BPF_ALU64_REG(BPF_SUB, lenreg, BPF_REG_0));
> > + emit(dlp, BPF_BRANCH_IMM(BPF_JLE, lenreg, pcapsz, lbl_lenok));
> > + emit(dlp, BPF_MOV_IMM(lenreg, pcapsz));
> > + emitl(dlp, lbl_lenok,
> > + BPF_STORE(BPF_W, BPF_REG_9, size_off, lenreg));
> > +
> > + /* Copy the packet data to the output buffer. */
> > + off = dt_cg_ctf_offsetof("struct sk_buff", "data", NULL);
> > +
> > + if (dt_regset_xalloc_args(drp) == -1)
> > + longjmp(yypcb->pcb_jmpbuf, EDT_NOREG);
> > + emit(dlp, BPF_MOV_REG(BPF_REG_3, addr->dn_reg));
> > + dt_regset_free(drp, addr->dn_reg);
> > + emit(dlp, BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, off));
> > + emit(dlp, BPF_MOV_IMM(BPF_REG_2, sizeof(uint64_t)));
> > + emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_1, BPF_REG_FP, DT_STK_SP));
> > + dt_regset_xalloc(drp, BPF_REG_0);
> > + emit(dlp, BPF_CALL_HELPER(dtp->dt_bpfhelper[BPF_FUNC_probe_read_kernel]));
> > + emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_3, BPF_REG_FP, DT_STK_SP));
> > + emit(dlp, BPF_LOAD(BPF_DW, BPF_REG_3, BPF_REG_3, 0));
> > + emit(dlp, BPF_MOV_REG(BPF_REG_2, lenreg));
>
> I already commented on this: it's a problem if lenreg is %r1 %r2 %r4 or
> %r5: the BPF verifier will complain since those regs were invalidated with
> the last func call. And if lenreg is %r3, the verifier will not complain,
> but the value one is getting is the value that was just written into %r3.
>
> I think the thing to do is to free_args() immediately after the function
> call, then xalloc_args(), then fill the regs starting with %r2 first.
Ugh, this was a bad day for making a v2. I fotgot to make that important
change.
> > + dt_regset_free(drp, lenreg);
> > + emit(dlp, BPF_MOV_REG(BPF_REG_1, BPF_REG_9));
> > + emit(dlp, BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, data_off));
> > + emit(dlp, BPF_CALL_HELPER(dtp->dt_bpfhelper[BPF_FUNC_probe_read_kernel]));
> > + dt_regset_free_args(drp);
> > + dt_regset_free(drp, BPF_REG_0);
> > +
> > + TRACE_REGSET("pcap(): End ");
> > }
> > diff --git a/libdtrace/dt_consume.c b/libdtrace/dt_consume.c
> > @@ -1427,28 +1427,26 @@ dt_print_mod(dtrace_hdl_t *dtp, FILE *fp, const char *format, caddr_t addr)
> > }
> > int
> > -dt_print_pcap(dtrace_hdl_t *dtp, FILE *fp, dtrace_recdesc_t *rec,
> > +dt_print_pcap(dtrace_hdl_t *dtp, FILE *fp, dtrace_recdesc_t *rec, uint_t nrecs,
> > const caddr_t buf)
> > {
> > - caddr_t addr;
> > - uint64_t time, proto, pktlen, maxlen;
> > - const char *filename;
> > + caddr_t data;
> > + uint64_t time, proto, pktlen;
> > + uint64_t maxlen = dtp->dt_options[DTRACEOPT_PCAPSIZE];
> > + const char *filename;
> > - addr = (caddr_t)buf + rec->dtrd_offset;
> > + if (nrecs < 4)
> > + return dt_set_errno(dtp, EDT_PCAP);
> > if (dt_read_scalar(buf, rec, &time) < 0 ||
> > - dt_read_scalar(buf + sizeof(uint64_t), rec, &pktlen) < 0)
> > + dt_read_scalar(buf, rec + 1, &pktlen) < 0)
> > return dt_set_errno(dtp, EDT_PCAP);
> > - if (pktlen == 0) {
> > - /*
> > - * skb must have been NULL, skip without capturing.
> > - */
> > + /* If pktlen is 0, the skb must have been ULL, so don't capture it. */
>
> ULL => NULL
Another one I forgot.
> > + if (pktlen == 0)
> > return 0;
> > - }
> > - maxlen = DT_PCAPSIZE(dtp->dt_options[DTRACEOPT_PCAPSIZE]);
>
>
> So get rid of DT_PCAPSIZE in pcap.h.
Yes.
> > - if (dt_read_scalar(buf, rec + 1, &proto) < 0)
> > + if (dt_read_scalar(buf, rec + 2, &proto) < 0)
> > return dt_set_errno(dtp, EDT_PCAP);
>
v3 will be coming by tomorrow.
More information about the DTrace-devel
mailing list