[DTrace-devel] [PATCH 3/5] libproc: debugging improvements
Kris Van Hees
kris.van.hees at oracle.com
Sat Dec 7 04:38:34 UTC 2024
On Tue, Dec 03, 2024 at 11:36:08AM +0000, Nick Alcock wrote:
> Attempting to track down the intermittent failures in
> test/unittest/proc/tst.multitrace.sh is rendered difficult by the fact that
> multiple dtraces are running at once, tracing multiple processes, but the
> debugging messages emitted by libproc do not provide either the TID of the
> tracing thread or the PID of the process being traced in too many cases.
>
> Worse yet, if you do turn debugging on, both dtraces emit debugging output
> simultaneously to the same stderr stream. The interleaving is bad enough,
> but very often this causes lines to be emitted to stderr that do not start
> with the standard libproc DEBUG time-since-epoch: string, because it got
> interspersed into the previous line. runtest then partitions that partial
> line off into a separate part of the log, rendering everything entirely
> incomprehensible.
>
> So emit more PID-related info, including the TID of the process-control
> thread; and arrange for tst.multitrace.sh to capture DTrace debugging output
> itself and dump it as two separated pieces: if a dtrace exits nonzero, note
> which one exited as well as dumping its debug output.
>
> Also, when this-should-never-happen conditions like a Pwait() returning
> -ECHILD happen (usually indicating that we were not the victim's tracer
> after all) by dumping the entire /proc/$pid/status of that process to
> the debug stream, so we can tell what the tracer *was* and whether the
> process was even stopped. (No impact at all if this condition never
> happens, which it never should, of course, or if debugging is off.)
>
> Signed-off-by: Nick Alcock <nick.alcock at oracle.com>
Reviewed-by: Kris Van Hees <kris.van.hees at oracle.com>
> ---
> libdtrace/dt_proc.c | 6 ++-
> libproc/Pcontrol.c | 57 +++++++++++++++++++++++++---
> libproc/rtld_db.c | 2 +-
> test/unittest/usdt/tst.multitrace.sh | 17 ++++++++-
> 4 files changed, 73 insertions(+), 9 deletions(-)
>
> diff --git a/libdtrace/dt_proc.c b/libdtrace/dt_proc.c
> index 41b148e807f84..6ad57cc5f12ef 100644
> --- a/libdtrace/dt_proc.c
> +++ b/libdtrace/dt_proc.c
> @@ -870,6 +870,9 @@ dt_proc_control(void *arg)
> int err;
> jmp_buf exec_jmp;
>
> + dt_dprintf("%i: process control thread %i starting.\n", dpr->dpr_pid,
> + gettid());
> +
> /*
> * Set up global libproc hooks that must be active before any processes
> * are grabbed or created.
> @@ -1421,7 +1424,8 @@ dt_proc_control_cleanup(void *arg)
> * out by ptrace() wrappers above us in the call stack, since the whole
> * thread is going away.
> */
> - dt_dprintf("%i: process control thread going away.\n", dpr->dpr_pid);
> + dt_dprintf("%i: process control thread %i going away.\n", dpr->dpr_pid,
> + gettid());
> if(dpr->dpr_lock_count_ctrl == 0 ||
> !pthread_equal(dpr->dpr_lock_holder, pthread_self()))
> dt_proc_lock(dpr);
> diff --git a/libproc/Pcontrol.c b/libproc/Pcontrol.c
> index c96a410b70b3f..a32a362eaa1fb 100644
> --- a/libproc/Pcontrol.c
> +++ b/libproc/Pcontrol.c
> @@ -66,6 +66,7 @@ static int add_bkpt(struct ps_prochandle *P, uintptr_t addr,
> void *data);
> static void delete_bkpt_handler(struct bkpt *bkpt);
> static jmp_buf **single_thread_unwinder_pad(struct ps_prochandle *unused);
> +static void Pdump_proc_status(pid_t pid);
>
> static ptrace_lock_hook_fun *ptrace_lock_hook;
> static waitpid_lock_hook_fun *waitpid_lock_hook;
> @@ -332,10 +333,12 @@ Pgrab(pid_t pid, int noninvasiveness, int already_ptraced, void *wrap_arg,
>
> if (*perr < 0) {
> if (noninvasiveness < 1) {
> + _dprintf("%i: Pgrab(): not grabbed.\n", P->pid);
> Pfree_internal(P);
> return NULL;
> }
> close(P->memfd);
> + _dprintf("%i: Pgrab(): grabbed noninvasively.\n", P->pid);
> noninvasiveness = 2;
> }
> } else {
> @@ -719,7 +722,9 @@ Pwait_internal(struct ps_prochandle *P, boolean_t block, int *return_early)
> return 0;
>
> if (errno == ECHILD) {
> + _dprintf("%i: Pwait: got ECHILD from waitpid(), state %i, trace count %i, halted %i\n", P->pid, P->state, P->ptrace_count, P->ptrace_halted);
> P->state = PS_DEAD;
> + Pdump_proc_status(P->pid);
> return 0;
> }
>
> @@ -1262,7 +1267,7 @@ Ppush_state(struct ps_prochandle *P, int state)
>
> s->state = state;
> dt_list_prepend(&P->ptrace_states, s);
> - _dprintf("Ppush_state(): ptrace_count %i, state %i\n", P->ptrace_count, s->state);
> + _dprintf("%i: Ppush_state(): ptrace_count %i, state %i\n", P->pid, P->ptrace_count, s->state);
>
> return s;
> }
> @@ -1278,7 +1283,7 @@ Ppop_state(struct ps_prochandle *P)
>
> s = dt_list_next(&P->ptrace_states);
> dt_list_delete(&P->ptrace_states, s);
> - _dprintf("Ppop_state(): ptrace_count %i, state %i\n", P->ptrace_count+1, s->state);
> + _dprintf("%i: Ppop_state(): ptrace_count %i, state %i\n", P->pid, P->ptrace_count+1, s->state);
> state = s->state;
> free(s);
> return state;
> @@ -1422,6 +1427,9 @@ err_nostate:
> if (P->ptrace_count == 0 && ptrace_lock_hook)
> ptrace_lock_hook(P, P->wrap_arg, 0);
>
> + _dprintf("Ptrace(): error return (possibly other tracer), trace count now %i: %s\n",
> + P->ptrace_count, strerror(errno));
> +
> if (err != -ECHILD)
> return err;
> else
> @@ -1511,8 +1519,11 @@ Puntrace(struct ps_prochandle *P, int leave_stopped)
> P->state = PS_RUN;
> P->ptraced = FALSE;
> if ((wrapped_ptrace(P, PTRACE_DETACH, P->pid, 0, 0) < 0) &&
> - (errno == ESRCH))
> + (errno == ESRCH)) {
> + _dprintf("%i: Punbkpt(): -ESRCH, process is dead.\n",
> + P->pid);
> P->state = PS_DEAD;
> + }
> P->ptrace_halted = FALSE;
> P->info_valid = 0;
> }
> @@ -1812,7 +1823,7 @@ Punbkpt(struct ps_prochandle *P, uintptr_t addr)
> if (Preset_bkpt_ip(P, P->tracing_bkpt) < 0)
> switch (errno) {
> case ESRCH:
> - _dprintf("%i: -ESRCH, process is dead.\n",
> + _dprintf("%i: Punbkpt(): -ESRCH, process is dead.\n",
> P->pid);
> P->state = PS_DEAD;
> return;
> @@ -1870,7 +1881,8 @@ Punbkpt_child_poke(struct ps_prochandle *P, pid_t pid, bkpt_t *bkpt)
> bkpt->bkpt_addr, bkpt->orig_insn) < 0)
> switch (errno) {
> case ESRCH:
> - _dprintf("%i: -ESRCH, process is dead.\n", child_pid);
> + _dprintf("%i: Punbkpt_child_poke(): -ESRCH, process is dead.\n",
> + child_pid);
> if (!pid)
> P->state = PS_DEAD;
> return;
> @@ -2181,8 +2193,11 @@ Pbkpt_continue(struct ps_prochandle *P)
> if (wrapped_ptrace(P, PTRACE_CONT, P->pid, 0, 0) < 0) {
> int err = errno;
> if (err == ESRCH) {
> - if ((kill(P->pid, 0) < 0) && errno == ESRCH)
> + if ((kill(P->pid, 0) < 0) && errno == ESRCH) {
> + _dprintf("%i: Pbpkt_continue(): Got ESRCH, process is dead.\n",
> + P->pid);
> P->state = PS_DEAD;
> + }
> }
> /*
> * Since we must have an outstanding Ptrace() anyway,
> @@ -2760,6 +2775,36 @@ Phastty(pid_t pid)
> return tty != 0;
> }
>
> +/*
> + * Dump /proc/$pid/status into the debug log.
> + */
> +static void
> +Pdump_proc_status(pid_t pid)
> +{
> + char status[PATH_MAX];
> + FILE *fp;
> + char *line = NULL;
> + size_t len;
> +
> + snprintf(status, sizeof(status), "/proc/%i/status", pid);
> +
> + if ((fp = fopen(status, "r")) == NULL) {
> + _dprintf("Process is dead.\n");
> + return;
> + }
> +
> + while (getline(&line, &len, fp) >= 0) {
> + if (strlen(line) > 0) {
> + if (line[strlen(line) - 1] == '\n')
> + line[strlen(line)-1] = '\0';
> + _dprintf("%li: %s\n", (long)pid, line);
> + }
> + }
> + free(line);
> + fclose(fp);
> + return;
> +}
> +
> /*
> * Get a specific field out of /proc/$pid/status and return the portion after
> * the colon in a new dynamically allocated string, or NULL if no field matches.
> diff --git a/libproc/rtld_db.c b/libproc/rtld_db.c
> index b6d33ce38425a..9eb63138326dc 100644
> --- a/libproc/rtld_db.c
> +++ b/libproc/rtld_db.c
> @@ -1771,7 +1771,7 @@ rd_loadobj_iter(rd_agent_t *rd, rl_iter_f *fun, void *state)
> if (rd->P->state == PS_DEAD) {
> *jmp_pad = old_exec_jmp;
>
> - _dprintf("%i: link map iteration failed: process is dead..\n",
> + _dprintf("%i: link map iteration failed: process is dead.\n",
> rd->P->pid);
> return RD_ERR;
> }
> diff --git a/test/unittest/usdt/tst.multitrace.sh b/test/unittest/usdt/tst.multitrace.sh
> index 4e53dbb992508..262c782821ef0 100755
> --- a/test/unittest/usdt/tst.multitrace.sh
> +++ b/test/unittest/usdt/tst.multitrace.sh
> @@ -73,7 +73,7 @@ if [ $? -ne 0 ]; then
> fi
>
> script() {
> - exec $dtrace $dt_flags -qws /dev/stdin $1 $2 $3 <<'EOF'
> + exec $dtrace $dt_flags -qws /dev/stdin $1 $2 $3 2> debug.$3 <<'EOF'
> int fired[pid_t];
> int exited[pid_t];
>
> @@ -142,13 +142,28 @@ DONE=$!
> script $ONE $TWO 2 &
> DTWO=$!
>
> +dump_debug() {
> + if [[ -n $DTRACE_DEBUG ]]; then
> + echo "runtest DEBUG $(date +%s): Debug output of first dtrace so far, PID $DONE" >&2
> + cat debug.1 >&2
> +
> + echo "runtest DEBUG $(date +%s): Debug output of second dtrace so far, PID $DTWO" >&2
> + cat debug.2 >&2
> + fi
> +}
> +
> if ! wait $DONE; then
> + dump_debug
> + echo "first dtrace exited nonzero at $(date +%s)" >&2
> exit 1
> fi
>
> if ! wait $DTWO; then
> + dump_debug
> + echo "second dtrace exited nonzero at $(date +%s)" >&2
> exit 1
> fi
> +dump_debug
>
> wait $ONE $TWO
>
> --
> 2.47.1.279.g84c5f4e78e
>
More information about the DTrace-devel
mailing list