[DTrace-devel] [PATCH 3/5] libproc: debugging improvements
Nick Alcock
nick.alcock at oracle.com
Tue Dec 3 11:36:08 UTC 2024
Attempting to track down the intermittent failures in
test/unittest/proc/tst.multitrace.sh is rendered difficult by the fact that
multiple dtraces are running at once, tracing multiple processes, but the
debugging messages emitted by libproc do not provide either the TID of the
tracing thread or the PID of the process being traced in too many cases.
Worse yet, if you do turn debugging on, both dtraces emit debugging output
simultaneously to the same stderr stream. The interleaving is bad enough,
but very often this causes lines to be emitted to stderr that do not start
with the standard libproc DEBUG time-since-epoch: string, because it got
interspersed into the previous line. runtest then partitions that partial
line off into a separate part of the log, rendering everything entirely
incomprehensible.
So emit more PID-related info, including the TID of the process-control
thread; and arrange for tst.multitrace.sh to capture DTrace debugging output
itself and dump it as two separated pieces: if a dtrace exits nonzero, note
which one exited as well as dumping its debug output.
Also, when this-should-never-happen conditions like a Pwait() returning
-ECHILD happen (usually indicating that we were not the victim's tracer
after all) by dumping the entire /proc/$pid/status of that process to
the debug stream, so we can tell what the tracer *was* and whether the
process was even stopped. (No impact at all if this condition never
happens, which it never should, of course, or if debugging is off.)
Signed-off-by: Nick Alcock <nick.alcock at oracle.com>
---
libdtrace/dt_proc.c | 6 ++-
libproc/Pcontrol.c | 57 +++++++++++++++++++++++++---
libproc/rtld_db.c | 2 +-
test/unittest/usdt/tst.multitrace.sh | 17 ++++++++-
4 files changed, 73 insertions(+), 9 deletions(-)
diff --git a/libdtrace/dt_proc.c b/libdtrace/dt_proc.c
index 41b148e807f84..6ad57cc5f12ef 100644
--- a/libdtrace/dt_proc.c
+++ b/libdtrace/dt_proc.c
@@ -870,6 +870,9 @@ dt_proc_control(void *arg)
int err;
jmp_buf exec_jmp;
+ dt_dprintf("%i: process control thread %i starting.\n", dpr->dpr_pid,
+ gettid());
+
/*
* Set up global libproc hooks that must be active before any processes
* are grabbed or created.
@@ -1421,7 +1424,8 @@ dt_proc_control_cleanup(void *arg)
* out by ptrace() wrappers above us in the call stack, since the whole
* thread is going away.
*/
- dt_dprintf("%i: process control thread going away.\n", dpr->dpr_pid);
+ dt_dprintf("%i: process control thread %i going away.\n", dpr->dpr_pid,
+ gettid());
if(dpr->dpr_lock_count_ctrl == 0 ||
!pthread_equal(dpr->dpr_lock_holder, pthread_self()))
dt_proc_lock(dpr);
diff --git a/libproc/Pcontrol.c b/libproc/Pcontrol.c
index c96a410b70b3f..a32a362eaa1fb 100644
--- a/libproc/Pcontrol.c
+++ b/libproc/Pcontrol.c
@@ -66,6 +66,7 @@ static int add_bkpt(struct ps_prochandle *P, uintptr_t addr,
void *data);
static void delete_bkpt_handler(struct bkpt *bkpt);
static jmp_buf **single_thread_unwinder_pad(struct ps_prochandle *unused);
+static void Pdump_proc_status(pid_t pid);
static ptrace_lock_hook_fun *ptrace_lock_hook;
static waitpid_lock_hook_fun *waitpid_lock_hook;
@@ -332,10 +333,12 @@ Pgrab(pid_t pid, int noninvasiveness, int already_ptraced, void *wrap_arg,
if (*perr < 0) {
if (noninvasiveness < 1) {
+ _dprintf("%i: Pgrab(): not grabbed.\n", P->pid);
Pfree_internal(P);
return NULL;
}
close(P->memfd);
+ _dprintf("%i: Pgrab(): grabbed noninvasively.\n", P->pid);
noninvasiveness = 2;
}
} else {
@@ -719,7 +722,9 @@ Pwait_internal(struct ps_prochandle *P, boolean_t block, int *return_early)
return 0;
if (errno == ECHILD) {
+ _dprintf("%i: Pwait: got ECHILD from waitpid(), state %i, trace count %i, halted %i\n", P->pid, P->state, P->ptrace_count, P->ptrace_halted);
P->state = PS_DEAD;
+ Pdump_proc_status(P->pid);
return 0;
}
@@ -1262,7 +1267,7 @@ Ppush_state(struct ps_prochandle *P, int state)
s->state = state;
dt_list_prepend(&P->ptrace_states, s);
- _dprintf("Ppush_state(): ptrace_count %i, state %i\n", P->ptrace_count, s->state);
+ _dprintf("%i: Ppush_state(): ptrace_count %i, state %i\n", P->pid, P->ptrace_count, s->state);
return s;
}
@@ -1278,7 +1283,7 @@ Ppop_state(struct ps_prochandle *P)
s = dt_list_next(&P->ptrace_states);
dt_list_delete(&P->ptrace_states, s);
- _dprintf("Ppop_state(): ptrace_count %i, state %i\n", P->ptrace_count+1, s->state);
+ _dprintf("%i: Ppop_state(): ptrace_count %i, state %i\n", P->pid, P->ptrace_count+1, s->state);
state = s->state;
free(s);
return state;
@@ -1422,6 +1427,9 @@ err_nostate:
if (P->ptrace_count == 0 && ptrace_lock_hook)
ptrace_lock_hook(P, P->wrap_arg, 0);
+ _dprintf("Ptrace(): error return (possibly other tracer), trace count now %i: %s\n",
+ P->ptrace_count, strerror(errno));
+
if (err != -ECHILD)
return err;
else
@@ -1511,8 +1519,11 @@ Puntrace(struct ps_prochandle *P, int leave_stopped)
P->state = PS_RUN;
P->ptraced = FALSE;
if ((wrapped_ptrace(P, PTRACE_DETACH, P->pid, 0, 0) < 0) &&
- (errno == ESRCH))
+ (errno == ESRCH)) {
+ _dprintf("%i: Punbkpt(): -ESRCH, process is dead.\n",
+ P->pid);
P->state = PS_DEAD;
+ }
P->ptrace_halted = FALSE;
P->info_valid = 0;
}
@@ -1812,7 +1823,7 @@ Punbkpt(struct ps_prochandle *P, uintptr_t addr)
if (Preset_bkpt_ip(P, P->tracing_bkpt) < 0)
switch (errno) {
case ESRCH:
- _dprintf("%i: -ESRCH, process is dead.\n",
+ _dprintf("%i: Punbkpt(): -ESRCH, process is dead.\n",
P->pid);
P->state = PS_DEAD;
return;
@@ -1870,7 +1881,8 @@ Punbkpt_child_poke(struct ps_prochandle *P, pid_t pid, bkpt_t *bkpt)
bkpt->bkpt_addr, bkpt->orig_insn) < 0)
switch (errno) {
case ESRCH:
- _dprintf("%i: -ESRCH, process is dead.\n", child_pid);
+ _dprintf("%i: Punbkpt_child_poke(): -ESRCH, process is dead.\n",
+ child_pid);
if (!pid)
P->state = PS_DEAD;
return;
@@ -2181,8 +2193,11 @@ Pbkpt_continue(struct ps_prochandle *P)
if (wrapped_ptrace(P, PTRACE_CONT, P->pid, 0, 0) < 0) {
int err = errno;
if (err == ESRCH) {
- if ((kill(P->pid, 0) < 0) && errno == ESRCH)
+ if ((kill(P->pid, 0) < 0) && errno == ESRCH) {
+ _dprintf("%i: Pbpkt_continue(): Got ESRCH, process is dead.\n",
+ P->pid);
P->state = PS_DEAD;
+ }
}
/*
* Since we must have an outstanding Ptrace() anyway,
@@ -2760,6 +2775,36 @@ Phastty(pid_t pid)
return tty != 0;
}
+/*
+ * Dump /proc/$pid/status into the debug log.
+ */
+static void
+Pdump_proc_status(pid_t pid)
+{
+ char status[PATH_MAX];
+ FILE *fp;
+ char *line = NULL;
+ size_t len;
+
+ snprintf(status, sizeof(status), "/proc/%i/status", pid);
+
+ if ((fp = fopen(status, "r")) == NULL) {
+ _dprintf("Process is dead.\n");
+ return;
+ }
+
+ while (getline(&line, &len, fp) >= 0) {
+ if (strlen(line) > 0) {
+ if (line[strlen(line) - 1] == '\n')
+ line[strlen(line)-1] = '\0';
+ _dprintf("%li: %s\n", (long)pid, line);
+ }
+ }
+ free(line);
+ fclose(fp);
+ return;
+}
+
/*
* Get a specific field out of /proc/$pid/status and return the portion after
* the colon in a new dynamically allocated string, or NULL if no field matches.
diff --git a/libproc/rtld_db.c b/libproc/rtld_db.c
index b6d33ce38425a..9eb63138326dc 100644
--- a/libproc/rtld_db.c
+++ b/libproc/rtld_db.c
@@ -1771,7 +1771,7 @@ rd_loadobj_iter(rd_agent_t *rd, rl_iter_f *fun, void *state)
if (rd->P->state == PS_DEAD) {
*jmp_pad = old_exec_jmp;
- _dprintf("%i: link map iteration failed: process is dead..\n",
+ _dprintf("%i: link map iteration failed: process is dead.\n",
rd->P->pid);
return RD_ERR;
}
diff --git a/test/unittest/usdt/tst.multitrace.sh b/test/unittest/usdt/tst.multitrace.sh
index 4e53dbb992508..262c782821ef0 100755
--- a/test/unittest/usdt/tst.multitrace.sh
+++ b/test/unittest/usdt/tst.multitrace.sh
@@ -73,7 +73,7 @@ if [ $? -ne 0 ]; then
fi
script() {
- exec $dtrace $dt_flags -qws /dev/stdin $1 $2 $3 <<'EOF'
+ exec $dtrace $dt_flags -qws /dev/stdin $1 $2 $3 2> debug.$3 <<'EOF'
int fired[pid_t];
int exited[pid_t];
@@ -142,13 +142,28 @@ DONE=$!
script $ONE $TWO 2 &
DTWO=$!
+dump_debug() {
+ if [[ -n $DTRACE_DEBUG ]]; then
+ echo "runtest DEBUG $(date +%s): Debug output of first dtrace so far, PID $DONE" >&2
+ cat debug.1 >&2
+
+ echo "runtest DEBUG $(date +%s): Debug output of second dtrace so far, PID $DTWO" >&2
+ cat debug.2 >&2
+ fi
+}
+
if ! wait $DONE; then
+ dump_debug
+ echo "first dtrace exited nonzero at $(date +%s)" >&2
exit 1
fi
if ! wait $DTWO; then
+ dump_debug
+ echo "second dtrace exited nonzero at $(date +%s)" >&2
exit 1
fi
+dump_debug
wait $ONE $TWO
--
2.47.1.279.g84c5f4e78e
More information about the DTrace-devel
mailing list