[DTrace-devel] [PATCH 3/5] libproc: debugging improvements

Kris Van Hees kris.van.hees at oracle.com
Sat Dec 7 04:38:34 UTC 2024


On Tue, Dec 03, 2024 at 11:36:08AM +0000, Nick Alcock wrote:
> Attempting to track down the intermittent failures in
> test/unittest/proc/tst.multitrace.sh is rendered difficult by the fact that
> multiple dtraces are running at once, tracing multiple processes, but the
> debugging messages emitted by libproc do not provide either the TID of the
> tracing thread or the PID of the process being traced in too many cases.
> 
> Worse yet, if you do turn debugging on, both dtraces emit debugging output
> simultaneously to the same stderr stream.  The interleaving is bad enough,
> but very often this causes lines to be emitted to stderr that do not start
> with the standard libproc DEBUG time-since-epoch: string, because it got
> interspersed into the previous line. runtest then partitions that partial
> line off into a separate part of the log, rendering everything entirely
> incomprehensible.
> 
> So emit more PID-related info, including the TID of the process-control
> thread; and arrange for tst.multitrace.sh to capture DTrace debugging output
> itself and dump it as two separated pieces: if a dtrace exits nonzero, note
> which one exited as well as dumping its debug output.
> 
> Also, when this-should-never-happen conditions like a Pwait() returning
> -ECHILD happen (usually indicating that we were not the victim's tracer
> after all) by dumping the entire /proc/$pid/status of that process to
> the debug stream, so we can tell what the tracer *was* and whether the
> process was even stopped.  (No impact at all if this condition never
> happens, which it never should, of course, or if debugging is off.)
> 
> Signed-off-by: Nick Alcock <nick.alcock at oracle.com>

Reviewed-by: Kris Van Hees <kris.van.hees at oracle.com>

> ---
>  libdtrace/dt_proc.c                  |  6 ++-
>  libproc/Pcontrol.c                   | 57 +++++++++++++++++++++++++---
>  libproc/rtld_db.c                    |  2 +-
>  test/unittest/usdt/tst.multitrace.sh | 17 ++++++++-
>  4 files changed, 73 insertions(+), 9 deletions(-)
> 
> diff --git a/libdtrace/dt_proc.c b/libdtrace/dt_proc.c
> index 41b148e807f84..6ad57cc5f12ef 100644
> --- a/libdtrace/dt_proc.c
> +++ b/libdtrace/dt_proc.c
> @@ -870,6 +870,9 @@ dt_proc_control(void *arg)
>  	int err;
>  	jmp_buf exec_jmp;
>  
> +	dt_dprintf("%i: process control thread %i starting.\n", dpr->dpr_pid,
> +		   gettid());
> +
>  	/*
>  	 * Set up global libproc hooks that must be active before any processes
>  	 * are grabbed or created.
> @@ -1421,7 +1424,8 @@ dt_proc_control_cleanup(void *arg)
>  	 * out by ptrace() wrappers above us in the call stack, since the whole
>  	 * thread is going away.
>  	 */
> -	dt_dprintf("%i: process control thread going away.\n", dpr->dpr_pid);
> +	dt_dprintf("%i: process control thread %i going away.\n", dpr->dpr_pid,
> +		   gettid());
>  	if(dpr->dpr_lock_count_ctrl == 0 ||
>  	    !pthread_equal(dpr->dpr_lock_holder, pthread_self()))
>  		dt_proc_lock(dpr);
> diff --git a/libproc/Pcontrol.c b/libproc/Pcontrol.c
> index c96a410b70b3f..a32a362eaa1fb 100644
> --- a/libproc/Pcontrol.c
> +++ b/libproc/Pcontrol.c
> @@ -66,6 +66,7 @@ static int add_bkpt(struct ps_prochandle *P, uintptr_t addr,
>      void *data);
>  static void delete_bkpt_handler(struct bkpt *bkpt);
>  static jmp_buf **single_thread_unwinder_pad(struct ps_prochandle *unused);
> +static void Pdump_proc_status(pid_t pid);
>  
>  static ptrace_lock_hook_fun *ptrace_lock_hook;
>  static waitpid_lock_hook_fun *waitpid_lock_hook;
> @@ -332,10 +333,12 @@ Pgrab(pid_t pid, int noninvasiveness, int already_ptraced, void *wrap_arg,
>  
>  			if (*perr < 0) {
>  				if (noninvasiveness < 1) {
> +					_dprintf("%i: Pgrab(): not grabbed.\n", P->pid);
>  					Pfree_internal(P);
>  					return NULL;
>  				}
>  				close(P->memfd);
> +				_dprintf("%i: Pgrab(): grabbed noninvasively.\n", P->pid);
>  				noninvasiveness = 2;
>  			}
>  		} else {
> @@ -719,7 +722,9 @@ Pwait_internal(struct ps_prochandle *P, boolean_t block, int *return_early)
>  				return 0;
>  
>  			if (errno == ECHILD) {
> +				_dprintf("%i: Pwait: got ECHILD from waitpid(), state %i, trace count %i, halted %i\n", P->pid, P->state, P->ptrace_count, P->ptrace_halted);
>  				P->state = PS_DEAD;
> +				Pdump_proc_status(P->pid);
>  				return 0;
>  			}
>  
> @@ -1262,7 +1267,7 @@ Ppush_state(struct ps_prochandle *P, int state)
>  
>  	s->state = state;
>  	dt_list_prepend(&P->ptrace_states, s);
> -	_dprintf("Ppush_state(): ptrace_count %i, state %i\n", P->ptrace_count, s->state);
> +	_dprintf("%i: Ppush_state(): ptrace_count %i, state %i\n", P->pid, P->ptrace_count, s->state);
>  
>  	return s;
>  }
> @@ -1278,7 +1283,7 @@ Ppop_state(struct ps_prochandle *P)
>  
>  	s = dt_list_next(&P->ptrace_states);
>  	dt_list_delete(&P->ptrace_states, s);
> -	_dprintf("Ppop_state(): ptrace_count %i, state %i\n", P->ptrace_count+1, s->state);
> +	_dprintf("%i: Ppop_state(): ptrace_count %i, state %i\n", P->pid, P->ptrace_count+1, s->state);
>  	state = s->state;
>  	free(s);
>  	return state;
> @@ -1422,6 +1427,9 @@ err_nostate:
>  	if (P->ptrace_count == 0 && ptrace_lock_hook)
>  		ptrace_lock_hook(P, P->wrap_arg, 0);
>  
> +	_dprintf("Ptrace(): error return (possibly other tracer), trace count now %i: %s\n",
> +		 P->ptrace_count, strerror(errno));
> +
>  	if (err != -ECHILD)
>  		return err;
>  	else
> @@ -1511,8 +1519,11 @@ Puntrace(struct ps_prochandle *P, int leave_stopped)
>  		P->state = PS_RUN;
>  		P->ptraced = FALSE;
>  		if ((wrapped_ptrace(P, PTRACE_DETACH, P->pid, 0, 0) < 0) &&
> -		    (errno == ESRCH))
> +		    (errno == ESRCH)) {
> +			_dprintf("%i: Punbkpt(): -ESRCH, process is dead.\n",
> +				 P->pid);
>  			P->state = PS_DEAD;
> +		}
>  		P->ptrace_halted = FALSE;
>  		P->info_valid = 0;
>  	}
> @@ -1812,7 +1823,7 @@ Punbkpt(struct ps_prochandle *P, uintptr_t addr)
>  		if (Preset_bkpt_ip(P, P->tracing_bkpt) < 0)
>  			switch (errno) {
>  			case ESRCH:
> -				_dprintf("%i: -ESRCH, process is dead.\n",
> +				_dprintf("%i: Punbkpt(): -ESRCH, process is dead.\n",
>  				    P->pid);
>  				P->state = PS_DEAD;
>  				return;
> @@ -1870,7 +1881,8 @@ Punbkpt_child_poke(struct ps_prochandle *P, pid_t pid, bkpt_t *bkpt)
>  		bkpt->bkpt_addr, bkpt->orig_insn) < 0)
>  		switch (errno) {
>  		case ESRCH:
> -			_dprintf("%i: -ESRCH, process is dead.\n", child_pid);
> +			_dprintf("%i: Punbkpt_child_poke(): -ESRCH, process is dead.\n",
> +				 child_pid);
>  			if (!pid)
>  				P->state = PS_DEAD;
>  			return;
> @@ -2181,8 +2193,11 @@ Pbkpt_continue(struct ps_prochandle *P)
>  		if (wrapped_ptrace(P, PTRACE_CONT, P->pid, 0, 0) < 0) {
>  			int err = errno;
>  			if (err == ESRCH) {
> -				if ((kill(P->pid, 0) < 0) && errno == ESRCH)
> +				if ((kill(P->pid, 0) < 0) && errno == ESRCH) {
> +					_dprintf("%i: Pbpkt_continue(): Got ESRCH, process is dead.\n",
> +						 P->pid);
>  					P->state = PS_DEAD;
> +				}
>  			}
>  			/*
>  			 * Since we must have an outstanding Ptrace() anyway,
> @@ -2760,6 +2775,36 @@ Phastty(pid_t pid)
>  	return tty != 0;
>  }
>  
> +/*
> + * Dump /proc/$pid/status into the debug log.
> + */
> +static void
> +Pdump_proc_status(pid_t pid)
> +{
> +	char status[PATH_MAX];
> +	FILE *fp;
> +	char *line = NULL;
> +	size_t len;
> +
> +	snprintf(status, sizeof(status), "/proc/%i/status", pid);
> +
> +	if ((fp = fopen(status, "r")) == NULL) {
> +		_dprintf("Process is dead.\n");
> +		return;
> +	}
> +
> +	while (getline(&line, &len, fp) >= 0) {
> +		if (strlen(line) > 0) {
> +			if (line[strlen(line) - 1] == '\n')
> +				line[strlen(line)-1] = '\0';
> +			_dprintf("%li: %s\n", (long)pid, line);
> +		}
> +	}
> +	free(line);
> +	fclose(fp);
> +	return;
> +}
> +
>  /*
>   * Get a specific field out of /proc/$pid/status and return the portion after
>   * the colon in a new dynamically allocated string, or NULL if no field matches.
> diff --git a/libproc/rtld_db.c b/libproc/rtld_db.c
> index b6d33ce38425a..9eb63138326dc 100644
> --- a/libproc/rtld_db.c
> +++ b/libproc/rtld_db.c
> @@ -1771,7 +1771,7 @@ rd_loadobj_iter(rd_agent_t *rd, rl_iter_f *fun, void *state)
>  	if (rd->P->state == PS_DEAD) {
>  		*jmp_pad = old_exec_jmp;
>  
> -		_dprintf("%i: link map iteration failed: process is dead..\n",
> +		_dprintf("%i: link map iteration failed: process is dead.\n",
>  		    rd->P->pid);
>  		return RD_ERR;
>  	}
> diff --git a/test/unittest/usdt/tst.multitrace.sh b/test/unittest/usdt/tst.multitrace.sh
> index 4e53dbb992508..262c782821ef0 100755
> --- a/test/unittest/usdt/tst.multitrace.sh
> +++ b/test/unittest/usdt/tst.multitrace.sh
> @@ -73,7 +73,7 @@ if [ $? -ne 0 ]; then
>  fi
>  
>  script() {
> -	exec $dtrace $dt_flags -qws /dev/stdin $1 $2 $3 <<'EOF'
> +	exec $dtrace $dt_flags -qws /dev/stdin $1 $2 $3 2> debug.$3 <<'EOF'
>  	int fired[pid_t];
>  	int exited[pid_t];
>  
> @@ -142,13 +142,28 @@ DONE=$!
>  script $ONE $TWO 2 &
>  DTWO=$!
>  
> +dump_debug() {
> +    if [[ -n $DTRACE_DEBUG ]]; then
> +        echo "runtest DEBUG $(date +%s): Debug output of first dtrace so far, PID $DONE" >&2
> +        cat debug.1 >&2
> +
> +        echo "runtest DEBUG $(date +%s): Debug output of second dtrace so far, PID $DTWO" >&2
> +        cat debug.2 >&2
> +    fi
> +}
> +
>  if ! wait $DONE; then
> +    dump_debug
> +    echo "first dtrace exited nonzero at $(date +%s)" >&2
>      exit 1
>  fi
>  
>  if ! wait $DTWO; then
> +    dump_debug
> +    echo "second dtrace exited nonzero at $(date +%s)" >&2
>      exit 1
>  fi
> +dump_debug
>  
>  wait $ONE $TWO
>  
> -- 
> 2.47.1.279.g84c5f4e78e
> 



More information about the DTrace-devel mailing list