[DTrace-devel] [PATCH 15/16] Use eventfd rather than a condition variable to signal process death

Kris Van Hees kris.van.hees at oracle.com
Thu Mar 18 21:55:03 PDT 2021


The proc handling code uses a conditional variable(dph_cv)  that is
broadcast using pthread.  THe original code would periodically check
the cv in dtrace_sleep(), called from the main consumer loop.

Now that the buffer handling implementation uses epoll(), dtrace_sleep()
has become obsolete.  An eventfd is now used to communicate that there
are process notifications waiting to be processed.

Signed-off-by: Kris Van Hees <kris.van.hees at oracle.com>
---
 libdtrace/dt_consume.c                 |  75 ++++++++++++++++-
 libdtrace/dt_impl.h                    |   1 +
 libdtrace/dt_open.c                    |   5 +-
 libdtrace/dt_proc.c                    |  27 +++----
 libdtrace/dt_work.c                    | 107 +++----------------------
 test/unittest/proc/tst.grab-exit.d     |   3 +-
 test/unittest/proc/tst.omp.sh          |   3 +-
 test/unittest/proc/tst.pthread-exec.sh |   3 +-
 test/unittest/proc/tst.signals.sh      |   3 +-
 9 files changed, 109 insertions(+), 118 deletions(-)

diff --git a/libdtrace/dt_consume.c b/libdtrace/dt_consume.c
index a034ee8e..f39156df 100644
--- a/libdtrace/dt_consume.c
+++ b/libdtrace/dt_consume.c
@@ -20,6 +20,7 @@
 #include <libproc.h>
 #include <port.h>
 #include <sys/epoll.h>
+#include <sys/eventfd.h>
 #include <linux/perf_event.h>
 #include <linux/ring_buffer.h>
 
@@ -2317,6 +2318,8 @@ dt_consume_begin(dtrace_hdl_t *dtp, FILE *fp, struct epoll_event *events,
 	for (i = 0; i < cnt; i++) {
 		bpeb = events[i].data.ptr;
 
+		if (bpeb == NULL)
+			continue;
 		if (bpeb->cpu == cpu)
 			break;
 	}
@@ -2362,7 +2365,7 @@ dt_consume_begin(dtrace_hdl_t *dtp, FILE *fp, struct epoll_event *events,
 	for (i = 0; i < cnt; i++) {
 		dt_peb_t	*peb = events[i].data.ptr;
 
-		if (peb == bpeb)
+		if (peb == NULL || peb == bpeb)
 			continue;
 
 		rval = dt_consume_cpu(dtp, fp, peb, pf, rf, arg);
@@ -2393,6 +2396,58 @@ dt_consume_begin(dtrace_hdl_t *dtp, FILE *fp, struct epoll_event *events,
 	return rval;
 }
 
+static void
+dt_consume_proc_exits(dtrace_hdl_t *dtp)
+{
+	dt_proc_hash_t		*dph = dtp->dt_procs;
+	dt_proc_notify_t	*dprn;
+
+	/*
+	 * Make sure that any synchronous notifications of process exit are
+	 * received.  Regardless of why we awaken, iterate over any pending
+	 * notifications and process them.
+	 */
+	pthread_mutex_lock(&dph->dph_lock);
+	dt_proc_enqueue_exits(dtp);
+
+	while ((dprn = dph->dph_notify) != NULL) {
+		if (dtp->dt_prochdlr != NULL) {
+			char	*err = dprn->dprn_errmsg;
+			pid_t	pid = dprn->dprn_pid;
+			int	state = PS_DEAD;
+
+			/*
+			 * The dprn_dpr may be NULL if attachment or process
+			 * creation has failed, or once the process dies.  Only
+			 * get the state of a dprn that is not NULL.
+			 */
+			if (dprn->dprn_dpr != NULL) {
+				pid = dprn->dprn_dpr->dpr_pid;
+				dt_proc_lock(dprn->dprn_dpr);
+			}
+
+			if (*err == '\0')
+				err = NULL;
+
+			if (dprn->dprn_dpr != NULL)
+				state = dt_Pstate(dtp, pid);
+
+			if (state < 0 || state == PS_DEAD)
+				pid *= -1;
+
+			if (dprn->dprn_dpr != NULL)
+				dt_proc_unlock(dprn->dprn_dpr);
+
+			dtp->dt_prochdlr(pid, err, dtp->dt_procarg);
+		}
+
+		dph->dph_notify = dprn->dprn_next;
+		dt_free(dtp, dprn);
+	}
+
+	pthread_mutex_unlock(&dph->dph_lock);
+}
+
 dtrace_workstatus_t
 dtrace_consume(dtrace_hdl_t *dtp, FILE *fp, dtrace_consume_probe_f *pf,
 	       dtrace_consume_rec_f *rf, void *arg)
@@ -2433,6 +2488,20 @@ dtrace_consume(dtrace_hdl_t *dtp, FILE *fp, dtrace_consume_probe_f *pf,
 		return DTRACE_WORKSTATUS_ERROR;
 	}
 
+	/*
+	 * See if there are notifications pending from the proc handling code.
+	 * If there are, we process them first.
+	 */
+	for (i = 0; i < cnt; i++) {
+		if (events[i].data.ptr == dtp->dt_procs) {
+			eventfd_t	dummy;
+
+			eventfd_read(dtp->dt_proc_fd, &dummy);
+			dt_consume_proc_exits(dtp);
+			events[i].data.ptr = NULL;
+		}
+	}
+
 	/*
 	 * If dtp->dt_beganon is not -1, we did not process the BEGIN probe
 	 * data (if any) yet.  We do know (since dtp->dt_active is TRUE) that
@@ -2453,6 +2522,8 @@ dtrace_consume(dtrace_hdl_t *dtp, FILE *fp, dtrace_consume_probe_f *pf,
 	for (i = 0; i < cnt; i++) {
 		dt_peb_t	*peb = events[i].data.ptr;
 
+		if (peb == NULL)
+			continue;
 		if (dtp->dt_stopped && peb->cpu == dtp->dt_endedon)
 			continue;
 
@@ -2474,6 +2545,8 @@ dtrace_consume(dtrace_hdl_t *dtp, FILE *fp, dtrace_consume_probe_f *pf,
 	for (i = 0; i < cnt; i++) {
 		dt_peb_t	*peb = events[i].data.ptr;
 
+		if (peb == NULL)
+			continue;
 		if (peb->cpu != dtp->dt_endedon)
 			continue;
 
diff --git a/libdtrace/dt_impl.h b/libdtrace/dt_impl.h
index 6a089a13..687132fe 100644
--- a/libdtrace/dt_impl.h
+++ b/libdtrace/dt_impl.h
@@ -348,6 +348,7 @@ struct dtrace_hdl {
 	int dt_ddefs_fd;	/* file descriptor for D CTF debugging cache */
 	int dt_stdout_fd;	/* file descriptor for saved stdout */
 	int dt_poll_fd;		/* file descriptor for event polling */
+	int dt_proc_fd;		/* file descriptor for proc eventfd */
 	int dt_stmap_fd;	/* file descriptor for the 'state' BPF map */
 	int dt_aggmap_fd;	/* file descriptor for the 'aggs' BPF map */
 	dtrace_handle_err_f *dt_errhdlr; /* error handler, if any */
diff --git a/libdtrace/dt_open.c b/libdtrace/dt_open.c
index 669bd714..79b834e7 100644
--- a/libdtrace/dt_open.c
+++ b/libdtrace/dt_open.c
@@ -8,7 +8,7 @@
 #include <sys/types.h>
 #include <sys/utsname.h>
 #include <sys/resource.h>
-
+#include <sys/eventfd.h>
 
 #include <libelf.h>
 #include <string.h>
@@ -737,6 +737,7 @@ dt_vopen(int version, int flags, int *errp,
 	dtp->dt_provbuckets = _dtrace_strbuckets;
 	dtp->dt_provs = calloc(dtp->dt_provbuckets, sizeof(dt_provider_t *));
 	dt_proc_hash_create(dtp);
+	dtp->dt_proc_fd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
 	dtp->dt_nextepid = 1;
 	dtp->dt_maxprobe = 0;
 	dtp->dt_vmax = DT_VERS_LATEST;
@@ -1220,6 +1221,8 @@ dtrace_close(dtrace_hdl_t *dtp)
 		close(dtp->dt_ddefs_fd);
 	if (dtp->dt_stdout_fd != -1)
 		close(dtp->dt_stdout_fd);
+	if (dtp->dt_proc_fd != -1)
+		close(dtp->dt_proc_fd);
 	if (dtp->dt_poll_fd != -1)
 		close(dtp->dt_poll_fd);
 
diff --git a/libdtrace/dt_proc.c b/libdtrace/dt_proc.c
index c04b1cb0..51d2dca5 100644
--- a/libdtrace/dt_proc.c
+++ b/libdtrace/dt_proc.c
@@ -44,18 +44,19 @@
  * A simple notification mechanism is provided for libdtrace clients using
  * dtrace_handle_proc() for notification of process death.  When this event
  * occurs, the dt_proc_t itself is enqueued on a notification list and the
- * control thread broadcasts to dph_cv.  dtrace_sleep() will wake up using this
- * condition and will then call the client handler as necessary.
+ * control thread triggers an event using dtp->dt_prov_fd.  The epoll_wait()
+ * in dtrace_consume() will wake up using this condition and the client handler
+ * will be called as necessary.
  *
  * The locking in this file is crucial, to stop the process-control threads
  * from running before dtrace is ready for them, to coordinate proxy calls
  * between the main thread and process-control thread, and to ensure that the
  * state is not torn down while the process-control threads are still using it.
  * Two locks are used:
- *  - the dph_lock is a simple mutex protecting mutations of the dph notify list,
- *    and serving as the lock around the dph_cv; the dph hash itself is not
- *    protected, and may only be modified from the main thread.  This lock
- *    nests inside the dpr_lock if both are taken at once.
+ *  - the dph_lock is a simple mutex protecting mutations of the dph notify
+ *    list; the dph hash itself is not protected, and may only be modified
+ *    from the main thread.  This lock nests inside the dpr_lock if both are
+ *    taken at once.
  *  - the dpr_lock is a counted semaphore constructed from a mutex, a
  *    currently-holding thread ID, and two counters tracking a lock count for
  *    each of its two possible holders (it could equally well be constructed
@@ -72,6 +73,7 @@
  */
 
 #include <sys/wait.h>
+#include <sys/eventfd.h>
 #include <string.h>
 #include <signal.h>
 #include <assert.h>
@@ -150,7 +152,7 @@ dt_proc_notify(dtrace_hdl_t *dtp, dt_proc_hash_t *dph, dt_proc_t *dpr,
 		dph->dph_notify = dprn;
 
 		if (broadcast)
-			pthread_cond_broadcast(&dph->dph_cv);
+			eventfd_write(dtp->dt_proc_fd, 1);
 		if (lock)
 			pthread_mutex_unlock(&dph->dph_lock);
 	}
@@ -1336,11 +1338,10 @@ dt_proc_control_cleanup(void *arg)
 
 	/*
 	 * Set dpr_done and clear dpr_tid to indicate that the control thread
-	 * has exited, and notify any waiting thread on the dph_cv or in
-	 * dt_proc_destroy() that we have successfully exited.  Clean up the
-	 * libproc state, unless this is a non-ptraceable process that doesn't
-	 * need a monitor thread. (In that case, the monitor thread is suiciding
-	 * but the libproc state should remain.)
+	 * has exited, and notify any waiting thread that we have successfully
+	 * exited.  Clean up the libproc state, unless this is a non-ptraceable
+	 * process that doesn't need a monitor thread. (In that case, the
+	 * monitor thread is suiciding but the libproc state should remain.)
 	 *
 	 * If we were cancelled while already holding the mutex, don't lock it
 	 * again.
@@ -2228,7 +2229,6 @@ dt_proc_hash_create(dtrace_hdl_t *dtp)
 	    sizeof(dt_proc_t *) * _dtrace_pidbuckets - 1)) != NULL) {
 
 		pthread_mutex_init(&dtp->dt_procs->dph_lock, NULL);
-		pthread_cond_init(&dtp->dt_procs->dph_cv, NULL);
 
 		dtp->dt_procs->dph_hashlen = _dtrace_pidbuckets;
 		dtp->dt_procs->dph_lrulim = _dtrace_pidlrulim;
@@ -2259,7 +2259,6 @@ dt_proc_hash_destroy(dtrace_hdl_t *dtp)
 		*npp = npr->dprn_next;
 		dt_free(dtp, npr);
 	}
-	pthread_cond_destroy(&dph->dph_cv);
 
 	dtp->dt_procs = NULL;
 	dt_free(dtp, dph);
diff --git a/libdtrace/dt_work.c b/libdtrace/dt_work.c
index 7eb13da2..108eef9a 100644
--- a/libdtrace/dt_work.c
+++ b/libdtrace/dt_work.c
@@ -19,16 +19,6 @@
 #include <linux/perf_event.h>
 #include <sys/epoll.h>
 
-static const struct {
-	int dtslt_option;
-	size_t dtslt_offs;
-} _dtrace_sleeptab[] = {
-	{ DTRACEOPT_STATUSRATE, offsetof(dtrace_hdl_t, dt_laststatus) },
-	{ DTRACEOPT_AGGRATE, offsetof(dtrace_hdl_t, dt_lastagg) },
-	{ DTRACEOPT_SWITCHRATE, offsetof(dtrace_hdl_t, dt_lastswitch) },
-	{ DTRACEOPT_MAX, 0 }
-};
-
 void
 BEGIN_probe(void)
 {
@@ -39,88 +29,6 @@ END_probe(void)
 {
 }
 
-void
-dtrace_sleep(dtrace_hdl_t *dtp)
-{
-	dt_proc_hash_t *dph = dtp->dt_procs;
-	dtrace_optval_t policy = dtp->dt_options[DTRACEOPT_BUFPOLICY];
-	dt_proc_notify_t *dprn;
-
-	hrtime_t earliest = INT64_MAX;
-	struct timespec tv;
-	int i;
-
-	for (i = 0; _dtrace_sleeptab[i].dtslt_option < DTRACEOPT_MAX; i++) {
-		uintptr_t a = (uintptr_t)dtp + _dtrace_sleeptab[i].dtslt_offs;
-		int opt = _dtrace_sleeptab[i].dtslt_option;
-		dtrace_optval_t interval = dtp->dt_options[opt];
-
-		/*
-		 * If the buffering policy is set to anything other than
-		 * "switch", we ignore the aggrate and switchrate -- they're
-		 * meaningless.
-		 */
-		if (policy != DTRACEOPT_BUFPOLICY_SWITCH &&
-		    _dtrace_sleeptab[i].dtslt_option != DTRACEOPT_STATUSRATE)
-			continue;
-
-		if (*((hrtime_t *)a) + interval < earliest)
-			earliest = *((hrtime_t *)a) + interval;
-	}
-
-	pthread_mutex_lock(&dph->dph_lock);
-
-	tv.tv_sec = earliest / NANOSEC;
-	tv.tv_nsec = earliest % NANOSEC;
-
-	/*
-	 * Wait until the time specified by "earliest" has arrived, or until we
-	 * receive notification that a process is in an interesting state; also
-	 * make sure that any synchronous notifications of process exit are
-	 * received.  Regardless of why we awaken, iterate over any pending
-	 * notifications and process them.
-	 */
-	pthread_cond_timedwait(&dph->dph_cv, &dph->dph_lock, &tv);
-	dt_proc_enqueue_exits(dtp);
-
-	while ((dprn = dph->dph_notify) != NULL) {
-		if (dtp->dt_prochdlr != NULL) {
-			char *err = dprn->dprn_errmsg;
-			pid_t pid = dprn->dprn_pid;
-			int state = PS_DEAD;
-
-			/*
-			 * The dprn_dpr may be NULL if attachment or process
-			 * creation has failed, or once the process dies.  Only
-			 * get the state of a dprn that is not NULL.
-			 */
-			if (dprn->dprn_dpr != NULL) {
-				pid = dprn->dprn_dpr->dpr_pid;
-				dt_proc_lock(dprn->dprn_dpr);
-			}
-
-			if (*err == '\0')
-				err = NULL;
-
-			if (dprn->dprn_dpr != NULL)
-				state = dt_Pstate(dtp, pid);
-
-			if (state < 0 || state == PS_DEAD)
-				pid *= -1;
-
-			if (dprn->dprn_dpr != NULL)
-				dt_proc_unlock(dprn->dprn_dpr);
-
-			dtp->dt_prochdlr(pid, err, dtp->dt_procarg);
-		}
-
-		dph->dph_notify = dprn->dprn_next;
-		dt_free(dtp, dprn);
-	}
-
-	pthread_mutex_unlock(&dph->dph_lock);
-}
-
 int
 dtrace_status(dtrace_hdl_t *dtp)
 {
@@ -143,8 +51,9 @@ dtrace_status(dtrace_hdl_t *dtp)
 int
 dtrace_go(dtrace_hdl_t *dtp, uint_t cflags)
 {
-	size_t		size;
-	int		err;
+	size_t			size;
+	int			err;
+	struct epoll_event	ev;
 
 	if (dtp->dt_active)
 		return dt_set_errno(dtp, EINVAL);
@@ -168,6 +77,16 @@ dtrace_go(dtrace_hdl_t *dtp, uint_t cflags)
 	if (dtp->dt_poll_fd < 0)
 		return dt_set_errno(dtp, errno);
 
+	/*
+	 * Register the proc eventfd descriptor to receive notifications about
+	 * process exit.
+	 */
+	ev.events = EPOLLIN;
+	ev.data.ptr = dtp->dt_procs;
+	if (epoll_ctl(dtp->dt_poll_fd, EPOLL_CTL_ADD, dtp->dt_proc_fd, &ev) ==
+	    -1)
+		return dt_set_errno(dtp, errno);
+
 	/*
 	 * We need enough space for the pref_event_header, a 32-bit size, a
 	 * 4-byte gap, and the largest trace data record we may be writing to
diff --git a/test/unittest/proc/tst.grab-exit.d b/test/unittest/proc/tst.grab-exit.d
index 180ed135..8b2d09b1 100644
--- a/test/unittest/proc/tst.grab-exit.d
+++ b/test/unittest/proc/tst.grab-exit.d
@@ -1,11 +1,10 @@
 /*
  * Oracle Linux DTrace.
- * Copyright (c) 2019, 2020, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2019, 2021, Oracle and/or its affiliates. All rights reserved.
  * Licensed under the Universal Permissive License v 1.0 as shown at
  * http://oss.oracle.com/licenses/upl.
  */
 
-/* @@xfail: dtv2 */
 /* @@runtest-opts: -c /bin/true */
 
 /*
diff --git a/test/unittest/proc/tst.omp.sh b/test/unittest/proc/tst.omp.sh
index e12f8f50..9f5cafaa 100755
--- a/test/unittest/proc/tst.omp.sh
+++ b/test/unittest/proc/tst.omp.sh
@@ -1,11 +1,10 @@
 #!/bin/bash
 #
 # Oracle Linux DTrace.
-# Copyright (c) 2018, 2020, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2018, 2021, Oracle and/or its affiliates. All rights reserved.
 # Licensed under the Universal Permissive License v 1.0 as shown at
 # http://oss.oracle.com/licenses/upl.
 #
-# @@xfail: dtv2
 
 #
 # This script tests that monitoring of an OMP process works, and
diff --git a/test/unittest/proc/tst.pthread-exec.sh b/test/unittest/proc/tst.pthread-exec.sh
index 90cf58af..7900c6ae 100755
--- a/test/unittest/proc/tst.pthread-exec.sh
+++ b/test/unittest/proc/tst.pthread-exec.sh
@@ -1,11 +1,10 @@
 #!/bin/bash
 #
 # Oracle Linux DTrace.
-# Copyright (c) 2018, 2020, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2018, 2021, Oracle and/or its affiliates. All rights reserved.
 # Licensed under the Universal Permissive License v 1.0 as shown at
 # http://oss.oracle.com/licenses/upl.
 #
-# @@xfail: dtv2
 
 #
 # This script tests that monitoring a multithreaded process that exec()s
diff --git a/test/unittest/proc/tst.signals.sh b/test/unittest/proc/tst.signals.sh
index 0f7c1da1..1105ebc2 100755
--- a/test/unittest/proc/tst.signals.sh
+++ b/test/unittest/proc/tst.signals.sh
@@ -1,11 +1,10 @@
 #!/bin/bash
 #
 # Oracle Linux DTrace.
-# Copyright (c) 2014, 2020, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2014, 2021, Oracle and/or its affiliates. All rights reserved.
 # Licensed under the Universal Permissive License v 1.0 as shown at
 # http://oss.oracle.com/licenses/upl.
 #
-# @@xfail: dtv2
 
 #
 # This script tests dtrace's handling of processes that receive various
-- 
2.28.0




More information about the DTrace-devel mailing list