[DTrace-devel] [PATCH 15/16] Use eventfd rather than a condition variable to signal process death
Kris Van Hees
kris.van.hees at oracle.com
Thu Mar 18 21:55:03 PDT 2021
The proc handling code uses a conditional variable(dph_cv) that is
broadcast using pthread. THe original code would periodically check
the cv in dtrace_sleep(), called from the main consumer loop.
Now that the buffer handling implementation uses epoll(), dtrace_sleep()
has become obsolete. An eventfd is now used to communicate that there
are process notifications waiting to be processed.
Signed-off-by: Kris Van Hees <kris.van.hees at oracle.com>
---
libdtrace/dt_consume.c | 75 ++++++++++++++++-
libdtrace/dt_impl.h | 1 +
libdtrace/dt_open.c | 5 +-
libdtrace/dt_proc.c | 27 +++----
libdtrace/dt_work.c | 107 +++----------------------
test/unittest/proc/tst.grab-exit.d | 3 +-
test/unittest/proc/tst.omp.sh | 3 +-
test/unittest/proc/tst.pthread-exec.sh | 3 +-
test/unittest/proc/tst.signals.sh | 3 +-
9 files changed, 109 insertions(+), 118 deletions(-)
diff --git a/libdtrace/dt_consume.c b/libdtrace/dt_consume.c
index a034ee8e..f39156df 100644
--- a/libdtrace/dt_consume.c
+++ b/libdtrace/dt_consume.c
@@ -20,6 +20,7 @@
#include <libproc.h>
#include <port.h>
#include <sys/epoll.h>
+#include <sys/eventfd.h>
#include <linux/perf_event.h>
#include <linux/ring_buffer.h>
@@ -2317,6 +2318,8 @@ dt_consume_begin(dtrace_hdl_t *dtp, FILE *fp, struct epoll_event *events,
for (i = 0; i < cnt; i++) {
bpeb = events[i].data.ptr;
+ if (bpeb == NULL)
+ continue;
if (bpeb->cpu == cpu)
break;
}
@@ -2362,7 +2365,7 @@ dt_consume_begin(dtrace_hdl_t *dtp, FILE *fp, struct epoll_event *events,
for (i = 0; i < cnt; i++) {
dt_peb_t *peb = events[i].data.ptr;
- if (peb == bpeb)
+ if (peb == NULL || peb == bpeb)
continue;
rval = dt_consume_cpu(dtp, fp, peb, pf, rf, arg);
@@ -2393,6 +2396,58 @@ dt_consume_begin(dtrace_hdl_t *dtp, FILE *fp, struct epoll_event *events,
return rval;
}
+static void
+dt_consume_proc_exits(dtrace_hdl_t *dtp)
+{
+ dt_proc_hash_t *dph = dtp->dt_procs;
+ dt_proc_notify_t *dprn;
+
+ /*
+ * Make sure that any synchronous notifications of process exit are
+ * received. Regardless of why we awaken, iterate over any pending
+ * notifications and process them.
+ */
+ pthread_mutex_lock(&dph->dph_lock);
+ dt_proc_enqueue_exits(dtp);
+
+ while ((dprn = dph->dph_notify) != NULL) {
+ if (dtp->dt_prochdlr != NULL) {
+ char *err = dprn->dprn_errmsg;
+ pid_t pid = dprn->dprn_pid;
+ int state = PS_DEAD;
+
+ /*
+ * The dprn_dpr may be NULL if attachment or process
+ * creation has failed, or once the process dies. Only
+ * get the state of a dprn that is not NULL.
+ */
+ if (dprn->dprn_dpr != NULL) {
+ pid = dprn->dprn_dpr->dpr_pid;
+ dt_proc_lock(dprn->dprn_dpr);
+ }
+
+ if (*err == '\0')
+ err = NULL;
+
+ if (dprn->dprn_dpr != NULL)
+ state = dt_Pstate(dtp, pid);
+
+ if (state < 0 || state == PS_DEAD)
+ pid *= -1;
+
+ if (dprn->dprn_dpr != NULL)
+ dt_proc_unlock(dprn->dprn_dpr);
+
+ dtp->dt_prochdlr(pid, err, dtp->dt_procarg);
+ }
+
+ dph->dph_notify = dprn->dprn_next;
+ dt_free(dtp, dprn);
+ }
+
+ pthread_mutex_unlock(&dph->dph_lock);
+}
+
dtrace_workstatus_t
dtrace_consume(dtrace_hdl_t *dtp, FILE *fp, dtrace_consume_probe_f *pf,
dtrace_consume_rec_f *rf, void *arg)
@@ -2433,6 +2488,20 @@ dtrace_consume(dtrace_hdl_t *dtp, FILE *fp, dtrace_consume_probe_f *pf,
return DTRACE_WORKSTATUS_ERROR;
}
+ /*
+ * See if there are notifications pending from the proc handling code.
+ * If there are, we process them first.
+ */
+ for (i = 0; i < cnt; i++) {
+ if (events[i].data.ptr == dtp->dt_procs) {
+ eventfd_t dummy;
+
+ eventfd_read(dtp->dt_proc_fd, &dummy);
+ dt_consume_proc_exits(dtp);
+ events[i].data.ptr = NULL;
+ }
+ }
+
/*
* If dtp->dt_beganon is not -1, we did not process the BEGIN probe
* data (if any) yet. We do know (since dtp->dt_active is TRUE) that
@@ -2453,6 +2522,8 @@ dtrace_consume(dtrace_hdl_t *dtp, FILE *fp, dtrace_consume_probe_f *pf,
for (i = 0; i < cnt; i++) {
dt_peb_t *peb = events[i].data.ptr;
+ if (peb == NULL)
+ continue;
if (dtp->dt_stopped && peb->cpu == dtp->dt_endedon)
continue;
@@ -2474,6 +2545,8 @@ dtrace_consume(dtrace_hdl_t *dtp, FILE *fp, dtrace_consume_probe_f *pf,
for (i = 0; i < cnt; i++) {
dt_peb_t *peb = events[i].data.ptr;
+ if (peb == NULL)
+ continue;
if (peb->cpu != dtp->dt_endedon)
continue;
diff --git a/libdtrace/dt_impl.h b/libdtrace/dt_impl.h
index 6a089a13..687132fe 100644
--- a/libdtrace/dt_impl.h
+++ b/libdtrace/dt_impl.h
@@ -348,6 +348,7 @@ struct dtrace_hdl {
int dt_ddefs_fd; /* file descriptor for D CTF debugging cache */
int dt_stdout_fd; /* file descriptor for saved stdout */
int dt_poll_fd; /* file descriptor for event polling */
+ int dt_proc_fd; /* file descriptor for proc eventfd */
int dt_stmap_fd; /* file descriptor for the 'state' BPF map */
int dt_aggmap_fd; /* file descriptor for the 'aggs' BPF map */
dtrace_handle_err_f *dt_errhdlr; /* error handler, if any */
diff --git a/libdtrace/dt_open.c b/libdtrace/dt_open.c
index 669bd714..79b834e7 100644
--- a/libdtrace/dt_open.c
+++ b/libdtrace/dt_open.c
@@ -8,7 +8,7 @@
#include <sys/types.h>
#include <sys/utsname.h>
#include <sys/resource.h>
-
+#include <sys/eventfd.h>
#include <libelf.h>
#include <string.h>
@@ -737,6 +737,7 @@ dt_vopen(int version, int flags, int *errp,
dtp->dt_provbuckets = _dtrace_strbuckets;
dtp->dt_provs = calloc(dtp->dt_provbuckets, sizeof(dt_provider_t *));
dt_proc_hash_create(dtp);
+ dtp->dt_proc_fd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
dtp->dt_nextepid = 1;
dtp->dt_maxprobe = 0;
dtp->dt_vmax = DT_VERS_LATEST;
@@ -1220,6 +1221,8 @@ dtrace_close(dtrace_hdl_t *dtp)
close(dtp->dt_ddefs_fd);
if (dtp->dt_stdout_fd != -1)
close(dtp->dt_stdout_fd);
+ if (dtp->dt_proc_fd != -1)
+ close(dtp->dt_proc_fd);
if (dtp->dt_poll_fd != -1)
close(dtp->dt_poll_fd);
diff --git a/libdtrace/dt_proc.c b/libdtrace/dt_proc.c
index c04b1cb0..51d2dca5 100644
--- a/libdtrace/dt_proc.c
+++ b/libdtrace/dt_proc.c
@@ -44,18 +44,19 @@
* A simple notification mechanism is provided for libdtrace clients using
* dtrace_handle_proc() for notification of process death. When this event
* occurs, the dt_proc_t itself is enqueued on a notification list and the
- * control thread broadcasts to dph_cv. dtrace_sleep() will wake up using this
- * condition and will then call the client handler as necessary.
+ * control thread triggers an event using dtp->dt_prov_fd. The epoll_wait()
+ * in dtrace_consume() will wake up using this condition and the client handler
+ * will be called as necessary.
*
* The locking in this file is crucial, to stop the process-control threads
* from running before dtrace is ready for them, to coordinate proxy calls
* between the main thread and process-control thread, and to ensure that the
* state is not torn down while the process-control threads are still using it.
* Two locks are used:
- * - the dph_lock is a simple mutex protecting mutations of the dph notify list,
- * and serving as the lock around the dph_cv; the dph hash itself is not
- * protected, and may only be modified from the main thread. This lock
- * nests inside the dpr_lock if both are taken at once.
+ * - the dph_lock is a simple mutex protecting mutations of the dph notify
+ * list; the dph hash itself is not protected, and may only be modified
+ * from the main thread. This lock nests inside the dpr_lock if both are
+ * taken at once.
* - the dpr_lock is a counted semaphore constructed from a mutex, a
* currently-holding thread ID, and two counters tracking a lock count for
* each of its two possible holders (it could equally well be constructed
@@ -72,6 +73,7 @@
*/
#include <sys/wait.h>
+#include <sys/eventfd.h>
#include <string.h>
#include <signal.h>
#include <assert.h>
@@ -150,7 +152,7 @@ dt_proc_notify(dtrace_hdl_t *dtp, dt_proc_hash_t *dph, dt_proc_t *dpr,
dph->dph_notify = dprn;
if (broadcast)
- pthread_cond_broadcast(&dph->dph_cv);
+ eventfd_write(dtp->dt_proc_fd, 1);
if (lock)
pthread_mutex_unlock(&dph->dph_lock);
}
@@ -1336,11 +1338,10 @@ dt_proc_control_cleanup(void *arg)
/*
* Set dpr_done and clear dpr_tid to indicate that the control thread
- * has exited, and notify any waiting thread on the dph_cv or in
- * dt_proc_destroy() that we have successfully exited. Clean up the
- * libproc state, unless this is a non-ptraceable process that doesn't
- * need a monitor thread. (In that case, the monitor thread is suiciding
- * but the libproc state should remain.)
+ * has exited, and notify any waiting thread that we have successfully
+ * exited. Clean up the libproc state, unless this is a non-ptraceable
+ * process that doesn't need a monitor thread. (In that case, the
+ * monitor thread is suiciding but the libproc state should remain.)
*
* If we were cancelled while already holding the mutex, don't lock it
* again.
@@ -2228,7 +2229,6 @@ dt_proc_hash_create(dtrace_hdl_t *dtp)
sizeof(dt_proc_t *) * _dtrace_pidbuckets - 1)) != NULL) {
pthread_mutex_init(&dtp->dt_procs->dph_lock, NULL);
- pthread_cond_init(&dtp->dt_procs->dph_cv, NULL);
dtp->dt_procs->dph_hashlen = _dtrace_pidbuckets;
dtp->dt_procs->dph_lrulim = _dtrace_pidlrulim;
@@ -2259,7 +2259,6 @@ dt_proc_hash_destroy(dtrace_hdl_t *dtp)
*npp = npr->dprn_next;
dt_free(dtp, npr);
}
- pthread_cond_destroy(&dph->dph_cv);
dtp->dt_procs = NULL;
dt_free(dtp, dph);
diff --git a/libdtrace/dt_work.c b/libdtrace/dt_work.c
index 7eb13da2..108eef9a 100644
--- a/libdtrace/dt_work.c
+++ b/libdtrace/dt_work.c
@@ -19,16 +19,6 @@
#include <linux/perf_event.h>
#include <sys/epoll.h>
-static const struct {
- int dtslt_option;
- size_t dtslt_offs;
-} _dtrace_sleeptab[] = {
- { DTRACEOPT_STATUSRATE, offsetof(dtrace_hdl_t, dt_laststatus) },
- { DTRACEOPT_AGGRATE, offsetof(dtrace_hdl_t, dt_lastagg) },
- { DTRACEOPT_SWITCHRATE, offsetof(dtrace_hdl_t, dt_lastswitch) },
- { DTRACEOPT_MAX, 0 }
-};
-
void
BEGIN_probe(void)
{
@@ -39,88 +29,6 @@ END_probe(void)
{
}
-void
-dtrace_sleep(dtrace_hdl_t *dtp)
-{
- dt_proc_hash_t *dph = dtp->dt_procs;
- dtrace_optval_t policy = dtp->dt_options[DTRACEOPT_BUFPOLICY];
- dt_proc_notify_t *dprn;
-
- hrtime_t earliest = INT64_MAX;
- struct timespec tv;
- int i;
-
- for (i = 0; _dtrace_sleeptab[i].dtslt_option < DTRACEOPT_MAX; i++) {
- uintptr_t a = (uintptr_t)dtp + _dtrace_sleeptab[i].dtslt_offs;
- int opt = _dtrace_sleeptab[i].dtslt_option;
- dtrace_optval_t interval = dtp->dt_options[opt];
-
- /*
- * If the buffering policy is set to anything other than
- * "switch", we ignore the aggrate and switchrate -- they're
- * meaningless.
- */
- if (policy != DTRACEOPT_BUFPOLICY_SWITCH &&
- _dtrace_sleeptab[i].dtslt_option != DTRACEOPT_STATUSRATE)
- continue;
-
- if (*((hrtime_t *)a) + interval < earliest)
- earliest = *((hrtime_t *)a) + interval;
- }
-
- pthread_mutex_lock(&dph->dph_lock);
-
- tv.tv_sec = earliest / NANOSEC;
- tv.tv_nsec = earliest % NANOSEC;
-
- /*
- * Wait until the time specified by "earliest" has arrived, or until we
- * receive notification that a process is in an interesting state; also
- * make sure that any synchronous notifications of process exit are
- * received. Regardless of why we awaken, iterate over any pending
- * notifications and process them.
- */
- pthread_cond_timedwait(&dph->dph_cv, &dph->dph_lock, &tv);
- dt_proc_enqueue_exits(dtp);
-
- while ((dprn = dph->dph_notify) != NULL) {
- if (dtp->dt_prochdlr != NULL) {
- char *err = dprn->dprn_errmsg;
- pid_t pid = dprn->dprn_pid;
- int state = PS_DEAD;
-
- /*
- * The dprn_dpr may be NULL if attachment or process
- * creation has failed, or once the process dies. Only
- * get the state of a dprn that is not NULL.
- */
- if (dprn->dprn_dpr != NULL) {
- pid = dprn->dprn_dpr->dpr_pid;
- dt_proc_lock(dprn->dprn_dpr);
- }
-
- if (*err == '\0')
- err = NULL;
-
- if (dprn->dprn_dpr != NULL)
- state = dt_Pstate(dtp, pid);
-
- if (state < 0 || state == PS_DEAD)
- pid *= -1;
-
- if (dprn->dprn_dpr != NULL)
- dt_proc_unlock(dprn->dprn_dpr);
-
- dtp->dt_prochdlr(pid, err, dtp->dt_procarg);
- }
-
- dph->dph_notify = dprn->dprn_next;
- dt_free(dtp, dprn);
- }
-
- pthread_mutex_unlock(&dph->dph_lock);
-}
-
int
dtrace_status(dtrace_hdl_t *dtp)
{
@@ -143,8 +51,9 @@ dtrace_status(dtrace_hdl_t *dtp)
int
dtrace_go(dtrace_hdl_t *dtp, uint_t cflags)
{
- size_t size;
- int err;
+ size_t size;
+ int err;
+ struct epoll_event ev;
if (dtp->dt_active)
return dt_set_errno(dtp, EINVAL);
@@ -168,6 +77,16 @@ dtrace_go(dtrace_hdl_t *dtp, uint_t cflags)
if (dtp->dt_poll_fd < 0)
return dt_set_errno(dtp, errno);
+ /*
+ * Register the proc eventfd descriptor to receive notifications about
+ * process exit.
+ */
+ ev.events = EPOLLIN;
+ ev.data.ptr = dtp->dt_procs;
+ if (epoll_ctl(dtp->dt_poll_fd, EPOLL_CTL_ADD, dtp->dt_proc_fd, &ev) ==
+ -1)
+ return dt_set_errno(dtp, errno);
+
/*
* We need enough space for the pref_event_header, a 32-bit size, a
* 4-byte gap, and the largest trace data record we may be writing to
diff --git a/test/unittest/proc/tst.grab-exit.d b/test/unittest/proc/tst.grab-exit.d
index 180ed135..8b2d09b1 100644
--- a/test/unittest/proc/tst.grab-exit.d
+++ b/test/unittest/proc/tst.grab-exit.d
@@ -1,11 +1,10 @@
/*
* Oracle Linux DTrace.
- * Copyright (c) 2019, 2020, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2019, 2021, Oracle and/or its affiliates. All rights reserved.
* Licensed under the Universal Permissive License v 1.0 as shown at
* http://oss.oracle.com/licenses/upl.
*/
-/* @@xfail: dtv2 */
/* @@runtest-opts: -c /bin/true */
/*
diff --git a/test/unittest/proc/tst.omp.sh b/test/unittest/proc/tst.omp.sh
index e12f8f50..9f5cafaa 100755
--- a/test/unittest/proc/tst.omp.sh
+++ b/test/unittest/proc/tst.omp.sh
@@ -1,11 +1,10 @@
#!/bin/bash
#
# Oracle Linux DTrace.
-# Copyright (c) 2018, 2020, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2018, 2021, Oracle and/or its affiliates. All rights reserved.
# Licensed under the Universal Permissive License v 1.0 as shown at
# http://oss.oracle.com/licenses/upl.
#
-# @@xfail: dtv2
#
# This script tests that monitoring of an OMP process works, and
diff --git a/test/unittest/proc/tst.pthread-exec.sh b/test/unittest/proc/tst.pthread-exec.sh
index 90cf58af..7900c6ae 100755
--- a/test/unittest/proc/tst.pthread-exec.sh
+++ b/test/unittest/proc/tst.pthread-exec.sh
@@ -1,11 +1,10 @@
#!/bin/bash
#
# Oracle Linux DTrace.
-# Copyright (c) 2018, 2020, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2018, 2021, Oracle and/or its affiliates. All rights reserved.
# Licensed under the Universal Permissive License v 1.0 as shown at
# http://oss.oracle.com/licenses/upl.
#
-# @@xfail: dtv2
#
# This script tests that monitoring a multithreaded process that exec()s
diff --git a/test/unittest/proc/tst.signals.sh b/test/unittest/proc/tst.signals.sh
index 0f7c1da1..1105ebc2 100755
--- a/test/unittest/proc/tst.signals.sh
+++ b/test/unittest/proc/tst.signals.sh
@@ -1,11 +1,10 @@
#!/bin/bash
#
# Oracle Linux DTrace.
-# Copyright (c) 2014, 2020, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2014, 2021, Oracle and/or its affiliates. All rights reserved.
# Licensed under the Universal Permissive License v 1.0 as shown at
# http://oss.oracle.com/licenses/upl.
#
-# @@xfail: dtv2
#
# This script tests dtrace's handling of processes that receive various
--
2.28.0
More information about the DTrace-devel
mailing list