[DTrace-devel] [PATCH 02/10] dtprobed: add the DOF stash
Nick Alcock
nick.alcock at oracle.com
Wed Aug 2 13:26:52 UTC 2023
The DOF stash is a runtime directory (/run/dtrace/stash by default) in which
the dtprobed daemon keeps track of incoming DOF. There's a comment
describing it at the top of dtprobed/dof_stash.c, but in brief we have two
directories under /run/dtrace/stash, dof/, which contains raw DOF named by
mapping, each entry hardlinked to a corresponding entry in
pid/$pid/$mapping/raw, which tracks the DOF-containing mappings on a
PID-by-PID basis. We also keep track of parsed DOF in the per-PID
directory, and everything necessary to regenerate the parsed DOF should the
definition of struct dof_parsed change in future.
Consumers can use inotify watches to detect newly added or removed DOF or
probes: because all the DOF is in one directory, a very small number of
watches is needed to keep track of all the DOF in the system, and only one
watch per PID is needed to track it on a PID-by-PID basis. (Currently,
nothing is doing this, but it is now a possibility where it never was
before.)
In a future commit, dtprobed will emit all the DOF it receives into this
structure, removing it when processes exit, and DTrace will use it to create
and remove probes.
dof_stash.c itself provides functions to add new raw DOF to the stash, to
add a stream of parsed DOF corresponding to a given piece of raw DOF to the
stash, to remove DOF as processes issue DTRACEHIOC_REMOVE requests, to check
for any DOF that needs reparsing to handle changes to struct dof_parsed, and
to remove entries from the DOF stash that is left behind by processes that
died uncleanly without issuing DTRACEHIOC_REMOVEs. (Currently, we don't deal
with newly-started processes that reuse the same PID as such a process, or
processes that leave DOF behind on exec() and then add more: this
improvement will come soon.)
It does not do any DOF parsing: that is left up to the caller (dtprobed.c).
All the writes to the dof_stash are done via the *at() functions to avoid
any risk of bugs causing the file creations and most especially deletions it
carries out from leaking outside /run/dtrace. (We are not concerned about
symlink attacks etc because only root can write to /run/dtrace or any
directories underneath it.)
Signed-off-by: Nick Alcock <nick.alcock at oracle.com>
---
dtprobed/Build | 2 +-
dtprobed/dof_stash.c | 1140 +++++++++++++++++++++++++++++++++++++
dtprobed/dof_stash.h | 44 ++
dtprobed/dtprobed.service | 2 +
dtrace.spec | 4 +
libcommon/dof_parser.h | 22 +-
6 files changed, 1206 insertions(+), 8 deletions(-)
create mode 100644 dtprobed/dof_stash.c
create mode 100644 dtprobed/dof_stash.h
diff --git a/dtprobed/Build b/dtprobed/Build
index c44dd40ab3d2..f1e0dfff456f 100644
--- a/dtprobed/Build
+++ b/dtprobed/Build
@@ -26,7 +26,7 @@ dtprobed_CPPFLAGS := -I. -Idtprobed -Ilibproc -Ilibcommon -Ilibport
dtprobed_CFLAGS := $(shell pkg-config --cflags $(LIBFUSE))
dtprobed_LIBS := -lcommon -lproc -lcommon -lport -lelf $(shell pkg-config --libs $(LIBFUSE))
dtprobed_DEPS := libproc.a libcommon.a libport.a
-dtprobed_SOURCES := dtprobed.c seccomp-assistance.c
+dtprobed_SOURCES := dtprobed.c dof_stash.c seccomp-assistance.c
dtprobed_LIBSOURCES := libproc libcommon
ifdef HAVE_LIBSYSTEMD
diff --git a/dtprobed/dof_stash.c b/dtprobed/dof_stash.c
new file mode 100644
index 000000000000..0e25f85bcd7b
--- /dev/null
+++ b/dtprobed/dof_stash.c
@@ -0,0 +1,1140 @@
+/*
+ * Oracle Linux DTrace; DOF storage for later probe removal.
+ * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
+ * Licensed under the Universal Permissive License v 1.0 as shown at
+ * http://oss.oracle.com/licenses/upl.
+ *
+ * The DOF stash is the principal state preservation mechanism used by dtprobed.
+ * Keeping DOF on disk keeps memory management complexity low, and makes it
+ * possible for dtprobed to track DOF across daemon restarts and upgrades.
+ *
+ * The state storage structure is as follows:
+ *
+ * /run/dtrace/stash/dof/$dev-$ino: DOF contributed by particular mappings,
+ * in raw form (as received from the client).
+ *
+ * /run/dtrace/stash/dof-pid/$pid/$dev-$ino/raw: hardlink to the DOF for a given
+ * PID. Pruned of dead processes at startup and on occasion: entries also
+ * deleted on receipt of DTRACEHIOC_REMOVE ioctls. The principal effect of this
+ * is to bump the link count for the corresponding DOF in dof/ directory: when
+ * this falls to 1, the DOF is considered dead and the corresponding probe is
+ * removed.
+ *
+ * /run/dtrace/stash/dof-pid/$pid/$dev-$ino/dh: Serialized form of the
+ * dof_helper_t received from a given client. Preserved for all
+ * DTRACEHIOC_ADDDOFs, even if the DOF was already contributed by some other
+ * process. Keeping this around allows the DOF to be re-parsed as needed, even
+ * by a later re-execution of a different version of the same daemon with a
+ * different dof_parsed_t structure, and to be re-parsed with appropriate
+ * addresses for whichever mapping of this DOF is under consideration.
+ *
+ * /usr/dtrace/stash/dof-pid/$pid/$dev-$ino/parsed: parsed DOF for a given PID.
+ * Generated from the helper for PIDs that are the first appearance of a given
+ * piece of DOF and used for the common case where that DOF has only one mapping
+ * until its deletion (short-lived processes, mostly). Deleted at daemon
+ * startup.
+ *
+ * /run/dtrace/stash/dof-pid/$pid/$gen: symlink to a $dev-$ino dir. "$gen" is an
+ * incrementing generation counter starting at zero for every unique PID, used
+ * by clients to uniquely identify DOF pieces to remove.
+ *
+ * /run/dtrace/stash/dof-pid/$pid/next-gen: The next generation counter to use
+ * for a new piece of DOF from this process. Encoded as a sparse file whose
+ * size is the next counter value to use.
+ */
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include "dof_stash.h"
+
+#include <dt_list.h>
+
+#ifdef HAVE_FUSE_LOG
+#include <fuse_log.h>
+#else
+#include "rpl_fuse_log.h"
+#endif
+
+#include <port.h>
+
+static int dof_dir, pid_dir;
+
+/*
+ * Like close(), but only try to close positively-numbered fds.
+ */
+static int
+maybe_close(int fd)
+{
+ if (fd > -1)
+ return close(fd);
+ return 0;
+}
+
+/*
+ * Like mkdirat(), but already-existing directories are considered
+ * successful.
+ */
+static int
+mkexistingdirat(int dirfd, const char *name, mode_t mode)
+{
+ if (mkdirat(dirfd, name, mode) < 0) {
+ if (errno == EEXIST) {
+ struct stat s;
+
+ if (fstatat(dirfd, name, &s, 0) == 0 &&
+ S_ISDIR(s.st_mode))
+ return 0;
+ errno = EEXIST;
+ }
+ return -1;
+ }
+ return 0;
+}
+
+/*
+ * Make a runtime state directory and return its fd.
+ */
+static int
+make_state_dir(const char *dir)
+{
+ int fd;
+
+ if (mkdir(dir, 0755) < 0 && errno != EEXIST) {
+ fuse_log(FUSE_LOG_ERR, "dtprobed: "
+ "cannot create state directory %s: %s\n",
+ dir, strerror(errno));
+ return -1;
+ }
+
+ if ((fd = open(dir, O_PATH | O_CLOEXEC)) < 0) {
+ fuse_log(FUSE_LOG_ERR, "dtprobed: "
+ "cannot open state directory %s: %s\n",
+ dir, strerror(errno));
+ return -1;
+ }
+ return fd;
+}
+
+/*
+ * Make the a state directory under some other directory and return an fd to it.
+ *
+ * If "readable" is set, the fd is opened with O_RDONLY rather than O_PATH.
+ */
+static int
+make_state_dirat(int pid_dir, const char *name, const char *errmsg, int readable)
+{
+ int fd;
+ int flags = O_PATH;
+
+ if (readable)
+ flags = O_RDONLY;
+
+ if (mkexistingdirat(pid_dir, name, 0755) < 0) {
+ fuse_log(FUSE_LOG_ERR, "dtprobed: cannot create %s dir for %s: %s\n",
+ errmsg, name, strerror(errno));
+ return -1;
+ }
+
+ if ((fd = openat(pid_dir, name, flags | O_CLOEXEC)) < 0) {
+ fuse_log(FUSE_LOG_ERR, "dtprobed: cannot open %s dir for %s: %s\n",
+ errmsg, name, strerror(errno));
+ return -1;
+ }
+ return fd;
+}
+
+/*
+ * Initialize the DOF-stashing machinery, if needed.
+ */
+int
+dof_stash_init(const char *statedir)
+{
+ int root;
+ int stash;
+
+ if (statedir == NULL)
+ statedir="/run/dtrace";
+
+ if ((root = make_state_dir(statedir)) < 0)
+ return -1;
+
+ if ((stash = make_state_dirat(root, "stash", "statedir", 0)) < 0)
+ return -1;
+
+ if ((dof_dir = make_state_dirat(stash, "dof", "DOF", 0)) < 0)
+ return -1;
+ if ((pid_dir = make_state_dirat(stash, "dof-pid", "per-pid DOF", 1)) < 0)
+ return -1;
+
+ close(root);
+ close(stash);
+
+ return 0;
+}
+
+/*
+ * Compose a DOF directory's name from pieces.
+ */
+static char *
+make_dof_name(dev_t dev, ino_t ino)
+{
+ char *ret;
+
+ if (asprintf(&ret, "%li-%li", (long) dev, (long) ino) < 0) {
+ fuse_log(FUSE_LOG_ERR, "dtprobed: out of memory making DOF name\n");
+ return NULL;
+ }
+ return ret;
+}
+
+/*
+ * Split the pieces back up again.
+ */
+static int
+split_dof_name(const char *dof_name, dev_t *dev, ino_t *ino)
+{
+ if (sscanf(dof_name, "%li-%li", (long *) dev, (long *) ino) < 2) {
+ fuse_log(FUSE_LOG_ERR, "dtprobed: per-pid directory name %s unparseable\n",
+ dof_name);
+ return -1;
+ }
+ return 0;
+}
+
+/*
+ * A trivial wrapper.
+ */
+static char *
+make_numeric_name(long num)
+{
+ char *ret;
+
+ if (asprintf(&ret, "%li", num) < 0) {
+ fuse_log(FUSE_LOG_ERR, "dtprobed: out of memory making filename\n");
+ return NULL;
+ }
+ return ret;
+}
+
+
+/*
+ * Push a piece of parsed DOF into a list to be emitted by
+ * dof_stash_write_parsed().
+ */
+int
+dof_stash_push_parsed(dt_list_t *accum, dof_parsed_t *parsed)
+{
+ dof_parsed_list_t *entry;
+
+ if ((entry = calloc(1, sizeof(dof_parsed_list_t))) == NULL)
+ return -1;
+
+ entry->parsed = parsed;
+ dt_list_append(accum, entry);
+
+ return 0;
+}
+
+/*
+ * Free all the accumulated parsed DOF in the passed list.
+ */
+void
+dof_stash_flush(dt_list_t *accum)
+{
+ dof_parsed_list_t *accump;
+ dof_parsed_list_t *last_accump = NULL;
+
+ for (accump = dt_list_next(accum); accump != NULL;
+ accump = dt_list_next(accump)) {
+ dt_list_delete(accum, accump);
+ free(accump->parsed);
+ free(last_accump);
+ last_accump = accump;
+ }
+ free(last_accump);
+}
+
+/*
+ * Write a chunk of data to an fd.
+ */
+static int
+write_chunk(int fd, const void *buf, size_t size)
+{
+ char *bufptr = (char *) buf;
+
+ while (size > 0) {
+ size_t len;
+
+ if ((len = write(fd, bufptr, size)) < 0 && errno != EINTR)
+ return -1;
+
+ size -= len;
+ bufptr += len;
+ }
+ return 0;
+}
+
+/*
+ * Write out a piece of raw DOF. Returns the length of the file written,
+ * or 0 if none was needed (or -1 on error).
+ */
+static int
+dof_stash_write(int dirfd, const char *name, const void *buf, size_t size)
+{
+ struct stat s;
+ int fd;
+
+ /*
+ * Sanity check: if the DOF already exists but is not the same size
+ * as the DOF we already have, complain, and replace it and its parsed
+ * equivalent. If it does exist, there's no need to write it out.
+ *
+ * If we can't even unlink it or write it out, we give up -- the stash
+ * has failed and we won't be able to do anything it implies.
+ *
+ * (This is only a crude check -- obviously distinct raw DOF could be
+ * the same size by pure chance.)
+ */
+ if (fstatat(dirfd, name, &s, 0) == 0) {
+ if (s.st_size == size)
+ return 0;
+
+ fuse_log(FUSE_LOG_ERR, "dtprobed: DOF %s already exists, "
+ "but is %zx bytes long, not %zx: replacing\n",
+ name, s.st_size, size);
+ if (unlinkat(dirfd, name, 0) < 0) {
+ fuse_log(FUSE_LOG_ERR, "dtprobed: cannot remove old DOF %s: %s\n",
+ name, strerror(errno));
+ return -1;
+ }
+ }
+
+ if ((fd = openat(dirfd, name, O_CREAT | O_EXCL | O_WRONLY, 0644)) < 0) {
+ fuse_log(FUSE_LOG_ERR, "dtprobed: cannot write out raw DOF: %s\n",
+ strerror(errno));
+ return -1;
+ }
+
+ if (write_chunk(fd, buf, size) < 0)
+ goto err;
+
+ if (close(fd) < 0)
+ goto err;
+ return size + sizeof(uint64_t);
+
+ err:
+ fuse_log(FUSE_LOG_ERR, "dtprobed: cannot write out DOF: %s\n",
+ strerror(errno));
+ unlinkat(dirfd, name, 0);
+ close(fd);
+ return -1;
+}
+
+/*
+ * Write out all the parsed DOF from the passed-in accumulator if not already
+ * present. Do nothing (not even creating the file) if the accumulator is
+ * empty.
+ *
+ * The parsed file starts with a single 64-bit word giving the version of
+ * dof_parsed_t in use, to ensure that readers never use an incompatible
+ * version.
+ */
+int
+dof_stash_write_parsed(pid_t pid, dev_t dev, ino_t ino, dt_list_t *accum)
+{
+ char *pid_name = make_numeric_name(pid);
+ char *dof_name = make_dof_name(dev, ino);
+ int perpid_dir, perpid_dof_dir;
+ dof_parsed_list_t *accump;
+ int fd;
+ const char *op;
+ uint64_t version = DOF_PARSED_VERSION;
+
+ if (dt_list_next(accum) == NULL) {
+ free(pid_name);
+ free(dof_name);
+ return 0;
+ }
+
+ if ((perpid_dir = openat(pid_dir, pid_name, O_PATH | O_CLOEXEC)) < 0) {
+ fuse_log(FUSE_LOG_ERR, "dtprobed: PID %i, %lx/%lx: open failed for PID dir for parsed DOF write: %s\n",
+ pid, dev, ino, strerror(errno));
+ goto early_err;
+ }
+
+ if ((perpid_dof_dir = openat(perpid_dir, dof_name, O_PATH | O_CLOEXEC)) < 0) {
+ op = "open";
+ goto close_perpid_err;
+ }
+
+ if ((fd = openat(perpid_dof_dir, "parsed", O_CREAT | O_EXCL | O_WRONLY, 0644)) < 0) {
+ op = "open";
+ goto close_perpid_dof_err;
+ }
+
+ op = "write";
+ if (write_chunk(fd, &version, sizeof(uint64_t)) < 0)
+ goto err;
+
+ for (accump = dt_list_next(accum); accump != NULL;
+ accump = dt_list_next(accump)) {
+ if (write_chunk(fd, accump->parsed, accump->parsed->size) < 0)
+ goto err;
+ }
+
+ if (close(fd) < 0) {
+ unlinkat(perpid_dof_dir, "parsed", 0);
+ goto err;
+ }
+
+ close(perpid_dof_dir);
+ close(perpid_dir);
+ free(pid_name);
+ free(dof_name);
+ return 0;
+
+err:
+ close(fd);
+close_perpid_dof_err:
+ close(perpid_dof_dir);
+close_perpid_err:
+ close(perpid_dir);
+ fuse_log(FUSE_LOG_ERR, "dtprobed: PID %i, %lx/%lx: %s failed for parsed DOF write: %s\n",
+ pid, dev, ino, op, strerror(errno));
+ unlinkat(perpid_dof_dir, "parsed", 0);
+early_err:
+ free(pid_name);
+ free(dof_name);
+ return -1;
+}
+
+/*
+ * Write out the DOF helper. A trivial wrapper.
+ */
+static
+int dof_stash_write_dh(int dirfd, const dof_helper_t *dh)
+{
+ int fd;
+
+ if ((fd = openat(dirfd, "dh", O_CREAT | O_EXCL | O_WRONLY, 0644)) < 0)
+ goto err;
+
+ if (write_chunk(fd, dh, sizeof(dof_helper_t *)) < 0) {
+ close(fd);
+ goto err;
+ }
+
+ if (close(fd) < 0)
+ goto err;
+
+ return 0;
+
+ err:
+ fuse_log(FUSE_LOG_ERR, "dtprobed: cannot write out DOF helper: %s\n",
+ strerror(errno));
+ return -1;
+}
+
+/*
+ * Add a piece of raw DOF from a given (pid, dev, ino) triplet.
+ *
+ * Return the new DOF generation number, or -1 on error.
+ */
+int
+dof_stash_add(pid_t pid, dev_t dev, ino_t ino, const dof_helper_t *dh,
+ const void *dof, size_t size)
+{
+ char *dof_name = make_dof_name(dev, ino);
+ char *pid_name = make_numeric_name(pid);
+ int perpid_dir = -1;
+ int perpid_dof_dir = -1;
+ int gen_fd = -1;
+ struct stat gen_stat;
+ char *gen_name = NULL;
+ int new_pid = 1;
+ int new_dof = 0;
+ int err = -1;
+
+ if (!pid_name || !dof_name)
+ goto out_free;
+
+
+ /* Make the directories. */
+
+ perpid_dir = make_state_dirat(pid_dir, pid_name, "PID", 0);
+
+ if (perpid_dir < 0)
+ goto out_free;
+
+ perpid_dof_dir = make_state_dirat(perpid_dir, dof_name, "per-pid DOF", 0);
+
+ if (perpid_dof_dir < 0)
+ goto err_unlink_nomsg;
+
+ fuse_log(FUSE_LOG_DEBUG, "%i: adding DOF in mapping %lx/%lx, l_addr %lx\n",
+ pid, dev, ino, dh->dofhp_addr);
+
+ /* Get the current generation counter. */
+
+ if ((gen_fd = openat(perpid_dir, "next-gen", O_CREAT | O_WRONLY, 0644)) < 0) {
+ fuse_log(FUSE_LOG_ERR, "dtprobed: cannot open DOF generation counter for PID "
+ "%i: %s\n", pid, strerror(errno));
+ goto err_unlink_nomsg;
+ }
+
+ if (fstat(gen_fd, &gen_stat) < 0) {
+ fuse_log(FUSE_LOG_ERR, "dtprobed: cannot stat DOF generation counter for PID "
+ "%i: %s\n", pid, strerror(errno));
+ goto err_unlink_nomsg;
+ }
+ if (gen_stat.st_size > 0)
+ new_pid = 0;
+
+ /*
+ * Write out the raw DOF, if not already known, and link it into its
+ * per-PID dir: write the helper directly out. dof_stash_write tells us
+ * whether the DOF actually needed writing or not (via a straight size
+ * comparison, which suffices to spot almost everything other than the
+ * pathological case of stale DOF from a killed process whose inode is
+ * unlinked and replaced with another identically-numbered inode with
+ * DOF of the *exact same length*: this seems too unlikely to be worth
+ * bothering with.)
+ */
+ new_dof = 1;
+ switch (dof_stash_write(dof_dir, dof_name, dof, size)) {
+ case 0: new_dof = 0; break;
+ case -1: goto err_unlink_nomsg; break;
+ }
+
+ if (linkat(dof_dir, dof_name, perpid_dof_dir, "raw", 0) < 0)
+ goto err_unlink_msg;
+
+ if (dof_stash_write_dh(perpid_dof_dir, dh) < 0)
+ goto err_unlink_nomsg;
+
+ /*
+ * Update the generation counter and mark this DOF's generation. On
+ * error after this point we leak generation counter values.
+ */
+
+ if(ftruncate(gen_fd, gen_stat.st_size + 1) < 0) {
+ fuse_log(FUSE_LOG_ERR, "dtprobed: cannot update generation counter for PID %i: %s\n",
+ pid, strerror(errno));
+ goto err_unlink_nomsg;
+ }
+
+ gen_name = make_numeric_name(gen_stat.st_size);
+ if (symlinkat(dof_name, perpid_dir, gen_name) < 0)
+ goto err_unlink_msg;
+
+ err = 0;
+
+out_free:
+ free(dof_name);
+ free(pid_name);
+
+ maybe_close(perpid_dir);
+ maybe_close(perpid_dof_dir);
+ maybe_close(gen_fd);
+
+ return err == 0 ? gen_stat.st_size : err;
+
+err_unlink_msg:
+ fuse_log(FUSE_LOG_ERR, "Cannot link pids for DOF from PID %i, "
+ "DOF mapping %lx/%lx into place: %s\n", pid, dev, ino,
+ strerror(errno));
+err_unlink_nomsg:
+ unlinkat(perpid_dof_dir, "raw", 0);
+ unlinkat(perpid_dof_dir, "dh", 0);
+ unlinkat(perpid_dir, dof_name, AT_REMOVEDIR);
+
+ if (gen_name)
+ unlinkat(perpid_dir, gen_name, 0);
+
+ if (new_pid) {
+ unlinkat(perpid_dir, "next-gen", 0);
+ unlinkat(pid_dir, pid_name, AT_REMOVEDIR);
+ }
+
+ if (new_dof)
+ unlinkat(dof_dir, dof_name, 0);
+
+ goto out_free;
+}
+
+/*
+ * Determine if a file or directory (in the DOF stash) should be deleted.
+ */
+static int
+refcount_cleanup_p(int dirfd, const char *name, int isdir)
+{
+ struct stat s;
+
+ if (fstatat(dirfd, name, &s, 0) != 0) {
+ fuse_log(FUSE_LOG_ERR, "Cannot stat %s for cleanup: %s\n",
+ name, strerror(errno));
+ return -1;
+ }
+
+ if ((isdir && s.st_nlink != 2) || (!isdir && s.st_nlink != 1))
+ return 0;
+
+ return 1;
+}
+
+
+/*
+ * Delete a file or directory (in the DOF stash) if it has no other links.
+ */
+static int
+refcount_cleanup(int dirfd, const char *name, int isdir)
+{
+ switch (refcount_cleanup_p(dirfd, name, isdir)) {
+ case -1: return -1;
+ case 0: return 0;
+ default: break;
+ }
+
+ if (unlinkat(dirfd, name, isdir ? AT_REMOVEDIR : 0) < 0) {
+ fuse_log(FUSE_LOG_ERR, "dtprobed: cannot remove old DOF %s: %s\n",
+ name, strerror(errno));
+ return -1;
+ }
+ return 0;
+}
+
+/*
+ * Remove a piece of DOF, identified by generation counter. Return -1 on error.
+ *
+ * Not knowing about the DOF is not an error.
+ */
+int
+dof_stash_remove(pid_t pid, int gen)
+{
+ char *pid_name = make_numeric_name(pid);
+ char *gen_name = make_numeric_name(gen);
+ char *gen_linkname = NULL;
+ int perpid_dir = -1;
+ int perpid_dof_dir = -1;
+ struct stat pid_stat;
+ struct stat gen_stat;
+ int err = -1, unlink_err = 0;
+ const char *unlink_errfn;
+
+ /*
+ * Figure out the per-PID DOF directory by following the gen-counter
+ * symlink (by hand, because we want to reuse its name in the non-
+ * PID-specific dof/ subdir too).
+ */
+
+ if (!pid_name || !gen_name)
+ goto oom;
+
+ if ((perpid_dir = openat(pid_dir, pid_name, O_PATH | O_CLOEXEC)) < 0)
+ goto out;
+
+ if (fstatat(perpid_dir, gen_name, &gen_stat, AT_SYMLINK_NOFOLLOW) < 0)
+ goto out;
+
+ if ((gen_linkname = malloc(gen_stat.st_size + 1)) == NULL)
+ goto oom;
+
+ /*
+ * Truncated dir -> concurrent modifications, not allowed.
+ */
+ if (readlinkat(perpid_dir, gen_name, gen_linkname, gen_stat.st_size + 1) ==
+ gen_stat.st_size + 1) {
+ fuse_log(FUSE_LOG_ERR, "dtprobed: internal error: state dir is under concurrent modification, maybe multiple dtprobeds are running\n");
+ goto err;
+ }
+ gen_linkname[gen_stat.st_size] = 0;
+
+ perpid_dof_dir = openat(perpid_dir, gen_linkname, O_PATH | O_CLOEXEC);
+
+ if (strchr(gen_linkname, '/') != NULL ||
+ perpid_dof_dir < 0) {
+ if (perpid_dof_dir >= 0)
+ close(perpid_dof_dir);
+ fuse_log(FUSE_LOG_ERR, "dtprobed: internal error: generation link for PID %i generation %i is %s, which is not valid\n",
+ pid, gen, gen_linkname);
+ goto err;
+ }
+
+ if (faccessat(dof_dir, gen_linkname, F_OK, 0) < 0) {
+ fuse_log(FUSE_LOG_ERR, "dtprobed: internal error: generation link for PID %i generation %i is %s, which has no raw DOF\n",
+ pid, gen, gen_linkname);
+ goto err;
+ }
+
+ /*
+ * Unlink the per-PID raw DOF, then check the link counts on the
+ * non-per-PID stuff; if zero, unlink that too.
+ */
+
+ fuse_log(FUSE_LOG_DEBUG, "gen_name: %s; gen_linkname: %s; perpid_dof_dir: %i\n", gen_name, gen_linkname, perpid_dof_dir);
+ if (unlinkat(perpid_dof_dir, "raw", 0) != 0 && errno != ENOENT) {
+ fuse_log(FUSE_LOG_ERR, "dtprobed: cannot unlink per-PID raw DOF for PID %i generation %i: %s\n",
+ pid, gen, strerror(errno));
+ }
+
+ if (unlinkat(perpid_dof_dir, "parsed", 0) < 0 && errno != ENOENT) {
+ unlink_errfn = "parsed";
+ unlink_err = 1;
+ }
+ if (unlinkat(perpid_dof_dir, "dh", 0) < 0) {
+ unlink_errfn = "dh";
+ unlink_err = 1;
+ }
+ if (unlinkat(perpid_dir, gen_linkname, AT_REMOVEDIR) < 0) {
+ unlink_errfn = gen_linkname;
+ unlink_err = 1;
+ }
+ if (unlinkat(perpid_dir, gen_name, 0) < 0) {
+ unlink_errfn = gen_name;
+ unlink_err = 1;
+ }
+
+ if (unlink_err)
+ fuse_log(FUSE_LOG_ERR, "dtprobed: cannot unlink per-PID DOF %s for PID %i generation %i: %s\n",
+ unlink_errfn, pid, gen, strerror(errno));
+
+ refcount_cleanup(dof_dir, gen_linkname, 0);
+
+ /*
+ * Clean up the PID directory itself, if it is now empty. We can't just
+ * use refcount_cleanup here because we also have to delete the
+ * generation counter right before deleting the directory.
+ */
+
+ if (fstatat(pid_dir, pid_name, &pid_stat, 0) < 0) {
+ fuse_log(FUSE_LOG_ERR, "dtprobed: cannot stat per-PID for pid %i: %s\n",
+ pid, strerror(errno));
+ goto err;
+ }
+
+ if (pid_stat.st_nlink == 2) {
+ if (unlinkat(perpid_dir, "next-gen", 0) < 0) {
+ fuse_log(FUSE_LOG_ERR, "dtprobed: cannot clean up gen counter in per-PID dir for PID %i: %s\n",
+ pid, strerror(errno));
+ goto err;
+ }
+ refcount_cleanup(pid_dir, pid_name, 1);
+ }
+
+ out:
+ err = 0;
+
+ err:
+ free(pid_name);
+ free(gen_name);
+ free(gen_linkname);
+
+ maybe_close(perpid_dir);
+ maybe_close(perpid_dof_dir);
+
+ return err;
+
+oom:
+ fuse_log(FUSE_LOG_ERR, "dtprobed: out of memory unlinking DOF\n");
+ goto err;
+}
+
+/*
+ * Prune dead processes out of the DOF stash. This is not a correctness
+ * operation, just a space-waste reducer. If a process with DOF died uncleanly
+ * and its PID was recycled since a cleanup was last run, we'll leave some DOF
+ * around for a bit, but there are no correctness implications: either the new
+ * process has no DOF at all, in which case we'll never learn about it, or it
+ * does, in which case either it has identical mappings (in which case we'll
+ * just reuse them) or it has different ones (in which case we'll just add to
+ * them): in neither case is there any negative consequence of the nonexistent
+ * mappings other than a bit of space wastage. (This implies that any process
+ * reading DOF out of the state directory must check that a process that claims
+ * to have DOF at a given mapping actually has a corresponding mapping with the
+ * same dev/ino pair at the right address. There's no need for that process to
+ * clean it up if found to be wrong, though: this function will eventually get
+ * around to it.)
+ */
+void
+dof_stash_prune_dead(void)
+{
+ DIR *all_pids_dir;
+ struct dirent *pid_ent;
+ int tmp;
+
+ fuse_log(FUSE_LOG_DEBUG, "Pruning dead PIDs\n");
+
+ if ((tmp = dup(pid_dir)) < 0) {
+ fuse_log(FUSE_LOG_ERR, "cannot dup() for cleanup: %s\n", strerror(errno));
+ return;
+ }
+
+ if ((all_pids_dir = fdopendir(tmp)) == NULL) {
+ close(tmp);
+ fuse_log(FUSE_LOG_ERR, "cannot clean up per-PID DOF directory: %s\n",
+ strerror(errno));
+ return;
+ }
+ rewinddir(all_pids_dir);
+
+ /*
+ * Work over all the PIDs.
+ */
+ while (errno = 0, (pid_ent = readdir(all_pids_dir)) != NULL) {
+ DIR *perpid_dir;
+ struct dirent *mapping_ent;
+ char *end_pid;
+ pid_t pid;
+
+ if (errno != 0)
+ goto scan_failure;
+
+ if (pid_ent->d_type != DT_DIR)
+ continue;
+
+ /*
+ * Only directories with numeric names can be PIDs: skip all the
+ * rest.
+ */
+ pid = strtol(pid_ent->d_name, &end_pid, 10);
+ if (*end_pid != '\0')
+ continue;
+
+ if (kill(pid, 0) == 0 || errno != ESRCH)
+ continue;
+
+ /*
+ * Process dead: clean up its PIDs, deleting everything in the
+ * per-PID directory by calling dof_stash_remove() on every
+ * mapping in turn.
+ */
+
+ if ((tmp = openat(pid_dir, pid_ent->d_name, O_RDONLY | O_CLOEXEC)) < 0) {
+ fuse_log(FUSE_LOG_ERR,"cannot open per-PID DOF mappings directory for cleanup: %s\n",
+ strerror(errno));
+ goto out;
+ }
+
+ if ((perpid_dir = fdopendir(tmp)) == NULL) {
+ fuse_log(FUSE_LOG_ERR, "cannot clean up per-PID DOF mappings: %s\n",
+ strerror(errno));
+ close(tmp);
+ goto out;
+ }
+
+ fuse_log(FUSE_LOG_DEBUG, "Pruning dead PID %s\n", pid_ent->d_name);
+
+ /* Work over all the mappings in this PID. */
+
+ while (errno = 0, (mapping_ent = readdir(perpid_dir)) != NULL) {
+ int gen;
+ char *end_gen;
+
+ if (errno != 0) {
+ fuse_log(FUSE_LOG_ERR, "cannot read per-PID DOF mappings for cleanup: %s\n",
+ strerror(errno));
+ closedir(perpid_dir);
+ goto out;
+ }
+
+ if (mapping_ent->d_type != DT_LNK)
+ continue;
+
+ fuse_log(FUSE_LOG_DEBUG, "Working over generation %s\n", mapping_ent->d_name);
+
+ gen = strtol(mapping_ent->d_name, &end_gen, 10);
+ if (*end_gen != '\0')
+ continue;
+
+ dof_stash_remove(pid, gen);
+ }
+ closedir(perpid_dir);
+ errno = 0;
+ }
+
+ if (errno != 0)
+ goto scan_failure;
+
+out:
+ closedir(all_pids_dir);
+ return;
+
+scan_failure:
+ fuse_log(FUSE_LOG_ERR, "cannot scan per-PID DOF directory for cleanup: %s\n",
+ strerror(errno));
+ goto out;
+}
+
+/*
+ * Read a file into a buffer and return it. If READ_SIZE is non-negative, read
+ * only that many bytes (and silently fail if the file isn't at least that
+ * long).
+ */
+static void *
+read_file(int fd, ssize_t read_size, size_t *size)
+{
+ struct stat s;
+ char *buf;
+ char *bufptr;
+ int len;
+
+ if (fstat(fd, &s) < 0) {
+ fuse_log(FUSE_LOG_ERR, "cannot stat: %s\n", strerror(errno));
+ return NULL;
+ }
+ if (read_size >= 0) {
+ if (s.st_size < read_size)
+ return NULL;
+ } else
+ read_size = s.st_size;
+
+ if ((buf = malloc(read_size)) == NULL) {
+ fuse_log(FUSE_LOG_ERR, "out of memory allocating %zi bytes\n",
+ read_size);
+ return NULL;
+ }
+
+ if (size)
+ *size = read_size;
+
+ bufptr = buf;
+ while ((len = read(fd, bufptr, read_size)) < read_size) {
+ if (len < 0) {
+ if (errno != EINTR) {
+ fuse_log(FUSE_LOG_ERR, "cannot read: %s",
+ strerror(errno));
+ free(buf);
+ return NULL;
+ }
+ continue;
+ }
+ read_size -= len;
+ bufptr += len;
+ }
+ return buf;
+}
+
+/*
+ * Reparse all DOF. Invoked at daemon startup, so reports errors down the
+ * sync_fd for printing in the parent. Mappings that cannot be reparsed are
+ * simply ignored, on the grounds that most DOF, most of the time, is not
+ * used, so this will likely be ignorable.
+ *
+ * Normally, DOF with the same DOF_VERSION as the currently running daemon is
+ * left alone: if force is set, even that is reparsed.
+ */
+int
+reparse_dof(int out, int in,
+ int (*reparse)(int pid, int out, int in, dev_t dev, ino_t inum,
+ dof_helper_t *dh, const void *in_buf, size_t in_bufsz,
+ int reparsing),
+ int force)
+{
+ DIR *all_pids_dir;
+ struct dirent *pid_ent;
+ int err = -1;
+ int tmp;
+
+ if ((tmp = dup(pid_dir)) < 0) {
+ fuse_log(FUSE_LOG_ERR, "reparsing DOF: cannot dup(): %s\n", strerror(errno));
+ return -1;
+ }
+
+ if ((all_pids_dir = fdopendir(tmp)) == NULL) {
+ close(tmp);
+ fuse_log(FUSE_LOG_ERR, "reparsing DOF: cannot read per-PID DOF directory: %s\n",
+ strerror(errno));
+ return -1;
+ }
+
+ /* Work over all the PIDs. */
+
+ rewinddir(all_pids_dir);
+ while (errno = 0, (pid_ent = readdir(all_pids_dir)) != NULL) {
+ int perpid_fd;
+ char *end_pid;
+ pid_t pid;
+ DIR *perpid_dir;
+ struct dirent *mapping_ent;
+
+ if (errno != 0)
+ goto scan_failure;
+
+ if (pid_ent->d_type != DT_DIR)
+ continue;
+
+ /*
+ * Only directories with numeric names can be PIDs: skip all the
+ * rest.
+ */
+ pid = strtol(pid_ent->d_name, &end_pid, 10);
+ if (*end_pid != '\0')
+ continue;
+
+ if ((perpid_fd = openat(pid_dir, pid_ent->d_name, O_RDONLY | O_CLOEXEC)) < 0) {
+ fuse_log(FUSE_LOG_ERR, "reparsing DOF: cannot open per-PID DOF mappings directory for PID %s: %s\n",
+ pid_ent->d_name, strerror(errno));
+ goto out;
+ }
+
+ if ((tmp = dup(perpid_fd)) < 0) {
+ fuse_log(FUSE_LOG_ERR, "reparsing DOF: cannot dup(): %s\n", strerror(errno));
+ close(perpid_fd);
+ goto out;
+ }
+
+ if ((perpid_dir = fdopendir(tmp)) == NULL) {
+ fuse_log(FUSE_LOG_ERR, "reparsing DOF: cannot read per-PID DOF mappings for pid %s: %s\n",
+ pid_ent->d_name, strerror(errno));
+ close(tmp);
+ close(perpid_fd);
+ goto out;
+ }
+
+ /* Work over all the mappings in this PID. */
+
+ while (errno = 0, (mapping_ent = readdir(perpid_dir)) != NULL) {
+ int mapping_fd;
+ int fd;
+ dev_t dev;
+ ino_t ino;
+ size_t dof_size, dh_size;
+ void *dof = NULL;
+ void *dh = NULL;
+
+ if (errno != 0) {
+ fuse_log(FUSE_LOG_ERR, "reparsing DOF: cannot read per-PID DOF mappings for pid %s: %s\n",
+ pid_ent->d_name, strerror(errno));
+ goto perpid_err;
+ }
+
+ if (mapping_ent->d_type != DT_DIR)
+ continue;
+
+ /*
+ * Hidden files cannot be mappings. (This also skips
+ * . and ..).
+ */
+ if (mapping_ent->d_name[0] == '.')
+ continue;
+
+ if ((mapping_fd = openat(perpid_fd, mapping_ent->d_name,
+ O_PATH | O_CLOEXEC)) < 0) {
+ fuse_log(FUSE_LOG_ERR, "reparsing DOF: cannot open per-PID DOF mapping for PID %s, mapping %s: %s\n",
+ pid_ent->d_name, mapping_ent->d_name, strerror(errno));
+ goto perpid_err;
+ }
+
+ fuse_log(FUSE_LOG_DEBUG, "Parsed DOF is present: evaluating\n");
+
+ /*
+ * Only DOF with a different parsed representation gets reparsed.
+ */
+ if (!force) {
+ if ((fd = openat(mapping_fd, "parsed",
+ O_RDONLY | O_CLOEXEC)) == 0) {
+ uint64_t *parsed_version;
+
+ parsed_version = read_file(fd, sizeof(uint64_t),
+ NULL);
+ close(fd);
+
+ if (parsed_version != NULL &&
+ *parsed_version == DOF_PARSED_VERSION) {
+ fuse_log(FUSE_LOG_DEBUG, "No need to reparse\n");
+ continue;
+ }
+
+ free(parsed_version);
+ }
+ }
+
+ fuse_log(FUSE_LOG_DEBUG, "Deleting\n");
+
+ if (unlinkat(mapping_fd, "parsed", 0) < 0
+ && errno != ENOENT) {
+ close(mapping_fd);
+ fuse_log(FUSE_LOG_ERR, "reparsing DOF: cannot delete parsed DOF for PID %s, mapping %s: %s\n",
+ pid_ent->d_name, mapping_ent->d_name, strerror(errno));
+ goto perpid_err;
+ }
+
+ if (split_dof_name(mapping_ent->d_name, &dev, &ino) < 0) {
+ fuse_log(FUSE_LOG_ERR, "when reparsing DOF for PID %s, cannot derive dev/inum from %s: ignored\n",
+ pid_ent->d_name, mapping_ent->d_name);
+ continue;
+ }
+
+ if ((fd = openat(mapping_fd, "raw", O_RDONLY | O_CLOEXEC)) < 0) {
+ fuse_log(FUSE_LOG_ERR, "when reparsing DOF, cannot open raw DOF for PID %s, mapping %s: ignored: %s\n",
+ pid_ent->d_name, mapping_ent->d_name, strerror(errno));
+ continue;
+ }
+
+ if ((dof = read_file(fd, -1, &dof_size)) == NULL) {
+ fuse_log(FUSE_LOG_ERR, "when reparsing DOF, cannot read raw DOF for PID %s, mapping %s: ignored: %s\n",
+ pid_ent->d_name, mapping_ent->d_name, strerror(errno));
+ close(fd);
+ continue;
+ }
+ close(fd);
+
+ if ((fd = openat(mapping_fd, "dh", O_RDONLY | O_CLOEXEC)) < 0) {
+ fuse_log(FUSE_LOG_ERR, "when reparsing DOF, cannot open dh for PID %s, mapping %s: ignored: %s\n",
+ pid_ent->d_name, mapping_ent->d_name, strerror(errno));
+ free(dof);
+ continue;
+ }
+
+ if ((dh = read_file(fd, -1, &dh_size)) == NULL) {
+ fuse_log(FUSE_LOG_ERR, "when reparsing DOF, cannot read dh for PID %s, mapping %s: ignored: %s\n",
+ pid_ent->d_name, mapping_ent->d_name, strerror(errno));
+ free(dof);
+ close(fd);
+ continue;
+ }
+ close(fd);
+
+ fuse_log(FUSE_LOG_DEBUG, "Reparsing DOF for PID %s, mapping %s\n",
+ pid_ent->d_name, mapping_ent->d_name);
+ if (reparse(pid, out, in, dev, ino, dh, dof, dof_size, 1) < 0)
+ fuse_log(FUSE_LOG_ERR, "when reparsing DOF, cannot parse DOF for PID %s, mapping %s: ignored\n",
+ pid_ent->d_name, mapping_ent->d_name);
+ free(dof);
+ free(dh);
+ close(mapping_fd);
+
+ continue;
+
+ perpid_err:
+ closedir(perpid_dir);
+ close(perpid_fd);
+ goto out;
+ }
+ closedir(perpid_dir);
+ close(perpid_fd);
+ errno = 0;
+ }
+
+ if (errno != 0)
+ goto scan_failure;
+
+ err = 0;
+out:
+ closedir(all_pids_dir);
+ return err;
+
+scan_failure:
+ fuse_log(FUSE_LOG_ERR, "reparsing DOF: cannot scan per-PID DOF directory: %s\n",
+ strerror(errno));
+ goto out;
+}
diff --git a/dtprobed/dof_stash.h b/dtprobed/dof_stash.h
new file mode 100644
index 000000000000..2754a8180622
--- /dev/null
+++ b/dtprobed/dof_stash.h
@@ -0,0 +1,44 @@
+/*
+ * Oracle Linux DTrace; DOF storage for later probe removal.
+ * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
+ * Licensed under the Universal Permissive License v 1.0 as shown at
+ * http://oss.oracle.com/licenses/upl.
+ */
+
+#ifndef _DOF_STASH_H
+#define _DOF_STASH_H
+
+#include <sys/types.h>
+#include <stdint.h>
+
+#include <dt_list.h>
+#include "dof_parser.h"
+
+typedef struct dof_parsed_list {
+ dt_list_t list;
+ dof_parsed_t *parsed;
+} dof_parsed_list_t;
+
+int dof_stash_init(const char *statedir);
+
+int dof_stash_push_parsed(dt_list_t *accum, dof_parsed_t *parsed);
+
+int dof_stash_write_parsed(pid_t pid, dev_t dev, ino_t ino, dt_list_t *accum);
+
+void dof_stash_flush(dt_list_t *accum);
+
+int dof_stash_add(pid_t pid, dev_t dev, ino_t ino, const dof_helper_t *dh,
+ const void *dof, size_t size);
+
+int dof_stash_remove(pid_t pid, int gen);
+
+void dof_stash_prune_dead(void);
+
+int
+reparse_dof(int out, int in,
+ int (*reparse)(int pid, int out, int in, dev_t dev, ino_t inum,
+ dof_helper_t *dh, const void *in_buf,
+ size_t in_bufsz, int reparsing),
+ int force);
+
+#endif
diff --git a/dtprobed/dtprobed.service b/dtprobed/dtprobed.service
index 95f0a1d6d606..70443f08fb9b 100644
--- a/dtprobed/dtprobed.service
+++ b/dtprobed/dtprobed.service
@@ -20,6 +20,8 @@ ProtectHome=true
PrivateDevices=false
PrivateNetwork=true
ProtectControlGroups=true
+RuntimeDirectory=/run/dtrace
+RuntimeDirectoryPreserve=restart
[Install]
WantedBy=basic.target
diff --git a/dtrace.spec b/dtrace.spec
index 9ddac039d030..93ca984097e3 100644
--- a/dtrace.spec
+++ b/dtrace.spec
@@ -185,6 +185,10 @@ make DESTDIR=$RPM_BUILD_ROOT VERSION=%{version} \
HDRPREFIX="$RPM_BUILD_ROOT/usr/include" \
install install-test
+%if "%{?dist}" == ".el7"
+sed -i '/^ProtectSystem=/d; /^ProtectControlGroups=/d; /^RuntimeDirectory/d;' $RPM_BUILD_ROOT/usr/lib/systemd/system/dtprobed.service
+%endif
+
# Because systemtap creates a sdt.h header file we have to rename
# ours and then shift theirs out of the way.
mv $RPM_BUILD_ROOT/usr/include/sys/sdt.h \
diff --git a/libcommon/dof_parser.h b/libcommon/dof_parser.h
index 7ab18f4dee01..5ac177e85db7 100644
--- a/libcommon/dof_parser.h
+++ b/libcommon/dof_parser.h
@@ -1,6 +1,6 @@
/*
* Oracle Linux DTrace; DOF parser interface with the outside world
- * Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2022, 2023, Oracle and/or its affiliates. All rights reserved.
* Licensed under the Universal Permissive License v 1.0 as shown at
* http://oss.oracle.com/licenses/upl.
*/
@@ -15,11 +15,10 @@
#include <dtrace/helpers.h>
/*
- * Result of DOF probe parsing (currently used only for probe creation. We
- * receive a provider info structure, followed by N probe info structures each
- * of which is followed by possibly many tracepoint info structures, all tagged.
- * Things not useful for probe creation (like args, translated types, etc) are
- * not returned.
+ * Result of DOF probe parsing. We receive a provider info structure, followed
+ * by N probe info structures each of which is followed by possibly many
+ * tracepoint info structures, all tagged. Things not useful for probe creation
+ * (like args, translated types, etc) are not returned.
*
* On error, a DIT_ERR structure is returned with an error message.
*/
@@ -31,6 +30,15 @@ typedef enum dof_parsed_info {
DIT_ERR = 3
} dof_parsed_info_t;
+/*
+ * Bump this whenever dof_parsed changes.
+ *
+ * Files consisting of arrays of dof_parsed have a single 64-bit word at the
+ * start which is the version of the dof_parseds within it. The data flowing
+ * over the stream from the seccomped parser has no such prefix.
+ */
+#define DOF_PARSED_VERSION 1
+
typedef struct dof_parsed {
/*
* Size of this instance of this structure.
@@ -103,7 +111,7 @@ typedef struct dof_parsed {
*
* Returns 0 on success or a positive errno value on error.
*/
-int dof_parser_host_write(int out, dof_helper_t *dh, dof_hdr_t *dof);
+int dof_parser_host_write(int out, const dof_helper_t *dh, dof_hdr_t *dof);
/*
* Read a single DOF structure from a parser pipe. Wait at most TIMEOUT seconds
--
2.41.0.270.g68fa1d84b5
More information about the DTrace-devel
mailing list