[DTrace-devel] Implement perf event buffer management for DTrace

Kris Van Hees kris.van.hees at oracle.com
Tue Mar 17 14:44:12 PDT 2020


With the switch to using BPF as the execution engine, DTrace must now
also use the perf event buffer mechanism to process trace data.  This
patch provides an implementation of the perf event buffer consumer.

The bulk of the buffer processing infrastructure is retained because
only the underlying mechanism is changing.  The biggest change is found
in the use of epoll_wait() to wait (bounded by a timeout) for data to
become available in any of the CPU buffers, instead of simply retrieving
buffer data at a preset rate.
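
For illustration, a minimal sketch of that polling model (the helper
drain_ring_buffer() is hypothetical; the real implementation lives in
dtrace_consume() and dt_consume_cpu() below):

#include <sys/epoll.h>

extern void drain_ring_buffer(void *peb);	/* hypothetical helper */

/*
 * All per-CPU perf event buffers are assumed to have been registered with
 * 'epfd' via EPOLL_CTL_ADD, each with ev.data.ptr pointing at its per-CPU
 * buffer descriptor.
 */
static int
consume_once(int epfd, int ncpus, int timeout_ms)
{
	struct epoll_event	events[ncpus];
	int			i, cnt;

	/* Wait (up to timeout_ms) for any CPU buffer to have data. */
	cnt = epoll_wait(epfd, events, ncpus, timeout_ms);
	if (cnt < 0)
		return -1;

	/* Drain only the buffers that were reported readable. */
	for (i = 0; i < cnt; i++)
		drain_ring_buffer(events[i].data.ptr);

	return 0;
}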

TODO: Integrate the process death tracking with the new epoll-based
      buffer polling implementation.  Rather than using a condition
      variable to signal a process notification, we can use an eventfd
      to signal that there are process notifications to process (see
      the sketch below).
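
A rough sketch of that eventfd approach, assuming the consumer keeps
polling dt_poll_fd (names such as proc_efd and proc_notify_init() are
illustrative only, not part of this patch):

#include <stdint.h>
#include <unistd.h>
#include <sys/epoll.h>
#include <sys/eventfd.h>

static int	proc_efd = -1;		/* illustrative name */

/* Create the eventfd and register it with the existing epoll set. */
static int
proc_notify_init(int epfd)
{
	struct epoll_event	ev;

	proc_efd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
	if (proc_efd < 0)
		return -1;

	ev.events = EPOLLIN;
	ev.data.ptr = NULL;		/* NULL = "not a perf event buffer" */

	return epoll_ctl(epfd, EPOLL_CTL_ADD, proc_efd, &ev);
}

/* Called from the proc handler: record one pending notification. */
static void
proc_notify_post(void)
{
	uint64_t	one = 1;

	if (write(proc_efd, &one, sizeof(one)) < 0) {
		/* Best effort; the next poll cycle will pick it up. */
	}
}

The epoll_wait() loop would then treat a NULL data.ptr as "process
notifications pending" rather than as a dt_peb_t pointer.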

(Eugene Loh implemented a prototype of this patch based on earlier code
 that I had written.  This patch is based on my earlier work and builds
 on Eugene's work.)

Signed-off-by: Kris Van Hees <kris.van.hees at oracle.com>
---
 GNUmakefile                  |   2 +-
 cmd/dtrace.c                 |   3 -
 include/arm64/asm/barrier.h  |  98 +++++++++++++++
 include/i386/asm/barrier.h   |  51 ++++++++
 include/linux/compiler-gcc.h |  42 +++++++
 include/linux/compiler.h     | 175 ++++++++++++++++++++++++++
 include/linux/ring_buffer.h  |  74 +++++++++++
 libdtrace/Build              |   5 +-
 libdtrace/dt_bpf.c           |  13 +-
 libdtrace/dt_bpf.h           |   5 +
 libdtrace/dt_cg.c            |  25 +++-
 libdtrace/dt_consume.c       | 185 +++++++++++++++++++++++++++-
 libdtrace/dt_impl.h          |   4 +-
 libdtrace/dt_open.c          |  16 ++-
 libdtrace/dt_peb.c           | 232 +++++++++++++++++++++++++++++++++++
 libdtrace/dt_peb.h           |  51 ++++++++
 libdtrace/dt_work.c          |  44 ++++++-
 17 files changed, 1002 insertions(+), 23 deletions(-)
 create mode 100644 include/arm64/asm/barrier.h
 create mode 100644 include/i386/asm/barrier.h
 create mode 100644 include/linux/compiler-gcc.h
 create mode 100644 include/linux/compiler.h
 create mode 100644 include/linux/ring_buffer.h
 create mode 100644 libdtrace/dt_peb.c
 create mode 100644 libdtrace/dt_peb.h

diff --git a/GNUmakefile b/GNUmakefile
index 287ed762..22755446 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -37,7 +37,7 @@ PREPROCESS = $(CC) -E
 export BPFC = bpf-unknown-none-gcc
 
 BPFCPPFLAGS += -D$(subst sparc64,__sparc,$(subst aarch64,__aarch64__,$(subst x86_64,__amd64,$(ARCH))))
-BPFCFLAGS ?= -O2 -Wall -pedantic -Wno-unknown-pragmas
+BPFCFLAGS ?= -O2 -Wall -Wno-unknown-pragmas
 export BPFLD = bpf-unknown-none-ld
 
 # The first non-system uid on this system.
diff --git a/cmd/dtrace.c b/cmd/dtrace.c
index d3e64c5a..20448ae4 100644
--- a/cmd/dtrace.c
+++ b/cmd/dtrace.c
@@ -1506,9 +1506,6 @@ main(int argc, char *argv[])
 	g_pslive = g_psc; /* count for prochandler() */
 
 	do {
-		if (!g_intr && !done)
-			dtrace_sleep(g_dtp);
-
 		if ((g_newline) && (!g_testing)) {
 			/*
 			 * Output a newline just to make the output look
diff --git a/include/arm64/asm/barrier.h b/include/arm64/asm/barrier.h
new file mode 100644
index 00000000..ded2c9d5
--- /dev/null
+++ b/include/arm64/asm/barrier.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _TOOLS_LINUX_ASM_AARCH64_BARRIER_H
+#define _TOOLS_LINUX_ASM_AARCH64_BARRIER_H
+
+/*
+ * From tools/perf/perf-sys.h, last modified in:
+ * f428ebd184c82a7914b2aa7e9f868918aaf7ea78 perf tools: Fix AAAAARGH64 memory barriers
+ *
+ * XXX: arch/arm64/include/asm/barrier.h in the kernel sources use dsb, is this
+ * a case like for arm32 where we do things differently in userspace?
+ */
+
+#define mb()		asm volatile("dmb ish" ::: "memory")
+#define wmb()		asm volatile("dmb ishst" ::: "memory")
+#define rmb()		asm volatile("dmb ishld" ::: "memory")
+
+/*
+ * Kernel uses dmb variants on arm64 for smp_*() barriers. Pretty much the same
+ * implementation as above mb()/wmb()/rmb(), though for the latter kernel uses
+ * dsb. In any case, should above mb()/wmb()/rmb() change, make sure the below
+ * smp_*() don't.
+ */
+#define smp_mb()	asm volatile("dmb ish" ::: "memory")
+#define smp_wmb()	asm volatile("dmb ishst" ::: "memory")
+#define smp_rmb()	asm volatile("dmb ishld" ::: "memory")
+
+#define smp_store_release(p, v)						\
+do {									\
+	union { typeof(*p) __val; char __c[1]; } __u =			\
+		{ .__val = (v) };					\
+									\
+	switch (sizeof(*p)) {						\
+	case 1:								\
+		asm volatile ("stlrb %w1, %0"				\
+				: "=Q" (*p)				\
+				: "r" (*(__u8_alias_t *)__u.__c)	\
+				: "memory");				\
+		break;							\
+	case 2:								\
+		asm volatile ("stlrh %w1, %0"				\
+				: "=Q" (*p)				\
+				: "r" (*(__u16_alias_t *)__u.__c)	\
+				: "memory");				\
+		break;							\
+	case 4:								\
+		asm volatile ("stlr %w1, %0"				\
+				: "=Q" (*p)				\
+				: "r" (*(__u32_alias_t *)__u.__c)	\
+				: "memory");				\
+		break;							\
+	case 8:								\
+		asm volatile ("stlr %1, %0"				\
+				: "=Q" (*p)				\
+				: "r" (*(__u64_alias_t *)__u.__c)	\
+				: "memory");				\
+		break;							\
+	default:							\
+		/* Only to shut up gcc ... */				\
+		mb();							\
+		break;							\
+	}								\
+} while (0)
+
+#define smp_load_acquire(p)						\
+({									\
+	union { typeof(*p) __val; char __c[1]; } __u =			\
+		{ .__c = { 0 } };					\
+									\
+	switch (sizeof(*p)) {						\
+	case 1:								\
+		asm volatile ("ldarb %w0, %1"				\
+			: "=r" (*(__u8_alias_t *)__u.__c)		\
+			: "Q" (*p) : "memory");				\
+		break;							\
+	case 2:								\
+		asm volatile ("ldarh %w0, %1"				\
+			: "=r" (*(__u16_alias_t *)__u.__c)		\
+			: "Q" (*p) : "memory");				\
+		break;							\
+	case 4:								\
+		asm volatile ("ldar %w0, %1"				\
+			: "=r" (*(__u32_alias_t *)__u.__c)		\
+			: "Q" (*p) : "memory");				\
+		break;							\
+	case 8:								\
+		asm volatile ("ldar %0, %1"				\
+			: "=r" (*(__u64_alias_t *)__u.__c)		\
+			: "Q" (*p) : "memory");				\
+		break;							\
+	default:							\
+		/* Only to shut up gcc ... */				\
+		mb();							\
+		break;							\
+	}								\
+	__u.__val;							\
+})
+
+#endif /* _TOOLS_LINUX_ASM_AARCH64_BARRIER_H */
diff --git a/include/i386/asm/barrier.h b/include/i386/asm/barrier.h
new file mode 100644
index 00000000..855e0d4a
--- /dev/null
+++ b/include/i386/asm/barrier.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _TOOLS_LINUX_ASM_X86_BARRIER_H
+#define _TOOLS_LINUX_ASM_X86_BARRIER_H
+
+/*
+ * We need this because it is not included anywhere else.
+ */
+#include <linux/compiler.h>
+
+/*
+ * Copied from the Linux kernel sources, and also moving code
+ * out from tools/perf/perf-sys.h so as to make it be located
+ * in a place similar as in the kernel sources.
+ *
+ * Force strict CPU ordering.
+ * And yes, this is required on UP too when we're talking
+ * to devices.
+ */
+
+#if defined(__i386__)
+/*
+ * Some non-Intel clones support out of order store. wmb() ceases to be a
+ * nop for these.
+ */
+#define mb()	asm volatile("lock; addl $0,0(%%esp)" ::: "memory")
+#define rmb()	asm volatile("lock; addl $0,0(%%esp)" ::: "memory")
+#define wmb()	asm volatile("lock; addl $0,0(%%esp)" ::: "memory")
+#elif defined(__x86_64__)
+#define mb()	asm volatile("mfence" ::: "memory")
+#define rmb()	asm volatile("lfence" ::: "memory")
+#define wmb()	asm volatile("sfence" ::: "memory")
+#define smp_rmb() barrier()
+#define smp_wmb() barrier()
+#define smp_mb()  asm volatile("lock; addl $0,-132(%%rsp)" ::: "memory", "cc")
+#endif
+
+#if defined(__x86_64__)
+#define smp_store_release(p, v)			\
+do {						\
+	barrier();				\
+	WRITE_ONCE(*p, v);			\
+} while (0)
+
+#define smp_load_acquire(p)			\
+({						\
+	typeof(*p) ___p1 = READ_ONCE(*p);	\
+	barrier();				\
+	___p1;					\
+})
+#endif /* defined(__x86_64__) */
+#endif /* _TOOLS_LINUX_ASM_X86_BARRIER_H */
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
new file mode 100644
index 00000000..0d35f180
--- /dev/null
+++ b/include/linux/compiler-gcc.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _TOOLS_LINUX_COMPILER_H_
+#error "Please don't include <linux/compiler-gcc.h> directly, include <linux/compiler.h> instead."
+#endif
+
+/*
+ * Common definitions for all gcc versions go here.
+ */
+#define GCC_VERSION (__GNUC__ * 10000		\
+		     + __GNUC_MINOR__ * 100	\
+		     + __GNUC_PATCHLEVEL__)
+
+#if GCC_VERSION >= 70000 && !defined(__CHECKER__)
+# define __fallthrough __attribute__ ((fallthrough))
+#endif
+
+#if GCC_VERSION >= 40300
+# define __compiletime_error(message) __attribute__((error(message)))
+#endif /* GCC_VERSION >= 40300 */
+
+/* &a[0] degrades to a pointer: a different type from an array */
+#define __must_be_array(a)	BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0]))
+
+#ifndef __pure
+#define  __pure		__attribute__((pure))
+#endif
+#define  noinline	__attribute__((noinline))
+#ifndef __packed
+#define __packed	__attribute__((packed))
+#endif
+#ifndef __noreturn
+#define __noreturn	__attribute__((noreturn))
+#endif
+#ifndef __aligned
+#define __aligned(x)	__attribute__((aligned(x)))
+#endif
+#define __printf(a, b)	__attribute__((format(printf, a, b)))
+#define __scanf(a, b)	__attribute__((format(scanf, a, b)))
+
+#if GCC_VERSION >= 50100
+#define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1
+#endif
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
new file mode 100644
index 00000000..1db17094
--- /dev/null
+++ b/include/linux/compiler.h
@@ -0,0 +1,175 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _TOOLS_LINUX_COMPILER_H_
+#define _TOOLS_LINUX_COMPILER_H_
+
+#ifdef __GNUC__
+#include <linux/compiler-gcc.h>
+#endif
+
+#ifndef __compiletime_error
+# define __compiletime_error(message)
+#endif
+
+/* Optimization barrier */
+/* The "volatile" is due to gcc bugs */
+#define barrier() __asm__ __volatile__("": : :"memory")
+
+#ifndef __always_inline
+# define __always_inline	inline __attribute__((always_inline))
+#endif
+
+#ifndef noinline
+#define noinline
+#endif
+
+/* Are two types/vars the same type (ignoring qualifiers)? */
+#ifndef __same_type
+# define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))
+#endif
+
+#ifdef __ANDROID__
+/*
+ * FIXME: Big hammer to get rid of tons of:
+ *   "warning: always_inline function might not be inlinable"
+ *
+ * At least on android-ndk-r12/platforms/android-24/arch-arm
+ */
+#undef __always_inline
+#define __always_inline	inline
+#endif
+
+#define __user
+#define __rcu
+#define __read_mostly
+
+#ifndef __attribute_const__
+# define __attribute_const__
+#endif
+
+#ifndef __maybe_unused
+# define __maybe_unused		__attribute__((unused))
+#endif
+
+#ifndef __used
+# define __used		__attribute__((__unused__))
+#endif
+
+#ifndef __packed
+# define __packed		__attribute__((__packed__))
+#endif
+
+#ifndef __force
+# define __force
+#endif
+
+#ifndef __weak
+# define __weak			__attribute__((weak))
+#endif
+
+#ifndef likely
+# define likely(x)		__builtin_expect(!!(x), 1)
+#endif
+
+#ifndef unlikely
+# define unlikely(x)		__builtin_expect(!!(x), 0)
+#endif
+
+#ifndef __init
+# define __init
+#endif
+
+#ifndef noinline
+# define noinline
+#endif
+
+#define uninitialized_var(x) x = *(&(x))
+
+#include <linux/types.h>
+
+/*
+ * Following functions are taken from kernel sources and
+ * break aliasing rules in their original form.
+ *
+ * While kernel is compiled with -fno-strict-aliasing,
+ * perf uses -Wstrict-aliasing=3 which makes build fail
+ * under gcc 4.4.
+ *
+ * Using extra __may_alias__ type to allow aliasing
+ * in this case.
+ */
+typedef __u8  __attribute__((__may_alias__))  __u8_alias_t;
+typedef __u16 __attribute__((__may_alias__)) __u16_alias_t;
+typedef __u32 __attribute__((__may_alias__)) __u32_alias_t;
+typedef __u64 __attribute__((__may_alias__)) __u64_alias_t;
+
+static __always_inline void __read_once_size(const volatile void *p, void *res, int size)
+{
+	switch (size) {
+	case 1: *(__u8_alias_t  *) res = *(volatile __u8_alias_t  *) p; break;
+	case 2: *(__u16_alias_t *) res = *(volatile __u16_alias_t *) p; break;
+	case 4: *(__u32_alias_t *) res = *(volatile __u32_alias_t *) p; break;
+	case 8: *(__u64_alias_t *) res = *(volatile __u64_alias_t *) p; break;
+	default:
+		barrier();
+		__builtin_memcpy((void *)res, (const void *)p, size);
+		barrier();
+	}
+}
+
+static __always_inline void __write_once_size(volatile void *p, void *res, int size)
+{
+	switch (size) {
+	case 1: *(volatile  __u8_alias_t *) p = *(__u8_alias_t  *) res; break;
+	case 2: *(volatile __u16_alias_t *) p = *(__u16_alias_t *) res; break;
+	case 4: *(volatile __u32_alias_t *) p = *(__u32_alias_t *) res; break;
+	case 8: *(volatile __u64_alias_t *) p = *(__u64_alias_t *) res; break;
+	default:
+		barrier();
+		__builtin_memcpy((void *)p, (const void *)res, size);
+		barrier();
+	}
+}
+
+/*
+ * Prevent the compiler from merging or refetching reads or writes. The
+ * compiler is also forbidden from reordering successive instances of
+ * READ_ONCE and WRITE_ONCE, but only when the compiler is aware of some
+ * particular ordering. One way to make the compiler aware of ordering is to
+ * put the two invocations of READ_ONCE or WRITE_ONCE in different C
+ * statements.
+ *
+ * These two macros will also work on aggregate data types like structs or
+ * unions. If the size of the accessed data type exceeds the word size of
+ * the machine (e.g., 32 bits or 64 bits) READ_ONCE() and WRITE_ONCE() will
+ * fall back to memcpy and print a compile-time warning.
+ *
+ * Their two major use cases are: (1) Mediating communication between
+ * process-level code and irq/NMI handlers, all running on the same CPU,
+ * and (2) Ensuring that the compiler does not fold, spindle, or otherwise
+ * mutilate accesses that either do not require ordering or that interact
+ * with an explicit memory barrier or atomic instruction that provides the
+ * required ordering.
+ */
+
+#define READ_ONCE(x)					\
+({							\
+	union { typeof(x) __val; char __c[1]; } __u =	\
+		{ .__c = { 0 } };			\
+	__read_once_size(&(x), __u.__c, sizeof(x));	\
+	__u.__val;					\
+})
+
+#define WRITE_ONCE(x, val)				\
+({							\
+	union { typeof(x) __val; char __c[1]; } __u =	\
+		{ .__val = (val) };			\
+	__write_once_size(&(x), __u.__c, sizeof(x));	\
+	__u.__val;					\
+})
+
+
+#ifndef __fallthrough
+# define __fallthrough
+#endif
+
+#endif /* _TOOLS_LINUX_COMPILER_H */
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
new file mode 100644
index 00000000..22b7e3a9
--- /dev/null
+++ b/include/linux/ring_buffer.h
@@ -0,0 +1,74 @@
+#ifndef _TOOLS_LINUX_RING_BUFFER_H_
+#define _TOOLS_LINUX_RING_BUFFER_H_
+
+#include <asm/barrier.h>
+#include <linux/perf_event.h>
+
+/*
+ * Contract with kernel for walking the perf ring buffer from
+ * user space requires the following barrier pairing (quote
+ * from kernel/events/ring_buffer.c):
+ *
+ *   Since the mmap() consumer (userspace) can run on a
+ *   different CPU:
+ *
+ *   kernel                             user
+ *
+ *   if (LOAD ->data_tail) {            LOAD ->data_head
+ *                      (A)             smp_rmb()       (C)
+ *      STORE $data                     LOAD $data
+ *      smp_wmb()       (B)             smp_mb()        (D)
+ *      STORE ->data_head               STORE ->data_tail
+ *   }
+ *
+ *   Where A pairs with D, and B pairs with C.
+ *
+ *   In our case A is a control dependency that separates the
+ *   load of the ->data_tail and the stores of $data. In case
+ *   ->data_tail indicates there is no room in the buffer to
+ *   store $data we do not.
+ *
+ *   D needs to be a full barrier since it separates the data
+ *   READ from the tail WRITE.
+ *
+ *   For B a WMB is sufficient since it separates two WRITEs,
+ *   and for C an RMB is sufficient since it separates two READs.
+ *
+ * Note, instead of B, C, D we could also use smp_store_release()
+ * in B and D as well as smp_load_acquire() in C.
+ *
+ * However, this optimization does not make sense for all kernel
+ * supported architectures since for a fair number it would
+ * resolve into READ_ONCE() + smp_mb() pair for smp_load_acquire(),
+ * and smp_mb() + WRITE_ONCE() pair for smp_store_release().
+ *
+ * Thus for those smp_wmb() in B and smp_rmb() in C would still
+ * be less expensive. For the case of D this has either the same
+ * cost or is less expensive, for example, due to TSO x86 can
+ * avoid the CPU barrier entirely.
+ */
+
+static inline uint64_t ring_buffer_read_head(struct perf_event_mmap_page *base)
+{
+/*
+ * Architectures where smp_load_acquire() does not fallback to
+ * READ_ONCE() + smp_mb() pair.
+ */
+#if defined(__x86_64__) || defined(__aarch64__) || defined(__powerpc64__) || \
+    defined(__ia64__) || defined(__sparc__) && defined(__arch64__)
+	return smp_load_acquire(&base->data_head);
+#else
+	u64 head = READ_ONCE(base->data_head);
+
+	smp_rmb();
+	return head;
+#endif
+}
+
+static inline void ring_buffer_write_tail(struct perf_event_mmap_page *base,
+					  uint64_t tail)
+{
+	smp_store_release(&base->data_tail, tail);
+}
+
+#endif /* _TOOLS_LINUX_RING_BUFFER_H_ */
diff --git a/libdtrace/Build b/libdtrace/Build
index 61151690..166c5b75 100644
--- a/libdtrace/Build
+++ b/libdtrace/Build
@@ -20,7 +20,7 @@ libdtrace-build_SOURCES = dt_lex.c dt_aggregate.c dt_as.c dt_bpf.c \
 			  dt_pid.c dt_pragma.c dt_printf.c dt_probe.c \
 			  dt_proc.c dt_program.c dt_provider.c dt_regset.c \
 			  dt_string.c dt_strtab.c dt_subr.c dt_symtab.c \
-			  dt_work.c dt_xlator.c dt_prov_dtrace.c \
+			  dt_work.c dt_xlator.c dt_peb.c dt_prov_dtrace.c \
 			  dt_prov_fbt.c dt_prov_sdt.c dt_prov_syscall.c
 
 libdtrace-build_SRCDEPS := dt_grammar.h $(objdir)/dt_git_version.h
@@ -41,8 +41,9 @@ libdtrace_LIBSOURCES := libdtrace-build libproc libport
 libdtrace_SECONDARY := libproc libport libbpf
 
 # Disable C99 variadic macro warnings for this file
-dt_proc.c_CFLAGS := -Wno-pedantic
+dt_consume.c_CFLAGS := -Wno-pedantic
 dt_dis.c_CFLAGS := -Wno-pedantic
+dt_proc.c_CFLAGS := -Wno-pedantic
 
 # Extra include path for this file
 dt_bpf.c_CFLAGS := -Ilibbpf
diff --git a/libdtrace/dt_bpf.c b/libdtrace/dt_bpf.c
index 3bd1d016..cf9e20b7 100644
--- a/libdtrace/dt_bpf.c
+++ b/libdtrace/dt_bpf.c
@@ -23,13 +23,15 @@ static bool dt_gmap_done = 0;
 
 #define BPF_CG_LICENSE	"GPL";
 
-static inline int perf_event_open(struct perf_event_attr *attr, pid_t pid,
-				  int cpu, int group_fd, unsigned long flags)
+int
+perf_event_open(struct perf_event_attr *attr, pid_t pid,
+		    int cpu, int group_fd, unsigned long flags)
 {
 	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
 }
 
-static inline int bpf(enum bpf_cmd cmd, union bpf_attr *attr)
+int
+bpf(enum bpf_cmd cmd, union bpf_attr *attr)
 {
 	return syscall(__NR_bpf, cmd, attr, sizeof(union bpf_attr));
 }
@@ -79,6 +81,9 @@ create_gmap(dtrace_hdl_t *dtp, const char *name, enum bpf_map_type type,
  *		can use it without interference from other CPUs.  The size of
  *		the value (a byte array) is the maximum trace buffer record
  *		size that any of the compiled programs can emit.
+ *		We add 4 bytes to the size to account for the 4-byte padding we
+ *		need to add at the beginning of the data to ensure proper
+ *		trace data alignment.
  * - strtab:	String table map.  This is a global map with a singleton
  *		element (key 0) that contains the entire string table as a
  *		concatenation of all unique strings (each terminated with a
@@ -123,7 +128,7 @@ dt_bpf_gmap_create(dtrace_hdl_t *dtp, uint_t probec)
 			   sizeof(uint32_t), sizeof(uint32_t),
 			   dtp->dt_conf.numcpus) &&
 	       create_gmap(dtp, "mem", BPF_MAP_TYPE_PERCPU_ARRAY,
-			   sizeof(uint32_t), dtp->dt_maxreclen, 1) &&
+			   sizeof(uint32_t), dtp->dt_maxreclen + 4, 1) &&
 	       create_gmap(dtp, "strtab", BPF_MAP_TYPE_ARRAY,
 			   sizeof(uint32_t), dtp->dt_strlen, 1) &&
 	       create_gmap(dtp, "gvars", BPF_MAP_TYPE_ARRAY,
diff --git a/libdtrace/dt_bpf.h b/libdtrace/dt_bpf.h
index edb159e8..41772229 100644
--- a/libdtrace/dt_bpf.h
+++ b/libdtrace/dt_bpf.h
@@ -8,6 +8,7 @@
 #ifndef	_DT_BPF_H
 #define	_DT_BPF_H
 
+#include <linux/perf_event.h>
 #include <dt_impl.h>
 
 #ifdef	__cplusplus
@@ -17,6 +18,10 @@ extern "C" {
 #define DT_CONST_EPID	1
 #define DT_CONST_ARGC	2
 
+extern int perf_event_open(struct perf_event_attr *attr, pid_t pid, int cpu,
+			   int group_fd, unsigned long flags);
+extern int bpf(enum bpf_cmd cmd, union bpf_attr *attr);
+
 extern int dt_bpf_gmap_create(dtrace_hdl_t *, uint_t);
 extern int dt_bpf_map_update(int fd, const void *key, const void *val);
 extern int dt_bpf_prog(dtrace_hdl_t *, dtrace_prog_t *);
diff --git a/libdtrace/dt_cg.c b/libdtrace/dt_cg.c
index 56005bd8..e0e58abc 100644
--- a/libdtrace/dt_cg.c
+++ b/libdtrace/dt_cg.c
@@ -32,6 +32,18 @@ static void dt_cg_node(dt_node_t *, dt_irlist_t *, dt_regset_t *);
  *	5. Retrieve the output data buffer and store the base pointer in %r9
  *	6. Store the epid and 4 padding byes at the beginning of the output
  *	   buffer
+ *
+ * Because raw perf samples are stored as a header (a multiple of 8 bytes)
+ * followed by a 4-byte size, we need to ensure that our trace data is
+ * preceded by 4 bytes of padding.  That places the EPID (4 bytes) at a
+ * 64-bit boundary, and given that it is followed by a 4-byte tag, all further
+ * trace data will also start at a 64-bit boundary.
+ *
+ * To accomplish this, we store a 4-byte value at the beginning of our
+ * output data buffer, and then store the address of the next location in %r9.
+ *
+ * In the epilogue, we then need to submit (%r9 - 4) as the source address of
+ * our data buffer.
  */
 static void
 dt_cg_prologue(dt_pcb_t *pcb)
@@ -67,13 +79,15 @@ dt_cg_prologue(dt_pcb_t *pcb)
 	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
 
 	/*
-	 *		stdw [%fp+-32], 0
+	 *		stdw [%fp+DT_STK_SPILL(1)], 0
 	 *		lddw %r1, &mem
 	 *		mov %r2, %fp
-	 *		add %r2, -32
+	 *		add %r2, DT_STK_SPILL(1)
 	 *		call bpf_map_lookup_elem
 	 *		je %r0, 0, lbl_exit
 	 *		mov %r9, %r0
+	 *		stw [%r9+0], 0
+	 *		add %r9, 4
 	 */
 	instr = BPF_STORE_IMM(BPF_W, BPF_REG_FP, DT_STK_SPILL(1), 0);
 	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
@@ -88,6 +102,10 @@ dt_cg_prologue(dt_pcb_t *pcb)
 	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
 	instr = BPF_MOV_REG(BPF_REG_9, BPF_REG_0);
 	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+	instr = BPF_STORE_IMM(BPF_W, BPF_REG_9, 0, 0);
+	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+	instr = BPF_ALU64_IMM(BPF_ADD, BPF_REG_9, 4);
+	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
 
 	/*
 	 * We read epid from dctx (struct dt_bpf_context) and store it in the
@@ -143,6 +161,7 @@ dt_cg_epilogue(dt_pcb_t *pcb)
 	 *		lddw %r2, &buffers
 	 *		ldw %r3, [%fp + DT_STK_CPU]
 	 *		mov %r4, %r9
+	 *		add %r4, -4
 	 *		mov %r5, pcb->pcb_bufoff
 	 *		call bpf_perf_event_output
 	 *		mov %r0, 0
@@ -154,6 +173,8 @@ dt_cg_epilogue(dt_pcb_t *pcb)
 	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
 	instr = BPF_MOV_REG(BPF_REG_4, BPF_REG_9);
 	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
+	instr = BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -4);
+	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
 	instr = BPF_MOV_IMM(BPF_REG_5, pcb->pcb_bufoff);
 	dt_irlist_append(dlp, dt_cg_node_alloc(DT_LBL_NONE, instr));
 	instr = BPF_CALL_HELPER(BPF_FUNC_perf_event_output);
diff --git a/libdtrace/dt_consume.c b/libdtrace/dt_consume.c
index c4588132..2b7c7bbe 100644
--- a/libdtrace/dt_consume.c
+++ b/libdtrace/dt_consume.c
@@ -15,8 +15,12 @@
 #include <alloca.h>
 #include <dt_impl.h>
 #include <dt_pcap.h>
+#include <dt_peb.h>
 #include <libproc.h>
 #include <port.h>
+#include <sys/epoll.h>
+#include <linux/perf_event.h>
+#include <linux/ring_buffer.h>
 
 #define	DT_MASK_LO 0x00000000FFFFFFFFULL
 
@@ -1842,7 +1846,8 @@ dt_setopt(dtrace_hdl_t *dtp, const dtrace_probedata_t *data,
 	return (rval);
 }
 
-static int
+#if 0
+int
 dt_consume_cpu(dtrace_hdl_t *dtp, FILE *fp, int cpu, dtrace_bufdesc_t *buf,
     dtrace_consume_probe_f *efunc, dtrace_consume_rec_f *rfunc, void *arg)
 {
@@ -2268,6 +2273,152 @@ nextepid:
 
 	return (dt_handle_cpudrop(dtp, cpu, DTRACEDROP_PRINCIPAL, drops));
 }
+#else
+int
+dt_consume_one(dtrace_hdl_t *dtp, FILE *fp, int cpu, char *buf,
+	       dtrace_consume_probe_f *efunc, dtrace_consume_rec_f *rfunc,
+	       void *arg)
+{
+	char				*data = buf;
+	struct perf_event_header	*hdr;
+	int				rval;
+
+	hdr = (struct perf_event_header *)data;
+	data += sizeof(struct perf_event_header);
+
+	if (hdr->type == PERF_RECORD_SAMPLE) {
+		char			*ptr = data;
+		uint32_t		size, epid, tag;
+		dtrace_probedata_t	pdat;
+
+		/*
+		 * struct {
+		 *	struct perf_event_header	header;
+		 *	uint32_t			size;
+		 *	uint32_t			pad;
+		 *	uint32_t			epid;
+		 *	uint32_t			tag;
+		 *	uint64_t			data[n];
+		 * }
+		 * and data points to the 'size' member at this point.
+		 * (Note that 'n' may be 0.)
+		 */
+		if (ptr > buf + hdr->size)
+			return -1;
+
+		size = *(uint32_t *)data;
+		data += sizeof(size);
+		ptr += sizeof(size) + size;
+		if (ptr != buf + hdr->size)
+			return -1;
+
+		data += sizeof(uint32_t);		/* skip padding */
+		size -= sizeof(uint32_t);
+
+		epid = *(uint32_t *)data;
+		data += sizeof(uint32_t);
+		size -= sizeof(uint32_t);
+
+		tag = *(uint32_t *)data;
+		data += sizeof(uint32_t);
+		size -= sizeof(uint32_t);
+
+		memset(&pdat, 0, sizeof(pdat));
+		pdat.dtpda_handle = dtp;
+		pdat.dtpda_cpu = cpu;
+		rval = dt_epid_lookup(dtp, epid, &pdat.dtpda_edesc,
+						 &pdat.dtpda_pdesc);
+		if (rval != 0)
+			return (rval);
+
+		rval = (*efunc)(&pdat, arg);
+
+		/*
+		 * Call the record callback with a NULL record to indicate
+		 * that we're done processing this EPID.
+		 */
+		return (*rfunc)(&pdat, NULL, arg);
+	} else if (hdr->type == PERF_RECORD_LOST) {
+		uint64_t	lost;
+
+		/*
+		 * struct {
+		 *	struct perf_event_header	header;
+		 *	uint64_t			id;
+		 *	uint64_t			lost;
+		 * }
+		 * and data points to the 'id' member at this point.
+		 */
+		lost = *(uint64_t *)(data + sizeof(uint64_t));
+		return 0;
+	} else
+		return -1;
+}
+
+int
+dt_consume_cpu(dtrace_hdl_t *dtp, FILE *fp, int cpu, dt_peb_t *peb,
+	       dtrace_consume_probe_f *efunc, dtrace_consume_rec_f *rfunc,
+	       void *arg)
+{
+	struct perf_event_mmap_page	*rb_page = (void *)peb->base;
+	struct perf_event_header	*hdr;
+	char				*base;
+	char				*event;
+	uint32_t			len;
+	uint64_t			head, tail;
+	dt_pebset_t			*pebset = dtp->dt_pebset;
+	uint64_t			data_size = pebset->data_size;
+
+	/*
+	 * Set base to be the start of the buffer data, i.e. we skip the first
+	 * page (it contains buffer management data).
+	 */
+	base = peb->base + pebset->page_size;
+
+	for (;;) {
+		head = ring_buffer_read_head(rb_page);
+		tail = rb_page->data_tail;
+
+		if (head == tail)
+			break;
+
+		do {
+			event = base + tail % data_size;
+			hdr = (struct perf_event_header *)event;
+			len = hdr->size;
+
+			/*
+			 * If the perf event data wraps around the boundary of
+			 * the buffer, we make a copy in contiguous memory.
+			 */
+			if (event + len > peb->endp) {
+				char		*dst;
+				uint32_t	num;
+
+				/* Increase the buffer as needed. */
+				if (pebset->tmp_len < len) {
+					pebset->tmp = realloc(pebset->tmp, len);
+					pebset->tmp_len = len;
+				}
+
+				dst = pebset->tmp;
+				num = peb->endp - event + 1;
+				memcpy(dst, event, num);
+				memcpy(dst + num, base, len - num);
+
+				event = dst;
+			}
+
+			dt_consume_one(dtp, fp, cpu, event, efunc, rfunc, arg);
+			tail += hdr->size;
+		} while (tail != head);
+
+		ring_buffer_write_tail(rb_page, tail);
+	}
+
+	return 0;
+}
+#endif
 
 typedef struct dt_begin {
 	dtrace_consume_probe_f *dtbgn_probefunc;
@@ -2477,6 +2628,7 @@ int
 dtrace_consume(dtrace_hdl_t *dtp, FILE *fp,
     dtrace_consume_probe_f *pf, dtrace_consume_rec_f *rf, void *arg)
 {
+#if 0
 	dtrace_bufdesc_t *buf = &dtp->dt_buf;
 	dtrace_optval_t size;
 	static int max_ncpus;
@@ -2572,4 +2724,35 @@ dtrace_consume(dtrace_hdl_t *dtp, FILE *fp,
 	}
 
 	return (dt_consume_cpu(dtp, fp, dtp->dt_endedon, buf, pf, rf, arg));
+#else
+	dtrace_optval_t		timeout = dtp->dt_options[DTRACEOPT_SWITCHRATE];
+	struct epoll_event	events[dtp->dt_conf.numcpus];
+	int			i, cnt;
+
+	/*
+	 * The epoll_wait() function expects the timeout to be expressed in
+	 * milliseconds whereas the switch rate is expressed in nanoseconds.
+	 * We therefore need to convert the value.
+	 */
+	timeout /= NANOSEC / MILLISEC;
+	cnt = epoll_wait(dtp->dt_poll_fd, events, dtp->dt_conf.numcpus,
+			 timeout);
+	if (cnt < 0)
+		return dt_set_errno(dtp, errno);
+
+	/*
+	 * Loop over the buffers that have data available, and process them one
+	 * by one.
+	 */
+	for (i = 0; i < cnt; i++) {
+		dt_peb_t	*peb = events[i].data.ptr;
+		int		rval;
+
+		rval = dt_consume_cpu(dtp, fp, peb->cpu, peb, pf, rf, arg);
+		if (rval != 0)
+			return rval;
+	}
+
+	return 0;
+#endif
 }
diff --git a/libdtrace/dt_impl.h b/libdtrace/dt_impl.h
index 019e91a4..0630dd4c 100644
--- a/libdtrace/dt_impl.h
+++ b/libdtrace/dt_impl.h
@@ -65,6 +65,7 @@ struct dt_pfdict;		/* see <dt_printf.h> */
 struct dt_arg;			/* see below */
 struct dt_provider;		/* see <dt_provider.h> */
 struct dt_probe;		/* see <dt_probe.h> */
+struct dt_pebset;		/* see <dt_peb.h> */
 struct dt_xlator;		/* see <dt_xlator.h> */
 
 typedef struct dt_intrinsic {
@@ -296,7 +297,7 @@ struct dtrace_hdl {
 	int dt_maxformat;	/* max format ID */
 	void **dt_formats;	/* pointer to format array */
 	dt_aggregate_t dt_aggregate; /* aggregate */
-	dtrace_bufdesc_t dt_buf; /* staging buffer */
+	struct dt_pebset *dt_pebset; /* perf event buffers set */
 	struct dt_pfdict *dt_pfdict; /* dictionary of printf conversions */
 	dt_version_t dt_vmax;	/* optional ceiling on program API binding */
 	dtrace_attribute_t dt_amin; /* optional floor on program attributes */
@@ -335,6 +336,7 @@ struct dtrace_hdl {
 	int dt_cdefs_fd;	/* file descriptor for C CTF debugging cache */
 	int dt_ddefs_fd;	/* file descriptor for D CTF debugging cache */
 	int dt_stdout_fd;	/* file descriptor for saved stdout */
+	int dt_poll_fd;		/* file descriptor for event polling */
 	dtrace_handle_err_f *dt_errhdlr; /* error handler, if any */
 	void *dt_errarg;	/* error handler argument */
 	dtrace_prog_t *dt_errprog; /* error handler program, if any */
diff --git a/libdtrace/dt_open.c b/libdtrace/dt_open.c
index bf26d007..ce8e71a1 100644
--- a/libdtrace/dt_open.c
+++ b/libdtrace/dt_open.c
@@ -36,6 +36,7 @@
 #include <dt_string.h>
 #include <dt_provider.h>
 #include <dt_probe.h>
+#include <dt_peb.h>
 
 const dt_version_t _dtrace_versions[] = {
 	DT_VERS_1_0,	/* D API 1.0.0 (PSARC 2001/466) Solaris 10 FCS */
@@ -714,6 +715,7 @@ dt_vopen(int version, int flags, int *errp,
 	dtp->dt_cdefs_fd = -1;
 	dtp->dt_ddefs_fd = -1;
 	dtp->dt_stdout_fd = -1;
+	dtp->dt_poll_fd = -1;
 	dtp->dt_modbuckets = _dtrace_strbuckets;
 	dtp->dt_mods = calloc(dtp->dt_modbuckets, sizeof (dt_module_t *));
 	dtp->dt_kernpathbuckets = _dtrace_strbuckets;
@@ -1189,22 +1191,24 @@ dtrace_close(dtrace_hdl_t *dtp)
 	dt_pcap_destroy(dtp);
 
 	if (dtp->dt_fd != -1)
-		(void) close(dtp->dt_fd);
+		close(dtp->dt_fd);
 	if (dtp->dt_ftfd != -1)
-		(void) close(dtp->dt_ftfd);
+		close(dtp->dt_ftfd);
 	if (dtp->dt_cdefs_fd != -1)
-		(void) close(dtp->dt_cdefs_fd);
+		close(dtp->dt_cdefs_fd);
 	if (dtp->dt_ddefs_fd != -1)
-		(void) close(dtp->dt_ddefs_fd);
+		close(dtp->dt_ddefs_fd);
 	if (dtp->dt_stdout_fd != -1)
-		(void) close(dtp->dt_stdout_fd);
+		close(dtp->dt_stdout_fd);
+	if (dtp->dt_poll_fd != -1)
+		close(dtp->dt_poll_fd);
 
 	dt_epid_destroy(dtp);
 	dt_aggid_destroy(dtp);
 	dt_format_destroy(dtp);
 	dt_buffered_destroy(dtp);
 	dt_aggregate_destroy(dtp);
-	free(dtp->dt_buf.dtbd_data);
+	dt_pebs_exit(dtp);
 	dt_pfdict_destroy(dtp);
 	dt_dof_fini(dtp);
 
diff --git a/libdtrace/dt_peb.c b/libdtrace/dt_peb.c
new file mode 100644
index 00000000..1c5dea10
--- /dev/null
+++ b/libdtrace/dt_peb.c
@@ -0,0 +1,232 @@
+/*
+ * Oracle Linux DTrace.
+ * Copyright (c) 2019, 2020, Oracle and/or its affiliates. All rights reserved.
+ * Licensed under the Universal Permissive License v 1.0 as shown at
+ * http://oss.oracle.com/licenses/upl.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <sys/epoll.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <linux/perf_event.h>
+
+#include <dt_impl.h>
+#include <dt_bpf.h>
+#include <dt_peb.h>
+
+/*
+ * Find last set bit in a 64-bit value.
+ */
+static inline uint64_t fls64(uint64_t n)
+{
+	if (n == 0)
+		return 0;
+
+	return 64 - __builtin_clzl(n);
+}
+
+/*
+ * Round a given value up to the nearest power of 2.
+ */
+static inline uint64_t
+roundup_pow2(uint64_t n)
+{
+	return 1UL << fls64(n - 1);
+}
+
+/*
+ * Close a given perf event buffer.
+ */
+static void
+dt_peb_close(dt_peb_t *peb)
+{
+	dt_pebset_t	*pebs;
+
+	if (peb == NULL || peb->fd < 0)
+		return;
+
+	ioctl(peb->fd, PERF_EVENT_IOC_DISABLE, 0);
+
+	pebs = peb->dtp->dt_pebset;
+	munmap(peb->base, pebs->page_size + pebs->data_size);
+
+	close(peb->fd);
+
+	peb->base = NULL;
+	peb->fd = -1;
+}
+
+/*
+ * Set up a perf event buffer.
+ */
+static int
+dt_peb_open(dt_peb_t *peb)
+{
+	int			fd;
+	struct perf_event_attr	attr;
+	dt_pebset_t		*pebs = peb->dtp->dt_pebset;
+
+	/*
+	 * Event configuration for BPF-generated output in perf_event ring
+	 * buffers.
+	 */
+	memset(&attr, 0, sizeof(attr));
+	attr.config = PERF_COUNT_SW_BPF_OUTPUT;
+	attr.type = PERF_TYPE_SOFTWARE;
+	attr.sample_type = PERF_SAMPLE_RAW;
+	attr.sample_period = 1;
+	attr.wakeup_events = 1;
+	fd = perf_event_open(&attr, -1, peb->cpu, -1, PERF_FLAG_FD_CLOEXEC);
+	if (fd < 0)
+		goto fail;
+
+	/*
+	 * We add pebs->page_size to pebs->data_size because perf maintains a
+	 * meta-data page at the beginning of the memory region.  That page
+	 * is used for reader/writer synchronization.
+	 */
+	peb->fd = fd;
+	peb->base = mmap(NULL, pebs->page_size + pebs->data_size,
+			 PROT_READ | PROT_WRITE, MAP_SHARED, peb->fd, 0);
+	if (peb->base == MAP_FAILED)
+		goto fail;
+	peb->endp = peb->base + pebs->page_size + pebs->data_size - 1;
+
+	if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) < 0)
+		goto fail;
+
+	return fd;
+
+fail:
+	if (peb->base) {
+		munmap(peb->base, pebs->page_size + pebs->data_size);
+		peb->base = NULL;
+		peb->endp = NULL;
+	}
+	if (peb->fd) {
+		close(peb->fd);
+		peb->fd = -1;
+	}
+
+	return -1;
+}
+
+/*
+ * Perform cleanup of the perf event buffers.
+ */
+void
+dt_pebs_exit(dtrace_hdl_t *dtp)
+{
+	int	i;
+
+	if (dtp->dt_pebset == NULL)
+		return;
+
+	for (i = 0; i < dtp->dt_conf.numcpus; i++)
+		dt_peb_close(&dtp->dt_pebset->pebs[i]);
+
+	dt_free(dtp, dtp->dt_pebset->pebs);
+	dt_free(dtp, dtp->dt_pebset);
+}
+
+/*
+ * Initialize the perf event buffers (one per online CPU).  Each buffer
+ * comprises a power-of-2 number of pages large enough to hold at least
+ * 'bufsize' bytes.  The allocated memory for each buffer is mmap'd so the
+ * kernel can write to it, and its representative file descriptor is
+ * recorded in the 'buffers' BPF map so that BPF code knows where to write
+ * trace data for a specific CPU.
+ *
+ * Each buffer is also registered with the event polling file descriptor
+ * (dtp->dt_poll_fd) so that all perf event buffers can be monitored at
+ * once.  Returns 0 upon success; a negative error value indicates failure.
+ */
+int dt_pebs_init(dtrace_hdl_t *dtp, size_t bufsize)
+{
+	int		i;
+	int		mapfd;
+	size_t		num_pages;
+	dt_ident_t	*idp;
+	dt_peb_t	*pebs;
+
+	/*
+	 * The perf event buffer implementation in the kernel requires that the
+	 * buffer comprises n full memory pages, where n must be a power of 2.
+	 * Since the buffer size is specified in bytes in DTrace, we need to
+	 * convert this size (and possibly round it up) to an acceptable value.
+	 */
+	num_pages = roundup_pow2((bufsize + getpagesize() - 1) / getpagesize());
+	if (num_pages * getpagesize() > bufsize)
+		fprintf(stderr, "bufsize increased to %lu\n",
+			num_pages * getpagesize());
+
+	/*
+	 * Determine the fd for the 'buffers' BPF map.
+	 */
+	idp = dt_dlib_get_map(dtp, "buffers");
+	if (idp == NULL || idp->di_id == DT_IDENT_UNDEF)
+		return -ENOENT;
+
+	mapfd = idp->di_id;
+
+	/*
+	 * Allocate the perf event buffer set.
+	 */
+	dtp->dt_pebset = dt_zalloc(dtp, sizeof(dt_pebset_t));
+	if (dtp->dt_pebset == NULL)
+		return -ENOMEM;
+
+	/*
+	 * Allocate the per-CPU perf event buffers.
+	 */
+	pebs = dt_calloc(dtp, dtp->dt_conf.numcpus, sizeof(struct dt_peb));
+	if (pebs == NULL)
+		return -ENOMEM;
+
+	dtp->dt_pebset->pebs = pebs;
+
+	dtp->dt_pebset->page_size = getpagesize();
+	dtp->dt_pebset->data_size = num_pages * dtp->dt_pebset->page_size;
+
+	/*
+	 * Initialize a perf event buffer for each online CPU.
+	 */
+	for (i = 0; i < dtp->dt_conf.numcpus; i++) {
+		int			cpu = dtp->dt_conf.cpuids[i];
+		struct epoll_event	ev;
+		dt_peb_t		*peb = &dtp->dt_pebset->pebs[i];
+
+		peb->dtp = dtp;
+		peb->cpu = cpu;
+
+		/*
+		 * Try to create the perf event buffer for 'cpu'.
+		 */
+		if (dt_peb_open(peb) == -1)
+			continue;
+
+		/*
+		 * Add the perf event buffer to the event polling descriptor.
+		 * If we cannot add the buffer for polling, we destroy the
+		 * buffer and move on.
+		 */
+		ev.events = EPOLLIN;
+		ev.data.ptr = peb;
+		assert(dtp->dt_poll_fd >= 0);
+		if (epoll_ctl(dtp->dt_poll_fd, EPOLL_CTL_ADD,
+			      peb->fd, &ev) == -1) {
+			dt_peb_close(peb);
+			continue;
+		}
+
+		/*
+		 * Store the perf event buffer file descriptor in the 'buffers'
+		 * BPF map.
+		 */
+		dt_bpf_map_update(mapfd, &cpu, &peb->fd);
+	}
+
+	return 0;
+}
diff --git a/libdtrace/dt_peb.h b/libdtrace/dt_peb.h
new file mode 100644
index 00000000..961db93c
--- /dev/null
+++ b/libdtrace/dt_peb.h
@@ -0,0 +1,51 @@
+/*
+ * Oracle Linux DTrace.
+ * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
+ * Licensed under the Universal Permissive License v 1.0 as shown at
+ * http://oss.oracle.com/licenses/upl.
+ */
+
+#ifndef	_DT_PEB_H
+#define	_DT_PEB_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <dt_impl.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * Perf event buffer.
+ */
+typedef struct dt_peb {
+	dtrace_hdl_t	*dtp;		/* pointer to containing dtrace_hdl */
+	int		cpu;		/* ID of CPU that uses this buffer */
+	int		fd;		/* fd of perf output buffer */
+	char		*base;		/* address of buffer */
+	char		*endp;		/* address of end of buffer */
+} dt_peb_t;
+
+/*
+ * Set of perf event buffers.  This structure stores buffer information that is
+ * shared between all buffers, a shared chunk of memory to copy any event that
+ * spans the ring buffer boundary, and an array of perf event buffers.
+ */
+typedef struct dt_pebset {
+	size_t		page_size;	/* size of each page in buffer */
+	size_t		data_size;	/* total buffer size */
+	struct dt_peb	*pebs;		/* array of perf event buffers */
+	char		*tmp;		/* temporary event buffer */
+	size_t		tmp_len;	/* length of temporary event buffer */
+} dt_pebset_t;
+
+extern void dt_pebs_exit(dtrace_hdl_t *);
+extern int dt_pebs_init(dtrace_hdl_t *, size_t);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _DT_PEB_H */
diff --git a/libdtrace/dt_work.c b/libdtrace/dt_work.c
index b9e29e08..afe37946 100644
--- a/libdtrace/dt_work.c
+++ b/libdtrace/dt_work.c
@@ -6,12 +6,14 @@
  */
 
 #include <dt_impl.h>
+#include <dt_peb.h>
 #include <stddef.h>
 #include <errno.h>
 #include <assert.h>
 #include <time.h>
 #include <libproc.h>
 #include <port.h>
+#include <sys/epoll.h>
 
 static const struct {
 	int dtslt_option;
@@ -160,8 +162,9 @@ dtrace_status(dtrace_hdl_t *dtp)
 int
 dtrace_go(dtrace_hdl_t *dtp)
 {
-	void *dof;
-	int err;
+	void	*dof;
+	size_t	size;
+	int	err;
 
 	if (dtp->dt_active)
 		return (dt_set_errno(dtp, EINVAL));
@@ -178,6 +181,7 @@ dtrace_go(dtrace_hdl_t *dtp)
 	    dtp->dt_errno != ENOTTY || dtp->dt_vector == NULL))
 		return (-1); /* dt_errno has been set for us */
 
+#if 0
 	if ((dof = dtrace_getopt_dof(dtp)) == NULL)
 		return (-1); /* dt_errno has been set for us */
 
@@ -186,7 +190,19 @@ dtrace_go(dtrace_hdl_t *dtp)
 
 	if (err == -1 && (errno != ENOTTY || dtp->dt_vector == NULL))
 		return (dt_set_errno(dtp, errno));
+#endif
 
+	/*
+	 * Set up the event polling file descriptor.
+	 */
+	dtp->dt_poll_fd = epoll_create1(EPOLL_CLOEXEC);
+	if (dtp->dt_poll_fd < 0)
+		return dt_set_errno(dtp, errno);
+
+	dtrace_getopt(dtp, "bufsize", &size);
+	dt_pebs_init(dtp, size);
+
+#if 0
 	if (dt_ioctl(dtp, DTRACEIOC_GO, &dtp->dt_beganon) == -1) {
 		if (errno == EACCES)
 			return (dt_set_errno(dtp, EDT_DESTRUCTIVE));
@@ -205,13 +221,18 @@ dtrace_go(dtrace_hdl_t *dtp)
 
 		return (dt_set_errno(dtp, errno));
 	}
+#endif
 
 	dtp->dt_active = 1;
 
+#if 0
 	if (dt_options_load(dtp) == -1)
 		return (dt_set_errno(dtp, errno));
 
 	return (dt_aggregate_go(dtp));
+#else
+	return 0;
+#endif
 }
 
 int
@@ -240,7 +261,7 @@ dtrace_stop(dtrace_hdl_t *dtp)
 	return (0);
 }
 
-
+#if 0
 dtrace_workstatus_t
 dtrace_work(dtrace_hdl_t *dtp, FILE *fp,
     dtrace_consume_probe_f *pfunc, dtrace_consume_rec_f *rfunc, void *arg)
@@ -284,11 +305,28 @@ dtrace_work(dtrace_hdl_t *dtp, FILE *fp,
 		return (rval);
 	}
 
+#if 0
 	if (dtrace_aggregate_snap(dtp) == -1)
 		return (DTRACE_WORKSTATUS_ERROR);
+#endif
 
 	if (dtrace_consume(dtp, fp, pfunc, rfunc, arg) == -1)
 		return (DTRACE_WORKSTATUS_ERROR);
 
 	return (rval);
 }
+#else
+dtrace_workstatus_t
+dtrace_work(dtrace_hdl_t *dtp, FILE *fp, dtrace_consume_probe_f *pfunc,
+	    dtrace_consume_rec_f *rfunc, void *arg)
+{
+	dtrace_workstatus_t	rval;
+
+	rval = DTRACE_WORKSTATUS_OKAY;
+
+	if (dtrace_consume(dtp, fp, pfunc, rfunc, arg) == -1)
+		return DTRACE_WORKSTATUS_ERROR;
+
+	return rval;
+}
+#endif
-- 
2.25.0