[Ocfs2-tools-devel] [PATCH 1/2] o2hbmonitor: Disk heartbeat monitor
Srinivas Eeda
srinivas.eeda at oracle.com
Fri Nov 12 16:22:32 PST 2010
From: Sunil Mushran <sunil.mushran at oracle.com>
o2hbmonitor monitors the o2cb disk heartbeat. It periodically reads the o2hb
debugfs file, elapsed_time_in_ms, and checks whether the elapsed time is
greater than the warn threshold. If so, it logs a message to syslog.
Signed-off-by: Sunil Mushran <sunil.mushran at oracle.com>
---
Makefile | 2 +-
o2monitor/.gitignore | 3 +
o2monitor/Makefile | 26 ++
o2monitor/o2hbmonitor.c | 367 +++++++++++++++++++++++++++++
vendor/common/ocfs2-tools.spec-generic.in | 1 +
5 files changed, 398 insertions(+), 1 deletions(-)
create mode 100644 o2monitor/.gitignore
create mode 100644 o2monitor/Makefile
create mode 100644 o2monitor/o2hbmonitor.c
diff --git a/Makefile b/Makefile
index 88106fb..65c13f9 100644
--- a/Makefile
+++ b/Makefile
@@ -20,7 +20,7 @@ CHKCONFIG_DEP = chkconfig
COMPILE_PY = 1
endif
-SUBDIRS = include libtools-internal libo2dlm libo2cb libocfs2 fsck.ocfs2 mkfs.ocfs2 mounted.ocfs2 tunefs.ocfs2 debugfs.ocfs2 o2cb_ctl ocfs2_hb_ctl mount.ocfs2 ocfs2_controld o2image listuuid sizetest extras fswreck patches
+SUBDIRS = include libtools-internal libo2dlm libo2cb libocfs2 fsck.ocfs2 mkfs.ocfs2 mounted.ocfs2 tunefs.ocfs2 debugfs.ocfs2 o2cb_ctl ocfs2_hb_ctl mount.ocfs2 ocfs2_controld o2image o2monitor listuuid sizetest extras fswreck patches
ifdef BUILD_OCFS2CONSOLE
SUBDIRS += ocfs2console
diff --git a/o2monitor/.gitignore b/o2monitor/.gitignore
new file mode 100644
index 0000000..323fba6
--- /dev/null
+++ b/o2monitor/.gitignore
@@ -0,0 +1,3 @@
+.*.sw?
+*.d
+o2hbmonitor
diff --git a/o2monitor/Makefile b/o2monitor/Makefile
new file mode 100644
index 0000000..961eafa
--- /dev/null
+++ b/o2monitor/Makefile
@@ -0,0 +1,26 @@
+TOPDIR = ..
+
+include $(TOPDIR)/Preamble.make
+
+sbindir = $(root_sbindir)
+SBIN_PROGRAMS = o2hbmonitor
+
+WARNINGS = -Wall -Wstrict-prototypes -Wno-format -Wmissing-prototypes \
+ -Wmissing-declarations
+
+CFLAGS = $(OPTS) $(WARNINGS)
+
+INCLUDES = -I$(TOPDIR)/include -I.
+
+DEFINES = -DVERSION=\"$(VERSION)\"
+
+CFILES = o2hbmonitor.c
+
+OBJS = $(subst .c,.o,$(CFILES))
+
+DIST_FILES = $(CFILES) $(HFILES)
+
+o2hbmonitor: $(OBJS)
+ $(LINK)
+
+include $(TOPDIR)/Postamble.make
diff --git a/o2monitor/o2hbmonitor.c b/o2monitor/o2hbmonitor.c
new file mode 100644
index 0000000..f01da76
--- /dev/null
+++ b/o2monitor/o2hbmonitor.c
@@ -0,0 +1,367 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * o2hbmonitor.c
+ *
+ * Monitors o2hb
+ *
+ * Copyright (C) 2010 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+/*
+ * This utility requires the o2hb debugfs file elapsed_time_in_ms which shows
+ * the time since the o2hb heartbeat timer was last armed. This file was added
+ * in the mainline kernel via commit 43695d095dfaf266a8a940d9b07eed7f46076b49.
+ *
+ * This utility scans the configfs to see if the cluster is up. If not up, it
+ * checks again after CONFIG_POLL_IN_SECS.
+ *
+ * If up, it loads the dead threshold and then scans the debugfs file,
+ * elapsed_time_in_ms, of each heartbeat region. If the elapsed time is
+ * greater than the warn threshold, it logs a message in syslog.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <linux/types.h>
+#include <sys/stat.h>
+#include <dirent.h>
+#include <fcntl.h>
+#include <string.h>
+#include <libgen.h>
+#include <syslog.h>
+#include <errno.h>
+
+/*
+ * configfs paths. The first %s in the heartbeat paths is the cluster name;
+ * the second %s in O2HB_DEVICE is the heartbeat region name.
+ */
+#define SYS_CONFIG_DIR "/sys/kernel/config"
+#define O2HB_CLUSTER_DIR SYS_CONFIG_DIR"/cluster"
+#define O2HB_HEARTBEAT_DIR O2HB_CLUSTER_DIR"/%s/heartbeat"
+#define O2HB_DEAD_THRESHOLD O2HB_HEARTBEAT_DIR"/dead_threshold"
+#define O2HB_DEVICE O2HB_HEARTBEAT_DIR"/%s/dev"
+
+/* debugfs paths. The %s in O2HB_ELAPSED_TIME is the heartbeat region name. */
+#define SYS_DEBUG_DIR "/sys/kernel/debug"
+#define O2HB_DEBUG_DIR SYS_DEBUG_DIR"/o2hb"
+#define O2HB_ELAPSED_TIME O2HB_DEBUG_DIR"/%s/elapsed_time_in_ms"
+
+/*
+ * Converts the value read from dead_threshold into milliseconds.
+ * NOTE(review): with an unsigned argument of 0 the subtraction wraps.
+ */
+#define DEAD_THRESHOLD_IN_MSECS(a) (((a) - 1) * 2000)
+/* Default warn threshold, as a percentage of the dead threshold (see -w) */
+#define WARN_THRESHOLD_PERCENT 50
+
+/*
+ * Sleep intervals for the monitor loop: CONFIG_POLL while the cluster is
+ * not up or cannot be read, SLOW_POLL as the normal scan interval, and
+ * FAST_POLL once a region has crossed the warn threshold.
+ */
+#define CONFIG_POLL_IN_SECS 60
+#define SLOW_POLL_IN_SECS 10
+#define FAST_POLL_IN_SECS 2
+
+/* Command-line state, initialised in main() */
+char *progname;			/* basename of argv[0] */
+int interactive;			/* -i: stay in the foreground */
+int warn_threshold_percent;		/* -w: warn at this % of dead threshold */
+int verbose;				/* -v: also print each reading to stdout */
+
+/* Cluster state, refreshed on every pass of the monitor loop */
+char *cluster_name;			/* strdup()'d; owned by populate_cluster() */
+unsigned long dead_threshold_in_ms;	/* set by populate_thresholds() */
+unsigned long warn_threshold_in_ms;	/* set by populate_thresholds() */
+unsigned long poll_in_secs;		/* sleep time before the next scan */
+
+/*
+ * Prints "<program name> <version>" on stderr.
+ */
+static void show_version(void)
+{
+	const char *name = progname;
+
+	fprintf(stderr, "%s %s\n", name, VERSION);
+}
+
+/*
+ * Strips trailing whitespace (including newlines) from str in place.
+ *
+ * str: NUL-terminated, writable string
+ *
+ * Returns str, so the call can be chained.
+ */
+static char *do_strchomp(char *str)
+{
+	size_t len = strlen(str);
+
+	/*
+	 * The cast matters: passing a negative char to isspace() is
+	 * undefined. Indexing from the end also avoids forming a pointer
+	 * before the start of the buffer when the string is all whitespace.
+	 */
+	while (len > 0 && isspace((unsigned char)str[len - 1]))
+		str[--len] = '\0';
+
+	return str;
+}
+
+/*
+ * Reads the start of the file at path into value as a NUL-terminated
+ * string with trailing whitespace removed.
+ *
+ * path:  file to read
+ * value: destination buffer
+ * count: size of value in bytes, including room for the terminating NUL
+ *
+ * Returns 0 on success. Returns -1 if the file cannot be opened or read,
+ * if it is empty, or if count leaves no room for any data; value is then
+ * left as an empty string whenever count allows it.
+ */
+static int get_value(char *path, char *value, int count)
+{
+	int fd;
+	ssize_t nread;
+
+	if (count < 1)
+		return -1;
+	value[0] = '\0';
+	if (count < 2)
+		return -1;
+
+	fd = open(path, O_RDONLY);
+	/* 0 is a valid descriptor; only a negative value signals failure */
+	if (fd < 0)
+		return -1;
+
+	/* Hold one byte back so the terminator always fits in the buffer */
+	nread = read(fd, value, count - 1);
+	close(fd);
+
+	/* A read error or an empty file yields no value to hand back */
+	if (nread <= 0)
+		return -1;
+
+	value[nread] = '\0';
+	do_strchomp(value);
+
+	return 0;
+}
+
+/*
+ * Looks up the device recorded for a heartbeat region.
+ *
+ * Reads the region's "dev" attribute under the current cluster's heartbeat
+ * directory and, on success, stores a strdup()'d copy in *device, which the
+ * caller must free. *device is left untouched when the attribute cannot be
+ * read.
+ */
+static void get_device_name(char *region, char **device)
+{
+	char dev_path[PATH_MAX];
+	char dev_name[255];
+
+	sprintf(dev_path, O2HB_DEVICE, cluster_name, region);
+
+	if (get_value(dev_path, dev_name, sizeof(dev_name)) == 0)
+		*device = strdup(dev_name);
+}
+
+/*
+ * Reports the elapsed heartbeat time of one region.
+ *
+ * region:  heartbeat region name
+ * elapsed: milliseconds read from the region's elapsed_time_in_ms file
+ *
+ * In verbose mode every reading is printed on stdout. When elapsed has
+ * reached warn_threshold_in_ms, the reading is also logged to syslog and
+ * poll_in_secs is lowered to FAST_POLL_IN_SECS.
+ */
+static void process_elapsed_time(char *region, unsigned long elapsed)
+{
+	int warn = (elapsed >= warn_threshold_in_ms);
+	char *device = NULL;
+	const char *devname;
+
+	if (!verbose && !warn)
+		return;
+
+	get_device_name(region, &device);
+
+	/*
+	 * The lookup leaves device NULL when it fails, and passing NULL
+	 * to %s is undefined, so substitute a printable placeholder.
+	 */
+	devname = device ? device : "(unknown)";
+
+	if (verbose)
+		fprintf(stdout, "Last ping %lu msecs ago on /dev/%s, %s\n",
+			elapsed, devname, region);
+
+	if (warn) {
+		poll_in_secs = FAST_POLL_IN_SECS;
+		syslog(LOG_WARNING, "Last ping %lu msecs ago on /dev/%s, %s\n",
+		       elapsed, devname, region);
+	}
+
+	free(device);
+}
+
+/*
+ * Reads the elapsed_time_in_ms debugfs file of a heartbeat region.
+ *
+ * *elapsed is reset to 0 first and receives the parsed value on success.
+ *
+ * Returns 0 on success, or the non-zero result of get_value() on failure.
+ */
+static int read_elapsed_time(char *region, unsigned long *elapsed)
+{
+	char file[PATH_MAX];
+	char text[32];
+	int err;
+
+	*elapsed = 0;
+
+	sprintf(file, O2HB_ELAPSED_TIME, region);
+
+	err = get_value(file, text, sizeof(text));
+	if (err)
+		return err;
+
+	*elapsed = strtoul(text, NULL, 0);
+	return 0;
+}
+
+/*
+ * Walks the o2hb debugfs directory and processes the elapsed time of every
+ * region directory found there. Does nothing if the directory cannot be
+ * opened; regions whose elapsed time cannot be read are skipped.
+ */
+static void scan_heartbeat_regions(void)
+{
+	DIR *dbg;
+	struct dirent *entry;
+	unsigned long elapsed;
+
+	dbg = opendir(O2HB_DEBUG_DIR);
+	if (!dbg)
+		return;
+
+	while ((entry = readdir(dbg)) != NULL) {
+		/* d_type 4 is the directory type (DT_DIR on Linux) */
+		if (entry->d_type != 4)
+			continue;
+		if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, ".."))
+			continue;
+
+		if (read_elapsed_time(entry->d_name, &elapsed) == 0)
+			process_elapsed_time(entry->d_name, elapsed);
+	}
+
+	closedir(dbg);
+}
+
+/*
+ * Loads the cluster's dead_threshold and derives the thresholds used by
+ * the scanner.
+ *
+ * On success, dead_threshold_in_ms is set from the configfs value and
+ * warn_threshold_in_ms is set to warn_threshold_percent of it.
+ *
+ * Returns 0 on success, or -1 if the value cannot be read, does not parse
+ * as a number, or is zero. The globals are left unchanged on failure.
+ */
+static int populate_thresholds(void)
+{
+	char path[PATH_MAX];
+	char val[32];
+	char *end;
+	unsigned long threshold;
+
+	sprintf(path, O2HB_DEAD_THRESHOLD, cluster_name);
+	if (get_value(path, val, sizeof(val)))
+		return -1;
+
+	errno = 0;
+	threshold = strtoul(val, &end, 0);
+
+	/*
+	 * Reject input that did not parse, and zero in particular: the
+	 * conversion macro subtracts one, which would wrap to a huge
+	 * unsigned value and silently disable every warning.
+	 */
+	if (end == val || errno == ERANGE || threshold < 1)
+		return -1;
+
+	dead_threshold_in_ms = DEAD_THRESHOLD_IN_MSECS(threshold);
+	warn_threshold_in_ms =
+		(dead_threshold_in_ms * warn_threshold_percent / 100);
+
+	return 0;
+}
+
+/*
+ * Refreshes cluster_name with the first cluster directory found in
+ * configfs. Any previously stored name is freed first.
+ *
+ * Returns 0 when a name was stored. Returns -1 when the cluster directory
+ * cannot be opened, holds no cluster directory, or the name could not be
+ * duplicated.
+ */
+static int populate_cluster(void)
+{
+	DIR *clusters;
+	struct dirent *entry;
+
+	free(cluster_name);
+	cluster_name = NULL;
+
+	clusters = opendir(O2HB_CLUSTER_DIR);
+	if (!clusters)
+		return -1;
+
+	for (entry = readdir(clusters); entry; entry = readdir(clusters)) {
+		/* d_type 4 is the directory type (DT_DIR on Linux) */
+		if (entry->d_type != 4)
+			continue;
+		if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, ".."))
+			continue;
+
+		cluster_name = strdup(entry->d_name);
+		break;
+	}
+
+	closedir(clusters);
+
+	return cluster_name ? 0 : -1;
+}
+
+/*
+ * Checks whether the cluster can be monitored.
+ *
+ * Returns 1 when both the configfs cluster directory and the o2hb debugfs
+ * directory exist, 0 otherwise. When only the debugfs directory is
+ * missing, a reminder to mount debugfs is logged on the first such call
+ * and on every tenth one after that.
+ */
+static int is_cluster_up(void)
+{
+	static int warn_count = 0;
+	struct stat st;
+
+	if (stat(O2HB_CLUSTER_DIR, &st) != 0)
+		return 0;
+
+	if (stat(O2HB_DEBUG_DIR, &st) == 0)
+		return 1;
+
+	if ((warn_count++ % 10) == 0)
+		syslog(LOG_WARNING,
+		       "mount debugfs at /sys/kernel/debug");
+
+	return 0;
+}
+
+/*
+ * Main loop; never returns.
+ *
+ * Each pass re-checks that the cluster is up and reloads the cluster name
+ * and thresholds. If any of that fails, it sleeps CONFIG_POLL_IN_SECS and
+ * retries. Otherwise it scans the heartbeat regions and sleeps
+ * poll_in_secs, which the scan may have lowered from the slow default.
+ */
+static void monitor(void)
+{
+	for (;;) {
+		/* Evaluated left to right; the first failure stops the chain */
+		if (!is_cluster_up() ||
+		    populate_cluster() ||
+		    populate_thresholds()) {
+			sleep(CONFIG_POLL_IN_SECS);
+			continue;
+		}
+
+		poll_in_secs = SLOW_POLL_IN_SECS;
+
+		scan_heartbeat_regions();
+
+		sleep(poll_in_secs);
+	}
+}
+
+/*
+ * Prints the command-line help on stderr and exits with status 1.
+ */
+static void usage(void)
+{
+	fprintf(stderr, "usage: %s [-w percent] -[ivV]\n", progname);
+	/* Print the real default so the help cannot drift from the code */
+	fprintf(stderr, "\t -w, Warn threshold percent (default %d%%)\n",
+		WARN_THRESHOLD_PERCENT);
+	fprintf(stderr, "\t -i, Interactive\n");
+	fprintf(stderr, "\t -v, Verbose\n");
+	fprintf(stderr, "\t -V, Version\n");
+	exit(1);
+}
+
+/*
+ * Entry point: parses options, optionally daemonizes, and runs the monitor.
+ *
+ * Options:
+ *   -w percent  warn threshold as a percentage of the dead threshold;
+ *               values outside 1..99 fall back to the default
+ *   -i          interactive, do not daemonize
+ *   -v          verbose; also keeps stdio open when daemonizing
+ *   -V          print the version and exit
+ *   -h, -?      print usage and exit
+ */
+int main(int argc, char **argv)
+{
+	int c, ret, version = 0;
+	long percent;
+
+	/* init globals */
+	progname = basename(argv[0]);
+	interactive = 0;
+	warn_threshold_percent = WARN_THRESHOLD_PERCENT;
+	verbose = 0;
+	cluster_name = NULL;
+
+	while (1) {
+		c = getopt(argc, argv, "w:i?hvV");
+		if (c == -1)
+			break;
+		switch (c) {
+		case 'i':
+			interactive = 1;
+			break;
+		case 'v':
+			++verbose;
+			break;
+		case 'w':
+			/*
+			 * Range-check in the wide type before narrowing, so
+			 * an oversized value cannot truncate into 1..99.
+			 */
+			percent = strtol(optarg, NULL, 0);
+			if (percent < 1 || percent > 99)
+				percent = WARN_THRESHOLD_PERCENT;
+			warn_threshold_percent = (int)percent;
+			break;
+		case 'V':
+			version = 1;
+			break;
+		case '?':
+		case 'h':
+		default:
+			usage();
+			break;
+		}
+	}
+
+	/* A version query must not go on to start the monitor */
+	if (version) {
+		show_version();
+		return 0;
+	}
+
+	if (!interactive) {
+		ret = daemon(0, verbose);
+		/* Best effort: keep monitoring in the foreground on failure */
+		if (ret)
+			fprintf(stderr, "Unable to daemonize, %s\n",
+				strerror(errno));
+	}
+
+	openlog(progname, LOG_CONS|LOG_NDELAY, LOG_DAEMON);
+	monitor();
+	closelog();
+
+	return 0;
+}
diff --git a/vendor/common/ocfs2-tools.spec-generic.in b/vendor/common/ocfs2-tools.spec-generic.in
index 3e9b46a..44a65cd 100644
--- a/vendor/common/ocfs2-tools.spec-generic.in
+++ b/vendor/common/ocfs2-tools.spec-generic.in
@@ -118,6 +118,7 @@ fi
/sbin/o2cb_ctl
/sbin/mount.ocfs2
/sbin/o2image
+/usr/sbin/o2hbmonitor
/sbin/ocfs2_hb_ctl
/etc/init.d/o2cb
/etc/init.d/ocfs2
--
1.5.6.5
More information about the Ocfs2-tools-devel
mailing list