[Ocfs2-tools-commits] jlbec commits r1398 - branches/cman-based/ocfs2_controld

svn-commits at oss.oracle.com svn-commits at oss.oracle.com
Mon Aug 20 13:27:50 PDT 2007


Author: jlbec
Date: 2007-08-20 13:27:48 -0700 (Mon, 20 Aug 2007)
New Revision: 1398

Modified:
   branches/cman-based/ocfs2_controld/action.c
   branches/cman-based/ocfs2_controld/main.c
   branches/cman-based/ocfs2_controld/ocfs2_controld_internal.h
Log:

ocfs2_controld now handles signals and exits cleanly.

Add a proper signal handler that feeds the signals to a pipe.  This way
the main loop can read the signal via POLLIN.  The signal processing
function knows to exit if there are no mounts and stay alive if there are.

When the main loop is broken due to a signal or a cman disconnect, we
now clean up in our exit path.  Any existing mounts are removed from 
user heartbeat.  This can cause a self-fence if there are live mounts, but
that only happens on a very abnormal exit (SEGV, cman exiting) where we
can't stay alive safely anyway.

The functionality for cleaning up a user heartbeat region is moved out
of do_terminate() so it can be used by the exit code.



Modified: branches/cman-based/ocfs2_controld/action.c
===================================================================
--- branches/cman-based/ocfs2_controld/action.c	2007-08-20 07:33:13 UTC (rev 1397)
+++ branches/cman-based/ocfs2_controld/action.c	2007-08-20 20:27:48 UTC (rev 1398)
@@ -890,6 +890,36 @@
 	}
 }
 
+/*
+ * clean_up_mountgroup() - tear down the local state for one mountgroup.
+ *
+ * USE WITH CARE.
+ *
+ * do_terminate() gets here on a normal group_leave(), and in that case
+ * everything below is safe.
+ *
+ * The exit path also gets here when a signal or the death of cman forces
+ * us out.  Then the region is removed from o2cb without any communication
+ * with cman, which can make o2cb self-fence or confuse cman.  That is the
+ * accepted cost of taking the daemon down while mounts are live.
+ *
+ * mg is unlinked from its list and freed; it must not be used afterwards.
+ */
+void clean_up_mountgroup(struct mountgroup *mg)
+{
+	int err;
+
+	/*
+	 * We no longer care about any member of our local region, so take
+	 * them all down.  down_members() would complain unless start_type
+	 * says LEAVE, so force it first.
+	 */
+	mg->start_type = GROUP_NODE_LEAVE;
+	down_members(mg, 0, NULL);
+	assert(list_empty(&mg->members));
+
+	/* A failed region removal is only logged; teardown continues. */
+	err = drop_region(mg);
+	if (err)
+		log_error("Error removing region %s", mg->uuid);
+
+	list_del(&mg->list);
+	free(mg);
+}
+
 void do_terminate(struct mountgroup *mg)
 {
 	log_group(mg, "termination of our unmount leave");
@@ -912,20 +942,7 @@
 	 */
 	assert(list_empty(&mg->mountpoints));
 
-	/*
-	 * Drop all members from our local region, as we don't care about
-	 * them anymore.  Force start_type to LEAVE so that down_members()
-	 * doesn't complain.
-	 */
-	mg->start_type = GROUP_NODE_LEAVE;
-	down_members(mg, 0, NULL);
-	assert(list_empty(&mg->members));
-
-	if (drop_region(mg))
-		log_error("Error removing region %s", mg->uuid);
-
-	list_del(&mg->list);
-	free(mg);
+	clean_up_mountgroup(mg);
 }
 
 void dump_state(void)

Modified: branches/cman-based/ocfs2_controld/main.c
===================================================================
--- branches/cman-based/ocfs2_controld/main.c	2007-08-20 07:33:13 UTC (rev 1397)
+++ branches/cman-based/ocfs2_controld/main.c	2007-08-20 20:27:48 UTC (rev 1398)
@@ -44,12 +44,123 @@
 
 static int cman_fd;
 static int listen_fd;
+static int sigpipe_fd;
+static int sigpipe_write_fd;
 static int groupd_fd;
 
 extern struct list_head mounts;
 extern struct list_head withdrawn_mounts;
 int no_withdraw;
 
+/*
+ * handler() - signal handler installed by setup_sigpipe().
+ *
+ * Forwards the signal number to the main loop by writing it, as a raw
+ * int, to the write end of the signal pipe.  The main loop sees POLLIN on
+ * the read end and decodes it in handle_signal().
+ *
+ * errno is saved and restored so the code that was interrupted does not
+ * observe a value changed by this handler.
+ *
+ * NOTE(review): log_debug()/log_error() are defined elsewhere; if they
+ * use stdio or syslog they are not async-signal-safe - confirm.
+ */
+static void handler(int signum)
+{
+	int saved_errno = errno;
+	ssize_t written;
+
+	log_debug("Caught signal %d\n", signum);
+
+	/*
+	 * write() returns a signed ssize_t.  Comparing it directly with
+	 * sizeof() would convert -1 to a huge unsigned value and hide the
+	 * failure, so compare in the signed domain.
+	 */
+	written = write(sigpipe_write_fd, &signum, sizeof(signum));
+	if (written != (ssize_t)sizeof(signum))
+		log_error("Problem writing signal: %s\n",
+			  strerror(written < 0 ? errno : EIO));
+
+	errno = saved_errno;
+}
+
+/*
+ * handle_signal() - consume one signal number from the signal pipe and
+ * decide whether the daemon should keep running.
+ *
+ * Returns:
+ *   < 0  negative errno if the pipe could not be read (-EIO on a short
+ *        read)
+ *     0  the signal was ignored; keep running
+ *     1  the main loop should stop
+ *
+ * SIGQUIT with no mounts, or a second SIGSEGV, calls abort() instead of
+ * returning.
+ */
+static int handle_signal(void)
+{
+	static int segv_already = 0;
+	int caught_sig;
+	int want_abort = 0;
+	int rc;
+
+	rc = read(sigpipe_fd, &caught_sig, sizeof(caught_sig));
+	if (rc < 0) {
+		rc = -errno;
+		log_error("Error reading from signal pipe: %s",
+			  strerror(-rc));
+		return rc;
+	}
+	if (rc != sizeof(caught_sig)) {
+		rc = -EIO;
+		log_error("Error reading from signal pipe: %s",
+			  strerror(-rc));
+		return rc;
+	}
+
+	if (caught_sig == SIGQUIT || caught_sig == SIGTERM ||
+	    caught_sig == SIGINT || caught_sig == SIGHUP) {
+		/* Only SIGQUIT asks for abort() on the way out. */
+		if (caught_sig == SIGQUIT)
+			want_abort = 1;
+
+		/* Termination requests are honoured only with no mounts. */
+		if (!list_empty(&mounts)) {
+			log_error("Caught signal %d, but mounts exist.  Ignoring.",
+				  caught_sig);
+			rc = 0;
+		} else {
+			log_error("Caught signal %d, exiting",
+				  caught_sig);
+			rc = 1;
+		}
+	} else if (caught_sig == SIGSEGV) {
+		log_error("Segmentation fault, exiting");
+		rc = 1;
+		if (!segv_already) {
+			segv_already = 1;
+		} else {
+			log_error("Segmentation fault loop detected");
+			want_abort = 1;
+		}
+	} else {
+		log_error("Caught signal %d, ignoring", caught_sig);
+		rc = 0;
+	}
+
+	if (want_abort && rc)
+		abort();
+
+	return rc;
+}
+
+/*
+ * setup_sigpipe() - create the signal pipe and install signal handlers.
+ *
+ * The read end is stored in sigpipe_fd and the write end in
+ * sigpipe_write_fd.  handler() is installed for SIGTERM, SIGINT, SIGHUP,
+ * SIGQUIT and SIGSEGV, and SIGPIPE is ignored so that writes to a closed
+ * peer fail with EPIPE instead of killing the process.
+ *
+ * Returns 0 on success or a negative errno on failure.
+ *
+ * NOTE(review): the write end is left in blocking mode, so handler()
+ * could block if the pipe ever filled up - confirm whether O_NONBLOCK is
+ * wanted here.
+ */
+static int setup_sigpipe(void)
+{
+	static const int caught[] = {
+		SIGTERM, SIGINT, SIGHUP, SIGQUIT, SIGSEGV,
+	};
+	int rc;
+	size_t i;
+	int signal_pipe[2];
+	struct sigaction act;
+
+	rc = pipe(signal_pipe);
+	if (rc) {
+		rc = -errno;
+		log_error("Unable to set up signal pipe: %s",
+			  strerror(-rc));
+		goto out;
+	}
+
+	sigpipe_fd = signal_pipe[0];
+	sigpipe_write_fd = signal_pipe[1];
+
+	/*
+	 * Zero the whole structure so that sa_flags is defined even when
+	 * SA_INTERRUPT is not available, and so that no platform-specific
+	 * field (such as Linux's sa_restorer) has to be named explicitly.
+	 */
+	memset(&act, 0, sizeof(act));
+	sigemptyset(&act.sa_mask);
+	act.sa_handler = handler;
+#ifdef SA_INTERRUPT
+	act.sa_flags = SA_INTERRUPT;
+#endif
+
+	/* Stop at the first failure and keep its errno for the caller. */
+	for (i = 0; !rc && i < sizeof(caught) / sizeof(caught[0]); i++) {
+		if (sigaction(caught[i], &act, NULL))
+			rc = -errno;
+	}
+
+	if (!rc) {
+		act.sa_handler = SIG_IGN;
+		if (sigaction(SIGPIPE, &act, NULL))  /* Get EPIPE instead */
+			rc = -errno;
+	}
+
+	if (rc)
+		log_error("Unable to set up signal handlers: %s",
+			  strerror(-rc));
+
+out:
+	return rc;
+}
+
 int do_read(int fd, void *buf, size_t count)
 {
 	int rv, off = 0;
@@ -276,6 +387,27 @@
 	return rv;
 }
 
+/*
+ * bail_on_mounts() - drop every mountgroup still on the mounts list.
+ *
+ * USE WITH CARE.
+ *
+ * This runs when loop() is stopping, i.e. after a signal or after the
+ * cman/groupd connection has died.  Each remaining mountgroup is handed to
+ * clean_up_mountgroup(), which removes its region from o2cb without
+ * talking to cman.  With live mounts that can make o2cb self-fence or
+ * confuse cman; that is the accepted cost of taking the daemon down in
+ * this state.
+ */
+static void bail_on_mounts(void)
+{
+	struct list_head *pos, *next;
+
+	/*
+	 * clean_up_mountgroup() unlinks and frees the entry, so the _safe
+	 * iterator is required.
+	 */
+	list_for_each_safe(pos, next, &mounts)
+		clean_up_mountgroup(list_entry(pos, struct mountgroup, list));
+}
+
+
 static int loop(void)
 {
 	int rv, i, f, poll_timeout = -1;
@@ -285,6 +417,11 @@
 		goto out;
 	client_add(listen_fd);
 
+	rv = setup_sigpipe();
+	if (rv < 0)
+		goto out;
+	client_add(sigpipe_fd);
+
 	rv = cman_fd = setup_cman();
 	if (rv < 0)
 		goto out;
@@ -299,8 +436,9 @@
 
 	for (;;) {
 		rv = poll(pollfd, client_maxi + 1, poll_timeout);
-		if (rv < 0)
+		if ((rv < 0) && (errno != EINTR))
 			log_error("poll error %d errno %d", rv, errno);
+		rv = 0;
 
 		/* client[0] is listening for new connections */
 
@@ -321,23 +459,33 @@
 					process_groupd();
 				else if (pollfd[i].fd == cman_fd)
 					process_cman();
-				else
+				else if (pollfd[i].fd == sigpipe_fd) {
+					rv = handle_signal();
+					if (rv)
+						goto stop;
+				} else
 					process_client(i);
 			}
 
 			if (pollfd[i].revents & POLLHUP) {
 				if (pollfd[i].fd == cman_fd) {
 					log_error("cman connection died");
-					exit_cman();
+					goto stop;
 				} else if (pollfd[i].fd == groupd_fd) {
 					log_error("groupd connection died");
-					exit_cman();
+					goto stop;
 				}
 				client_dead(i);
 			}
 		}
 	}
-	rv = 0;
+
+stop:
+	if (!rv && !list_empty(&mounts))
+		rv = 1;
+
+	bail_on_mounts();
+
  out:
 	return rv;
 }

Modified: branches/cman-based/ocfs2_controld/ocfs2_controld_internal.h
===================================================================
--- branches/cman-based/ocfs2_controld/ocfs2_controld_internal.h	2007-08-20 07:33:13 UTC (rev 1397)
+++ branches/cman-based/ocfs2_controld/ocfs2_controld_internal.h	2007-08-20 20:27:48 UTC (rev 1398)
@@ -247,6 +247,7 @@
 	       const char *mountpoint);
 int do_remount(int ci, char *dir, char *mode);
 void ping_kernel_mount(char *table);
+void clean_up_mountgroup(struct mountgroup *mg);
 
 int client_send(int ci, char *buf, int len);
 




More information about the Ocfs2-tools-commits mailing list