[Ocfs2-tools-devel] [PATCH 19/39] ocfs2_controld: Join a group on filesystem mount.

Fri Mar 14 16:52:42 PDT 2008

When a filesystem mounts for the first time, join a group in CPG.  This
encompasses all the join code required, but does not handle leave yet.

Signed-off-by: Joel Becker <joel.becker at oracle.com>
---
 ocfs2_controld/cman.c           |   19 +++--
 ocfs2_controld/cpg.c            |  210 +++++++++++++++++++++++++++++++++++++--
 ocfs2_controld/mount.c          |  136 ++++++++++++++++++++++++-
 ocfs2_controld/ocfs2_controld.h |    4 +
 4 files changed, 347 insertions(+), 22 deletions(-)

diff --git a/ocfs2_controld/cman.c b/ocfs2_controld/cman.c
index 520bf99..c5950e1 100644
--- a/ocfs2_controld/cman.c
+++ b/ocfs2_controld/cman.c
@@ -51,7 +51,16 @@ static int              cman_node_count;
 
 int kill_cman(int nodeid)
 {
-	return cman_kill_node(ch_admin, nodeid);
+	int error;	/* Result of cman_kill_node(), handed back to the caller */
+
+	log_debug("killing node %d", nodeid);
+
+	error = cman_kill_node(ch_admin, nodeid);
+	if (error)	/* Non-zero: log the return value together with errno */
+		log_debug("Unable to kill node %d, %d %d", nodeid, error,
+			  errno);
+
+	return error;
 }
 
 static int is_member(cman_node_t *node_list, int count, int nodeid)
@@ -149,16 +158,12 @@ static void cman_callback(cman_handle_t h, void *private, int reason, int arg)
 {
 	switch (reason) {
 		case CMAN_REASON_TRY_SHUTDOWN:
-#if 0
-			if (list_empty(&mounts))
-#endif
+			if (!have_mounts())
 				cman_replyto_shutdown(ch, 1);
-#if 0
 			else {
 				log_debug("no to cman shutdown");
 				cman_replyto_shutdown(ch, 0);
 			}
-#endif
 			break;
 
 		case CMAN_REASON_STATECHANGE:
@@ -208,7 +213,7 @@ int setup_cman(void)
 	}
 
 	ch_admin = cman_admin_init(NULL);
-	if (!ch) {
+	if (!ch_admin) {
 		log_error("cman_admin_init error %d", errno);
 		rv = -ENOTCONN;
 		goto fail_finish;
diff --git a/ocfs2_controld/cpg.c b/ocfs2_controld/cpg.c
index 0d99f18..61276c6 100644
--- a/ocfs2_controld/cpg.c
+++ b/ocfs2_controld/cpg.c
@@ -34,6 +34,11 @@
 
 #include "ocfs2_controld.h"
 
+struct cnode {	/* One node in our own view of a group's membership */
+	struct list_head	cn_list;	/* Link on a cgroup's cg_nodes list */
+	int			cn_nodeid;	/* Node id; find_node() matches on it */
+};
+
 struct cgroup {
 	struct list_head	cg_list;	/* List of all CPG groups */
 
@@ -42,10 +47,25 @@ struct cgroup {
 	int			cg_fd;
 	int			cg_ci;
 
+	/* CPG's idea of the group */
 	struct cpg_name		cg_name;
 	struct cpg_address	cg_members[CPG_MEMBERS_MAX];
 	int			cg_member_count;
 
+	/*
+	 * Our idea of the group.
+	 * This lags cg_members until join/leave processing is complete.
+	 */
+	struct list_head	cg_nodes;
+	int			cg_node_count;
+
+	/* Hooks for mounters */
+	void			(*cg_set_cgroup)(struct cgroup *cg,
+						 void *user_data);
+	void			(*cg_node_down)(int nodeid,
+						void *user_data);
+	void			*cg_user_data;
+
 	/* Callback state */
 	int			cg_got_confchg;
 	struct cpg_address	cg_cb_members[CPG_MEMBERS_MAX];
@@ -99,13 +119,126 @@ void for_each_node(struct cgroup *cg,
 			   for_each_node_proxy, &fen);
 }
 
+/* Look up nodeid on cg->cg_nodes; returns its cnode, or NULL if absent. */
+static struct cnode *find_node(struct cgroup *cg, int nodeid)
+{
+	struct list_head *pos;
+	struct cnode *node;
+
+	list_for_each(pos, &cg->cg_nodes) {
+		node = list_entry(pos, struct cnode, cn_list);
+		if (node->cn_nodeid == nodeid)
+			return node;
+	}
+
+	/* Walked the whole list without a match */
+	return NULL;
+}
+
+/*
+ * Add nodeid to our view of cg: allocate a cnode, link it on
+ * cg->cg_nodes and bump cg->cg_node_count.  If nodeid is already
+ * present, or the allocation fails, log the error and call
+ * shutdown_daemon() instead.
+ */
+static void push_node(struct cgroup *cg, int nodeid)
+{
+	struct cnode *node;
+
+	/* If we got lost in our group members, we can't interact safely. */
+	if (find_node(cg, nodeid) != NULL) {
+		log_error("Node %d is already part of group %.*s", nodeid,
+			  cg->cg_name.length, cg->cg_name.value);
+		shutdown_daemon();
+		return;
+	}
+
+	/* If we can't keep track of the group, we can't interact safely. */
+	node = malloc(sizeof(*node));
+	if (node == NULL) {
+		log_error("Unable to allocate node structure, exiting");
+		shutdown_daemon();
+		return;
+	}
+
+	node->cn_nodeid = nodeid;
+	list_add(&node->cn_list, &cg->cg_nodes);
+	cg->cg_node_count++;
+}
+
+/* Drop nodeid from cg's node list and free its cnode; log if not found. */
+static void pop_node(struct cgroup *cg, int nodeid)
+{
+	struct cnode *cn = find_node(cg, nodeid);
+
+	if (cn) {
+		list_del(&cn->cn_list);
+		free(cn);	/* Allocated by push_node(); list_del() only unlinks */
+		cg->cg_node_count--;
+	} else
+		log_error("Unable to find node %d in group %.*s", nodeid,
+			  cg->cg_name.length, cg->cg_name.value);
+	if (cg->cg_node_count < 0) {	/* Should not happen; clamp the count */
+		log_error("cg_node_count went negative for group %.*s",
+			  cg->cg_name.length, cg->cg_name.value);
+		cg->cg_node_count = 0;
+	}
+}
+
+/* for_each_node_list() callback: record one member of a group we joined. */
+static void push_node_on_join(struct cpg_address *addr, void *user_data)
+{
+	struct cgroup *group = user_data;
+
+	log_debug("Filling node %d to group %.*s", addr->nodeid,
+		  group->cg_name.length, group->cg_name.value);
+
+	push_node(group, addr->nodeid);
+}
+
+/* for_each_node_list() callback: process one entry of cg's joined list. */
+static void handle_node_join(struct cpg_address *addr, void *user_data)
+{
+	struct cgroup *group = user_data;
+
+	log_debug("Node %d joins group %.*s",
+		  addr->nodeid, group->cg_name.length, group->cg_name.value);
+
+	/* Somebody else joined: just add them to our view of the group */
+	if (addr->nodeid != our_nodeid) {
+		push_node(group, addr->nodeid);
+		return;
+	}
+
+	if (group->cg_joined) {
+		log_error("This node has joined group %.*s more than once",
+			  group->cg_name.length, group->cg_name.value);
+		return;
+	}
+
+	/*
+	 * If I read group/daemon/cpg.c correctly, the join list holds only
+	 * our own entry when we ourselves join.  Thus it is safe to add all
+	 * of cg_cb_members; nothing in cg_cb_joined will be added twice.
+	 */
+	log_debug("This node joins group %.*s",
+		  group->cg_name.length, group->cg_name.value);
+	for_each_node_list(group->cg_cb_members, group->cg_cb_member_count,
+			   push_node_on_join, group);
+	group->cg_set_cgroup(group, group->cg_user_data);
+}
 
 static void handle_node_leave(struct cpg_address *addr,
 			      void *user_data)
 {
+	struct cgroup *cg = user_data;
+
 	switch (addr->reason) {
 		case CPG_REASON_LEAVE:
-			/* XXX Handle leave */
+			log_debug("Node %d leaves group %.*s",
+				  addr->nodeid, cg->cg_name.length,
+				  cg->cg_name.value);
+			pop_node(cg, addr->nodeid);
 			break;
 
 		case CPG_REASON_NODEDOWN:
@@ -133,17 +266,16 @@ static void handle_node_leave(struct cpg_address *addr,
 
 static void group_change(struct cgroup *cg)
 {
-	log_debug("group %s confchg: members %d, left %d, joined %d",
-		  cg->cg_name.value, cg->cg_cb_member_count,
-		  cg->cg_cb_left_count, cg->cg_cb_joined_count);
+	log_debug("group %.*s confchg: members %d, left %d, joined %d",
+		  cg->cg_name.length, cg->cg_name.value,	/* %.*s: print at most cg_name.length bytes */
+		  cg->cg_cb_member_count, cg->cg_cb_left_count,
+		  cg->cg_cb_joined_count);
 
-#if 0
 	for_each_node_list(cg->cg_cb_joined, cg->cg_cb_joined_count,
-			   handle_node_join, NULL);
-#endif
+			   handle_node_join, cg);	/* Joins are processed before leaves */
 
 	for_each_node_list(cg->cg_cb_left, cg->cg_cb_left_count,
-			   handle_node_leave, NULL);
+			   handle_node_leave, cg);	/* cg reaches the handler as user_data */
 }
 
 static void handle_daemon_left(struct cpg_address *addr,
@@ -197,13 +329,22 @@ static void handle_daemon_left(struct cpg_address *addr,
 static void handle_node_down(struct cpg_address *addr,
 			     void *user_data)
 {
+	struct list_head *p;
+	struct cgroup *cg;
+
 	if ((addr->reason != CPG_REASON_NODEDOWN) &&
 	    (addr->reason != CPG_REASON_PROCDOWN))
 		return;
 
 	log_debug("node down %d", addr->nodeid);
 
-	/* XXX For each mount group, process node down */
+	list_for_each(p, &group_list) {	/* Every group added by group_join() */
+		cg = list_entry(p, struct cgroup, cg_list);
+		if (find_node(cg, addr->nodeid)) {	/* Only groups tracking this node */
+			cg->cg_node_down(addr->nodeid, cg->cg_user_data);	/* Run the mounter's hook first */
+			pop_node(cg, addr->nodeid);	/* ...then drop the node from our view */
+		}
+	}
 }
 
 static void daemon_change(struct cgroup *cg)
@@ -254,18 +395,36 @@ static void process_configuration_change(struct cgroup *cg)
 
 static struct cgroup *client_to_group(int ci)
 {
+	struct list_head *p;
+	struct cgroup *cg;
+
 	if (ci == daemon_group.cg_ci)
 		return &daemon_group;
 
+	list_for_each(p, &group_list) {	/* Not the daemon group: search group_list by cg_ci */
+		cg = list_entry(p, struct cgroup, cg_list);
+		if (cg->cg_ci == ci)
+			return cg;
+	}
+
 	log_error("unknown client %d", ci);
 	return NULL;
 }
 
 static struct cgroup *handle_to_group(cpg_handle_t handle)
 {
+	struct list_head *p;
+	struct cgroup *cg;
+
 	if (handle == daemon_group.cg_handle)
 		return &daemon_group;
 
+	list_for_each(p, &group_list) {
+		cg = list_entry(p, struct cgroup, cg_list);
+		if (cg->cg_handle == handle)
+			return cg;
+	}
+
 	log_error("unknown handle %llu", (unsigned long long)handle);
 
 	return NULL;
@@ -434,10 +593,43 @@ out:
 	return error;
 }
 
+/*
+ * Create a cgroup called name with the caller's hooks and user_data, and
+ * pass it to init_group().  Returns 0, -ENOMEM, or init_group()'s error.
+ */
+int group_join(const char *name,
+	       void (*set_cgroup)(struct cgroup *cg, void *user_data),
+	       void (*node_down)(int nodeid, void *user_data),
+	       void *user_data)
+{
+	struct cgroup *group = calloc(1, sizeof(*group));
+	int err;
+
+	if (group == NULL) {
+		log_error("Unable to allocate cgroup structure");
+		return -ENOMEM;
+	}
+
+	INIT_LIST_HEAD(&group->cg_nodes);
+	group->cg_user_data = user_data;
+	group->cg_node_down = node_down;
+	group->cg_set_cgroup = set_cgroup;
+
+	err = init_group(group, name);
+	if (err) {
+		free(group);
+		return err;
+	}
+
+	list_add(&group->cg_list, &group_list);
+	return 0;
+}
+
 int setup_cpg(void)
 {
 	cpg_error_t error;
 
+	INIT_LIST_HEAD(&group_list);
 	error = init_group(&daemon_group, "ocfs2_controld");
 
 	return error;
diff --git a/ocfs2_controld/mount.c b/ocfs2_controld/mount.c
index d849cc8..c59752f 100644
--- a/ocfs2_controld/mount.c
+++ b/ocfs2_controld/mount.c
@@ -51,6 +51,9 @@ struct mountgroup {
 	int			mg_mount_fd;
 	int			mg_mount_notified;
 
+	/* Interaction with cpg.c */
+	struct cgroup		*mg_cg;
+
 	int			mg_error;
 	char			mg_error_msg[128];
 };
@@ -324,6 +327,113 @@ static void add_mountpoint(struct mountgroup *mg, const char *device,
 	list_add(&mp->mp_list, &mg->mg_mountpoints);
 }
 
+/*
+ * Our join completed and cg is the joined group.  Sanity-check mg's
+ * state, then record cg in mg->mg_cg and notify the mount client.
+ */
+static void finish_join(struct mountgroup *mg, struct cgroup *cg)
+{
+	struct mountpoint *pending = mg->mg_mp_in_progress;
+
+	if (mg->mg_cg != NULL) {
+		log_error("cgroup passed, but one already exists! (mg %s, existing %p, new %p)",
+			  mg->mg_uuid, mg->mg_cg, cg);
+		return;
+	}
+
+	if (pending == NULL) {
+		log_error("No mountpoint in progress for mountgroup %s",
+			  mg->mg_uuid);
+		return;
+	}
+
+	if (list_empty(&pending->mp_list)) {
+		/* XXX Start leave instead when mg_leave_on_join is set */
+		if (!mg->mg_leave_on_join)
+			log_error("mountgroup %s is in the process of leaving, not joining",
+				  mg->mg_uuid);
+		return;
+	}
+
+	if (list_empty(&mg->mg_mountpoints)) {
+		log_error("No mountpoints on mountgroup %s", mg->mg_uuid);
+		return;
+	}
+
+	mg->mg_cg = cg;		/* Ok, we've successfully joined the group */
+	notify_mount_client(mg);
+}
+
+/*
+ * We are out of the group.  If that was requested (no mountpoints left
+ * and one in progress), notify the client and free mg; otherwise log it.
+ */
+static void finish_leave(struct mountgroup *mg)
+{
+	if (!list_empty(&mg->mg_mountpoints) || !mg->mg_mp_in_progress) {
+		/* This leave is unexpected */
+		log_error("Unexpected leave of group %s", mg->mg_uuid);
+		if (mg->mg_cg == NULL)
+			log_error("No mg_cg for group %s", mg->mg_uuid);
+		/* XXX Do dire things */
+		return;
+	}
+
+	notify_mount_client(mg);	/* We're done */
+
+	/* This is possible due to leave_on_join */
+	if (mg->mg_cg == NULL)
+		log_debug("mg_cg was NULL");
+
+	free(mg->mg_mp_in_progress);
+	list_del(&mg->mg_list);
+	free(mg);
+}
+
+/*
+ * Membership hook handed to group_join(): cg is non-NULL when we have
+ * joined, NULL when we are out of the group.  Three states are possible.
+ *
+ * 1) We've asked to join a group for a new filesystem.
+ *    - mg_mp_in_progress != NULL
+ *    - length(mg_mountpoints) == 1
+ *    - mg_cg == NULL
+ *
+ *    cg will be our now-joined group.
+ *
+ * 2) We've asked to leave a group upon the last unmount of a filesystem.
+ *    - mg_mp_in_progress != NULL
+ *    - mg_mountpoints is empty
+ *    - mg_cg is only NULL if we had to set leave_on_join.
+ *
+ *    cg is NULL.  We should complete our leave.
+ *
+ * 3) We've dropped out of the group unexpectedly.
+ *    - mg_mountpoints is not empty.
+ *    - mg_cg != NULL
+ *
+ *    cg is NULL.  We should basically crash.  This usually is handled by
+ *    closing our sysfs fd.
+ */
+static void mount_set_group(struct cgroup *cg, void *user_data)
+{
+	struct mountgroup *group = user_data;
+
+	if (cg == NULL)
+		finish_leave(group);
+	else
+		finish_join(group, cg);
+}
+
+/* node_down hook handed to group_join(); for now it only logs the event. */
+static void mount_node_down(int nodeid, void *user_data)
+{
+	struct mountgroup *group = user_data;
+
+	log_debug("Node %d has left mountgroup %s", nodeid, group->mg_uuid);
+	/* XXX Write to sysfs */
+}
+
 int start_mount(int ci, int fd, const char *uuid, const char *device,
 		const char *mountpoint)
 {
@@ -360,15 +470,29 @@ int start_mount(int ci, int fd, const char *uuid, const char *device,
 	if (mg->mg_error)
 		goto out;
 
-	/* XXX This is where we do the asynchronous join
-	 *
-	 * Here we fire off a group join.  The cpg infrastructure will
+	/*
+	 * Fire off a group join.  The cpg infrastructure will
 	 * let us know when the group is joined, at which point we
 	 * notify_mount_client().  If there's a failure, we notify as well.
-	 *
-	 * XXX: For now, let's pretend :-)
 	 */
-	notify_mount_client(mg);
+	rc = group_join(mg->mg_uuid, mount_set_group, mount_node_down, mg);
+	if (rc) {
+		fill_error(mg, -rc, "Unable to start join to group %s",
+			   mg->mg_uuid);
+
+		/*
+		 * Because we never started a join, mg->mg_cg is NULL.
+		 * remove_mountpoint() will set up for leave_on_join, but
+		 * that actually never happens.  Thus, it is safe to
+		 * clear mp_in_progress.
+		 */
+		remove_mountpoint(mg, mountpoint);
+		if (mg->mg_mp_in_progress) {
+			free(mg->mg_mp_in_progress);
+			mg->mg_mp_in_progress = NULL;
+		} else
+			log_error("First mount of %s failed a join, yet mp_in_progress was NULL", mg->mg_uuid);
+	}
 
 out:
 	/*
diff --git a/ocfs2_controld/ocfs2_controld.h b/ocfs2_controld/ocfs2_controld.h
index fed98f9..e70e298 100644
--- a/ocfs2_controld/ocfs2_controld.h
+++ b/ocfs2_controld/ocfs2_controld.h
@@ -74,6 +74,10 @@ void for_each_node(struct cgroup *cg,
 		   void (*func)(int nodeid,
 				void *user_data),
 		   void *user_data);
+int group_join(const char *name,
+	       void (*set_cgroup)(struct cgroup *cg, void *user_data),
+	       void (*node_down)(int nodeid, void *user_data),
+	       void *user_data);
 
 /* mount.c */
 void init_mounts(void);
-- 
1.5.3.8