Overhaul the Pacemaker hooks for ocfs2_controld - Reduce the amount of custom code by reusing more code from the Pacemaker libraries - Escalate fencing to the cluster manager instead of initiating it directly Signed-off-by: Andrew Beekhof diff --git a/ocfs2_controld/pacemaker.c b/ocfs2_controld/pacemaker.c index 53eacff..40ed0ee 100644 --- a/ocfs2_controld/pacemaker.c +++ b/ocfs2_controld/pacemaker.c @@ -23,14 +23,18 @@ #include #include + /* heartbeat support is irrelevant here */ #undef SUPPORT_HEARTBEAT #define SUPPORT_HEARTBEAT 0 #include #include +#include #include -#include +#include +#include +#include #include "ocfs2-kernel/kernel-list.h" #include "o2cb/o2cb.h" @@ -39,6 +43,8 @@ #include +#define log_printf(level, format, args...) syslog(level, "%s:%d " format "\n", __FILE__, __LINE__, ##args) + int our_nodeid = 0; static int pcmk_ci; static int stonithd_ci; @@ -49,29 +55,57 @@ const char *stackname = "pcmk"; extern int ais_fd_async; char *local_node_uname = NULL; -int kill_stack_node(int nodeid) -{ - int error = 1; - stonith_ops_t st_op; - char *target = nodeid2name(nodeid); - - log_debug("killing node %d (aka. %s)", nodeid, target); - - if(target) { - st_op.timeout = 150; - st_op.node_uuid = NULL; - st_op.private_data = NULL; - st_op.node_name = target; - st_op.optype = POWEROFF; +static IPC_Channel *attrd = NULL; - error = stonithd_node_fence(&st_op); - } - - if (error) - log_debug("Unable to kill node %d, %d %d", nodeid, error, - errno); +static void attrd_deadfn(int ci) +{ + log_printf(LOG_ERR, "Lost connection to attrd"); + attrd = NULL; + return; +} - return error; +int kill_stack_node(int nodeid) +{ + gboolean rc = FALSE; + xmlNode *update = NULL; + time_t now = time(NULL); + crm_node_t *node = crm_get_peer(nodeid, NULL); + + if(node == NULL || node->uname == NULL) { + log_printf(LOG_ERR, "%s: Don't know how to kick node %d/%p", __FUNCTION__, nodeid, node); + return -1; + } + + if(attrd == NULL) { + log_printf(LOG_INFO, "Connecting to attrd..."); + attrd = init_client_ipc_comms_nodispatch(T_ATTRD); + if(attrd) { + connection_add(attrd->ops->get_recv_select_fd(attrd), NULL, attrd_deadfn); + } + } + + if(attrd != NULL) { + update = create_xml_node(NULL, __FUNCTION__); + crm_xml_add(update, F_TYPE, T_ATTRD); + crm_xml_add(update, F_ORIG, crm_system_name); + + crm_xml_add(update, F_ATTRD_TASK, "update"); + crm_xml_add(update, F_ATTRD_SECTION, XML_CIB_TAG_STATUS); + crm_xml_add(update, F_ATTRD_ATTRIBUTE, "terminate"); + crm_xml_add_int(update, F_ATTRD_VALUE, now); + crm_xml_add(update, F_ATTRD_HOST, node->uname); + + rc = send_ipc_message(attrd, update); + free_xml(update); + } + + if(rc) { + log_printf(LOG_INFO, "Requested that node %d/%s be kicked from the cluster", nodeid, node->uname); + return 1; + } + + log_printf(LOG_ERR, "Could not kick node %d/%s from the cluster", nodeid, node->uname); + return 0; } char *nodeid2name(int nodeid) { @@ -86,7 +120,7 @@ char *nodeid2name(int nodeid) { int validate_cluster(const char *cluster) { if (!clustername) { - log_error("Trying to validate before pacemaker is alive"); + log_printf(LOG_ERR, "Trying to validate before pacemaker is alive"); return 0; } @@ -99,12 +133,12 @@ int validate_cluster(const char *cluster) int get_clustername(const char **cluster) { if (!clustername) { - log_error("Trying to validate before pacemaker is alive"); + log_printf(LOG_ERR, "Trying to validate before pacemaker is alive"); return -EIO; } if (!cluster) { - log_error("NULL passed!"); + log_printf(LOG_ERR, "NULL passed!"); return -EINVAL; } @@ -115,316 +149,36 @@ int get_clustername(const char **cluster) static void dead_pcmk(int ci) { if (ci != pcmk_ci) { - log_error("Unknown connection %d", ci); + log_printf(LOG_ERR, "Unknown connection %d", ci); return; } - log_error("pacemaker connection died"); + log_printf(LOG_ERR, "pacemaker connection died"); shutdown_daemon(); connection_dead(ci); } +extern void terminate_ais_connection(void); + void exit_stack(void) { - log_debug("closing stonithd connection"); - stonithd_signoff(); - log_debug("closing pacemaker connection"); - if (ais_fd_async) { - close(ais_fd_async); - ais_fd_async = 0; - } - if (ais_fd_sync) { - close(ais_fd_sync); - ais_fd_sync = 0; - } + terminate_ais_connection(); } static void process_pcmk(int ci) { - /* ci ::= client number */ - char *data = NULL; - char *uncompressed = NULL; - AIS_Message *msg = NULL; - SaAisErrorT rc = SA_AIS_OK; - mar_res_header_t *header = NULL; - static int header_len = sizeof(mar_res_header_t); - - header = malloc(header_len); - memset(header, 0, header_len); - - errno = 0; - rc = saRecvRetry(ais_fd_async, header, header_len); - if (rc != SA_AIS_OK) { - cl_perror("Receiving message header failed: (%d) %s", rc, - ais_error2text(rc)); - goto bail; - } else if(header->size == header_len) { - log_error("Empty message: id=%d, size=%d, error=%d, header_len=%d", - header->id, header->size, header->error, header_len); - goto done; - } else if(header->size == 0 || header->size < header_len) { - log_error("Mangled header: size=%d, header=%d, error=%d", - header->size, header_len, header->error); - goto done; - } else if(header->error != 0) { - log_error("Header contined error: %d", header->error); - } - - header = realloc(header, header->size); - /* Use a char* so we can store the remainder into an offset */ - data = (char*)header; - - errno = 0; - rc = saRecvRetry(ais_fd_async, data+header_len, header->size - header_len); - msg = (AIS_Message*)data; - - if (rc != SA_AIS_OK) { - cl_perror("Receiving message body failed: (%d) %s", rc, ais_error2text(rc)); - goto bail; - } - - data = msg->data; - if(msg->is_compressed && msg->size > 0) { - int rc = BZ_OK; - unsigned int new_size = msg->size; - - if (check_message_sanity(msg, NULL) == FALSE) - goto badmsg; - - log_debug("Decompressing message data"); - uncompressed = malloc(new_size); - memset(uncompressed, 0, new_size); - - rc = BZ2_bzBuffToBuffDecompress( - uncompressed, &new_size, data, msg->compressed_size, - 1, 0); - - if(rc != BZ_OK) { - log_error("Decompression failed: %d", rc); - goto badmsg; - } - - CRM_ASSERT(rc == BZ_OK); - CRM_ASSERT(new_size == msg->size); - - data = uncompressed; - - } else if(check_message_sanity(msg, data) == FALSE) { - goto badmsg; - - } else if(safe_str_eq("identify", data)) { - int pid = getpid(); - char *pid_s = crm_itoa(pid); - - send_ais_text(0, pid_s, TRUE, NULL, crm_msg_ais); - crm_free(pid_s); - goto done; - } - - if (msg->header.id == crm_class_members) { - xmlNode *xml = string2xml(data); - - if(xml != NULL) { - const char *value = crm_element_value(xml, "id"); - if(value) - crm_peer_seq = crm_int_helper(value, NULL); - - log_debug("Updating membership %llu", crm_peer_seq); - /* crm_log_xml_info(xml, __PRETTY_FUNCTION__); */ - xml_child_iter(xml, node, crm_update_ais_node(node, crm_peer_seq)); - crm_calculate_quorum(); - free_xml(xml); - } else { - log_error("Invalid peer update: %s", data); - } - } else { - log_error("Unexpected AIS message type: %d", msg->header.id); - } - -done: - free(uncompressed); - free(msg); - return; - -badmsg: - log_error("Invalid message (id=%d, dest=%s:%s, from=%s:%s.%d):" - " min=%d, total=%d, size=%d, bz2_size=%d", - msg->id, ais_dest(&(msg->host)), msg_type2text(msg->host.type), - ais_dest(&(msg->sender)), msg_type2text(msg->sender.type), - msg->sender.pid, (int)sizeof(AIS_Message), - msg->header.size, msg->size, msg->compressed_size); - free(uncompressed); - free(msg); - return; - -bail: - log_error("AIS connection failed"); - return; -} - -static void dead_stonithd(int ci) -{ - if (ci != stonithd_ci) { - log_error("Unknown connection %d", ci); - return; - } - - log_error("stonithd connection died"); - shutdown_daemon(); - connection_dead(ci); -} - -static void process_stonithd(int ci) -{ - IPC_Channel *stonithd_ch = stonithd_input_IPC_channel(); - - while (stonithd_op_result_ready()) { - if (stonithd_ch->ch_status != IPC_CONNECT) { - /* The message which was pending for us is that - * the IPC status is now IPC_DISCONNECT */ - break; - } - - if (ST_FAIL == stonithd_receive_ops_result(FALSE)) { - log_error("stonithd_receive_ops_result() failed"); - } - } - - if (stonithd_ch->ch_status != IPC_CONNECT) - dead_stonithd(stonithd_ci); -} - -static void result_stonithd(stonith_ops_t *op) -{ - if (op == NULL) { - log_error("Called with a NULL op!"); - return; - } - - log_debug("Stonithd result: call=%d, optype=%d, node_name=%s, result=%d, node_list=%s, action=%s", - op->call_id, op->optype, op->node_name, op->op_result, - (char *)op->node_list, op->private_data); - - switch(op->op_result) { - case STONITH_SUCCEEDED: - break; - case STONITH_CANNOT: - case STONITH_TIMEOUT: - case STONITH_GENERIC: - log_error("Stonith of %s failed (%d)", - op->node_name, op->op_result); - break; - default: - log_error("Unsupported action result: %d", op->op_result); - } -} - -static gboolean setup_stonith(void) -{ - int lpc = 0; - int rc = ST_OK; - int stonithd_fd; - const char *reason = NULL; - IPC_Channel *stonithd_ch = NULL; - - for(lpc = 0; lpc < 30; lpc++) { - log_debug("Attempting connection to fencing daemon..."); - - sleep(1); - rc = stonithd_signon("ocfs2-tools"); - if(rc == ST_OK) - break; - - log_error("Sign-in failed: pausing and trying again in 2s..."); - sleep(1); - } - - if(rc != ST_OK) { - reason = "Sign-in failed"; - goto bail; - } - - rc = stonithd_set_stonith_ops_callback(result_stonithd); - if(rc != ST_OK) { - reason = "Setup failed"; - goto bail; - } - - stonithd_ch = stonithd_input_IPC_channel(); - if(stonithd_ch == NULL) { - reason = "No connection"; - goto bail; - } - stonithd_fd = stonithd_ch->ops->get_recv_select_fd(stonithd_ch); - if(stonithd_ch <= 0) { - reason = "No fd"; - goto bail; - } - - stonithd_ci = connection_add(stonithd_fd, process_stonithd, - dead_stonithd); - if (stonithd_ci < 0) { - log_error("Unable to add stonithd client: %s", - strerror(-stonithd_ci)); - goto bail; - } - - return TRUE; - -bail: - log_error("Unable to add stonithd client: %s", reason); - return FALSE; + ais_dispatch(ais_fd_async, NULL); } int setup_stack(void) { - int retries = 0; - int pid; - char *pid_s; - int rc = SA_AIS_OK; - struct utsname name; - crm_log_init("ocfs2_controld", LOG_INFO, FALSE, TRUE, 0, NULL); - crm_peer_init(); - if (local_node_uname == NULL) { - if (uname(&name) < 0) { - cl_perror("uname(2) call failed"); - exit(100); + if(init_ais_connection(NULL, NULL, NULL, &local_node_uname, &our_nodeid) == FALSE) { + log_printf(LOG_ERR, "Connection to our AIS plugin (%d) failed", CRM_SERVICE); + return -1; } - local_node_uname = crm_strdup(name.nodename); - log_debug("Local node name: %s", local_node_uname); - } - -retry: - log_debug("Creating connection to our AIS plugin"); - rc = saServiceConnect (&ais_fd_sync, &ais_fd_async, CRM_SERVICE); - if (rc != SA_AIS_OK) - log_error("Connection to our AIS plugin (%d) failed: %s (%d)", - CRM_SERVICE, ais_error2text(rc), rc); - - switch(rc) { - case SA_AIS_OK: - break; - case SA_AIS_ERR_TRY_AGAIN: - if(retries < 30) { - sleep(1); - retries++; - goto retry; - } - log_error("Retry count exceeded"); - return 0; - default: - return 0; - } - - log_debug("AIS connection established"); - - pid = getpid(); - pid_s = crm_itoa(pid); - send_ais_text(0, pid_s, TRUE, NULL, crm_msg_ais); - crm_free(pid_s); /* Sign up for membership updates */ send_ais_text(crm_class_notify, "true", TRUE, NULL, crm_msg_ais); @@ -432,14 +186,11 @@ retry: /* Requesting the current list of known nodes */ send_ais_text(crm_class_members, __FUNCTION__, TRUE, NULL, crm_msg_ais); - our_nodeid = get_ais_nodeid(); - log_debug("Local node id: %d", our_nodeid); - pcmk_ci = connection_add(ais_fd_async, process_pcmk, dead_pcmk); - if (pcmk_ci >= 0 && setup_stonith()) + if (pcmk_ci >= 0) return ais_fd_async; - log_error("Unable to add pacemaker client: %s", strerror(-pcmk_ci)); + log_printf(LOG_ERR, "Unable to add pacemaker client: %s", strerror(-pcmk_ci)); exit_stack(); return pcmk_ci; }