[Ocfs2-tools-devel] [PATCH 2/2] o2hbmonitor: Limit number of active instances

Tristan Ye tristan.ye at oracle.com
Wed Dec 8 18:23:13 PST 2010


Sunil Mushran wrote:
> Tristan,
>
> Did you get a chance to play with this. I am wondering if I can get
> any confirmation whether this works for them or not. It is working
> on my box. kill -9 is trapped and the named semaphore is cleaned up.
> Wondering if someone else is seeing the same or not.
Hi Sunil,

    I'm just wondering how you can go with POSIX semaphores given the known
'SIGKILL' problem: they are simply incapable of undoing the locks after a
SIGKILL has been delivered.
Also, my testing did not support your patches either.

    As Joel said, System V semaphores have no trouble with this issue,
thanks to 'SEM_UNDO'.


>
> Sunil
>
> On 12/02/2010 07:15 PM, Tristan Ye wrote:
>> Sunil Mushran wrote:
>>> Patch attempts to disallow multiple instances of o2hbmonitor
>>> running at the same time.
>>>
>>> Signed-off-by: Sunil Mushran <sunil.mushran at oracle.com>
>>> ---
>>>  o2monitor/Makefile      |    2 +-
>>>  o2monitor/o2hbmonitor.c |   85 
>>> +++++++++++++++++++++++++++++++++++++++++++++++
>>>  2 files changed, 86 insertions(+), 1 deletions(-)
>>>
>>> diff --git a/o2monitor/Makefile b/o2monitor/Makefile
>>> index 961eafa..6392b96 100644
>>> --- a/o2monitor/Makefile
>>> +++ b/o2monitor/Makefile
>>> @@ -21,6 +21,6 @@ OBJS = $(subst .c,.o,$(CFILES))
>>>  DIST_FILES = $(CFILES) $(HFILES)
>>>
>>>  o2hbmonitor: $(OBJS)
>>> -    $(LINK)
>>> +    $(LINK) -lrt
>>>
>>>  include $(TOPDIR)/Postamble.make
>>> diff --git a/o2monitor/o2hbmonitor.c b/o2monitor/o2hbmonitor.c
>>> index 58e9280..44acb3a 100644
>>> --- a/o2monitor/o2hbmonitor.c
>>> +++ b/o2monitor/o2hbmonitor.c
>>> @@ -43,6 +43,9 @@
>>>  #include <libgen.h>
>>>  #include <syslog.h>
>>>  #include <errno.h>
>>> +#include <sys/ipc.h>
>>> +#include <semaphore.h>
>>> +#include <signal.h>
>>>
>>>  #define SYS_CONFIG_DIR            "/sys/kernel/config"
>>>  #define O2HB_CLUSTER_DIR        SYS_CONFIG_DIR"/cluster"
>>> @@ -71,6 +74,22 @@ unsigned long dead_threshold_in_ms;
>>>  unsigned long warn_threshold_in_ms;
>>>  unsigned long poll_in_secs;
>>>
>>> +sem_t *sem;
>>> +char sem_name[NAME_MAX - 4];
>>> +int sem_taken;
>>> +
>>> +static void handler(int sig)
>>> +{
>>> +    if (sem_taken) {
>>> +        sem_unlink(sem_name);
>>> +        sem_post(sem);
>>> +        sem_close(sem);
>>> +    }
>>> +
>>> +    syslog(LOG_INFO, "Exiting\n");
>>> +    exit(0);
>>> +}
>>> +
>>>  static void show_version(void)
>>>  {
>>>      fprintf(stderr, "%s %s\n", progname, VERSION);
>>> @@ -278,6 +297,8 @@ static void monitor(void)
>>>  {
>>>      int ret;
>>>
>>> +    syslog(LOG_INFO, "Starting\n");
>>> +
>>>      while (1) {
>>>          if (!is_cluster_up()) {
>>>              sleep(CONFIG_POLL_IN_SECS);
>>> @@ -300,6 +321,45 @@ static void monitor(void)
>>>      }
>>>  }
>>>
>>> +/* Returns -1 if already running, 0 if not, 1 if unknown */
>>> +static int is_already_running(void)
>>> +{
>>> +    int ret;
>>> +
>>> +    sem = sem_open(sem_name, O_CREAT, 0644, 1);
>>> +    if (sem <= 0) {
>>> +        fprintf(stderr, "%s\n", strerror(errno));
>>> +        return 1;
>>> +    }
>>> +
>>> +    ret = sem_trywait(sem);
>>> +    if (ret) {
>>> +        if (errno == EAGAIN)
>>> +            return -1;
>>> +        return 1;
>>> +    }
>>> +
>>> +    sem_taken = 1;
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static int setup_signals(void)
>>> +{
>>> +    int ret = 0;
>>> +
>>> +    struct sigaction act = { .sa_handler = handler };
>>> +
>>> +    sigemptyset(&act.sa_mask);
>>> +    ret = sigaction(SIGTERM, &act, NULL);
>>> +    ret += sigaction(SIGINT, &act, NULL);
>>> +    ret += sigaction(SIGHUP, &act, NULL);
>>> +    ret += sigaction(SIGQUIT, &act, NULL);
>>> +    ret += sigaction(SIGSEGV, &act, NULL);
>>
>>    Just wondering how this would behave against 'SIGKILL': in that
>> case, we won't be able to clean up the semaphore,
>>
>> and the next instance to run may still get an error such as: 'sorry,
>> we've already got one instance running...'
>>
>>> +
>>> +    return ret;
>>> +}
>>> +
>>>  static void usage(void)
>>>  {
>>>      fprintf(stderr, "usage: %s [-w percent] -[ivV]\n", progname);
>>> @@ -320,6 +380,9 @@ int main(int argc, char **argv)
>>>      warn_threshold_percent = WARN_THRESHOLD_PERCENT;
>>>      verbose = 0;
>>>      cluster_name = NULL;
>>> +    sem = NULL;
>>> +    sem_taken = 0;
>>> +    snprintf(sem_name, sizeof(sem_name), "/%s", progname);
>>>
>>>      while (1) {
>>>          c = getopt(argc, argv, "w:i?hvV");
>>> @@ -352,6 +415,27 @@ int main(int argc, char **argv)
>>>      if (version)
>>>          show_version();
>>>
>>> +    ret = setup_signals();
>>> +    if (ret) {
>>> +        fprintf(stderr, "Unable to set up signal handlers. %s. "
>>> +            "Aborting.\n", strerror(errno));
>>> +        goto bail;
>>> +    }
>>> +
>>> +    ret = is_already_running();
>>> +    switch (ret) {
>>> +    case -1:
>>> +        fprintf(stderr, "Another instance of %s is already running. "
>>> +            "Aborting.\n", progname);
>>> +        goto bail;
>>> +    case 1:
>>> +        fprintf(stderr, "Unable to determine if %s is already "
>>> +            "running. Starting a new instance.\n", progname);
>>> +    case 0:
>>> +    default:
>>> +        break;
>>> +    }
>>> +
>>>      if (!interactive) {
>>>          ret = daemon(0, verbose);
>>>          if (ret)
>>> @@ -363,5 +447,6 @@ int main(int argc, char **argv)
>>>      monitor();
>>>      closelog();
>>>
>>> +bail:
>>>      return 0;
>>>  }
>>
>




More information about the Ocfs2-tools-devel mailing list