Skip to content

Commit 0aac809

Browse files
committed
Fix: sbd watchdog rebooting upon restart of pacemaker-remote
1 parent 681ce1a commit 0aac809

File tree

3 files changed

+31
-6
lines changed

3 files changed

+31
-6
lines changed

src/sbd-cluster.c

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,15 @@ static crm_cluster_t cluster;
5151
static gboolean sbd_remote_check(gpointer user_data);
5252
static long unsigned int find_pacemaker_remote(void);
5353
static void sbd_membership_destroy(gpointer user_data);
54+
static bool wait_for_pacemaker_remote_lost = false;
5455

56+
static void signal_exitreq(void)
57+
{
58+
union sigval signal_value;
59+
pid_t ppid = getppid();
60+
61+
sigqueue(ppid, SIG_EXITREQ, signal_value);
62+
}
5563

5664
#if SUPPORT_PLUGIN
5765
static void
@@ -459,6 +467,10 @@ sbd_remote_check(gpointer user_data)
459467
set_servant_health(pcmk_health_online, LOG_INFO,
460468
"Connected to Pacemaker Remote %lu", (long unsigned int)remoted_pid);
461469
} else {
470+
if (wait_for_pacemaker_remote_lost) {
471+
signal_exitreq();
472+
return true;
473+
}
462474
set_servant_health(pcmk_health_unclean, LOG_WARNING,
463475
"Connection to Pacemaker Remote %lu lost", (long unsigned int)remoted_pid);
464476
}
@@ -520,6 +532,16 @@ cluster_shutdown(int nsig)
520532
clean_up(0);
521533
}
522534

535+
static void
536+
trigger_wait_for_pacemaker_remote_lost(int nsig)
537+
{
538+
/* if we've never seen pacemaker_remoted request exit immeditely */
539+
if ((remoted_pid <= 0) || !remote_node) {
540+
signal_exitreq();
541+
}
542+
wait_for_pacemaker_remote_lost = true;
543+
}
544+
523545
int
524546
servant_cluster(const char *diskname, int mode, const void* argp)
525547
{
@@ -539,6 +561,7 @@ servant_cluster(const char *diskname, int mode, const void* argp)
539561

540562
mainloop_add_signal(SIGTERM, cluster_shutdown);
541563
mainloop_add_signal(SIGINT, cluster_shutdown);
564+
mainloop_add_signal(SIGUSR2, trigger_wait_for_pacemaker_remote_lost);
542565

543566
g_main_run(mainloop);
544567
g_main_destroy(mainloop);

src/sbd-inquisitor.c

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -177,14 +177,14 @@ void servants_start(void)
177177
}
178178
}
179179

180-
void servants_kill(void)
180+
void servants_kill(int sig)
181181
{
182182
struct servants_list_item *s;
183183
union sigval svalue;
184184

185185
for (s = servants_leader; s; s = s->next) {
186186
if (s->pid != 0)
187-
sigqueue(s->pid, SIGKILL, svalue);
187+
sigqueue(s->pid, sig, svalue);
188188
}
189189
}
190190

@@ -465,7 +465,7 @@ void inquisitor_child(void)
465465
clock_gettime(CLOCK_MONOTONIC, &t_now);
466466

467467
if (sig == SIG_EXITREQ || sig == SIGTERM) {
468-
servants_kill();
468+
servants_kill(SIGKILL);
469469
watchdog_close(true);
470470
exiting = 1;
471471
} else if (sig == SIGCHLD) {
@@ -523,6 +523,8 @@ void inquisitor_child(void)
523523
if (exiting)
524524
continue;
525525
servants_start();
526+
} else if (sig == SIGUSR2) {
527+
servants_kill(SIGUSR2);
526528
}
527529

528530
if (exiting) {
@@ -631,7 +633,7 @@ void inquisitor_child(void)
631633
*/
632634
cl_log(LOG_DEBUG, "Decoupling");
633635
if (inquisitor_decouple() < 0) {
634-
servants_kill();
636+
servants_kill(SIGKILL);
635637
exiting = 1;
636638
continue;
637639
} else {
@@ -647,7 +649,7 @@ void inquisitor_child(void)
647649
/* We're still being watched by our
648650
* parent. We don't fence, but exit. */
649651
cl_log(LOG_ERR, "SBD: Not enough votes to proceed. Aborting start-up.");
650-
servants_kill();
652+
servants_kill(SIGKILL);
651653
exiting = 1;
652654
continue;
653655
}

src/sbd_remote.service.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ Type=forking
1010
PIDFile=@localstatedir@/run/sbd.pid
1111
EnvironmentFile=-@CONFIGDIR@/sbd
1212
ExecStart=@sbindir@/sbd $SBD_OPTS -p @localstatedir@/run/sbd.pid watch
13-
ExecStop=@bindir@/kill -TERM $MAINPID
13+
ExecStop=@bindir@/kill -USR2 $MAINPID
1414

1515
# Could this benefit from exit codes for restart?
1616
# Does this need to be set to msgwait * 1.2?

0 commit comments

Comments
 (0)