Skip to content

Commit 46a9b12

Browse files
committed
Abort HA Realization Logic After Timeout
A strange HA behavior was reported in [icingadb-787], resulting in both instances being active. The logs contained an entry of the previous active instance exiting the HA.realize() method successfully after 1m9s. This, however, should not be possible as the method's context is deadlined to a minute after the heartbeat was received. With introducing Settings.QuickContextExit for retry.WithBackoff in [icinga-go-library-69] and using it here, the function directly returns a context.DeadlineExceeded error the moment the context has expired. Doing so allows directly handing over, while the other instance can now take over due to the expired heartbeat in the database. As a related change, the HA.insertEnvironment() method was inlined into the retryable function to use the deadlined context. Otherwise, this might block afterwards, as it was used within HA.realize(), but without the passed context. In addition, the main loop select cases for hactx.Done() and ctx.Done() were unified, as hactx is a derived ctx. A closed ctx case may be lost as the hactx case could have been chosen. [icinga-go-library-69]: Icinga/icinga-go-library#69 [icingadb-787]: #787
1 parent 77ccab2 commit 46a9b12

File tree

2 files changed

+18
-25
lines changed

2 files changed

+18
-25
lines changed

cmd/icingadb/main.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -325,7 +325,10 @@ func run() int {
325325

326326
cancelHactx()
327327
case <-hactx.Done():
328-
// Nothing to do here, surrounding loop will terminate now.
328+
if ctx.Err() != nil {
329+
logger.Fatalf("%+v", errors.New("main context closed unexpectedly"))
330+
}
331+
// Otherwise, there is nothing to do here, surrounding loop will terminate now.
329332
case <-ha.Done():
330333
if err := ha.Err(); err != nil {
331334
logger.Fatalf("%+v", errors.Wrap(err, "HA exited with an error"))
@@ -337,8 +340,6 @@ func run() int {
337340
cancelHactx()
338341

339342
return ExitFailure
340-
case <-ctx.Done():
341-
logger.Fatalf("%+v", errors.New("main context closed unexpectedly"))
342343
case s := <-sig:
343344
logger.Infow("Exiting due to signal", zap.String("signal", s.String()))
344345
cancelHactx()

pkg/icingadb/ha.go

Lines changed: 14 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,7 @@ func (h *HA) controller() {
170170
}
171171
tt := t.Time()
172172
if tt.After(now.Add(1 * time.Second)) {
173-
h.logger.Debugw("Received heartbeat from the future", zap.Time("time", tt))
173+
h.logger.Warnw("Received heartbeat from the future", zap.Time("time", tt))
174174
}
175175
if tt.Before(now.Add(-1 * peerTimeout)) {
176176
h.logger.Errorw("Received heartbeat from the past", zap.Time("time", tt))
@@ -264,6 +264,9 @@ func (h *HA) controller() {
264264

265265
// realize a HA cycle triggered by a heartbeat event.
266266
//
267+
// The given context MUST have a deadline, as the method will otherwise panic. This deadline is hardly enforced to
268+
// cancel the logic if it has expired.
269+
//
267270
// shouldLogRoutineEvents indicates if recurrent events should be logged.
268271
func (h *HA) realize(
269272
ctx context.Context,
@@ -300,6 +303,7 @@ func (h *HA) realize(
300303
if errBegin != nil {
301304
return errors.Wrap(errBegin, "can't start transaction")
302305
}
306+
defer func() { _ = tx.Rollback() }()
303307

304308
query := h.db.Rebind("SELECT id, heartbeat FROM icingadb_instance "+
305309
"WHERE environment_id = ? AND responsible = ? AND id <> ?") + selectLock
@@ -370,9 +374,14 @@ func (h *HA) realize(
370374

371375
if takeover != "" {
372376
stmt := h.db.Rebind("UPDATE icingadb_instance SET responsible = ? WHERE environment_id = ? AND id <> ?")
373-
_, err := tx.ExecContext(ctx, stmt, "n", envId, h.instanceId)
377+
if _, err := tx.ExecContext(ctx, stmt, "n", envId, h.instanceId); err != nil {
378+
return database.CantPerformQuery(err, stmt)
379+
}
374380

375-
if err != nil {
381+
// Insert the environment after each heartbeat takeover if it does not already exist in the database
382+
// as the environment may have changed, although this is likely to happen very rarely.
383+
stmt, _ = h.db.BuildInsertIgnoreStmt(h.environment)
384+
if _, err := h.db.NamedExecContext(ctx, stmt, h.environment); err != nil {
376385
return database.CantPerformQuery(err, stmt)
377386
}
378387
}
@@ -386,7 +395,7 @@ func (h *HA) realize(
386395
retry.Retryable,
387396
backoff.NewExponentialWithJitter(256*time.Millisecond, 3*time.Second),
388397
retry.Settings{
389-
// Intentionally no timeout is set, as we use a context with a deadline.
398+
// Intentionally, no timeout is set because a context with a deadline is used and QuickContextExit is set.
390399
OnRetryableError: func(_ time.Duration, attempt uint64, err, lastErr error) {
391400
if lastErr == nil || err.Error() != lastErr.Error() {
392401
log := h.logger.Debugw
@@ -413,19 +422,14 @@ func (h *HA) realize(
413422
zap.NamedError("recovered_error", lastErr))
414423
}
415424
},
425+
QuickContextExit: true,
416426
},
417427
)
418428
if err != nil {
419429
return err
420430
}
421431

422432
if takeover != "" {
423-
// Insert the environment after each heartbeat takeover if it does not already exist in the database
424-
// as the environment may have changed, although this is likely to happen very rarely.
425-
if err := h.insertEnvironment(); err != nil {
426-
return errors.Wrap(err, "can't insert environment")
427-
}
428-
429433
h.signalTakeover(takeover)
430434
} else if otherResponsible {
431435
if state, _ := h.state.Load(); !state.otherResponsible {
@@ -445,18 +449,6 @@ func (h *HA) realizeLostHeartbeat() {
445449
}
446450
}
447451

448-
// insertEnvironment inserts the environment from the specified state into the database if it does not already exist.
449-
func (h *HA) insertEnvironment() error {
450-
// Instead of checking whether the environment already exists, use an INSERT statement that does nothing if it does.
451-
stmt, _ := h.db.BuildInsertIgnoreStmt(h.environment)
452-
453-
if _, err := h.db.NamedExecContext(h.ctx, stmt, h.environment); err != nil {
454-
return database.CantPerformQuery(err, stmt)
455-
}
456-
457-
return nil
458-
}
459-
460452
func (h *HA) removeInstance(ctx context.Context) {
461453
h.logger.Debugw("Removing our row from icingadb_instance", zap.String("instance_id", hex.EncodeToString(h.instanceId)))
462454
// Intentionally not using h.ctx here as it's already cancelled.

0 commit comments

Comments
 (0)