Skip to content

Commit c2d8bd6

Browse files
committed
HA/Heartbeat: Use last message's timestamp
Since the retryable HA function may be executed a few times before succeeding, the inserted heartbeat value will be directly outdated. The heartbeat logic was slightly altered to always use the latest heartbeat time value.
1 parent dd0ca8f commit c2d8bd6

File tree

2 files changed

+21
-3
lines changed

2 files changed

+21
-3
lines changed

pkg/icingadb/ha.go

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,7 @@ func (h *HA) controller() {
221221

222222
// Ensure that updating/inserting the instance row is completed by the current heartbeat's expiry time.
223223
realizeCtx, cancelRealizeCtx := context.WithDeadline(h.ctx, m.ExpiryTime())
224-
err = h.realize(realizeCtx, s, t, envId, shouldLogRoutineEvents)
224+
err = h.realize(realizeCtx, s, envId, shouldLogRoutineEvents)
225225
cancelRealizeCtx()
226226
if errors.Is(err, context.DeadlineExceeded) {
227227
h.signalHandover("instance update/insert deadline exceeded heartbeat expiry time")
@@ -268,10 +268,12 @@ func (h *HA) controller() {
268268
// realize a HA cycle triggered by a heartbeat event.
269269
//
270270
// shouldLogRoutineEvents indicates if recurrent events should be logged.
271+
//
272+
// The internal, retryable function always fetches the last received heartbeat's timestamp instead of reusing the one
273+
// from the calling controller loop. Doing so results in inserting a more accurate timestamp if a retry happens.
271274
func (h *HA) realize(
272275
ctx context.Context,
273276
s *icingaredisv1.IcingaStatus,
274-
t *types.UnixMilli,
275277
envId types.Binary,
276278
shouldLogRoutineEvents bool,
277279
) error {
@@ -354,7 +356,7 @@ func (h *HA) realize(
354356
EnvironmentMeta: v1.EnvironmentMeta{
355357
EnvironmentId: envId,
356358
},
357-
Heartbeat: *t,
359+
Heartbeat: types.UnixMilli(time.UnixMilli(h.heartbeat.LastMessageTime())),
358360
Responsible: types.Bool{Bool: takeover != "" || h.responsible, Valid: true},
359361
EndpointId: s.EndpointId,
360362
Icinga2Version: s.Version,

pkg/icingaredis/heartbeat.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ type Heartbeat struct {
2525
active bool
2626
events chan *HeartbeatMessage
2727
lastReceivedMs atomic.Int64
28+
lastMessageMs atomic.Int64
2829
cancelCtx context.CancelFunc
2930
client *redis.Client
3031
done chan struct{}
@@ -62,6 +63,11 @@ func (h *Heartbeat) LastReceived() int64 {
6263
return h.lastReceivedMs.Load()
6364
}
6465

66+
// LastMessageTime returns the last message's time in ms.
67+
func (h *Heartbeat) LastMessageTime() int64 {
68+
return h.lastMessageMs.Load()
69+
}
70+
6571
// Close stops the heartbeat controller loop, waits for it to finish, and returns an error if any.
6672
// Implements the io.Closer interface.
6773
func (h *Heartbeat) Close() error {
@@ -139,6 +145,15 @@ func (h *Heartbeat) controller(ctx context.Context) {
139145
}
140146

141147
h.lastReceivedMs.Store(m.received.UnixMilli())
148+
149+
statsT, err := m.stats.Time()
150+
if err != nil {
151+
h.logger.Warnw("Received Icinga heartbeat with invalid stats time", zap.Error(err))
152+
h.lastMessageMs.Store(0)
153+
} else {
154+
h.lastMessageMs.Store(statsT.Time().UnixMilli())
155+
}
156+
142157
h.sendEvent(m)
143158
case <-time.After(Timeout):
144159
if h.active {
@@ -150,6 +165,7 @@ func (h *Heartbeat) controller(ctx context.Context) {
150165
}
151166

152167
h.lastReceivedMs.Store(0)
168+
h.lastMessageMs.Store(0)
153169
case <-ctx.Done():
154170
return ctx.Err()
155171
}

0 commit comments

Comments
 (0)