Skip to content

Commit a0d71bf

Browse files
fixes config drift metric persistence
1 parent 1881dab commit a0d71bf

File tree

1 file changed

+39
-1
lines changed

1 file changed

+39
-1
lines changed

pkg/daemon/daemon.go

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,10 @@ const (
246246
// Where nmstate writes the link files if it persisted ifnames.
247247
// https://github.com/nmstate/nmstate/blob/03c7b03bd4c9b0067d3811dbbf72635201519356/rust/src/cli/persist_nic.rs#L32-L36
248248
systemdNetworkDir = "etc/systemd/network"
249+
250+
// Config drift error message fragments used to identify drift-related degradation
251+
configDriftContentMismatch = "content mismatch"
252+
configDriftModeMismatch = "mode mismatch"
249253
)
250254

251255
type onceFromOrigin int
@@ -1522,8 +1526,37 @@ func (dn *Daemon) getCurrentConfigFromNode() (*onDiskConfig, error) {
15221526
return tempConfig, nil
15231527
}
15241528

1525-
func (dn *Daemon) startConfigDriftMonitor() {
1529+
// initConfigDriftMetric initializes the config drift metric based on the node's current state.
1530+
// If the node is Degraded due to config drift, set the metric to current time to indicate ongoing drift.
1531+
// Otherwise, clear the metric.
1532+
func (dn *Daemon) initConfigDriftMetric() {
1533+
var reason string
1534+
1535+
state, err := getNodeAnnotationExt(dn.node, constants.MachineConfigDaemonStateAnnotationKey, true)
1536+
if err != nil {
1537+
klog.Warningf("Could not get node state when initializing config drift metric: %v", err)
1538+
goto clearMetric
1539+
}
1540+
if state != constants.MachineConfigDaemonStateDegraded {
1541+
goto clearMetric
1542+
}
1543+
1544+
reason, err = getNodeAnnotationExt(dn.node, constants.MachineConfigDaemonReasonAnnotationKey, true)
1545+
if err != nil {
1546+
klog.Warningf("Could not get node reason when initializing config drift metric: %v", err)
1547+
goto clearMetric
1548+
}
1549+
if strings.Contains(reason, configDriftContentMismatch) || strings.Contains(reason, configDriftModeMismatch) {
1550+
mcdConfigDrift.SetToCurrentTime()
1551+
klog.Infof("Config drift metric initialized: node is degraded due to config drift")
1552+
return
1553+
}
1554+
1555+
clearMetric:
15261556
mcdConfigDrift.Set(0)
1557+
}
1558+
1559+
func (dn *Daemon) startConfigDriftMonitor() {
15271560
// Even though the Config Drift Monitor object ensures that only a single
15281561
// Config Drift Watcher is running at any given time, other things, such as
15291562
// emitting Kube events on startup, should only occur if we weren't
@@ -1533,6 +1566,8 @@ func (dn *Daemon) startConfigDriftMonitor() {
15331566
return
15341567
}
15351568

1569+
dn.initConfigDriftMetric()
1570+
15361571
odc, err := dn.getCurrentConfigOnDisk()
15371572
if err != nil && !os.IsNotExist(err) {
15381573
dn.exitCh <- fmt.Errorf("could not get current config from disk: %w", err)
@@ -2291,6 +2326,9 @@ func (dn *Daemon) checkStateOnFirstRun() error {
22912326

22922327
if err := dn.validateOnDiskStateOrImage(state.currentConfig, state.currentImage); err != nil {
22932328
dn.nodeWriter.Eventf(corev1.EventTypeWarning, "OnDiskStateValidationFailed", err.Error())
2329+
// Start the config drift monitor even when there's pre-existing drift
2330+
// so the metric gets initialized correctly on MCD restart
2331+
dn.startConfigDriftMonitor()
22942332
return err
22952333
}
22962334

0 commit comments

Comments
 (0)