@@ -246,6 +246,10 @@ const (
246246 // Where nmstate writes the link files if it persisted ifnames.
247247 // https://github.com/nmstate/nmstate/blob/03c7b03bd4c9b0067d3811dbbf72635201519356/rust/src/cli/persist_nic.rs#L32-L36
248248 systemdNetworkDir = "etc/systemd/network"
249+
250+ // Config drift error message fragments used to identify drift-related degradation
251+ configDriftContentMismatch = "content mismatch"
252+ configDriftModeMismatch = "mode mismatch"
249253)
250254
251255type onceFromOrigin int
@@ -1522,8 +1526,37 @@ func (dn *Daemon) getCurrentConfigFromNode() (*onDiskConfig, error) {
15221526 return tempConfig , nil
15231527}
15241528
1525- func (dn * Daemon ) startConfigDriftMonitor () {
1529+ // initConfigDriftMetric initializes the config drift metric based on the node's current state.
1530+ // If the node is Degraded due to config drift, set the metric to current time to indicate ongoing drift.
1531+ // Otherwise, clear the metric.
1532+ func (dn * Daemon ) initConfigDriftMetric () {
1533+ var reason string
1534+
1535+ state , err := getNodeAnnotationExt (dn .node , constants .MachineConfigDaemonStateAnnotationKey , true )
1536+ if err != nil {
1537+ klog .Warningf ("Could not get node state when initializing config drift metric: %v" , err )
1538+ goto clearMetric
1539+ }
1540+ if state != constants .MachineConfigDaemonStateDegraded {
1541+ goto clearMetric
1542+ }
1543+
1544+ reason , err = getNodeAnnotationExt (dn .node , constants .MachineConfigDaemonReasonAnnotationKey , true )
1545+ if err != nil {
1546+ klog .Warningf ("Could not get node reason when initializing config drift metric: %v" , err )
1547+ goto clearMetric
1548+ }
1549+ if strings .Contains (reason , configDriftContentMismatch ) || strings .Contains (reason , configDriftModeMismatch ) {
1550+ mcdConfigDrift .SetToCurrentTime ()
1551+ klog .Infof ("Config drift metric initialized: node is degraded due to config drift" )
1552+ return
1553+ }
1554+
1555+ clearMetric:
15261556 mcdConfigDrift .Set (0 )
1557+ }
1558+
1559+ func (dn * Daemon ) startConfigDriftMonitor () {
15271560 // Even though the Config Drift Monitor object ensures that only a single
15281561 // Config Drift Watcher is running at any given time, other things, such as
15291562 // emitting Kube events on startup, should only occur if we weren't
@@ -1533,6 +1566,8 @@ func (dn *Daemon) startConfigDriftMonitor() {
15331566 return
15341567 }
15351568
1569+ dn .initConfigDriftMetric ()
1570+
15361571 odc , err := dn .getCurrentConfigOnDisk ()
15371572 if err != nil && ! os .IsNotExist (err ) {
15381573 dn .exitCh <- fmt .Errorf ("could not get current config from disk: %w" , err )
@@ -2291,6 +2326,9 @@ func (dn *Daemon) checkStateOnFirstRun() error {
22912326
22922327 if err := dn .validateOnDiskStateOrImage (state .currentConfig , state .currentImage ); err != nil {
22932328 dn .nodeWriter .Eventf (corev1 .EventTypeWarning , "OnDiskStateValidationFailed" , err .Error ())
2329+ // Start the config drift monitor even when there's pre-existing drift
2330+ // so the metric gets initialized correctly on MCD restart
2331+ dn .startConfigDriftMonitor ()
22942332 return err
22952333 }
22962334
0 commit comments