Skip to content

Commit 6bde8f6

Browse files
committed
node: add skew enforcement prometheus alert
1 parent 7fbc041 commit 6bde8f6

File tree

6 files changed

+112
-3
lines changed

6 files changed

+112
-3
lines changed

cmd/machine-config-controller/start.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,7 @@ func createControllers(ctx *ctrlcommon.ControllerContext) []ctrlcommon.Controlle
274274
ctx.OCLInformerFactory.Machineconfiguration().V1().MachineOSBuilds(),
275275
ctx.InformerFactory.Machineconfiguration().V1().MachineConfigNodes(),
276276
ctx.ConfigInformerFactory.Config().V1().Schedulers(),
277+
ctx.OperatorInformerFactory.Operator().V1().MachineConfigurations(),
277278
ctx.ClientBuilder.KubeClientOrDie("node-update-controller"),
278279
ctx.ClientBuilder.MachineConfigClientOrDie("node-update-controller"),
279280
ctx.FeatureGatesHandler,

install/0000_90_machine-config_01_prometheus-rules.yaml

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,18 @@ spec:
3838
annotations:
3939
summary: "Triggers when nodes in a pool have overlapping labels such as master, worker, and a custom label therefore a choice must be made as to which is honored."
4040
description: "Node {{ $labels.exported_node }} has triggered a pool alert due to a label change. For more details check MachineConfigController pod logs: oc logs -f -n {{ $labels.namespace }} machine-config-controller-xxxxx -c machine-config-controller"
41-
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/machine-config-operator/MachineConfigControllerPoolAlert.md
41+
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/machine-config-operator/MachineConfigControllerPoolAlert.md
42+
- name: mcc-boot-image-skew-enforcement-none
43+
rules:
44+
- alert: MCCBootImageSkewEnforcementNone
45+
expr: |
46+
mcc_boot_image_skew_enforcement_none == 1
47+
labels:
48+
namespace: openshift-machine-config-operator
49+
severity: info
50+
annotations:
51+
summary: "Boot image skew enforcement is disabled. Scaling operations may not be successful."
52+
description: "Boot image skew enforcement mode is set to None. When scaling up, new nodes may be provisioned with older boot images that could introduce compatibility issues. Consider manually updating boot images to match the cluster version. Please refer to docs at [TODO-INSERTLINK] for additional details."
4253
---
4354
apiVersion: monitoring.coreos.com/v1
4455
kind: PrometheusRule

pkg/controller/common/metrics.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,15 @@ var (
2525
Help: "state of OS image override",
2626
}, []string{"pool"})
2727

28+
// MCCBootImageSkewEnforcementNone indicates when boot image skew enforcement is disabled.
29+
// Set to 1 when mode is "None", 0 otherwise. A value of 1 indicates scaling operations may
30+
// not be successful.
31+
MCCBootImageSkewEnforcementNone = prometheus.NewGauge(
32+
prometheus.GaugeOpts{
33+
Name: "mcc_boot_image_skew_enforcement_none",
34+
Help: "Set to 1 when boot image skew enforcement mode is None, indicating scaling may not be successful as bootimages are out of date",
35+
})
36+
2837
// MCCDrainErr logs failed drain
2938
MCCDrainErr = prometheus.NewGaugeVec(
3039
prometheus.GaugeOpts{
@@ -94,6 +103,7 @@ func RegisterMCCMetrics() error {
94103
MCCUpdatedMachineCount,
95104
MCCDegradedMachineCount,
96105
MCCUnavailableMachineCount,
106+
MCCBootImageSkewEnforcementNone,
97107
})
98108

99109
if err != nil {

pkg/controller/node/node_controller.go

Lines changed: 82 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,15 @@ import (
1515
configv1 "github.com/openshift/api/config/v1"
1616
features "github.com/openshift/api/features"
1717
mcfgv1 "github.com/openshift/api/machineconfiguration/v1"
18+
opv1 "github.com/openshift/api/operator/v1"
1819

1920
cligoinformersv1 "github.com/openshift/client-go/config/informers/externalversions/config/v1"
2021
cligolistersv1 "github.com/openshift/client-go/config/listers/config/v1"
2122
mcfgclientset "github.com/openshift/client-go/machineconfiguration/clientset/versioned"
2223
"github.com/openshift/client-go/machineconfiguration/clientset/versioned/scheme"
2324
mcfginformersv1 "github.com/openshift/client-go/machineconfiguration/informers/externalversions/machineconfiguration/v1"
25+
mcopinformersv1 "github.com/openshift/client-go/operator/informers/externalversions/operator/v1"
26+
mcoplistersv1 "github.com/openshift/client-go/operator/listers/operator/v1"
2427

2528
mcfglistersv1 "github.com/openshift/client-go/machineconfiguration/listers/machineconfiguration/v1"
2629
"github.com/openshift/library-go/pkg/operator/v1helpers"
@@ -108,6 +111,9 @@ type Controller struct {
108111
schedulerList cligolistersv1.SchedulerLister
109112
schedulerListerSynced cache.InformerSynced
110113

114+
mcopLister mcoplistersv1.MachineConfigurationLister
115+
mcopListerSynced cache.InformerSynced
116+
111117
queue workqueue.TypedRateLimitingInterface[string]
112118

113119
fgHandler ctrlcommon.FeatureGatesHandler
@@ -127,6 +133,7 @@ func New(
127133
mosbInformer mcfginformersv1.MachineOSBuildInformer,
128134
mcnInformer mcfginformersv1.MachineConfigNodeInformer,
129135
schedulerInformer cligoinformersv1.SchedulerInformer,
136+
mcopInformer mcopinformersv1.MachineConfigurationInformer,
130137
kubeClient clientset.Interface,
131138
mcfgClient mcfgclientset.Interface,
132139
fgHandler ctrlcommon.FeatureGatesHandler,
@@ -141,6 +148,7 @@ func New(
141148
podInformer,
142149
mcnInformer,
143150
schedulerInformer,
151+
mcopInformer,
144152
kubeClient,
145153
mcfgClient,
146154
defaultUpdateDelay,
@@ -158,6 +166,7 @@ func NewWithCustomUpdateDelay(
158166
mosbInformer mcfginformersv1.MachineOSBuildInformer,
159167
mcnInformer mcfginformersv1.MachineConfigNodeInformer,
160168
schedulerInformer cligoinformersv1.SchedulerInformer,
169+
mcopInformer mcopinformersv1.MachineConfigurationInformer,
161170
kubeClient clientset.Interface,
162171
mcfgClient mcfgclientset.Interface,
163172
updateDelay time.Duration,
@@ -173,6 +182,7 @@ func NewWithCustomUpdateDelay(
173182
podInformer,
174183
mcnInformer,
175184
schedulerInformer,
185+
mcopInformer,
176186
kubeClient,
177187
mcfgClient,
178188
updateDelay,
@@ -191,6 +201,7 @@ func newController(
191201
podInformer coreinformersv1.PodInformer,
192202
mcnInformer mcfginformersv1.MachineConfigNodeInformer,
193203
schedulerInformer cligoinformersv1.SchedulerInformer,
204+
mcopInformer mcopinformersv1.MachineConfigurationInformer,
194205
kubeClient clientset.Interface,
195206
mcfgClient mcfgclientset.Interface,
196207
updateDelay time.Duration,
@@ -240,6 +251,11 @@ func newController(
240251
UpdateFunc: ctrl.updateMachineConfigNode,
241252
DeleteFunc: ctrl.deleteMachineConfigNode,
242253
})
254+
mcopInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
255+
AddFunc: ctrl.addMachineConfiguration,
256+
UpdateFunc: ctrl.updateMachineConfiguration,
257+
DeleteFunc: ctrl.deleteMachineConfiguration,
258+
})
243259

244260
ctrl.syncHandler = ctrl.syncMachineConfigPool
245261
ctrl.enqueueMachineConfigPool = ctrl.enqueueDefault
@@ -263,6 +279,9 @@ func newController(
263279
ctrl.schedulerList = schedulerInformer.Lister()
264280
ctrl.schedulerListerSynced = schedulerInformer.Informer().HasSynced
265281

282+
ctrl.mcopLister = mcopInformer.Lister()
283+
ctrl.mcopListerSynced = mcopInformer.Informer().HasSynced
284+
266285
return ctrl
267286
}
268287

@@ -271,7 +290,7 @@ func (ctrl *Controller) Run(workers int, stopCh <-chan struct{}) {
271290
defer utilruntime.HandleCrash()
272291
defer ctrl.queue.ShutDown()
273292

274-
if !cache.WaitForCacheSync(stopCh, ctrl.ccListerSynced, ctrl.mcListerSynced, ctrl.mcpListerSynced, ctrl.moscListerSynced, ctrl.mosbListerSynced, ctrl.nodeListerSynced, ctrl.schedulerListerSynced) {
293+
if !cache.WaitForCacheSync(stopCh, ctrl.ccListerSynced, ctrl.mcListerSynced, ctrl.mcpListerSynced, ctrl.moscListerSynced, ctrl.mosbListerSynced, ctrl.nodeListerSynced, ctrl.schedulerListerSynced, ctrl.mcopListerSynced) {
275294
return
276295
}
277296

@@ -1849,3 +1868,65 @@ func (ctrl *Controller) syncMetrics() error {
18491868
}
18501869
return nil
18511870
}
1871+
1872+
// addMachineConfiguration handles MachineConfiguration add events to update the boot image skew enforcement metric.
// It forwards the added object unchanged to syncBootImageSkewEnforcementMetric, which
// performs the type check and sets the gauge. The call is a no-op when no feature gate
// handler is configured or when the BootImageSkewEnforcement feature gate is not enabled.
1873+
func (ctrl *Controller) addMachineConfiguration(obj any) {
1874+
// Leave the metric untouched unless the feature gate is known to be enabled.
if ctrl.fgHandler == nil || !ctrl.fgHandler.Enabled(features.FeatureGateBootImageSkewEnforcement) {
1875+
return
1876+
}
1877+
1878+
ctrl.syncBootImageSkewEnforcementMetric(obj)
1879+
}
1880+
1881+
// updateMachineConfiguration handles MachineConfiguration update events to update the boot image skew enforcement metric.
1882+
// Only takes action if the BootImageSkewEnforcementStatus mode differs between the old
// and current objects; changes to any other field are ignored. The call is a no-op when
// no feature gate handler is configured, when the BootImageSkewEnforcement feature gate
// is not enabled, or when either argument is not a *opv1.MachineConfiguration.
1883+
func (ctrl *Controller) updateMachineConfiguration(old, cur any) {
1884+
// Leave the metric untouched unless the feature gate is known to be enabled.
if ctrl.fgHandler == nil || !ctrl.fgHandler.Enabled(features.FeatureGateBootImageSkewEnforcement) {
1885+
return
1886+
}
1887+
1888+
// Unexpected object types are dropped silently (no log is emitted here).
oldMCOP, ok := old.(*opv1.MachineConfiguration)
1889+
if !ok {
1890+
return
1891+
}
1892+
curMCOP, ok := cur.(*opv1.MachineConfiguration)
1893+
if !ok {
1894+
return
1895+
}
1896+
1897+
// Only update the metric if the BootImageSkewEnforcementStatus mode changed.
1898+
if oldMCOP.Status.BootImageSkewEnforcementStatus.Mode == curMCOP.Status.BootImageSkewEnforcementStatus.Mode {
1899+
return
1900+
}
1901+
1902+
// Pass the current (post-update) object so the gauge reflects the new mode.
ctrl.syncBootImageSkewEnforcementMetric(cur)
1903+
}
1904+
1905+
// deleteMachineConfiguration handles MachineConfiguration delete events to reset the boot image skew enforcement metric.
// The deleted object itself is not inspected: the gauge is set to 0 unconditionally.
// The call is a no-op when no feature gate handler is configured or when the
// BootImageSkewEnforcement feature gate is not enabled.
1906+
func (ctrl *Controller) deleteMachineConfiguration(_ any) {
1907+
// Leave the metric untouched unless the feature gate is known to be enabled.
if ctrl.fgHandler == nil || !ctrl.fgHandler.Enabled(features.FeatureGateBootImageSkewEnforcement) {
1908+
return
1909+
}
1910+
1911+
// Reset the metric to 0 when a MachineConfiguration is deleted.
1912+
ctrlcommon.MCCBootImageSkewEnforcementNone.Set(0)
1913+
}
1914+
1915+
// syncBootImageSkewEnforcementMetric updates the mcc_boot_image_skew_enforcement_none metric
1916+
// based on the BootImageSkewEnforcementStatus mode of the MachineConfiguration passed in obj.
1917+
// The metric is set to 1 when the mode equals opv1.BootImageSkewEnforcementModeStatusNone,
// indicating that scaling operations may
1918+
// not be successful, and to 0 for every other mode. If obj is not a
// *opv1.MachineConfiguration, a warning is logged and the metric is left unchanged.
1919+
func (ctrl *Controller) syncBootImageSkewEnforcementMetric(obj any) {
1920+
1921+
mcop, ok := obj.(*opv1.MachineConfiguration)
1922+
if !ok {
1923+
klog.Warningf("Expected MachineConfiguration object, got %T", obj)
1924+
return
1925+
}
1926+
1927+
// The gauge is binary: 1 only for the "none" mode constant, 0 otherwise.
if mcop.Status.BootImageSkewEnforcementStatus.Mode == opv1.BootImageSkewEnforcementModeStatusNone {
1928+
ctrlcommon.MCCBootImageSkewEnforcementNone.Set(1)
1929+
} else {
1930+
ctrlcommon.MCCBootImageSkewEnforcementNone.Set(0)
1931+
}
1932+
}

pkg/controller/node/node_controller_test.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ import (
3434
configv1informer "github.com/openshift/client-go/config/informers/externalversions"
3535
"github.com/openshift/client-go/machineconfiguration/clientset/versioned/fake"
3636
informers "github.com/openshift/client-go/machineconfiguration/informers/externalversions"
37+
fakeoperatorclient "github.com/openshift/client-go/operator/clientset/versioned/fake"
38+
operatorinformer "github.com/openshift/client-go/operator/informers/externalversions"
3739
"github.com/openshift/machine-config-operator/pkg/constants"
3840
ctrlcommon "github.com/openshift/machine-config-operator/pkg/controller/common"
3941
daemonconsts "github.com/openshift/machine-config-operator/pkg/daemon/constants"
@@ -102,17 +104,20 @@ func (f *fixture) newControllerWithStopChan(stopCh <-chan struct{}) *Controller
102104
f.client = fake.NewSimpleClientset(f.objects...)
103105
f.kubeclient = k8sfake.NewSimpleClientset(f.kubeobjects...)
104106
f.schedulerClient = fakeconfigv1client.NewSimpleClientset(f.schedulerObjects...)
107+
operatorClient := fakeoperatorclient.NewSimpleClientset()
105108

106109
i := informers.NewSharedInformerFactory(f.client, noResyncPeriodFunc())
107110
k8sI := kubeinformers.NewSharedInformerFactory(f.kubeclient, noResyncPeriodFunc())
108111
ci := configv1informer.NewSharedInformerFactory(f.schedulerClient, noResyncPeriodFunc())
112+
oi := operatorinformer.NewSharedInformerFactory(operatorClient, noResyncPeriodFunc())
109113
c := NewWithCustomUpdateDelay(i.Machineconfiguration().V1().ControllerConfigs(), i.Machineconfiguration().V1().MachineConfigs(), i.Machineconfiguration().V1().MachineConfigPools(), k8sI.Core().V1().Nodes(),
110-
k8sI.Core().V1().Pods(), i.Machineconfiguration().V1().MachineOSConfigs(), i.Machineconfiguration().V1().MachineOSBuilds(), i.Machineconfiguration().V1().MachineConfigNodes(), ci.Config().V1().Schedulers(), f.kubeclient, f.client, time.Millisecond, f.fgHandler)
114+
k8sI.Core().V1().Pods(), i.Machineconfiguration().V1().MachineOSConfigs(), i.Machineconfiguration().V1().MachineOSBuilds(), i.Machineconfiguration().V1().MachineConfigNodes(), ci.Config().V1().Schedulers(), oi.Operator().V1().MachineConfigurations(), f.kubeclient, f.client, time.Millisecond, f.fgHandler)
111115

112116
c.ccListerSynced = alwaysReady
113117
c.mcpListerSynced = alwaysReady
114118
c.nodeListerSynced = alwaysReady
115119
c.schedulerListerSynced = alwaysReady
120+
c.mcopListerSynced = alwaysReady
116121
c.eventRecorder = &record.FakeRecorder{}
117122

118123
i.Start(stopCh)

test/e2e-bootstrap/bootstrap_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -624,6 +624,7 @@ func createControllers(ctx *ctrlcommon.ControllerContext) []ctrlcommon.Controlle
624624
ctx.InformerFactory.Machineconfiguration().V1().MachineOSBuilds(),
625625
ctx.InformerFactory.Machineconfiguration().V1().MachineConfigNodes(),
626626
ctx.ConfigInformerFactory.Config().V1().Schedulers(),
627+
ctx.OperatorInformerFactory.Operator().V1().MachineConfigurations(),
627628
ctx.ClientBuilder.KubeClientOrDie("node-update-controller"),
628629
ctx.ClientBuilder.MachineConfigClientOrDie("node-update-controller"),
629630
ctx.FeatureGatesHandler,

0 commit comments

Comments
 (0)