Skip to content

Commit ba0a205

Browse files
committed
node: add skew enforcement prometheus alert
1 parent 2277bea commit ba0a205

File tree

6 files changed

+72
-3
lines changed

6 files changed

+72
-3
lines changed

cmd/machine-config-controller/start.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,7 @@ func createControllers(ctx *ctrlcommon.ControllerContext) []ctrlcommon.Controlle
272272
ctx.OCLInformerFactory.Machineconfiguration().V1().MachineOSBuilds(),
273273
ctx.InformerFactory.Machineconfiguration().V1().MachineConfigNodes(),
274274
ctx.ConfigInformerFactory.Config().V1().Schedulers(),
275+
ctx.OperatorInformerFactory.Operator().V1().MachineConfigurations(),
275276
ctx.ClientBuilder.KubeClientOrDie("node-update-controller"),
276277
ctx.ClientBuilder.MachineConfigClientOrDie("node-update-controller"),
277278
ctx.FeatureGatesHandler,

install/0000_90_machine-config_01_prometheus-rules.yaml

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,19 @@ spec:
3838
annotations:
3939
summary: "Triggers when nodes in a pool have overlapping labels such as master, worker, and a custom label therefore a choice must be made as to which is honored."
4040
description: "Node {{ $labels.exported_node }} has triggered a pool alert due to a label change. For more details check MachineConfigController pod logs: oc logs -f -n {{ $labels.namespace }} machine-config-controller-xxxxx -c machine-config-controller"
41-
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/machine-config-operator/MachineConfigControllerPoolAlert.md
41+
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/machine-config-operator/MachineConfigControllerPoolAlert.md
42+
- name: mcc-boot-image-skew-enforcement-none
43+
rules:
44+
- alert: MCCBootImageSkewEnforcementNone
45+
expr: |
46+
mcc_boot_image_skew_enforcement_none == 1
47+
labels:
48+
namespace: openshift-machine-config-operator
49+
severity: info
50+
annotations:
51+
summary: "Boot image skew enforcement is disabled. Scaling operations may not be successful."
52+
description: "Boot image skew enforcement mode is set to None. When scaling up, new nodes may be provisioned with older boot images that could introduce compatibility issues. Consider manually updating boot images to match the cluster version."
53+
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/machine-config-operator/MCCBootImageSkewEnforcementNone.md
4254
---
4355
apiVersion: monitoring.coreos.com/v1
4456
kind: PrometheusRule

pkg/controller/common/metrics.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,15 @@ var (
2525
Help: "state of OS image override",
2626
}, []string{"pool"})
2727

28+
// MCCBootImageSkewEnforcementNone indicates when boot image skew enforcement is disabled.
29+
// Set to 1 when mode is "None", 0 otherwise. A value of 1 indicates scaling operations may
30+
// not be successful.
31+
MCCBootImageSkewEnforcementNone = prometheus.NewGauge(
32+
prometheus.GaugeOpts{
33+
Name: "mcc_boot_image_skew_enforcement_none",
34+
Help: "Set to 1 when boot image skew enforcement mode is None, indicating scaling may not be successful as bootimages are out of date",
35+
})
36+
2837
// MCCDrainErr logs failed drain
2938
MCCDrainErr = prometheus.NewGaugeVec(
3039
prometheus.GaugeOpts{
@@ -94,6 +103,7 @@ func RegisterMCCMetrics() error {
94103
MCCUpdatedMachineCount,
95104
MCCDegradedMachineCount,
96105
MCCUnavailableMachineCount,
106+
MCCBootImageSkewEnforcementNone,
97107
})
98108

99109
if err != nil {

pkg/controller/node/node_controller.go

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,15 @@ import (
1515
configv1 "github.com/openshift/api/config/v1"
1616
features "github.com/openshift/api/features"
1717
mcfgv1 "github.com/openshift/api/machineconfiguration/v1"
18+
opv1 "github.com/openshift/api/operator/v1"
1819

1920
cligoinformersv1 "github.com/openshift/client-go/config/informers/externalversions/config/v1"
2021
cligolistersv1 "github.com/openshift/client-go/config/listers/config/v1"
2122
mcfgclientset "github.com/openshift/client-go/machineconfiguration/clientset/versioned"
2223
"github.com/openshift/client-go/machineconfiguration/clientset/versioned/scheme"
2324
mcfginformersv1 "github.com/openshift/client-go/machineconfiguration/informers/externalversions/machineconfiguration/v1"
25+
mcopinformersv1 "github.com/openshift/client-go/operator/informers/externalversions/operator/v1"
26+
mcoplistersv1 "github.com/openshift/client-go/operator/listers/operator/v1"
2427

2528
mcfglistersv1 "github.com/openshift/client-go/machineconfiguration/listers/machineconfiguration/v1"
2629
"github.com/openshift/library-go/pkg/operator/v1helpers"
@@ -108,6 +111,9 @@ type Controller struct {
108111
schedulerList cligolistersv1.SchedulerLister
109112
schedulerListerSynced cache.InformerSynced
110113

114+
mcopLister mcoplistersv1.MachineConfigurationLister
115+
mcopListerSynced cache.InformerSynced
116+
111117
queue workqueue.TypedRateLimitingInterface[string]
112118

113119
fgHandler ctrlcommon.FeatureGatesHandler
@@ -127,6 +133,7 @@ func New(
127133
mosbInformer mcfginformersv1.MachineOSBuildInformer,
128134
mcnInformer mcfginformersv1.MachineConfigNodeInformer,
129135
schedulerInformer cligoinformersv1.SchedulerInformer,
136+
mcopInformer mcopinformersv1.MachineConfigurationInformer,
130137
kubeClient clientset.Interface,
131138
mcfgClient mcfgclientset.Interface,
132139
fgHandler ctrlcommon.FeatureGatesHandler,
@@ -141,6 +148,7 @@ func New(
141148
podInformer,
142149
mcnInformer,
143150
schedulerInformer,
151+
mcopInformer,
144152
kubeClient,
145153
mcfgClient,
146154
defaultUpdateDelay,
@@ -158,6 +166,7 @@ func NewWithCustomUpdateDelay(
158166
mosbInformer mcfginformersv1.MachineOSBuildInformer,
159167
mcnInformer mcfginformersv1.MachineConfigNodeInformer,
160168
schedulerInformer cligoinformersv1.SchedulerInformer,
169+
mcopInformer mcopinformersv1.MachineConfigurationInformer,
161170
kubeClient clientset.Interface,
162171
mcfgClient mcfgclientset.Interface,
163172
updateDelay time.Duration,
@@ -173,6 +182,7 @@ func NewWithCustomUpdateDelay(
173182
podInformer,
174183
mcnInformer,
175184
schedulerInformer,
185+
mcopInformer,
176186
kubeClient,
177187
mcfgClient,
178188
updateDelay,
@@ -191,6 +201,7 @@ func newController(
191201
podInformer coreinformersv1.PodInformer,
192202
mcnInformer mcfginformersv1.MachineConfigNodeInformer,
193203
schedulerInformer cligoinformersv1.SchedulerInformer,
204+
mcopInformer mcopinformersv1.MachineConfigurationInformer,
194205
kubeClient clientset.Interface,
195206
mcfgClient mcfgclientset.Interface,
196207
updateDelay time.Duration,
@@ -263,6 +274,9 @@ func newController(
263274
ctrl.schedulerList = schedulerInformer.Lister()
264275
ctrl.schedulerListerSynced = schedulerInformer.Informer().HasSynced
265276

277+
ctrl.mcopLister = mcopInformer.Lister()
278+
ctrl.mcopListerSynced = mcopInformer.Informer().HasSynced
279+
266280
return ctrl
267281
}
268282

@@ -271,7 +285,7 @@ func (ctrl *Controller) Run(workers int, stopCh <-chan struct{}) {
271285
defer utilruntime.HandleCrash()
272286
defer ctrl.queue.ShutDown()
273287

274-
if !cache.WaitForCacheSync(stopCh, ctrl.ccListerSynced, ctrl.mcListerSynced, ctrl.mcpListerSynced, ctrl.moscListerSynced, ctrl.mosbListerSynced, ctrl.nodeListerSynced, ctrl.schedulerListerSynced) {
288+
if !cache.WaitForCacheSync(stopCh, ctrl.ccListerSynced, ctrl.mcListerSynced, ctrl.mcpListerSynced, ctrl.moscListerSynced, ctrl.mosbListerSynced, ctrl.nodeListerSynced, ctrl.schedulerListerSynced, ctrl.mcopListerSynced) {
275289
return
276290
}
277291

@@ -1847,5 +1861,31 @@ func (ctrl *Controller) syncMetrics() error {
18471861
ctrlcommon.MCCDegradedMachineCount.WithLabelValues(pool.Name).Set(float64(pool.Status.DegradedMachineCount))
18481862
ctrlcommon.MCCUnavailableMachineCount.WithLabelValues(pool.Name).Set(float64(pool.Status.UnavailableMachineCount))
18491863
}
1864+
1865+
// Update boot image skew enforcement metric
1866+
ctrl.syncBootImageSkewEnforcementMetric()
1867+
18501868
return nil
18511869
}
1870+
1871+
// syncBootImageSkewEnforcementMetric updates the mcc_boot_image_skew_enforcement_none metric
1872+
// based on the current BootImageSkewEnforcementStatus mode in MachineConfiguration.
1873+
// The metric is set to 1 when mode is "None", indicating that scaling operations may
1874+
// not be successful.
1875+
func (ctrl *Controller) syncBootImageSkewEnforcementMetric() {
1876+
if ctrl.fgHandler == nil || !ctrl.fgHandler.Enabled(features.FeatureGateBootImageSkewEnforcement) {
1877+
return
1878+
}
1879+
1880+
mcop, err := ctrl.mcopLister.Get(ctrlcommon.MCOOperatorKnobsObjectName)
1881+
if err != nil {
1882+
klog.V(4).Infof("Failed to get MachineConfiguration for boot image skew enforcement metric: %v", err)
1883+
return
1884+
}
1885+
1886+
if mcop.Status.BootImageSkewEnforcementStatus.Mode == opv1.BootImageSkewEnforcementModeStatusNone {
1887+
ctrlcommon.MCCBootImageSkewEnforcementNone.Set(1)
1888+
} else {
1889+
ctrlcommon.MCCBootImageSkewEnforcementNone.Set(0)
1890+
}
1891+
}

pkg/controller/node/node_controller_test.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ import (
3434
configv1informer "github.com/openshift/client-go/config/informers/externalversions"
3535
"github.com/openshift/client-go/machineconfiguration/clientset/versioned/fake"
3636
informers "github.com/openshift/client-go/machineconfiguration/informers/externalversions"
37+
fakeoperatorclient "github.com/openshift/client-go/operator/clientset/versioned/fake"
38+
operatorinformer "github.com/openshift/client-go/operator/informers/externalversions"
3739
"github.com/openshift/machine-config-operator/pkg/constants"
3840
ctrlcommon "github.com/openshift/machine-config-operator/pkg/controller/common"
3941
daemonconsts "github.com/openshift/machine-config-operator/pkg/daemon/constants"
@@ -102,17 +104,20 @@ func (f *fixture) newControllerWithStopChan(stopCh <-chan struct{}) *Controller
102104
f.client = fake.NewSimpleClientset(f.objects...)
103105
f.kubeclient = k8sfake.NewSimpleClientset(f.kubeobjects...)
104106
f.schedulerClient = fakeconfigv1client.NewSimpleClientset(f.schedulerObjects...)
107+
operatorClient := fakeoperatorclient.NewSimpleClientset()
105108

106109
i := informers.NewSharedInformerFactory(f.client, noResyncPeriodFunc())
107110
k8sI := kubeinformers.NewSharedInformerFactory(f.kubeclient, noResyncPeriodFunc())
108111
ci := configv1informer.NewSharedInformerFactory(f.schedulerClient, noResyncPeriodFunc())
112+
oi := operatorinformer.NewSharedInformerFactory(operatorClient, noResyncPeriodFunc())
109113
c := NewWithCustomUpdateDelay(i.Machineconfiguration().V1().ControllerConfigs(), i.Machineconfiguration().V1().MachineConfigs(), i.Machineconfiguration().V1().MachineConfigPools(), k8sI.Core().V1().Nodes(),
110-
k8sI.Core().V1().Pods(), i.Machineconfiguration().V1().MachineOSConfigs(), i.Machineconfiguration().V1().MachineOSBuilds(), i.Machineconfiguration().V1().MachineConfigNodes(), ci.Config().V1().Schedulers(), f.kubeclient, f.client, time.Millisecond, f.fgHandler)
114+
k8sI.Core().V1().Pods(), i.Machineconfiguration().V1().MachineOSConfigs(), i.Machineconfiguration().V1().MachineOSBuilds(), i.Machineconfiguration().V1().MachineConfigNodes(), ci.Config().V1().Schedulers(), oi.Operator().V1().MachineConfigurations(), f.kubeclient, f.client, time.Millisecond, f.fgHandler)
111115

112116
c.ccListerSynced = alwaysReady
113117
c.mcpListerSynced = alwaysReady
114118
c.nodeListerSynced = alwaysReady
115119
c.schedulerListerSynced = alwaysReady
120+
c.mcopListerSynced = alwaysReady
116121
c.eventRecorder = &record.FakeRecorder{}
117122

118123
i.Start(stopCh)

test/e2e-bootstrap/bootstrap_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -624,6 +624,7 @@ func createControllers(ctx *ctrlcommon.ControllerContext) []ctrlcommon.Controlle
624624
ctx.InformerFactory.Machineconfiguration().V1().MachineOSBuilds(),
625625
ctx.InformerFactory.Machineconfiguration().V1().MachineConfigNodes(),
626626
ctx.ConfigInformerFactory.Config().V1().Schedulers(),
627+
ctx.OperatorInformerFactory.Operator().V1().MachineConfigurations(),
627628
ctx.ClientBuilder.KubeClientOrDie("node-update-controller"),
628629
ctx.ClientBuilder.MachineConfigClientOrDie("node-update-controller"),
629630
ctx.FeatureGatesHandler,

0 commit comments

Comments
 (0)