Skip to content

Commit

Permalink
Fix hco system health metric values (#3242)
Browse files Browse the repository at this point in the history
kubevirt_hco_system_health_status values changed to value+1,
this pr remove unknown from the options and change back the values to be
again 0 for health, 1 for warning and 2 for error.

also tests and related recording rules and alerts updated accordingly.

Signed-off-by: avlitman <[email protected]>
  • Loading branch information
avlitman authored Dec 30, 2024
1 parent 3f5df2e commit aacb2e5
Show file tree
Hide file tree
Showing 6 changed files with 62 additions and 87 deletions.
43 changes: 18 additions & 25 deletions controllers/hyperconverged/hyperconverged_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -1011,6 +1011,8 @@ func (r *ReconcileHyperConverged) updateConditions(req *common.HcoRequest) {
req.Instance.Status.SystemHealthStatus = systemHealthStatus
req.StatusDirty = true
}

metrics.SetHCOMetricSystemHealthStatus(getNumericalHealthStatus(systemHealthStatus))
}

func (r *ReconcileHyperConverged) setLabels(req *common.HcoRequest) {
Expand Down Expand Up @@ -1066,42 +1068,23 @@ func (r *ReconcileHyperConverged) detectTaintedConfiguration(req *common.HcoRequ
}

func (r *ReconcileHyperConverged) getSystemHealthStatus(conditions common.HcoConditions) string {
if isError, reason := isSystemHealthStatusError(conditions); isError {
metrics.SetHCOSystemError(reason)
if isSystemHealthStatusError(conditions) {
return systemHealthStatusError
}

if isWarning, reason := isSystemHealthStatusWarning(conditions); isWarning {
metrics.SetHCOSystemWarning(reason)
if isSystemHealthStatusWarning(conditions) {
return systemHealthStatusWarning
}

metrics.SetHCOSystemHealthy()
return systemHealthStatusHealthy
}

func isSystemHealthStatusError(conditions common.HcoConditions) (bool, string) {
if cond, found := conditions.GetCondition(hcov1beta1.ConditionDegraded); found && cond.Status == metav1.ConditionTrue {
return true, cond.Reason
}

if cond, found := conditions.GetCondition(hcov1beta1.ConditionAvailable); found && cond.Status != metav1.ConditionTrue {
return true, cond.Reason
}

return false, ""
func isSystemHealthStatusError(conditions common.HcoConditions) bool {
return !conditions.IsStatusConditionTrue(hcov1beta1.ConditionAvailable) || conditions.IsStatusConditionTrue(hcov1beta1.ConditionDegraded)
}

func isSystemHealthStatusWarning(conditions common.HcoConditions) (bool, string) {
if cond, found := conditions.GetCondition(hcov1beta1.ConditionProgressing); found && cond.Status == metav1.ConditionTrue {
return true, cond.Reason
}

if cond, found := conditions.GetCondition(hcov1beta1.ConditionReconcileComplete); found && cond.Status != metav1.ConditionTrue {
return true, cond.Reason
}

return false, ""
func isSystemHealthStatusWarning(conditions common.HcoConditions) bool {
return !conditions.IsStatusConditionTrue(hcov1beta1.ConditionReconcileComplete) || conditions.IsStatusConditionTrue(hcov1beta1.ConditionProgressing)
}

func getNumOfChangesJSONPatch(jsonPatch string) int {
Expand All @@ -1112,6 +1095,16 @@ func getNumOfChangesJSONPatch(jsonPatch string) int {
return len(patches)
}

func getNumericalHealthStatus(status string) float64 {
healthStatusCodes := map[string]float64{
systemHealthStatusHealthy: metrics.SystemHealthStatusHealthy,
systemHealthStatusWarning: metrics.SystemHealthStatusWarning,
systemHealthStatusError: metrics.SystemHealthStatusError,
}

return healthStatusCodes[status]
}

func (r *ReconcileHyperConverged) firstLoopInitialization(request *common.HcoRequest) {
// Initialize operand handler.
r.operandHandler.FirstUseInitiation(r.scheme, hcoutil.GetClusterInfo(), request.Instance)
Expand Down
10 changes: 5 additions & 5 deletions controllers/hyperconverged/hyperconverged_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ var _ = Describe("HyperconvergedController", func() {
Message: "Initializing HyperConverged cluster",
})))

verifySystemHealthStatusError(foundResource, reconcileInit)
verifySystemHealthStatusError(foundResource)

expectedFeatureGates := []string{
"CPUManager",
Expand Down Expand Up @@ -307,7 +307,7 @@ var _ = Describe("HyperconvergedController", func() {
Message: reconcileCompletedMessage,
})))

verifySystemHealthStatusError(foundResource, "SSPConditions")
verifySystemHealthStatusError(foundResource)

Expect(foundResource.Status.RelatedObjects).To(HaveLen(21))
expectedRef := corev1.ObjectReference{
Expand Down Expand Up @@ -3794,15 +3794,15 @@ func verifyHyperConvergedCRExistsMetricFalse() {
func verifySystemHealthStatusHealthy(hco *hcov1beta1.HyperConverged) {
ExpectWithOffset(1, hco.Status.SystemHealthStatus).To(Equal(systemHealthStatusHealthy))

systemHealthStatusMetric, err := metrics.GetHCOMetricSystemHealthStatus("healthy")
systemHealthStatusMetric, err := metrics.GetHCOMetricSystemHealthStatus()
ExpectWithOffset(1, err).ToNot(HaveOccurred())
ExpectWithOffset(1, systemHealthStatusMetric).To(Equal(metrics.SystemHealthStatusHealthy))
}

func verifySystemHealthStatusError(hco *hcov1beta1.HyperConverged, reason string) {
func verifySystemHealthStatusError(hco *hcov1beta1.HyperConverged) {
ExpectWithOffset(1, hco.Status.SystemHealthStatus).To(Equal(systemHealthStatusError))

systemHealthStatusMetric, err := metrics.GetHCOMetricSystemHealthStatus(reason)
systemHealthStatusMetric, err := metrics.GetHCOMetricSystemHealthStatus()
ExpectWithOffset(1, err).ToNot(HaveOccurred())
ExpectWithOffset(1, systemHealthStatusMetric).To(Equal(metrics.SystemHealthStatusError))
}
Expand Down
39 changes: 19 additions & 20 deletions hack/prom-rule-ci/prom-rules-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -610,7 +610,7 @@ tests:
input_series:
- series: 'kubevirt_hco_system_health_status'
# time: 0 1 2 3 4 5 6 7 8 9 10 11
values: "1 1 1 1 2 2 2 2 3 3 3 3"
values: "0 0 0 0 1 1 1 1 2 2 2 2"
- series: 'ALERTS{kubernetes_operator_part_of="kubevirt", alertstate="firing", operator_health_impact="warning"}'
# time: 0 1 2 3 4 5 6 7 8 9 10 11
values: "1 1 stale stale 1 1 stale stale 1 1 stale stale"
Expand Down Expand Up @@ -771,47 +771,46 @@ tests:
kubernetes_operator_part_of: "kubevirt"
kubernetes_operator_component: "hyperconverged-cluster-operator"

# Test OperatorConditionsUnhealthy
# Test HCOOperatorConditionsUnhealthy
- interval: 1m
input_series:
- series: 'kubevirt_hco_system_health_status{reason="healthy"}'
- values: "stale 1 stale stale stale"
- series: 'kubevirt_hco_system_health_status'
- values: "stale stale 2 stale"

- series: 'kubevirt_hco_system_health_status{reason="SOME_ERROR"}'
values: "stale stale stale 3 stale"

- series: 'kubevirt_hco_system_health_status{reason="SOME_WARNING"}'
values: "stale stale stale stale stale 2 stale"
- series: 'kubevirt_hco_system_health_status'
values: "stale stale 2 stale 1 stale"

alert_rule_test:
- eval_time: 1m
alertname: OperatorConditionsUnhealthy
alertname: HCOOperatorConditionsUnhealthy
exp_alerts: [ ]

- eval_time: 3m
alertname: OperatorConditionsUnhealthy
- eval_time: 1m
alertname: HCOOperatorConditionsUnhealthy
exp_alerts: [ ]

- eval_time: 2m
alertname: HCOOperatorConditionsUnhealthy
exp_alerts:
- exp_annotations:
description: "HCO and its secondary resources are in a critical state due to SOME_ERROR."
description: "HCO and its secondary resources are in a critical state due to system error."
summary: "HCO and its secondary resources are in a critical state."
runbook_url: "https://kubevirt.io/monitoring/runbooks/OperatorConditionsUnhealthy"
runbook_url: "https://kubevirt.io/monitoring/runbooks/HCOOperatorConditionsUnhealthy"
exp_labels:
severity: "critical"
operator_health_impact: "critical"
kubernetes_operator_part_of: "kubevirt"
kubernetes_operator_component: "hyperconverged-cluster-operator"
reason: "SOME_ERROR"

- eval_time: 5m
alertname: OperatorConditionsUnhealthy
- eval_time: 4m
alertname: HCOOperatorConditionsUnhealthy
exp_alerts:
- exp_annotations:
description: "HCO and its secondary resources are in a warning state due to SOME_WARNING."
description: "HCO and its secondary resources are in a warning state due to system warning."
summary: "HCO and its secondary resources are in a warning state."
runbook_url: "https://kubevirt.io/monitoring/runbooks/OperatorConditionsUnhealthy"
runbook_url: "https://kubevirt.io/monitoring/runbooks/HCOOperatorConditionsUnhealthy"
exp_labels:
severity: "warning"
operator_health_impact: "warning"
kubernetes_operator_part_of: "kubevirt"
kubernetes_operator_component: "hyperconverged-cluster-operator"
reason: "SOME_WARNING"
33 changes: 10 additions & 23 deletions pkg/monitoring/metrics/operator_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@ const (
)

const (
SystemHealthStatusUnknown float64 = iota
SystemHealthStatusHealthy
SystemHealthStatusHealthy float64 = iota
SystemHealthStatusWarning
SystemHealthStatusError
)
Expand Down Expand Up @@ -53,12 +52,11 @@ var (
},
)

systemHealthStatus = operatormetrics.NewGaugeVec(
systemHealthStatus = operatormetrics.NewGauge(
operatormetrics.MetricOpts{
Name: "kubevirt_hco_system_health_status",
Help: "Indicates whether the system health status is healthy (0), warning (1), or error (2), by aggregating the conditions of HCO and its secondary resources",
},
[]string{"reason"},
)
)

Expand Down Expand Up @@ -119,30 +117,19 @@ func IsHCOMetricHyperConvergedExists() (bool, error) {
return value == hyperConvergedExists, nil
}

func SetHCOSystemHealthy() {
systemHealthStatus.Reset()
systemHealthStatus.WithLabelValues("healthy").Set(SystemHealthStatusHealthy)
func SetHCOMetricSystemHealthStatus(status float64) {
systemHealthStatus.Set(status)
}

func SetHCOSystemWarning(reason string) {
systemHealthStatus.Reset()
systemHealthStatus.WithLabelValues(reason).Set(SystemHealthStatusWarning)
}

func SetHCOSystemError(reason string) {
systemHealthStatus.Reset()
systemHealthStatus.WithLabelValues(reason).Set(SystemHealthStatusError)
}

// GetHCOMetricSystemHealthStatus returns current value of gauge. If error is not nil then value is undefined
func GetHCOMetricSystemHealthStatus(reason string) (float64, error) {
func GetHCOMetricSystemHealthStatus() (float64, error) {
dto := &ioprometheusclient.Metric{}
err := systemHealthStatus.WithLabelValues(reason).Write(dto)
err := systemHealthStatus.Write(dto)
value := dto.Gauge.GetValue()

if err != nil {
return SystemHealthStatusUnknown, err
return 0, err
}

return dto.Gauge.GetValue(), nil
return value, nil
}

func getLabelsForObj(kind string, name string) string {
Expand Down
16 changes: 6 additions & 10 deletions pkg/monitoring/metrics/operator_metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,20 +9,16 @@ import (

var _ = Describe("Operator Metrics", func() {
Context("kubevirt_hco_system_health_status", func() {
It("should set system error reason correctly", func() {
metrics.SetHCOSystemError("Reason1")
v, err := metrics.GetHCOMetricSystemHealthStatus("Reason1")
It("should set the correct system health status", func() {
metrics.SetHCOMetricSystemHealthStatus(metrics.SystemHealthStatusError)
v, err := metrics.GetHCOMetricSystemHealthStatus()
Expect(err).ToNot(HaveOccurred())
Expect(v).To(Equal(metrics.SystemHealthStatusError))

metrics.SetHCOSystemError("Reason2")
v, err = metrics.GetHCOMetricSystemHealthStatus("Reason2")
metrics.SetHCOMetricSystemHealthStatus(metrics.SystemHealthStatusWarning)
v, err = metrics.GetHCOMetricSystemHealthStatus()
Expect(err).ToNot(HaveOccurred())
Expect(v).To(Equal(metrics.SystemHealthStatusError))

v, err = metrics.GetHCOMetricSystemHealthStatus("Reason1")
Expect(err).ToNot(HaveOccurred())
Expect(v).To(Equal(metrics.SystemHealthStatusUnknown))
Expect(v).To(Equal(metrics.SystemHealthStatusWarning))
})
})
})
8 changes: 4 additions & 4 deletions pkg/monitoring/rules/alerts/health_alerts.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@ import (
func healthAlerts() []promv1.Rule {
return []promv1.Rule{
{
Alert: "OperatorConditionsUnhealthy",
Alert: "HCOOperatorConditionsUnhealthy",
Expr: intstr.FromString(fmt.Sprintf("kubevirt_hco_system_health_status == %f", metrics.SystemHealthStatusError)),
Annotations: map[string]string{
"description": "HCO and its secondary resources are in a critical state due to {{ $labels.reason }}.",
"description": "HCO and its secondary resources are in a critical state due to system error.",
"summary": "HCO and its secondary resources are in a critical state.",
},
Labels: map[string]string{
Expand All @@ -24,10 +24,10 @@ func healthAlerts() []promv1.Rule {
},
},
{
Alert: "OperatorConditionsUnhealthy",
Alert: "HCOOperatorConditionsUnhealthy",
Expr: intstr.FromString(fmt.Sprintf("kubevirt_hco_system_health_status == %f", metrics.SystemHealthStatusWarning)),
Annotations: map[string]string{
"description": "HCO and its secondary resources are in a warning state due to {{ $labels.reason }}.",
"description": "HCO and its secondary resources are in a warning state due to system warning.",
"summary": "HCO and its secondary resources are in a warning state.",
},
Labels: map[string]string{
Expand Down

0 comments on commit aacb2e5

Please sign in to comment.