Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 18 additions & 13 deletions controllers/fenceagentsremediation_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,13 +132,13 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct
return emptyResult, err
}

// Check NHC timeout annotation
if isTimedOutByNHC(far) {
// Check NHC timeout annotation and stop the agent, since remediation is no longer relevant (most likely because fixed by a different remediator)
if r.Executor.Exists(far.GetUID()) && isTimedOutByNHC(far) {
Copy link
Member

@slintes slintes Jan 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you add a condition here, and few lines below you change to requeue.... I think that's a significant unwanted change in execution flow, what happens on next reconcile?

r.Log.Info(utils.EventMessageRemediationStoppedByNHC)
r.Executor.Remove(far.GetUID())
r.stopAgentAndGetCrConditionsStatus(far, node.Name)
utils.UpdateConditions(utils.RemediationInterruptedByNHC, far, r.Log)
commonEvents.RemediationStoppedByNHC(r.Recorder, far)
return emptyResult, err
return requeueImmediately, nil
}

// Add finalizer when the CR is created
Expand All @@ -158,12 +158,7 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct
r.Log.Info("CR's deletion timestamp is not zero, and FAR finalizer exists", "CR Name", req.Name)

if !meta.IsStatusConditionPresentAndEqual(far.Status.Conditions, commonConditions.SucceededType, metav1.ConditionTrue) {
processingCondition := meta.FindStatusCondition(far.Status.Conditions, commonConditions.ProcessingType).Status
fenceAgentActionSucceededCondition := meta.FindStatusCondition(far.Status.Conditions, utils.FenceAgentActionSucceededType).Status
succeededCondition := meta.FindStatusCondition(far.Status.Conditions, commonConditions.SucceededType).Status
r.Log.Info("FAR didn't finish remediate the node ", "CR Name", req.Name, "processing condition", processingCondition,
"fenceAgentActionSucceeded condition", fenceAgentActionSucceededCondition, "succeeded condition", succeededCondition)
r.Executor.Remove(far.GetUID())
r.stopAgentAndGetCrConditionsStatus(far, node.Name)
}

// remove out-of-service taint when using OutOfServiceTaint remediation
Expand All @@ -179,11 +174,11 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct
return emptyResult, err
}
}
r.Log.Info("out-of-service taint was removed", "Node Name", req.Name)
r.Log.Info("out-of-service taint was removed", "Node Name", node.Name)
commonEvents.NormalEvent(r.Recorder, node, utils.EventReasonRemoveOutOfServiceTaint, utils.EventMessageRemoveOutOfServiceTaint)
}

// remove node's taints
// remove FAR remediation taint
taint := utils.CreateRemediationTaint()
if err := utils.RemoveTaint(r.Client, node.Name, taint); err != nil {
if apiErrors.IsConflict(err) {
Expand All @@ -203,7 +198,7 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct
if err := r.Client.Update(context.Background(), far); err != nil {
return emptyResult, fmt.Errorf("failed to remove finalizer from CR - %w", err)
}
r.Log.Info("Finalizer was removed", "CR Name", req.Name)
r.Log.Info("FAR Finalizer was removed", "CR Name", far.Name)
commonEvents.NormalEvent(r.Recorder, far, utils.EventReasonRemoveFinalizer, utils.EventMessageRemoveFinalizer)
return emptyResult, nil
}
Expand Down Expand Up @@ -283,6 +278,16 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct
return emptyResult, nil
}

// stopAgentAndGetCrStatus log FAR CR status and stops the agent's parallel execution
func (r *FenceAgentsRemediationReconciler) stopAgentAndGetCrConditionsStatus(far *v1alpha1.FenceAgentsRemediation, nodeName string) {
processingCondition := meta.FindStatusCondition(far.Status.Conditions, commonConditions.ProcessingType).Status
fenceAgentActionSucceededCondition := meta.FindStatusCondition(far.Status.Conditions, utils.FenceAgentActionSucceededType).Status
succeededCondition := meta.FindStatusCondition(far.Status.Conditions, commonConditions.SucceededType).Status
r.Log.Info("FAR agent was stopped", "Node Name", nodeName, "Agent Name", far.Spec.Agent, "processing condition", processingCondition,
"fenceAgentActionSucceeded condition", fenceAgentActionSucceededCondition, "succeeded condition", succeededCondition)
r.Executor.Remove(far.GetUID())
Comment on lines +281 to +288
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Nil-pointer panic risk when reading CR conditions

meta.FindStatusCondition returns *metav1.Condition which can be nil when the condition is absent.
Directly dereferencing .Status will panic and crash the controller on the very scenario it tries to handle (e.g., first reconcile where conditions aren’t set).

- processingCondition := meta.FindStatusCondition(far.Status.Conditions, commonConditions.ProcessingType).Status
- fenceAgentActionSucceededCondition := meta.FindStatusCondition(far.Status.Conditions, utils.FenceAgentActionSucceededType).Status
- succeededCondition := meta.FindStatusCondition(far.Status.Conditions, commonConditions.SucceededType).Status
+ var processingStatus, actionSucceededStatus, succeededStatus metav1.ConditionStatus
+
+ if c := meta.FindStatusCondition(far.Status.Conditions, commonConditions.ProcessingType); c != nil {
+     processingStatus = c.Status
+ }
+ if c := meta.FindStatusCondition(far.Status.Conditions, utils.FenceAgentActionSucceededType); c != nil {
+     actionSucceededStatus = c.Status
+ }
+ if c := meta.FindStatusCondition(far.Status.Conditions, commonConditions.SucceededType); c != nil {
+     succeededStatus = c.Status
+ }

This guards against panics while preserving the intended log output.

Committable suggestion skipped: line range outside the PR's diff.

🤖 Prompt for AI Agents
In controllers/fenceagentsremediation_controller.go around lines 281 to 288, the
code directly dereferences the Status field of the result from
meta.FindStatusCondition without checking for nil, which can cause a nil-pointer
panic if the condition is absent. To fix this, first check if the returned
condition pointer is nil before accessing its Status field; if nil, use a
default value (e.g., an empty string or a specific status) to safely log the
condition status without panicking.

}

// isTimedOutByNHC checks if NHC set a timeout annotation on the CR
func isTimedOutByNHC(far *v1alpha1.FenceAgentsRemediation) bool {
if far != nil && far.Annotations != nil && far.DeletionTimestamp == nil {
Expand Down
Loading