Skip to content

Commit 06547b1

Browse files
committed
Remove finalizer on NHC Timeout deleted FAR CR
Create three functions to split the reconcile logic into smaller, reusable pieces: remove the FAR CR finalizer on deletion, remove the FAR taints on deletion, and stop the agent execution when the NHC timeout annotation exists or the FAR CR was deleted
1 parent ce57ab4 commit 06547b1

File tree

1 file changed

+75
-49
lines changed

1 file changed

+75
-49
lines changed

controllers/fenceagentsremediation_controller.go

Lines changed: 75 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import (
2828
commonEvents "github.com/medik8s/common/pkg/events"
2929
commonResources "github.com/medik8s/common/pkg/resources"
3030

31+
corev1 "k8s.io/api/core/v1"
3132
apiErrors "k8s.io/apimachinery/pkg/api/errors"
3233
"k8s.io/apimachinery/pkg/api/meta"
3334
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -132,13 +133,22 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct
132133
return emptyResult, err
133134
}
134135

135-
// Check NHC timeout annotation
136+
// Check NHC timeout annotation and stop the agent, since remediation is no longer relevant (most likely because fixed by a different remediator)
136137
if isTimedOutByNHC(far) {
137138
r.Log.Info(utils.EventMessageRemediationStoppedByNHC)
138-
r.Executor.Remove(far.GetUID())
139+
r.stopAgentAndGetCrStatus(far, node.Name)
139140
utils.UpdateConditions(utils.RemediationInterruptedByNHC, far, r.Log)
140141
commonEvents.RemediationStoppedByNHC(r.Recorder, far)
141-
return emptyResult, err
142+
143+
if far.GetDeletionTimestamp() != nil {
144+
if res, err := r.removeFarTaints(far, node); res != emptyResult || err != nil {
145+
return res, err
146+
}
147+
// Removing FAR CR finalizer so NHC deletion of the remediation can be completed
148+
r.Log.Info("Removing finalizer of timed-out remediation deleted by NHC", "remediation name", far.GetName())
149+
return emptyResult, r.removeFarFinalizer(far)
150+
}
151+
return emptyResult, nil
142152
}
143153

144154
// Add finalizer when the CR is created
@@ -158,54 +168,12 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct
158168
r.Log.Info("CR's deletion timestamp is not zero, and FAR finalizer exists", "CR Name", req.Name)
159169

160170
if !meta.IsStatusConditionPresentAndEqual(far.Status.Conditions, commonConditions.SucceededType, metav1.ConditionTrue) {
161-
processingCondition := meta.FindStatusCondition(far.Status.Conditions, commonConditions.ProcessingType).Status
162-
fenceAgentActionSucceededCondition := meta.FindStatusCondition(far.Status.Conditions, utils.FenceAgentActionSucceededType).Status
163-
succeededCondition := meta.FindStatusCondition(far.Status.Conditions, commonConditions.SucceededType).Status
164-
r.Log.Info("FAR didn't finish remediate the node ", "CR Name", req.Name, "processing condition", processingCondition,
165-
"fenceAgentActionSucceeded condition", fenceAgentActionSucceededCondition, "succeeded condition", succeededCondition)
166-
r.Executor.Remove(far.GetUID())
171+
r.stopAgentAndGetCrStatus(far, node.Name)
167172
}
168-
169-
// remove out-of-service taint when using OutOfServiceTaint remediation
170-
if far.Spec.RemediationStrategy == v1alpha1.OutOfServiceTaintRemediationStrategy {
171-
r.Log.Info("Removing out-of-service taint", "Fence Agent", far.Spec.Agent, "Node Name", node.Name)
172-
taint := utils.CreateOutOfServiceTaint()
173-
if err := utils.RemoveTaint(r.Client, node.Name, taint); err != nil {
174-
if apiErrors.IsConflict(err) {
175-
r.Log.Error(err, "Failed to remove taint from node due to node update, retrying... ,", "node name", node.Name, "taint key", taint.Key, "taint effect", taint.Effect)
176-
return ctrl.Result{RequeueAfter: time.Second}, nil
177-
} else if !apiErrors.IsNotFound(err) {
178-
r.Log.Error(err, "Failed to remove taint from node,", "node name", node.Name, "taint key", taint.Key, "taint effect", taint.Effect)
179-
return emptyResult, err
180-
}
181-
}
182-
r.Log.Info("out-of-service taint was removed", "Node Name", req.Name)
183-
commonEvents.NormalEvent(r.Recorder, node, utils.EventReasonRemoveOutOfServiceTaint, utils.EventMessageRemoveOutOfServiceTaint)
173+
if res, err := r.removeFarTaints(far, node); res != emptyResult || err != nil {
174+
return res, err
184175
}
185-
186-
// remove node's taints
187-
taint := utils.CreateRemediationTaint()
188-
if err := utils.RemoveTaint(r.Client, node.Name, taint); err != nil {
189-
if apiErrors.IsConflict(err) {
190-
r.Log.Info("Failed to remove taint from node due to node update, retrying... ,", "node name", node.Name, "taint key", taint.Key, "taint effect", taint.Effect)
191-
return ctrl.Result{RequeueAfter: time.Second}, nil
192-
193-
} else if !apiErrors.IsNotFound(err) {
194-
r.Log.Error(err, "Failed to remove taint from node,", "node name", node.Name, "taint key", taint.Key, "taint effect", taint.Effect)
195-
return emptyResult, err
196-
}
197-
}
198-
199-
r.Log.Info("FAR remediation taint was removed", "Node Name", node.Name)
200-
commonEvents.NormalEvent(r.Recorder, node, utils.EventReasonRemoveRemediationTaint, utils.EventMessageRemoveRemediationTaint)
201-
// remove finalizer
202-
controllerutil.RemoveFinalizer(far, v1alpha1.FARFinalizer)
203-
if err := r.Client.Update(context.Background(), far); err != nil {
204-
return emptyResult, fmt.Errorf("failed to remove finalizer from CR - %w", err)
205-
}
206-
r.Log.Info("Finalizer was removed", "CR Name", req.Name)
207-
commonEvents.NormalEvent(r.Recorder, far, utils.EventReasonRemoveFinalizer, utils.EventMessageRemoveFinalizer)
208-
return emptyResult, nil
176+
return emptyResult, r.removeFarFinalizer(far)
209177
}
210178
// Add FAR (medik8s) remediation taint
211179
taintAdded, err := utils.AppendTaint(r.Client, node.Name, utils.CreateRemediationTaint())
@@ -283,6 +251,64 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct
283251
return emptyResult, nil
284252
}
285253

254+
// removeFarTaints removes the out-of-service and remeidaiton taints if present, and returns an error on failure
255+
func (r *FenceAgentsRemediationReconciler) removeFarTaints(far *v1alpha1.FenceAgentsRemediation, node *corev1.Node) (ctrl.Result, error) {
256+
emptyResult := ctrl.Result{}
257+
// remove out-of-service taint when using OutOfServiceTaint remediation
258+
if far.Spec.RemediationStrategy == v1alpha1.OutOfServiceTaintRemediationStrategy {
259+
r.Log.Info("Removing out-of-service taint", "Fence Agent", far.Spec.Agent, "Node Name", node.Name)
260+
taint := utils.CreateOutOfServiceTaint()
261+
if err := utils.RemoveTaint(r.Client, node.Name, taint); err != nil {
262+
if apiErrors.IsConflict(err) {
263+
r.Log.Error(err, "Failed to remove taint from node due to node update, retrying... ,", "node name", node.Name, "taint key", taint.Key, "taint effect", taint.Effect)
264+
return ctrl.Result{RequeueAfter: time.Second}, nil
265+
} else if !apiErrors.IsNotFound(err) {
266+
r.Log.Error(err, "Failed to remove taint from node,", "node name", node.Name, "taint key", taint.Key, "taint effect", taint.Effect)
267+
return emptyResult, err
268+
}
269+
}
270+
r.Log.Info("out-of-service taint was removed", "Node Name", node.Name)
271+
commonEvents.NormalEvent(r.Recorder, node, utils.EventReasonRemoveOutOfServiceTaint, utils.EventMessageRemoveOutOfServiceTaint)
272+
}
273+
274+
// remove FAR remediation taint
275+
taint := utils.CreateRemediationTaint()
276+
if err := utils.RemoveTaint(r.Client, node.Name, taint); err != nil {
277+
if apiErrors.IsConflict(err) {
278+
r.Log.Info("Failed to remove taint from node due to node update, retrying... ,", "node name", node.Name, "taint key", taint.Key, "taint effect", taint.Effect)
279+
return ctrl.Result{RequeueAfter: time.Second}, nil
280+
281+
} else if !apiErrors.IsNotFound(err) {
282+
r.Log.Error(err, "Failed to remove taint from node,", "node name", node.Name, "taint key", taint.Key, "taint effect", taint.Effect)
283+
return emptyResult, err
284+
}
285+
}
286+
r.Log.Info("FAR remediation taint was removed", "Node Name", node.Name)
287+
commonEvents.NormalEvent(r.Recorder, node, utils.EventReasonRemoveRemediationTaint, utils.EventMessageRemoveRemediationTaint)
288+
return emptyResult, nil
289+
}
290+
291+
// stopAgentAndGetCrStatus log FAR CR status and stops the agent's parallel execution
292+
func (r *FenceAgentsRemediationReconciler) stopAgentAndGetCrStatus(far *v1alpha1.FenceAgentsRemediation, nodeName string) {
293+
processingCondition := meta.FindStatusCondition(far.Status.Conditions, commonConditions.ProcessingType).Status
294+
fenceAgentActionSucceededCondition := meta.FindStatusCondition(far.Status.Conditions, utils.FenceAgentActionSucceededType).Status
295+
succeededCondition := meta.FindStatusCondition(far.Status.Conditions, commonConditions.SucceededType).Status
296+
r.Log.Info("FAR didn't finish to remediate the node", "Node Name", nodeName, "Agent Name", far.Spec.Agent, "processing condition", processingCondition,
297+
"fenceAgentActionSucceeded condition", fenceAgentActionSucceededCondition, "succeeded condition", succeededCondition)
298+
r.Executor.Remove(far.GetUID())
299+
}
300+
301+
// removeFarFinalizer removes FAR finalizer, update CR, and emits an event
302+
func (r *FenceAgentsRemediationReconciler) removeFarFinalizer(far *v1alpha1.FenceAgentsRemediation) error {
303+
controllerutil.RemoveFinalizer(far, v1alpha1.FARFinalizer)
304+
if err := r.Client.Update(context.Background(), far); err != nil {
305+
return fmt.Errorf("failed to remove finalizer from CR - %w", err)
306+
}
307+
r.Log.Info("Finalizer was removed", "CR Name", far.Name)
308+
commonEvents.NormalEvent(r.Recorder, far, utils.EventReasonRemoveFinalizer, utils.EventMessageRemoveFinalizer)
309+
return nil
310+
}
311+
286312
// isTimedOutByNHC checks if NHC set a timeout annotation on the CR
287313
func isTimedOutByNHC(far *v1alpha1.FenceAgentsRemediation) bool {
288314
if far != nil && far.Annotations != nil && far.DeletionTimestamp == nil {

0 commit comments

Comments
 (0)