@@ -28,6 +28,7 @@ import (
2828 commonEvents "github.com/medik8s/common/pkg/events"
2929 commonResources "github.com/medik8s/common/pkg/resources"
3030
31+ corev1 "k8s.io/api/core/v1"
3132 apiErrors "k8s.io/apimachinery/pkg/api/errors"
3233 "k8s.io/apimachinery/pkg/api/meta"
3334 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -132,13 +133,22 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct
132133 return emptyResult , err
133134 }
134135
135- // Check NHC timeout annotation
136+ // Check NHC timeout annotation and stop the agent, since remediation is no longer relevant (most likely because fixed by a different remediator)
136137 if isTimedOutByNHC (far ) {
137138 r .Log .Info (utils .EventMessageRemediationStoppedByNHC )
138- r .Executor . Remove (far . GetUID () )
139+ r .stopAgentAndGetCrStatus (far , node . Name )
139140 utils .UpdateConditions (utils .RemediationInterruptedByNHC , far , r .Log )
140141 commonEvents .RemediationStoppedByNHC (r .Recorder , far )
141- return emptyResult , err
142+
143+ if far .GetDeletionTimestamp () != nil {
144+ if res , err := r .removeFarTaints (far , node ); res != emptyResult || err != nil {
145+ return res , err
146+ }
147+ // Removing FAR CR finalizer so NHC deletion of the remediation can be completed
148+ r .Log .Info ("Removing finalizer of timed-out remediation deleted by NHC" , "remediation name" , far .GetName ())
149+ return emptyResult , r .removeFarFinalizer (far )
150+ }
151+ return emptyResult , nil
142152 }
143153
144154 // Add finalizer when the CR is created
@@ -158,54 +168,12 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct
158168 r .Log .Info ("CR's deletion timestamp is not zero, and FAR finalizer exists" , "CR Name" , req .Name )
159169
160170 if ! meta .IsStatusConditionPresentAndEqual (far .Status .Conditions , commonConditions .SucceededType , metav1 .ConditionTrue ) {
161- processingCondition := meta .FindStatusCondition (far .Status .Conditions , commonConditions .ProcessingType ).Status
162- fenceAgentActionSucceededCondition := meta .FindStatusCondition (far .Status .Conditions , utils .FenceAgentActionSucceededType ).Status
163- succeededCondition := meta .FindStatusCondition (far .Status .Conditions , commonConditions .SucceededType ).Status
164- r .Log .Info ("FAR didn't finish remediate the node " , "CR Name" , req .Name , "processing condition" , processingCondition ,
165- "fenceAgentActionSucceeded condition" , fenceAgentActionSucceededCondition , "succeeded condition" , succeededCondition )
166- r .Executor .Remove (far .GetUID ())
171+ r .stopAgentAndGetCrStatus (far , node .Name )
167172 }
168-
169- // remove out-of-service taint when using OutOfServiceTaint remediation
170- if far .Spec .RemediationStrategy == v1alpha1 .OutOfServiceTaintRemediationStrategy {
171- r .Log .Info ("Removing out-of-service taint" , "Fence Agent" , far .Spec .Agent , "Node Name" , node .Name )
172- taint := utils .CreateOutOfServiceTaint ()
173- if err := utils .RemoveTaint (r .Client , node .Name , taint ); err != nil {
174- if apiErrors .IsConflict (err ) {
175- r .Log .Error (err , "Failed to remove taint from node due to node update, retrying... ," , "node name" , node .Name , "taint key" , taint .Key , "taint effect" , taint .Effect )
176- return ctrl.Result {RequeueAfter : time .Second }, nil
177- } else if ! apiErrors .IsNotFound (err ) {
178- r .Log .Error (err , "Failed to remove taint from node," , "node name" , node .Name , "taint key" , taint .Key , "taint effect" , taint .Effect )
179- return emptyResult , err
180- }
181- }
182- r .Log .Info ("out-of-service taint was removed" , "Node Name" , req .Name )
183- commonEvents .NormalEvent (r .Recorder , node , utils .EventReasonRemoveOutOfServiceTaint , utils .EventMessageRemoveOutOfServiceTaint )
173+ if res , err := r .removeFarTaints (far , node ); res != emptyResult || err != nil {
174+ return res , err
184175 }
185-
186- // remove node's taints
187- taint := utils .CreateRemediationTaint ()
188- if err := utils .RemoveTaint (r .Client , node .Name , taint ); err != nil {
189- if apiErrors .IsConflict (err ) {
190- r .Log .Info ("Failed to remove taint from node due to node update, retrying... ," , "node name" , node .Name , "taint key" , taint .Key , "taint effect" , taint .Effect )
191- return ctrl.Result {RequeueAfter : time .Second }, nil
192-
193- } else if ! apiErrors .IsNotFound (err ) {
194- r .Log .Error (err , "Failed to remove taint from node," , "node name" , node .Name , "taint key" , taint .Key , "taint effect" , taint .Effect )
195- return emptyResult , err
196- }
197- }
198-
199- r .Log .Info ("FAR remediation taint was removed" , "Node Name" , node .Name )
200- commonEvents .NormalEvent (r .Recorder , node , utils .EventReasonRemoveRemediationTaint , utils .EventMessageRemoveRemediationTaint )
201- // remove finalizer
202- controllerutil .RemoveFinalizer (far , v1alpha1 .FARFinalizer )
203- if err := r .Client .Update (context .Background (), far ); err != nil {
204- return emptyResult , fmt .Errorf ("failed to remove finalizer from CR - %w" , err )
205- }
206- r .Log .Info ("Finalizer was removed" , "CR Name" , req .Name )
207- commonEvents .NormalEvent (r .Recorder , far , utils .EventReasonRemoveFinalizer , utils .EventMessageRemoveFinalizer )
208- return emptyResult , nil
176+ return emptyResult , r .removeFarFinalizer (far )
209177 }
210178 // Add FAR (medik8s) remediation taint
211179 taintAdded , err := utils .AppendTaint (r .Client , node .Name , utils .CreateRemediationTaint ())
@@ -283,6 +251,64 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct
283251 return emptyResult , nil
284252}
285253
254+ // removeFarTaints removes the out-of-service and remeidaiton taints if present, and returns an error on failure
255+ func (r * FenceAgentsRemediationReconciler ) removeFarTaints (far * v1alpha1.FenceAgentsRemediation , node * corev1.Node ) (ctrl.Result , error ) {
256+ emptyResult := ctrl.Result {}
257+ // remove out-of-service taint when using OutOfServiceTaint remediation
258+ if far .Spec .RemediationStrategy == v1alpha1 .OutOfServiceTaintRemediationStrategy {
259+ r .Log .Info ("Removing out-of-service taint" , "Fence Agent" , far .Spec .Agent , "Node Name" , node .Name )
260+ taint := utils .CreateOutOfServiceTaint ()
261+ if err := utils .RemoveTaint (r .Client , node .Name , taint ); err != nil {
262+ if apiErrors .IsConflict (err ) {
263+ r .Log .Error (err , "Failed to remove taint from node due to node update, retrying... ," , "node name" , node .Name , "taint key" , taint .Key , "taint effect" , taint .Effect )
264+ return ctrl.Result {RequeueAfter : time .Second }, nil
265+ } else if ! apiErrors .IsNotFound (err ) {
266+ r .Log .Error (err , "Failed to remove taint from node," , "node name" , node .Name , "taint key" , taint .Key , "taint effect" , taint .Effect )
267+ return emptyResult , err
268+ }
269+ }
270+ r .Log .Info ("out-of-service taint was removed" , "Node Name" , node .Name )
271+ commonEvents .NormalEvent (r .Recorder , node , utils .EventReasonRemoveOutOfServiceTaint , utils .EventMessageRemoveOutOfServiceTaint )
272+ }
273+
274+ // remove FAR remediation taint
275+ taint := utils .CreateRemediationTaint ()
276+ if err := utils .RemoveTaint (r .Client , node .Name , taint ); err != nil {
277+ if apiErrors .IsConflict (err ) {
278+ r .Log .Info ("Failed to remove taint from node due to node update, retrying... ," , "node name" , node .Name , "taint key" , taint .Key , "taint effect" , taint .Effect )
279+ return ctrl.Result {RequeueAfter : time .Second }, nil
280+
281+ } else if ! apiErrors .IsNotFound (err ) {
282+ r .Log .Error (err , "Failed to remove taint from node," , "node name" , node .Name , "taint key" , taint .Key , "taint effect" , taint .Effect )
283+ return emptyResult , err
284+ }
285+ }
286+ r .Log .Info ("FAR remediation taint was removed" , "Node Name" , node .Name )
287+ commonEvents .NormalEvent (r .Recorder , node , utils .EventReasonRemoveRemediationTaint , utils .EventMessageRemoveRemediationTaint )
288+ return emptyResult , nil
289+ }
290+
291+ // stopAgentAndGetCrStatus log FAR CR status and stops the agent's parallel execution
292+ func (r * FenceAgentsRemediationReconciler ) stopAgentAndGetCrStatus (far * v1alpha1.FenceAgentsRemediation , nodeName string ) {
293+ processingCondition := meta .FindStatusCondition (far .Status .Conditions , commonConditions .ProcessingType ).Status
294+ fenceAgentActionSucceededCondition := meta .FindStatusCondition (far .Status .Conditions , utils .FenceAgentActionSucceededType ).Status
295+ succeededCondition := meta .FindStatusCondition (far .Status .Conditions , commonConditions .SucceededType ).Status
296+ r .Log .Info ("FAR didn't finish to remediate the node" , "Node Name" , nodeName , "Agent Name" , far .Spec .Agent , "processing condition" , processingCondition ,
297+ "fenceAgentActionSucceeded condition" , fenceAgentActionSucceededCondition , "succeeded condition" , succeededCondition )
298+ r .Executor .Remove (far .GetUID ())
299+ }
300+
301+ // removeFarFinalizer removes FAR finalizer, update CR, and emits an event
302+ func (r * FenceAgentsRemediationReconciler ) removeFarFinalizer (far * v1alpha1.FenceAgentsRemediation ) error {
303+ controllerutil .RemoveFinalizer (far , v1alpha1 .FARFinalizer )
304+ if err := r .Client .Update (context .Background (), far ); err != nil {
305+ return fmt .Errorf ("failed to remove finalizer from CR - %w" , err )
306+ }
307+ r .Log .Info ("Finalizer was removed" , "CR Name" , far .Name )
308+ commonEvents .NormalEvent (r .Recorder , far , utils .EventReasonRemoveFinalizer , utils .EventMessageRemoveFinalizer )
309+ return nil
310+ }
311+
286312// isTimedOutByNHC checks if NHC set a timeout annotation on the CR
287313func isTimedOutByNHC (far * v1alpha1.FenceAgentsRemediation ) bool {
288314 if far != nil && far .Annotations != nil && far .DeletionTimestamp == nil {
0 commit comments