@@ -28,6 +28,7 @@ import (
2828 commonEvents "github.com/medik8s/common/pkg/events"
2929 commonResources "github.com/medik8s/common/pkg/resources"
3030
31+ corev1 "k8s.io/api/core/v1"
3132 apiErrors "k8s.io/apimachinery/pkg/api/errors"
3233 "k8s.io/apimachinery/pkg/api/meta"
3334 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -132,13 +133,22 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct
132133 return emptyResult , err
133134 }
134135
135- // Check NHC timeout annotation
136+ // Check NHC timeout annotation and stop the agent, since remediation is no longer relevant (most likely because fixed by a different remediator)
136137 if isTimedOutByNHC (far ) {
137138 r .Log .Info (utils .EventMessageRemediationStoppedByNHC )
138- r .Executor . Remove (far . GetUID () )
139+ r .stopAgentAndGetCrStatus (far , node . Name )
139140 utils .UpdateConditions (utils .RemediationInterruptedByNHC , far , r .Log )
140141 commonEvents .RemediationStoppedByNHC (r .Recorder , far )
141- return emptyResult , err
142+
143+ if far .GetDeletionTimestamp () != nil {
144+ if res , err := r .removeFarTaints (far , node ); res == emptyResult && err != nil {
145+ return res , err
146+ }
147+ // Removing FAR CR finalizer so NHC deletion of the remediation can be completed
148+ r .Log .Info ("Removing finalizer of timed-out remediation deleted by NHC" , "remediation name" , far .GetName ())
149+ return emptyResult , r .removeFarFinalizer (far )
150+ }
151+ return emptyResult , nil
142152 }
143153
144154 // Add finalizer when the CR is created
@@ -158,54 +168,12 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct
158168 r .Log .Info ("CR's deletion timestamp is not zero, and FAR finalizer exists" , "CR Name" , req .Name )
159169
160170 if ! meta .IsStatusConditionPresentAndEqual (far .Status .Conditions , commonConditions .SucceededType , metav1 .ConditionTrue ) {
161- processingCondition := meta .FindStatusCondition (far .Status .Conditions , commonConditions .ProcessingType ).Status
162- fenceAgentActionSucceededCondition := meta .FindStatusCondition (far .Status .Conditions , utils .FenceAgentActionSucceededType ).Status
163- succeededCondition := meta .FindStatusCondition (far .Status .Conditions , commonConditions .SucceededType ).Status
164- r .Log .Info ("FAR didn't finish remediate the node " , "CR Name" , req .Name , "processing condition" , processingCondition ,
165- "fenceAgentActionSucceeded condition" , fenceAgentActionSucceededCondition , "succeeded condition" , succeededCondition )
166- r .Executor .Remove (far .GetUID ())
171+ r .stopAgentAndGetCrStatus (far , node .Name )
167172 }
168-
169- // remove out-of-service taint when using OutOfServiceTaint remediation
170- if far .Spec .RemediationStrategy == v1alpha1 .OutOfServiceTaintRemediationStrategy {
171- r .Log .Info ("Removing out-of-service taint" , "Fence Agent" , far .Spec .Agent , "Node Name" , node .Name )
172- taint := utils .CreateOutOfServiceTaint ()
173- if err := utils .RemoveTaint (r .Client , node .Name , taint ); err != nil {
174- if apiErrors .IsConflict (err ) {
175- r .Log .Error (err , "Failed to remove taint from node due to node update, retrying... ," , "node name" , node .Name , "taint key" , taint .Key , "taint effect" , taint .Effect )
176- return ctrl.Result {RequeueAfter : time .Second }, nil
177- } else if ! apiErrors .IsNotFound (err ) {
178- r .Log .Error (err , "Failed to remove taint from node," , "node name" , node .Name , "taint key" , taint .Key , "taint effect" , taint .Effect )
179- return emptyResult , err
180- }
181- }
182- r .Log .Info ("out-of-service taint was removed" , "Node Name" , req .Name )
183- commonEvents .NormalEvent (r .Recorder , node , utils .EventReasonRemoveOutOfServiceTaint , utils .EventMessageRemoveOutOfServiceTaint )
173+ if res , err := r .removeFarTaints (far , node ); res == emptyResult && err != nil {
174+ return res , err
184175 }
185-
186- // remove node's taints
187- taint := utils .CreateRemediationTaint ()
188- if err := utils .RemoveTaint (r .Client , node .Name , taint ); err != nil {
189- if apiErrors .IsConflict (err ) {
190- r .Log .Info ("Failed to remove taint from node due to node update, retrying... ," , "node name" , node .Name , "taint key" , taint .Key , "taint effect" , taint .Effect )
191- return ctrl.Result {RequeueAfter : time .Second }, nil
192-
193- } else if ! apiErrors .IsNotFound (err ) {
194- r .Log .Error (err , "Failed to remove taint from node," , "node name" , node .Name , "taint key" , taint .Key , "taint effect" , taint .Effect )
195- return emptyResult , err
196- }
197- }
198-
199- r .Log .Info ("FAR remediation taint was removed" , "Node Name" , node .Name )
200- commonEvents .NormalEvent (r .Recorder , node , utils .EventReasonRemoveRemediationTaint , utils .EventMessageRemoveRemediationTaint )
201- // remove finalizer
202- controllerutil .RemoveFinalizer (far , v1alpha1 .FARFinalizer )
203- if err := r .Client .Update (context .Background (), far ); err != nil {
204- return emptyResult , fmt .Errorf ("failed to remove finalizer from CR - %w" , err )
205- }
206- r .Log .Info ("Finalizer was removed" , "CR Name" , req .Name )
207- commonEvents .NormalEvent (r .Recorder , far , utils .EventReasonRemoveFinalizer , utils .EventMessageRemoveFinalizer )
208- return emptyResult , nil
176+ return emptyResult , r .removeFarFinalizer (far )
209177 }
210178 // Add FAR (medik8s) remediation taint
211179 taintAdded , err := utils .AppendTaint (r .Client , node .Name , utils .CreateRemediationTaint ())
@@ -282,6 +250,62 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct
282250
283251 return emptyResult , nil
284252}
253+ func (r * FenceAgentsRemediationReconciler ) removeFarTaints (far * v1alpha1.FenceAgentsRemediation , node * corev1.Node ) (ctrl.Result , error ) {
254+ emptyResult := ctrl.Result {}
255+ // remove out-of-service taint when using OutOfServiceTaint remediation
256+ if far .Spec .RemediationStrategy == v1alpha1 .OutOfServiceTaintRemediationStrategy {
257+ r .Log .Info ("Removing out-of-service taint" , "Fence Agent" , far .Spec .Agent , "Node Name" , node .Name )
258+ taint := utils .CreateOutOfServiceTaint ()
259+ if err := utils .RemoveTaint (r .Client , node .Name , taint ); err != nil {
260+ if apiErrors .IsConflict (err ) {
261+ r .Log .Error (err , "Failed to remove taint from node due to node update, retrying... ," , "node name" , node .Name , "taint key" , taint .Key , "taint effect" , taint .Effect )
262+ return ctrl.Result {RequeueAfter : time .Second }, nil
263+ } else if ! apiErrors .IsNotFound (err ) {
264+ r .Log .Error (err , "Failed to remove taint from node," , "node name" , node .Name , "taint key" , taint .Key , "taint effect" , taint .Effect )
265+ return emptyResult , err
266+ }
267+ }
268+ r .Log .Info ("out-of-service taint was removed" , "Node Name" , node .Name )
269+ commonEvents .NormalEvent (r .Recorder , node , utils .EventReasonRemoveOutOfServiceTaint , utils .EventMessageRemoveOutOfServiceTaint )
270+ }
271+
272+ // remove node's taints
273+ taint := utils .CreateRemediationTaint ()
274+ if err := utils .RemoveTaint (r .Client , node .Name , taint ); err != nil {
275+ if apiErrors .IsConflict (err ) {
276+ r .Log .Info ("Failed to remove taint from node due to node update, retrying... ," , "node name" , node .Name , "taint key" , taint .Key , "taint effect" , taint .Effect )
277+ return ctrl.Result {RequeueAfter : time .Second }, nil
278+
279+ } else if ! apiErrors .IsNotFound (err ) {
280+ r .Log .Error (err , "Failed to remove taint from node," , "node name" , node .Name , "taint key" , taint .Key , "taint effect" , taint .Effect )
281+ return emptyResult , err
282+ }
283+ }
284+
285+ r .Log .Info ("FAR remediation taint was removed" , "Node Name" , node .Name )
286+ commonEvents .NormalEvent (r .Recorder , node , utils .EventReasonRemoveRemediationTaint , utils .EventMessageRemoveRemediationTaint )
287+ return emptyResult , nil
288+ }
289+
290+ func (r * FenceAgentsRemediationReconciler ) stopAgentAndGetCrStatus (far * v1alpha1.FenceAgentsRemediation , nodeName string ) {
291+ processingCondition := meta .FindStatusCondition (far .Status .Conditions , commonConditions .ProcessingType ).Status
292+ fenceAgentActionSucceededCondition := meta .FindStatusCondition (far .Status .Conditions , utils .FenceAgentActionSucceededType ).Status
293+ succeededCondition := meta .FindStatusCondition (far .Status .Conditions , commonConditions .SucceededType ).Status
294+ r .Log .Info ("FAR didn't finish to remediate the node" , "Node Name" , nodeName , "Agent Name" , far .Spec .Agent , "processing condition" , processingCondition ,
295+ "fenceAgentActionSucceeded condition" , fenceAgentActionSucceededCondition , "succeeded condition" , succeededCondition )
296+ r .Executor .Remove (far .GetUID ())
297+ }
298+
299+ // removeFarFinalizer removes FAR finalizer, update CR, and emits an event
300+ func (r * FenceAgentsRemediationReconciler ) removeFarFinalizer (far * v1alpha1.FenceAgentsRemediation ) error {
301+ controllerutil .RemoveFinalizer (far , v1alpha1 .FARFinalizer )
302+ if err := r .Client .Update (context .Background (), far ); err != nil {
303+ return fmt .Errorf ("failed to remove finalizer from CR - %w" , err )
304+ }
305+ r .Log .Info ("Finalizer was removed" , "CR Name" , far .Name )
306+ commonEvents .NormalEvent (r .Recorder , far , utils .EventReasonRemoveFinalizer , utils .EventMessageRemoveFinalizer )
307+ return nil
308+ }
285309
286310// isTimedOutByNHC checks if NHC set a timeout annotation on the CR
287311func isTimedOutByNHC (far * v1alpha1.FenceAgentsRemediation ) bool {
0 commit comments