Skip to content

Commit

Permalink
Fix orphaned CR handling
Browse files Browse the repository at this point in the history
No remediator is deleting AND recreating nodes anymore, so it's safe
to delete the CR when its node was deleted.
For backwards compapibilty we keep the old behaviour for remediators
which mark their CR with the PermanentNodeDeletionExpected condition:
the CR will only be deleted once it succeeded.

Fixes the case when escalating remediation with MDR was used, and
CRs for other remediators weren't cleaned up after node deletion.

Signed-off-by: Marc Sluiter <[email protected]>
  • Loading branch information
slintes authored and openshift-cherrypick-robot committed Jan 9, 2025
1 parent 507b7d7 commit 2cc2d2e
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 16 deletions.
13 changes: 6 additions & 7 deletions controllers/nodehealthcheck_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -465,12 +465,14 @@ func (r *NodeHealthCheckReconciler) deleteOrphanedRemediationCRs(nhc *remediatio
}

// check conditions
// for some remediators (e.g. MDR) node deletion is expected. If so, wait until they are succeeded.
// for all other remediators, we can delete the CRs immediately after node deletion
permanentNodeDeletionExpectedCondition := getCondition(&cr, commonconditions.PermanentNodeDeletionExpectedType, log)
permanentNodeDeletionExpected := permanentNodeDeletionExpectedCondition != nil && permanentNodeDeletionExpectedCondition.Status == metav1.ConditionTrue
succeededCondition := getCondition(&cr, commonconditions.SucceededType, log)
succeeded := succeededCondition != nil && succeededCondition.Status == metav1.ConditionTrue
if !permanentNodeDeletionExpected || !succeeded {
// no node name change expected, or not succeeded yet
if permanentNodeDeletionExpected && !succeeded {
// node deletion is expected, but remediation not succeeded yet
return false
}

Expand Down Expand Up @@ -504,13 +506,10 @@ func (r *NodeHealthCheckReconciler) deleteOrphanedRemediationCRs(nhc *remediatio
resources.UpdateStatusNodeHealthy(nodeName, nhc)

if deleted, err := rm.DeleteRemediationCR(&cr, nhc); err != nil {
log.Error(err, "failed to delete remediation CR", "name", cr.GetName())
log.Error(err, "failed to delete orphaned remediation CR", "name", cr.GetName())
return err
} else if deleted {
permanentNodeDeletionExpectedCondition := getCondition(&cr, commonconditions.PermanentNodeDeletionExpectedType, log)
log.Info("deleted orphaned remediation CR", "name", cr.GetName(),
"reason", permanentNodeDeletionExpectedCondition.Reason,
"message", permanentNodeDeletionExpectedCondition.Message)
log.Info("deleted orphaned remediation CR", "name", cr.GetName(), "for deleted node", nodeName)
}

}
Expand Down
33 changes: 24 additions & 9 deletions controllers/nodehealthcheck_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,10 @@ var _ = Describe("Node Health Check CR", func() {
Expect(k8sClient.Update(context.Background(), &item)).To(Succeed())
Expect(k8sClient.Get(context.Background(), client.ObjectKeyFromObject(&item), &item)).To(Succeed())
}
Expect(k8sClient.Delete(context.Background(), &item)).To(Succeed())
err := k8sClient.Delete(context.Background(), &item)
if err != nil {
Expect(errors.IsNotFound(err)).To(BeTrue())
}
}
}

Expand Down Expand Up @@ -1288,15 +1291,15 @@ var _ = Describe("Node Health Check CR", func() {
Expect(k8sClient.Delete(context.Background(), node))
}

markCR := func() *unstructured.Unstructured {
By("marking CR as succeeded and permanent node deletion expected")
markCR := func(succeededStatus metav1.ConditionStatus) *unstructured.Unstructured {
By("marking CR for permanent node deletion expected")
cr := findRemediationCRForNHC("unhealthy-worker-node-1", underTest)
Expect(cr).ToNot(BeNil())

conditions := []interface{}{
map[string]interface{}{
"type": commonconditions.SucceededType,
"status": "True",
"status": string(succeededStatus),
"lastTransitionTime": time.Now().Format(time.RFC3339),
},
map[string]interface{}{
Expand All @@ -1322,18 +1325,30 @@ var _ = Describe("Node Health Check CR", func() {
Expect(underTest.Status.UnhealthyNodes).To(BeEmpty())
}

It("it should delete orphaned CR when CR was updated", func() {
ensureCRNotDeleted := func(cr *unstructured.Unstructured) {
By("ensuring CR is not deleted")
Consistently(func() error {
return k8sClient.Get(context.Background(), client.ObjectKeyFromObject(cr), cr)
}, "10s", "1s").ShouldNot(HaveOccurred())

// get updated NHC
Expect(k8sClient.Get(context.Background(), client.ObjectKeyFromObject(underTest), underTest)).To(Succeed())
Expect(underTest.Status.UnhealthyNodes).ToNot(BeEmpty())
}

It("it should delete orphaned CR when CR with expected node deletion succeeded", func() {
Expect(underTest.Status.UnhealthyNodes).To(HaveLen(1))
cr := markCR(metav1.ConditionFalse)
deleteNode()
time.Sleep(1 * time.Second)
cr := markCR()
ensureCRNotDeleted(cr)
cr = markCR(metav1.ConditionTrue)
expectCRDeletion(cr)
})

It("it should delete orphaned CR when node is deleted", func() {
Expect(underTest.Status.UnhealthyNodes).To(HaveLen(1))
cr := markCR()
time.Sleep(1 * time.Second)
cr := findRemediationCRForNHC("unhealthy-worker-node-1", underTest)
Expect(cr).ToNot(BeNil())
deleteNode()
expectCRDeletion(cr)
})
Expand Down

0 comments on commit 2cc2d2e

Please sign in to comment.