Just delete daemonset pods
Signed-off-by: naoki-take <[email protected]>
tkna committed Jun 26, 2024
1 parent 131033c commit 6c9df33
Showing 4 changed files with 53 additions and 73 deletions.
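
For orientation before the per-file diffs: this change stops evicting "updateStrategy: OnDelete" DaemonSet pods during reboot draining and simply deletes them, dropping the eviction retries, interval, and protected-namespace handling on this path. The snippet below is a minimal sketch of that delete-only behavior using client-go, not the committed code; the function name and the field-selector lookup are illustrative, and the real implementation enumerates pods via enumerateOnDeleteDaemonSetPods in op/reboot_decide.go.

package sketch

import (
    "context"

    apierrors "k8s.io/apimachinery/pkg/api/errors"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/kubernetes"
)

// deleteOnDeleteDaemonSetPodsSketch deletes every DaemonSet-owned pod bound to
// the given node. Unlike the former eviction path, it never calls the Eviction
// API and treats already-deleted pods as success.
func deleteOnDeleteDaemonSetPodsSketch(ctx context.Context, cs *kubernetes.Clientset, node string) error {
    pods, err := cs.CoreV1().Pods(metav1.NamespaceAll).List(ctx, metav1.ListOptions{
        FieldSelector: "spec.nodeName=" + node,
    })
    if err != nil {
        return err
    }
    for i := range pods.Items {
        pod := &pods.Items[i]
        owner := metav1.GetControllerOf(pod)
        if owner == nil || owner.Kind != "DaemonSet" {
            continue
        }
        // The committed code additionally skips DaemonSets whose updateStrategy is not OnDelete.
        if err := cs.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{}); err != nil && !apierrors.IsNotFound(err) {
            return err
        }
    }
    return nil
}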
op/reboot.go: 24 additions & 52 deletions
@@ -203,7 +203,7 @@ func (c rebootDrainStartCommand) Run(ctx context.Context, inf cke.Infrastructure

//

type rebootEvictDaemonSetPodOp struct {
type rebootDeleteDaemonSetPodOp struct {
finished bool

entries []*cke.RebootQueueEntry
@@ -214,118 +214,90 @@ type rebootEvictDaemonSetPodOp struct {
failedNodes []string
}

func RebootEvictDaemonSetPodOp(apiserver *cke.Node, entries []*cke.RebootQueueEntry, config *cke.Reboot) cke.InfoOperator {
return &rebootEvictDaemonSetPodOp{
func RebootDeleteDaemonSetPodOp(apiserver *cke.Node, entries []*cke.RebootQueueEntry, config *cke.Reboot) cke.InfoOperator {
return &rebootDeleteDaemonSetPodOp{
entries: entries,
config: config,
apiserver: apiserver,
}
}

type rebootEvictDaemonSetPodCommand struct {
entries []*cke.RebootQueueEntry
protectedNamespaces *metav1.LabelSelector
apiserver *cke.Node
evictAttempts int
evictInterval time.Duration
type rebootDeleteDaemonSetPodCommand struct {
entries []*cke.RebootQueueEntry
apiserver *cke.Node

notifyFailedNode func(string)
}

func (o *rebootEvictDaemonSetPodOp) Name() string {
return "reboot-evict-daemonset-pod"
func (o *rebootDeleteDaemonSetPodOp) Name() string {
return "reboot-delete-daemonset-pod"
}

func (o *rebootEvictDaemonSetPodOp) notifyFailedNode(node string) {
func (o *rebootDeleteDaemonSetPodOp) notifyFailedNode(node string) {
o.mu.Lock()
o.failedNodes = append(o.failedNodes, node)
o.mu.Unlock()
}

func (o *rebootEvictDaemonSetPodOp) Targets() []string {
func (o *rebootDeleteDaemonSetPodOp) Targets() []string {
ipAddresses := make([]string, len(o.entries))
for i, entry := range o.entries {
ipAddresses[i] = entry.Node
}
return ipAddresses
}

func (o *rebootEvictDaemonSetPodOp) Info() string {
func (o *rebootDeleteDaemonSetPodOp) Info() string {
if len(o.failedNodes) == 0 {
return ""
}
return fmt.Sprintf("failed to evict DaemonSet pods on some nodes: %v", o.failedNodes)
return fmt.Sprintf("failed to delete DaemonSet pods on some nodes: %v", o.failedNodes)
}

func (o *rebootEvictDaemonSetPodOp) NextCommand() cke.Commander {
func (o *rebootDeleteDaemonSetPodOp) NextCommand() cke.Commander {
if o.finished {
return nil
}
o.finished = true

attempts := 1
if o.config.EvictRetries != nil {
attempts = *o.config.EvictRetries + 1
}
interval := 0 * time.Second
if o.config.EvictInterval != nil {
interval = time.Second * time.Duration(*o.config.EvictInterval)
}

return rebootEvictDaemonSetPodCommand{
entries: o.entries,
protectedNamespaces: o.config.ProtectedNamespaces,
apiserver: o.apiserver,
notifyFailedNode: o.notifyFailedNode,
evictAttempts: attempts,
evictInterval: interval,
return rebootDeleteDaemonSetPodCommand{
entries: o.entries,
apiserver: o.apiserver,
notifyFailedNode: o.notifyFailedNode,
}
}

func (c rebootEvictDaemonSetPodCommand) Command() cke.Command {
func (c rebootDeleteDaemonSetPodCommand) Command() cke.Command {
ipAddresses := make([]string, len(c.entries))
for i, entry := range c.entries {
ipAddresses[i] = entry.Node
}
return cke.Command{
Name: "rebootEvictDaemonSetPodCommand",
Name: "rebootDeleteDaemonSetPodCommand",
Target: strings.Join(ipAddresses, ","),
}
}

func (c rebootEvictDaemonSetPodCommand) Run(ctx context.Context, inf cke.Infrastructure, _ string) error {
func (c rebootDeleteDaemonSetPodCommand) Run(ctx context.Context, inf cke.Infrastructure, _ string) error {
cs, err := inf.K8sClient(ctx, c.apiserver)
if err != nil {
return err
}

protected, err := listProtectedNamespaces(ctx, cs, c.protectedNamespaces)
if err != nil {
return err
}

// evict DaemonSet pod on each node
// cordon is unnecessary for DaemonSet pods, so dry-run eviction is also skipped.
// delete DaemonSet pod on each node
for _, entry := range c.entries {
// keep entry.Status as RebootStatusDraining and don't update it here.

log.Info("start eviction of DaemonSet pod", map[string]interface{}{
log.Info("start deletion of DaemonSet pod", map[string]interface{}{
"name": entry.Node,
})
err := evictOrDeleteOnDeleteDaemonSetPod(ctx, cs, entry.Node, protected, c.evictAttempts, c.evictInterval)
err := deleteOnDeleteDaemonSetPod(ctx, cs, entry.Node)
if err != nil {
log.Warn("eviction of DaemonSet pod failed", map[string]interface{}{
log.Warn("deletion of DaemonSet pod failed", map[string]interface{}{
"name": entry.Node,
log.FnError: err,
})
c.notifyFailedNode(entry.Node)
err = drainBackOff(ctx, inf, entry, err)
if err != nil {
return err
}
log.Info("eviction of DaemonSet pod succeeded", map[string]interface{}{
"name": entry.Node,
})
}
}

op/reboot_decide.go: 27 additions & 19 deletions
@@ -101,44 +101,35 @@ func enumerateOnDeleteDaemonSetPods(ctx context.Context, cs *kubernetes.Clientse
// dryRunEvictOrDeleteNodePod checks eviction or deletion of Pods on the specified Node can proceed.
// It returns an error if a running Pod exists or an eviction of the Pod in protected namespace failed.
func dryRunEvictOrDeleteNodePod(ctx context.Context, cs *kubernetes.Clientset, node string, protected map[string]bool) error {
return enumeratePods(ctx, cs, node,
doEvictOrDeleteNodePod(ctx, cs, node, protected, 0, time.Duration(0), true),
func(pod *corev1.Pod) error {
return fmt.Errorf("job-managed pod exists: %s/%s, phase=%s", pod.Namespace, pod.Name, pod.Status.Phase)
},
)
return doEvictOrDeleteNodePod(ctx, cs, node, protected, 0, 0, true)
}

// evictOrDeleteNodePod evicts or delete Pods on the specified Node.
// If a running Job Pod exists, this function returns an error.
func evictOrDeleteNodePod(ctx context.Context, cs *kubernetes.Clientset, node string, protected map[string]bool, attempts int, interval time.Duration) error {
return enumeratePods(ctx, cs, node,
doEvictOrDeleteNodePod(ctx, cs, node, protected, attempts, interval, false),
func(pod *corev1.Pod) error {
return fmt.Errorf("job-managed pod exists: %s/%s, phase=%s", pod.Namespace, pod.Name, pod.Status.Phase)
},
)
return doEvictOrDeleteNodePod(ctx, cs, node, protected, attempts, interval, false)
}

// evictOrDeleteOnDeleteDaemonSetPod evicts or delete Pods on the specified Node that are owned by "updateStrategy:OnDelete" DaemonSets.
func evictOrDeleteOnDeleteDaemonSetPod(ctx context.Context, cs *kubernetes.Clientset, node string, protected map[string]bool, attempts int, interval time.Duration) error {
return enumerateOnDeleteDaemonSetPods(ctx, cs, node, doEvictOrDeleteNodePod(ctx, cs, node, protected, attempts, interval, false))
// deleteOnDeleteDaemonSetPod evicts or delete Pods on the specified Node that are owned by "updateStrategy:OnDelete" DaemonSets.
func deleteOnDeleteDaemonSetPod(ctx context.Context, cs *kubernetes.Clientset, node string) error {
return doDeleteOnDeleteDaemonSetPod(ctx, cs, node)
}

// doEvictOrDeleteNodePod returns a pod handler that evicts or delete Pods on the specified Node.
// doEvictOrDeleteNodePod evicts or delete Pods on the specified Node.
// It first tries eviction.
// If the eviction failed and the Pod's namespace is not protected, it deletes the Pod.
// If the eviction failed and the Pod's namespace is protected, it retries after `interval` interval at most `attempts` times.
// If a running Job Pod exists, this function returns an error.
// If `dry` is true, it performs dry run and `attempts` and `interval` are ignored.
func doEvictOrDeleteNodePod(ctx context.Context, cs *kubernetes.Clientset, node string, protected map[string]bool, attempts int, interval time.Duration, dry bool) func(pod *corev1.Pod) error {
func doEvictOrDeleteNodePod(ctx context.Context, cs *kubernetes.Clientset, node string, protected map[string]bool, attempts int, interval time.Duration, dry bool) error {
var deleteOptions *metav1.DeleteOptions
if dry {
deleteOptions = &metav1.DeleteOptions{
DryRun: []string{"All"},
}
}

return func(pod *corev1.Pod) error {
return enumeratePods(ctx, cs, node, func(pod *corev1.Pod) error {
if dry && !protected[pod.Namespace] {
// in case of dry-run for Pods in non-protected namespace,
// return immediately because its "eviction or deletion" never fails
@@ -209,7 +200,24 @@ func doEvictOrDeleteNodePod(ctx context.Context, cs *kubernetes.Clientset, node
return fmt.Errorf("failed to evict pod %s/%s due to PDB: %w", pod.Namespace, pod.Name, err)
}
return nil
}
}, func(pod *corev1.Pod) error {
return fmt.Errorf("job-managed pod exists: %s/%s, phase=%s", pod.Namespace, pod.Name, pod.Status.Phase)
})
}

// doDeleteOnDeleteDaemonSetPod deletes 'OnDelete' DaemonSet pods on the specified Node.
func doDeleteOnDeleteDaemonSetPod(ctx context.Context, cs *kubernetes.Clientset, node string) error {
return enumerateOnDeleteDaemonSetPods(ctx, cs, node, func(pod *corev1.Pod) error {
err := cs.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{})
if err != nil && !apierrors.IsNotFound(err) {
return err
}
log.Info("deleted daemonset pod", map[string]interface{}{
"namespace": pod.Namespace,
"name": pod.Name,
})
return nil
})
}

// checkPodDeletion checks whether the evicted or deleted Pods are eventually deleted.
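
The doc comment on doEvictOrDeleteNodePod above describes the policy that is kept for ordinary pods: try the Eviction API first, fall back to outright deletion when the pod's namespace is not protected, and otherwise retry the eviction a configured number of times with a fixed interval. The sketch below illustrates that policy with client-go; it is not CKE's implementation (the names, the retry loop, and the omission of dry-run and Job-pod handling are simplifications for this note).

package sketch

import (
    "context"
    "time"

    corev1 "k8s.io/api/core/v1"
    policyv1 "k8s.io/api/policy/v1"
    apierrors "k8s.io/apimachinery/pkg/api/errors"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/kubernetes"
)

// evictOrDeleteSketch evicts a pod, deleting it instead when its namespace is
// not protected, and retrying the eviction for protected namespaces.
func evictOrDeleteSketch(ctx context.Context, cs *kubernetes.Clientset, pod *corev1.Pod, protected bool, attempts int, interval time.Duration) error {
    evict := func() error {
        return cs.PolicyV1().Evictions(pod.Namespace).Evict(ctx, &policyv1.Eviction{
            ObjectMeta: metav1.ObjectMeta{Name: pod.Name, Namespace: pod.Namespace},
        })
    }

    err := evict()
    switch {
    case err == nil || apierrors.IsNotFound(err):
        return nil
    case !protected:
        // Unprotected namespace: the eviction failed (e.g. blocked by a PDB), so delete outright.
        return cs.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{})
    }

    // Protected namespace: keep retrying the eviction.
    for i := 0; i < attempts; i++ {
        select {
        case <-ctx.Done():
            return ctx.Err()
        case <-time.After(interval):
        }
        if err = evict(); err == nil || apierrors.IsNotFound(err) {
            return nil
        }
    }
    return err
}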
server/strategy.go: 1 addition & 1 deletion
@@ -899,7 +899,7 @@ func rebootOps(c *cke.Cluster, constraints *cke.Constraints, rebootArgs DecideOp

if len(rebootArgs.DrainCompleted) > 0 {
// After eviction of normal pods, evict "OnDelete" daemonset pods.
ops = append(ops, op.RebootEvictDaemonSetPodOp(nf.HealthyAPIServer(), rebootArgs.DrainCompleted, &c.Reboot))
ops = append(ops, op.RebootDeleteDaemonSetPodOp(nf.HealthyAPIServer(), rebootArgs.DrainCompleted, &c.Reboot))
ops = append(ops, op.RebootRebootOp(nf.HealthyAPIServer(), rebootArgs.DrainCompleted, &c.Reboot))
}
if len(rebootArgs.NewlyDrained) > 0 {
server/strategy_test.go: 1 addition & 1 deletion
@@ -2637,7 +2637,7 @@ func TestDecideOps(t *testing.T) {
},
}),
ExpectedOps: []opData{
{"reboot-evict-daemonset-pod", 1},
{"reboot-delete-daemonset-pod", 1},
{"reboot-reboot", 1},
},
},
