Skip to content

Commit

Permalink
Add minPeersForRemediation configuration value.
Browse files Browse the repository at this point in the history
With this one can specify the number of worker peers needed to
be able to contact before determining a node is unhealthy.

It covers the case in which there are 3 control plane nodes and a single
worker node, and yet you still want to be able to perform remediations
on that worker node

It has a default of 1, which maintains existing behaviors without
explicitly altering the value.
  • Loading branch information
novasbc committed Oct 2, 2024
1 parent 0bc6866 commit af2b099
Show file tree
Hide file tree
Showing 6 changed files with 26 additions and 3 deletions.
7 changes: 7 additions & 0 deletions api/v1alpha1/selfnoderemediationconfig_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ const (
ConfigCRName = "self-node-remediation-config"
defaultWatchdogPath = "/dev/watchdog"
defaultIsSoftwareRebootEnabled = true
defaultMinPeersForRemediation = 1
)

// SelfNodeRemediationConfigSpec defines the desired state of SelfNodeRemediationConfig
Expand Down Expand Up @@ -127,6 +128,11 @@ type SelfNodeRemediationConfigSpec struct {
// CustomDsTolerations allows to add custom tolerations snr agents that are running on the ds in order to support remediation for different types of nodes.
// +optional
CustomDsTolerations []v1.Toleration `json:"customDsTolerations,omitempty"`

// +kubebuilder:default:=1
// +kubebuilder:validation:Minimum=0
// Minimum number of peer workers/control nodes to attempt to contact before deciding if node is unhealthy or not
MinPeersForRemediation int `json:"minPeersForRemediation,omitempty"`
}

// SelfNodeRemediationConfigStatus defines the observed state of SelfNodeRemediationConfig
Expand Down Expand Up @@ -170,6 +176,7 @@ func NewDefaultSelfNodeRemediationConfig() SelfNodeRemediationConfig {
Spec: SelfNodeRemediationConfigSpec{
WatchdogFilePath: defaultWatchdogPath,
IsSoftwareRebootEnabled: defaultIsSoftwareRebootEnabled,
MinPeersForRemediation: defaultMinPeersForRemediation,
},
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,12 @@ spec:
its peers.
minimum: 1
type: integer
minPeersForRemediation:
default: 1
description: Minimum number of peer workers/control nodes to attempt
to contact before deciding if node is unhealthy or not
minimum: 0
type: integer
peerApiServerTimeout:
default: 5s
description: |-
Expand Down
1 change: 1 addition & 0 deletions controllers/selfnoderemediationconfig_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ func (r *SelfNodeRemediationConfigReconciler) syncConfigDaemonSet(ctx context.Co
data.Data["PeerRequestTimeout"] = snrConfig.Spec.PeerRequestTimeout.Nanoseconds()
data.Data["MaxApiErrorThreshold"] = snrConfig.Spec.MaxApiErrorThreshold
data.Data["EndpointHealthCheckUrl"] = snrConfig.Spec.EndpointHealthCheckUrl
data.Data["MinPeersForRemediation"] = snrConfig.Spec.MinPeersForRemediation
data.Data["HostPort"] = snrConfig.Spec.HostPort
data.Data["IsSoftwareRebootEnabled"] = fmt.Sprintf("\"%t\"", snrConfig.Spec.IsSoftwareRebootEnabled)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,7 @@ var _ = Describe("SNR Config Test", func() {
Expect(createdConfig.Spec.ApiServerTimeout.Seconds()).To(BeEquivalentTo(5))
Expect(createdConfig.Spec.ApiCheckInterval.Seconds()).To(BeEquivalentTo(15))
Expect(createdConfig.Spec.PeerUpdateInterval.Seconds()).To(BeEquivalentTo(15 * 60))
Expect(createdConfig.Spec.MinPeersForRemediation).To(BeEquivalentTo(1))
})
})

Expand Down
2 changes: 2 additions & 0 deletions install/self-node-remediation-deamonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ spec:
value: {{.EndpointHealthCheckUrl}}
- name: HOST_PORT
value: "{{.HostPort}}"
- name: MIN_PEERS_FOR_REMEDIATION
value: "{{.MinPeersForRemediation}}"
image: {{.Image}}
imagePullPolicy: Always
volumeMounts:
Expand Down
12 changes: 9 additions & 3 deletions pkg/apicheck/check.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ type ApiConnectivityCheckConfig struct {
PeerRequestTimeout time.Duration
PeerHealthPort int
MaxTimeForNoPeersResponse time.Duration
MinPeersForRemediation int
}

func New(config *ApiConnectivityCheckConfig, controlPlaneManager *controlplane.Manager) *ApiConnectivityCheck {
Expand Down Expand Up @@ -129,12 +130,17 @@ func (c *ApiConnectivityCheck) getWorkerPeersResponse() peers.Response {

c.config.Log.Info("Error count exceeds threshold, trying to ask other nodes if I'm healthy")
peersToAsk := c.config.Peers.GetPeersAddresses(peers.Worker)
if peersToAsk == nil || len(peersToAsk) == 0 {
c.config.Log.Info("Peers list is empty and / or couldn't be retrieved from server, nothing we can do, so consider the node being healthy")
// TODO: maybe we need to check if this happens too much and reboot
if peersToAsk == nil && c.config.MinPeersForRemediation != 0 || len(peersToAsk) < c.config.MinPeersForRemediation {
c.config.Log.Info("Peers list is empty and / or less than the minimum required peers for remediation, so consider the node being healthy")
//todo maybe we need to check if this happens too much and reboot
return peers.Response{IsHealthy: true, Reason: peers.HealthyBecauseNoPeersWereFound}
}

//if MinPeersForRemediation == 0 and there are no peers to contact, assume node is unhealthy
if peersToAsk == nil || len(peersToAsk) == 0 {
return peers.Response{IsHealthy: false, Reason: peers.UnHealthyBecauseNodeIsIsolated}
}

apiErrorsResponsesSum := 0
nrAllPeers := len(peersToAsk)
// peersToAsk is being reduced at every iteration, iterate until no peers left to ask
Expand Down

0 comments on commit af2b099

Please sign in to comment.