diff --git a/api/v1alpha1/selfnoderemediationconfig_types.go b/api/v1alpha1/selfnoderemediationconfig_types.go index 381428f7..41ba8a12 100644 --- a/api/v1alpha1/selfnoderemediationconfig_types.go +++ b/api/v1alpha1/selfnoderemediationconfig_types.go @@ -28,6 +28,7 @@ const ( ConfigCRName = "self-node-remediation-config" defaultWatchdogPath = "/dev/watchdog" defaultIsSoftwareRebootEnabled = true + defaultMinPeersForRemediation = 1 ) // SelfNodeRemediationConfigSpec defines the desired state of SelfNodeRemediationConfig @@ -127,6 +128,11 @@ type SelfNodeRemediationConfigSpec struct { // CustomDsTolerations allows to add custom tolerations snr agents that are running on the ds in order to support remediation for different types of nodes. // +optional CustomDsTolerations []v1.Toleration `json:"customDsTolerations,omitempty"` + + // +kubebuilder:default:=1 + // +kubebuilder:validation:Minimum=0 + // Minimum number of peer workers/control nodes to attempt to contact before deciding if node is unhealthy or not + MinPeersForRemediation int `json:"minPeersForRemediation,omitempty"` } // SelfNodeRemediationConfigStatus defines the observed state of SelfNodeRemediationConfig @@ -170,6 +176,7 @@ func NewDefaultSelfNodeRemediationConfig() SelfNodeRemediationConfig { Spec: SelfNodeRemediationConfigSpec{ WatchdogFilePath: defaultWatchdogPath, IsSoftwareRebootEnabled: defaultIsSoftwareRebootEnabled, + MinPeersForRemediation: defaultMinPeersForRemediation, }, } } diff --git a/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml b/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml index bea94e5e..b943c923 100644 --- a/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml +++ b/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml @@ -127,6 +127,12 @@ spec: its peers. minimum: 1 type: integer + minPeersForRemediation: + default: 1 + description: Minimum number of peer workers/control nodes to attempt + to contact before deciding if node is unhealthy or not + minimum: 0 + type: integer peerApiServerTimeout: default: 5s description: |- diff --git a/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml b/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml index f4d47917..84f7deec 100644 --- a/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml +++ b/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml @@ -125,6 +125,12 @@ spec: its peers. minimum: 1 type: integer + minPeersForRemediation: + default: 1 + description: Minimum number of peer workers/control nodes to attempt + to contact before deciding if node is unhealthy or not + minimum: 0 + type: integer peerApiServerTimeout: default: 5s description: |- diff --git a/controllers/selfnoderemediationconfig_controller.go b/controllers/selfnoderemediationconfig_controller.go index b4f64842..1d5c6078 100644 --- a/controllers/selfnoderemediationconfig_controller.go +++ b/controllers/selfnoderemediationconfig_controller.go @@ -153,6 +153,7 @@ func (r *SelfNodeRemediationConfigReconciler) syncConfigDaemonSet(ctx context.Co data.Data["PeerRequestTimeout"] = snrConfig.Spec.PeerRequestTimeout.Nanoseconds() data.Data["MaxApiErrorThreshold"] = snrConfig.Spec.MaxApiErrorThreshold data.Data["EndpointHealthCheckUrl"] = snrConfig.Spec.EndpointHealthCheckUrl + data.Data["MinPeersForRemediation"] = snrConfig.Spec.MinPeersForRemediation data.Data["HostPort"] = snrConfig.Spec.HostPort data.Data["IsSoftwareRebootEnabled"] = fmt.Sprintf("\"%t\"", snrConfig.Spec.IsSoftwareRebootEnabled) diff --git a/controllers/tests/config/selfnoderemediationconfig_controller_test.go b/controllers/tests/config/selfnoderemediationconfig_controller_test.go index 8469e931..b272422a 100644 --- a/controllers/tests/config/selfnoderemediationconfig_controller_test.go +++ b/controllers/tests/config/selfnoderemediationconfig_controller_test.go @@ -238,6 +238,7 @@ var _ = Describe("SNR Config Test", func() { Expect(createdConfig.Spec.ApiServerTimeout.Seconds()).To(BeEquivalentTo(5)) Expect(createdConfig.Spec.ApiCheckInterval.Seconds()).To(BeEquivalentTo(15)) Expect(createdConfig.Spec.PeerUpdateInterval.Seconds()).To(BeEquivalentTo(15 * 60)) + Expect(createdConfig.Spec.MinPeersForRemediation).To(BeEquivalentTo(1)) }) }) diff --git a/controllers/tests/config/suite_test.go b/controllers/tests/config/suite_test.go index f25fe157..7374be0f 100644 --- a/controllers/tests/config/suite_test.go +++ b/controllers/tests/config/suite_test.go @@ -40,7 +40,10 @@ import ( "github.com/medik8s/self-node-remediation/pkg/apicheck" "github.com/medik8s/self-node-remediation/pkg/certificates" "github.com/medik8s/self-node-remediation/pkg/peers" + //+kubebuilder:scaffold:imports + "github.com/medik8s/self-node-remediation/pkg/reboot" + "github.com/medik8s/self-node-remediation/pkg/watchdog" ) // These tests use Ginkgo (BDD-style Go testing framework). Refer to @@ -52,6 +55,7 @@ var cancelFunc context.CancelFunc var k8sClient *shared.K8sClientWrapper var certReader certificates.CertStorageReader var managerReconciler *controllers.SelfNodeRemediationReconciler +var dummyDog watchdog.Watchdog func TestAPIs(t *testing.T) { RegisterFailHandler(Fail) @@ -120,12 +124,17 @@ var _ = BeforeSuite(func() { Expect(err).ToNot(HaveOccurred()) certReader = certificates.NewSecretCertStorage(k8sClient, ctrl.Log.WithName("SecretCertStorage"), shared.Namespace) + + dummyDog = watchdog.NewFake(true) + rebooter := reboot.NewWatchdogRebooter(dummyDog, ctrl.Log.WithName("rebooter")) + apiConnectivityCheckConfig := &apicheck.ApiConnectivityCheckConfig{ Log: ctrl.Log.WithName("api-check"), MyNodeName: shared.UnhealthyNodeName, CheckInterval: shared.ApiCheckInterval, MaxErrorsThreshold: shared.MaxErrorThreshold, Peers: peers, + Rebooter: rebooter, Cfg: cfg, CertReader: certReader, } diff --git a/install/self-node-remediation-deamonset.yaml b/install/self-node-remediation-deamonset.yaml index 36869fd8..0561b691 100644 --- a/install/self-node-remediation-deamonset.yaml +++ b/install/self-node-remediation-deamonset.yaml @@ -72,6 +72,8 @@ spec: value: {{.EndpointHealthCheckUrl}} - name: HOST_PORT value: "{{.HostPort}}" + - name: MIN_PEERS_FOR_REMEDIATION + value: "{{.MinPeersForRemediation}}" image: {{.Image}} imagePullPolicy: Always volumeMounts: diff --git a/pkg/apicheck/check.go b/pkg/apicheck/check.go index fda7e901..e3d2466d 100644 --- a/pkg/apicheck/check.go +++ b/pkg/apicheck/check.go @@ -50,6 +50,7 @@ type ApiConnectivityCheckConfig struct { PeerRequestTimeout time.Duration PeerHealthPort int MaxTimeForNoPeersResponse time.Duration + MinPeersForRemediation int } func New(config *ApiConnectivityCheckConfig, controlPlaneManager *controlplane.Manager) *ApiConnectivityCheck { @@ -129,12 +130,17 @@ func (c *ApiConnectivityCheck) getWorkerPeersResponse() peers.Response { c.config.Log.Info("Error count exceeds threshold, trying to ask other nodes if I'm healthy") peersToAsk := c.config.Peers.GetPeersAddresses(peers.Worker) - if peersToAsk == nil || len(peersToAsk) == 0 { - c.config.Log.Info("Peers list is empty and / or couldn't be retrieved from server, nothing we can do, so consider the node being healthy") - // TODO: maybe we need to check if this happens too much and reboot + if peersToAsk == nil && c.config.MinPeersForRemediation != 0 || len(peersToAsk) < c.config.MinPeersForRemediation { + c.config.Log.Info("Peers list is empty and / or less than the minimum required peers for remediation, so consider the node being healthy") + //todo maybe we need to check if this happens too much and reboot return peers.Response{IsHealthy: true, Reason: peers.HealthyBecauseNoPeersWereFound} } + //if MinPeersForRemediation == 0 and there are no peers to contact, assume node is unhealthy + if peersToAsk == nil || len(peersToAsk) == 0 { + return peers.Response{IsHealthy: false, Reason: peers.UnHealthyBecauseNodeIsIsolated} + } + apiErrorsResponsesSum := 0 nrAllPeers := len(peersToAsk) // peersToAsk is being reduced at every iteration, iterate until no peers left to ask