@@ -38,7 +38,6 @@ const (
3838
3939 //TODO: try to minimize timeout
4040 // eventually parameters
41- timeoutLogs = "3m0s"
4241 timeoutTaint = "2s" // Timeout for checking the FAR taint
4342 timeoutReboot = "6m0s" // fencing with fence_aws should be completed within 6 minutes
4443 timeoutAfterReboot = "5s" // Timeout for verifying steps after the node has been rebooted
@@ -48,56 +47,71 @@ const (
4847 skipOOSREnvVarName = "SKIP_OOST_REMEDIATION_VERIFICATION"
4948)
5049
51- var remediationTimes []time.Duration
50+ var (
51+ stopTesting bool
52+ remediationTimes []time.Duration
53+ )
5254
5355var _ = Describe ("FAR E2e" , func () {
5456 var (
5557 fenceAgent , nodeIdentifierPrefix string
5658 testShareParam map [v1alpha1.ParameterName ]string
5759 testNodeParam map [v1alpha1.ParameterName ]map [v1alpha1.NodeName ]string
58- selectedNode * corev1.Node
59- nodeName string
60- pod * corev1.Pod
61- startTime , nodeBootTimeBefore time.Time
6260 err error
6361 )
64- BeforeEach (func () {
65- // create FAR CR spec based on OCP platformn
66- clusterPlatform , err := e2eUtils .GetClusterInfo (configClient )
67- Expect (err ).ToNot (HaveOccurred (), "can't identify the cluster platform" )
68- log .Info ("Begin e2e test" , "Cluster name" , string (clusterPlatform .Name ), "PlatformType" , string (clusterPlatform .Status .PlatformStatus .Type ))
69-
70- switch clusterPlatform .Status .PlatformStatus .Type {
71- case configv1 .AWSPlatformType :
72- fenceAgent = fenceAgentAWS
73- nodeIdentifierPrefix = nodeIdentifierPrefixAWS
74- By ("running fence_aws" )
75- case configv1 .BareMetalPlatformType :
76- fenceAgent = fenceAgentIPMI
77- nodeIdentifierPrefix = nodeIdentifierPrefixIPMI
78- By ("running fence_ipmilan" )
79- default :
80- Skip ("FAR haven't been tested on this kind of cluster (non AWS or BareMetal)" )
81- }
62+ When ("trying to identify cluster platform" , func () {
63+ It ("should be AWS/BareMetal for finding the needed cluster and node parameters" , func () {
64+ // Fetch the cluster info, then pick the fence agent and node identifier prefix that match the platform type
65+ clusterPlatform , err := e2eUtils .GetClusterInfo (configClient )
66+ Expect (err ).ToNot (HaveOccurred (), "can't identify the cluster platform" )
67+ log .Info ("Getting Cluster Infromation" , "Cluster name" , string (clusterPlatform .Name ), "PlatformType" , string (clusterPlatform .Status .PlatformStatus .Type ))
68+
69+ switch clusterPlatform .Status .PlatformStatus .Type {
70+ case configv1 .AWSPlatformType :
71+ fenceAgent = fenceAgentAWS
72+ nodeIdentifierPrefix = nodeIdentifierPrefixAWS
73+ By ("running fence_aws" )
74+ case configv1 .BareMetalPlatformType :
75+ fenceAgent = fenceAgentIPMI
76+ nodeIdentifierPrefix = nodeIdentifierPrefixIPMI
77+ By ("running fence_ipmilan" )
78+ default :
79+ stopTesting = true // Flag read by the BeforeEach in runFARTests, which skips its specs when it is set
80+ Fail ("FAR haven't been tested on this kind of cluster (non AWS or BareMetal)" )
81+ }
8282
83- testShareParam , err = buildSharedParameters (clusterPlatform , fenceAgentAction )
84- Expect (err ).ToNot (HaveOccurred (), "can't get shared information" )
85- testNodeParam , err = buildNodeParameters (clusterPlatform .Status .PlatformStatus .Type )
86- Expect (err ).ToNot (HaveOccurred (), "can't get node information" )
83+ testShareParam , err = buildSharedParameters (clusterPlatform , fenceAgentAction )
84+ Expect (err ).ToNot (HaveOccurred (), "can't get shared information" )
85+ testNodeParam , err = buildNodeParameters (clusterPlatform .Status .PlatformStatus .Type )
86+ Expect (err ).ToNot (HaveOccurred (), "can't get node information" )
87+ })
8788 })
8889
8990 // runFARTests is a utility function to run FAR tests.
9091 // It accepts a remediation strategy and a condition to determine if the tests should be skipped.
9192 runFARTests := func (remediationStrategy v1alpha1.RemediationStrategyType , skipCondition func () bool ) {
92- var availableWorkerNodes * corev1.NodeList
93+ var (
94+ availableWorkerNodes * corev1.NodeList
95+ selectedNode * corev1.Node
96+ nodeName string
97+ pod * corev1.Pod
98+ startTime , nodeBootTimeBefore time.Time
99+ )
93100 BeforeEach (func () {
101+ if stopTesting {
102+ Skip ("Skip testing due to unsupported platform" )
103+ }
94104 if skipCondition () {
95105 Skip ("Skip this block due to unsupported condition" )
96106 }
97107
98108 if availableWorkerNodes == nil {
99- availableWorkerNodes = getAvailableWorkerNodes ()
109+ availableWorkerNodes = getReadyWorkerNodes ()
100110 }
111+ if len (availableWorkerNodes .Items ) < 1 {
112+ Fail ("There isn't an available (and Ready) worker node in the cluster" )
113+ }
114+
101115 selectedNode = pickRemediatedNode (availableWorkerNodes )
102116 nodeName = selectedNode .Name
103117 printNodeDetails (selectedNode , nodeIdentifierPrefix , testNodeParam )
@@ -107,7 +121,7 @@ var _ = Describe("FAR E2e", func() {
107121 Expect (err ).ToNot (HaveOccurred (), "failed to get boot time of the node" )
108122
109123 // create tested pod which will be deleted by the far CR
110- pod = createTestedPod (nodeName , testContainerName )
124+ pod = createTestedPod (nodeName )
111125 DeferCleanup (cleanupTestedResources , pod )
112126
113127 // set the node as "unhealthy" by disabling kubelet
@@ -240,25 +254,31 @@ func buildNodeParameters(clusterPlatformType configv1.PlatformType) (map[v1alpha
240254 return testNodeParam , nil
241255}
242256
243- // getAvailableNodes a list of available worker nodes in the cluster
244- func getAvailableWorkerNodes () * corev1.NodeList {
245- availableNodes := & corev1.NodeList {}
257+ // getReadyWorkerNodes lists the nodes that carry the medik8sLabels.WorkerRole label and returns only those whose NodeReady condition is True; the returned list may be empty
258+ func getReadyWorkerNodes () * corev1.NodeList {
259+ availableWorkerNodes := & corev1.NodeList {}
246260 selector := labels .NewSelector ()
247- requirement , _ := labels .NewRequirement (medik8sLabels .WorkerRole , selection .Exists , []string {})
261+ requirement , err := labels .NewRequirement (medik8sLabels .WorkerRole , selection .Exists , []string {})
262+ Expect (err ).To (BeNil ())
248263 selector = selector .Add (* requirement )
249- Expect (k8sClient .List (context .Background (), availableNodes , & client.ListOptions {LabelSelector : selector })).ToNot (HaveOccurred ())
250- if len (availableNodes .Items ) < 1 {
251- Fail ("No worker nodes found in the cluster" )
264+ Expect (k8sClient .List (context .Background (), availableWorkerNodes , & client.ListOptions {LabelSelector : selector })).ToNot (HaveOccurred ())
265+
266+ // Keep only the nodes that report a NodeReady condition with status True
267+ readyWorkerNodes := & corev1.NodeList {}
268+ for _ , node := range availableWorkerNodes .Items {
269+ for _ , condition := range node .Status .Conditions {
270+ if condition .Type == corev1 .NodeReady && condition .Status == corev1 .ConditionTrue {
271+ readyWorkerNodes .Items = append (readyWorkerNodes .Items , node )
272+ break // Ready condition found, no need to inspect this node's remaining conditions
273+ }
274+ }
252275 }
253- return availableNodes
276+ return readyWorkerNodes
254277}
255278
256279// pickRemediatedNode randomly picks the next node to remediate from the currently available nodes,
257280// and then removes that node from the list of available nodes
258281func pickRemediatedNode (availableNodes * corev1.NodeList ) * corev1.Node {
259- if len (availableNodes .Items ) < 1 {
260- Fail ("No available node found for remediation" )
261- }
262282 // Generate a random seed based on the current time
263283 r := rand .New (rand .NewSource (time .Now ().UnixNano ()))
264284 // Randomly select a worker node
@@ -270,14 +290,14 @@ func pickRemediatedNode(availableNodes *corev1.NodeList) *corev1.Node {
270290}
271291
272292// createTestedPod creates tested pod which will be deleted by the far CR
273- func createTestedPod (nodeName , containerName string ) * corev1.Pod {
293+ func createTestedPod (nodeName string ) * corev1.Pod {
274294 pod := e2eUtils .GetPod (nodeName , testContainerName )
275295 pod .Name = testPodName
276296 pod .Namespace = testNsName
277297 pod .Spec .Tolerations = []corev1.Toleration {
278298 {
279299 Key : v1alpha1 .FARNoExecuteTaintKey ,
280- Operator : corev1 .TolerationOpEqual ,
300+ Operator : corev1 .TolerationOpExists ,
281301 Effect : corev1 .TaintEffectNoExecute ,
282302 },
283303 }
@@ -303,7 +323,7 @@ func createFAR(nodeName string, agent string, sharedParameters map[v1alpha1.Para
303323 SharedParameters : sharedParameters ,
304324 NodeParameters : nodeParameters ,
305325 RemediationStrategy : strategy ,
306- RetryCount : 5 ,
326+ RetryCount : 10 ,
307327 RetryInterval : metav1.Duration {Duration : 20 * time .Second },
308328 Timeout : metav1.Duration {Duration : 60 * time .Second },
309329 },
0 commit comments