@@ -38,7 +38,6 @@ const (
38
38
39
39
//TODO: try to minimize timeout
40
40
// eventually parameters
41
- timeoutLogs = "3m0s"
42
41
timeoutTaint = "2s" // Timeout for checking the FAR taint
43
42
timeoutReboot = "6m0s" // fencing with fence_aws should be completed within 6 minutes
44
43
timeoutAfterReboot = "5s" // Timeout for verifying steps after the node has been rebooted
@@ -48,56 +47,71 @@ const (
48
47
skipOOSREnvVarName = "SKIP_OOST_REMEDIATION_VERIFICATION"
49
48
)
50
49
51
- var remediationTimes []time.Duration
50
+ var (
51
+ stopTesting bool
52
+ remediationTimes []time.Duration
53
+ )
52
54
53
55
var _ = Describe ("FAR E2e" , func () {
54
56
var (
55
57
fenceAgent , nodeIdentifierPrefix string
56
58
testShareParam map [v1alpha1.ParameterName ]string
57
59
testNodeParam map [v1alpha1.ParameterName ]map [v1alpha1.NodeName ]string
58
- selectedNode * corev1.Node
59
- nodeName string
60
- pod * corev1.Pod
61
- startTime , nodeBootTimeBefore time.Time
62
60
err error
63
61
)
64
- BeforeEach (func () {
65
- // create FAR CR spec based on OCP platformn
66
- clusterPlatform , err := e2eUtils .GetClusterInfo (configClient )
67
- Expect (err ).ToNot (HaveOccurred (), "can't identify the cluster platform" )
68
- log .Info ("Begin e2e test" , "Cluster name" , string (clusterPlatform .Name ), "PlatformType" , string (clusterPlatform .Status .PlatformStatus .Type ))
69
-
70
- switch clusterPlatform .Status .PlatformStatus .Type {
71
- case configv1 .AWSPlatformType :
72
- fenceAgent = fenceAgentAWS
73
- nodeIdentifierPrefix = nodeIdentifierPrefixAWS
74
- By ("running fence_aws" )
75
- case configv1 .BareMetalPlatformType :
76
- fenceAgent = fenceAgentIPMI
77
- nodeIdentifierPrefix = nodeIdentifierPrefixIPMI
78
- By ("running fence_ipmilan" )
79
- default :
80
- Skip ("FAR haven't been tested on this kind of cluster (non AWS or BareMetal)" )
81
- }
62
+ When ("trying to identify cluster platform" , func () {
63
+ It ("should be AWS/BareMetal for finding the needed cluster and node parameters" , func () {
64
+ // create FAR CR spec based on OCP platform
65
+ clusterPlatform , err := e2eUtils .GetClusterInfo (configClient )
66
+ Expect (err ).ToNot (HaveOccurred (), "can't identify the cluster platform" )
67
+ log .Info ("Getting Cluster Infromation" , "Cluster name" , string (clusterPlatform .Name ), "PlatformType" , string (clusterPlatform .Status .PlatformStatus .Type ))
68
+
69
+ switch clusterPlatform .Status .PlatformStatus .Type {
70
+ case configv1 .AWSPlatformType :
71
+ fenceAgent = fenceAgentAWS
72
+ nodeIdentifierPrefix = nodeIdentifierPrefixAWS
73
+ By ("running fence_aws" )
74
+ case configv1 .BareMetalPlatformType :
75
+ fenceAgent = fenceAgentIPMI
76
+ nodeIdentifierPrefix = nodeIdentifierPrefixIPMI
77
+ By ("running fence_ipmilan" )
78
+ default :
79
+ stopTesting = true // Mark to stop subsequent tests
80
+ Fail ("FAR haven't been tested on this kind of cluster (non AWS or BareMetal)" )
81
+ }
82
82
83
- testShareParam , err = buildSharedParameters (clusterPlatform , fenceAgentAction )
84
- Expect (err ).ToNot (HaveOccurred (), "can't get shared information" )
85
- testNodeParam , err = buildNodeParameters (clusterPlatform .Status .PlatformStatus .Type )
86
- Expect (err ).ToNot (HaveOccurred (), "can't get node information" )
83
+ testShareParam , err = buildSharedParameters (clusterPlatform , fenceAgentAction )
84
+ Expect (err ).ToNot (HaveOccurred (), "can't get shared information" )
85
+ testNodeParam , err = buildNodeParameters (clusterPlatform .Status .PlatformStatus .Type )
86
+ Expect (err ).ToNot (HaveOccurred (), "can't get node information" )
87
+ })
87
88
})
88
89
89
90
// runFARTests is a utility function to run FAR tests.
90
91
// It accepts a remediation strategy and a condition to determine if the tests should be skipped.
91
92
runFARTests := func (remediationStrategy v1alpha1.RemediationStrategyType , skipCondition func () bool ) {
92
- var availableWorkerNodes * corev1.NodeList
93
+ var (
94
+ availableWorkerNodes * corev1.NodeList
95
+ selectedNode * corev1.Node
96
+ nodeName string
97
+ pod * corev1.Pod
98
+ startTime , nodeBootTimeBefore time.Time
99
+ )
93
100
BeforeEach (func () {
101
+ if stopTesting {
102
+ Skip ("Skip testing due to unsupported platform" )
103
+ }
94
104
if skipCondition () {
95
105
Skip ("Skip this block due to unsupported condition" )
96
106
}
97
107
98
108
if availableWorkerNodes == nil {
99
- availableWorkerNodes = getAvailableWorkerNodes ()
109
+ availableWorkerNodes = getReadyWorkerNodes ()
100
110
}
111
+ if len (availableWorkerNodes .Items ) < 1 {
112
+ Fail ("There isn't an available (and Ready) worker node in the cluster" )
113
+ }
114
+
101
115
selectedNode = pickRemediatedNode (availableWorkerNodes )
102
116
nodeName = selectedNode .Name
103
117
printNodeDetails (selectedNode , nodeIdentifierPrefix , testNodeParam )
@@ -107,7 +121,7 @@ var _ = Describe("FAR E2e", func() {
107
121
Expect (err ).ToNot (HaveOccurred (), "failed to get boot time of the node" )
108
122
109
123
// create tested pod which will be deleted by the far CR
110
- pod = createTestedPod (nodeName , testContainerName )
124
+ pod = createTestedPod (nodeName )
111
125
DeferCleanup (cleanupTestedResources , pod )
112
126
113
127
// set the node as "unhealthy" by disabling kubelet
@@ -240,25 +254,31 @@ func buildNodeParameters(clusterPlatformType configv1.PlatformType) (map[v1alpha
240
254
return testNodeParam , nil
241
255
}
242
256
243
- // getAvailableNodes a list of available worker nodes in the cluster
244
- func getAvailableWorkerNodes () * corev1.NodeList {
245
- availableNodes := & corev1.NodeList {}
257
+ // getReadyWorkerNodes returns the worker nodes in the cluster whose Ready condition is True; the returned list may be empty
258
+ func getReadyWorkerNodes () * corev1.NodeList {
259
+ availableWorkerNodes := & corev1.NodeList {}
246
260
selector := labels .NewSelector ()
247
- requirement , _ := labels .NewRequirement (medik8sLabels .WorkerRole , selection .Exists , []string {})
261
+ requirement , err := labels .NewRequirement (medik8sLabels .WorkerRole , selection .Exists , []string {})
262
+ Expect (err ).To (BeNil ())
248
263
selector = selector .Add (* requirement )
249
- Expect (k8sClient .List (context .Background (), availableNodes , & client.ListOptions {LabelSelector : selector })).ToNot (HaveOccurred ())
250
- if len (availableNodes .Items ) < 1 {
251
- Fail ("No worker nodes found in the cluster" )
264
+ Expect (k8sClient .List (context .Background (), availableWorkerNodes , & client.ListOptions {LabelSelector : selector })).ToNot (HaveOccurred ())
265
+
266
+ // Filter nodes to only include those in "Ready" state
267
+ readyWorkerNodes := & corev1.NodeList {}
268
+ for _ , node := range availableWorkerNodes .Items {
269
+ for _ , condition := range node .Status .Conditions {
270
+ if condition .Type == corev1 .NodeReady && condition .Status == corev1 .ConditionTrue {
271
+ readyWorkerNodes .Items = append (readyWorkerNodes .Items , node )
272
+ break // "Ready" was found
273
+ }
274
+ }
252
275
}
253
- return availableNodes
276
+ return readyWorkerNodes
254
277
}
255
278
256
279
// pickRemediatedNode randomly picks the next node to remediate from the currently available nodes,
257
280
// and then the node is removed from the list of available nodes
258
281
func pickRemediatedNode (availableNodes * corev1.NodeList ) * corev1.Node {
259
- if len (availableNodes .Items ) < 1 {
260
- Fail ("No available node found for remediation" )
261
- }
262
282
// Generate a random seed based on the current time
263
283
r := rand .New (rand .NewSource (time .Now ().UnixNano ()))
264
284
// Randomly select a worker node
@@ -270,14 +290,14 @@ func pickRemediatedNode(availableNodes *corev1.NodeList) *corev1.Node {
270
290
}
271
291
272
292
// createTestedPod creates tested pod which will be deleted by the far CR
273
- func createTestedPod (nodeName , containerName string ) * corev1.Pod {
293
+ func createTestedPod (nodeName string ) * corev1.Pod {
274
294
pod := e2eUtils .GetPod (nodeName , testContainerName )
275
295
pod .Name = testPodName
276
296
pod .Namespace = testNsName
277
297
pod .Spec .Tolerations = []corev1.Toleration {
278
298
{
279
299
Key : v1alpha1 .FARNoExecuteTaintKey ,
280
- Operator : corev1 .TolerationOpEqual ,
300
+ Operator : corev1 .TolerationOpExists ,
281
301
Effect : corev1 .TaintEffectNoExecute ,
282
302
},
283
303
}
@@ -303,7 +323,7 @@ func createFAR(nodeName string, agent string, sharedParameters map[v1alpha1.Para
303
323
SharedParameters : sharedParameters ,
304
324
NodeParameters : nodeParameters ,
305
325
RemediationStrategy : strategy ,
306
- RetryCount : 5 ,
326
+ RetryCount : 10 ,
307
327
RetryInterval : metav1.Duration {Duration : 20 * time .Second },
308
328
Timeout : metav1.Duration {Duration : 60 * time .Second },
309
329
},
0 commit comments