Skip to content

Commit d4f78cd

Browse files
Merge pull request #159 from razo7/more-info-e2e
Remediate Ready Worker Node in E2E
2 parents 63b5399 + 556f51a commit d4f78cd

File tree

3 files changed

+98
-57
lines changed

3 files changed

+98
-57
lines changed

test/e2e/far_e2e_test.go

Lines changed: 65 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@ const (
3838

3939
//TODO: try to minimize timeout
4040
// eventually parameters
41-
timeoutLogs = "3m0s"
4241
timeoutTaint = "2s" // Timeout for checking the FAR taint
4342
timeoutReboot = "6m0s" // fencing with fence_aws should be completed within 6 minutes
4443
timeoutAfterReboot = "5s" // Timeout for verifying steps after the node has been rebooted
@@ -48,56 +47,71 @@ const (
4847
skipOOSREnvVarName = "SKIP_OOST_REMEDIATION_VERIFICATION"
4948
)
5049

51-
var remediationTimes []time.Duration
50+
var (
51+
stopTesting bool
52+
remediationTimes []time.Duration
53+
)
5254

5355
var _ = Describe("FAR E2e", func() {
5456
var (
5557
fenceAgent, nodeIdentifierPrefix string
5658
testShareParam map[v1alpha1.ParameterName]string
5759
testNodeParam map[v1alpha1.ParameterName]map[v1alpha1.NodeName]string
58-
selectedNode *corev1.Node
59-
nodeName string
60-
pod *corev1.Pod
61-
startTime, nodeBootTimeBefore time.Time
6260
err error
6361
)
64-
BeforeEach(func() {
65-
// create FAR CR spec based on OCP platformn
66-
clusterPlatform, err := e2eUtils.GetClusterInfo(configClient)
67-
Expect(err).ToNot(HaveOccurred(), "can't identify the cluster platform")
68-
log.Info("Begin e2e test", "Cluster name", string(clusterPlatform.Name), "PlatformType", string(clusterPlatform.Status.PlatformStatus.Type))
69-
70-
switch clusterPlatform.Status.PlatformStatus.Type {
71-
case configv1.AWSPlatformType:
72-
fenceAgent = fenceAgentAWS
73-
nodeIdentifierPrefix = nodeIdentifierPrefixAWS
74-
By("running fence_aws")
75-
case configv1.BareMetalPlatformType:
76-
fenceAgent = fenceAgentIPMI
77-
nodeIdentifierPrefix = nodeIdentifierPrefixIPMI
78-
By("running fence_ipmilan")
79-
default:
80-
Skip("FAR haven't been tested on this kind of cluster (non AWS or BareMetal)")
81-
}
62+
When("trying to identify cluster platform", func() {
63+
It("should be AWS/BareMetal for finding the needed cluster and node parameters", func() {
64+
// create FAR CR spec based on OCP platform
65+
clusterPlatform, err := e2eUtils.GetClusterInfo(configClient)
66+
Expect(err).ToNot(HaveOccurred(), "can't identify the cluster platform")
67+
log.Info("Getting Cluster Infromation", "Cluster name", string(clusterPlatform.Name), "PlatformType", string(clusterPlatform.Status.PlatformStatus.Type))
68+
69+
switch clusterPlatform.Status.PlatformStatus.Type {
70+
case configv1.AWSPlatformType:
71+
fenceAgent = fenceAgentAWS
72+
nodeIdentifierPrefix = nodeIdentifierPrefixAWS
73+
By("running fence_aws")
74+
case configv1.BareMetalPlatformType:
75+
fenceAgent = fenceAgentIPMI
76+
nodeIdentifierPrefix = nodeIdentifierPrefixIPMI
77+
By("running fence_ipmilan")
78+
default:
79+
stopTesting = true // Mark to stop subsequent tests
80+
Fail("FAR haven't been tested on this kind of cluster (non AWS or BareMetal)")
81+
}
8282

83-
testShareParam, err = buildSharedParameters(clusterPlatform, fenceAgentAction)
84-
Expect(err).ToNot(HaveOccurred(), "can't get shared information")
85-
testNodeParam, err = buildNodeParameters(clusterPlatform.Status.PlatformStatus.Type)
86-
Expect(err).ToNot(HaveOccurred(), "can't get node information")
83+
testShareParam, err = buildSharedParameters(clusterPlatform, fenceAgentAction)
84+
Expect(err).ToNot(HaveOccurred(), "can't get shared information")
85+
testNodeParam, err = buildNodeParameters(clusterPlatform.Status.PlatformStatus.Type)
86+
Expect(err).ToNot(HaveOccurred(), "can't get node information")
87+
})
8788
})
8889

8990
// runFARTests is a utility function to run FAR tests.
9091
// It accepts a remediation strategy and a condition to determine if the tests should be skipped.
9192
runFARTests := func(remediationStrategy v1alpha1.RemediationStrategyType, skipCondition func() bool) {
92-
var availableWorkerNodes *corev1.NodeList
93+
var (
94+
availableWorkerNodes *corev1.NodeList
95+
selectedNode *corev1.Node
96+
nodeName string
97+
pod *corev1.Pod
98+
startTime, nodeBootTimeBefore time.Time
99+
)
93100
BeforeEach(func() {
101+
if stopTesting {
102+
Skip("Skip testing due to unsupported platform")
103+
}
94104
if skipCondition() {
95105
Skip("Skip this block due to unsupported condition")
96106
}
97107

98108
if availableWorkerNodes == nil {
99-
availableWorkerNodes = getAvailableWorkerNodes()
109+
availableWorkerNodes = getReadyWorkerNodes()
100110
}
111+
if len(availableWorkerNodes.Items) < 1 {
112+
Fail("There isn't an available (and Ready) worker node in the cluster")
113+
}
114+
101115
selectedNode = pickRemediatedNode(availableWorkerNodes)
102116
nodeName = selectedNode.Name
103117
printNodeDetails(selectedNode, nodeIdentifierPrefix, testNodeParam)
@@ -107,7 +121,7 @@ var _ = Describe("FAR E2e", func() {
107121
Expect(err).ToNot(HaveOccurred(), "failed to get boot time of the node")
108122

109123
// create tested pod which will be deleted by the far CR
110-
pod = createTestedPod(nodeName, testContainerName)
124+
pod = createTestedPod(nodeName)
111125
DeferCleanup(cleanupTestedResources, pod)
112126

113127
// set the node as "unhealthy" by disabling kubelet
@@ -240,25 +254,31 @@ func buildNodeParameters(clusterPlatformType configv1.PlatformType) (map[v1alpha
240254
return testNodeParam, nil
241255
}
242256

243-
// getAvailableNodes a list of available worker nodes in the cluster
244-
func getAvailableWorkerNodes() *corev1.NodeList {
245-
availableNodes := &corev1.NodeList{}
257+
// getReadyWorkerNodes returns a list of ready worker nodes in the cluster if any
258+
func getReadyWorkerNodes() *corev1.NodeList {
259+
availableWorkerNodes := &corev1.NodeList{}
246260
selector := labels.NewSelector()
247-
requirement, _ := labels.NewRequirement(medik8sLabels.WorkerRole, selection.Exists, []string{})
261+
requirement, err := labels.NewRequirement(medik8sLabels.WorkerRole, selection.Exists, []string{})
262+
Expect(err).To(BeNil())
248263
selector = selector.Add(*requirement)
249-
Expect(k8sClient.List(context.Background(), availableNodes, &client.ListOptions{LabelSelector: selector})).ToNot(HaveOccurred())
250-
if len(availableNodes.Items) < 1 {
251-
Fail("No worker nodes found in the cluster")
264+
Expect(k8sClient.List(context.Background(), availableWorkerNodes, &client.ListOptions{LabelSelector: selector})).ToNot(HaveOccurred())
265+
266+
// Filter nodes to only include those in "Ready" state
267+
readyWorkerNodes := &corev1.NodeList{}
268+
for _, node := range availableWorkerNodes.Items {
269+
for _, condition := range node.Status.Conditions {
270+
if condition.Type == corev1.NodeReady && condition.Status == corev1.ConditionTrue {
271+
readyWorkerNodes.Items = append(readyWorkerNodes.Items, node)
272+
break // "Ready" was found
273+
}
274+
}
252275
}
253-
return availableNodes
276+
return readyWorkerNodes
254277
}
255278

256279
// pickRemediatedNode randomly picks the next node to remediate from the currently available nodes,
257280
// and then removes that node from the list of available nodes
258281
func pickRemediatedNode(availableNodes *corev1.NodeList) *corev1.Node {
259-
if len(availableNodes.Items) < 1 {
260-
Fail("No available node found for remediation")
261-
}
262282
// Generate a random seed based on the current time
263283
r := rand.New(rand.NewSource(time.Now().UnixNano()))
264284
// Randomly select a worker node
@@ -270,14 +290,14 @@ func pickRemediatedNode(availableNodes *corev1.NodeList) *corev1.Node {
270290
}
271291

272292
// createTestedPod creates tested pod which will be deleted by the far CR
273-
func createTestedPod(nodeName, containerName string) *corev1.Pod {
293+
func createTestedPod(nodeName string) *corev1.Pod {
274294
pod := e2eUtils.GetPod(nodeName, testContainerName)
275295
pod.Name = testPodName
276296
pod.Namespace = testNsName
277297
pod.Spec.Tolerations = []corev1.Toleration{
278298
{
279299
Key: v1alpha1.FARNoExecuteTaintKey,
280-
Operator: corev1.TolerationOpEqual,
300+
Operator: corev1.TolerationOpExists,
281301
Effect: corev1.TaintEffectNoExecute,
282302
},
283303
}
@@ -303,7 +323,7 @@ func createFAR(nodeName string, agent string, sharedParameters map[v1alpha1.Para
303323
SharedParameters: sharedParameters,
304324
NodeParameters: nodeParameters,
305325
RemediationStrategy: strategy,
306-
RetryCount: 5,
326+
RetryCount: 10,
307327
RetryInterval: metav1.Duration{Duration: 20 * time.Second},
308328
Timeout: metav1.Duration{Duration: 60 * time.Second},
309329
},

test/e2e/utils/cluster.go

Lines changed: 32 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,23 @@ func GetSecretData(clientSet *kubernetes.Clientset, secretName, secretNamespace,
4848
return string(secret.Data[secretData1]), string(secret.Data[secretData2]), nil
4949
}
5050

51+
// getNodeRoleFromMachine returns the node role, "master/control-plane" or "worker", based on the machine role/type label if present; otherwise it returns "unknown"
52+
func getNodeRoleFromMachine(nodeLabels map[string]string) string {
53+
machineLabelPrefixRole := "machine.openshift.io/cluster-api-machine-"
54+
// look for machine.openshift.io/cluster-api-machine-role or machine.openshift.io/cluster-api-machine-type label
55+
for _, labelKey := range []string{machineLabelPrefixRole + "role", machineLabelPrefixRole + "type"} {
56+
if labelVal, isFound := nodeLabels[labelKey]; isFound {
57+
if labelVal == "worker" {
58+
return "worker"
59+
}
60+
if labelVal == "master" {
61+
return "master/control-plane"
62+
}
63+
}
64+
}
65+
return "unknown"
66+
}
67+
5168
// GetAWSNodeInfoList returns a list of the node names and their identification, e.g., AWS instance ID
5269
func GetAWSNodeInfoList(machineClient *machineclient.Clientset) (map[v1alpha1.NodeName]string, error) {
5370
// oc get machine -n openshift-machine-api MACHINE_NAME -o jsonpath='{.spec.providerID}'
@@ -62,24 +79,28 @@ func GetAWSNodeInfoList(machineClient *machineclient.Clientset) (map[v1alpha1.No
6279
}
6380

6481
var missNodeMachineErr error
82+
missNodeMachineNames := ""
6583
// creates map for nodeName and AWS instance ID
6684
for _, machine := range machineList.Items {
6785
if machine.Status.NodeRef == nil || machine.Spec.ProviderID == nil {
6886
if missNodeMachineErr != nil {
69-
missNodeMachineErr = fmt.Errorf("machine %s is not associated with any node or it't provider ID is missing\n%w", machine.Spec.Name, missNodeMachineErr)
87+
missNodeMachineNames += ", " + machine.ObjectMeta.GetName()
88+
missNodeMachineErr = fmt.Errorf("machines %s are not associated with any node or there provider ID is missing", missNodeMachineNames)
7089
} else {
71-
missNodeMachineErr = fmt.Errorf("machine %s is not associated with any node or it't provider ID is missing", machine.Spec.Name)
90+
missNodeMachineNames = machine.ObjectMeta.GetName()
91+
missNodeMachineErr = fmt.Errorf("machine %s is not associated with any node or it's provider ID is missing", machine.ObjectMeta.GetName())
7292
}
73-
continue
93+
} else {
94+
nodeName := v1alpha1.NodeName(machine.Status.NodeRef.Name)
95+
nodeRole := getNodeRoleFromMachine(machine.Labels)
96+
providerID := *machine.Spec.ProviderID
97+
98+
// Get the instance ID from the provider ID aws:///us-east-1b/i-082ac37ab919a82c2 -> i-082ac37ab919a82c2
99+
splitedProviderID := strings.Split(providerID, "/i-")
100+
instanceID := "i-" + splitedProviderID[1]
101+
nodeList[nodeName] = instanceID
102+
fmt.Printf("node: %s, Role: %s, Instance ID: %s \n", nodeName, nodeRole, instanceID)
74103
}
75-
nodeName := v1alpha1.NodeName(machine.Status.NodeRef.Name)
76-
providerID := *machine.Spec.ProviderID
77-
78-
// Get the instance ID from the provider ID aws:///us-east-1b/i-082ac37ab919a82c2 -> i-082ac37ab919a82c2
79-
splitedProviderID := strings.Split(providerID, "/i-")
80-
instanceID := "i-" + splitedProviderID[1]
81-
nodeList[nodeName] = instanceID
82-
fmt.Printf("node: %s Instance ID: %s \n", nodeName, instanceID)
83104
}
84105
return nodeList, missNodeMachineErr
85106
}

test/e2e/utils/command.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ func GetPod(nodeName, containerName string) *corev1.Pod {
193193
Tolerations: []corev1.Toleration{
194194
{
195195
Key: v1alpha1.FARNoExecuteTaintKey,
196-
Operator: corev1.TolerationOpEqual,
196+
Operator: corev1.TolerationOpExists,
197197
Effect: corev1.TaintEffectNoExecute,
198198
},
199199
{

0 commit comments

Comments
 (0)