Skip to content

Commit

Permalink
Merge pull request #159 from razo7/more-info-e2e
Browse files Browse the repository at this point in the history
Remediate Ready Worker Node in E2E
  • Loading branch information
openshift-merge-bot[bot] authored Nov 14, 2024
2 parents 63b5399 + 556f51a commit d4f78cd
Show file tree
Hide file tree
Showing 3 changed files with 98 additions and 57 deletions.
110 changes: 65 additions & 45 deletions test/e2e/far_e2e_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ const (

//TODO: try to minimize timeout
// eventually parameters
timeoutLogs = "3m0s"
timeoutTaint = "2s" // Timeout for checking the FAR taint
timeoutReboot = "6m0s" // fencing with fence_aws should be completed within 6 minutes
timeoutAfterReboot = "5s" // Timeout for verifying steps after the node has been rebooted
Expand All @@ -48,56 +47,71 @@ const (
skipOOSREnvVarName = "SKIP_OOST_REMEDIATION_VERIFICATION"
)

var remediationTimes []time.Duration
var (
stopTesting bool
remediationTimes []time.Duration
)

var _ = Describe("FAR E2e", func() {
var (
fenceAgent, nodeIdentifierPrefix string
testShareParam map[v1alpha1.ParameterName]string
testNodeParam map[v1alpha1.ParameterName]map[v1alpha1.NodeName]string
selectedNode *corev1.Node
nodeName string
pod *corev1.Pod
startTime, nodeBootTimeBefore time.Time
err error
)
BeforeEach(func() {
// create FAR CR spec based on OCP platformn
clusterPlatform, err := e2eUtils.GetClusterInfo(configClient)
Expect(err).ToNot(HaveOccurred(), "can't identify the cluster platform")
log.Info("Begin e2e test", "Cluster name", string(clusterPlatform.Name), "PlatformType", string(clusterPlatform.Status.PlatformStatus.Type))

switch clusterPlatform.Status.PlatformStatus.Type {
case configv1.AWSPlatformType:
fenceAgent = fenceAgentAWS
nodeIdentifierPrefix = nodeIdentifierPrefixAWS
By("running fence_aws")
case configv1.BareMetalPlatformType:
fenceAgent = fenceAgentIPMI
nodeIdentifierPrefix = nodeIdentifierPrefixIPMI
By("running fence_ipmilan")
default:
Skip("FAR haven't been tested on this kind of cluster (non AWS or BareMetal)")
}
When("trying to identify cluster platform", func() {
It("should be AWS/BareMetal for finding the needed cluster and node parameters", func() {
// create FAR CR spec based on OCP platformn
clusterPlatform, err := e2eUtils.GetClusterInfo(configClient)
Expect(err).ToNot(HaveOccurred(), "can't identify the cluster platform")
log.Info("Getting Cluster Infromation", "Cluster name", string(clusterPlatform.Name), "PlatformType", string(clusterPlatform.Status.PlatformStatus.Type))

switch clusterPlatform.Status.PlatformStatus.Type {
case configv1.AWSPlatformType:
fenceAgent = fenceAgentAWS
nodeIdentifierPrefix = nodeIdentifierPrefixAWS
By("running fence_aws")
case configv1.BareMetalPlatformType:
fenceAgent = fenceAgentIPMI
nodeIdentifierPrefix = nodeIdentifierPrefixIPMI
By("running fence_ipmilan")
default:
stopTesting = true // Mark to stop subsequent tests
Fail("FAR haven't been tested on this kind of cluster (non AWS or BareMetal)")
}

testShareParam, err = buildSharedParameters(clusterPlatform, fenceAgentAction)
Expect(err).ToNot(HaveOccurred(), "can't get shared information")
testNodeParam, err = buildNodeParameters(clusterPlatform.Status.PlatformStatus.Type)
Expect(err).ToNot(HaveOccurred(), "can't get node information")
testShareParam, err = buildSharedParameters(clusterPlatform, fenceAgentAction)
Expect(err).ToNot(HaveOccurred(), "can't get shared information")
testNodeParam, err = buildNodeParameters(clusterPlatform.Status.PlatformStatus.Type)
Expect(err).ToNot(HaveOccurred(), "can't get node information")
})
})

// runFARTests is a utility function to run FAR tests.
// It accepts a remediation strategy and a condition to determine if the tests should be skipped.
runFARTests := func(remediationStrategy v1alpha1.RemediationStrategyType, skipCondition func() bool) {
var availableWorkerNodes *corev1.NodeList
var (
availableWorkerNodes *corev1.NodeList
selectedNode *corev1.Node
nodeName string
pod *corev1.Pod
startTime, nodeBootTimeBefore time.Time
)
BeforeEach(func() {
if stopTesting {
Skip("Skip testing due to unsupported platform")
}
if skipCondition() {
Skip("Skip this block due to unsupported condition")
}

if availableWorkerNodes == nil {
availableWorkerNodes = getAvailableWorkerNodes()
availableWorkerNodes = getReadyWorkerNodes()
}
if len(availableWorkerNodes.Items) < 1 {
Fail("There isn't an available (and Ready) worker node in the cluster")
}

selectedNode = pickRemediatedNode(availableWorkerNodes)
nodeName = selectedNode.Name
printNodeDetails(selectedNode, nodeIdentifierPrefix, testNodeParam)
Expand All @@ -107,7 +121,7 @@ var _ = Describe("FAR E2e", func() {
Expect(err).ToNot(HaveOccurred(), "failed to get boot time of the node")

// create tested pod which will be deleted by the far CR
pod = createTestedPod(nodeName, testContainerName)
pod = createTestedPod(nodeName)
DeferCleanup(cleanupTestedResources, pod)

// set the node as "unhealthy" by disabling kubelet
Expand Down Expand Up @@ -240,25 +254,31 @@ func buildNodeParameters(clusterPlatformType configv1.PlatformType) (map[v1alpha
return testNodeParam, nil
}

// getAvailableNodes a list of available worker nodes in the cluster
func getAvailableWorkerNodes() *corev1.NodeList {
availableNodes := &corev1.NodeList{}
// getReadyWorkerNodes returns a list of ready worker nodes in the cluster if any
func getReadyWorkerNodes() *corev1.NodeList {
availableWorkerNodes := &corev1.NodeList{}
selector := labels.NewSelector()
requirement, _ := labels.NewRequirement(medik8sLabels.WorkerRole, selection.Exists, []string{})
requirement, err := labels.NewRequirement(medik8sLabels.WorkerRole, selection.Exists, []string{})
Expect(err).To(BeNil())
selector = selector.Add(*requirement)
Expect(k8sClient.List(context.Background(), availableNodes, &client.ListOptions{LabelSelector: selector})).ToNot(HaveOccurred())
if len(availableNodes.Items) < 1 {
Fail("No worker nodes found in the cluster")
Expect(k8sClient.List(context.Background(), availableWorkerNodes, &client.ListOptions{LabelSelector: selector})).ToNot(HaveOccurred())

// Filter nodes to only include those in "Ready" state
readyWorkerNodes := &corev1.NodeList{}
for _, node := range availableWorkerNodes.Items {
for _, condition := range node.Status.Conditions {
if condition.Type == corev1.NodeReady && condition.Status == corev1.ConditionTrue {
readyWorkerNodes.Items = append(readyWorkerNodes.Items, node)
break // "Ready" was found
}
}
}
return availableNodes
return readyWorkerNodes
}

// pickRemediatedNode randomly returns a next remediated node from the current available nodes,
// and then the node is removed from the list of available nodes
func pickRemediatedNode(availableNodes *corev1.NodeList) *corev1.Node {
if len(availableNodes.Items) < 1 {
Fail("No available node found for remediation")
}
// Generate a random seed based on the current time
r := rand.New(rand.NewSource(time.Now().UnixNano()))
// Randomly select a worker node
Expand All @@ -270,14 +290,14 @@ func pickRemediatedNode(availableNodes *corev1.NodeList) *corev1.Node {
}

// createTestedPod creates tested pod which will be deleted by the far CR
func createTestedPod(nodeName, containerName string) *corev1.Pod {
func createTestedPod(nodeName string) *corev1.Pod {
pod := e2eUtils.GetPod(nodeName, testContainerName)
pod.Name = testPodName
pod.Namespace = testNsName
pod.Spec.Tolerations = []corev1.Toleration{
{
Key: v1alpha1.FARNoExecuteTaintKey,
Operator: corev1.TolerationOpEqual,
Operator: corev1.TolerationOpExists,
Effect: corev1.TaintEffectNoExecute,
},
}
Expand All @@ -303,7 +323,7 @@ func createFAR(nodeName string, agent string, sharedParameters map[v1alpha1.Para
SharedParameters: sharedParameters,
NodeParameters: nodeParameters,
RemediationStrategy: strategy,
RetryCount: 5,
RetryCount: 10,
RetryInterval: metav1.Duration{Duration: 20 * time.Second},
Timeout: metav1.Duration{Duration: 60 * time.Second},
},
Expand Down
43 changes: 32 additions & 11 deletions test/e2e/utils/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,23 @@ func GetSecretData(clientSet *kubernetes.Clientset, secretName, secretNamespace,
return string(secret.Data[secretData1]), string(secret.Data[secretData2]), nil
}

// getNodeRoleFromMachine return node role "master/control-plane" or "worker" from machine label if present, otherwise "unknown"
func getNodeRoleFromMachine(nodeLabels map[string]string) string {
machineLabelPrefixRole := "machine.openshift.io/cluster-api-machine-"
// look for machine.openshift.io/cluster-api-machine-role or machine.openshift.io/cluster-api-machine-type label
for _, labelKey := range []string{machineLabelPrefixRole + "role", machineLabelPrefixRole + "type"} {
if labelVal, isFound := nodeLabels[labelKey]; isFound {
if labelVal == "worker" {
return "worker"
}
if labelVal == "master" {
return "master/control-plane"
}
}
}
return "unknown"
}

// GetAWSNodeInfoList returns a list of the node names and their identification, e.g., AWS instance ID
func GetAWSNodeInfoList(machineClient *machineclient.Clientset) (map[v1alpha1.NodeName]string, error) {
// oc get machine -n openshift-machine-api MACHINE_NAME -o jsonpath='{.spec.providerID}'
Expand All @@ -62,24 +79,28 @@ func GetAWSNodeInfoList(machineClient *machineclient.Clientset) (map[v1alpha1.No
}

var missNodeMachineErr error
missNodeMachineNames := ""
// creates map for nodeName and AWS instance ID
for _, machine := range machineList.Items {
if machine.Status.NodeRef == nil || machine.Spec.ProviderID == nil {
if missNodeMachineErr != nil {
missNodeMachineErr = fmt.Errorf("machine %s is not associated with any node or it't provider ID is missing\n%w", machine.Spec.Name, missNodeMachineErr)
missNodeMachineNames += ", " + machine.ObjectMeta.GetName()
missNodeMachineErr = fmt.Errorf("machines %s are not associated with any node or there provider ID is missing", missNodeMachineNames)
} else {
missNodeMachineErr = fmt.Errorf("machine %s is not associated with any node or it't provider ID is missing", machine.Spec.Name)
missNodeMachineNames = machine.ObjectMeta.GetName()
missNodeMachineErr = fmt.Errorf("machine %s is not associated with any node or it's provider ID is missing", machine.ObjectMeta.GetName())
}
continue
} else {
nodeName := v1alpha1.NodeName(machine.Status.NodeRef.Name)
nodeRole := getNodeRoleFromMachine(machine.Labels)
providerID := *machine.Spec.ProviderID

// Get the instance ID from the provider ID aws:///us-east-1b/i-082ac37ab919a82c2 -> i-082ac37ab919a82c2
splitedProviderID := strings.Split(providerID, "/i-")
instanceID := "i-" + splitedProviderID[1]
nodeList[nodeName] = instanceID
fmt.Printf("node: %s, Role: %s, Instance ID: %s \n", nodeName, nodeRole, instanceID)
}
nodeName := v1alpha1.NodeName(machine.Status.NodeRef.Name)
providerID := *machine.Spec.ProviderID

// Get the instance ID from the provider ID aws:///us-east-1b/i-082ac37ab919a82c2 -> i-082ac37ab919a82c2
splitedProviderID := strings.Split(providerID, "/i-")
instanceID := "i-" + splitedProviderID[1]
nodeList[nodeName] = instanceID
fmt.Printf("node: %s Instance ID: %s \n", nodeName, instanceID)
}
return nodeList, missNodeMachineErr
}
Expand Down
2 changes: 1 addition & 1 deletion test/e2e/utils/command.go
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ func GetPod(nodeName, containerName string) *corev1.Pod {
Tolerations: []corev1.Toleration{
{
Key: v1alpha1.FARNoExecuteTaintKey,
Operator: corev1.TolerationOpEqual,
Operator: corev1.TolerationOpExists,
Effect: corev1.TaintEffectNoExecute,
},
{
Expand Down

0 comments on commit d4f78cd

Please sign in to comment.