diff --git a/points-all.yml b/points-all.yml index 99d91d03b..8e67cc224 100644 --- a/points-all.yml +++ b/points-all.yml @@ -100,7 +100,7 @@ tags: platform, dynamic - name: node_failure tags: platform, dynamic -- name: recover_from_node_failure +- name: worker_reboot_recovery tags: platform, platform:resilience, dynamic - name: oci_compliant tags: platform, platform:hardware_and_scheduling, dynamic diff --git a/points.yml b/points.yml index 732275475..e89c79673 100644 --- a/points.yml +++ b/points.yml @@ -109,7 +109,7 @@ - name: k8s_conformance tags: platform, dynamic -- name: recover_from_node_failure +- name: worker_reboot_recovery tags: platform, platform:resilience, dynamic - name: oci_compliant tags: platform, platform:hardware_and_scheduling, dynamic diff --git a/spec/platform/resilience_spec.cr b/spec/platform/resilience_spec.cr index 9e0a42f99..fc6d1fa14 100644 --- a/spec/platform/resilience_spec.cr +++ b/spec/platform/resilience_spec.cr @@ -4,25 +4,19 @@ require "./../../src/tasks/utils/utils.cr" describe "Platform" do before_all do - # LOGGING.debug `pwd` - # LOGGING.debug `echo $KUBECONFIG` - `./cnf-conformance samples_cleanup` - $?.success?.should be_true `./cnf-conformance setup` $?.success?.should be_true - `./cnf-conformance sample_coredns_with_wait_setup` - $?.success?.should be_true end - it "'node_failure' should pass if chaos_mesh node_failure tests prove the platform is resilient" do + it "'worker_reboot_recovery' should pass if platform successfully recovers after reboot", tags: "platform:worker_reboot_recovery" do if check_destructive puts "Tests running in destructive mode".colorize(:red) - response_s = `./cnf-conformance platform:node_failure poc destructive` + response_s = `./cnf-conformance platform:worker_reboot_recovery destructive` LOGGING.info response_s (/(PASSED: Node came back online)/ =~ response_s).should_not be_nil else - response_s = `./cnf-conformance platform:node_failure poc` + response_s = `./cnf-conformance platform:worker_reboot_recovery` LOGGING.info response_s - (/(PASSED: Nodes are resilient|Skipped)/ =~ response_s).should_not be_nil + (/Skipped/ =~ response_s).should_not be_nil end end end diff --git a/src/tasks/platform/platform.cr b/src/tasks/platform/platform.cr index 85b2ac759..9c36f7cc8 100644 --- a/src/tasks/platform/platform.cr +++ b/src/tasks/platform/platform.cr @@ -1,3 +1,4 @@ +# coding: utf-8 desc "Platform Tests" task "platform", ["k8s_conformance", "platform:resilience", "platform:hardware_and_scheduling"] do |_, args| VERBOSE_LOGGING.info "platform" if check_verbose(args) @@ -22,11 +23,13 @@ task "k8s_conformance" do |_, args| #TODO when in test mode --mode quick, prod mode no quick testrun = "" VERBOSE_LOGGING.info ENV["CRYSTAL_ENV"]? if check_verbose(args) - # if ENV["CRYSTAL_ENV"]? == "TEST" + if ENV["CRYSTAL_ENV"]? == "TEST" + LOGGING.info("Running Sonobuoy using Quick Mode") testrun = `#{sonobuoy} run --wait --mode quick` - # else - # testrun = `#{sonobuoy} run --wait` - # end + else + LOGGING.info("Running Sonobuoy Conformance") + testrun = `#{sonobuoy} run --wait` + end VERBOSE_LOGGING.info testrun if check_verbose(args) results = `results=$(#{sonobuoy} retrieve); #{sonobuoy} results $results` diff --git a/src/tasks/platform/resilience.cr b/src/tasks/platform/resilience.cr index 6183738d4..3eaaf1f15 100644 --- a/src/tasks/platform/resilience.cr +++ b/src/tasks/platform/resilience.cr @@ -5,7 +5,7 @@ require "../utils/utils.cr" namespace "platform" do desc "The CNF conformance suite checks to see if the CNFs are resilient to failures." - task "resilience", ["node_failure"] do |t, args| + task "resilience", ["worker_reboot_recovery"] do |t, args| VERBOSE_LOGGING.info "resilience" if check_verbose(args) VERBOSE_LOGGING.debug "resilience args.raw: #{args.raw}" if check_verbose(args) VERBOSE_LOGGING.debug "resilience args.named: #{args.named}" if check_verbose(args) @@ -13,9 +13,9 @@ namespace "platform" do end desc "Does the Platform recover the node and reschedule pods when a worker node fails" - task "node_failure" do |_, args| - unless check_poc(args) && check_destructive(args) - LOGGING.info "skipping node_failure: not in POC and destructive mode" + task "worker_reboot_recovery" do |_, args| + unless check_destructive(args) + LOGGING.info "skipping node_failure: not in destructive mode" puts "Skipped".colorize(:yellow) next end @@ -45,7 +45,7 @@ namespace "platform" do pod_ready = pod_status("reboot", "--field-selector spec.nodeName=#{worker_node}").split(",")[2] pod_ready_timeout = pod_ready_timeout - 1 if pod_ready_timeout == 0 - upsert_failed_task("recover_from_node_failure", "✖️ FAILURE: Failed to install reboot daemon") + upsert_failed_task("worker_reboot_recovery", "✖️ FAILURE: Failed to install reboot daemon") exit 1 end sleep 1 @@ -69,7 +69,7 @@ namespace "platform" do puts "Node Ready Status: #{node_ready}" node_failure_timeout = node_failure_timeout - 1 if node_failure_timeout == 0 - upsert_failed_task("recover_from_node_failure", "✖️ FAILURE: Node failed to go offline") + upsert_failed_task("worker_reboot_recovery", "✖️ FAILURE: Node failed to go offline") exit 1 end sleep 1 @@ -87,14 +87,14 @@ namespace "platform" do puts "Node Ready Status: #{node_ready}" node_online_timeout = node_online_timeout - 1 if node_online_timeout == 0 - upsert_failed_task("recover_from_node_failure", "✖️ FAILURE: Node failed to come back online") + upsert_failed_task("worker_reboot_recovery", "✖️ FAILURE: Node failed to come back online") exit 1 end sleep 1 end - emoji_chaos_network_loss="📶☠️" - resp = upsert_passed_task("recover_from_node_failure","✔️ PASSED: Node came back online #{emoji_chaos_network_loss}") + emoji_worker_reboot_recovery="" + resp = upsert_passed_task("worker_reboot_recovery","✔️ PASSED: Node came back online #{emoji_worker_reboot_recovery}") ensure