Skip to content

Commit

Permalink
Merge pull request #377 from cncf/feature/resilience_test#320
Browse files: browse the repository at this point in the history
Non-POC Worker Node Recover Test #320
  • Loading branch information
wvwatson authored Aug 27, 2020
2 parents 7b0cbf8 + 6ad9b37 commit a0edc6f
Show file tree
Hide file tree
Showing 5 changed files with 22 additions and 25 deletions.
2 changes: 1 addition & 1 deletion points-all.yml
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@
tags: platform, dynamic
- name: node_failure
tags: platform, dynamic
- name: recover_from_node_failure
- name: worker_reboot_recovery
tags: platform, platform:resilience, dynamic
- name: oci_compliant
tags: platform, platform:hardware_and_scheduling, dynamic
2 changes: 1 addition & 1 deletion points.yml
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@

- name: k8s_conformance
tags: platform, dynamic
- name: recover_from_node_failure
- name: worker_reboot_recovery
tags: platform, platform:resilience, dynamic
- name: oci_compliant
tags: platform, platform:hardware_and_scheduling, dynamic
14 changes: 4 additions & 10 deletions spec/platform/resilience_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,19 @@ require "./../../src/tasks/utils/utils.cr"

describe "Platform" do
before_all do
# LOGGING.debug `pwd`
# LOGGING.debug `echo $KUBECONFIG`
`./cnf-conformance samples_cleanup`
$?.success?.should be_true
`./cnf-conformance setup`
$?.success?.should be_true
`./cnf-conformance sample_coredns_with_wait_setup`
$?.success?.should be_true
end
it "'node_failure' should pass if chaos_mesh node_failure tests prove the platform is resilient" do
it "'worker_reboot_recovery' should pass if platform successfully recovers after reboot", tags: "platform:worker_reboot_recovery" do
if check_destructive
puts "Tests running in destructive mode".colorize(:red)
response_s = `./cnf-conformance platform:node_failure poc destructive`
response_s = `./cnf-conformance platform:worker_reboot_recovery destructive`
LOGGING.info response_s
(/(PASSED: Node came back online)/ =~ response_s).should_not be_nil
else
response_s = `./cnf-conformance platform:node_failure poc`
response_s = `./cnf-conformance platform:worker_reboot_recovery`
LOGGING.info response_s
(/(PASSED: Nodes are resilient|Skipped)/ =~ response_s).should_not be_nil
(/Skipped/ =~ response_s).should_not be_nil
end
end
end
Expand Down
11 changes: 7 additions & 4 deletions src/tasks/platform/platform.cr
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# coding: utf-8
desc "Platform Tests"
task "platform", ["k8s_conformance", "platform:resilience", "platform:hardware_and_scheduling"] do |_, args|
VERBOSE_LOGGING.info "platform" if check_verbose(args)
Expand All @@ -22,11 +23,13 @@ task "k8s_conformance" do |_, args|
#TODO when in test mode --mode quick, prod mode no quick
testrun = ""
VERBOSE_LOGGING.info ENV["CRYSTAL_ENV"]? if check_verbose(args)
# if ENV["CRYSTAL_ENV"]? == "TEST"
if ENV["CRYSTAL_ENV"]? == "TEST"
LOGGING.info("Running Sonobuoy using Quick Mode")
testrun = `#{sonobuoy} run --wait --mode quick`
# else
# testrun = `#{sonobuoy} run --wait`
# end
else
LOGGING.info("Running Sonobuoy Conformance")
testrun = `#{sonobuoy} run --wait`
end
VERBOSE_LOGGING.info testrun if check_verbose(args)

results = `results=$(#{sonobuoy} retrieve); #{sonobuoy} results $results`
Expand Down
18 changes: 9 additions & 9 deletions src/tasks/platform/resilience.cr
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,17 @@ require "../utils/utils.cr"

namespace "platform" do
desc "The CNF conformance suite checks to see if the CNFs are resilient to failures."
task "resilience", ["node_failure"] do |t, args|
task "resilience", ["worker_reboot_recovery"] do |t, args|
VERBOSE_LOGGING.info "resilience" if check_verbose(args)
VERBOSE_LOGGING.debug "resilience args.raw: #{args.raw}" if check_verbose(args)
VERBOSE_LOGGING.debug "resilience args.named: #{args.named}" if check_verbose(args)
stdout_score("platform:resilience")
end

desc "Does the Platform recover the node and reschedule pods when a worker node fails"
task "node_failure" do |_, args|
unless check_poc(args) && check_destructive(args)
LOGGING.info "skipping node_failure: not in POC and destructive mode"
task "worker_reboot_recovery" do |_, args|
unless check_destructive(args)
LOGGING.info "skipping node_failure: not in destructive mode"
puts "Skipped".colorize(:yellow)
next
end
Expand Down Expand Up @@ -45,7 +45,7 @@ namespace "platform" do
pod_ready = pod_status("reboot", "--field-selector spec.nodeName=#{worker_node}").split(",")[2]
pod_ready_timeout = pod_ready_timeout - 1
if pod_ready_timeout == 0
upsert_failed_task("recover_from_node_failure", "✖️ FAILURE: Failed to install reboot daemon")
upsert_failed_task("worker_reboot_recovery", "✖️ FAILURE: Failed to install reboot daemon")
exit 1
end
sleep 1
Expand All @@ -69,7 +69,7 @@ namespace "platform" do
puts "Node Ready Status: #{node_ready}"
node_failure_timeout = node_failure_timeout - 1
if node_failure_timeout == 0
upsert_failed_task("recover_from_node_failure", "✖️ FAILURE: Node failed to go offline")
upsert_failed_task("worker_reboot_recovery", "✖️ FAILURE: Node failed to go offline")
exit 1
end
sleep 1
Expand All @@ -87,14 +87,14 @@ namespace "platform" do
puts "Node Ready Status: #{node_ready}"
node_online_timeout = node_online_timeout - 1
if node_online_timeout == 0
upsert_failed_task("recover_from_node_failure", "✖️ FAILURE: Node failed to come back online")
upsert_failed_task("worker_reboot_recovery", "✖️ FAILURE: Node failed to come back online")
exit 1
end
sleep 1
end

emoji_chaos_network_loss="📶☠️"
resp = upsert_passed_task("recover_from_node_failure","✔️ PASSED: Node came back online #{emoji_chaos_network_loss}")
emoji_worker_reboot_recovery=""
resp = upsert_passed_task("worker_reboot_recovery","✔️ PASSED: Node came back online #{emoji_worker_reboot_recovery}")


ensure
Expand Down

0 comments on commit a0edc6f

Please sign in to comment.