Skip to content

Commit

Permalink
Add possibility to change node drain duration
Browse files Browse the repository at this point in the history
Add the possibility to change the duration of the node-drain
litmus chaos test. This is needed for CNFs with
longer startup/shutdown times.
Additionally, fix litmus waiter code and timeout module.
Slight refactor of LitmusManager module.

Refs: cnti-testcatalog#2098

Signed-off-by: Martin Matyas <[email protected]>
  • Loading branch information
martin-mat committed Jul 9, 2024
1 parent c4eb30f commit 950f12e
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 43 deletions.
1 change: 1 addition & 0 deletions USAGE.md
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ CNF_TESTSUITE_RESOURCE_CREATION_TIMEOUT=120
CNF_TESTSUITE_NODE_READINESS_TIMEOUT=240
CNF_TESTSUITE_POD_READINESS_TIMEOUT=180
CNF_TESTSUITE_LITMUS_CHAOS_TEST_TIMEOUT=1800
CNF_TESTSUITE_NODE_DRAIN_TOTAL_CHAOS_DURATION=90
```

#### Running The Linter in Developer Mode
Expand Down
65 changes: 26 additions & 39 deletions src/tasks/litmus_setup.cr
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ module LitmusManager
DOWNLOADED_LITMUS_FILE = "litmus-operator-downloaded.yaml"
MODIFIED_LITMUS_FILE = "litmus-operator-modified.yaml"
LITMUS_NAMESPACE = "litmus"
LITMUS_K8S_DOMAIN = "litmuschaos.io"



Expand Down Expand Up @@ -85,60 +86,46 @@ module LitmusManager
appNodeName_response.to_s
end

# Query the current status of a litmus chaos resource via kubectl.
#
# chaos_resource - resource kind without domain, e.g. "chaosengine" or "chaosresults"
#                  (the "#{LITMUS_K8S_DOMAIN}" suffix is appended here)
# test_name      - name of the resource instance to query
# output_format  - value passed to kubectl's -o flag (e.g. a jsonpath expression)
# namespace      - kubernetes namespace the resource lives in
#
# Returns {exit_code, stdout_text}; callers should check the exit code
# before interpreting the response text.
private def self.get_status_info(chaos_resource, test_name, output_format, namespace) : {Int32, String}
  status_cmd = "kubectl get #{chaos_resource}.#{LITMUS_K8S_DOMAIN} #{test_name} -n #{namespace} -o '#{output_format}'"
  Log.info { "Getting litmus status info: #{status_cmd}" }
  stdout = IO::Memory.new
  stderr = IO::Memory.new
  status_code = Process.run(status_cmd, shell: true, output: stdout, error: stderr).exit_status
  status_response = stdout.to_s
  Log.info { "status_code: #{status_code}, response: #{status_response}" }
  {status_code, status_response}
end

# Repeatedly poll get_status_info until the supplied block accepts the
# response or the timeout elapses.
#
# The block receives the kubectl stdout text and must return truthy to
# stop polling. A non-zero kubectl exit code counts as "not done yet"
# (the block is not consulted), so transient failures are retried.
# Timeout behavior and the return value are those of repeat_with_timeout.
private def self.get_status_info_until(chaos_resource, test_name, output_format, timeout, namespace, &block)
  repeat_with_timeout(timeout: timeout, errormsg: "Litmus response timed-out") do
    code, response = get_status_info(chaos_resource, test_name, output_format, namespace)
    if code == 0
      yield response
    else
      false
    end
  end
end

## wait_for_test will wait for the completion of litmus test
def self.wait_for_test(test_name, chaos_experiment_name, args, namespace : String = "default")
chaos_result_name = "#{test_name}-#{chaos_experiment_name}"

experimentStatus_cmd = "kubectl get chaosengine.litmuschaos.io #{test_name} -n #{namespace} -o jsonpath='{.status.engineStatus}'"
Log.for("wait_for_test").info { "Checking experiment status #{experimentStatus_cmd}" } if check_verbose(args)

## Wait for completion of chaosengine which indicates the completion of chaos
repeat_with_timeout(timeout: LITMUS_CHAOS_TEST_TIMEOUT, errormsg: "Litmus test has timed-out") do
status_code = Process.run("#{experimentStatus_cmd}",
shell: true,
output: experimentStatus_response = IO::Memory.new,
error: stderr = IO::Memory.new).exit_status
Log.for("wait_for_test").info { "#{chaos_experiment_name} status_code: #{status_code}" } if check_verbose(args)
experimentStatus = experimentStatus_response.to_s
Log.for("wait_for_test").info {"#{chaos_experiment_name} experiment status: " + experimentStatus}
if (experimentStatus != "Waiting for Job Creation" && experimentStatus != "Running" && experimentStatus != "Completed")
true
else
status_code == 0 && experimentStatus == "Completed"
end
Log.info { "wait_for_test: #{chaos_result_name}" }

get_status_info_until("chaosengine", test_name, "jsonpath={.status.engineStatus}", LITMUS_CHAOS_TEST_TIMEOUT, namespace) do |engineStatus|
["completed", "stopped"].includes?(engineStatus)
end

verdict_cmd = "kubectl get chaosresults.litmuschaos.io #{chaos_result_name} -n #{namespace} -o jsonpath='{.status.experimentStatus.verdict}'"
Log.for("wait_for_test").info { "Checking experiment verdict #{verdict_cmd}" } if check_verbose(args)
## Check the chaosresult verdict
repeat_with_timeout(timeout: GENERIC_OPERATION_TIMEOUT, errormsg: "Litmus verdict aquiring has timed-out") do
status_code = Process.run("#{verdict_cmd}",
shell: true,
output: verdict_response = IO::Memory.new,
error: stderr = IO::Memory.new).exit_status
Log.for("wait_for_test").info { "status_code: #{status_code}" } if check_verbose(args)
Log.for("wait_for_test").info { "verdict: #{verdict_response.to_s}" } if check_verbose(args)
verdict = verdict_response.to_s
status_code == 0 && verdict != "Awaited"
get_status_info_until("chaosresults", chaos_result_name, "jsonpath={.status.experimentStatus.verdict}", GENERIC_OPERATION_TIMEOUT, namespace) do |verdict|
verdict != "Awaited"
end
end

## check_chaos_verdict will check the verdict of chaosexperiment
def self.check_chaos_verdict(chaos_result_name, chaos_experiment_name, args, namespace : String = "default") : Bool
verdict_cmd = "kubectl get chaosresults.litmuschaos.io #{chaos_result_name} -n #{namespace} -o jsonpath='{.status.experimentStatus.verdict}'"
Log.for("LitmusManager.check_chaos_verdict").debug { "Checking experiment verdict with command: #{verdict_cmd}" }
status_code = Process.run("#{verdict_cmd}", shell: true, output: verdict_response = IO::Memory.new, error: stderr = IO::Memory.new).exit_status
Log.for("LitmusManager.check_chaos_verdict").debug { "status_code: #{status_code}; verdict: #{verdict_response.to_s}" }
verdict = verdict_response.to_s
_, verdict = get_status_info("chaosresult", chaos_result_name, "jsonpath={.status.experimentStatus.verdict}", namespace)

emoji_test_failed= "🗡️💀♻️"
if verdict == "Pass"
return true
else
Log.for("LitmusManager.check_chaos_verdict#details").debug do
verdict_details_cmd = "kubectl get chaosresults.litmuschaos.io #{chaos_result_name} -n #{namespace} -o json"
status_code = Process.run("#{verdict_details_cmd}", shell: true, output: verdict_details_response = IO::Memory.new, error: stderr = IO::Memory.new).exit_status
"#{verdict_details_response.to_s}"
status_code, verdict_details_response = get_status_info("chaosresult", chaos_result_name, "json", namespace)
"#{verdict_details_response}"
end

Log.for("LitmusManager.check_chaos_verdict").info {"#{chaos_experiment_name} chaos test failed: #{chaos_result_name}, verdict: #{verdict}"}
Expand Down
4 changes: 3 additions & 1 deletion src/tasks/utils/chaos_templates.cr
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# Duration (in seconds) of the node-drain litmus chaos test.
# Overridable via the CNF_TESTSUITE_NODE_DRAIN_TOTAL_CHAOS_DURATION
# environment variable; defaults to 90.
NODE_DRAIN_TOTAL_CHAOS_DURATION = ENV.fetch("CNF_TESTSUITE_NODE_DRAIN_TOTAL_CHAOS_DURATION", "90").to_i

class ChaosTemplates
class PodIoStress
def initialize(
Expand Down Expand Up @@ -113,7 +115,7 @@ class ChaosTemplates
@deployment_label : String,
@deployment_label_value : String,
@app_nodename : String,
@total_chaos_duration : String = "90"
@total_chaos_duration : String = "#{NODE_DRAIN_TOTAL_CHAOS_DURATION}"
)
end
ECR.def_to_s("src/templates/chaos_templates/node_drain.yml.ecr")
Expand Down
6 changes: 3 additions & 3 deletions src/tasks/utils/timeouts.cr
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ LITMUS_CHAOS_TEST_TIMEOUT = ENV.has_key?("CNF_TESTSUITE_LITMUS_CHAOS_TEST_TIMEOU

def repeat_with_timeout(timeout, errormsg, reset_on_nil=false, delay=2, &block)
start_time = Time.utc
while (Time.utc - start_time).seconds < timeout
while (Time.utc - start_time).to_i < timeout
result = yield
if result.nil?
if reset_on_nil
Expand All @@ -21,8 +21,8 @@ def repeat_with_timeout(timeout, errormsg, reset_on_nil=false, delay=2, &block)
return true
end
sleep delay
Log.for("verbose").info { "Time left: #{timeout - (Time.utc - start_time).seconds} seconds" }
Log.for("verbose").info { "Time left: #{timeout - (Time.utc - start_time).to_i} seconds" }
end
Log.error { errormsg }
false
end
end

0 comments on commit 950f12e

Please sign in to comment.