Add possibility to change node drain duration (cnti-testcatalog#2102)
Add the possibility to change the duration of the node drain
Litmus chaos test. This is needed for CNFs with longer
startup/shutdown times.
Additionally, fix the Litmus waiter code and the timeout module,
and slightly refactor the LitmusManager module.

Refs: cnti-testcatalog#2098

Signed-off-by: Martin Matyas <[email protected]>
martin-mat authored Jul 10, 2024
1 parent 2a133f8 commit 7f8c910
Showing 4 changed files with 33 additions and 43 deletions.
1 change: 1 addition & 0 deletions USAGE.md
@@ -157,6 +157,7 @@ CNF_TESTSUITE_RESOURCE_CREATION_TIMEOUT=120
CNF_TESTSUITE_NODE_READINESS_TIMEOUT=240
CNF_TESTSUITE_POD_READINESS_TIMEOUT=180
CNF_TESTSUITE_LITMUS_CHAOS_TEST_TIMEOUT=1800
CNF_TESTSUITE_NODE_DRAIN_TOTAL_CHAOS_DURATION=90
```

#### Running The Linter in Developer Mode
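The new variable follows the same override pattern as the timeouts above: when set, its value in seconds replaces the built-in 90-second default. A minimal Crystal sketch of that resolution, mirroring the constant added to chaos_templates.cr below (the standalone script here is illustrative only):

```crystal
# Resolve the node drain chaos duration: prefer the environment override,
# otherwise fall back to the documented 90-second default.
node_drain_duration = ENV.has_key?("CNF_TESTSUITE_NODE_DRAIN_TOTAL_CHAOS_DURATION") ? ENV["CNF_TESTSUITE_NODE_DRAIN_TOTAL_CHAOS_DURATION"].to_i : 90

# e.g. with CNF_TESTSUITE_NODE_DRAIN_TOTAL_CHAOS_DURATION=300 exported,
# node_drain_duration is 300; without the variable it stays 90.
puts "Node drain chaos duration: #{node_drain_duration}s"
```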
65 changes: 26 additions & 39 deletions src/tasks/litmus_setup.cr
@@ -58,6 +58,7 @@ module LitmusManager
DOWNLOADED_LITMUS_FILE = "litmus-operator-downloaded.yaml"
MODIFIED_LITMUS_FILE = "litmus-operator-modified.yaml"
LITMUS_NAMESPACE = "litmus"
LITMUS_K8S_DOMAIN = "litmuschaos.io"



@@ -85,60 +86,46 @@
appNodeName_response.to_s
end

private def self.get_status_info(chaos_resource, test_name, output_format, namespace) : {Int32, String}
status_cmd = "kubectl get #{chaos_resource}.#{LITMUS_K8S_DOMAIN} #{test_name} -n #{namespace} -o '#{output_format}'"
Log.info { "Getting litmus status info: #{status_cmd}" }
status_code = Process.run("#{status_cmd}", shell: true, output: status_response = IO::Memory.new, error: stderr = IO::Memory.new).exit_status
status_response = status_response.to_s
Log.info { "status_code: #{status_code}, response: #{status_response}" }
{status_code, status_response}
end

private def self.get_status_info_until(chaos_resource, test_name, output_format, timeout, namespace, &block)
repeat_with_timeout(timeout: timeout, errormsg: "Litmus response timed-out") do
status_code, status_response = get_status_info(chaos_resource, test_name, output_format, namespace)
status_code == 0 && yield status_response
end
end

## wait_for_test will wait for the completion of litmus test
def self.wait_for_test(test_name, chaos_experiment_name, args, namespace : String = "default")
chaos_result_name = "#{test_name}-#{chaos_experiment_name}"

experimentStatus_cmd = "kubectl get chaosengine.litmuschaos.io #{test_name} -n #{namespace} -o jsonpath='{.status.engineStatus}'"
Log.for("wait_for_test").info { "Checking experiment status #{experimentStatus_cmd}" } if check_verbose(args)

## Wait for completion of chaosengine which indicates the completion of chaos
repeat_with_timeout(timeout: LITMUS_CHAOS_TEST_TIMEOUT, errormsg: "Litmus test has timed-out") do
status_code = Process.run("#{experimentStatus_cmd}",
shell: true,
output: experimentStatus_response = IO::Memory.new,
error: stderr = IO::Memory.new).exit_status
Log.for("wait_for_test").info { "#{chaos_experiment_name} status_code: #{status_code}" } if check_verbose(args)
experimentStatus = experimentStatus_response.to_s
Log.for("wait_for_test").info {"#{chaos_experiment_name} experiment status: " + experimentStatus}
if (experimentStatus != "Waiting for Job Creation" && experimentStatus != "Running" && experimentStatus != "Completed")
true
else
status_code == 0 && experimentStatus == "Completed"
end
Log.info { "wait_for_test: #{chaos_result_name}" }

get_status_info_until("chaosengine", test_name, "jsonpath={.status.engineStatus}", LITMUS_CHAOS_TEST_TIMEOUT, namespace) do |engineStatus|
["completed", "stopped"].includes?(engineStatus)
end

verdict_cmd = "kubectl get chaosresults.litmuschaos.io #{chaos_result_name} -n #{namespace} -o jsonpath='{.status.experimentStatus.verdict}'"
Log.for("wait_for_test").info { "Checking experiment verdict #{verdict_cmd}" } if check_verbose(args)
## Check the chaosresult verdict
repeat_with_timeout(timeout: GENERIC_OPERATION_TIMEOUT, errormsg: "Litmus verdict aquiring has timed-out") do
status_code = Process.run("#{verdict_cmd}",
shell: true,
output: verdict_response = IO::Memory.new,
error: stderr = IO::Memory.new).exit_status
Log.for("wait_for_test").info { "status_code: #{status_code}" } if check_verbose(args)
Log.for("wait_for_test").info { "verdict: #{verdict_response.to_s}" } if check_verbose(args)
verdict = verdict_response.to_s
status_code == 0 && verdict != "Awaited"
get_status_info_until("chaosresults", chaos_result_name, "jsonpath={.status.experimentStatus.verdict}", GENERIC_OPERATION_TIMEOUT, namespace) do |verdict|
verdict != "Awaited"
end
end

## check_chaos_verdict will check the verdict of chaosexperiment
def self.check_chaos_verdict(chaos_result_name, chaos_experiment_name, args, namespace : String = "default") : Bool
verdict_cmd = "kubectl get chaosresults.litmuschaos.io #{chaos_result_name} -n #{namespace} -o jsonpath='{.status.experimentStatus.verdict}'"
Log.for("LitmusManager.check_chaos_verdict").debug { "Checking experiment verdict with command: #{verdict_cmd}" }
status_code = Process.run("#{verdict_cmd}", shell: true, output: verdict_response = IO::Memory.new, error: stderr = IO::Memory.new).exit_status
Log.for("LitmusManager.check_chaos_verdict").debug { "status_code: #{status_code}; verdict: #{verdict_response.to_s}" }
verdict = verdict_response.to_s
_, verdict = get_status_info("chaosresult", chaos_result_name, "jsonpath={.status.experimentStatus.verdict}", namespace)

emoji_test_failed= "🗡️💀♻️"
if verdict == "Pass"
return true
else
Log.for("LitmusManager.check_chaos_verdict#details").debug do
verdict_details_cmd = "kubectl get chaosresults.litmuschaos.io #{chaos_result_name} -n #{namespace} -o json"
status_code = Process.run("#{verdict_details_cmd}", shell: true, output: verdict_details_response = IO::Memory.new, error: stderr = IO::Memory.new).exit_status
"#{verdict_details_response.to_s}"
status_code, verdict_details_response = get_status_info("chaosresult", chaos_result_name, "json", namespace)
"#{verdict_details_response}"
end

Log.for("LitmusManager.check_chaos_verdict").info {"#{chaos_experiment_name} chaos test failed: #{chaos_result_name}, verdict: #{verdict}"}
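The refactor funnels every status query through a single helper that shells out to kubectl and is polled by repeat_with_timeout until the caller's block accepts the response. As a rough, self-contained illustration (not the module itself; the resource names are made up), the engine-status wait added above amounts to repeatedly running a command like the following until it reports `completed` or `stopped`:

```crystal
# Illustrative only: shows the kubectl invocation get_status_info builds
# for the engine-status poll inside wait_for_test.
litmus_k8s_domain = "litmuschaos.io"
test_name         = "node-drain-engine"                 # made-up ChaosEngine name
namespace         = "litmus"                            # made-up namespace
output_format     = "jsonpath={.status.engineStatus}"

status_cmd = "kubectl get chaosengine.#{litmus_k8s_domain} #{test_name} -n #{namespace} -o '#{output_format}'"
puts status_cmd
# kubectl get chaosengine.litmuschaos.io node-drain-engine -n litmus -o 'jsonpath={.status.engineStatus}'

# The waiter treats the engine as finished once the reported status is
# "completed" or "stopped"; anything else keeps the poll loop going until
# LITMUS_CHAOS_TEST_TIMEOUT elapses.
```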
4 changes: 3 additions & 1 deletion src/tasks/utils/chaos_templates.cr
@@ -1,3 +1,5 @@
NODE_DRAIN_TOTAL_CHAOS_DURATION = ENV.has_key?("CNF_TESTSUITE_NODE_DRAIN_TOTAL_CHAOS_DURATION") ? ENV["CNF_TESTSUITE_NODE_DRAIN_TOTAL_CHAOS_DURATION"].to_i : 90

class ChaosTemplates
class PodIoStress
def initialize(
@@ -113,7 +115,7 @@ class ChaosTemplates
@deployment_label : String,
@deployment_label_value : String,
@app_nodename : String,
@total_chaos_duration : String = "90"
@total_chaos_duration : String = "#{NODE_DRAIN_TOTAL_CHAOS_DURATION}"
)
end
ECR.def_to_s("src/templates/chaos_templates/node_drain.yml.ecr")
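Note that the constant is an integer while the template argument stays a String: the value is interpolated into the default, presumably because the ECR template renders it straight into the chaos-engine YAML. A hedged sketch of that default behaviour (the class name is made up):

```crystal
NODE_DRAIN_TOTAL_CHAOS_DURATION = 90 # or the env override, as above

# Made-up stand-in for the node drain template class: the integer constant is
# interpolated, so the default stays a String just like an explicit argument.
class NodeDrainSketch
  getter total_chaos_duration

  def initialize(@total_chaos_duration : String = "#{NODE_DRAIN_TOTAL_CHAOS_DURATION}")
  end
end

puts NodeDrainSketch.new.total_chaos_duration        # => "90"
puts NodeDrainSketch.new("300").total_chaos_duration # => "300"
```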
6 changes: 3 additions & 3 deletions src/tasks/utils/timeouts.cr
@@ -9,7 +9,7 @@ LITMUS_CHAOS_TEST_TIMEOUT = ENV.has_key?("CNF_TESTSUITE_LITMUS_CHAOS_TEST_TIMEOU

def repeat_with_timeout(timeout, errormsg, reset_on_nil=false, delay=2, &block)
start_time = Time.utc
while (Time.utc - start_time).seconds < timeout
while (Time.utc - start_time).to_i < timeout
result = yield
if result.nil?
if reset_on_nil
@@ -21,8 +21,8 @@ def repeat_with_timeout(timeout, errormsg, reset_on_nil=false, delay=2, &block)
return true
end
sleep delay
Log.for("verbose").info { "Time left: #{timeout - (Time.utc - start_time).seconds} seconds" }
Log.for("verbose").info { "Time left: #{timeout - (Time.utc - start_time).to_i} seconds" }
end
Log.error { errormsg }
false
end
end
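The change to the while condition (and the matching log line) is the actual timeout fix: `Time::Span#seconds` returns only the seconds component of the span (0-59), so once more than a minute had elapsed the comparison effectively reset, and a loop with any timeout above 59 seconds could never expire; `Time::Span#to_i` returns the total number of whole elapsed seconds. A small, self-contained Crystal illustration:

```crystal
# Why the check moved from `.seconds` to `.to_i`.
elapsed = Time::Span.new(minutes: 2, seconds: 5)

puts elapsed.seconds # => 5    (seconds component only; wraps every minute)
puts elapsed.to_i    # => 125  (total whole seconds elapsed)

# With `.seconds`, `elapsed.seconds < timeout` stays true for any timeout
# above 59, so repeat_with_timeout could keep polling far past its deadline.
```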
