From 867f672562324374901c8e853426ebe838ac87cc Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao6@illinois.edu>
Date: Fri, 17 Jan 2025 22:23:39 -0500
Subject: [PATCH] reliable way to reproduce error

---
 tests/runtime/test_stress_remote_runtime.py | 172 +++++++++++++++++++-
 1 file changed, 170 insertions(+), 2 deletions(-)

diff --git a/tests/runtime/test_stress_remote_runtime.py b/tests/runtime/test_stress_remote_runtime.py
index 77ba961f6c1e..a7dc72d99ae4 100644
--- a/tests/runtime/test_stress_remote_runtime.py
+++ b/tests/runtime/test_stress_remote_runtime.py
@@ -1,8 +1,22 @@
-"""Bash-related tests for the EventStreamRuntime, which connects to the ActionExecutor running in the sandbox."""
+"""Stress tests for the remote runtime.
+
+Example usage:
+
+```bash
+export ALLHANDS_API_KEY="YOUR_API_KEY"
+export RUNTIME=remote
+export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.staging.all-hands.dev"
+poetry run pytest -vvxss tests/runtime/test_stress_remote_runtime.py
+```
+
+"""
 
 import asyncio
+import json
 import os
 import tempfile
+import time
+from datetime import datetime
 from unittest.mock import MagicMock
 
 import pandas as pd
@@ -41,6 +55,8 @@
     'CodeActAgent': codeact_user_response,
 }
 
+SAVE_PERF_DEBUG = os.environ.get('SAVE_PERF_DEBUG', 'false').lower() in {'true', '1'}
+
 
 def get_config() -> AppConfig:
     assert (
@@ -140,7 +156,7 @@ def _process_instance(
         else:
             logger.info(f'Starting evaluation for instance {instance.instance_id}.')
 
-        runtime = create_runtime(config, headless_mode=False)
+        runtime = create_runtime(config, headless_mode=True)
         call_async_from_sync(runtime.connect)
 
         try:
@@ -224,3 +240,155 @@ def next_command(*args, **kwargs):
     )
 
     run_evaluation(instances, metadata, output_file, n_eval_workers, _process_instance)
+
+
+@pytest.mark.skipif(
+    TEST_IN_CI,
+    reason='This test should only be run locally, not in CI.',
+)
+def test_stress_remote_runtime_long_output_with_soft_and_hard_timeout():
+    """Stress the remote runtime terminal with repeated timeout scenarios.
+
+    Each iteration logs sandbox memory stats, then exercises a soft timeout, a
+    hard timeout with long output, Ctrl+C recovery, and a 1-minute stress-ng run.
+    """
+    config = get_config()
+    runtime = create_runtime(config, headless_mode=True)
+    call_async_from_sync(runtime.connect)
+    # Setup runs inside try/finally too, so a failed install still closes the runtime.
+    try:
+        _time_for_test = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
+
+        action = CmdRunAction(
+            command='sudo apt-get update && sudo apt-get install -y stress-ng'
+        )
+        logger.info(action, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert obs.exit_code == 0
+
+        # Run a command that generates long output multiple times
+        for i in range(10):
+            start_time = time.time()
+            iteration_stats = {'iteration': i, 'timestamp': start_time}
+
+            # Check overall system memory usage
+            mem_action = CmdRunAction(
+                'free -k | grep "Mem:" | awk \'{printf "Total: %8.1f MB, Used: %8.1f MB, Free: %8.1f MB, Available: %8.1f MB\\n", $2/1024, $3/1024, $4/1024, $7/1024}\''
+            )
+            mem_obs = runtime.run_action(mem_action)
+            assert mem_obs.exit_code == 0
+            logger.info(
+                f'System memory usage (iteration {i}): {mem_obs.content.strip()}'
+            )
+            # Parse "Key: <value> MB" pairs into memory_<key> float entries
+            mem_parts = mem_obs.content.strip().split(',')
+            for part in mem_parts:
+                key, value = part.strip().split(':')
+                iteration_stats[f'memory_{key.lower()}'] = float(
+                    value.replace('MB', '').strip()
+                )
+
+            # Check top memory-consuming processes
+            mem_action = CmdRunAction(
+                'ps aux | awk \'{printf "%8.1f MB  %s\\n", $6/1024, $0}\' | sort -nr | head -n 5'
+            )
+            mem_obs = runtime.run_action(mem_action)
+            assert mem_obs.exit_code == 0
+            _top_processes = [p.strip() for p in mem_obs.content.strip().split('\n')]
+            _report = '- ' + '\n- '.join(_top_processes)
+            logger.info(f'Top 5 memory-consuming processes (iteration {i}):\n{_report}')
+            iteration_stats['top_processes'] = _top_processes
+
+            # Check tmux memory usage (in MB)
+            mem_action = CmdRunAction(
+                'ps aux | awk \'{printf "%8.1f MB  %s\\n", $6/1024, $0}\' | sort -nr | grep "/usr/bin/tmux" | grep -v grep | awk \'{print $1}\''
+            )
+            mem_obs = runtime.run_action(mem_action)
+            assert mem_obs.exit_code == 0
+            logger.info(
+                f'Tmux memory usage (iteration {i}): {mem_obs.content.strip()} MB'
+            )
+            try:
+                iteration_stats['tmux_memory_mb'] = float(mem_obs.content.strip())
+            except (ValueError, AttributeError):
+                iteration_stats['tmux_memory_mb'] = None
+
+            # Check action_execution_server mem
+            mem_action = CmdRunAction(
+                'ps aux | awk \'{printf "%8.1f MB  %s\\n", $6/1024, $0}\' | sort -nr | grep "action_execution_server" | grep "/openhands/poetry" | grep -v grep | awk \'{print $1}\''
+            )
+            mem_obs = runtime.run_action(mem_action)
+            assert mem_obs.exit_code == 0
+            logger.info(
+                f'Action execution server memory usage (iteration {i}): {mem_obs.content.strip()} MB'
+            )
+            try:
+                iteration_stats['action_server_memory_mb'] = float(
+                    mem_obs.content.strip()
+                )
+            except (ValueError, AttributeError):
+                iteration_stats['action_server_memory_mb'] = None
+
+            # Test soft timeout
+            action = CmdRunAction(
+                'read -p "Do you want to continue? [Y/n] " answer; if [[ $answer == "Y" ]]; then echo "Proceeding with operation..."; echo "Operation completed successfully!"; else echo "Operation cancelled."; exit 1; fi'
+            )
+            obs = runtime.run_action(action)
+            assert 'Do you want to continue?' in obs.content
+            assert obs.exit_code == -1  # Command is still running, waiting for input
+
+            # Send the confirmation
+            action = CmdRunAction('Y', is_input=True)
+            obs = runtime.run_action(action)
+            assert 'Proceeding with operation...' in obs.content
+            assert 'Operation completed successfully!' in obs.content
+            assert obs.exit_code == 0
+            assert '[The command completed with exit code 0.]' in obs.metadata.suffix
+
+            # Test hard timeout w/ long output
+            # Generate long output with 1000 asterisks per line
+            action = CmdRunAction(
+                f'export i={i}; for j in $(seq 1 100); do echo "Line $j - Iteration $i - $(printf \'%1000s\' | tr " " "*")"; sleep 1; done'
+            )
+            action.set_hard_timeout(2)
+            obs = runtime.run_action(action)
+
+            # Verify the output
+            assert obs.exit_code == -1
+            assert f'Line 1 - Iteration {i}' in obs.content
+
+            # Because the hard timeout is triggered, the terminal will be in a weird
+            # state where it will not accept any new commands.
+            obs = runtime.run_action(CmdRunAction('ls'))
+            assert obs.exit_code == -1
+            assert 'The previous command is still running' in obs.metadata.suffix
+
+            # We need to send a Ctrl+C to reset the terminal.
+            obs = runtime.run_action(CmdRunAction('C-c', is_input=True))
+            assert obs.exit_code == 130
+
+            # Now make sure the terminal is in a good state
+            obs = runtime.run_action(CmdRunAction('ls'))
+            assert obs.exit_code == 0
+
+            # run stress-ng stress tests for 1 minute
+            action = CmdRunAction(command='stress-ng --all 1 -t 1m')
+            action.set_hard_timeout(120)
+            logger.info(action, extra={'msg_type': 'ACTION'})
+            obs = runtime.run_action(action)
+            logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+
+            duration = time.time() - start_time
+            iteration_stats['duration'] = duration
+            logger.info(f'Completed iteration {i} in {duration:.2f} seconds')
+
+            # Append stats to a JSONL file (opt-in via SAVE_PERF_DEBUG)
+            if SAVE_PERF_DEBUG:
+                with open(
+                    f'terminal_perf_analysis_result_{_time_for_test}.jsonl', 'a'
+                ) as f:
+                    json.dump(iteration_stats, f)
+                    f.write('\n')
+    finally:
+        runtime.close()