8 changes: 0 additions & 8 deletions .env.template
@@ -13,13 +13,5 @@ SERPER_API_KEY=
 WANDB_API_KEY=
 HF_TOKEN=
 
-# Azure VM Configuration (optional, for --vm flag)
-AZURE_SUBSCRIPTION_ID=
-AZURE_RESOURCE_GROUP_NAME=
-AZURE_LOCATION=
-NETWORK_SECURITY_GROUP_NAME=
-SSH_PUBLIC_KEY_PATH=
-SSH_PRIVATE_KEY_PATH=
-
 # Required: Set to your name for tracking who ran the benchmark
 EXECUTED_BY=
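The template above is a plain KEY=VALUE file. As a sketch of how such a file can be parsed — the harness itself presumably loads it with python-dotenv, and `load_env` is an illustrative name, not harness API:

```python
def load_env(path: str = ".env") -> dict:
    # Hypothetical minimal .env parser: skips blank lines and comments,
    # splits each remaining line on the first "=" only.
    env = {}
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            key, _, value = line.partition("=")
            env[key.strip()] = value.strip()
    return env
```

Note that keys with empty values (like `EXECUTED_BY=` before it is filled in) parse to empty strings rather than being dropped.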
45 changes: 2 additions & 43 deletions README.md
@@ -92,35 +92,7 @@ This repository provides a standardized evaluation harness for reproducible agen
 pip install anthropic
 ```
 
-6. **Optional: Azure VM Setup**
-If you plan to use Azure VMs for evaluation, add the following to your `.env`:
-
-```
-AZURE_SUBSCRIPTION_ID=your_subscription_id
-AZURE_RESOURCE_GROUP_NAME=your_resource_group
-AZURE_LOCATION=your_location
-NETWORK_SECURITY_GROUP_NAME=your_nsg_name
-SSH_PUBLIC_KEY_PATH=/path/to/your/ssh/key.pub
-SSH_PRIVATE_KEY_PATH=/path/to/your/ssh/key
-```
-
-- AZURE_SUBSCRIPTION_ID: This is the ID of your Azure subscription. Use the UUID.
-- AZURE_RESOURCE_GROUP_NAME: This is the name of the resource group in which your VMs should be created.
-- AZURE_LOCATION: e.g., "eastus" or "westus", etc.
-- NETWORK_SECURITY_GROUP_NAME
-  - You will need to create a NSG in Azure for your access.
-  - Ensure that the NSG has an Inbound security rule that permits your machine to access SSH (port 22).
-  - Enter the NSG's name here.
-- SSH_PUBLIC_KEY_PATH: This is your SSH key (on your local machine)
-- SSH_PRIVATE_KEY_PATH: This is your SSH key (on your local machine)
-
-Then run the following command to install the optional azure dependencies:
-
-```bash
-pip install -e ".[azure]"
-```
-
-7. **Optional: Docker Setup**
+6. **Optional: Docker Setup**
 If you plan to use Docker containers for isolated evaluation, make sure Docker is installed on your system. The harness will automatically build the required Docker image.
 
 ## Tests
@@ -404,7 +376,6 @@ hal-eval --benchmark <benchmark_name> --agent_dir <agent_directory> --agent_func
 - **`--upload`**: Upload results to HuggingFace Hub
 - **`--max_concurrent <number>`**: Number of parallel tasks (default: 1)
 - **`--conda_env_name <env_name>`**: Conda environment for agent execution
-- **`--vm`**: Run evaluation on Azure VMs
 - **`--docker`**: Run evaluation in Docker containers for isolation
 - **`--run_id <run_id>`**: Specify a run ID (useful for continuing runs)
 - **`--continue_run`**: Continue from a previous run (requires run_id)
@@ -434,19 +405,7 @@ hal-eval --benchmark usaco \
   -A model_name=gpt-4o-mini-2024-07-18
 ```
 
-3. **Running USACO on Azure VM:**
-
-```bash
-hal-eval --benchmark usaco \
-  --agent_dir agents/usaco_example_agent/ \
-  --agent_function main.run \
-  --agent_name "USACO Solver (gpt-4o-2024-11-20)" \
-  --vm \
-  --max_concurrent 5 \
-  -A model_name=gpt-4o-2024-11-20
-```
-
-4. **Running USACO with Amazon Bedrock models:**
+3. **Running USACO with Amazon Bedrock models:**
 
 ```bash
 hal-eval --benchmark usaco \
5 changes: 1 addition & 4 deletions agents/README.md
@@ -32,9 +32,7 @@ def run(input: dict[str, dict], **kwargs) -> dict[str, str]:
 
 ## General Requirements
 
-1. **Dependencies**: List all dependencies in `requirements.txt`. These will be installed:
-   - On VMs if `--vm` flag is used
-   - If you run evaluations locally, you must install the dependencies yourself. Then specify the conda environment name with `--conda_env_name` or run evaluations from the conda environment.
+1. **Dependencies**: List all dependencies in `requirements.txt`. These will be installed when running in Docker. If you run evaluations locally, install the dependencies yourself and specify the conda environment name with `--conda_env_name` or run from the conda environment.
 
 2. **Arguments**: Your agent can receive additional arguments via `-A` flags:
 
@@ -172,7 +170,6 @@ def run(input: dict, **kwargs):
 
 **Requirements**:
 
-- Must be run with `--vm` flag
 - **Important:** set `remote_environment_url` to `http://0.0.0.0:8000` and `experiment_name` to `output`. An example is below and in `agents/appworld_example_agent/main.py`.
 
 **Example Agent**:
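Per the agent signature shown at the top of this file's diff (`run(input: dict[str, dict], **kwargs) -> dict[str, str]`), a minimal agent can be sketched as follows. The echo behavior, the `prompt` key, and the `model_name` kwarg are illustrative assumptions:

```python
def run(input: dict, **kwargs) -> dict:
    # Hypothetical minimal agent: map each task_id to a string answer.
    # Here it simply echoes the task prompt; a real agent would call a model.
    model_name = kwargs.get("model_name", "unknown")  # passed via -A flags
    return {
        task_id: f"[{model_name}] {task.get('prompt', '')}"
        for task_id, task in input.items()
    }
```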
39 changes: 5 additions & 34 deletions hal/agent_runner.py
@@ -26,7 +26,6 @@ def __init__(
         config: Dict[str, Any],
         task_timeout: int,
         run_id: Optional[str] = None,
-        use_vm: bool = False,
         use_docker: bool = False,
         max_concurrent: int = 1,
         conda_env: Optional[str] = None,
@@ -59,16 +58,15 @@ def __init__(
             not os.path.exists(requirements_path)
             and not conda_env
             and not use_docker
-            and not use_vm
         ):
             raise ValueError(
                 f"No requirements.txt found in agent directory: {agent_dir}"
             )
 
         # Validate runner options
-        if sum([bool(conda_env), use_vm, use_docker]) > 1:
+        if sum([bool(conda_env), use_docker]) > 1:
             raise ValueError(
-                "Only one of conda_env, use_vm, or use_docker can be set at a time."
+                "Only one of conda_env or use_docker can be set at a time."
             )
 
         # Initialize benchmark first
@@ -83,45 +81,19 @@ def __init__(
             results_dir, self.benchmark.benchmark_name
         )
 
-        # Check if any task requires GPU
-        has_gpu_task = False
-        if hasattr(self.benchmark, "benchmark") and isinstance(
-            self.benchmark.benchmark, dict
-        ):
-            for task_id, task_data in self.benchmark.benchmark.items():
-                if isinstance(task_data, dict) and task_data.get("gpu", False):
-                    has_gpu_task = True
-                    break
-
-        # Print warning if GPU tasks are present but not running on VM
-        if has_gpu_task and not use_vm:
-            logger.warning(
-                "Warning: This benchmark contains tasks that require GPU, but is not being run on a VM. "
-                "GPU tasks may not work correctly without VM execution. Use the --vm flag to run on a VM."
-            )
-
         self.run_command = run_command
 
         # Check if benchmark requires sandbox
-        if self.benchmark.requires_sandbox and not use_vm and not use_docker:
+        if self.benchmark.requires_sandbox and not use_docker:
             raise ValueError(
-                f"Benchmark {benchmark_name} requires sandbox execution. Please use --vm or --docker flag."
+                f"Benchmark {benchmark_name} requires sandbox execution. Please use --docker flag."
             )
 
         # Set run ID
         self.run_id = run_id or f"{benchmark_name}_{int(time.time())}"
 
         # Initialize appropriate runner with benchmark
-        if use_vm:
-            from .utils.virtual_machine_runner import VirtualMachineRunner
-
-            self.runner = VirtualMachineRunner(
-                max_concurrent=max_concurrent,
-                log_dir=self.benchmark.get_run_dir(self.run_id),
-                benchmark=self.benchmark,
-                task_timeout=task_timeout,
-            )
-        elif use_docker:
+        if use_docker:
             self.runner = DockerRunner(
                 max_concurrent=max_concurrent,
                 log_dir=self.benchmark.get_run_dir(self.run_id),
@@ -143,7 +115,6 @@ def __init__(
         self.config = config
         self.max_concurrent = max_concurrent
         self.conda_env = conda_env
-        self.use_vm = use_vm
         self.use_docker = use_docker
         self.continue_run = continue_run
         self.ignore_errors = ignore_errors
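The mutual-exclusion check that survives this diff can be exercised in isolation. This standalone sketch mirrors the logic in `AgentRunner.__init__` after the change; the function name is an assumption, not harness API:

```python
def validate_runner_options(conda_env, use_docker):
    # Mirrors AgentRunner.__init__ after removing use_vm:
    # at most one execution backend may be selected.
    if sum([bool(conda_env), bool(use_docker)]) > 1:
        raise ValueError(
            "Only one of conda_env or use_docker can be set at a time."
        )
```

With only two remaining backends the `sum(...) > 1` form is equivalent to `conda_env and use_docker`, but keeping the list makes it easy to add a future backend to the same check.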
18 changes: 2 additions & 16 deletions hal/benchmarks/README.md
@@ -120,25 +120,11 @@ class SimpleMathBenchmark(BaseBenchmark):
 
 3. **Sandbox Support**: Set `requires_sandbox = True` if benchmark requires sandbox execution.
 
-4. **Setup Script**: Provide `setup_script` for installing benchmark-specific dependencies on VMs.
-
-5. **GPU Support**: Tasks can specify GPU requirements by including a `"gpu": true` flag in their benchmark entries. When the benchmark is run with the `--vm` flag, tasks with this flag will be executed on GPU-enabled VMs.
-   ```python
-   # Example of a task that requires GPU
-   self.benchmark = {
-       "task_id": {
-           "prompt": "Train a neural network model...",
-           "files": {...},
-           "gpu": true # This task will use a GPU VM when run with --vm flag
-       }
-   }
-   ```
-   - GPU VMs are only created if the benchmark is run with the `--vm` flag (which activates VM-based execution)
-   - For tasks that don't specify a GPU requirement (no `"gpu"` key or `"gpu": false`), regular VMs will be used
+4. **Setup Script**: Provide `setup_script` for installing benchmark-specific dependencies.
 
 ## Providing task-specific files to agents
 
-Benchmarks can provide files to agents by including a `files` dictionary in each task. These files will be automatically copied into the agent's working environment by both the VM and local runs.
+Benchmarks can provide files to agents by including a `files` dictionary in each task. These files will be automatically copied into the agent's working environment.
 
 Example task with files:
 ```python
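The `files` mechanism described in this diff can be sketched as a plain task dictionary. The task id, prompt, and path below are illustrative assumptions, not entries from an actual benchmark:

```python
# Hypothetical benchmark entry: each task may carry a `files` dict that the
# harness copies into the agent's working environment before the run.
benchmark = {
    "task_1": {
        "prompt": "Sum the numbers in numbers.txt",
        "files": {"numbers.txt": "data/task_1/numbers.txt"},
    }
}
```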
2 changes: 1 addition & 1 deletion hal/benchmarks/base_benchmark.py
@@ -34,7 +34,7 @@ def __init__(
         )
         self.agent_args: Dict[str, Any] = {}  # Store agent args
         self.requires_sandbox = (
-            requires_sandbox  # Whether benchmark requires VM execution
+            requires_sandbox  # Whether benchmark requires sandbox execution
         )
 
     def _normalize_agent_output(self, agent_output: Dict[str, Any]) -> Dict[str, Any]:
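The flag whose comment is reworded above gates sandbox-only benchmarks. A standalone sketch of how the flag and its enforcement fit together — the class and function names here are assumptions:

```python
class BenchmarkSketch:
    def __init__(self, requires_sandbox: bool = False):
        # Whether this benchmark must run in a sandbox (Docker, after this PR)
        self.requires_sandbox = requires_sandbox

def check_runner(benchmark: BenchmarkSketch, use_docker: bool) -> None:
    # Mirrors the sandbox check in AgentRunner after the --vm removal.
    if benchmark.requires_sandbox and not use_docker:
        raise ValueError("Benchmark requires sandbox execution. Please use --docker flag.")
```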
10 changes: 3 additions & 7 deletions hal/cli.py
@@ -82,7 +82,6 @@
     default=os.path.join(os.path.dirname(__file__), "config.yaml"),
     help="Path to configuration file. (currently not used)",
 )
-@click.option("--vm", is_flag=True, help="Run the agent on azure VMs")
 @click.option(
     "--docker",
     is_flag=True,
@@ -163,7 +162,6 @@ def main(
     a,
     b,
     i,
-    vm,
     docker,
     max_tasks,
     prompt_sensitivity,
@@ -204,7 +202,7 @@ def main(
     log_dir = os.path.join(results_dir, benchmark, run_id)
     os.makedirs(log_dir, exist_ok=True)
     verbose_log_path = os.path.join(log_dir, f"{run_id}_verbose.log")
-    setup_logging(log_dir, run_id, use_vm=vm)
+    setup_logging(log_dir, run_id)
 
     logger.info("HAL Harness")
 
@@ -216,9 +214,9 @@ def main(
         validate_model_pricing(agent_args["model_name"])
 
     # Validate runner options
-    if sum([bool(conda_env_name), vm, docker]) > 1:
+    if sum([bool(conda_env_name), docker]) > 1:
         logger.error(
-            "Only one of --conda_env_name, --vm, or --docker can be specified. Exiting..."
+            "Only one of --conda_env_name or --docker can be specified. Exiting..."
         )
         sys.exit(1)
 
@@ -239,7 +237,6 @@ def main(
         max_concurrent=max_concurrent,
         conda_env_name=conda_env_name,
         log_dir=log_dir,
-        vm=vm,
         docker=docker,
         continue_run=continue_run,
         ignore_errors=ignore_errors,
@@ -260,7 +257,6 @@ def main(
         benchmark_name=benchmark,
         config=config,
         run_id=run_id,  # Now guaranteed to have a value
-        use_vm=vm,
         use_docker=docker,
         max_concurrent=max_concurrent,
         conda_env=conda_env_name,
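The remaining runner flags and their exclusivity check can be sketched with stdlib argparse (hal's real CLI uses click; the parser, prog name, and helper names below are stand-in assumptions):

```python
import argparse

def build_parser() -> argparse.ArgumentParser:
    # Stdlib stand-in for the click options left after removing --vm.
    p = argparse.ArgumentParser(prog="hal-eval-sketch")
    p.add_argument("--docker", action="store_true",
                   help="Run evaluation in Docker containers for isolation")
    p.add_argument("--conda_env_name", default=None,
                   help="Conda environment for agent execution")
    return p

def validate(args: argparse.Namespace) -> None:
    # Mirrors the runner-option validation in hal/cli.py after this change.
    if sum([bool(args.conda_env_name), args.docker]) > 1:
        raise SystemExit("Only one of --conda_env_name or --docker can be specified.")
```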
13 changes: 1 addition & 12 deletions hal/utils/logging_utils.py
@@ -16,13 +16,12 @@
 logger = logging.getLogger(__name__)
 
 
-def setup_logging(log_dir: str, run_id: str, use_vm: bool = False) -> None:
+def setup_logging(log_dir: str, run_id: str) -> None:
     """Setup logging configuration.
 
     Args:
         log_dir: Directory for log files
         run_id: Unique run identifier
-        use_vm: Unused; kept for API compatibility.
     """
     # Create absolute path for log directory to avoid path duplication
     log_dir = os.path.abspath(log_dir)
@@ -39,16 +38,8 @@ def setup_logging(log_dir: str, run_id: str, use_vm: bool = False) -> None:
     for handler in root_logger.handlers[:]:
         root_logger.removeHandler(handler)
 
-    # Suppress verbose Azure SDK logging
-    logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(
-        logging.WARNING
-    )
-    # Suppress Azure identity logging
-    logging.getLogger("azure.identity").setLevel(logging.WARNING)
     # Suppress httpx logging
     logging.getLogger("httpx").setLevel(logging.WARNING)
-    # Suppress SSH logging
-    logging.getLogger("paramiko.transport").setLevel(logging.WARNING)
 
     # Create formatters
     detailed_formatter = logging.Formatter(
@@ -142,7 +133,6 @@ def print_run_config(
     max_concurrent: int,
     log_dir: str,
     conda_env_name: Optional[str],
-    vm: bool,
     continue_run: bool,
     docker: bool = False,
     ignore_errors: bool = False,
@@ -160,7 +150,6 @@ def print_run_config(
     logger.info(f"  Log Directory: {log_dir}")
     logger.info(f"  Max Concurrent: {max_concurrent}")
     logger.info(f"  Upload Results: {'Yes' if upload else 'No'}")
-    logger.info(f"  VM Execution: {'Yes' if vm else 'No'}")
     logger.info(f"  Docker Execution: {'Yes' if docker else 'No'}")
     logger.info(f"  Continue Previous Run: {'Yes' if continue_run else 'No'}")
     logger.info(f"  Ignore Errors: {'Yes' if ignore_errors else 'No'}")
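After the Azure and paramiko suppressions are deleted, only the httpx suppression remains. The surviving pattern can be shown standalone (the helper name is an assumption):

```python
import logging

def quiet_noisy_loggers() -> None:
    # Only httpx suppression remains once the Azure SDK, azure.identity,
    # and paramiko.transport suppressions are removed with VM support.
    logging.getLogger("httpx").setLevel(logging.WARNING)

quiet_noisy_loggers()
```

Setting a level on a named logger like this silences that library's INFO/DEBUG chatter without touching the root logger's handlers.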
34 changes: 0 additions & 34 deletions hal/utils/setup_vm.sh

This file was deleted.
