diff --git a/.github/workflows/claude-code-review.yml b/.github/workflows/claude-code-review.yml index 76923dc..83c8bf4 100644 --- a/.github/workflows/claude-code-review.yml +++ b/.github/workflows/claude-code-review.yml @@ -32,6 +32,7 @@ jobs: # github.event.pull_request.author_association == 'FIRST_TIME_CONTRIBUTOR' runs-on: ubuntu-latest + environment: test-environment permissions: contents: read pull-requests: read diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml new file mode 100644 index 0000000..422ae1a --- /dev/null +++ b/.github/workflows/e2e.yml @@ -0,0 +1,142 @@ +# Runs a minimal CoreBench hard VM smoke eval. Requires a GitHub Environment +# (update `environment.name` below) with the secrets/vars listed in the job `env` block. +# Azure auth (pick one): (A) OIDC — Azure login with client-id, tenant-id, subscription-id +# and id-token: write (see https://github.com/azure/login ); (B) service principal + secret — +# Environment secrets `AZURE_CLIENT_ID`, `AZURE_CLIENT_SECRET`, `AZURE_TENANT_ID` plus var +# `AZURE_SUBSCRIPTION_ID` assembled into azure/login `creds` JSON; omit id-token: write. +name: HAL eval CoreBench hard (VM) + +on: + push: + branches: [main] + pull_request: + branches: [main] + +# New run on the same branch/PR cancels the previous in-progress run (saves VM/API cost). +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + corebench-hard-vm: + runs-on: ubuntu-latest + # Create this Environment in Repo → Settings → Environments and attach secrets/vars.
+ environment: test-environment + permissions: + contents: read + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} + AZURE_SUBSCRIPTION_ID: ${{ vars.AZURE_SUBSCRIPTION_ID }} + AZURE_RESOURCE_GROUP_NAME: ${{ vars.AZURE_RESOURCE_GROUP_NAME }} + AZURE_LOCATION: ${{ vars.AZURE_LOCATION }} + NETWORK_SECURITY_GROUP_NAME: ${{ vars.NETWORK_SECURITY_GROUP_NAME }} + SSH_PRIVATE_KEY_PATH: ${{ vars.SSH_PRIVATE_KEY_PATH }} + SSH_PUBLIC_KEY_PATH: ${{ vars.SSH_PUBLIC_KEY_PATH }} + steps: + - uses: actions/checkout@v4 + + # Warm capsules avoid re-downloading from corebench.cs.princeton.edu every run. + # - uses: actions/cache@v4 + # with: + # path: hal/benchmarks/corebench/capsules + # key: corebench-capsules-${{ hashFiles('hal/benchmarks/corebench/core_test.json.gpg') }} + # restore-keys: | + # corebench-capsules- + + - name: Assemble Azure creds JSON + id: azure_creds + env: + AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }} + AZURE_CLIENT_SECRET: ${{ secrets.AZURE_CLIENT_SECRET }} + AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }} + run: | + set -euo pipefail + for v in AZURE_CLIENT_ID AZURE_CLIENT_SECRET AZURE_TENANT_ID AZURE_SUBSCRIPTION_ID; do + if [ -z "${!v:-}" ]; then + echo "::error::Required $v is empty (set on GitHub Environment \`test-environment\` secrets or vars)." 
>&2 + exit 1 + fi + done + CREDS="$(python3 -c 'import json, os; print(json.dumps({"clientId": os.environ["AZURE_CLIENT_ID"], "clientSecret": os.environ["AZURE_CLIENT_SECRET"], "subscriptionId": os.environ["AZURE_SUBSCRIPTION_ID"], "tenantId": os.environ["AZURE_TENANT_ID"]}))')" + { + echo 'creds<<CREDS_JSON' + echo "$CREDS" + echo 'CREDS_JSON' + } >> "$GITHUB_OUTPUT" + + - name: Azure login (service principal) + uses: azure/login@v2 + with: + auth-type: SERVICE_PRINCIPAL + creds: ${{ steps.azure_creds.outputs.creds }} + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install dependencies + run: | + set -euo pipefail + pip install -e ".[dev,azure,corebench,coreagent]" + # Ensure gql 4+ (weave needs TransportConnectionFailed); extras can pull gql 3.x. + pip install "gql[httpx]>=4.0,<5" + + - name: Generate SSH key pair for VM provisioning + run: | + set -euo pipefail + key="${SSH_PRIVATE_KEY_PATH:?SSH_PRIVATE_KEY_PATH must be set}" + pub="${SSH_PUBLIC_KEY_PATH:?SSH_PUBLIC_KEY_PATH must be set}" + mkdir -p "$(dirname "$key")" + rm -f "$key" "$pub" + ssh-keygen -t ed25519 -f "$key" -N "" -q + test -f "$pub" + chmod 600 "$key" + chmod 644 "$pub" + + # core_test.json is gitignored; benchmark loads decrypted JSON (see README / hal/benchmarks/corebench.py). + - name: Decrypt CoreBench test set + run: | + set -euo pipefail + gpg --batch --yes --pinentry-mode loopback \ + --passphrase 'reproducibility' \ + --output hal/benchmarks/corebench/core_test.json \ + --decrypt hal/benchmarks/corebench/core_test.json.gpg + + - name: Run hal-eval (CoreBench hard, one test, VM) + run: | + set -euo pipefail + if [ -z "${OPENAI_API_KEY:-}" ]; then + echo "::error::OPENAI_API_KEY is empty — add repository or environment secret \`OPENAI_API_KEY\` on GitHub Environment \`test-environment\`."
>&2 + exit 1 + fi + log="$(mktemp)" + set +e + set -o pipefail + hal-eval --benchmark corebench_hard \ + --agent_dir agents/core_agent \ + --agent_function main.run \ + --agent_name "CORE-Agent" \ + --vm \ + --no-download-environment \ + --max_concurrent 1 \ + --max_tasks 1 \ + -A 'model_name=openai/gpt-4.1-2025-04-14' \ + 2>&1 | tee "$log" + eval_status=${PIPESTATUS[0]} + set -e + if [ "$eval_status" -ne 0 ]; then + echo "::error::hal-eval exited with status $eval_status" >&2 + exit "$eval_status" + fi + if ! grep -Fq 'hal.cli: Evaluation completed successfully' "$log"; then + echo "::error::hal-eval output must contain: hal.cli: Evaluation completed successfully" >&2 + exit 1 + fi + + - name: Verify CoreBench hard result layout and upload JSON + run: | + set -euo pipefail + bash tests/gh_actions/verify_corebench_hard_e2e.sh diff --git a/README.md b/README.md index ab4a4ef..1bbd0de 100644 --- a/README.md +++ b/README.md @@ -405,6 +405,7 @@ hal-eval --benchmark --agent_dir --agent_func - **`--max_concurrent `**: Number of parallel tasks (default: 1) - **`--conda_env_name `**: Conda environment for agent execution - **`--vm`**: Run evaluation on Azure VMs +- **`--no-download-environment`**: With **`--vm`**, omit the VM’s `environment/` directory (task data, code, and results mounts) when downloading results back to the host, so the SFTP step is much faster. `output.json`, logs, and other files under `/home/agent` are still downloaded. No effect without `--vm`. 
- **`--docker`**: Run evaluation in Docker containers for isolation - **`--run_id `**: Specify a run ID (useful for continuing runs) - **`--continue_run`**: Continue from a previous run (requires run_id) diff --git a/agents/core_agent/agent_hints.py b/agents/core_agent/agent_hints.py index e81c752..6ed27d3 100644 --- a/agents/core_agent/agent_hints.py +++ b/agents/core_agent/agent_hints.py @@ -56,7 +56,6 @@ • If there exists a file called 'manuscript' then first read this file to extract the required results to answer the questions of the task. • If you are extracting information from html (such as the output of a Jupyter notebook), convert it to a PDF or PNG first and then extract the relevant information. • Before running the code, first determine a list of package/dependency requirements that must be installed by reading through the README file or code itself. Then install those dependencies before running the code. -• Note: Symbolic links have been automatically created for environment/data → /data, environment/code → /code, and environment/results → /results to ensure proper file access. Constraints: • Use flags or modify commands to bypass any need for confirmations during execution to streamline processes. 
diff --git a/agents/core_agent/main.py b/agents/core_agent/main.py index 707dd3e..2e8ab98 100644 --- a/agents/core_agent/main.py +++ b/agents/core_agent/main.py @@ -551,17 +551,6 @@ def run(input: dict[str, dict], **kwargs) -> dict[str, str]: print(f"[WARNING] Failed to run 'conda list': {str(e)}") print("=== End of Package Versions and Environment Information ===") - # Create symbolic links - try: - cwd = os.getcwd() - os.symlink(f"{cwd}/environment/data", "/data", target_is_directory=True) - os.symlink(f"{cwd}/environment/code", "/code", target_is_directory=True) - os.symlink(f"{cwd}/environment/results", "/results", target_is_directory=True) - except Exception as e: - print( - f"[WARNING] Failed to create symbolic links for /data, /code, and /results: {str(e)}" - ) - assert "model_name" in kwargs, "model_name is required" assert len(input) == 1, "input must contain only one task" @@ -692,10 +681,19 @@ async def acompletion_with_provider(*args, **completion_kwargs): model = LiteLLMModel(**model_params) + # CoreBench layout: always use cwd-based paths (no /data, /code, /results symlinks). + cwd = os.getcwd() + env_paths = ( + f"• Task directories — use these absolute paths: " + f"{cwd}/environment/data (data), {cwd}/environment/code (code), " + f"{cwd}/environment/results (results).\n\n" + ) # Prepend hints to the task prompt if available prompt = task["prompt"] if hints: - prompt = f"{hints}\n\n{prompt}" + prompt = f"{env_paths}{hints}\n\n{prompt}" + else: + prompt = f"{env_paths}{prompt}" # Create a custom FinalAnswerTool that includes key validation and LLM-based giving-up detection class CustomFinalAnswerTool(Tool): diff --git a/docs/EXECUTION_FLOW.md b/docs/EXECUTION_FLOW.md new file mode 100644 index 0000000..f3f2c81 --- /dev/null +++ b/docs/EXECUTION_FLOW.md @@ -0,0 +1,232 @@ +# HAL evaluation execution flow (hal-eval → finish) + +High-level map of how a run flows from the `hal-eval` CLI to completion. + +--- + +## 1. 
Entry point + +``` +hal-eval (console script from pyproject.toml) + → hal.cli:main +``` + +**`pyproject.toml`** `[project.scripts]`: `hal-eval = "hal.cli:main"` + +--- + +## 2. CLI + +**File:** `hal/cli.py` + +- **Click** parses options (`--benchmark`, `--agent_dir`, `--vm`, `--task_timeout`, etc.). +- **main()**: + 1. **Parse args**: `parse_cli_args(a)`, `parse_cli_args(b)`, `parse_cli_args(i)` → `agent_args`, `benchmark_args`, `inspect_eval_args`. + 2. **run_id**: default `{benchmark}_{agent_name_sanitized}_{timestamp}` or use `--run_id`. + 3. **Logging**: `setup_logging(log_dir, run_id, use_vm=vm)`; `log_dir = results_dir/benchmark/run_id`. + 4. **Validation**: model pricing if `model_name` in agent_args; exactly one of conda/vm/docker; `--continue_run` requires `run_id`. + 5. **print_run_config(...)**. + 6. **Build run command**: `run_command = "hal-eval " + sys.argv[1:]` (for logging/repro). + 7. **Create AgentRunner** with all parsed options (including `task_timeout`, `agent_function`, `agent_dir`, `benchmark_name`, etc.). + 8. **Run**: `asyncio.run(runner.run(agent_name=agent_name, upload=upload))`. + 9. **Post-run**: `log_results(results)`, `log_run_summary(...)` (or warning if no benchmark/run_dir). + +--- + +## 3. AgentRunner + +**File:** `hal/agent_runner.py` + +### 3.1 Construction (`__init__`) + +- **BenchmarkManager** (`hal/benchmark_manager.py`): (agent_dir, config) → **get_benchmark(benchmark_name)** → concrete benchmark (e.g. `CoreBenchBenchmark` from `hal/benchmarks/corebench.py`). +- **Runner choice** (one of): + - **VirtualMachineRunner** (`hal/utils/virtual_machine_runner.py`): (log_dir, benchmark, task_timeout, max_concurrent) if `use_vm`; + - **DockerRunner** (`hal/utils/docker_runner.py`): (..., task_timeout) if `use_docker`; + - **LocalRunner** (`hal/utils/local_runner.py`): (..., conda_env, task_timeout) otherwise. +- Stores agent_function, agent_dir, agent_args, run_id, etc. + +### 3.2 Run pipeline (`run()`) + +1. 
**Weave**: `weave.init(self.run_id)`. +2. **Dataset**: `dataset = self.benchmark.get_dataset()`. +3. **Continue run** (if `continue_run`): + - If not `ignore_errors`: `dataset = self.get_remaining_tasks(dataset)` (filter out tasks already in `*_RAW_SUBMISSIONS.jsonl`). + - If `ignore_errors`: `dataset = {}` (evaluation uses only previous submissions). +4. **Filter**: by `--task_ids` if set; cap by `--max_tasks` if set. +5. **Prompt sensitivity** (if enabled): build variation datasets; then either single-variation or multi-variation runs. +6. **Continue run cleanup**: if continuing and not ignore_errors, delete Weave calls for tasks in dataset so they can be re-run. +7. **Run agent**: + - **Normal**: `agent_output = await self.runner.run_agent(dataset=dataset, agent_function=..., agent_dir=..., agent_args=..., run_id=..., benchmark=..., task=..., progress=...)`. + - **Prompt sensitivity (single var)**: same but with `single_variation_dataset` and then set `prompt_sensitivity = False` for evaluation. + - **Prompt sensitivity (multi var)**: loop over variation indices, run agent per variation, collect `all_variations_output`. + - If `continue_run`: merge `agent_output` with previous `*_RAW_SUBMISSIONS.jsonl`. +8. **Evaluate**: + - **Prompt sensitivity**: `weave.finish()`; for each task/variation call `benchmark.evaluate_output(single_output, run_id)`; collect scores. + - **Normal**: `weave.finish()`; `eval_results = self.benchmark.evaluate_output(agent_output, self.run_id)`. +9. **Process results**: `results = self.benchmark.process_results(agent_name=..., run_id=..., agent_args=..., run_command=..., eval_results=..., weave_client=..., agent_output=..., upload=..., prompt_sensitivity=..., )`. +10. **Return** `results` to CLI. + +--- + +## 4. 
Runner: VM path + +**File:** `hal/utils/virtual_machine_runner.py` + +### 4.1 `run_agent(dataset, agent_function, agent_dir, agent_args, run_id, benchmark, progress, task)` + +- For each `(task_id, input_data)` in `dataset`, an async **process_task(task_id, input_data)** is run. +- Concurrency is limited by **semaphore** `max_concurrent`; tasks are scheduled with **asyncio.gather(run_with_semaphore(...) for each task)**. +- Each **process_task** does the following (see below). +- Results are merged into one dict and appended to `{run_id}_RAW_SUBMISSIONS.jsonl` in `log_dir`, then returned. + +### 4.2 Per-task VM flow: `process_task(task_id, input_data)` + +1. **VM name**: `vm_name = "agent-{benchmark_name}-{uuid}"[:32]`. +2. **GPU**: read `benchmark.benchmark[task_id].get("gpu", False)`. +3. **Create VM**: `vm_manager.create_virtual_machine_by_name(...)` + → **VirtualMachineManager** (`hal/utils/virtual_machine_manager.py`) + **AzureVirtualMachine** (`hal/utils/vm/azure_virtual_machine.py`) (provision VM, wait for startup script, etc.). +4. **Temp dir**: build task payload: + - `input.json`, `agent_args.json`; + - copy any `input_data["files"]` into temp dir; + - copy benchmark `setup_script.sh` if present. +5. **Copy to VM**: + - `vm_manager.compress_and_copy_files_to_vm(vm_name, temp_dir)` (tar.gz → SFTP → extract under `/home/agent`); + - `vm_manager.compress_and_copy_files_to_vm(vm_name, agent_dir)` (agent code). +6. **Run agent on VM**: `vm_manager.run_agent_on_virtual_machine(vm_name, agent_function, task_id, input_data, agent_args, run_id, log_dir, benchmark)` (see below). +7. **Poll until done or timeout** (`task_timeout` seconds): + - Every 30s: `fetch_agent_logs(vm_name, ...)` (pull `agent_trace.log`), then `vm_manager.check_task_completion(vm_name)` (checks for `/home/agent/output.json`). + - If `output.json` appears → task complete; break. +8. **Copy results**: `vm_manager.copy_files_from_vm(vm_name, dest_dir, download_environment=...)` (e.g. 
`log_dir/{task_id}/`); read `output.json` from dest_dir → **result**. With `hal-eval --no-download-environment` (requires `--vm`), the archive omits `environment/` (task payload) for a faster download; `output.json` and other files under `/home/agent` are still copied. +9. **Cleanup**: `vm_manager.delete_virtual_machine_by_name(vm_name)`. +10. Return `{task_id: result}` (or error/timeout dict). + +--- + +## 5. VirtualMachineManager + +**File:** `hal/utils/virtual_machine_manager.py` + +### 5.1 `create_virtual_machine_by_name(vm_name, has_gpu, setup_timeout)` + +- **AzureVirtualMachine** (`hal/utils/vm/azure_virtual_machine.py`): (name, resource_group, location, subscription_id, nsg_id, ssh_public_key, gpu=..., timeout=setup_timeout). +- VM creation (Azure API), wait for SSH, run startup (e.g. cloud-init); **timeout** is used for waiting for startup script. + +### 5.2 `compress_and_copy_files_to_vm(vm_name, source_directory)` + +- Tar.gz `source_directory` → SFTP put to `/home/agent/` → SSH `tar -xzf ... --strip-components=1 -C /home/agent`; remove remote tar and local tar. + +### 5.3 `run_agent_on_virtual_machine(vm_name, agent_function, task_id, input_data, agent_args, run_id, log_dir, benchmark)` + +1. **copy_env_and_run_setup_script** (inner): + - SFTP: copy `.env` → `/home/agent/.env`; copy `setup_vm.sh` → VM, chmod +x, run `sudo bash setup_vm.sh`. + - If benchmark has `setup_script`, run that on VM (e.g. `bash setup_script.sh` in agent home). +2. **copy_env_and_run_setup_script** (outer call) is invoked again with same args (conda/env setup if used). +3. **SFTP**: + - Write `/home/agent/input.json` (`{task_id: input_data}`), `/home/agent/agent_args.json`. + - Write `/home/agent/run_agent.env` with `RUN_ID`, `AGENT_FUNCTION`, `TASK_ID`. + - Read static **run_agent.py** from `hal/utils/vm/run_agent.py`, write to `/home/agent/run_agent.py`, chmod +x. +4. **SSH**: run + `source ... 
conda && conda activate agent_env && python /home/agent/run_agent.py > agent_trace.log 2>&1` + (non-blocking; script runs in background on VM). + +### 5.4 `check_task_completion(vm_name)` + +- Uses **AzureVirtualMachine.check_for_file_presence_by_path** for `/home/agent/output.json` (SSH test -f). + +### 5.5 `get_agent_trace(vm_name)` + +- SFTP read `/home/agent/agent_trace.log` → returned as string (used by **fetch_agent_logs** in runner). + +### 5.6 `copy_files_from_vm(vm_name, destination_directory, *, download_environment=True)` + +- SSH: `rm -rf /home/agent/miniconda3`; tar home dir on VM (with `tar --exclude=environment` when `download_environment=False`); SFTP get tar; extract locally to `destination_directory`. + +### 5.7 `delete_virtual_machine_by_name(vm_name)` + +- **AzureVirtualMachine.delete()** (`hal/utils/vm/azure_virtual_machine.py`) (VM + NIC, disk, etc.). + +--- + +## 6. On-VM agent execution + +**File:** `hal/utils/vm/run_agent.py` + +- **Static script** (no string interpolation). +- **Load env**: `load_dotenv("/home/agent/.env")`, `load_dotenv("/home/agent/run_agent.env")`. +- **Require**: `RUN_ID`, `AGENT_FUNCTION`, `TASK_ID` (exit with error if missing). +- **Parse** `AGENT_FUNCTION` → `module_name`, `function_name`. +- **Weave**: `weave.init(RUN_ID)`. +- **Read** `/home/agent/input.json`, `/home/agent/agent_args.json`. +- **Load agent**: `importlib.util.spec_from_file_location(module_name, "/home/agent/{module_name}.py")` → `exec_module` → `getattr(module, function_name)`. +- **Run**: `with weave.attributes({"weave_task_id": TASK_ID}): result = agent(input_data, **agent_args)`. +- **Write** `/home/agent/output.json` with `result`. +- On exception: write `/home/agent/error.log`, re-raise. + +--- + +## 7. 
Local / Docker runners (brief) + +- **LocalRunner.run_agent** (`hal/utils/local_runner.py`): for each task, run agent in subprocess (conda env if set) with **task_timeout**; collect stdout/result; same high-level contract (dataset in → agent_output dict out). +- **DockerRunner.run_agent** (`hal/utils/docker_runner.py`): similar but each task runs in a container; same timeout and result shape. + +--- + +## 8. Benchmark layer + +**Files:** `hal/benchmark_manager.py` (registry); `hal/benchmarks/base_benchmark.py` (base); `hal/benchmarks/.py` (e.g. `corebench.py`, `gaia.py`). + +- **get_dataset()**: benchmark-specific (e.g. load from HuggingFace, disk); returns `Dict[task_id, task_input]`. +- **evaluate_output(agent_output, run_id)**: benchmark-specific scoring; returns eval result per task (e.g. scores). +- **process_results(...)**: build upload payload, write `*_UPLOAD.json`, optionally upload to HuggingFace, return final **results** dict for CLI. + +--- + +## 9. End-to-end flow (single path, VM, one task) + +``` +hal-eval + → main() hal/cli.py + → AgentRunner(...) hal/agent_runner.py + → runner.run(agent_name, upload) hal/agent_runner.py + → weave.init(run_id) + → benchmark.get_dataset() hal/benchmarks/base_benchmark.py (or subclass) + → (filters: continue_run, task_ids, max_tasks) + → runner.run_agent(dataset, ...) 
hal/utils/virtual_machine_runner.py + → process_task(task_id, input_data) [per task] + → create_virtual_machine_by_name() hal/utils/virtual_machine_manager.py + → compress_and_copy_files_to_vm() hal/utils/virtual_machine_manager.py + → run_agent_on_virtual_machine() hal/utils/virtual_machine_manager.py + → copy_env_and_run_setup_script (setup_vm.sh, benchmark setup_script) + → write input.json, agent_args.json, run_agent.env + → deploy run_agent.py from hal/utils/vm/run_agent.py + → SSH: conda activate agent_env && python run_agent.py (runs hal/utils/vm/run_agent.py on VM) + → poll: get_agent_trace(), check_task_completion() virtual_machine_manager.py + → copy_files_from_vm() hal/utils/virtual_machine_manager.py + → delete_virtual_machine_by_name() hal/utils/virtual_machine_manager.py + vm/azure_virtual_machine.py + → merge results, append RAW_SUBMISSIONS.jsonl + → weave.finish() + → benchmark.evaluate_output(agent_output, run_id) hal/benchmarks/*.py + → benchmark.process_results(...) hal/benchmarks/base_benchmark.py (or subclass) + → log_results(results), log_run_summary(...) hal/cli.py (implementations in hal/utils/logging_utils.py) +``` + +--- + +## 10. 
Key files reference + +| Role | File | +|------|------| +| Entry + CLI | `hal/cli.py` | +| Logging helpers (setup_logging, log_results, log_run_summary, print_run_config) | `hal/utils/logging_utils.py` | +| Orchestration + run pipeline | `hal/agent_runner.py` | +| Benchmark registry + get_benchmark | `hal/benchmark_manager.py` | +| VM run loop + per-task flow | `hal/utils/virtual_machine_runner.py` | +| VM lifecycle + SSH/SFTP + run script deploy | `hal/utils/virtual_machine_manager.py` | +| Setup script copied to VM | `hal/utils/setup_vm.sh` (dir next to virtual_machine_manager.py) | +| Static agent entrypoint on VM | `hal/utils/vm/run_agent.py` | +| Azure VM resource creation/deletion | `hal/utils/vm/azure_virtual_machine.py` | +| Benchmark base (get_dataset, evaluate_output, process_results) | `hal/benchmarks/base_benchmark.py` | +| Concrete benchmarks (corebench, gaia, scicode, etc.) | `hal/benchmarks/*.py` | +| Local process runner | `hal/utils/local_runner.py` | +| Docker runner | `hal/utils/docker_runner.py` | diff --git a/hal/agent_runner.py b/hal/agent_runner.py index c54a6d9..dc11adf 100644 --- a/hal/agent_runner.py +++ b/hal/agent_runner.py @@ -41,6 +41,7 @@ def __init__( variation_index: Optional[int] = None, results_dir: str = "results", task_ids: Optional[str] = None, + download_environment: bool = True, ): # Validate agent_function format if not isinstance(agent_function, str) or "." not in agent_function: @@ -72,9 +73,13 @@ def __init__( "Only one of conda_env, use_vm, or use_docker can be set at a time." ) - # Initialize benchmark first + # Initialize benchmark first (--max_tasks limits CoreBench capsule downloads; full load if + # --task_ids is used so those IDs are present in the benchmark dict). 
self.benchmark_manager = BenchmarkManager(agent_dir, config) - self.benchmark = self.benchmark_manager.get_benchmark(benchmark_name) + capsule_preload_limit = None if task_ids else max_tasks + self.benchmark = self.benchmark_manager.get_benchmark( + benchmark_name, max_tasks=capsule_preload_limit + ) self.benchmark.agent_args = agent_args # Override results directory if non-default @@ -121,6 +126,7 @@ def __init__( log_dir=self.benchmark.get_run_dir(self.run_id), benchmark=self.benchmark, task_timeout=task_timeout, + download_environment=download_environment, ) elif use_docker: self.runner = DockerRunner( @@ -250,11 +256,18 @@ async def run(self, agent_name: str, upload: bool = False) -> Dict[str, Any]: logger.error("No valid task IDs found. Exiting.") return {} - # Limit the number of tasks if max_tasks is specified - if self.max_tasks and self.max_tasks > 0 and self.max_tasks < len(dataset): - logger.info(f"Limiting to the first {self.max_tasks} tasks as requested") - task_ids = list(dataset.keys())[: self.max_tasks] - dataset = {task_id: dataset[task_id] for task_id in task_ids} + # Cap tasks (--max_tasks). Always apply here when set, not only when max_tasks < len(dataset), + # so execution cannot exceed N even if a benchmark constructor omits pre-slicing. 
+ if self.max_tasks is not None and self.max_tasks > 0: + before = len(dataset) + selected = list(dataset.keys())[: self.max_tasks] + dataset = {k: dataset[k] for k in selected} + if len(dataset) < before: + logger.info( + "Limiting run to %s of %s tasks (--max_tasks)", + len(dataset), + before, + ) # Handle prompt sensitivity if enabled prompt_variations_map = None diff --git a/hal/benchmark_manager.py b/hal/benchmark_manager.py index 5c25562..e86fb8a 100644 --- a/hal/benchmark_manager.py +++ b/hal/benchmark_manager.py @@ -35,7 +35,11 @@ def __init__( "colbench_frontend_design", ] - def get_benchmark(self, benchmark_name: str) -> BaseBenchmark: + def get_benchmark( + self, + benchmark_name: str, + max_tasks: Optional[int] = None, + ) -> BaseBenchmark: """Get benchmark instance for given name""" if benchmark_name == "gaia": from .benchmarks.gaia import GaiaBenchmark @@ -71,15 +75,17 @@ def get_benchmark(self, benchmark_name: str) -> BaseBenchmark: elif benchmark_name == "corebench_easy": from .benchmarks.corebench import CoreBenchEasy - benchmark = CoreBenchEasy(self.agent_dir, self.config) + benchmark = CoreBenchEasy(self.agent_dir, self.config, max_tasks=max_tasks) elif benchmark_name == "corebench_medium": from .benchmarks.corebench import CoreBenchMedium - benchmark = CoreBenchMedium(self.agent_dir, self.config) + benchmark = CoreBenchMedium( + self.agent_dir, self.config, max_tasks=max_tasks + ) elif benchmark_name == "corebench_hard": from .benchmarks.corebench import CoreBenchHard - benchmark = CoreBenchHard(self.agent_dir, self.config) + benchmark = CoreBenchHard(self.agent_dir, self.config, max_tasks=max_tasks) elif benchmark_name == "scienceagentbench": from .benchmarks.scienceagentbench import ScienceAgentBench diff --git a/hal/benchmarks/corebench.py b/hal/benchmarks/corebench.py index eda19a1..28e14a0 100644 --- a/hal/benchmarks/corebench.py +++ b/hal/benchmarks/corebench.py @@ -3,7 +3,7 @@ import urllib.request import tarfile import time -from typing 
import Dict, Any +from typing import Any, Dict, Optional import numpy as np from scipy.stats import t import math @@ -20,7 +20,12 @@ class CoreBench(BaseBenchmark): _no_ground_truth = True - def __init__(self, agent_dir: str, config: Dict[str, Any]): + def __init__( + self, + agent_dir: str, + config: Dict[str, Any], + max_tasks: Optional[int] = None, + ): # Set benchmark_name in subclasses # Load tasks from core_test.json @@ -43,6 +48,18 @@ def __init__(self, agent_dir: str, config: Dict[str, Any]): with open(core_test_path, "r") as f: dataset = json.load(f) + if not isinstance(dataset, list): + raise TypeError("core_test.json must contain a JSON array of tasks") + + # Match hal-eval --max_tasks: only download/load capsules we will run (saves CI time). + if max_tasks is not None and max_tasks > 0 and max_tasks < len(dataset): + logger.info( + "CoreBench: loading %s of %s tasks (max_tasks); skipping other capsule downloads", + max_tasks, + len(dataset), + ) + dataset = dataset[:max_tasks] + self.benchmark = {} self.benchmark_answers = {} @@ -112,9 +129,6 @@ def __download_and_extract_capsule( backoff_factor=1, ): """Downloads and extracts a capsule archive from the CoreBench repository.""" - # FIXME: this doesn't respect the --max_tasks flag - # Expected: --max_tasks 2 only downloads 2 capsules - # Actual: --max_tasks 2 downloads all capsules, then runs only 2 capsule_dir = os.path.join(capsules_dir, capsule_id) capsule_url = f"https://corebench.cs.princeton.edu/capsules/{capsule_id}.tar.gz" tar_path = os.path.join(capsules_dir, f"{capsule_id}.tar.gz") @@ -441,9 +455,14 @@ def get_metrics(self, eval_results: Dict[str, Any]) -> Dict[str, Any]: class CoreBenchEasy(CoreBench): """CoreBench benchmark with easy difficulty level""" - def __init__(self, agent_dir: str, config: Dict[str, Any]): + def __init__( + self, + agent_dir: str, + config: Dict[str, Any], + max_tasks: Optional[int] = None, + ): self.benchmark_name = "corebench_easy" - super().__init__(agent_dir, 
config) + super().__init__(agent_dir, config, max_tasks=max_tasks) def _construct_prompt(self, task): """ @@ -476,9 +495,14 @@ def _get_capsule_files_dict(self, capsule_dir: str) -> Dict[str, str]: class CoreBenchMedium(CoreBench): """CoreBench benchmark with medium difficulty level""" - def __init__(self, agent_dir: str, config: Dict[str, Any]): + def __init__( + self, + agent_dir: str, + config: Dict[str, Any], + max_tasks: Optional[int] = None, + ): self.benchmark_name = "corebench_medium" - super().__init__(agent_dir, config) + super().__init__(agent_dir, config, max_tasks=max_tasks) def _construct_prompt(self, task): """ @@ -527,9 +551,14 @@ def _get_capsule_files_dict(self, capsule_dir: str) -> Dict[str, str]: class CoreBenchHard(CoreBench): """CoreBench benchmark with hard difficulty level""" - def __init__(self, agent_dir: str, config: Dict[str, Any]): + def __init__( + self, + agent_dir: str, + config: Dict[str, Any], + max_tasks: Optional[int] = None, + ): self.benchmark_name = "corebench_hard" - super().__init__(agent_dir, config) + super().__init__(agent_dir, config, max_tasks=max_tasks) def _construct_prompt(self, task): """ diff --git a/hal/cli.py b/hal/cli.py index c60440f..b9b439b 100644 --- a/hal/cli.py +++ b/hal/cli.py @@ -83,6 +83,15 @@ help="Path to configuration file. (currently not used)", ) @click.option("--vm", is_flag=True, help="Run the agent on azure VMs") +@click.option( + "--no-download-environment", + is_flag=True, + help=( + "With --vm only: when copying results from the VM, omit the environment/ " + "directory (task data/code/results) from the archive to speed up download. " + "output.json and other files under /home/agent are still retrieved." 
+ ), +) @click.option( "--docker", is_flag=True, @@ -179,6 +188,7 @@ def main( task_timeout, results_dir, task_ids, + no_download_environment, **kwargs, ): """Run agent evaluation on specified benchmark with given model.""" @@ -228,6 +238,11 @@ def main( ) sys.exit(1) + if no_download_environment and not vm: + logger.warning( + "--no-download-environment only applies to --vm runs; ignoring." + ) + if continue_run and not set_run_id: raise ValueError("continue_run flag requires run_id to be set") @@ -252,6 +267,9 @@ def main( prompt_sensitivity=prompt_sensitivity, num_variations=num_variations, variation_strength=variation_strength, + max_tasks=max_tasks, + task_ids=task_ids, + no_download_environment=no_download_environment and vm, ) # get exact command used to run the evaluation from click @@ -282,6 +300,7 @@ def main( task_timeout=task_timeout, results_dir=results_dir, task_ids=task_ids, + download_environment=not (no_download_environment and vm), ) # Run evaluation diff --git a/hal/utils/logging_utils.py b/hal/utils/logging_utils.py index e7a8f77..d458775 100644 --- a/hal/utils/logging_utils.py +++ b/hal/utils/logging_utils.py @@ -22,7 +22,6 @@ def setup_logging(log_dir: str, run_id: str, use_vm: bool = False) -> None: Args: log_dir: Directory for log files run_id: Unique run identifier - use_vm: Unused; kept for API compatibility. 
""" # Create absolute path for log directory to avoid path duplication log_dir = os.path.abspath(log_dir) @@ -149,6 +148,9 @@ def print_run_config( prompt_sensitivity: bool = False, num_variations: int = 3, variation_strength: str = "mild", + max_tasks: Optional[int] = None, + task_ids: Optional[str] = None, + no_download_environment: bool = False, ) -> None: """Print a formatted table with the run configuration""" logger.info("=== Run Configuration ===") @@ -159,8 +161,17 @@ def print_run_config( logger.info(f" Agent Directory: {agent_dir}") logger.info(f" Log Directory: {log_dir}") logger.info(f" Max Concurrent: {max_concurrent}") + logger.info( + f" Max Tasks: {max_tasks if max_tasks is not None else 'all'}", + ) + if task_ids: + logger.info(f" Task IDs: {task_ids}") logger.info(f" Upload Results: {'Yes' if upload else 'No'}") logger.info(f" VM Execution: {'Yes' if vm else 'No'}") + if vm: + logger.info( + f" VM Download Environment: {'No (skip environment/)' if no_download_environment else 'Yes'}" + ) logger.info(f" Docker Execution: {'Yes' if docker else 'No'}") logger.info(f" Continue Previous Run: {'Yes' if continue_run else 'No'}") logger.info(f" Ignore Errors: {'Yes' if ignore_errors else 'No'}") diff --git a/hal/utils/virtual_machine_manager.py b/hal/utils/virtual_machine_manager.py index 54c2b4c..70882de 100644 --- a/hal/utils/virtual_machine_manager.py +++ b/hal/utils/virtual_machine_manager.py @@ -3,11 +3,13 @@ from azure.identity import DefaultAzureCredential import paramiko import os +import shlex import tarfile import json import logging from contextlib import contextmanager from pathlib import Path +from dotenv import dotenv_values from .vm.azure_virtual_machine import AzureVirtualMachine # Mount names for core_agent: used only under VM_AGENT_HOME/environment/ (e.g. environment/data, environment/code, environment/results from task payload). 
@@ -16,6 +18,33 @@ RUN_AGENT_SCRIPT_PATH = Path(__file__).resolve().parent / "vm" / "run_agent.py" + +def _vm_env_var_from_host(name: str) -> str | None: + """Resolve a secret from the host process env or local .env (for VM payload files).""" + v = os.environ.get(name, "").strip() + if v: + return v + env_file = Path.cwd() / ".env" + if env_file.is_file(): + raw = dotenv_values(env_file).get(name) + if raw: + s = str(raw).strip() + if s: + return s + return None + + +# Injected into run_agent.env so the remote agent sees keys when the host has no .env file +# to copy to the VM (e.g. CI: secrets exist only on the runner environment). +_VM_RUN_AGENT_SECRET_NAMES = ( + "WANDB_API_KEY", + "OPENAI_API_KEY", + "ANTHROPIC_API_KEY", + "GEMINI_API_KEY", + "TOGETHERAI_API_KEY", + "OPENROUTER_API_KEY", +) + # Set up base logger _base_logger = logging.getLogger(__name__) @@ -271,8 +300,20 @@ def compress_and_copy_files_to_vm(self, vm_name, source_directory): logger.error(f"Error copying files: {e}") raise - def copy_files_from_vm(self, vm_name, destination_directory): - """Copy files from the VM to local directory.""" + def copy_files_from_vm( + self, + vm_name, + destination_directory, + *, + download_environment: bool = True, + ): + """Copy files from the VM to local directory. + + When download_environment is False, omits ``/home/agent/environment`` from + the archive (task data/code/results) to speed up SFTP; ``output.json`` and + other home-directory files are still included. 
+ """ + logger = _get_logger(vm_name) with self._get_sftp_client( vm_name, self.network_client, @@ -288,8 +329,15 @@ def copy_files_from_vm(self, vm_name, destination_directory): f"/home/agent/{os.path.basename(destination_directory)}_back.tar.gz" ) remote_home_directory = "/home/agent" + exclude = "" + if not download_environment: + logger.info( + "Excluding environment/ from VM results archive (faster download)" + ) + exclude = "--exclude=environment " + quoted_tar = shlex.quote(remote_tar_file_path) _, stdout, _ = ssh_client.exec_command( - f"tar -czf {remote_tar_file_path} -C {remote_home_directory} ." + f"tar {exclude}-czf {quoted_tar} -C {remote_home_directory} ." ) for _ in stdout: pass # Block until the tar command completes @@ -423,6 +471,10 @@ def copy_env_and_run_setup_script( # Write run-specific env vars for static run_agent.py run_agent_env = f"RUN_ID={run_id}\nAGENT_FUNCTION={agent_function}\nTASK_ID={task_id}\n" + for name in _VM_RUN_AGENT_SECRET_NAMES: + val = _vm_env_var_from_host(name) + if val: + run_agent_env += f"{name}={val}\n" with sftp_client.open("/home/agent/run_agent.env", "w") as f: f.write(run_agent_env) diff --git a/hal/utils/virtual_machine_runner.py b/hal/utils/virtual_machine_runner.py index 211cfb4..e380313 100644 --- a/hal/utils/virtual_machine_runner.py +++ b/hal/utils/virtual_machine_runner.py @@ -25,10 +25,12 @@ def __init__( task_timeout: int, max_concurrent: int = 1, benchmark: Optional[BaseBenchmark] = None, + download_environment: bool = True, ): self.max_concurrent = max_concurrent self.log_dir = log_dir self.task_timeout = task_timeout + self.download_environment = download_environment self.vm_manager = VirtualMachineManager() self._semaphore = asyncio.Semaphore(max_concurrent) self._file_lock = asyncio.Lock() @@ -248,6 +250,7 @@ async def process_task(task_id: str, input_data: Any) -> Optional[Dict]: self.vm_manager.copy_files_from_vm, vm_name, dest_dir, + download_environment=self.download_environment, ) # Read the 
output.json file from the copied directory @@ -277,7 +280,12 @@ async def process_task(task_id: str, input_data: Any) -> Optional[Dict]: except Exception as e: logger.error(f"Error processing task {task_id} on VM {vm_name}: {e}") traceback.print_exc() - return {task_id: f"ERROR: {str(e)}"} + # Re-raise so the eval run fails hard; `finally` below still deletes the VM (or + # whatever resources exist for this name). Swallowing here made hal-eval continue + # and report success despite Azure / provisioning failures. + raise RuntimeError( + f"VM run failed for task {task_id} (VM {vm_name}): {e}" + ) from e finally: wall_clock_time = time.time() - task_start_time diff --git a/pyproject.toml b/pyproject.toml index 7c02a40..64430da 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,6 +22,7 @@ dependencies = [ [project.optional-dependencies] dev = [ "ruff==0.15.4", + "jsonschema>=4.22.0", "pytest>=8.0.0", "pytest-mock>=3.12.0", "matplotlib>=3.7.0", diff --git a/tests/gh_actions/corebench_hard_upload.schema.json b/tests/gh_actions/corebench_hard_upload.schema.json new file mode 100644 index 0000000..2fcd8c2 --- /dev/null +++ b/tests/gh_actions/corebench_hard_upload.schema.json @@ -0,0 +1,173 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "hal-harness/tests/gh_actions/corebench_hard_upload", + "title": "CoreBench hard VM e2e upload artifact", + "description": "Matches hal.benchmarks.base_benchmark.BaseBenchmark.process_results output for a CoreBench run.", + "type": "object", + "required": [ + "config", + "results", + "raw_eval_results", + "raw_logging_results", + "total_usage", + "total_cost", + "git_info" + ], + "properties": { + "config": { + "type": "object", + "required": [ + "agent_name", + "benchmark_name", + "date", + "run_id", + "agent_args", + "run_command", + "prompt_sensitivity" + ], + "properties": { + "agent_name": { "const": "CORE-Agent" }, + "benchmark_name": { "const": "corebench_hard" }, + "date": { + "type": "string", + 
"pattern": "^[0-9]{4}-[0-9]{2}-[0-9]{2}$" + }, + "run_id": { + "type": "string", + "pattern": "^corebench_hard_coreagent_[0-9]+$" + }, + "agent_args": { + "type": "object", + "required": ["model_name", "benchmark_name"], + "properties": { + "model_name": { + "type": "string", + "const": "openai/gpt-4.1-2025-04-14" + }, + "benchmark_name": { + "type": "string", + "const": "corebench_hard" + } + }, + "additionalProperties": true + }, + "run_command": { + "type": "string", + "minLength": 1, + "pattern": "hal-eval.*--benchmark corebench_hard" + }, + "prompt_sensitivity": { "type": "boolean", "const": false } + }, + "additionalProperties": true + }, + "results": { + "type": "object", + "required": [ + "accuracy", + "written_accuracy", + "vision_accuracy", + "successful_tasks", + "failed_tasks", + "total_cost", + "latencies" + ], + "properties": { + "accuracy": { "type": "number" }, + "written_accuracy": { "type": "number" }, + "vision_accuracy": { "type": "number" }, + "successful_tasks": { + "type": "array", + "items": { "type": "string", "pattern": "^capsule-[0-9]+$" } + }, + "failed_tasks": { + "type": "array", + "items": { "type": "string", "pattern": "^capsule-[0-9]+$" } + }, + "total_cost": { "type": "number" }, + "latencies": { + "type": "object", + "minProperties": 1, + "additionalProperties": { + "type": "object", + "required": [ + "first_call_timestamp", + "last_call_timestamp", + "total_time" + ], + "properties": { + "first_call_timestamp": { "type": "string", "minLength": 1 }, + "last_call_timestamp": { "type": "string", "minLength": 1 }, + "total_time": { "type": "number" } + }, + "additionalProperties": true + } + } + }, + "additionalProperties": true + }, + "raw_eval_results": { + "type": "object", + "minProperties": 1, + "propertyNames": { "pattern": "^capsule-[0-9]+$" }, + "additionalProperties": { + "type": "object", + "required": [ + "correct_written_answers", + "correct_vision_answers", + "total_written_questions", + "total_vision_questions" + ], + 
"properties": { + "correct_written_answers": { "type": "integer", "minimum": 0 }, + "correct_vision_answers": { "type": "integer", "minimum": 0 }, + "total_written_questions": { "type": "integer", "minimum": 0 }, + "total_vision_questions": { "type": "integer", "minimum": 0 }, + "error": { "type": "string" } + }, + "additionalProperties": true + } + }, + "raw_logging_results": { + "type": "array", + "minItems": 1, + "items": { "type": "object", "additionalProperties": true } + }, + "total_usage": { + "type": "object", + "additionalProperties": { + "type": "object", + "properties": { + "prompt_tokens": { "type": "number", "minimum": 0 }, + "completion_tokens": { "type": "number", "minimum": 0 }, + "cache_creation_input_tokens": { "type": "number", "minimum": 0 }, + "cache_read_input_tokens": { "type": "number", "minimum": 0 } + }, + "additionalProperties": true + } + }, + "total_cost": { "type": "number" }, + "git_info": { + "type": "object", + "minProperties": 1, + "anyOf": [ + { + "required": ["commit", "repository_url", "branch"], + "properties": { + "commit": { "type": "string", "minLength": 1 }, + "repository_url": { "type": "string", "minLength": 1 }, + "branch": { "type": "string", "minLength": 1 }, + "commit_timestamp": { "type": "string" }, + "commit_url": { "type": "string" } + } + }, + { + "required": ["error"], + "properties": { "error": { "type": "string" } } + } + ] + }, + "prompt_sensitivity_metrics": { "type": "object" }, + "task_metrics": { "type": "object" } + }, + "additionalProperties": true +} diff --git a/tests/gh_actions/verify_corebench_hard_e2e.sh b/tests/gh_actions/verify_corebench_hard_e2e.sh new file mode 100755 index 0000000..03dc26f --- /dev/null +++ b/tests/gh_actions/verify_corebench_hard_e2e.sh @@ -0,0 +1,84 @@ +#!/usr/bin/env bash +# Validate CoreBench hard VM e2e run directory and *_UPLOAD.json (schema alongside this script). 
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+BENCH_DIR="${ROOT}/results/corebench_hard"
+SCHEMA="${SCRIPT_DIR}/corebench_hard_upload.schema.json"
+
+if [[ ! -d "$BENCH_DIR" ]]; then
+  echo "::error::Expected benchmark results directory missing: $BENCH_DIR" >&2
+  exit 1
+fi
+
+# Newest run directory (default hal-eval run_id: corebench_hard_<agent>_<timestamp>).
+shopt -s nullglob
+dirs=("$BENCH_DIR"/corebench_hard_*)
+shopt -u nullglob
+run_dir=""
+if ((${#dirs[@]})); then
+  run_dir="$(ls -td "${dirs[@]}" | head -n1)"
+fi
+
+if [[ -z "${run_dir:-}" || ! -d "$run_dir" ]]; then
+  echo "::error::No run directory matching results/corebench_hard/corebench_hard_*" >&2
+  exit 1
+fi
+
+run_id="$(basename "$run_dir")"
+upload="${run_dir}/${run_id}_UPLOAD.json"
+
+echo "Verifying e2e artifacts under: $run_id"
+
+require_file() {
+  local f="$1"
+  if [[ ! -f "$f" ]]; then
+    echo "::error::Missing required file: $f" >&2
+    exit 1
+  fi
+}
+
+require_dir() {
+  local d="$1"
+  if [[ ! -d "$d" ]]; then
+    echo "::error::Missing required directory: $d" >&2
+    exit 1
+  fi
+}
+
+require_file "$upload"
+require_file "${run_dir}/${run_id}.json"
+require_file "${run_dir}/${run_id}.log"
+require_file "${run_dir}/${run_id}_RAW_SUBMISSIONS.jsonl"
+
+require_dir "${run_dir}/agent_logs"
+if ! compgen -G "${run_dir}/agent_logs/*_log.log" >/dev/null; then
+  echo "::error::Expected at least one agent_logs/*_log.log under $run_dir" >&2
+  exit 1
+fi
+
+if ! find "$run_dir" -mindepth 1 -maxdepth 1 -type d -name 'capsule-*' -print -quit | grep -q .; then
+  echo "::error::Expected a per-task capsule-* directory under $run_dir (VM result copy)" >&2
+  exit 1
+fi
+
+if ! 
find "$run_dir" -mindepth 1 -maxdepth 1 -type f -name 'setup_vm_log_*.log' -print -quit | grep -q .; then + echo "::error::Expected setup_vm_log_*.log under $run_dir" >&2 + exit 1 +fi + +python3 - "$SCHEMA" "$upload" <<'PY' +import json +import sys +from pathlib import Path + +from jsonschema import Draft202012Validator + +schema_path, upload_path = Path(sys.argv[1]), Path(sys.argv[2]) +schema = json.loads(schema_path.read_text(encoding="utf-8")) +instance = json.loads(upload_path.read_text(encoding="utf-8")) +Draft202012Validator(schema).validate(instance) +PY + +echo "CoreBench hard e2e artifact checks passed."