diff --git a/.github/workflows/claude-code-review.yml b/.github/workflows/claude-code-review.yml index 76923dc..83c8bf4 100644 --- a/.github/workflows/claude-code-review.yml +++ b/.github/workflows/claude-code-review.yml @@ -32,6 +32,7 @@ jobs: # github.event.pull_request.author_association == 'FIRST_TIME_CONTRIBUTOR' runs-on: ubuntu-latest + environment: test-environment permissions: contents: read pull-requests: read diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml new file mode 100644 index 0000000..422ae1a --- /dev/null +++ b/.github/workflows/e2e.yml @@ -0,0 +1,142 @@ +# Runs a minimal CoreBench hard VM smoke eval. Requires a GitHub Environment +# (update `environment.name` below) with the secrets/vars listed in the job `env` block. +# Azure auth (pick one): (A) OIDC — Azure login with client-id, tenant-id, subscription-id +# and id-token: write (see https://github.com/azure/login ); (B) service principal + secret — +# Environment secrets `AZURE_CLIENT_ID`, `AZURE_CLIENT_SECRET`, `AZURE_TENANT_ID` plus var +# `AZURE_SUBSCRIPTION_ID` assembled into azure/login `creds` JSON; omit id-token: write. +name: HAL eval CoreBench hard (VM) + +on: + push: + branches: [main] + pull_request: + branches: [main] + +# New run on the same branch/PR cancels the previous in-progress run (saves VM/API cost). +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + corebench-hard-vm: + runs-on: ubuntu-latest + # Create this Environment in Repo → Settings → Environments and attach secrets/vars.
+ environment: test-environment + permissions: + contents: read + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} + AZURE_SUBSCRIPTION_ID: ${{ vars.AZURE_SUBSCRIPTION_ID }} + AZURE_RESOURCE_GROUP_NAME: ${{ vars.AZURE_RESOURCE_GROUP_NAME }} + AZURE_LOCATION: ${{ vars.AZURE_LOCATION }} + NETWORK_SECURITY_GROUP_NAME: ${{ vars.NETWORK_SECURITY_GROUP_NAME }} + SSH_PRIVATE_KEY_PATH: ${{ vars.SSH_PRIVATE_KEY_PATH }} + SSH_PUBLIC_KEY_PATH: ${{ vars.SSH_PUBLIC_KEY_PATH }} + steps: + - uses: actions/checkout@v4 + + # Warm capsules avoid re-downloading from corebench.cs.princeton.edu every run. + # - uses: actions/cache@v4 + # with: + # path: hal/benchmarks/corebench/capsules + # key: corebench-capsules-${{ hashFiles('hal/benchmarks/corebench/core_test.json.gpg') }} + # restore-keys: | + # corebench-capsules- + + - name: Assemble Azure creds JSON + id: azure_creds + env: + AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }} + AZURE_CLIENT_SECRET: ${{ secrets.AZURE_CLIENT_SECRET }} + AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }} + run: | + set -euo pipefail + for v in AZURE_CLIENT_ID AZURE_CLIENT_SECRET AZURE_TENANT_ID AZURE_SUBSCRIPTION_ID; do + if [ -z "${!v:-}" ]; then + echo "::error::Required $v is empty (set on GitHub Environment \`test-environment\` secrets or vars)." 
>&2 + exit 1 + fi + done + CREDS="$(python3 -c 'import json, os; print(json.dumps({"clientId": os.environ["AZURE_CLIENT_ID"], "clientSecret": os.environ["AZURE_CLIENT_SECRET"], "subscriptionId": os.environ["AZURE_SUBSCRIPTION_ID"], "tenantId": os.environ["AZURE_TENANT_ID"]}))')" + { + echo 'creds<<CREDS_JSON' + echo "$CREDS" + echo 'CREDS_JSON' + } >> "$GITHUB_OUTPUT" + + - name: Azure login (service principal) + uses: azure/login@v2 + with: + auth-type: SERVICE_PRINCIPAL + creds: ${{ steps.azure_creds.outputs.creds }} + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install dependencies + run: | + set -euo pipefail + pip install -e ".[dev,azure,corebench,coreagent]" + # Ensure gql 4+ (weave needs TransportConnectionFailed); extras can pull gql 3.x. + pip install "gql[httpx]>=4.0,<5" + + - name: Generate SSH key pair for VM provisioning + run: | + set -euo pipefail + key="${SSH_PRIVATE_KEY_PATH:?SSH_PRIVATE_KEY_PATH must be set}" + pub="${SSH_PUBLIC_KEY_PATH:?SSH_PUBLIC_KEY_PATH must be set}" + mkdir -p "$(dirname "$key")" + rm -f "$key" "$pub" + ssh-keygen -t ed25519 -f "$key" -N "" -q + test -f "$pub" + chmod 600 "$key" + chmod 644 "$pub" + + # core_test.json is gitignored; benchmark loads decrypted JSON (see README / hal/benchmarks/corebench.py). + - name: Decrypt CoreBench test set + run: | + set -euo pipefail + gpg --batch --yes --pinentry-mode loopback \ + --passphrase 'reproducibility' \ + --output hal/benchmarks/corebench/core_test.json \ + --decrypt hal/benchmarks/corebench/core_test.json.gpg + + - name: Run hal-eval (CoreBench hard, one test, VM) + run: | + set -euo pipefail + if [ -z "${OPENAI_API_KEY:-}" ]; then + echo "::error::OPENAI_API_KEY is empty — add repository or environment secret \`OPENAI_API_KEY\` on GitHub Environment \`test-environment\`."
>&2 + exit 1 + fi + log="$(mktemp)" + set +e + set -o pipefail + hal-eval --benchmark corebench_hard \ + --agent_dir agents/core_agent \ + --agent_function main.run \ + --agent_name "CORE-Agent" \ + --vm \ + --no-download-environment \ + --max_concurrent 1 \ + --max_tasks 1 \ + -A 'model_name=openai/gpt-4.1-2025-04-14' \ + 2>&1 | tee "$log" + eval_status=${PIPESTATUS[0]} + set -e + if [ "$eval_status" -ne 0 ]; then + echo "::error::hal-eval exited with status $eval_status" >&2 + exit "$eval_status" + fi + if ! grep -Fq 'hal.cli: Evaluation completed successfully' "$log"; then + echo "::error::hal-eval output must contain: hal.cli: Evaluation completed successfully" >&2 + exit 1 + fi + + - name: Verify CoreBench hard result layout and upload JSON + run: | + set -euo pipefail + bash tests/gh_actions/verify_corebench_hard_e2e.sh diff --git a/README.md b/README.md index ab4a4ef..1bbd0de 100644 --- a/README.md +++ b/README.md @@ -405,6 +405,7 @@ hal-eval --benchmark --agent_dir --agent_func - **`--max_concurrent `**: Number of parallel tasks (default: 1) - **`--conda_env_name `**: Conda environment for agent execution - **`--vm`**: Run evaluation on Azure VMs +- **`--no-download-environment`**: With **`--vm`**, omit the VM’s `environment/` directory (task data, code, and results mounts) when downloading results back to the host, so the SFTP step is much faster. `output.json`, logs, and other files under `/home/agent` are still downloaded. No effect without `--vm`. 
- **`--docker`**: Run evaluation in Docker containers for isolation - **`--run_id `**: Specify a run ID (useful for continuing runs) - **`--continue_run`**: Continue from a previous run (requires run_id) diff --git a/agents/core_agent/agent_hints.py b/agents/core_agent/agent_hints.py index e81c752..6ed27d3 100644 --- a/agents/core_agent/agent_hints.py +++ b/agents/core_agent/agent_hints.py @@ -56,7 +56,6 @@ • If there exists a file called 'manuscript' then first read this file to extract the required results to answer the questions of the task. • If you are extracting information from html (such as the output of a Jupyter notebook), convert it to a PDF or PNG first and then extract the relevant information. • Before running the code, first determine a list of package/dependency requirements that must be installed by reading through the README file or code itself. Then install those dependencies before running the code. -• Note: Symbolic links have been automatically created for environment/data → /data, environment/code → /code, and environment/results → /results to ensure proper file access. Constraints: • Use flags or modify commands to bypass any need for confirmations during execution to streamline processes. 
diff --git a/agents/core_agent/main.py b/agents/core_agent/main.py index 707dd3e..2e8ab98 100644 --- a/agents/core_agent/main.py +++ b/agents/core_agent/main.py @@ -551,17 +551,6 @@ def run(input: dict[str, dict], **kwargs) -> dict[str, str]: print(f"[WARNING] Failed to run 'conda list': {str(e)}") print("=== End of Package Versions and Environment Information ===") - # Create symbolic links - try: - cwd = os.getcwd() - os.symlink(f"{cwd}/environment/data", "/data", target_is_directory=True) - os.symlink(f"{cwd}/environment/code", "/code", target_is_directory=True) - os.symlink(f"{cwd}/environment/results", "/results", target_is_directory=True) - except Exception as e: - print( - f"[WARNING] Failed to create symbolic links for /data, /code, and /results: {str(e)}" - ) - assert "model_name" in kwargs, "model_name is required" assert len(input) == 1, "input must contain only one task" @@ -692,10 +681,19 @@ async def acompletion_with_provider(*args, **completion_kwargs): model = LiteLLMModel(**model_params) + # CoreBench layout: always use cwd-based paths (no /data, /code, /results symlinks). + cwd = os.getcwd() + env_paths = ( + f"• Task directories — use these absolute paths: " + f"{cwd}/environment/data (data), {cwd}/environment/code (code), " + f"{cwd}/environment/results (results).\n\n" + ) # Prepend hints to the task prompt if available prompt = task["prompt"] if hints: - prompt = f"{hints}\n\n{prompt}" + prompt = f"{env_paths}{hints}\n\n{prompt}" + else: + prompt = f"{env_paths}{prompt}" # Create a custom FinalAnswerTool that includes key validation and LLM-based giving-up detection class CustomFinalAnswerTool(Tool): diff --git a/docs/EXECUTION_FLOW.md b/docs/EXECUTION_FLOW.md new file mode 100644 index 0000000..f3f2c81 --- /dev/null +++ b/docs/EXECUTION_FLOW.md @@ -0,0 +1,232 @@ +# HAL evaluation execution flow (hal-eval → finish) + +High-level map of how a run flows from the `hal-eval` CLI to completion. + +--- + +## 1. 
Entry point + +``` +hal-eval (console script from pyproject.toml) + → hal.cli:main +``` + +**`pyproject.toml`** `[project.scripts]`: `hal-eval = "hal.cli:main"` + +--- + +## 2. CLI + +**File:** `hal/cli.py` + +- **Click** parses options (`--benchmark`, `--agent_dir`, `--vm`, `--task_timeout`, etc.). +- **main()**: + 1. **Parse args**: `parse_cli_args(a)`, `parse_cli_args(b)`, `parse_cli_args(i)` → `agent_args`, `benchmark_args`, `inspect_eval_args`. + 2. **run_id**: default `{benchmark}_{agent_name_sanitized}_{timestamp}` or use `--run_id`. + 3. **Logging**: `setup_logging(log_dir, run_id, use_vm=vm)`; `log_dir = results_dir/benchmark/run_id`. + 4. **Validation**: model pricing if `model_name` in agent_args; exactly one of conda/vm/docker; `--continue_run` requires `run_id`. + 5. **print_run_config(...)**. + 6. **Build run command**: `run_command = "hal-eval " + sys.argv[1:]` (for logging/repro). + 7. **Create AgentRunner** with all parsed options (including `task_timeout`, `agent_function`, `agent_dir`, `benchmark_name`, etc.). + 8. **Run**: `asyncio.run(runner.run(agent_name=agent_name, upload=upload))`. + 9. **Post-run**: `log_results(results)`, `log_run_summary(...)` (or warning if no benchmark/run_dir). + +--- + +## 3. AgentRunner + +**File:** `hal/agent_runner.py` + +### 3.1 Construction (`__init__`) + +- **BenchmarkManager** (`hal/benchmark_manager.py`): (agent_dir, config) → **get_benchmark(benchmark_name)** → concrete benchmark (e.g. `CoreBenchBenchmark` from `hal/benchmarks/corebench.py`). +- **Runner choice** (one of): + - **VirtualMachineRunner** (`hal/utils/virtual_machine_runner.py`): (log_dir, benchmark, task_timeout, max_concurrent) if `use_vm`; + - **DockerRunner** (`hal/utils/docker_runner.py`): (..., task_timeout) if `use_docker`; + - **LocalRunner** (`hal/utils/local_runner.py`): (..., conda_env, task_timeout) otherwise. +- Stores agent_function, agent_dir, agent_args, run_id, etc. + +### 3.2 Run pipeline (`run()`) + +1. 
**Weave**: `weave.init(self.run_id)`. +2. **Dataset**: `dataset = self.benchmark.get_dataset()`. +3. **Continue run** (if `continue_run`): + - If not `ignore_errors`: `dataset = self.get_remaining_tasks(dataset)` (filter out tasks already in `*_RAW_SUBMISSIONS.jsonl`). + - If `ignore_errors`: `dataset = {}` (evaluation uses only previous submissions). +4. **Filter**: by `--task_ids` if set; cap by `--max_tasks` if set. +5. **Prompt sensitivity** (if enabled): build variation datasets; then either single-variation or multi-variation runs. +6. **Continue run cleanup**: if continuing and not ignore_errors, delete Weave calls for tasks in dataset so they can be re-run. +7. **Run agent**: + - **Normal**: `agent_output = await self.runner.run_agent(dataset=dataset, agent_function=..., agent_dir=..., agent_args=..., run_id=..., benchmark=..., task=..., progress=...)`. + - **Prompt sensitivity (single var)**: same but with `single_variation_dataset` and then set `prompt_sensitivity = False` for evaluation. + - **Prompt sensitivity (multi var)**: loop over variation indices, run agent per variation, collect `all_variations_output`. + - If `continue_run`: merge `agent_output` with previous `*_RAW_SUBMISSIONS.jsonl`. +8. **Evaluate**: + - **Prompt sensitivity**: `weave.finish()`; for each task/variation call `benchmark.evaluate_output(single_output, run_id)`; collect scores. + - **Normal**: `weave.finish()`; `eval_results = self.benchmark.evaluate_output(agent_output, self.run_id)`. +9. **Process results**: `results = self.benchmark.process_results(agent_name=..., run_id=..., agent_args=..., run_command=..., eval_results=..., weave_client=..., agent_output=..., upload=..., prompt_sensitivity=..., )`. +10. **Return** `results` to CLI. + +--- + +## 4. 
Runner: VM path + +**File:** `hal/utils/virtual_machine_runner.py` + +### 4.1 `run_agent(dataset, agent_function, agent_dir, agent_args, run_id, benchmark, progress, task)` + +- For each `(task_id, input_data)` in `dataset`, an async **process_task(task_id, input_data)** is run. +- Concurrency is limited by **semaphore** `max_concurrent`; tasks are scheduled with **asyncio.gather(run_with_semaphore(...) for each task)**. +- Each **process_task** does the following (see below). +- Results are merged into one dict and appended to `{run_id}_RAW_SUBMISSIONS.jsonl` in `log_dir`, then returned. + +### 4.2 Per-task VM flow: `process_task(task_id, input_data)` + +1. **VM name**: `vm_name = "agent-{benchmark_name}-{uuid}"[:32]`. +2. **GPU**: read `benchmark.benchmark[task_id].get("gpu", False)`. +3. **Create VM**: `vm_manager.create_virtual_machine_by_name(...)` + → **VirtualMachineManager** (`hal/utils/virtual_machine_manager.py`) + **AzureVirtualMachine** (`hal/utils/vm/azure_virtual_machine.py`) (provision VM, wait for startup script, etc.). +4. **Temp dir**: build task payload: + - `input.json`, `agent_args.json`; + - copy any `input_data["files"]` into temp dir; + - copy benchmark `setup_script.sh` if present. +5. **Copy to VM**: + - `vm_manager.compress_and_copy_files_to_vm(vm_name, temp_dir)` (tar.gz → SFTP → extract under `/home/agent`); + - `vm_manager.compress_and_copy_files_to_vm(vm_name, agent_dir)` (agent code). +6. **Run agent on VM**: `vm_manager.run_agent_on_virtual_machine(vm_name, agent_function, task_id, input_data, agent_args, run_id, log_dir, benchmark)` (see below). +7. **Poll until done or timeout** (`task_timeout` seconds): + - Every 30s: `fetch_agent_logs(vm_name, ...)` (pull `agent_trace.log`), then `vm_manager.check_task_completion(vm_name)` (checks for `/home/agent/output.json`). + - If `output.json` appears → task complete; break. +8. **Copy results**: `vm_manager.copy_files_from_vm(vm_name, dest_dir, download_environment=...)` (e.g. 
`log_dir/{task_id}/`); read `output.json` from dest_dir → **result**. With `hal-eval --no-download-environment` (requires `--vm`), the archive omits `environment/` (task payload) for a faster download; `output.json` and other files under `/home/agent` are still copied. +9. **Cleanup**: `vm_manager.delete_virtual_machine_by_name(vm_name)`. +10. Return `{task_id: result}` (or error/timeout dict). + +--- + +## 5. VirtualMachineManager + +**File:** `hal/utils/virtual_machine_manager.py` + +### 5.1 `create_virtual_machine_by_name(vm_name, has_gpu, setup_timeout)` + +- **AzureVirtualMachine** (`hal/utils/vm/azure_virtual_machine.py`): (name, resource_group, location, subscription_id, nsg_id, ssh_public_key, gpu=..., timeout=setup_timeout). +- VM creation (Azure API), wait for SSH, run startup (e.g. cloud-init); **timeout** is used for waiting for startup script. + +### 5.2 `compress_and_copy_files_to_vm(vm_name, source_directory)` + +- Tar.gz `source_directory` → SFTP put to `/home/agent/` → SSH `tar -xzf ... --strip-components=1 -C /home/agent`; remove remote tar and local tar. + +### 5.3 `run_agent_on_virtual_machine(vm_name, agent_function, task_id, input_data, agent_args, run_id, log_dir, benchmark)` + +1. **copy_env_and_run_setup_script** (inner): + - SFTP: copy `.env` → `/home/agent/.env`; copy `setup_vm.sh` → VM, chmod +x, run `sudo bash setup_vm.sh`. + - If benchmark has `setup_script`, run that on VM (e.g. `bash setup_script.sh` in agent home). +2. **copy_env_and_run_setup_script** (outer call) is invoked again with same args (conda/env setup if used). +3. **SFTP**: + - Write `/home/agent/input.json` (`{task_id: input_data}`), `/home/agent/agent_args.json`. + - Write `/home/agent/run_agent.env` with `RUN_ID`, `AGENT_FUNCTION`, `TASK_ID`. + - Read static **run_agent.py** from `hal/utils/vm/run_agent.py`, write to `/home/agent/run_agent.py`, chmod +x. +4. **SSH**: run + `source ... 
conda && conda activate agent_env && python /home/agent/run_agent.py > agent_trace.log 2>&1` + (non-blocking; script runs in background on VM). + +### 5.4 `check_task_completion(vm_name)` + +- Uses **AzureVirtualMachine.check_for_file_presence_by_path** for `/home/agent/output.json` (SSH test -f). + +### 5.5 `get_agent_trace(vm_name)` + +- SFTP read `/home/agent/agent_trace.log` → returned as string (used by **fetch_agent_logs** in runner). + +### 5.6 `copy_files_from_vm(vm_name, destination_directory, *, download_environment=True)` + +- SSH: `rm -rf /home/agent/miniconda3`; tar home dir on VM (with `tar --exclude=environment` when `download_environment=False`); SFTP get tar; extract locally to `destination_directory`. + +### 5.7 `delete_virtual_machine_by_name(vm_name)` + +- **AzureVirtualMachine.delete()** (`hal/utils/vm/azure_virtual_machine.py`) (VM + NIC, disk, etc.). + +--- + +## 6. On-VM agent execution + +**File:** `hal/utils/vm/run_agent.py` + +- **Static script** (no string interpolation). +- **Load env**: `load_dotenv("/home/agent/.env")`, `load_dotenv("/home/agent/run_agent.env")`. +- **Require**: `RUN_ID`, `AGENT_FUNCTION`, `TASK_ID` (exit with error if missing). +- **Parse** `AGENT_FUNCTION` → `module_name`, `function_name`. +- **Weave**: `weave.init(RUN_ID)`. +- **Read** `/home/agent/input.json`, `/home/agent/agent_args.json`. +- **Load agent**: `importlib.util.spec_from_file_location(module_name, "/home/agent/{module_name}.py")` → `exec_module` → `getattr(module, function_name)`. +- **Run**: `with weave.attributes({"weave_task_id": TASK_ID}): result = agent(input_data, **agent_args)`. +- **Write** `/home/agent/output.json` with `result`. +- On exception: write `/home/agent/error.log`, re-raise. + +--- + +## 7. 
Local / Docker runners (brief) + +- **LocalRunner.run_agent** (`hal/utils/local_runner.py`): for each task, run agent in subprocess (conda env if set) with **task_timeout**; collect stdout/result; same high-level contract (dataset in → agent_output dict out). +- **DockerRunner.run_agent** (`hal/utils/docker_runner.py`): similar but each task runs in a container; same timeout and result shape. + +--- + +## 8. Benchmark layer + +**Files:** `hal/benchmark_manager.py` (registry); `hal/benchmarks/base_benchmark.py` (base); `hal/benchmarks/.py` (e.g. `corebench.py`, `gaia.py`). + +- **get_dataset()**: benchmark-specific (e.g. load from HuggingFace, disk); returns `Dict[task_id, task_input]`. +- **evaluate_output(agent_output, run_id)**: benchmark-specific scoring; returns eval result per task (e.g. scores). +- **process_results(...)**: build upload payload, write `*_UPLOAD.json`, optionally upload to HuggingFace, return final **results** dict for CLI. + +--- + +## 9. End-to-end flow (single path, VM, one task) + +``` +hal-eval + → main() hal/cli.py + → AgentRunner(...) hal/agent_runner.py + → runner.run(agent_name, upload) hal/agent_runner.py + → weave.init(run_id) + → benchmark.get_dataset() hal/benchmarks/base_benchmark.py (or subclass) + → (filters: continue_run, task_ids, max_tasks) + → runner.run_agent(dataset, ...) 
hal/utils/virtual_machine_runner.py + → process_task(task_id, input_data) [per task] + → create_virtual_machine_by_name() hal/utils/virtual_machine_manager.py + → compress_and_copy_files_to_vm() hal/utils/virtual_machine_manager.py + → run_agent_on_virtual_machine() hal/utils/virtual_machine_manager.py + → copy_env_and_run_setup_script (setup_vm.sh, benchmark setup_script) + → write input.json, agent_args.json, run_agent.env + → deploy run_agent.py from hal/utils/vm/run_agent.py + → SSH: conda activate agent_env && python run_agent.py (runs hal/utils/vm/run_agent.py on VM) + → poll: get_agent_trace(), check_task_completion() virtual_machine_manager.py + → copy_files_from_vm() hal/utils/virtual_machine_manager.py + → delete_virtual_machine_by_name() hal/utils/virtual_machine_manager.py + vm/azure_virtual_machine.py + → merge results, append RAW_SUBMISSIONS.jsonl + → weave.finish() + → benchmark.evaluate_output(agent_output, run_id) hal/benchmarks/*.py + → benchmark.process_results(...) hal/benchmarks/base_benchmark.py (or subclass) + → log_results(results), log_run_summary(...) hal/cli.py (implementations in hal/utils/logging_utils.py) +``` + +--- + +## 10. 
Key files reference + +| Role | File | +|------|------| +| Entry + CLI | `hal/cli.py` | +| Logging helpers (setup_logging, log_results, log_run_summary, print_run_config) | `hal/utils/logging_utils.py` | +| Orchestration + run pipeline | `hal/agent_runner.py` | +| Benchmark registry + get_benchmark | `hal/benchmark_manager.py` | +| VM run loop + per-task flow | `hal/utils/virtual_machine_runner.py` | +| VM lifecycle + SSH/SFTP + run script deploy | `hal/utils/virtual_machine_manager.py` | +| Setup script copied to VM | `hal/utils/setup_vm.sh` (dir next to virtual_machine_manager.py) | +| Static agent entrypoint on VM | `hal/utils/vm/run_agent.py` | +| Azure VM resource creation/deletion | `hal/utils/vm/azure_virtual_machine.py` | +| Benchmark base (get_dataset, evaluate_output, process_results) | `hal/benchmarks/base_benchmark.py` | +| Concrete benchmarks (corebench, gaia, scicode, etc.) | `hal/benchmarks/*.py` | +| Local process runner | `hal/utils/local_runner.py` | +| Docker runner | `hal/utils/docker_runner.py` | diff --git a/hal/agent_runner.py b/hal/agent_runner.py index c54a6d9..dc11adf 100644 --- a/hal/agent_runner.py +++ b/hal/agent_runner.py @@ -41,6 +41,7 @@ def __init__( variation_index: Optional[int] = None, results_dir: str = "results", task_ids: Optional[str] = None, + download_environment: bool = True, ): # Validate agent_function format if not isinstance(agent_function, str) or "." not in agent_function: @@ -72,9 +73,13 @@ def __init__( "Only one of conda_env, use_vm, or use_docker can be set at a time." ) - # Initialize benchmark first + # Initialize benchmark first (--max_tasks limits CoreBench capsule downloads; full load if + # --task_ids is used so those IDs are present in the benchmark dict). 
self.benchmark_manager = BenchmarkManager(agent_dir, config) - self.benchmark = self.benchmark_manager.get_benchmark(benchmark_name) + capsule_preload_limit = None if task_ids else max_tasks + self.benchmark = self.benchmark_manager.get_benchmark( + benchmark_name, max_tasks=capsule_preload_limit + ) self.benchmark.agent_args = agent_args # Override results directory if non-default @@ -121,6 +126,7 @@ def __init__( log_dir=self.benchmark.get_run_dir(self.run_id), benchmark=self.benchmark, task_timeout=task_timeout, + download_environment=download_environment, ) elif use_docker: self.runner = DockerRunner( @@ -250,11 +256,18 @@ async def run(self, agent_name: str, upload: bool = False) -> Dict[str, Any]: logger.error("No valid task IDs found. Exiting.") return {} - # Limit the number of tasks if max_tasks is specified - if self.max_tasks and self.max_tasks > 0 and self.max_tasks < len(dataset): - logger.info(f"Limiting to the first {self.max_tasks} tasks as requested") - task_ids = list(dataset.keys())[: self.max_tasks] - dataset = {task_id: dataset[task_id] for task_id in task_ids} + # Cap tasks (--max_tasks). Always apply here when set, not only when max_tasks < len(dataset), + # so execution cannot exceed N even if a benchmark constructor omits pre-slicing. 
+ if self.max_tasks is not None and self.max_tasks > 0: + before = len(dataset) + selected = list(dataset.keys())[: self.max_tasks] + dataset = {k: dataset[k] for k in selected} + if len(dataset) < before: + logger.info( + "Limiting run to %s of %s tasks (--max_tasks)", + len(dataset), + before, + ) # Handle prompt sensitivity if enabled prompt_variations_map = None diff --git a/hal/benchmark_manager.py b/hal/benchmark_manager.py index 5c25562..e86fb8a 100644 --- a/hal/benchmark_manager.py +++ b/hal/benchmark_manager.py @@ -35,7 +35,11 @@ def __init__( "colbench_frontend_design", ] - def get_benchmark(self, benchmark_name: str) -> BaseBenchmark: + def get_benchmark( + self, + benchmark_name: str, + max_tasks: Optional[int] = None, + ) -> BaseBenchmark: """Get benchmark instance for given name""" if benchmark_name == "gaia": from .benchmarks.gaia import GaiaBenchmark @@ -71,15 +75,17 @@ def get_benchmark(self, benchmark_name: str) -> BaseBenchmark: elif benchmark_name == "corebench_easy": from .benchmarks.corebench import CoreBenchEasy - benchmark = CoreBenchEasy(self.agent_dir, self.config) + benchmark = CoreBenchEasy(self.agent_dir, self.config, max_tasks=max_tasks) elif benchmark_name == "corebench_medium": from .benchmarks.corebench import CoreBenchMedium - benchmark = CoreBenchMedium(self.agent_dir, self.config) + benchmark = CoreBenchMedium( + self.agent_dir, self.config, max_tasks=max_tasks + ) elif benchmark_name == "corebench_hard": from .benchmarks.corebench import CoreBenchHard - benchmark = CoreBenchHard(self.agent_dir, self.config) + benchmark = CoreBenchHard(self.agent_dir, self.config, max_tasks=max_tasks) elif benchmark_name == "scienceagentbench": from .benchmarks.scienceagentbench import ScienceAgentBench diff --git a/hal/benchmarks/corebench.py b/hal/benchmarks/corebench.py index eda19a1..28e14a0 100644 --- a/hal/benchmarks/corebench.py +++ b/hal/benchmarks/corebench.py @@ -3,7 +3,7 @@ import urllib.request import tarfile import time -from typing 
import Dict, Any +from typing import Any, Dict, Optional import numpy as np from scipy.stats import t import math @@ -20,7 +20,12 @@ class CoreBench(BaseBenchmark): _no_ground_truth = True - def __init__(self, agent_dir: str, config: Dict[str, Any]): + def __init__( + self, + agent_dir: str, + config: Dict[str, Any], + max_tasks: Optional[int] = None, + ): # Set benchmark_name in subclasses # Load tasks from core_test.json @@ -43,6 +48,18 @@ def __init__(self, agent_dir: str, config: Dict[str, Any]): with open(core_test_path, "r") as f: dataset = json.load(f) + if not isinstance(dataset, list): + raise TypeError("core_test.json must contain a JSON array of tasks") + + # Match hal-eval --max_tasks: only download/load capsules we will run (saves CI time). + if max_tasks is not None and max_tasks > 0 and max_tasks < len(dataset): + logger.info( + "CoreBench: loading %s of %s tasks (max_tasks); skipping other capsule downloads", + max_tasks, + len(dataset), + ) + dataset = dataset[:max_tasks] + self.benchmark = {} self.benchmark_answers = {} @@ -112,9 +129,6 @@ def __download_and_extract_capsule( backoff_factor=1, ): """Downloads and extracts a capsule archive from the CoreBench repository.""" - # FIXME: this doesn't respect the --max_tasks flag - # Expected: --max_tasks 2 only downloads 2 capsules - # Actual: --max_tasks 2 downloads all capsules, then runs only 2 capsule_dir = os.path.join(capsules_dir, capsule_id) capsule_url = f"https://corebench.cs.princeton.edu/capsules/{capsule_id}.tar.gz" tar_path = os.path.join(capsules_dir, f"{capsule_id}.tar.gz") @@ -441,9 +455,14 @@ def get_metrics(self, eval_results: Dict[str, Any]) -> Dict[str, Any]: class CoreBenchEasy(CoreBench): """CoreBench benchmark with easy difficulty level""" - def __init__(self, agent_dir: str, config: Dict[str, Any]): + def __init__( + self, + agent_dir: str, + config: Dict[str, Any], + max_tasks: Optional[int] = None, + ): self.benchmark_name = "corebench_easy" - super().__init__(agent_dir, 
config) + super().__init__(agent_dir, config, max_tasks=max_tasks) def _construct_prompt(self, task): """ @@ -476,9 +495,14 @@ def _get_capsule_files_dict(self, capsule_dir: str) -> Dict[str, str]: class CoreBenchMedium(CoreBench): """CoreBench benchmark with medium difficulty level""" - def __init__(self, agent_dir: str, config: Dict[str, Any]): + def __init__( + self, + agent_dir: str, + config: Dict[str, Any], + max_tasks: Optional[int] = None, + ): self.benchmark_name = "corebench_medium" - super().__init__(agent_dir, config) + super().__init__(agent_dir, config, max_tasks=max_tasks) def _construct_prompt(self, task): """ @@ -527,9 +551,14 @@ def _get_capsule_files_dict(self, capsule_dir: str) -> Dict[str, str]: class CoreBenchHard(CoreBench): """CoreBench benchmark with hard difficulty level""" - def __init__(self, agent_dir: str, config: Dict[str, Any]): + def __init__( + self, + agent_dir: str, + config: Dict[str, Any], + max_tasks: Optional[int] = None, + ): self.benchmark_name = "corebench_hard" - super().__init__(agent_dir, config) + super().__init__(agent_dir, config, max_tasks=max_tasks) def _construct_prompt(self, task): """ diff --git a/hal/cli.py b/hal/cli.py index c60440f..b9b439b 100644 --- a/hal/cli.py +++ b/hal/cli.py @@ -83,6 +83,15 @@ help="Path to configuration file. (currently not used)", ) @click.option("--vm", is_flag=True, help="Run the agent on azure VMs") +@click.option( + "--no-download-environment", + is_flag=True, + help=( + "With --vm only: when copying results from the VM, omit the environment/ " + "directory (task data/code/results) from the archive to speed up download. " + "output.json and other files under /home/agent are still retrieved." 
+ ), +) @click.option( "--docker", is_flag=True, @@ -179,6 +188,7 @@ def main( task_timeout, results_dir, task_ids, + no_download_environment, **kwargs, ): """Run agent evaluation on specified benchmark with given model.""" @@ -228,6 +238,11 @@ def main( ) sys.exit(1) + if no_download_environment and not vm: + logger.warning( + "--no-download-environment only applies to --vm runs; ignoring." + ) + if continue_run and not set_run_id: raise ValueError("continue_run flag requires run_id to be set") @@ -252,6 +267,9 @@ def main( prompt_sensitivity=prompt_sensitivity, num_variations=num_variations, variation_strength=variation_strength, + max_tasks=max_tasks, + task_ids=task_ids, + no_download_environment=no_download_environment and vm, ) # get exact command used to run the evaluation from click @@ -282,6 +300,7 @@ def main( task_timeout=task_timeout, results_dir=results_dir, task_ids=task_ids, + download_environment=not (no_download_environment and vm), ) # Run evaluation diff --git a/hal/utils/logging_utils.py b/hal/utils/logging_utils.py index e7a8f77..d458775 100644 --- a/hal/utils/logging_utils.py +++ b/hal/utils/logging_utils.py @@ -22,7 +22,6 @@ def setup_logging(log_dir: str, run_id: str, use_vm: bool = False) -> None: Args: log_dir: Directory for log files run_id: Unique run identifier - use_vm: Unused; kept for API compatibility. 
""" # Create absolute path for log directory to avoid path duplication log_dir = os.path.abspath(log_dir) @@ -149,6 +148,9 @@ def print_run_config( prompt_sensitivity: bool = False, num_variations: int = 3, variation_strength: str = "mild", + max_tasks: Optional[int] = None, + task_ids: Optional[str] = None, + no_download_environment: bool = False, ) -> None: """Print a formatted table with the run configuration""" logger.info("=== Run Configuration ===") @@ -159,8 +161,17 @@ def print_run_config( logger.info(f" Agent Directory: {agent_dir}") logger.info(f" Log Directory: {log_dir}") logger.info(f" Max Concurrent: {max_concurrent}") + logger.info( + f" Max Tasks: {max_tasks if max_tasks is not None else 'all'}", + ) + if task_ids: + logger.info(f" Task IDs: {task_ids}") logger.info(f" Upload Results: {'Yes' if upload else 'No'}") logger.info(f" VM Execution: {'Yes' if vm else 'No'}") + if vm: + logger.info( + f" VM Download Environment: {'No (skip environment/)' if no_download_environment else 'Yes'}" + ) logger.info(f" Docker Execution: {'Yes' if docker else 'No'}") logger.info(f" Continue Previous Run: {'Yes' if continue_run else 'No'}") logger.info(f" Ignore Errors: {'Yes' if ignore_errors else 'No'}") diff --git a/hal/utils/virtual_machine_manager.py b/hal/utils/virtual_machine_manager.py index 54c2b4c..70882de 100644 --- a/hal/utils/virtual_machine_manager.py +++ b/hal/utils/virtual_machine_manager.py @@ -3,11 +3,13 @@ from azure.identity import DefaultAzureCredential import paramiko import os +import shlex import tarfile import json import logging from contextlib import contextmanager from pathlib import Path +from dotenv import dotenv_values from .vm.azure_virtual_machine import AzureVirtualMachine # Mount names for core_agent: used only under VM_AGENT_HOME/environment/ (e.g. environment/data, environment/code, environment/results from task payload). 
@@ -16,6 +18,33 @@ RUN_AGENT_SCRIPT_PATH = Path(__file__).resolve().parent / "vm" / "run_agent.py" + +def _vm_env_var_from_host(name: str) -> str | None: + """Resolve a secret from the host process env or local .env (for VM payload files).""" + v = os.environ.get(name, "").strip() + if v: + return v + env_file = Path.cwd() / ".env" + if env_file.is_file(): + raw = dotenv_values(env_file).get(name) + if raw: + s = str(raw).strip() + if s: + return s + return None + + +# Injected into run_agent.env so the remote agent sees keys when the host has no .env file +# to copy to the VM (e.g. CI: secrets exist only on the runner environment). +_VM_RUN_AGENT_SECRET_NAMES = ( + "WANDB_API_KEY", + "OPENAI_API_KEY", + "ANTHROPIC_API_KEY", + "GEMINI_API_KEY", + "TOGETHERAI_API_KEY", + "OPENROUTER_API_KEY", +) + # Set up base logger _base_logger = logging.getLogger(__name__) @@ -271,8 +300,20 @@ def compress_and_copy_files_to_vm(self, vm_name, source_directory): logger.error(f"Error copying files: {e}") raise - def copy_files_from_vm(self, vm_name, destination_directory): - """Copy files from the VM to local directory.""" + def copy_files_from_vm( + self, + vm_name, + destination_directory, + *, + download_environment: bool = True, + ): + """Copy files from the VM to local directory. + + When download_environment is False, omits ``/home/agent/environment`` from + the archive (task data/code/results) to speed up SFTP; ``output.json`` and + other home-directory files are still included. 
+ """ + logger = _get_logger(vm_name) with self._get_sftp_client( vm_name, self.network_client, @@ -288,8 +329,15 @@ def copy_files_from_vm(self, vm_name, destination_directory): f"/home/agent/{os.path.basename(destination_directory)}_back.tar.gz" ) remote_home_directory = "/home/agent" + exclude = "" + if not download_environment: + logger.info( + "Excluding environment/ from VM results archive (faster download)" + ) + exclude = "--exclude=environment " + quoted_tar = shlex.quote(remote_tar_file_path) _, stdout, _ = ssh_client.exec_command( - f"tar -czf {remote_tar_file_path} -C {remote_home_directory} ." + f"tar {exclude}-czf {quoted_tar} -C {remote_home_directory} ." ) for _ in stdout: pass # Block until the tar command completes @@ -423,6 +471,10 @@ def copy_env_and_run_setup_script( # Write run-specific env vars for static run_agent.py run_agent_env = f"RUN_ID={run_id}\nAGENT_FUNCTION={agent_function}\nTASK_ID={task_id}\n" + for name in _VM_RUN_AGENT_SECRET_NAMES: + val = _vm_env_var_from_host(name) + if val: + run_agent_env += f"{name}={val}\n" with sftp_client.open("/home/agent/run_agent.env", "w") as f: f.write(run_agent_env) diff --git a/hal/utils/virtual_machine_runner.py b/hal/utils/virtual_machine_runner.py index 211cfb4..e380313 100644 --- a/hal/utils/virtual_machine_runner.py +++ b/hal/utils/virtual_machine_runner.py @@ -25,10 +25,12 @@ def __init__( task_timeout: int, max_concurrent: int = 1, benchmark: Optional[BaseBenchmark] = None, + download_environment: bool = True, ): self.max_concurrent = max_concurrent self.log_dir = log_dir self.task_timeout = task_timeout + self.download_environment = download_environment self.vm_manager = VirtualMachineManager() self._semaphore = asyncio.Semaphore(max_concurrent) self._file_lock = asyncio.Lock() @@ -248,6 +250,7 @@ async def process_task(task_id: str, input_data: Any) -> Optional[Dict]: self.vm_manager.copy_files_from_vm, vm_name, dest_dir, + download_environment=self.download_environment, ) # Read the 
output.json file from the copied directory @@ -277,7 +280,12 @@ async def process_task(task_id: str, input_data: Any) -> Optional[Dict]: except Exception as e: logger.error(f"Error processing task {task_id} on VM {vm_name}: {e}") traceback.print_exc() - return {task_id: f"ERROR: {str(e)}"} + # Re-raise so the eval run fails hard; `finally` below still deletes the VM (or + # whatever resources exist for this name). Swallowing here made hal-eval continue + # and report success despite Azure / provisioning failures. + raise RuntimeError( + f"VM run failed for task {task_id} (VM {vm_name}): {e}" + ) from e finally: wall_clock_time = time.time() - task_start_time diff --git a/pyproject.toml b/pyproject.toml index 7c02a40..64430da 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,6 +22,7 @@ dependencies = [ [project.optional-dependencies] dev = [ "ruff==0.15.4", + "jsonschema>=4.22.0", "pytest>=8.0.0", "pytest-mock>=3.12.0", "matplotlib>=3.7.0", diff --git a/tests/gh_actions/corebench_hard_upload.schema.json b/tests/gh_actions/corebench_hard_upload.schema.json new file mode 100644 index 0000000..2fcd8c2 --- /dev/null +++ b/tests/gh_actions/corebench_hard_upload.schema.json @@ -0,0 +1,173 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "hal-harness/tests/gh_actions/corebench_hard_upload", + "title": "CoreBench hard VM e2e upload artifact", + "description": "Matches hal.benchmarks.base_benchmark.BaseBenchmark.process_results output for a CoreBench run.", + "type": "object", + "required": [ + "config", + "results", + "raw_eval_results", + "raw_logging_results", + "total_usage", + "total_cost", + "git_info" + ], + "properties": { + "config": { + "type": "object", + "required": [ + "agent_name", + "benchmark_name", + "date", + "run_id", + "agent_args", + "run_command", + "prompt_sensitivity" + ], + "properties": { + "agent_name": { "const": "CORE-Agent" }, + "benchmark_name": { "const": "corebench_hard" }, + "date": { + "type": "string", + 
"pattern": "^[0-9]{4}-[0-9]{2}-[0-9]{2}$" + }, + "run_id": { + "type": "string", + "pattern": "^corebench_hard_coreagent_[0-9]+$" + }, + "agent_args": { + "type": "object", + "required": ["model_name", "benchmark_name"], + "properties": { + "model_name": { + "type": "string", + "const": "openai/gpt-4.1-2025-04-14" + }, + "benchmark_name": { + "type": "string", + "const": "corebench_hard" + } + }, + "additionalProperties": true + }, + "run_command": { + "type": "string", + "minLength": 1, + "pattern": "hal-eval.*--benchmark corebench_hard" + }, + "prompt_sensitivity": { "type": "boolean", "const": false } + }, + "additionalProperties": true + }, + "results": { + "type": "object", + "required": [ + "accuracy", + "written_accuracy", + "vision_accuracy", + "successful_tasks", + "failed_tasks", + "total_cost", + "latencies" + ], + "properties": { + "accuracy": { "type": "number" }, + "written_accuracy": { "type": "number" }, + "vision_accuracy": { "type": "number" }, + "successful_tasks": { + "type": "array", + "items": { "type": "string", "pattern": "^capsule-[0-9]+$" } + }, + "failed_tasks": { + "type": "array", + "items": { "type": "string", "pattern": "^capsule-[0-9]+$" } + }, + "total_cost": { "type": "number" }, + "latencies": { + "type": "object", + "minProperties": 1, + "additionalProperties": { + "type": "object", + "required": [ + "first_call_timestamp", + "last_call_timestamp", + "total_time" + ], + "properties": { + "first_call_timestamp": { "type": "string", "minLength": 1 }, + "last_call_timestamp": { "type": "string", "minLength": 1 }, + "total_time": { "type": "number" } + }, + "additionalProperties": true + } + } + }, + "additionalProperties": true + }, + "raw_eval_results": { + "type": "object", + "minProperties": 1, + "propertyNames": { "pattern": "^capsule-[0-9]+$" }, + "additionalProperties": { + "type": "object", + "required": [ + "correct_written_answers", + "correct_vision_answers", + "total_written_questions", + "total_vision_questions" + ], + 
"properties": { + "correct_written_answers": { "type": "integer", "minimum": 0 }, + "correct_vision_answers": { "type": "integer", "minimum": 0 }, + "total_written_questions": { "type": "integer", "minimum": 0 }, + "total_vision_questions": { "type": "integer", "minimum": 0 }, + "error": { "type": "string" } + }, + "additionalProperties": true + } + }, + "raw_logging_results": { + "type": "array", + "minItems": 1, + "items": { "type": "object", "additionalProperties": true } + }, + "total_usage": { + "type": "object", + "additionalProperties": { + "type": "object", + "properties": { + "prompt_tokens": { "type": "number", "minimum": 0 }, + "completion_tokens": { "type": "number", "minimum": 0 }, + "cache_creation_input_tokens": { "type": "number", "minimum": 0 }, + "cache_read_input_tokens": { "type": "number", "minimum": 0 } + }, + "additionalProperties": true + } + }, + "total_cost": { "type": "number" }, + "git_info": { + "type": "object", + "minProperties": 1, + "anyOf": [ + { + "required": ["commit", "repository_url", "branch"], + "properties": { + "commit": { "type": "string", "minLength": 1 }, + "repository_url": { "type": "string", "minLength": 1 }, + "branch": { "type": "string", "minLength": 1 }, + "commit_timestamp": { "type": "string" }, + "commit_url": { "type": "string" } + } + }, + { + "required": ["error"], + "properties": { "error": { "type": "string" } } + } + ] + }, + "prompt_sensitivity_metrics": { "type": "object" }, + "task_metrics": { "type": "object" } + }, + "additionalProperties": true +} diff --git a/tests/gh_actions/verify_corebench_hard_e2e.sh b/tests/gh_actions/verify_corebench_hard_e2e.sh new file mode 100755 index 0000000..03dc26f --- /dev/null +++ b/tests/gh_actions/verify_corebench_hard_e2e.sh @@ -0,0 +1,84 @@ +#!/usr/bin/env bash +# Validate CoreBench hard VM e2e run directory and *_UPLOAD.json (schema alongside this script). 
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+BENCH_DIR="${ROOT}/results/corebench_hard"
+SCHEMA="${SCRIPT_DIR}/corebench_hard_upload.schema.json"
+
+if [[ ! -d "$BENCH_DIR" ]]; then
+  echo "::error::Expected benchmark results directory missing: $BENCH_DIR" >&2
+  exit 1
+fi
+
+# Newest run directory (default hal-eval run_id: corebench_hard_<agent>_<timestamp>).
+shopt -s nullglob
+dirs=("$BENCH_DIR"/corebench_hard_*)
+shopt -u nullglob
+run_dir=""
+if ((${#dirs[@]})); then
+  run_dir="$(ls -td "${dirs[@]}" | head -n1)"
+fi
+
+if [[ -z "${run_dir:-}" || ! -d "$run_dir" ]]; then
+  echo "::error::No run directory matching results/corebench_hard/corebench_hard_*" >&2
+  exit 1
+fi
+
+run_id="$(basename "$run_dir")"
+upload="${run_dir}/${run_id}_UPLOAD.json"
+
+echo "Verifying e2e artifacts under: $run_id"
+
+require_file() {
+  local f="$1"
+  if [[ ! -f "$f" ]]; then
+    echo "::error::Missing required file: $f" >&2
+    exit 1
+  fi
+}
+
+require_dir() {
+  local d="$1"
+  if [[ ! -d "$d" ]]; then
+    echo "::error::Missing required directory: $d" >&2
+    exit 1
+  fi
+}
+
+require_file "$upload"
+require_file "${run_dir}/${run_id}.json"
+require_file "${run_dir}/${run_id}.log"
+require_file "${run_dir}/${run_id}_RAW_SUBMISSIONS.jsonl"
+
+require_dir "${run_dir}/agent_logs"
+if ! compgen -G "${run_dir}/agent_logs/*_log.log" >/dev/null; then
+  echo "::error::Expected at least one agent_logs/*_log.log under $run_dir" >&2
+  exit 1
+fi
+
+if ! find "$run_dir" -mindepth 1 -maxdepth 1 -type d -name 'capsule-*' -print -quit | grep -q .; then
+  echo "::error::Expected a per-task capsule-* directory under $run_dir (VM result copy)" >&2
+  exit 1
+fi
+
+if ! 
find "$run_dir" -mindepth 1 -maxdepth 1 -type f -name 'setup_vm_log_*.log' -print -quit | grep -q .; then + echo "::error::Expected setup_vm_log_*.log under $run_dir" >&2 + exit 1 +fi + +python3 - "$SCHEMA" "$upload" <<'PY' +import json +import sys +from pathlib import Path + +from jsonschema import Draft202012Validator + +schema_path, upload_path = Path(sys.argv[1]), Path(sys.argv[2]) +schema = json.loads(schema_path.read_text(encoding="utf-8")) +instance = json.loads(upload_path.read_text(encoding="utf-8")) +Draft202012Validator(schema).validate(instance) +PY + +echo "CoreBench hard e2e artifact checks passed."