princeton-pli · cdev412 · Mar 19, 2026 · Mar 20, 2026 · Mar 20, 2026 · Mar 20, 2026
diff --git a/.github/workflows/claude-code-review.LMY b/.github/workflows/claude-code-review.LMY
@@ -32,6 +32,7 @@ jobs:
     #   github.event.pull_request.author_association == 'FIRST_TIME_CONTRIBUTOR'
 
     runs-on: ubuntu-latest
+    environment: test-environment
     permissions:
       contents: read
       pull-requests: read

diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml
@@ -0,0 +1,142 @@
+# Runs a minimal CoreBench hard VM smoke eval. Requires a GitHub Environment
+# (update the job's `environment:` value below) with the secrets/vars referenced in the job and step `env` blocks.
+# Azure auth (pick one): (A) OIDC — azure/login with client-id, tenant-id, subscription-id
+# and the `id-token: write` permission (see https://github.com/azure/login ); (B) service principal +
+# secret (what this workflow uses) — Environment secrets `AZURE_CLIENT_ID`, `AZURE_CLIENT_SECRET`,
+# `AZURE_TENANT_ID` plus var `AZURE_SUBSCRIPTION_ID`, assembled into azure/login `creds` JSON; omit `id-token: write`.
+name: HAL eval CoreBench hard (VM)
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+# New run on the same branch/PR cancels the previous in-progress run (saves VM/API cost).
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  corebench-hard-vm:
+    runs-on: ubuntu-latest
+    # Create this Environment in Repo → Settings → Environments and attach secrets/vars.
+    environment: test-environment
+    permissions:
+      contents: read
+    env:
+      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+      WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
+      AZURE_SUBSCRIPTION_ID: ${{ vars.AZURE_SUBSCRIPTION_ID }}
+      AZURE_RESOURCE_GROUP_NAME: ${{ vars.AZURE_RESOURCE_GROUP_NAME }}
+      AZURE_LOCATION: ${{ vars.AZURE_LOCATION }}
+      NETWORK_SECURITY_GROUP_NAME: ${{ vars.NETWORK_SECURITY_GROUP_NAME }}
+      SSH_PRIVATE_KEY_PATH: ${{ vars.SSH_PRIVATE_KEY_PATH }}
+      SSH_PUBLIC_KEY_PATH: ${{ vars.SSH_PUBLIC_KEY_PATH }}
+    steps:
+      - uses: actions/checkout@v4
+
+      # Optional (currently disabled): caching capsules avoids re-downloading them from corebench.cs.princeton.edu every run.
+      # - uses: actions/cache@v4
+      #   with:
+      #     path: hal/benchmarks/corebench/capsules
+      #     key: corebench-capsules-${{ hashFiles('hal/benchmarks/corebench/core_test.json.gpg') }}
+      #     restore-keys: |
+      #       corebench-capsules-
+
+      - name: Assemble Azure creds JSON
+        id: azure_creds
+        env:
+          AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
+          AZURE_CLIENT_SECRET: ${{ secrets.AZURE_CLIENT_SECRET }}
+          AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}
+        run: |
+          set -euo pipefail
+          for v in AZURE_CLIENT_ID AZURE_CLIENT_SECRET AZURE_TENANT_ID AZURE_SUBSCRIPTION_ID; do
+            if [ -z "${!v:-}" ]; then
+              echo "::error::Required $v is empty (set on GitHub Environment \`test-environment\` secrets or vars)." >&2
+              exit 1
+            fi
+          done
+          CREDS="$(python3 -c 'import json, os; print(json.dumps({"clientId": os.environ["AZURE_CLIENT_ID"], "clientSecret": os.environ["AZURE_CLIENT_SECRET"], "subscriptionId": os.environ["AZURE_SUBSCRIPTION_ID"], "tenantId": os.environ["AZURE_TENANT_ID"]}))')"
+          {
+            echo 'creds<<CREDS_JSON_EOF'
+            echo "$CREDS"
+            echo 'CREDS_JSON_EOF'
+          } >> "$GITHUB_OUTPUT"
+
+      - name: Azure login (service principal)
+        uses: azure/login@v2
+        with:
+          auth-type: SERVICE_PRINCIPAL
+          creds: ${{ steps.azure_creds.outputs.creds }}
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Install dependencies
+        run: |
+          set -euo pipefail
+          pip install -e ".[dev,azure,corebench,coreagent]"
+          # Ensure gql 4+ (weave needs TransportConnectionFailed); extras can pull gql 3.x.
+          pip install "gql[httpx]>=4.0,<5"
+
+      - name: Generate SSH key pair for VM provisioning
+        run: |
+          set -euo pipefail
+          key="${SSH_PRIVATE_KEY_PATH:?SSH_PRIVATE_KEY_PATH must be set}"
+          pub="${SSH_PUBLIC_KEY_PATH:?SSH_PUBLIC_KEY_PATH must be set}"
+          mkdir -p "$(dirname "$key")"
+          rm -f "$key" "$pub"
+          ssh-keygen -t ed25519 -f "$key" -N "" -q
+          test -f "$pub"
+          chmod 600 "$key"
+          chmod 644 "$pub"
+
+      # core_test.json is gitignored, so decrypt it from the checked-in core_test.json.gpg; the benchmark loads the decrypted JSON (see README / hal/benchmarks/corebench.py).
+      - name: Decrypt CoreBench test set
+        run: |
+          set -euo pipefail
+          gpg --batch --yes --pinentry-mode loopback \
+            --passphrase 'reproducibility' \
+            --output hal/benchmarks/corebench/core_test.json \
+            --decrypt hal/benchmarks/corebench/core_test.json.gpg
+
+      - name: Run hal-eval (CoreBench hard, one test, VM)
+        run: |
+          set -euo pipefail
+          if [ -z "${OPENAI_API_KEY:-}" ]; then
+            echo "::error::OPENAI_API_KEY is empty — add repository or environment secret \`OPENAI_API_KEY\` on GitHub Environment \`test-environment\`." >&2
+            exit 1
+          fi
+          log="$(mktemp)"
+          set +e
+          set -o pipefail
+          hal-eval --benchmark corebench_hard \
+            --agent_dir agents/core_agent \
+            --agent_function main.run \
+            --agent_name "CORE-Agent" \
+            --vm \
+            --no-download-environment \
+            --max_concurrent 1 \
+            --max_tasks 1 \
+            -A 'model_name=openai/gpt-4.1-2025-04-14' \
+            2>&1 | tee "$log"
+          eval_status=${PIPESTATUS[0]}
+          set -e
+          if [ "$eval_status" -ne 0 ]; then
+            echo "::error::hal-eval exited with status $eval_status" >&2
+            exit "$eval_status"
+          fi
+          if ! grep -Fq 'hal.cli: Evaluation completed successfully' "$log"; then
+            echo "::error::hal-eval output must contain: hal.cli: Evaluation completed successfully" >&2
+            exit 1
+          fi
+
+      - name: Verify CoreBench hard result layout and upload JSON
+        run: |
+          set -euo pipefail
+          bash tests/gh_actions/verify_corebench_hard_e2e.sh
diff --git a/README.md b/README.md
@@ -405,6 +405,7 @@ hal-eval --benchmark <benchmark_name> --agent_dir <agent_directory> --agent_func
 - **`--max_concurrent <number>`**: Number of parallel tasks (default: 1)
 - **`--conda_env_name <env_name>`**: Conda environment for agent execution
 - **`--vm`**: Run evaluation on Azure VMs
+- **`--no-download-environment`**: With **`--vm`**, omit the VM’s `environment/` directory (task data, code, and results mounts) when downloading results back to the host, so the SFTP step is much faster. `output.json`, logs, and other files under `/home/agent` are still downloaded. No effect without `--vm`.
 - **`--docker`**: Run evaluation in Docker containers for isolation
 - **`--run_id <run_id>`**: Specify a run ID (useful for continuing runs)
 - **`--continue_run`**: Continue from a previous run (requires run_id)

diff --git a/agents/core_agent/agent_hints.py b/agents/core_agent/agent_hints.py
@@ -56,7 +56,6 @@
 • If there exists a file called 'manuscript' then first read this file to extract the required results to answer the questions of the task.
 • If you are extracting information from html (such as the output of a Jupyter notebook), convert it to a PDF or PNG first and then extract the relevant information.
 • Before running the code, first determine a list of package/dependency requirements that must be installed by reading through the README file or code itself. Then install those dependencies before running the code.
-• Note: Symbolic links have been automatically created for environment/data → /data, environment/code → /code, and environment/results → /results to ensure proper file access.
 
 Constraints:
 • Use flags or modify commands to bypass any need for confirmations during execution to streamline processes.

diff --git a/agents/core_agent/main.py b/agents/core_agent/main.py
@@ -537,17 +537,6 @@ def run(input: dict[str, dict], **kwargs) -> dict[str, str]:
         print(f"[WARNING] Failed to run 'conda list': {str(e)}")
     print("=== End of Package Versions and Environment Information ===")
 
-    # Create symbolic links
-    try:
-        cwd = os.getcwd()
-        os.symlink(f"{cwd}/environment/data", "/data", target_is_directory=True)
-        os.symlink(f"{cwd}/environment/code", "/code", target_is_directory=True)
-        os.symlink(f"{cwd}/environment/results", "/results", target_is_directory=True)
-    except Exception as e:
-        print(
-            f"[WARNING] Failed to create symbolic links for /data, /code, and /results: {str(e)}"
-        )
-
     assert "model_name" in kwargs, "model_name is required"
     assert len(input) == 1, "input must contain only one task"
 
@@ -678,10 +667,19 @@ async def acompletion_with_provider(*args, **completion_kwargs):
 
     model = LiteLLMModel(**model_params)
 
+    # CoreBench layout: always use cwd-based paths (no /data, /code, /results symlinks).
+    cwd = os.getcwd()
+    env_paths = (
+        f"• Task directories — use these absolute paths: "
+        f"{cwd}/environment/data (data), {cwd}/environment/code (code), "
+        f"{cwd}/environment/results (results).\n\n"
+    )
     # Prepend hints to the task prompt if available
     prompt = task["prompt"]
     if hints:
-        prompt = f"{hints}\n\n{prompt}"
+        prompt = f"{env_paths}{hints}\n\n{prompt}"
+    else:
+        prompt = f"{env_paths}{prompt}"
 
     # Create a custom FinalAnswerTool that includes key validation and LLM-based giving-up detection
     class CustomFinalAnswerTool(Tool):