Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/claude-code-review.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ jobs:
# github.event.pull_request.author_association == 'FIRST_TIME_CONTRIBUTOR'

runs-on: ubuntu-latest
environment: test-environment
permissions:
contents: read
pull-requests: read
Expand Down
142 changes: 142 additions & 0 deletions .github/workflows/e2e.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
# Runs a minimal CoreBench hard VM smoke eval. Requires a GitHub Environment
# (update `environment.name` below) with the secrets/vars listed in the job `env` block.
# Azure auth (pick one): (A) OIDC — Azure login with client-id, tenant-id, subscription-id
# and id-token: write (see https://github.com/azure/login ); (B) service principal + secret —
# Environment secrets `AZURE_CLIENT_ID`, `AZURE_CLIENT_SECRET`, `AZURE_TENANT_ID` plus var
# `AZURE_SUBSCRIPTION_ID` assembled into azure/login `creds` JSON; omit id-token: write.
name: HAL eval CoreBench hard (VM)

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

# New run on the same branch/PR cancels the previous in-progress run (saves VM/API cost).
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  corebench-hard-vm:
    runs-on: ubuntu-latest
    # Create this Environment in Repo → Settings → Environments and attach secrets/vars.
    environment: test-environment
    permissions:
      contents: read
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
      AZURE_SUBSCRIPTION_ID: ${{ vars.AZURE_SUBSCRIPTION_ID }}
      AZURE_RESOURCE_GROUP_NAME: ${{ vars.AZURE_RESOURCE_GROUP_NAME }}
      AZURE_LOCATION: ${{ vars.AZURE_LOCATION }}
      NETWORK_SECURITY_GROUP_NAME: ${{ vars.NETWORK_SECURITY_GROUP_NAME }}
      SSH_PRIVATE_KEY_PATH: ${{ vars.SSH_PRIVATE_KEY_PATH }}
      SSH_PUBLIC_KEY_PATH: ${{ vars.SSH_PUBLIC_KEY_PATH }}
    steps:
      - uses: actions/checkout@v4

      # Warm capsules avoid re-downloading from corebench.cs.princeton.edu every run.
      - uses: actions/cache@v4
        with:
          path: hal/benchmarks/corebench/capsules
          key: corebench-capsules-${{ hashFiles('hal/benchmarks/corebench/core_test.json.gpg') }}
          restore-keys: |
            corebench-capsules-

      # Build the azure/login `creds` JSON from individual Environment secrets/vars.
      # Fails fast with an actionable message if any piece is missing.
      - name: Assemble Azure creds JSON
        id: azure_creds
        env:
          AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
          AZURE_CLIENT_SECRET: ${{ secrets.AZURE_CLIENT_SECRET }}
          AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}
        run: |
          set -euo pipefail
          for v in AZURE_CLIENT_ID AZURE_CLIENT_SECRET AZURE_TENANT_ID AZURE_SUBSCRIPTION_ID; do
            if [ -z "${!v:-}" ]; then
              echo "::error::Required $v is empty (set on GitHub Environment \`test-environment\` secrets or vars)." >&2
              exit 1
            fi
          done
          # Use python3 json.dumps so special characters in secrets are escaped correctly.
          CREDS="$(python3 -c 'import json, os; print(json.dumps({"clientId": os.environ["AZURE_CLIENT_ID"], "clientSecret": os.environ["AZURE_CLIENT_SECRET"], "subscriptionId": os.environ["AZURE_SUBSCRIPTION_ID"], "tenantId": os.environ["AZURE_TENANT_ID"]}))')"
          {
            echo 'creds<<CREDS_JSON_EOF'
            echo "$CREDS"
            echo 'CREDS_JSON_EOF'
          } >> "$GITHUB_OUTPUT"

      - name: Azure login (service principal)
        uses: azure/login@v2
        with:
          auth-type: SERVICE_PRINCIPAL
          creds: ${{ steps.azure_creds.outputs.creds }}

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install dependencies
        run: |
          set -euo pipefail
          pip install -e ".[dev,azure,corebench,coreagent]"
          # Ensure gql 4+ (weave needs TransportConnectionFailed); extras can pull gql 3.x.
          pip install "gql[httpx]>=4.0,<5"

      - name: Generate SSH key pair for VM provisioning
        run: |
          set -euo pipefail
          key="${SSH_PRIVATE_KEY_PATH:?SSH_PRIVATE_KEY_PATH must be set}"
          pub="${SSH_PUBLIC_KEY_PATH:?SSH_PUBLIC_KEY_PATH must be set}"
          mkdir -p "$(dirname "$key")"
          rm -f "$key" "$pub"
          ssh-keygen -t ed25519 -f "$key" -N "" -q
          test -f "$pub"
          chmod 600 "$key"
          chmod 644 "$pub"

      # core_test.json is gitignored; benchmark loads decrypted JSON (see README / hal/benchmarks/corebench.py).
      - name: Decrypt CoreBench test set
        run: |
          set -euo pipefail
          gpg --batch --yes --pinentry-mode loopback \
            --passphrase 'reproducibility' \
            --output hal/benchmarks/corebench/core_test.json \
            --decrypt hal/benchmarks/corebench/core_test.json.gpg

      - name: Run hal-eval (CoreBench hard, one test, VM)
        run: |
          set -euo pipefail
          if [ -z "${OPENAI_API_KEY:-}" ]; then
            echo "::error::OPENAI_API_KEY is empty — add repository or environment secret \`OPENAI_API_KEY\` on GitHub Environment \`test-environment\`." >&2
            exit 1
          fi
          log="$(mktemp)"
          # Capture hal-eval's exit status through the tee pipe via PIPESTATUS,
          # then additionally require the success marker in its output.
          set +e
          set -o pipefail
          hal-eval --benchmark corebench_hard \
            --agent_dir agents/core_agent \
            --agent_function main.run \
            --agent_name "CORE-Agent" \
            --vm \
            --no-download-environment \
            --max_concurrent 1 \
            --max_tasks 1 \
            -A 'model_name=openai/gpt-4.1-2025-04-14' \
            2>&1 | tee "$log"
          eval_status=${PIPESTATUS[0]}
          set -e
          if [ "$eval_status" -ne 0 ]; then
            echo "::error::hal-eval exited with status $eval_status" >&2
            exit "$eval_status"
          fi
          if ! grep -Fq 'hal.cli: Evaluation completed successfully' "$log"; then
            echo "::error::hal-eval output must contain: hal.cli: Evaluation completed successfully" >&2
            exit 1
          fi

      - name: Verify CoreBench hard result layout and upload JSON
        run: |
          set -euo pipefail
          bash tests/gh_actions/verify_corebench_hard_e2e.sh
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -405,6 +405,7 @@ hal-eval --benchmark <benchmark_name> --agent_dir <agent_directory> --agent_func
- **`--max_concurrent <number>`**: Number of parallel tasks (default: 1)
- **`--conda_env_name <env_name>`**: Conda environment for agent execution
- **`--vm`**: Run evaluation on Azure VMs
- **`--no-download-environment`**: With **`--vm`**, omit the VM’s `environment/` directory (task data, code, and results mounts) when downloading results back to the host, so the SFTP step is much faster. `output.json`, logs, and other files under `/home/agent` are still downloaded. No effect without `--vm`.
- **`--docker`**: Run evaluation in Docker containers for isolation
- **`--run_id <run_id>`**: Specify a run ID (useful for continuing runs)
- **`--continue_run`**: Continue from a previous run (requires run_id)
Expand Down
1 change: 0 additions & 1 deletion agents/core_agent/agent_hints.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@
• If there exists a file called 'manuscript' then first read this file to extract the required results to answer the questions of the task.
• If you are extracting information from html (such as the output of a Jupyter notebook), convert it to a PDF or PNG first and then extract the relevant information.
• Before running the code, first determine a list of package/dependency requirements that must be installed by reading through the README file or code itself. Then install those dependencies before running the code.
• Note: Symbolic links have been automatically created for environment/data → /data, environment/code → /code, and environment/results → /results to ensure proper file access.

Constraints:
• Use flags or modify commands to bypass any need for confirmations during execution to streamline processes.
Expand Down
22 changes: 10 additions & 12 deletions agents/core_agent/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -551,17 +551,6 @@ def run(input: dict[str, dict], **kwargs) -> dict[str, str]:
print(f"[WARNING] Failed to run 'conda list': {str(e)}")
print("=== End of Package Versions and Environment Information ===")

# Create symbolic links
try:
cwd = os.getcwd()
os.symlink(f"{cwd}/environment/data", "/data", target_is_directory=True)
os.symlink(f"{cwd}/environment/code", "/code", target_is_directory=True)
os.symlink(f"{cwd}/environment/results", "/results", target_is_directory=True)
except Exception as e:
print(
f"[WARNING] Failed to create symbolic links for /data, /code, and /results: {str(e)}"
)

assert "model_name" in kwargs, "model_name is required"
assert len(input) == 1, "input must contain only one task"

Expand Down Expand Up @@ -692,10 +681,19 @@ async def acompletion_with_provider(*args, **completion_kwargs):

model = LiteLLMModel(**model_params)

# CoreBench layout: always use cwd-based paths (no /data, /code, /results symlinks).
cwd = os.getcwd()
env_paths = (
f"• Task directories — use these absolute paths: "
f"{cwd}/environment/data (data), {cwd}/environment/code (code), "
f"{cwd}/environment/results (results).\n\n"
)
# Prepend hints to the task prompt if available
prompt = task["prompt"]
if hints:
prompt = f"{hints}\n\n{prompt}"
prompt = f"{env_paths}{hints}\n\n{prompt}"
else:
prompt = f"{env_paths}{prompt}"

# Create a custom FinalAnswerTool that includes key validation and LLM-based giving-up detection
class CustomFinalAnswerTool(Tool):
Expand Down
Loading
Loading