Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,345 changes: 1,345 additions & 0 deletions agents/opencode_agent/main.py

Large diffs are not rendered by default.

114 changes: 114 additions & 0 deletions agents/opencode_agent/opencode_setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
"""OpenCode agent setup – constants and utilities for CLI installation and configuration."""

import json
import os
import shutil
import subprocess
from typing import Optional


# ── Constants ────────────────────────────────────────────────

# Official one-line installer for the OpenCode CLI (piped straight into bash).
OPENCODE_INSTALL_CMD = "curl -fsSL https://opencode.ai/install | bash"

# Fallback paths probed when `opencode` is not on PATH; `~` entries are
# expanded at lookup time by find_opencode_binary().
KNOWN_BINARY_LOCATIONS = [
    "~/.local/bin/opencode",
    "~/.opencode/bin/opencode",
    "~/.local/share/opencode/bin/opencode",
    "/usr/local/bin/opencode",
    "/usr/bin/opencode",
    "/opt/homebrew/bin/opencode",
]

# Project-level config written by setup_opencode_config().  Every permission is
# set to "allow" so the agent never stalls waiting on an interactive approval.
OPENCODE_CONFIG = {
    "$schema": "https://opencode.ai/config.json",
    "logLevel": "DEBUG",
    "permission": {
        "*": "allow",
        "external_directory": "allow",
        "doom_loop": "allow",
        "edit": "allow",
        "bash": "allow",
        "webfetch": "allow",
        "websearch": "allow",
        "read": "allow",
        "glob": "allow",
        "grep": "allow",
        "list": "allow",
        "task": "allow",
        "skill": "allow",
    },
}


# ── Utility Functions ────────────────────────────────────────


def find_opencode_binary() -> Optional[str]:
    """Locate the ``opencode`` executable.

    Lookup order: the current PATH, then the well-known install locations in
    ``KNOWN_BINARY_LOCATIONS``, then a last-resort sweep of conventional bin
    directories under the home directory and system prefixes.

    Returns:
        Path to an executable ``opencode`` binary, or ``None`` if none found.
    """
    on_path = shutil.which("opencode")
    if on_path:
        return on_path

    def _usable(path: str) -> bool:
        # Only a regular file we are allowed to execute counts as a hit.
        return os.path.isfile(path) and os.access(path, os.X_OK)

    for candidate in (os.path.expanduser(loc) for loc in KNOWN_BINARY_LOCATIONS):
        if _usable(candidate):
            return candidate

    for prefix in (os.path.expanduser("~"), "/usr/local", "/usr", "/opt"):
        for subdir in ("bin", "local/bin", "local/share/opencode/bin"):
            candidate = os.path.join(prefix, subdir, "opencode")
            if _usable(candidate):
                return candidate

    return None


def install_system_deps() -> None:
    """Install system dependencies (git, curl, procps) via apt-get.

    First removes stale CUDA/NVIDIA apt source lists best-effort (the trailing
    ``|| true`` means a missing file is not an error), then runs
    ``apt-get update`` and the install, both with ``check=True`` so a real
    apt failure raises ``subprocess.CalledProcessError``.
    """
    # Deliberately unchecked: the source lists may not exist on this image.
    subprocess.run(
        "sudo rm -f /etc/apt/sources.list.d/cuda.list /etc/apt/sources.list.d/nvidia-ml.list 2>/dev/null || true",
        shell=True,
    )
    for apt_cmd in (
        ["sudo", "apt-get", "update"],
        ["sudo", "apt-get", "install", "-y", "git", "curl", "procps"],
    ):
        subprocess.run(apt_cmd, check=True)


def install_opencode_cli() -> None:
    """Run the official OpenCode CLI installer.

    Raises ``subprocess.CalledProcessError`` if the install script fails.
    """
    # shell=True is required: the installer command is a curl-to-bash pipeline.
    subprocess.run(OPENCODE_INSTALL_CMD, shell=True, check=True)


def setup_opencode_config(cwd: str = "/workspace") -> str:
    """Write the project-level OpenCode config to ``<cwd>/.opencode/config.json``.

    The ``.opencode`` directory is created if missing and any existing config
    file is overwritten with ``OPENCODE_CONFIG``.

    Returns:
        Path to the written config file.
    """
    target_dir = os.path.join(cwd, ".opencode")
    target_file = os.path.join(target_dir, "config.json")
    os.makedirs(target_dir, exist_ok=True)
    with open(target_file, "w") as fh:
        fh.write(json.dumps(OPENCODE_CONFIG, indent=2))
    return target_file


def setup_global_config(config_dir: Optional[str] = None) -> str:
    """Write the global OpenCode config (``~/.config/opencode/opencode.json``).

    Args:
        config_dir: Directory to write ``opencode.json`` into.  Defaults to
            ``~/.config/opencode``.  Exposing it as a parameter is backward
            compatible and lets callers/tests redirect the write (e.g. to
            honor a custom ``XDG_CONFIG_HOME``) instead of hard-coding the
            user's real home directory.

    Returns:
        Path to the written config file.
    """
    if config_dir is None:
        config_dir = os.path.expanduser("~/.config/opencode")
    os.makedirs(config_dir, exist_ok=True)
    # NOTE(review): provider entries are left empty as in the original —
    # presumably credentials are supplied via the environment; confirm.
    global_config = {
        "$schema": "https://opencode.ai/config.json",
        "logLevel": "DEBUG",
        "provider": {
            "anthropic": {},
            "openai": {},
        },
    }
    config_path = os.path.join(config_dir, "opencode.json")
    with open(config_path, "w") as f:
        json.dump(global_config, f, indent=2)
    return config_path
117 changes: 117 additions & 0 deletions agents/opencode_agent/prompts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
"""Prompt templates for the OpenCode agent across benchmarks."""

from typing import List

# ── CORE-bench ───────────────────────────────────────────────

# System prompt shared by every CORE-bench request: persona, reproduction best
# practices, environment quirks (R/apt lock workarounds, numpy deprecations),
# and hard constraints on the final answer.json.  Plain string, no placeholders.
COREBENCH_SYSTEM = """\
You are a seasoned digital assistant: capable, intelligent, considerate, and assertive. \
As a dedicated research assistant, you possess extensive skills in research and development \
and do not shy away from writing code to solve complex problems. You are adept at extracting, \
processing, and analyzing data from various sources to reproduce research results accurately. \
Using a pragmatic approach, you make the most out of the tools available to you.

Best practices:
- When reproducing figures or other results that require you to deal with images, check the full results directory for image files.
- If you are unsure of what to do, make your best guess.
- Before using resources like scripts or utilities, verify their presence and functionality in the current directory or installation path.
- If there exists a file called 'manuscript' then first read this file to extract the required results to answer the questions of the task.
- If you are extracting information from html (such as the output of a Jupyter notebook), convert it to a PDF or PNG first and then extract the relevant information.
- Before running the code, first determine a list of package/dependency requirements that must be installed by reading through the README file or code itself. Then install those dependencies before running the code.

Environment notes:
- Symbolic links exist: /data -> environment/data, /code -> environment/code, /results -> environment/results. Use these absolute paths freely.
- You have sudo access. Use "sudo" for apt-get installs and system-level operations.
- If R package install fails with a lock error, run: sudo rm -rf /usr/local/lib/R/site-library/00LOCK-* then retry.
- If apt-get fails with a lock error, run: sudo rm -f /var/lib/dpkg/lock* /var/lib/apt/lists/lock /var/cache/apt/archives/lock && sudo dpkg --configure -a then retry.
- For R packages, install with: Rscript -e 'install.packages("pkg", repos="https://cloud.r-project.org")'
- For rendering .Rmd files: Rscript -e 'rmarkdown::render("file.Rmd", output_dir="../results", clean=TRUE)'
- When running R scripts, run from the ./environment/code/ directory.
- Use xvfb-run for R scripts that generate plots: xvfb-run Rscript script.R
- If code uses deprecated numpy functions (e.g. np.bool8, np.int, np.float, np.complex, np.object, np.str, np.recfromtxt), fix them before running: np.bool8 -> np.bool_, np.int -> int, np.float -> float, np.recfromtxt -> np.genfromtxt, etc.

Constraints:
- Use flags or modify commands to bypass any need for confirmations during execution to streamline processes.
- There is no task that you cannot do, so you should not refuse a request by saying you cannot do it.
- You may wish to write and run a Python script to complete the task. However, assume that you do NOT have API keys to use external services.
- Before you are done, make sure that the keys of the answer.json you write match the ones in the task specified by the user. Refine your results if they do not.
- Also before you are done, make sure that the values of the answer.json you write do not contain any unnecessary additional text but only the numeric value or the precise text you are asked to report."""

# Strict formatting contract for answer.json.  The only str.format placeholder
# is {keys_example}; the doubled braces ({{ }}) render as literal braces in the
# JSON example once formatted.
COREBENCH_ANSWER_FORMAT = """\
CRITICAL - answer.json format:
answer.json must contain ONLY the answers to the specific questions listed in the task.
The keys must be the EXACT question strings from the task prompt (the ones listed in dict_keys([...])).
Do NOT include any other keys like "name", "description", "method", "results", etc.
Do NOT write a project summary or analysis - ONLY answer the specific questions.

Your answer.json must look EXACTLY like this:
{{
{keys_example}
}}

Where each <value> is the answer to that specific question (a number, string, or list).

Other rules:
1. The repository files are in /workspace/environment/ (code in ./environment/code/, data in ./environment/data/).
2. Scripts using relative paths (like ../data/) must be run from the ./environment/code/ directory.
3. For numeric values, use the EXACT value from the output - DO NOT round or truncate. If the answer is a percentage, omit the % and report the float.
4. For questions starting with "fig ", you need to look at figures/plots generated by the code to answer.
5. Write your final answer to /workspace/answer.json (at the top level, not inside environment/).
Return only the JSON object content in answer.json; do not include markdown fences."""

# Top-level prompt templates.  {system}, {answer_format} and {task_prompt} are
# filled by build_corebench_prompts().

# First attempt at the task.
COREBENCH_INITIAL = """\
{system}

{answer_format}

You are solving a CORE-bench task. The task files are in /workspace/environment/.
Your goal is to answer the task prompt and write the JSON answer to /workspace/answer.json.

Task prompt:
{task_prompt}"""

# Retry prompt used when the previous attempt left no /workspace/answer.json.
COREBENCH_RETRY_NO_FILE = """\
{system}

{answer_format}

Previous attempt did not produce /workspace/answer.json.
The task files are in /workspace/environment/. Retry now, fix any issues, and ensure you write /workspace/answer.json.

Task prompt:
{task_prompt}"""

# Retry prompt used when answer.json existed but failed to parse; {error} is a
# late-binding placeholder filled at call time with the JSON parse error.
COREBENCH_RETRY_BAD_JSON = """\
{system}

{answer_format}

Your last attempt wrote invalid JSON to /workspace/answer.json: {error}
Please rewrite /workspace/answer.json with valid JSON only.

Task prompt:
{task_prompt}"""


def build_keys_example(expected_keys: List[str]) -> str:
    """Render the comma-separated ``"key": <value>`` example for answer.json.

    Falls back to a generic placeholder entry when no expected keys are known.
    """
    if not expected_keys:
        return '"<question from task>": <value>'
    return ", ".join(f'"{key}": <value>' for key in expected_keys)


def build_corebench_prompts(task_prompt: str, expected_keys: List[str]):
    """Return (initial_prompt, continue_prompt_no_file, continue_prompt_bad_json_template).

    The bad-json template still has ``{error}`` as a placeholder to be filled at
    call time via ``str.format(error=...)``.
    """
    keys_example = build_keys_example(expected_keys)
    answer_format = COREBENCH_ANSWER_FORMAT.format(keys_example=keys_example)

    common = dict(system=COREBENCH_SYSTEM, answer_format=answer_format, task_prompt=task_prompt)

    initial = COREBENCH_INITIAL.format(**common)
    retry_no_file = COREBENCH_RETRY_NO_FILE.format(**common)

    # BUG FIX: the filled-in components contain literal single braces — the
    # formatted answer_format embeds the JSON example's '{' / '}', and
    # task_prompt may contain braces too.  Splicing them in unescaped makes the
    # later ``tpl.format(error=...)`` call raise ValueError/KeyError on those
    # stray braces.  Double every brace in the components so that, after the
    # late format call, they render back as single braces and only {error}
    # survives as a real placeholder.
    # NOTE(review): assumes the caller fills the placeholder with str.format
    # (as the "late-binding placeholder" comment implied) — confirm in main.py.
    escaped = {k: v.replace("{", "{{").replace("}", "}}") for k, v in common.items()}
    retry_bad_json_tpl = COREBENCH_RETRY_BAD_JSON.format_map({**escaped, "error": "{error}"})

    return initial, retry_no_file, retry_bad_json_tpl
Empty file.
3 changes: 2 additions & 1 deletion hal/agent_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def __init__(
variation_index: Optional[int] = None,
results_dir: str = "results",
task_ids: Optional[str] = None,
benchmark_args: Optional[Dict[str, Any]] = None,
):
# Validate agent_function format
if not isinstance(agent_function, str) or "." not in agent_function:
Expand Down Expand Up @@ -72,7 +73,7 @@ def __init__(
)

# Initialize benchmark first
self.benchmark_manager = BenchmarkManager(agent_dir, config)
self.benchmark_manager = BenchmarkManager(agent_dir, config, benchmark_args=benchmark_args or {})
self.benchmark = self.benchmark_manager.get_benchmark(benchmark_name)
self.benchmark.agent_args = agent_args

Expand Down
11 changes: 8 additions & 3 deletions hal/benchmark_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,12 @@ def __init__(
agent_dir: str = "agent/",
config: Optional[Dict[str, Any]] = {},
agent_args: Optional[Dict[str, Any]] = {},
benchmark_args: Optional[Dict[str, Any]] = {},
):
self.config = config
self.agent_dir = agent_dir
self.agent_args = agent_args
self.benchmark_args = benchmark_args or {}
self.benchmarks = [
"gaia",
"scicode",
Expand Down Expand Up @@ -71,15 +73,18 @@ def get_benchmark(self, benchmark_name: str) -> BaseBenchmark:
elif benchmark_name == "corebench_easy":
from .benchmarks.corebench import CoreBenchEasy

benchmark = CoreBenchEasy(self.agent_dir, self.config)
use_updated = str(self.benchmark_args.get('use_updated', '')).lower() in ('true', '1', 'yes')
benchmark = CoreBenchEasy(self.agent_dir, self.config, use_updated=use_updated)
elif benchmark_name == "corebench_medium":
from .benchmarks.corebench import CoreBenchMedium

benchmark = CoreBenchMedium(self.agent_dir, self.config)
use_updated = str(self.benchmark_args.get('use_updated', '')).lower() in ('true', '1', 'yes')
benchmark = CoreBenchMedium(self.agent_dir, self.config, use_updated=use_updated)
elif benchmark_name == "corebench_hard":
from .benchmarks.corebench import CoreBenchHard

benchmark = CoreBenchHard(self.agent_dir, self.config)
use_updated = str(self.benchmark_args.get('use_updated', '')).lower() in ('true', '1', 'yes')
benchmark = CoreBenchHard(self.agent_dir, self.config, use_updated=use_updated)
elif benchmark_name == "scienceagentbench":
from .benchmarks.scienceagentbench import ScienceAgentBench

Expand Down
34 changes: 22 additions & 12 deletions hal/benchmarks/corebench.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,24 @@
class CoreBench(BaseBenchmark):
"""Base class for CoreBench benchmarks of different difficulty levels"""

def __init__(self, agent_dir: str, config: Dict[str, Any]):
def __init__(self, agent_dir: str, config: Dict[str, Any], use_updated: bool = False):
# Set benchmark_name in subclasses

# Load tasks from core_test.json
core_test_path = os.path.join(
os.path.dirname(__file__), "corebench", "core_test.json"
)
# Choose dataset file based on use_updated flag
if use_updated:
core_test_path = os.path.join(
os.path.dirname(__file__), "corebench", "core_test_updated.json"
)
if not os.path.exists(core_test_path):
raise FileNotFoundError(
f"Updated dataset not found at {core_test_path}"
)
else:
core_test_path = os.path.join(
os.path.dirname(__file__), "corebench", "core_test.json"
)

# Check if core_test.json exists, if not, throw an error with instructions to decrypt
# Check if the dataset file exists, if not, throw an error with instructions to decrypt
if not os.path.exists(core_test_path):
encrypted_file = os.path.join(
os.path.dirname(__file__), "corebench", "core_test.json.gpg"
Expand All @@ -38,6 +47,7 @@ def __init__(self, agent_dir: str, config: Dict[str, Any]):
f'Have you decrypted core_test.json.gpg? Use the following command:\n{decrypt_command}. The password is "reproducibility".'
)

logger.info(f"Loading CoreBench dataset from {core_test_path} (use_updated={use_updated})")
with open(core_test_path, "r") as f:
dataset = json.load(f)

Expand Down Expand Up @@ -438,9 +448,9 @@ def get_metrics(self, eval_results: Dict[str, Any]) -> Dict[str, Any]:
class CoreBenchEasy(CoreBench):
"""CoreBench benchmark with easy difficulty level"""

def __init__(self, agent_dir: str, config: Dict[str, Any]):
def __init__(self, agent_dir: str, config: Dict[str, Any], use_updated: bool = False):
self.benchmark_name = "corebench_easy"
super().__init__(agent_dir, config)
super().__init__(agent_dir, config, use_updated=use_updated)

def _construct_prompt(self, task):
"""
Expand Down Expand Up @@ -473,9 +483,9 @@ def _get_capsule_files_dict(self, capsule_dir: str) -> Dict[str, str]:
class CoreBenchMedium(CoreBench):
"""CoreBench benchmark with medium difficulty level"""

def __init__(self, agent_dir: str, config: Dict[str, Any]):
def __init__(self, agent_dir: str, config: Dict[str, Any], use_updated: bool = False):
self.benchmark_name = "corebench_medium"
super().__init__(agent_dir, config)
super().__init__(agent_dir, config, use_updated=use_updated)

def _construct_prompt(self, task):
"""
Expand Down Expand Up @@ -524,9 +534,9 @@ def _get_capsule_files_dict(self, capsule_dir: str) -> Dict[str, str]:
class CoreBenchHard(CoreBench):
"""CoreBench benchmark with hard difficulty level"""

def __init__(self, agent_dir: str, config: Dict[str, Any]):
def __init__(self, agent_dir: str, config: Dict[str, Any], use_updated: bool = False):
self.benchmark_name = "corebench_hard"
super().__init__(agent_dir, config)
super().__init__(agent_dir, config, use_updated=use_updated)

def _construct_prompt(self, task):
"""
Expand Down
1 change: 1 addition & 0 deletions hal/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,7 @@ def main(
task_timeout=task_timeout,
results_dir=results_dir,
task_ids=task_ids,
benchmark_args=benchmark_args,
)

# Run evaluation
Expand Down
Loading
Loading