Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,345 changes: 1,345 additions & 0 deletions agents/opencode_agent/main.py

Large diffs are not rendered by default.

114 changes: 114 additions & 0 deletions agents/opencode_agent/opencode_setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
"""OpenCode agent setup – constants and utilities for CLI installation and configuration."""

import json
import os
import shutil
import subprocess
from typing import Optional


# ── Constants ────────────────────────────────────────────────

# Official one-line installer for the OpenCode CLI (piped straight into bash).
OPENCODE_INSTALL_CMD = "curl -fsSL https://opencode.ai/install | bash"

# Fallback paths probed when `opencode` is not on PATH; `~` entries are
# expanded at lookup time by find_opencode_binary().
KNOWN_BINARY_LOCATIONS = [
    "~/.local/bin/opencode",
    "~/.opencode/bin/opencode",
    "~/.local/share/opencode/bin/opencode",
    "/usr/local/bin/opencode",
    "/usr/bin/opencode",
    "/opt/homebrew/bin/opencode",
]

# Project-level config written by setup_opencode_config().  Every permission is
# set to "allow" so the agent never stalls waiting on an interactive approval.
OPENCODE_CONFIG = {
    "$schema": "https://opencode.ai/config.json",
    "logLevel": "DEBUG",
    "permission": {
        "*": "allow",
        "external_directory": "allow",
        "doom_loop": "allow",
        "edit": "allow",
        "bash": "allow",
        "webfetch": "allow",
        "websearch": "allow",
        "read": "allow",
        "glob": "allow",
        "grep": "allow",
        "list": "allow",
        "task": "allow",
        "skill": "allow",
    },
}


# ── Utility Functions ────────────────────────────────────────


def find_opencode_binary() -> Optional[str]:
    """Locate the ``opencode`` executable.

    Lookup order: the current PATH, then the well-known install locations in
    ``KNOWN_BINARY_LOCATIONS``, then a last-resort sweep of conventional bin
    directories under the home directory and system prefixes.

    Returns:
        Path to an executable ``opencode`` binary, or ``None`` if none found.
    """
    on_path = shutil.which("opencode")
    if on_path:
        return on_path

    def _usable(path: str) -> bool:
        # Only a regular file we are allowed to execute counts as a hit.
        return os.path.isfile(path) and os.access(path, os.X_OK)

    for candidate in (os.path.expanduser(loc) for loc in KNOWN_BINARY_LOCATIONS):
        if _usable(candidate):
            return candidate

    for prefix in (os.path.expanduser("~"), "/usr/local", "/usr", "/opt"):
        for subdir in ("bin", "local/bin", "local/share/opencode/bin"):
            candidate = os.path.join(prefix, subdir, "opencode")
            if _usable(candidate):
                return candidate

    return None


def install_system_deps() -> None:
    """Install system dependencies (git, curl, procps) via apt-get.

    First removes stale CUDA/NVIDIA apt source lists best-effort (the trailing
    ``|| true`` means a missing file is not an error), then runs
    ``apt-get update`` and the install, both with ``check=True`` so a real
    apt failure raises ``subprocess.CalledProcessError``.
    """
    # Deliberately unchecked: the source lists may not exist on this image.
    subprocess.run(
        "sudo rm -f /etc/apt/sources.list.d/cuda.list /etc/apt/sources.list.d/nvidia-ml.list 2>/dev/null || true",
        shell=True,
    )
    for apt_cmd in (
        ["sudo", "apt-get", "update"],
        ["sudo", "apt-get", "install", "-y", "git", "curl", "procps"],
    ):
        subprocess.run(apt_cmd, check=True)


def install_opencode_cli() -> None:
    """Run the official OpenCode CLI installer.

    Raises ``subprocess.CalledProcessError`` if the install script fails.
    """
    # shell=True is required: the installer command is a curl-to-bash pipeline.
    subprocess.run(OPENCODE_INSTALL_CMD, shell=True, check=True)


def setup_opencode_config(cwd: str = "/workspace") -> str:
    """Write the project-level OpenCode config to ``<cwd>/.opencode/config.json``.

    The ``.opencode`` directory is created if missing and any existing config
    file is overwritten with ``OPENCODE_CONFIG``.

    Returns:
        Path to the written config file.
    """
    target_dir = os.path.join(cwd, ".opencode")
    target_file = os.path.join(target_dir, "config.json")
    os.makedirs(target_dir, exist_ok=True)
    with open(target_file, "w") as fh:
        fh.write(json.dumps(OPENCODE_CONFIG, indent=2))
    return target_file


def setup_global_config(config_dir: Optional[str] = None) -> str:
    """Write the global OpenCode config (``~/.config/opencode/opencode.json``).

    Args:
        config_dir: Directory to write ``opencode.json`` into.  Defaults to
            ``~/.config/opencode``.  Exposing it as a parameter is backward
            compatible and lets callers/tests redirect the write (e.g. to
            honor a custom ``XDG_CONFIG_HOME``) instead of hard-coding the
            user's real home directory.

    Returns:
        Path to the written config file.
    """
    if config_dir is None:
        config_dir = os.path.expanduser("~/.config/opencode")
    os.makedirs(config_dir, exist_ok=True)
    # NOTE(review): provider entries are left empty as in the original —
    # presumably credentials are supplied via the environment; confirm.
    global_config = {
        "$schema": "https://opencode.ai/config.json",
        "logLevel": "DEBUG",
        "provider": {
            "anthropic": {},
            "openai": {},
        },
    }
    config_path = os.path.join(config_dir, "opencode.json")
    with open(config_path, "w") as f:
        json.dump(global_config, f, indent=2)
    return config_path
117 changes: 117 additions & 0 deletions agents/opencode_agent/prompts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
"""Prompt templates for the OpenCode agent across benchmarks."""

from typing import List

# ── CORE-bench ───────────────────────────────────────────────

# System prompt shared by every CORE-bench request: persona, reproduction best
# practices, environment quirks (R/apt lock workarounds, numpy deprecations),
# and hard constraints on the final answer.json.  Plain string, no placeholders.
COREBENCH_SYSTEM = """\
You are a seasoned digital assistant: capable, intelligent, considerate, and assertive. \
As a dedicated research assistant, you possess extensive skills in research and development \
and do not shy away from writing code to solve complex problems. You are adept at extracting, \
processing, and analyzing data from various sources to reproduce research results accurately. \
Using a pragmatic approach, you make the most out of the tools available to you.

Best practices:
- When reproducing figures or other results that require you to deal with images, check the full results directory for image files.
- If you are unsure of what to do, make your best guess.
- Before using resources like scripts or utilities, verify their presence and functionality in the current directory or installation path.
- If there exists a file called 'manuscript' then first read this file to extract the required results to answer the questions of the task.
- If you are extracting information from html (such as the output of a Jupyter notebook), convert it to a PDF or PNG first and then extract the relevant information.
- Before running the code, first determine a list of package/dependency requirements that must be installed by reading through the README file or code itself. Then install those dependencies before running the code.

Environment notes:
- Symbolic links exist: /data -> environment/data, /code -> environment/code, /results -> environment/results. Use these absolute paths freely.
- You have sudo access. Use "sudo" for apt-get installs and system-level operations.
- If R package install fails with a lock error, run: sudo rm -rf /usr/local/lib/R/site-library/00LOCK-* then retry.
- If apt-get fails with a lock error, run: sudo rm -f /var/lib/dpkg/lock* /var/lib/apt/lists/lock /var/cache/apt/archives/lock && sudo dpkg --configure -a then retry.
- For R packages, install with: Rscript -e 'install.packages("pkg", repos="https://cloud.r-project.org")'
- For rendering .Rmd files: Rscript -e 'rmarkdown::render("file.Rmd", output_dir="../results", clean=TRUE)'
- When running R scripts, run from the ./environment/code/ directory.
- Use xvfb-run for R scripts that generate plots: xvfb-run Rscript script.R
- If code uses deprecated numpy functions (e.g. np.bool8, np.int, np.float, np.complex, np.object, np.str, np.recfromtxt), fix them before running: np.bool8 -> np.bool_, np.int -> int, np.float -> float, np.recfromtxt -> np.genfromtxt, etc.

Constraints:
- Use flags or modify commands to bypass any need for confirmations during execution to streamline processes.
- There is no task that you cannot do, so you should not refuse a request by saying you cannot do it.
- You may wish to write and run a Python script to complete the task. However, assume that you do NOT have API keys to use external services.
- Before you are done, make sure that the keys of the answer.json you write match the ones in the task specified by the user. Refine your results if they do not.
- Also before you are done, make sure that the values of the answer.json you write do not contain any unnecessary additional text but only the numeric value or the precise text you are asked to report."""

# Strict formatting contract for answer.json.  The only str.format placeholder
# is {keys_example}; the doubled braces ({{ }}) render as literal braces in the
# JSON example once formatted.
COREBENCH_ANSWER_FORMAT = """\
CRITICAL - answer.json format:
answer.json must contain ONLY the answers to the specific questions listed in the task.
The keys must be the EXACT question strings from the task prompt (the ones listed in dict_keys([...])).
Do NOT include any other keys like "name", "description", "method", "results", etc.
Do NOT write a project summary or analysis - ONLY answer the specific questions.

Your answer.json must look EXACTLY like this:
{{
{keys_example}
}}

Where each <value> is the answer to that specific question (a number, string, or list).

Other rules:
1. The repository files are in /workspace/environment/ (code in ./environment/code/, data in ./environment/data/).
2. Scripts using relative paths (like ../data/) must be run from the ./environment/code/ directory.
3. For numeric values, use the EXACT value from the output - DO NOT round or truncate. If the answer is a percentage, omit the % and report the float.
4. For questions starting with "fig ", you need to look at figures/plots generated by the code to answer.
5. Write your final answer to /workspace/answer.json (at the top level, not inside environment/).
Return only the JSON object content in answer.json; do not include markdown fences."""

# Top-level prompt templates.  {system}, {answer_format} and {task_prompt} are
# filled by build_corebench_prompts().

# First attempt at the task.
COREBENCH_INITIAL = """\
{system}

{answer_format}

You are solving a CORE-bench task. The task files are in /workspace/environment/.
Your goal is to answer the task prompt and write the JSON answer to /workspace/answer.json.

Task prompt:
{task_prompt}"""

# Retry prompt used when the previous attempt left no /workspace/answer.json.
COREBENCH_RETRY_NO_FILE = """\
{system}

{answer_format}

Previous attempt did not produce /workspace/answer.json.
The task files are in /workspace/environment/. Retry now, fix any issues, and ensure you write /workspace/answer.json.

Task prompt:
{task_prompt}"""

# Retry prompt used when answer.json existed but failed to parse; {error} is a
# late-binding placeholder filled at call time with the JSON parse error.
COREBENCH_RETRY_BAD_JSON = """\
{system}

{answer_format}

Your last attempt wrote invalid JSON to /workspace/answer.json: {error}
Please rewrite /workspace/answer.json with valid JSON only.

Task prompt:
{task_prompt}"""


def build_keys_example(expected_keys: List[str]) -> str:
    """Render the comma-separated ``"key": <value>`` example for answer.json.

    Falls back to a generic placeholder entry when no expected keys are known.
    """
    if not expected_keys:
        return '"<question from task>": <value>'
    return ", ".join(f'"{key}": <value>' for key in expected_keys)


def build_corebench_prompts(task_prompt: str, expected_keys: List[str]):
    """Return (initial_prompt, continue_prompt_no_file, continue_prompt_bad_json_template).

    The bad-json template still has ``{error}`` as a placeholder to be filled at
    call time via ``str.format(error=...)``.
    """
    keys_example = build_keys_example(expected_keys)
    answer_format = COREBENCH_ANSWER_FORMAT.format(keys_example=keys_example)

    common = dict(system=COREBENCH_SYSTEM, answer_format=answer_format, task_prompt=task_prompt)

    initial = COREBENCH_INITIAL.format(**common)
    retry_no_file = COREBENCH_RETRY_NO_FILE.format(**common)

    # BUG FIX: the filled-in components contain literal single braces — the
    # formatted answer_format embeds the JSON example's '{' / '}', and
    # task_prompt may contain braces too.  Splicing them in unescaped makes the
    # later ``tpl.format(error=...)`` call raise ValueError/KeyError on those
    # stray braces.  Double every brace in the components so that, after the
    # late format call, they render back as single braces and only {error}
    # survives as a real placeholder.
    # NOTE(review): assumes the caller fills the placeholder with str.format
    # (as the "late-binding placeholder" comment implied) — confirm in main.py.
    escaped = {k: v.replace("{", "{{").replace("}", "}}") for k, v in common.items()}
    retry_bad_json_tpl = COREBENCH_RETRY_BAD_JSON.format_map({**escaped, "error": "{error}"})

    return initial, retry_no_file, retry_bad_json_tpl
Empty file.
3 changes: 2 additions & 1 deletion hal/agent_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def __init__(
variation_index: Optional[int] = None,
results_dir: str = "results",
task_ids: Optional[str] = None,
benchmark_args: Optional[Dict[str, Any]] = None,
):
# Validate agent_function format
if not isinstance(agent_function, str) or "." not in agent_function:
Expand Down Expand Up @@ -72,7 +73,7 @@ def __init__(
)

# Initialize benchmark first
self.benchmark_manager = BenchmarkManager(agent_dir, config)
self.benchmark_manager = BenchmarkManager(agent_dir, config, benchmark_args=benchmark_args or {})
self.benchmark = self.benchmark_manager.get_benchmark(benchmark_name)
self.benchmark.agent_args = agent_args

Expand Down
11 changes: 8 additions & 3 deletions hal/benchmark_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,12 @@ def __init__(
agent_dir: str = "agent/",
config: Optional[Dict[str, Any]] = {},
agent_args: Optional[Dict[str, Any]] = {},
benchmark_args: Optional[Dict[str, Any]] = {},
):
self.config = config
self.agent_dir = agent_dir
self.agent_args = agent_args
self.benchmark_args = benchmark_args or {}
self.benchmarks = [
"gaia",
"scicode",
Expand Down Expand Up @@ -71,15 +73,18 @@ def get_benchmark(self, benchmark_name: str) -> BaseBenchmark:
elif benchmark_name == "corebench_easy":
from .benchmarks.corebench import CoreBenchEasy

benchmark = CoreBenchEasy(self.agent_dir, self.config)
use_updated = str(self.benchmark_args.get('use_updated', '')).lower() in ('true', '1', 'yes')
benchmark = CoreBenchEasy(self.agent_dir, self.config, use_updated=use_updated)
elif benchmark_name == "corebench_medium":
from .benchmarks.corebench import CoreBenchMedium

benchmark = CoreBenchMedium(self.agent_dir, self.config)
use_updated = str(self.benchmark_args.get('use_updated', '')).lower() in ('true', '1', 'yes')
benchmark = CoreBenchMedium(self.agent_dir, self.config, use_updated=use_updated)
elif benchmark_name == "corebench_hard":
from .benchmarks.corebench import CoreBenchHard

benchmark = CoreBenchHard(self.agent_dir, self.config)
use_updated = str(self.benchmark_args.get('use_updated', '')).lower() in ('true', '1', 'yes')
benchmark = CoreBenchHard(self.agent_dir, self.config, use_updated=use_updated)
elif benchmark_name == "scienceagentbench":
from .benchmarks.scienceagentbench import ScienceAgentBench

Expand Down
34 changes: 22 additions & 12 deletions hal/benchmarks/corebench.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,24 @@
class CoreBench(BaseBenchmark):
"""Base class for CoreBench benchmarks of different difficulty levels"""

def __init__(self, agent_dir: str, config: Dict[str, Any]):
def __init__(self, agent_dir: str, config: Dict[str, Any], use_updated: bool = False):
# Set benchmark_name in subclasses

# Load tasks from core_test.json
core_test_path = os.path.join(
os.path.dirname(__file__), "corebench", "core_test.json"
)
# Choose dataset file based on use_updated flag
if use_updated:
core_test_path = os.path.join(
os.path.dirname(__file__), "corebench", "core_test_updated.json"
)
if not os.path.exists(core_test_path):
raise FileNotFoundError(
f"Updated dataset not found at {core_test_path}"
)
else:
core_test_path = os.path.join(
os.path.dirname(__file__), "corebench", "core_test.json"
)

# Check if core_test.json exists, if not, throw an error with instructions to decrypt
# Check if the dataset file exists, if not, throw an error with instructions to decrypt
if not os.path.exists(core_test_path):
encrypted_file = os.path.join(
os.path.dirname(__file__), "corebench", "core_test.json.gpg"
Expand All @@ -38,6 +47,7 @@ def __init__(self, agent_dir: str, config: Dict[str, Any]):
f'Have you decrypted core_test.json.gpg? Use the following command:\n{decrypt_command}. The password is "reproducibility".'
)

logger.info(f"Loading CoreBench dataset from {core_test_path} (use_updated={use_updated})")
with open(core_test_path, "r") as f:
dataset = json.load(f)

Expand Down Expand Up @@ -438,9 +448,9 @@ def get_metrics(self, eval_results: Dict[str, Any]) -> Dict[str, Any]:
class CoreBenchEasy(CoreBench):
"""CoreBench benchmark with easy difficulty level"""

def __init__(self, agent_dir: str, config: Dict[str, Any]):
def __init__(self, agent_dir: str, config: Dict[str, Any], use_updated: bool = False):
self.benchmark_name = "corebench_easy"
super().__init__(agent_dir, config)
super().__init__(agent_dir, config, use_updated=use_updated)

def _construct_prompt(self, task):
"""
Expand Down Expand Up @@ -473,9 +483,9 @@ def _get_capsule_files_dict(self, capsule_dir: str) -> Dict[str, str]:
class CoreBenchMedium(CoreBench):
"""CoreBench benchmark with medium difficulty level"""

def __init__(self, agent_dir: str, config: Dict[str, Any]):
def __init__(self, agent_dir: str, config: Dict[str, Any], use_updated: bool = False):
self.benchmark_name = "corebench_medium"
super().__init__(agent_dir, config)
super().__init__(agent_dir, config, use_updated=use_updated)

def _construct_prompt(self, task):
"""
Expand Down Expand Up @@ -524,9 +534,9 @@ def _get_capsule_files_dict(self, capsule_dir: str) -> Dict[str, str]:
class CoreBenchHard(CoreBench):
"""CoreBench benchmark with hard difficulty level"""

def __init__(self, agent_dir: str, config: Dict[str, Any]):
def __init__(self, agent_dir: str, config: Dict[str, Any], use_updated: bool = False):
self.benchmark_name = "corebench_hard"
super().__init__(agent_dir, config)
super().__init__(agent_dir, config, use_updated=use_updated)

def _construct_prompt(self, task):
"""
Expand Down
1 change: 1 addition & 0 deletions hal/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,7 @@ def main(
task_timeout=task_timeout,
results_dir=results_dir,
task_ids=task_ids,
benchmark_args=benchmark_args,
)

# Run evaluation
Expand Down
Loading
Loading