diff --git a/agents/opencode_agent/main.py b/agents/opencode_agent/main.py new file mode 100644 index 0000000..479a72c --- /dev/null +++ b/agents/opencode_agent/main.py @@ -0,0 +1,1345 @@ +import json +import os +import re +import shutil +import subprocess +import sys +from typing import Any, Dict, List, Optional + +from opencode_setup import ( + find_opencode_binary, + install_opencode_cli, + install_system_deps, + setup_global_config, + setup_opencode_config, +) +from prompts import build_corebench_prompts + + +# ============================================================ +# OpenCode CLI Management +# ============================================================ + + +def ensure_opencode_cli() -> str: + """Install the OpenCode CLI if missing and return the binary path.""" + existing = find_opencode_binary() + if existing: + return existing + + install_system_deps() + install_opencode_cli() + + opencode_path = find_opencode_binary() + if not opencode_path: + raise RuntimeError( + "OpenCode CLI install completed but `opencode` not found. " + "Check installer output and PATH." 
+ ) + return opencode_path + + +def _setup_opencode_config(cwd="/workspace"): + """Write OpenCode project and global configs.""" + config_file = setup_opencode_config(cwd) + print(f"OpenCode project config written to {config_file}") + global_config_file = setup_global_config() + print(f"OpenCode global config written to {global_config_file}") + + +def _save_opencode_logs(): + """Zip ~/.local/share/opencode/ (excluding bin/) and copy opencode_exec.log to ~/.""" + import tarfile + src = os.path.expanduser("~/.local/share/opencode") + if os.path.isdir(src): + dest = os.path.expanduser("~/opencode_logs.tar.gz") + try: + with tarfile.open(dest, "w:gz") as tar: + tar.add(src, arcname="opencode", filter=lambda t: None if "/bin/" in t.name else t) + except Exception as e: + print(f"Warning: failed to zip OpenCode logs: {e}", flush=True) + + exec_log = "/workspace/opencode_exec.log" + if os.path.isfile(exec_log): + shutil.copy2(exec_log, os.path.expanduser("~/opencode_exec.log")) + + +def _format_opencode_line(line): + """Parse a JSON line from OpenCode CLI and return a human-readable summary. + + Produces detailed, multi-line output so the log is self-contained and easy + to follow without having to cross-reference the raw JSON. 
+ """ + try: + data = json.loads(line.strip()) + except (json.JSONDecodeError, ValueError): + return line.rstrip() + + msg_type = data.get("type", "") + part = data.get("part", {}) + + if msg_type == "step_start": + return "--- step start ---" + + if msg_type == "step_finish": + reason = part.get("reason", "") + cost = part.get("cost", 0) + tokens = part.get("tokens", {}) + total_tok = tokens.get("total", 0) + cache = tokens.get("cache", {}) + cache_read = cache.get("read", 0) + cache_write = cache.get("write", 0) + return ( + f"--- step finish | reason={reason} | cost=${cost:.4f} " + f"| tokens={total_tok} (cache r={cache_read} w={cache_write}) ---" + ) + + if msg_type == "text": + text = part.get("text", "").strip() + if text: + return f"[text] {text}" + return None + + if msg_type == "tool_use": + tool = part.get("tool", "?") + state = part.get("state", {}) + status = state.get("status", "?") + inp = state.get("input", {}) + output = state.get("output", "") + + lines_out = [f"[tool:{tool}] status={status}"] + + if tool == "bash": + cmd = inp.get("command", "") + desc = inp.get("description", "") + if desc: + lines_out.append(f" desc : {desc}") + lines_out.append(f" cmd : {cmd}") + meta = state.get("metadata", {}) + exit_code = meta.get("exit", "") + if exit_code != "": + lines_out.append(f" exit : {exit_code}") + out_text = meta.get("output", output) or "" + if out_text: + if len(out_text) > 800: + out_text = out_text[:400] + "\n ... (truncated) ...\n" + out_text[-400:] + lines_out.append(f" output :\n{_indent(out_text, 4)}") + + elif tool == "read": + fp = inp.get("filePath", "") + lines_out.append(f" path : {fp}") + if output and isinstance(output, str): + preview = output + if len(preview) > 600: + preview = preview[:300] + "\n ... 
(truncated) ...\n" + preview[-300:] + lines_out.append(f" content:\n{_indent(preview, 4)}") + + elif tool in ("glob", "grep"): + pattern = inp.get("pattern", inp.get("query", "")) + path = inp.get("path", "") + lines_out.append(f" pattern: {pattern} path: {path}") + if output and isinstance(output, str): + preview = output + if len(preview) > 600: + preview = preview[:300] + "\n ... (truncated) ...\n" + preview[-300:] + lines_out.append(f" result :\n{_indent(preview, 4)}") + + elif tool == "edit": + fp = inp.get("filePath", "") + lines_out.append(f" path : {fp}") + + elif tool == "todowrite": + todos = inp.get("todos", []) + for t in todos: + lines_out.append(f" [{t.get('status', '?')}] {t.get('content', '')}") + + else: + if inp: + inp_str = json.dumps(inp, ensure_ascii=False) + if len(inp_str) > 300: + inp_str = inp_str[:300] + "..." + lines_out.append(f" input : {inp_str}") + if output and isinstance(output, str) and len(output) < 400: + lines_out.append(f" output : {output}") + + if status == "error": + err = state.get("error", "") + if err: + lines_out.append(f" ERROR : {err}") + + return "\n".join(lines_out) + + return None + + +def _indent(text, spaces): + """Indent every line of text by *spaces* spaces.""" + prefix = " " * spaces + return "\n".join(prefix + l for l in text.splitlines()) + + +def _run_opencode_cli(opencode_bin, model_name, prompt, cwd="/workspace", timeout=3600): + """Run OpenCode CLI once and return the log file path.""" + env = os.environ.copy() + env.setdefault("OPENAI_API_KEY", os.environ.get("OPENAI_API_KEY", "")) + env.setdefault("ANTHROPIC_API_KEY", os.environ.get("ANTHROPIC_API_KEY", "")) + env["OPENCODE_EXPERIMENTAL_BASH_DEFAULT_TIMEOUT_MS"] = "1800000" + + log_path = os.path.join(cwd, "opencode_exec.log") + cmd = [opencode_bin, "run", "--model", model_name, "--format", "json", prompt] + + print(f"Running opencode CLI: {opencode_bin} run --model {model_name} ...", flush=True) + print(f"--- PROMPT START ---\n{prompt}\n--- PROMPT END 
---", flush=True) + log_f = open(log_path, "w") + step_num = 0 + try: + proc = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + env=env, + text=True, + cwd=cwd, + ) + for line in proc.stdout: + log_f.write(line) + log_f.flush() + formatted = _format_opencode_line(line) + if formatted is not None: + if formatted.startswith("--- step start"): + step_num += 1 + formatted = f"--- step {step_num} start ---" + sys.stdout.write(formatted + "\n") + sys.stdout.flush() + proc.wait(timeout=timeout) + except subprocess.TimeoutExpired: + proc.kill() + print("OpenCode CLI timed out", flush=True) + finally: + log_f.close() + print(f"OpenCode CLI exit code: {proc.returncode}", flush=True) + return log_path + + +def _read_file(path, default=None): + """Read file contents or return default.""" + try: + if os.path.exists(path): + with open(path, "r") as f: + return f.read() + except Exception: + pass + return default + + +def _remove_file(path): + """Remove a file if it exists.""" + try: + if os.path.exists(path): + os.remove(path) + except Exception: + pass + + +# ============================================================ +# CORE-bench Handler +# ============================================================ + + +def _copy_environment_to_workspace(): + """Copy ~/environment into /workspace/environment so all files are under the project root. + + Using a physical copy (not symlink) avoids OpenCode's external_directory + permission rejections that occur when symlinks resolve outside /workspace/. 
+ """ + workspace_env = "/workspace/environment" + if os.path.exists(workspace_env): + print(f"Environment already at {workspace_env}", flush=True) + else: + candidates = [ + os.path.expanduser("~/environment"), + "/home/agent/environment", + ] + for src in candidates: + if os.path.isdir(src): + try: + shutil.copytree(src, workspace_env, symlinks=True) + print(f"Copied {src} -> {workspace_env}", flush=True) + except Exception as e: + print(f"copytree failed ({e}), trying cp -a", flush=True) + subprocess.run( + ["cp", "-a", src, workspace_env], + check=True, + ) + print(f"cp -a {src} -> {workspace_env}", flush=True) + break + else: + print("WARNING: environment directory not found at any candidate path", flush=True) + + # Create top-level symlinks so scripts using absolute paths like /data, /code, + # /results work without external_directory permission issues. + for name in ("data", "code", "results"): + link_path = f"/{name}" + target = f"{workspace_env}/{name}" + # Ensure the target directory exists even if the capsule didn't ship one + os.makedirs(target, exist_ok=True) + if not os.path.exists(link_path): + try: + os.symlink(target, link_path) + print(f"Symlinked {link_path} -> {target}", flush=True) + except OSError: + subprocess.run(["sudo", "ln", "-sfn", target, link_path], check=False) + print(f"Symlinked (sudo) {link_path} -> {target}", flush=True) + + +def _run_corebench(task_id, task_data, model_name, kwargs): + """Handle CORE-bench tasks (easy/medium/hard).""" + benchmark_name = kwargs.get("benchmark_name", "corebench_hard") + + print("=" * 60, flush=True) + print(f"CORE-BENCH TASK", flush=True) + print(f" task_id : {task_id}", flush=True) + print(f" benchmark : {benchmark_name}", flush=True) + print(f" model : {model_name}", flush=True) + print("=" * 60, flush=True) + + prompt = task_data.get("prompt", "") + print(f"\nTASK PROMPT:\n{prompt}\n", flush=True) + + # Extract the expected answer keys from the prompt's dict_keys([...]) pattern + expected_keys = 
[] + dk_match = re.search(r"dict_keys\(\[(.*?)\]\)", prompt, re.DOTALL) + if dk_match: + raw_keys = dk_match.group(1) + expected_keys = [k.strip().strip("'\"") for k in re.findall(r"'[^']*'|\"[^\"]*\"", raw_keys)] + print(f" Expected answer keys: {expected_keys}", flush=True) + + print("-- Setup --", flush=True) + opencode_bin = ensure_opencode_cli() + _setup_opencode_config() + config_path = "/workspace/.opencode/config.json" + if os.path.exists(config_path): + with open(config_path, "r") as f: + print(f" OpenCode config ({config_path}):\n{f.read()}", flush=True) + + # Pre-clean lock files that block R and apt package installs + subprocess.run("sudo rm -rf /usr/local/lib/R/site-library/00LOCK-* 2>/dev/null", shell=True, check=False) + subprocess.run("sudo rm -f /var/lib/apt/lists/lock /var/lib/dpkg/lock* /var/cache/apt/archives/lock 2>/dev/null", shell=True, check=False) + + _copy_environment_to_workspace() + + env_dir = "/workspace/environment" + dirs_present = os.listdir(env_dir) if os.path.isdir(env_dir) else [] + print(f" /workspace/environment contents: {dirs_present}", flush=True) + print("-- Setup complete --\n", flush=True) + + initial_prompt, retry_no_file, retry_bad_json_tpl = build_corebench_prompts(prompt, expected_keys) + + answer_file = "/workspace/answer.json" + max_attempts = 3 + current_prompt = initial_prompt + + try: + for attempt in range(max_attempts): + print(f"\n{'=' * 40}", flush=True) + print(f"ATTEMPT {attempt + 1} of {max_attempts}", flush=True) + print(f"{'=' * 40}\n", flush=True) + + _run_opencode_cli(opencode_bin, model_name, current_prompt) + + if os.path.exists(answer_file): + try: + with open(answer_file, "r") as f: + answer = json.load(f) + print(f"\nanswer.json loaded successfully: {json.dumps(answer, indent=2)}", flush=True) + return {task_id: answer} + except json.JSONDecodeError as e: + print(f"\nanswer.json has invalid JSON: {e}", flush=True) + os.remove(answer_file) + current_prompt = retry_bad_json_tpl.format(error=e) + 
continue + else: + print(f"\nanswer.json NOT found after attempt {attempt + 1}", flush=True) + + current_prompt = retry_no_file + + print(f"\nFAILED: answer.json not produced after {max_attempts} attempts", flush=True) + return {task_id: "ERROR: answer.json not produced after retries"} + + except subprocess.TimeoutExpired: + print(f"\nFAILED: Timeout", flush=True) + return {task_id: "ERROR: Timeout"} + except Exception as e: + print(f"\nFAILED: {e}", flush=True) + return {task_id: f"ERROR: {str(e)}"} + + +# ============================================================ +# GAIA Handler +# ============================================================ + + +def _run_gaia(task_id, task_data, model_name, kwargs): + """Handle GAIA benchmark tasks.""" + opencode_bin = ensure_opencode_cli() + _setup_opencode_config() + + question = task_data.get("Question", "") + + initial_prompt = f"""You are answering a GAIA benchmark question. Any attached files are stored in the current directory. + +Question: +{question} + +Instructions: +- Return only your answer, which should be a number, or a short phrase with as few words as possible, or a comma separated list of numbers and/or strings. +- If the answer is a number, return only the number without any units unless specified otherwise. +- If the answer is a string, don't include articles, and don't use abbreviations (e.g. for states). +- If the answer is a comma separated list, apply the above rules to each element in the list. + +Write ONLY your final answer (nothing else) to /workspace/answer.txt +Do NOT include any explanation, just the answer itself. +""" + + answer_file = "/workspace/answer.txt" + _remove_file(answer_file) + max_attempts = 3 + + try: + for attempt in range(max_attempts): + if attempt == 0: + prompt_to_use = initial_prompt + else: + prompt_to_use = ( + "Previous attempt did not produce /workspace/answer.txt. 
" + "Please answer the question and write ONLY the answer to /workspace/answer.txt.\n\n" + f"Question: {question}" + ) + _run_opencode_cli(opencode_bin, model_name, prompt_to_use) + + answer = _read_file(answer_file) + if answer is not None: + return { + task_id: { + "answer": answer.strip(), + "metrics": {}, + } + } + + return {task_id: {"answer": "", "metrics": {}}} + + except subprocess.TimeoutExpired: + return {task_id: {"answer": "ERROR: Timeout", "metrics": {}}} + except Exception as e: + return {task_id: {"answer": f"ERROR: {str(e)}", "metrics": {}}} + + +# ============================================================ +# USACO Handler +# ============================================================ + + +def _run_usaco(task_id, task_data, model_name, kwargs): + """Handle USACO programming problems.""" + opencode_bin = ensure_opencode_cli() + _setup_opencode_config() + + description = task_data.get("description", "") + + prompt = f"""Please solve the following USACO competitive programming problem. +Write a complete Python 3 solution that reads from stdin and writes to stdout. +No outside libraries are allowed. + +[BEGIN PROBLEM] +{description} +[END PROBLEM] + +Write your complete Python 3 solution to /workspace/solution.py +The solution must: +- Read input from stdin +- Write output to stdout +- Be a single, complete, runnable Python file +- Not use any external libraries +""" + + solution_file = "/workspace/solution.py" + _remove_file(solution_file) + max_attempts = 3 + + try: + for attempt in range(max_attempts): + if attempt == 0: + prompt_to_use = prompt + else: + prompt_to_use = ( + "Previous attempt did not produce /workspace/solution.py. 
" + "Please write a complete Python 3 solution to /workspace/solution.py.\n\n" + f"[BEGIN PROBLEM]\n{description}\n[END PROBLEM]" + ) + _run_opencode_cli(opencode_bin, model_name, prompt_to_use) + + code = _read_file(solution_file) + if code is not None: + return { + task_id: { + "answer": code, + "metrics": {}, + } + } + + return {task_id: {"answer": "", "metrics": {}}} + + except subprocess.TimeoutExpired: + return {task_id: {"answer": "ERROR: Timeout", "metrics": {}}} + except Exception as e: + return {task_id: {"answer": f"ERROR: {str(e)}", "metrics": {}}} + + +# ============================================================ +# ScienceAgentBench Handler +# ============================================================ + + +def _run_scienceagentbench(task_id, task_data, model_name, kwargs): + """Handle ScienceAgentBench tasks.""" + opencode_bin = ensure_opencode_cli() + _setup_opencode_config() + + task_inst = task_data.get("task_inst", "") + domain_knowledge = task_data.get("domain_knowledge", "") + dataset_folder_tree = task_data.get("dataset_folder_tree", "") + dataset_preview = task_data.get("dataset_preview", "") + output_fname = task_data.get("output_fname", "output.txt") + + dataset_path = "" + if dataset_folder_tree: + first_line = dataset_folder_tree.split("\n")[0] + if len(first_line) > 4: + dataset_path = "benchmark/datasets/" + first_line[4:] + + prompt = f"""Write a Python 3 script to solve the following scientific computing task. + +Task: +{task_inst} +""" + if domain_knowledge: + prompt += f"\n{domain_knowledge}\n" + + prompt += f""" +You can access the dataset at `{dataset_path}`. 
Here is the directory structure: +``` +{dataset_folder_tree} +``` +Here are some helpful previews for the dataset file(s): +{dataset_preview} + +Write your complete Python 3 script to /workspace/solution.py +The script should produce the output file: {output_fname} +""" + + solution_file = "/workspace/solution.py" + _remove_file(solution_file) + max_attempts = 3 + + try: + for attempt in range(max_attempts): + if attempt == 0: + prompt_to_use = prompt + else: + prompt_to_use = ( + "Previous attempt did not produce /workspace/solution.py. " + "Please write a complete Python 3 script to /workspace/solution.py.\n\n" + f"Task: {task_inst}" + ) + _run_opencode_cli(opencode_bin, model_name, prompt_to_use) + + code = _read_file(solution_file) + if code is not None: + return { + task_id: { + "history": [{"role": "assistant", "content": f"```python{code}```"}], + "cost": 0.0, + } + } + + return {task_id: {"history": [{"role": "assistant", "content": ""}], "cost": 0.0}} + + except subprocess.TimeoutExpired: + return {task_id: {"history": [{"role": "assistant", "content": "ERROR: Timeout"}], "cost": 0.0}} + except Exception as e: + return {task_id: {"history": [{"role": "assistant", "content": f"ERROR: {str(e)}"}], "cost": 0.0}} + + +# ============================================================ +# SWE-bench Handler +# ============================================================ + + +def _run_swebench(task_id, task_data, model_name, kwargs): + """Handle SWE-bench tasks (verified and verified_mini).""" + opencode_bin = ensure_opencode_cli() + _setup_opencode_config() + + repo = task_data.get("repo", "") + base_commit = task_data.get("base_commit", "") + problem_statement = task_data.get("problem_statement", "") + repo_name = repo.split("/")[-1] if repo else "repo" + repo_dir = f"/workspace/{repo_name}" + + # Clone and reset the repository + try: + if os.path.exists(repo_dir): + shutil.rmtree(repo_dir) + result = subprocess.run( + ["git", "clone", 
f"https://github.com/{repo}.git"], + capture_output=True, text=True, cwd="/workspace", timeout=300, + ) + if result.returncode != 0: + return {task_id: f"ERROR: Failed to clone: {result.stderr}"} + + result = subprocess.run( + ["git", "reset", "--hard", base_commit], + capture_output=True, text=True, cwd=repo_dir, timeout=60, + ) + if result.returncode != 0: + return {task_id: f"ERROR: Failed to reset: {result.stderr}"} + except Exception as e: + return {task_id: f"ERROR: Repo setup failed: {str(e)}"} + + prompt = f"""You need to fix a software issue in the repository at /workspace/{repo_name}. + +Problem Statement: +{problem_statement} + +Instructions: +1. Navigate to /workspace/{repo_name} and understand the codebase +2. Identify the root cause of the issue +3. Make the necessary code changes to fix it +4. Make changes directly to the source files + +Do NOT create a patch file. Just edit the source files directly to fix the issue. +""" + + try: + _run_opencode_cli(opencode_bin, model_name, prompt, cwd="/workspace") + + # Generate patch from git diff + result = subprocess.run( + ["git", "diff"], + capture_output=True, text=True, cwd=repo_dir, timeout=60, + ) + model_patch = result.stdout if result.returncode == 0 else "" + + if not model_patch: + # Also check staged changes + result = subprocess.run( + ["git", "diff", "--cached"], + capture_output=True, text=True, cwd=repo_dir, timeout=60, + ) + model_patch = result.stdout if result.returncode == 0 else "" + + return {task_id: model_patch} + + except subprocess.TimeoutExpired: + return {task_id: "ERROR: Timeout"} + except Exception as e: + return {task_id: f"ERROR: {str(e)}"} + + +# ============================================================ +# SciCode Handler +# ============================================================ + + +# Special-case hardcoded code for known problematic steps (matches generalist agent) +_SCICODE_SPECIAL_CASES = { + ("13", 5): '''\ + class Maxwell: + """ The base class for evolution of 
Maxwell's equations. + """ + + def __init__(self, n_grid, x_out): + self.n_grid = n_grid + self.n_vars = 7 + self.delta = float(x_out) / (n_grid - 2.0) + delta = self.delta + + self.x = np.linspace(-self.delta*0.5, x_out + 0.5*self.delta, self.n_grid)[:,None,None] + self.y = np.linspace(-self.delta*0.5, x_out + 0.5*self.delta, self.n_grid)[None,:,None] + self.z = np.linspace(-self.delta*0.5, x_out + 0.5*self.delta, self.n_grid)[None,None,:] + self.r = np.sqrt(self.x**2+self.y**2+self.z**2) + + self.E_x = zeros((n_grid, n_grid, n_grid)) + self.E_y = zeros((n_grid, n_grid, n_grid)) + self.E_z = zeros((n_grid, n_grid, n_grid)) + self.A_x = zeros((n_grid, n_grid, n_grid)) + self.A_y = zeros((n_grid, n_grid, n_grid)) + self.A_z = zeros((n_grid, n_grid, n_grid)) + self.phi = zeros((n_grid, n_grid, n_grid)) + self.constraint = zeros((n_grid, n_grid, n_grid)) + + self.t = 0.0 +''', + ("62", 0): ''' +class Block: + def __init__(self, length, basis_size, operator_dict): + self.length = length + self.basis_size = basis_size + self.operator_dict = operator_dict + + def print_all(self): + print(self.length) + print(self.basis_size) + for key, matrix in self.operator_dict.items(): + if isinstance(matrix, np.ndarray): + print(f"{key}:\\n{matrix}\\n") + else: + print(f"{key}:\\n{matrix.toarray()}\\n") + +class EnlargedBlock: + def __init__(self, length, basis_size, operator_dict): + self.length = length + self.basis_size = basis_size + self.operator_dict = operator_dict + + def print_all(self): + print(self.length) + print(self.basis_size) + for key, matrix in self.operator_dict.items(): + if isinstance(matrix, np.ndarray): + print(f"{key}:\\n{matrix}\\n") + else: + print(f"{key}:\\n{matrix.toarray()}\\n") +''', + ("76", 2): """ +def generate_dna(N: int, PWM: dict) -> tuple: + ''' + Input: + N (int): Length of the resultant DNA sequence. 
+ PWM matrix with keys 'A', 'C', 'G', 'T' + + Output: + tuple: Insertion location (int), DNA sequence (str), DNA reverse complement (str) + ''' + p = random.randint(0, N-1) + + nucleotide = "ACGT" + uni_weights = [0.25,0.25,0.25,0.25] #uniform distribution + dna_string = ''.join(random.choices(nucleotide, uni_weights, k=N)) + + spike_mat = load_motif_from_df(PWM) + spiked_seq = ''.join(random.choices(nucleotide, weights=[PWM[nuc][i] for nuc in nucleotide], k=1)[0] + for i in range(len(PWM['A']))) + + complement = {'A':'T', 'T':'A', 'C':'G', 'G':'C'} + reversed_seq = dna_string[::-1] + reverse_complement = ''.join(complement[nuc] for nuc in reversed_seq if nuc in complement) + + new_seq = dna_string[:p] + spiked_seq + dna_string[p:] + new_seq_rc = reverse_complement[:N-p] + spiked_seq + reverse_complement[N-p:] + + return p, new_seq, new_seq_rc +""", +} + + +def _run_scicode(task_id, task_data, model_name, kwargs): + """Handle SciCode benchmark tasks.""" + opencode_bin = ensure_opencode_cli() + _setup_opencode_config() + + benchmark_name = kwargs.get("benchmark_name", "scicode") + easy = benchmark_name == "scicode_easy" + + prompt_template = """PROBLEM DESCRIPTION: +You will be provided with problem steps along with background knowledge necessary for solving the problem. Your task will be to develop a Python solution focused on the next step of the problem-solving process. + +PROBLEM STEPS AND FUNCTION CODE: +Here, you'll find the Python code for the initial steps of the problem-solving process. This code is integral to building the solution. + +{problem_steps_str} + +NEXT STEP - PROBLEM STEP AND FUNCTION HEADER: +This part will describe the next step in the problem-solving process. A function header will be provided, and your task is to develop the Python code for this next step based on the provided description and function header. + +{next_step_str} + +DEPENDENCIES: +Use only the following dependencies in your solution. 
Do not include these dependencies at the beginning of your code. + +{dependencies} + +RESPONSE GUIDELINES: +1. Write the complete and executable Python program for the next step in a single block. +3. Your response should focus exclusively on implementing the solution for the next step, adhering closely to the specified function header and the context provided by the initial steps. +4. DO NOT include previous function code, example usage or test code in your response. +5. Write ONLY the Python code to /workspace/step_code.py (no markdown fences in the file). +""" + + def process_problem_code(prob_data, num_steps): + header_docstring = prob_data["sub_steps"][num_steps - 1]["function_header"] + return_str = prob_data["sub_steps"][num_steps - 1]["return_line"] + return f"{header_docstring}\n\n{return_str}" + + def process_problem_steps(with_background, previous_llm_code, problem_data, num_steps): + output_lines = [] + for i in range(num_steps - 1): + step_desc = problem_data["sub_steps"][i]["step_description_prompt"] + if with_background: + step_desc += "\n" + problem_data["sub_steps"][i]["step_background"] + output_lines.append(step_desc) + output_lines.append(previous_llm_code[i]) + output_lines.append("------") + + next_step = [] + step_desc = problem_data["sub_steps"][num_steps - 1]["step_description_prompt"] + if with_background: + step_desc += "\n" + problem_data["sub_steps"][num_steps - 1]["step_background"] + next_step.append(step_desc) + next_step.append(process_problem_code(problem_data, num_steps)) + output_str = "\n\n".join(output_lines[:-1]) if output_lines else "" + next_step_str = "\n\n".join(next_step) + return output_str, next_step_str + + def generate_prompt_with_steps(with_background, previous_llm_code, prob_data, num_steps, template): + problem_steps_str, next_step_str = process_problem_steps( + with_background, previous_llm_code, prob_data, num_steps + ) + dependencies = prob_data["required_dependencies"] + assert next_step_str + return 
template.format( + problem_steps_str=problem_steps_str, + next_step_str=next_step_str, + dependencies=dependencies, + ), f"{dependencies}\n" + + previous_llm_code = [] + full_code = "" + steps = len(task_data["sub_steps"]) + steps_results = {} + step_code_file = "/workspace/step_code.py" + + print(f"Generating {task_id}...") + + try: + for i in range(steps): + # Check for special cases + special_key = (task_id, i) + if special_key in _SCICODE_SPECIAL_CASES: + step_code = _SCICODE_SPECIAL_CASES[special_key] + previous_llm_code.append(step_code) + full_code += f"\n{step_code}" + steps_results[f"{task_id}.{i + 1}"] = full_code + continue + + prompt, dependencies = generate_prompt_with_steps( + with_background=easy, + previous_llm_code=previous_llm_code, + prob_data=task_data, + num_steps=i + 1, + template=prompt_template, + ) + + _remove_file(step_code_file) + _run_opencode_cli(opencode_bin, model_name, prompt) + + generated_code = _read_file(step_code_file, "") + + if not generated_code.strip(): + # Fallback: try to extract from log + log_content = _read_file("/workspace/opencode_exec.log", "") + if "```python" in log_content: + generated_code = log_content.split("```python")[-1].split("```")[0].strip() + + generated_code = generated_code.replace("```python", "").replace("```", "").strip() + + previous_llm_code.append(generated_code) + full_code += f"\n{generated_code}" + + if easy: + steps_results[f"{task_id}.{i + 1}"] = full_code + else: + steps_results[f"{task_id}.{i + 1}"] = dependencies + full_code + + return {task_id: steps_results} + + except subprocess.TimeoutExpired: + return {task_id: "ERROR: Timeout"} + except Exception as e: + return {task_id: f"ERROR: {str(e)}"} + + +# ============================================================ +# AssistantBench Handler +# ============================================================ + + +def _run_assistantbench(task_id, task_data, model_name, kwargs): + """Handle AssistantBench tasks.""" + opencode_bin = 
ensure_opencode_cli() + _setup_opencode_config() + + task_question = task_data.get("task", "") + + prompt = f"""Provide a concise and accurate answer to the question below without any additional context in the format suggested by the prompt. Do not include any justification or any additional unnecessary text. Your answer does not need to be a full sentence. If you are unsure what the final answer is, generate an empty string. The answer should either be: a number, a string, a list of strings, or a list of jsons. The answer should be parsed with the python method: json.loads(input_str). If no answer is found, generate an empty string. If the prompt includes a specified answer format, respect that format. + +[BEGIN QUESTION] +{task_question} +[END QUESTION] + +Write ONLY your final answer to /workspace/answer.txt +The content should be directly parseable by json.loads(). For strings, include the quotes. +""" + + answer_file = "/workspace/answer.txt" + _remove_file(answer_file) + max_attempts = 3 + + try: + for attempt in range(max_attempts): + if attempt == 0: + prompt_to_use = prompt + else: + prompt_to_use = ( + "Previous attempt did not produce /workspace/answer.txt. 
" + "Please answer the question and write ONLY the answer to /workspace/answer.txt.\n\n" + f"Question: {task_question}" + ) + _run_opencode_cli(opencode_bin, model_name, prompt_to_use) + + answer = _read_file(answer_file) + if answer is not None: + return { + task_id: { + "answer": answer.strip(), + "metrics": {}, + } + } + + return {task_id: {"answer": "", "metrics": {}}} + + except subprocess.TimeoutExpired: + return {task_id: {"answer": "ERROR: Timeout", "metrics": {}}} + except Exception as e: + return {task_id: {"answer": f"ERROR: {str(e)}", "metrics": {}}} + + +# ============================================================ +# Tau-bench Handler (uses litellm directly, not OpenCode CLI) +# ============================================================ + + +def _make_tool(name, desc, properties, required=None): + """Create an OpenAI function calling tool definition.""" + tool = { + "type": "function", + "function": { + "name": name, + "description": desc, + "parameters": { + "type": "object", + "properties": properties, + }, + }, + } + if required: + tool["function"]["parameters"]["required"] = required + return tool + + +# Common tools shared by both airline and retail domains +_RESPOND_TOOL = _make_tool( + "respond", + "Send a message to the customer. Use this when you want to communicate with the customer.", + {"content": {"type": "string", "description": "The message to send to the customer."}}, + ["content"], +) + +_THINK_TOOL = _make_tool( + "think", + "Think about something. Does not obtain new info or change the database. Use for complex reasoning.", + {"thought": {"type": "string", "description": "Your thought process."}}, + ["thought"], +) + +_TRANSFER_TOOL = _make_tool( + "transfer_to_human_agents", + "Transfer to a human agent. 
Only use if user explicitly asks or issue cannot be resolved with available tools.", + {"summary": {"type": "string", "description": "Summary of the user's issue."}}, + ["summary"], +) + +_CALCULATE_TOOL = _make_tool( + "calculate", + "Calculate a math expression. Supports +, -, *, /, (, ). Returns result rounded to 2 decimals.", + {"expression": {"type": "string", "description": "The math expression to evaluate."}}, + ["expression"], +) + + +AIRLINE_TOOLS = [ + _RESPOND_TOOL, _THINK_TOOL, _TRANSFER_TOOL, _CALCULATE_TOOL, + _make_tool( + "book_reservation", "Book a flight reservation.", + { + "user_id": {"type": "string", "description": "User ID, e.g. 'sara_doe_496'."}, + "origin": {"type": "string", "description": "Origin IATA code, e.g. 'SFO'."}, + "destination": {"type": "string", "description": "Destination IATA code, e.g. 'JFK'."}, + "flight_type": {"type": "string", "enum": ["one_way", "round_trip"], "description": "Trip type."}, + "cabin": {"type": "string", "enum": ["basic_economy", "economy", "business"], "description": "Cabin class."}, + "flights": {"type": "array", "items": {"type": "object"}, "description": "Flight segments: [{flight_number, date}, ...]."}, + "passengers": {"type": "array", "items": {"type": "object"}, "description": "Passengers: [{first_name, last_name, dob}, ...]."}, + "payment_methods": {"type": "array", "items": {"type": "object"}, "description": "Payments: [{payment_id, amount}, ...]."}, + "total_baggages": {"type": "integer", "description": "Total baggage count."}, + "nonfree_baggages": {"type": "integer", "description": "Non-free baggage count."}, + "insurance": {"type": "string", "enum": ["yes", "no"], "description": "Travel insurance."}, + }, + ["user_id", "origin", "destination", "flight_type", "cabin", "flights", + "passengers", "payment_methods", "total_baggages", "nonfree_baggages", "insurance"], + ), + _make_tool( + "cancel_reservation", "Cancel a reservation.", + {"reservation_id": {"type": "string", "description": 
"Reservation ID, e.g. 'ZFA04Y'."}}, + ["reservation_id"], + ), + _make_tool( + "get_reservation_details", "Get reservation details.", + {"reservation_id": {"type": "string", "description": "Reservation ID, e.g. '8JX2WO'."}}, + ["reservation_id"], + ), + _make_tool( + "get_user_details", "Get user details including reservations.", + {"user_id": {"type": "string", "description": "User ID, e.g. 'sara_doe_496'."}}, + ["user_id"], + ), + _make_tool( + "list_all_airports", "List all airports and their cities.", + {}, + ), + _make_tool( + "search_direct_flight", "Search direct flights between two cities on a date.", + { + "origin": {"type": "string", "description": "Origin IATA code."}, + "destination": {"type": "string", "description": "Destination IATA code."}, + "date": {"type": "string", "description": "Date in YYYY-MM-DD format."}, + }, + ["origin", "destination", "date"], + ), + _make_tool( + "search_onestop_flight", "Search one-stop flights between two cities on a date.", + { + "origin": {"type": "string", "description": "Origin IATA code."}, + "destination": {"type": "string", "description": "Destination IATA code."}, + "date": {"type": "string", "description": "Date in YYYY-MM-DD format."}, + }, + ["origin", "destination", "date"], + ), + _make_tool( + "send_certificate", "Send a travel certificate to a user.", + { + "user_id": {"type": "string", "description": "User ID."}, + "amount": {"type": "number", "description": "Certificate amount."}, + }, + ["user_id", "amount"], + ), + _make_tool( + "update_reservation_baggages", "Update baggage info for a reservation.", + { + "reservation_id": {"type": "string", "description": "Reservation ID."}, + "total_baggages": {"type": "integer", "description": "Updated total baggage count."}, + "nonfree_baggages": {"type": "integer", "description": "Updated non-free baggage count."}, + "payment_id": {"type": "string", "description": "Payment ID for extra charges."}, + }, + ["reservation_id", "total_baggages", "nonfree_baggages", 
"payment_id"], + ), + _make_tool( + "update_reservation_flights", + "Update flight info for a reservation. Include ALL flight segments even if unchanged.", + { + "reservation_id": {"type": "string", "description": "Reservation ID."}, + "cabin": {"type": "string", "enum": ["basic_economy", "economy", "business"], "description": "Cabin class."}, + "flights": {"type": "array", "items": {"type": "object"}, + "description": "ALL flight segments (even unchanged): [{flight_number, date}, ...]."}, + "payment_id": {"type": "string", "description": "Payment ID for price difference."}, + }, + ["reservation_id", "cabin", "flights", "payment_id"], + ), + _make_tool( + "update_reservation_passengers", "Update passenger info for a reservation.", + { + "reservation_id": {"type": "string", "description": "Reservation ID."}, + "passengers": {"type": "array", "items": {"type": "object"}, + "description": "Updated passengers: [{first_name, last_name, dob}, ...]."}, + }, + ["reservation_id", "passengers"], + ), +] + + +RETAIL_TOOLS = [ + _RESPOND_TOOL, _THINK_TOOL, _TRANSFER_TOOL, _CALCULATE_TOOL, + _make_tool( + "get_user_details", "Get user details including orders.", + {"user_id": {"type": "string", "description": "User ID."}}, + ["user_id"], + ), + _make_tool( + "get_order_details", "Get order details.", + {"order_id": {"type": "string", "description": "Order ID, e.g. '#W0000000'."}}, + ["order_id"], + ), + _make_tool( + "get_product_details", "Get product details by product ID (not item ID).", + {"product_id": {"type": "string", "description": "Product ID, e.g. 
'6086499569'."}}, + ["product_id"], + ), + _make_tool( + "list_all_product_types", "List all product types and their product IDs.", + {}, + ), + _make_tool( + "find_user_id_by_email", "Find user ID by email address.", + {"email": {"type": "string", "description": "User's email address."}}, + ["email"], + ), + _make_tool( + "find_user_id_by_name_zip", "Find user ID by first name, last name, and zip code.", + { + "first_name": {"type": "string", "description": "User's first name."}, + "last_name": {"type": "string", "description": "User's last name."}, + "zip": {"type": "string", "description": "User's zip code."}, + }, + ["first_name", "last_name", "zip"], + ), + _make_tool( + "modify_pending_order_items", + "Modify items in a pending order. Can only be called once per order.", + { + "order_id": {"type": "string", "description": "Order ID."}, + "item_ids": {"type": "array", "items": {"type": "string"}, "description": "Item IDs to replace."}, + "new_item_ids": {"type": "array", "items": {"type": "string"}, + "description": "New item IDs (same product type)."}, + "payment_method_id": {"type": "string", "description": "Payment method for price difference."}, + }, + ["order_id", "item_ids", "new_item_ids", "payment_method_id"], + ), + _make_tool( + "modify_pending_order_address", "Modify shipping address for a pending order.", + { + "order_id": {"type": "string", "description": "Order ID."}, + "address1": {"type": "string", "description": "Street address line 1."}, + "address2": {"type": "string", "description": "Street address line 2."}, + "city": {"type": "string", "description": "City."}, + "state": {"type": "string", "description": "State."}, + "country": {"type": "string", "description": "Country."}, + "zip": {"type": "string", "description": "Zip code."}, + }, + ["order_id", "address1", "address2", "city", "state", "country", "zip"], + ), + _make_tool( + "modify_pending_order_payment", "Change payment method for a pending order.", + { + "order_id": {"type": 
"string", "description": "Order ID."}, + "payment_method_id": {"type": "string", "description": "New payment method ID."}, + }, + ["order_id", "payment_method_id"], + ), + _make_tool( + "cancel_pending_order", "Cancel a pending order.", + { + "order_id": {"type": "string", "description": "Order ID."}, + "reason": {"type": "string", "enum": ["no longer needed", "ordered by mistake"], + "description": "Cancellation reason."}, + }, + ["order_id", "reason"], + ), + _make_tool( + "return_delivered_order_items", "Return items from a delivered order.", + { + "order_id": {"type": "string", "description": "Order ID."}, + "item_ids": {"type": "array", "items": {"type": "string"}, "description": "Item IDs to return."}, + "payment_method_id": {"type": "string", + "description": "Refund payment method (original method or gift card)."}, + }, + ["order_id", "item_ids", "payment_method_id"], + ), + _make_tool( + "exchange_delivered_order_items", "Exchange items from a delivered order.", + { + "order_id": {"type": "string", "description": "Order ID."}, + "item_ids": {"type": "array", "items": {"type": "string"}, "description": "Item IDs to exchange."}, + "new_item_ids": {"type": "array", "items": {"type": "string"}, + "description": "New item IDs (same product type)."}, + "payment_method_id": {"type": "string", "description": "Payment method for price difference."}, + }, + ["order_id", "item_ids", "new_item_ids", "payment_method_id"], + ), + _make_tool( + "modify_user_address", "Modify user's default address.", + { + "user_id": {"type": "string", "description": "User ID."}, + "address1": {"type": "string", "description": "Street address line 1."}, + "address2": {"type": "string", "description": "Street address line 2."}, + "city": {"type": "string", "description": "City."}, + "state": {"type": "string", "description": "State."}, + "country": {"type": "string", "description": "Country."}, + "zip": {"type": "string", "description": "Zip code."}, + }, + ["user_id", "address1", 
"address2", "city", "state", "country", "zip"], + ), +] + + +def _run_taubench(task_id, task_data, model_name, kwargs): + """Handle tau-bench tasks using litellm for LLM calls. + + Tau-bench requires interactive tool calling which is not suited for the + OpenCode CLI one-shot approach. Instead we use litellm directly with + OpenAI-compatible function calling in a conversation loop. + """ + try: + from tau_bench.envs import get_env + from tau_bench.types import Action + import litellm + except ImportError as e: + return {task_id: f"ERROR: Missing dependency: {e}. Install tau-bench and litellm."} + + # Set up environment + env = get_env( + task_data["env"], + task_data["user_strategy"], + task_data["user_model"], + task_data["task_split"], + task_data["user_provider"], + task_data["task_index"], + ) + + obs = env.reset(task_data["task_index"]) + wiki = env.wiki + + domain = task_data["env"] + tools = AIRLINE_TOOLS if domain == "airline" else RETAIL_TOOLS + domain_desc = "an airline" if domain == "airline" else "a retail company" + + system_prompt = f"""You are a customer service agent for {domain_desc}. Follow the policies and guidelines below carefully. + +{wiki} + +Use the available tools to help the customer. When you want to respond to the customer, use the 'respond' tool. +Always verify the customer's identity before making any changes. +Think step by step before taking actions. 
Use the 'think' tool for complex reasoning.""" + + messages = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": obs.observation}, + ] + + litellm.drop_params = True + max_steps = 200 + + try: + for step in range(max_steps): + try: + response = litellm.completion( + model=model_name, + messages=messages, + tools=tools, + tool_choice="auto", + ) + except Exception as e: + print(f"LLM call failed at step {step}: {e}") + break + + choice = response.choices[0] + assistant_msg = choice.message + + if assistant_msg.tool_calls: + # Convert to serialisable dict for messages + messages.append(assistant_msg.model_dump()) + done = False + for tc in assistant_msg.tool_calls: + fn_name = tc.function.name + try: + fn_args = json.loads(tc.function.arguments) + except json.JSONDecodeError: + fn_args = {} + + action = Action(name=fn_name, kwargs=fn_args) + observation = env.step(action) + + messages.append({ + "role": "tool", + "content": observation.observation, + "tool_call_id": tc.id, + }) + + if observation.done: + done = True + break + + if done: + break + else: + # Text response without tool calls - treat as respond action + content = assistant_msg.content or "" + messages.append({"role": "assistant", "content": content}) + action = Action(name="respond", kwargs={"content": content}) + observation = env.step(action) + + if observation.done: + break + + messages.append({"role": "user", "content": observation.observation}) + + return { + task_id: { + "reward": env.reward, + "taken_actions": [a.model_dump() for a in env.actions], + "task": env.task.model_dump(), + } + } + + except Exception as e: + return {task_id: f"ERROR: {str(e)}"} + + +# ============================================================ +# Main Entry Point +# ============================================================ + + +def run(input: Dict[str, Dict], **kwargs) -> Dict[str, Any]: + """Run OpenCode agent on various benchmarks. 
+ + For most benchmarks the OpenCode CLI is used as a one-shot coding agent. + For tau-bench (which requires interactive tool calling) litellm is used + directly with function calling. + """ + assert "model_name" in kwargs, "model_name is required" + assert len(input) == 1, "input must contain only one task" + + benchmark_name = kwargs.get("benchmark_name", "") + model_name: str = kwargs["model_name"] + task_id, task_data = list(input.items())[0] + + try: + if benchmark_name.startswith("corebench"): + return _run_corebench(task_id, task_data, model_name, kwargs) + elif benchmark_name == "gaia": + return _run_gaia(task_id, task_data, model_name, kwargs) + elif benchmark_name == "usaco": + return _run_usaco(task_id, task_data, model_name, kwargs) + elif benchmark_name == "scienceagentbench": + return _run_scienceagentbench(task_id, task_data, model_name, kwargs) + elif benchmark_name in ("swebench_verified", "swebench_verified_mini"): + return _run_swebench(task_id, task_data, model_name, kwargs) + elif benchmark_name.startswith("taubench"): + return _run_taubench(task_id, task_data, model_name, kwargs) + elif benchmark_name in ("scicode", "scicode_easy", "scicode_hard"): + return _run_scicode(task_id, task_data, model_name, kwargs) + elif benchmark_name == "assistantbench": + return _run_assistantbench(task_id, task_data, model_name, kwargs) + else: + return {task_id: f"ERROR: Unsupported benchmark: {benchmark_name}"} + finally: + _save_opencode_logs() diff --git a/agents/opencode_agent/opencode_setup.py b/agents/opencode_agent/opencode_setup.py new file mode 100644 index 0000000..9d3da8d --- /dev/null +++ b/agents/opencode_agent/opencode_setup.py @@ -0,0 +1,114 @@ +"""OpenCode agent setup – constants and utilities for CLI installation and configuration.""" + +import json +import os +import shutil +import subprocess +from typing import Optional + + +# ── Constants ──────────────────────────────────────────────── + +OPENCODE_INSTALL_CMD = "curl -fsSL 
https://opencode.ai/install | bash" + +KNOWN_BINARY_LOCATIONS = [ + "~/.local/bin/opencode", + "~/.opencode/bin/opencode", + "~/.local/share/opencode/bin/opencode", + "/usr/local/bin/opencode", + "/usr/bin/opencode", + "/opt/homebrew/bin/opencode", +] + +OPENCODE_CONFIG = { + "$schema": "https://opencode.ai/config.json", + "logLevel": "DEBUG", + "permission": { + "*": "allow", + "external_directory": "allow", + "doom_loop": "allow", + "edit": "allow", + "bash": "allow", + "webfetch": "allow", + "websearch": "allow", + "read": "allow", + "glob": "allow", + "grep": "allow", + "list": "allow", + "task": "allow", + "skill": "allow", + }, +} + + +# ── Utility Functions ──────────────────────────────────────── + + +def find_opencode_binary() -> Optional[str]: + """Search for the opencode binary in PATH and known locations.""" + found = shutil.which("opencode") + if found: + return found + + for loc in KNOWN_BINARY_LOCATIONS: + expanded = os.path.expanduser(loc) + if os.path.isfile(expanded) and os.access(expanded, os.X_OK): + return expanded + + search_roots = [os.path.expanduser("~"), "/usr/local", "/usr", "/opt"] + for root in search_roots: + for sub in ("bin", "local/bin", "local/share/opencode/bin"): + candidate = os.path.join(root, sub, "opencode") + if os.path.isfile(candidate) and os.access(candidate, os.X_OK): + return candidate + + return None + + +def install_system_deps() -> None: + """Install system dependencies (git, curl, procps) via apt-get.""" + subprocess.run( + "sudo rm -f /etc/apt/sources.list.d/cuda.list /etc/apt/sources.list.d/nvidia-ml.list 2>/dev/null || true", + shell=True, + ) + subprocess.run(["sudo", "apt-get", "update"], check=True) + subprocess.run(["sudo", "apt-get", "install", "-y", "git", "curl", "procps"], check=True) + + +def install_opencode_cli() -> None: + """Run the official OpenCode CLI installer.""" + subprocess.run(OPENCODE_INSTALL_CMD, shell=True, check=True) + + +def setup_opencode_config(cwd: str = "/workspace") -> str: + 
"""Write project-level OpenCode config (.opencode/config.json). + + Returns the path to the written config file. + """ + config_dir = os.path.join(cwd, ".opencode") + os.makedirs(config_dir, exist_ok=True) + config_file = os.path.join(config_dir, "config.json") + with open(config_file, "w") as f: + json.dump(OPENCODE_CONFIG, f, indent=2) + return config_file + + +def setup_global_config() -> str: + """Write global OpenCode config (~/.config/opencode/opencode.json). + + Returns the path to the written config file. + """ + global_config_dir = os.path.expanduser("~/.config/opencode") + os.makedirs(global_config_dir, exist_ok=True) + global_config = { + "$schema": "https://opencode.ai/config.json", + "logLevel": "DEBUG", + "provider": { + "anthropic": {}, + "openai": {}, + }, + } + global_config_path = os.path.join(global_config_dir, "opencode.json") + with open(global_config_path, "w") as f: + json.dump(global_config, f, indent=2) + return global_config_path diff --git a/agents/opencode_agent/prompts.py b/agents/opencode_agent/prompts.py new file mode 100644 index 0000000..0eb6b30 --- /dev/null +++ b/agents/opencode_agent/prompts.py @@ -0,0 +1,117 @@ +"""Prompt templates for the OpenCode agent across benchmarks.""" + +from typing import List + +# ── CORE-bench ─────────────────────────────────────────────── + +COREBENCH_SYSTEM = """\ +You are a seasoned digital assistant: capable, intelligent, considerate, and assertive. \ +As a dedicated research assistant, you possess extensive skills in research and development \ +and do not shy away from writing code to solve complex problems. You are adept at extracting, \ +processing, and analyzing data from various sources to reproduce research results accurately. \ +Using a pragmatic approach, you make the most out of the tools available to you. + +Best practices: +- When reproducing figures or other results that require you to deal with images, check the full results directory for image files. 
+- If you are unsure of what to do, make your best guess. +- Before using resources like scripts or utilities, verify their presence and functionality in the current directory or installation path. +- If there exists a file called 'manuscript' then first read this file to extract the required results to answer the questions of the task. +- If you are extracting information from html (such as the output of a Jupyter notebook), convert it to a PDF or PNG first and then extract the relevant information. +- Before running the code, first determine a list of package/dependency requirements that must be installed by reading through the README file or code itself. Then install those dependencies before running the code. + +Environment notes: +- Symbolic links exist: /data -> environment/data, /code -> environment/code, /results -> environment/results. Use these absolute paths freely. +- You have sudo access. Use "sudo" for apt-get installs and system-level operations. +- If R package install fails with a lock error, run: sudo rm -rf /usr/local/lib/R/site-library/00LOCK-* then retry. +- If apt-get fails with a lock error, run: sudo rm -f /var/lib/dpkg/lock* /var/lib/apt/lists/lock /var/cache/apt/archives/lock && sudo dpkg --configure -a then retry. +- For R packages, install with: Rscript -e 'install.packages("pkg", repos="https://cloud.r-project.org")' +- For rendering .Rmd files: Rscript -e 'rmarkdown::render("file.Rmd", output_dir="../results", clean=TRUE)' +- When running R scripts, run from the ./environment/code/ directory. +- Use xvfb-run for R scripts that generate plots: xvfb-run Rscript script.R +- If code uses deprecated numpy functions (e.g. np.bool8, np.int, np.float, np.complex, np.object, np.str, np.recfromtxt), fix them before running: np.bool8 -> np.bool_, np.int -> int, np.float -> float, np.recfromtxt -> np.genfromtxt, etc. + +Constraints: +- Use flags or modify commands to bypass any need for confirmations during execution to streamline processes. 
+- There is no task that you cannot do, so you should not refuse a request by saying you cannot do it.
+- You may wish to write and run a Python script to complete the task. However, assume that you do NOT have API keys to use external services.
+- Before you are done, make sure that the keys of the answer.json you write match the ones in the task specified by the user. Refine your results if they do not.
+- Also before you are done, make sure that the values of the answer.json you write do not contain any unnecessary additional text but only the numeric value or the precise text you are asked to report."""
+
+COREBENCH_ANSWER_FORMAT = """\
+CRITICAL - answer.json format:
+answer.json must contain ONLY the answers to the specific questions listed in the task.
+The keys must be the EXACT question strings from the task prompt (the ones listed in dict_keys([...])).
+Do NOT include any other keys like "name", "description", "method", "results", etc.
+Do NOT write a project summary or analysis - ONLY answer the specific questions.
+
+Your answer.json must look EXACTLY like this:
+{{
+  {keys_example}
+}}
+
+Where each <answer> is the answer to that specific question (a number, string, or list).
+
+Other rules:
+1. The repository files are in /workspace/environment/ (code in ./environment/code/, data in ./environment/data/).
+2. Scripts using relative paths (like ../data/) must be run from the ./environment/code/ directory.
+3. For numeric values, use the EXACT value from the output - DO NOT round or truncate. If the answer is a percentage, omit the % and report the float.
+4. For questions starting with "fig ", you need to look at figures/plots generated by the code to answer.
+5. Write your final answer to /workspace/answer.json (at the top level, not inside environment/).
+Return only the JSON object content in answer.json; do not include markdown fences."""
+
+COREBENCH_INITIAL = """\
+{system}
+
+{answer_format}
+
+You are solving a CORE-bench task. The task files are in /workspace/environment/.
+Your goal is to answer the task prompt and write the JSON answer to /workspace/answer.json.
+
+Task prompt:
+{task_prompt}"""
+
+
+COREBENCH_RETRY_NO_FILE = """\
+{system}
+
+{answer_format}
+
+Previous attempt did not produce /workspace/answer.json.
+The task files are in /workspace/environment/. Retry now, fix any issues, and ensure you write /workspace/answer.json.
+
+Task prompt:
+{task_prompt}"""
+
+
+COREBENCH_RETRY_BAD_JSON = """\
+{system}
+
+{answer_format}
+
+Your last attempt wrote invalid JSON to /workspace/answer.json: {error}
+Please rewrite /workspace/answer.json with valid JSON only.
+
+Task prompt:
+{task_prompt}"""
+
+
+def build_keys_example(expected_keys: List[str]) -> str:
+    """Render the example key/value pairs for the answer-format prompt, e.g. '"q1": <answer>, "q2": <answer>'."""
+    if expected_keys:
+        return ", ".join(f'"{k}": <answer>' for k in expected_keys)
+    return '"<question>": <answer>'
+
+
+def build_corebench_prompts(task_prompt: str, expected_keys: List[str]):
+    """Return (initial_prompt, continue_prompt_no_file, continue_prompt_bad_json_template).
+
+    The bad-json template still has ``{error}`` as a placeholder to be filled at call time.
+ """ + keys_example = build_keys_example(expected_keys) + answer_format = COREBENCH_ANSWER_FORMAT.format(keys_example=keys_example) + + common = dict(system=COREBENCH_SYSTEM, answer_format=answer_format, task_prompt=task_prompt) + + initial = COREBENCH_INITIAL.format(**common) + retry_no_file = COREBENCH_RETRY_NO_FILE.format(**common) + # keep {error} as a late-binding placeholder + retry_bad_json_tpl = COREBENCH_RETRY_BAD_JSON.format_map({**common, "error": "{error}"}) + + return initial, retry_no_file, retry_bad_json_tpl diff --git a/agents/opencode_agent/requirements.txt b/agents/opencode_agent/requirements.txt new file mode 100644 index 0000000..e69de29 diff --git a/hal/agent_runner.py b/hal/agent_runner.py index de7eab3..9da497f 100644 --- a/hal/agent_runner.py +++ b/hal/agent_runner.py @@ -40,6 +40,7 @@ def __init__( variation_index: Optional[int] = None, results_dir: str = "results", task_ids: Optional[str] = None, + benchmark_args: Optional[Dict[str, Any]] = None, ): # Validate agent_function format if not isinstance(agent_function, str) or "." 
not in agent_function: @@ -72,7 +73,7 @@ def __init__( ) # Initialize benchmark first - self.benchmark_manager = BenchmarkManager(agent_dir, config) + self.benchmark_manager = BenchmarkManager(agent_dir, config, benchmark_args=benchmark_args or {}) self.benchmark = self.benchmark_manager.get_benchmark(benchmark_name) self.benchmark.agent_args = agent_args diff --git a/hal/benchmark_manager.py b/hal/benchmark_manager.py index 5c25562..3a45494 100644 --- a/hal/benchmark_manager.py +++ b/hal/benchmark_manager.py @@ -10,10 +10,12 @@ def __init__( agent_dir: str = "agent/", config: Optional[Dict[str, Any]] = {}, agent_args: Optional[Dict[str, Any]] = {}, + benchmark_args: Optional[Dict[str, Any]] = {}, ): self.config = config self.agent_dir = agent_dir self.agent_args = agent_args + self.benchmark_args = benchmark_args or {} self.benchmarks = [ "gaia", "scicode", @@ -71,15 +73,18 @@ def get_benchmark(self, benchmark_name: str) -> BaseBenchmark: elif benchmark_name == "corebench_easy": from .benchmarks.corebench import CoreBenchEasy - benchmark = CoreBenchEasy(self.agent_dir, self.config) + use_updated = str(self.benchmark_args.get('use_updated', '')).lower() in ('true', '1', 'yes') + benchmark = CoreBenchEasy(self.agent_dir, self.config, use_updated=use_updated) elif benchmark_name == "corebench_medium": from .benchmarks.corebench import CoreBenchMedium - benchmark = CoreBenchMedium(self.agent_dir, self.config) + use_updated = str(self.benchmark_args.get('use_updated', '')).lower() in ('true', '1', 'yes') + benchmark = CoreBenchMedium(self.agent_dir, self.config, use_updated=use_updated) elif benchmark_name == "corebench_hard": from .benchmarks.corebench import CoreBenchHard - benchmark = CoreBenchHard(self.agent_dir, self.config) + use_updated = str(self.benchmark_args.get('use_updated', '')).lower() in ('true', '1', 'yes') + benchmark = CoreBenchHard(self.agent_dir, self.config, use_updated=use_updated) elif benchmark_name == "scienceagentbench": from 
.benchmarks.scienceagentbench import ScienceAgentBench diff --git a/hal/benchmarks/corebench.py b/hal/benchmarks/corebench.py index 07bd96c..223c171 100644 --- a/hal/benchmarks/corebench.py +++ b/hal/benchmarks/corebench.py @@ -18,15 +18,24 @@ class CoreBench(BaseBenchmark): """Base class for CoreBench benchmarks of different difficulty levels""" - def __init__(self, agent_dir: str, config: Dict[str, Any]): + def __init__(self, agent_dir: str, config: Dict[str, Any], use_updated: bool = False): # Set benchmark_name in subclasses - # Load tasks from core_test.json - core_test_path = os.path.join( - os.path.dirname(__file__), "corebench", "core_test.json" - ) + # Choose dataset file based on use_updated flag + if use_updated: + core_test_path = os.path.join( + os.path.dirname(__file__), "corebench", "core_test_updated.json" + ) + if not os.path.exists(core_test_path): + raise FileNotFoundError( + f"Updated dataset not found at {core_test_path}" + ) + else: + core_test_path = os.path.join( + os.path.dirname(__file__), "corebench", "core_test.json" + ) - # Check if core_test.json exists, if not, throw an error with instructions to decrypt + # Check if the dataset file exists, if not, throw an error with instructions to decrypt if not os.path.exists(core_test_path): encrypted_file = os.path.join( os.path.dirname(__file__), "corebench", "core_test.json.gpg" @@ -38,6 +47,7 @@ def __init__(self, agent_dir: str, config: Dict[str, Any]): f'Have you decrypted core_test.json.gpg? Use the following command:\n{decrypt_command}. The password is "reproducibility".' 
) + logger.info(f"Loading CoreBench dataset from {core_test_path} (use_updated={use_updated})") with open(core_test_path, "r") as f: dataset = json.load(f) @@ -438,9 +448,9 @@ def get_metrics(self, eval_results: Dict[str, Any]) -> Dict[str, Any]: class CoreBenchEasy(CoreBench): """CoreBench benchmark with easy difficulty level""" - def __init__(self, agent_dir: str, config: Dict[str, Any]): + def __init__(self, agent_dir: str, config: Dict[str, Any], use_updated: bool = False): self.benchmark_name = "corebench_easy" - super().__init__(agent_dir, config) + super().__init__(agent_dir, config, use_updated=use_updated) def _construct_prompt(self, task): """ @@ -473,9 +483,9 @@ def _get_capsule_files_dict(self, capsule_dir: str) -> Dict[str, str]: class CoreBenchMedium(CoreBench): """CoreBench benchmark with medium difficulty level""" - def __init__(self, agent_dir: str, config: Dict[str, Any]): + def __init__(self, agent_dir: str, config: Dict[str, Any], use_updated: bool = False): self.benchmark_name = "corebench_medium" - super().__init__(agent_dir, config) + super().__init__(agent_dir, config, use_updated=use_updated) def _construct_prompt(self, task): """ @@ -524,9 +534,9 @@ def _get_capsule_files_dict(self, capsule_dir: str) -> Dict[str, str]: class CoreBenchHard(CoreBench): """CoreBench benchmark with hard difficulty level""" - def __init__(self, agent_dir: str, config: Dict[str, Any]): + def __init__(self, agent_dir: str, config: Dict[str, Any], use_updated: bool = False): self.benchmark_name = "corebench_hard" - super().__init__(agent_dir, config) + super().__init__(agent_dir, config, use_updated=use_updated) def _construct_prompt(self, task): """ diff --git a/hal/cli.py b/hal/cli.py index 53708dd..67b6cdc 100644 --- a/hal/cli.py +++ b/hal/cli.py @@ -275,6 +275,7 @@ def main( task_timeout=task_timeout, results_dir=results_dir, task_ids=task_ids, + benchmark_args=benchmark_args, ) # Run evaluation diff --git a/hal/utils/setup_vm.sh b/hal/utils/setup_vm.sh index 
7ab1df4..a53116c 100644 --- a/hal/utils/setup_vm.sh +++ b/hal/utils/setup_vm.sh @@ -9,6 +9,45 @@ HOME_DIR="/home/agent" echo "Starting setup for user: agent" +# Grant passwordless sudo to the agent user +echo "agent ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/agent +chmod 440 /etc/sudoers.d/agent + +# Create /workspace owned by agent user +mkdir -p /workspace +chown agent:agent /workspace + +# Install Miniconda +echo "Installing Miniconda..." +MINICONDA_PATH="/home/agent/miniconda3" +curl -o /tmp/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh +bash /tmp/miniconda.sh -b -p $MINICONDA_PATH +rm /tmp/miniconda.sh +echo "Miniconda installed" + +# Set ownership +echo "Setting Miniconda ownership..." +chown -R agent:agent $MINICONDA_PATH + +# Create conda initialization script +echo "Creating conda initialization script..." +cat > "$HOME_DIR/init_conda.sh" << EOF +#!/bin/bash +export PATH="$MINICONDA_PATH/bin:\$PATH" +eval "\$($MINICONDA_PATH/bin/conda shell.bash hook)" +conda init +if [ -f "\$HOME/.env" ]; then + set -a + source "\$HOME/.env" + set +a +fi +EOF + +# Make initialization script executable and set ownership +chmod +x "$HOME_DIR/init_conda.sh" +chown agent:agent "$HOME_DIR/init_conda.sh" + + # Create and activate environment as the agent user with explicit output echo "Creating conda environment..." 
# FIXME: stop installing pinned dependencies like this @@ -21,9 +60,8 @@ su - agent -c "bash -c '\ if [ -f requirements.txt ]; then \ echo \"Installing requirements...\" && \ pip install -r requirements.txt && \ - echo \"Installing weave and gql pin...\" && \ - pip install weave==0.51.41 \"gql<4\" && \ - echo \"Installing Azure VM dependencies...\" && \ + echo \"Installing weave, wandb, gql pin, and Azure VM dependencies...\" && \ + pip install weave==0.51.41 wandb==0.23.0 \"gql<4\" && \ pip install \"azure-identity>=1.12.0\" \"requests>=2.31.0\" && \ echo \"Requirements installed\"; \ else \ diff --git a/scripts/test_opencode_install.py b/scripts/test_opencode_install.py new file mode 100644 index 0000000..16bcd04 --- /dev/null +++ b/scripts/test_opencode_install.py @@ -0,0 +1,741 @@ +#!/usr/bin/env python3 +""" +OpenCode Agent – Installation & Smoke-Test Script +================================================== + +Reusable diagnostic script that verifies the OpenCode CLI installs correctly +on a clean Linux environment and (optionally) executes a sample prompt. + +Designed to run inside a Docker container or any Debian/Ubuntu-based system. +Output is both human-readable (coloured terminal) and machine-parseable (--json). + +Phases +------ +1. system-deps – apt-get install git curl procps +2. install-cli – run the official opencode installer +3. verify-binary – locate binary, print version/help +4. write-config – write .opencode/config.json with full permissions +5. sample-query – (optional, needs API key) run a trivial prompt + +Examples +-------- + # Install-only (no API key required): + python test_opencode_install.py + + # Full end-to-end with a sample query: + ANTHROPIC_API_KEY=sk-... 
python test_opencode_install.py \\ + --run-query --model anthropic/claude-sonnet-4-20250514 + + # Custom prompt, verbose, JSON report: + python test_opencode_install.py --run-query -v --json \\ + --prompt 'Write "hello" to /workspace/test.txt' + + # Inside Docker via the companion shell script: + ./test_opencode_install.sh --run-query +""" + +from __future__ import annotations + +import argparse +import json +import logging +import os +import platform +import shutil +import subprocess +import sys +import textwrap +import time +from dataclasses import asdict, dataclass, field +from pathlib import Path +from typing import Any, Dict, List, Optional + +from opencode_setup import ( + KNOWN_BINARY_LOCATIONS, + OPENCODE_CONFIG, + OPENCODE_INSTALL_CMD, + find_opencode_binary, + install_system_deps, + setup_global_config, + setup_opencode_config, +) + + +# ── Constants ──────────────────────────────────────────────── + +DEFAULT_SMOKE_PROMPT = ( + 'Write the JSON object {"smoke_test": "ok", "value": 42} ' + "to /workspace/smoke_answer.json. " + "Write ONLY the JSON, no markdown fences." 
+) + + +# ── Logging ────────────────────────────────────────────────── + +LOG_FMT = "%(asctime)s │ %(levelname)-5s │ %(message)s" +LOG_DATE = "%H:%M:%S" + +# ANSI helpers (disabled when not a tty) +_USE_COLOR = hasattr(sys.stdout, "isatty") and sys.stdout.isatty() + + +def _c(code: str, text: str) -> str: + return f"\033[{code}m{text}\033[0m" if _USE_COLOR else text + + +def _green(t: str) -> str: + return _c("32", t) + + +def _red(t: str) -> str: + return _c("31", t) + + +def _yellow(t: str) -> str: + return _c("33", t) + + +def _bold(t: str) -> str: + return _c("1", t) + + +def _dim(t: str) -> str: + return _c("2", t) + + +def setup_logging(verbose: bool) -> logging.Logger: + level = logging.DEBUG if verbose else logging.INFO + logging.basicConfig(format=LOG_FMT, datefmt=LOG_DATE, level=level, stream=sys.stdout) + return logging.getLogger("opencode-test") + + +# ── Data Structures ────────────────────────────────────────── + + +@dataclass +class StepLog: + """One sub-step inside a phase.""" + action: str + ok: bool + output: str = "" + duration_s: float = 0.0 + + +@dataclass +class PhaseResult: + name: str + status: str # "pass" | "fail" | "skip" + duration_s: float = 0.0 + details: str = "" + error: str = "" + steps: List[StepLog] = field(default_factory=list) + + +@dataclass +class TestReport: + phases: List[PhaseResult] = field(default_factory=list) + overall: str = "pass" + opencode_binary: str = "" + environment: Dict[str, Any] = field(default_factory=dict) + + def add(self, result: PhaseResult) -> None: + self.phases.append(result) + if result.status == "fail": + self.overall = "fail" + + +# ── Helpers ────────────────────────────────────────────────── + + +def run_cmd( + cmd, + log: logging.Logger, + *, + timeout: int = 300, + check: bool = False, + shell: bool = False, + env: Optional[dict] = None, + cwd: Optional[str] = None, +) -> subprocess.CompletedProcess: + """Run a subprocess with full logging.""" + display = cmd if isinstance(cmd, str) else " 
".join(cmd) + log.debug(f" $ {_dim(display)}") + + t0 = time.monotonic() + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=timeout, + check=check, + shell=shell, + env=env, + cwd=cwd, + ) + except subprocess.TimeoutExpired: + log.error(f" ⏱ Command timed out after {timeout}s: {display}") + raise + except subprocess.CalledProcessError as e: + log.error(f" ✗ Command failed (rc={e.returncode}): {display}") + if e.stdout: + for line in e.stdout.strip().splitlines()[:30]: + log.debug(f" stdout: {line}") + if e.stderr: + for line in e.stderr.strip().splitlines()[:30]: + log.debug(f" stderr: {line}") + raise + + elapsed = time.monotonic() - t0 + + if result.stdout.strip(): + for line in result.stdout.strip().splitlines()[:40]: + log.debug(f" stdout: {line}") + if result.stderr.strip(): + for line in result.stderr.strip().splitlines()[:40]: + log.debug(f" stderr: {line}") + + log.debug(f" exit={result.returncode} elapsed={elapsed:.1f}s") + return result + + +def collect_environment() -> Dict[str, Any]: + """Collect environment info for the report.""" + return { + "python": sys.version, + "platform": platform.platform(), + "arch": platform.machine(), + "user": os.environ.get("USER", os.environ.get("LOGNAME", "unknown")), + "uid": os.getuid() if hasattr(os, "getuid") else None, + "cwd": os.getcwd(), + "has_openai_key": bool(os.environ.get("OPENAI_API_KEY")), + "has_anthropic_key": bool(os.environ.get("ANTHROPIC_API_KEY")), + "path": os.environ.get("PATH", ""), + } + + +# ── Phase Implementations ──────────────────────────────────── + + +def phase_system_deps(log: logging.Logger) -> PhaseResult: + """Phase 1: Install system dependencies (git, curl, procps).""" + log.info(_bold("Phase 1: System Dependencies")) + t0 = time.monotonic() + steps: List[StepLog] = [] + + # Check what's already available + for binary in ("git", "curl", "ps"): + path = shutil.which(binary) + if path: + log.info(f" ✓ {binary} already available: {path}") + 
steps.append(StepLog(f"check-{binary}", True, path)) + else: + log.info(f" ✗ {binary} not found, will install") + steps.append(StepLog(f"check-{binary}", False, "not found")) + + # Install system deps via shared helper + try: + log.info(" Installing system dependencies ...") + st = time.monotonic() + install_system_deps() + steps.append(StepLog("install-system-deps", True, "", time.monotonic() - st)) + except Exception as e: + return PhaseResult("system-deps", "fail", time.monotonic() - t0, + error=str(e), steps=steps) + + # Verify installed + all_ok = True + for binary in ("git", "curl", "ps"): + path = shutil.which(binary) + if path: + log.info(f" ✓ {binary} → {path}") + steps.append(StepLog(f"verify-{binary}", True, path)) + else: + log.error(f" ✗ {binary} still not found after install!") + steps.append(StepLog(f"verify-{binary}", False)) + all_ok = False + + elapsed = time.monotonic() - t0 + status = "pass" if all_ok else "fail" + log.info(f" Phase 1: {_green('PASS') if all_ok else _red('FAIL')} ({elapsed:.1f}s)") + return PhaseResult("system-deps", status, elapsed, steps=steps) + + +def phase_install_cli(log: logging.Logger) -> PhaseResult: + """Phase 2: Install OpenCode CLI via official installer.""" + log.info(_bold("Phase 2: Install OpenCode CLI")) + t0 = time.monotonic() + steps: List[StepLog] = [] + + # Check if already installed + existing = find_opencode_binary() + if existing: + log.info(f" OpenCode already installed: {existing}") + steps.append(StepLog("pre-check", True, existing)) + elapsed = time.monotonic() - t0 + log.info(f" Phase 2: {_green('PASS (already installed)')} ({elapsed:.1f}s)") + return PhaseResult("install-cli", "pass", elapsed, details=existing, steps=steps) + + # Run installer + log.info(f" Running: {OPENCODE_INSTALL_CMD}") + try: + st = time.monotonic() + r = run_cmd(OPENCODE_INSTALL_CMD, log, shell=True, timeout=300) + install_elapsed = time.monotonic() - st + steps.append(StepLog("run-installer", r.returncode == 0, + 
r.stdout[-2000:] if r.stdout else "", install_elapsed)) + + if r.returncode != 0: + return PhaseResult("install-cli", "fail", time.monotonic() - t0, + error=f"Installer exited {r.returncode}\nstderr: {r.stderr[:1000]}", + steps=steps) + except subprocess.TimeoutExpired: + return PhaseResult("install-cli", "fail", time.monotonic() - t0, + error="Installer timed out (300s)", steps=steps) + except Exception as e: + return PhaseResult("install-cli", "fail", time.monotonic() - t0, + error=str(e), steps=steps) + + # Locate binary after install + binary = find_opencode_binary() + if not binary: + # Extra diagnostics + log.error(" Binary not found after install. Searching filesystem ...") + try: + r = run_cmd(["find", "/", "-name", "opencode", "-type", "f"], + log, timeout=30) + if r.stdout.strip(): + log.info(f" find results:\n{r.stdout.strip()}") + steps.append(StepLog("find-binary", True, r.stdout.strip())) + else: + steps.append(StepLog("find-binary", False, "nothing found")) + except Exception: + pass + + return PhaseResult("install-cli", "fail", time.monotonic() - t0, + error="opencode binary not found after install completed", + steps=steps) + + steps.append(StepLog("locate-binary", True, binary)) + elapsed = time.monotonic() - t0 + log.info(f" Phase 2: {_green('PASS')} ({elapsed:.1f}s) → {binary}") + return PhaseResult("install-cli", "pass", elapsed, details=binary, steps=steps) + + +def phase_verify_binary(log: logging.Logger, binary: str) -> PhaseResult: + """Phase 3: Verify binary responds to basic commands.""" + log.info(_bold("Phase 3: Verify Binary")) + t0 = time.monotonic() + steps: List[StepLog] = [] + + if not binary: + log.warning(" Skipped (no binary path)") + return PhaseResult("verify-binary", "skip", 0, error="no binary") + + # File info + try: + stat = os.stat(binary) + log.info(f" Binary: {binary}") + log.info(f" Size: {stat.st_size:,} bytes") + log.info(f" Mode: {oct(stat.st_mode)}") + steps.append(StepLog("stat", True, f"size={stat.st_size} 
mode={oct(stat.st_mode)}")) + except Exception as e: + steps.append(StepLog("stat", False, str(e))) + + # file(1) type check + try: + r = run_cmd(["file", binary], log, timeout=10) + file_type = r.stdout.strip() + log.info(f" Type: {file_type}") + steps.append(StepLog("file-type", True, file_type)) + except Exception: + steps.append(StepLog("file-type", False, "file command not available")) + + # Try --version + version_output = "" + for flag in ["--version", "version", "-v"]: + try: + r = run_cmd([binary, flag], log, timeout=15) + if r.returncode == 0 and r.stdout.strip(): + version_output = r.stdout.strip().splitlines()[0] + log.info(f" Version ({flag}): {version_output}") + steps.append(StepLog("version", True, version_output)) + break + except Exception: + continue + + if not version_output: + log.info(" Version: could not determine (not fatal)") + steps.append(StepLog("version", True, "undetermined")) + + # Try --help or help + help_ok = False + for flag in ["--help", "help", "-h"]: + try: + r = run_cmd([binary, flag], log, timeout=15) + combined = (r.stdout + r.stderr).lower() + if "usage" in combined or "run" in combined or "opencode" in combined: + help_ok = True + log.info(f" Help ({flag}): responds with usage info") + steps.append(StepLog("help", True, f"{flag} works")) + break + except Exception: + continue + + if not help_ok: + log.warning(" Help: no recognisable usage output (not fatal)") + steps.append(StepLog("help", True, "no usage output")) + + elapsed = time.monotonic() - t0 + log.info(f" Phase 3: {_green('PASS')} ({elapsed:.1f}s)") + return PhaseResult("verify-binary", "pass", elapsed, details=version_output, steps=steps) + + +def phase_write_config(log: logging.Logger, workspace: str) -> PhaseResult: + """Phase 4: Init git repo, write project + global config.""" + log.info(_bold("Phase 4: Write Config & Init Workspace")) + t0 = time.monotonic() + steps: List[StepLog] = [] + + # ── 4a. 
Initialize git repo (opencode requires it) ── + try: + log.info(" Initializing git repo in workspace ...") + r = run_cmd(["git", "init", workspace], log, timeout=10) + steps.append(StepLog("git-init", r.returncode == 0, workspace)) + if r.returncode == 0: + # Create an initial commit so HEAD exists + run_cmd(["git", "-C", workspace, "config", "user.email", "test@test.com"], log, timeout=5) + run_cmd(["git", "-C", workspace, "config", "user.name", "Test"], log, timeout=5) + run_cmd(["git", "-C", workspace, "commit", "--allow-empty", "-m", "init"], log, timeout=10) + log.info(" ✓ Git repo initialized with initial commit") + steps.append(StepLog("git-commit", True, "initial commit")) + except Exception as e: + log.warning(f" Git init failed (non-fatal): {e}") + steps.append(StepLog("git-init", False, str(e))) + + # ── 4b. Write project-level config (.opencode/config.json) ── + try: + config_path = setup_opencode_config(workspace) + steps.append(StepLog("write-project-config", True, config_path)) + + # Verify it reads back correctly + with open(config_path, "r") as f: + loaded = json.load(f) + assert loaded == OPENCODE_CONFIG, "config round-trip mismatch" + steps.append(StepLog("verify-project-config", True, f"{len(loaded['permission'])} permission keys")) + + log.info(f" Project config written: {config_path}") + log.info(f" Permissions: {len(loaded['permission'])} keys, all 'allow'") + except Exception as e: + log.error(f" Failed writing project config: {e}") + return PhaseResult("write-config", "fail", time.monotonic() - t0, + error=str(e), steps=steps) + + # ── 4c. 
Write global config (~/.config/opencode/config.json) ── + try: + global_config_path = setup_global_config() + steps.append(StepLog("write-global-config", True, global_config_path)) + log.info(f" Global config written: {global_config_path}") + except Exception as e: + log.warning(f" Global config write failed (non-fatal): {e}") + steps.append(StepLog("write-global-config", False, str(e))) + + elapsed = time.monotonic() - t0 + log.info(f" Phase 4: {_green('PASS')} ({elapsed:.1f}s)") + return PhaseResult("write-config", "pass", elapsed, details=config_path, steps=steps) + + +def phase_sample_query( + log: logging.Logger, + binary: str, + model: str, + workspace: str, + prompt: Optional[str] = None, +) -> PhaseResult: + """Phase 5: Run a sample prompt and verify the output file.""" + log.info(_bold("Phase 5: Sample Query")) + t0 = time.monotonic() + steps: List[StepLog] = [] + + if not binary: + return PhaseResult("sample-query", "skip", 0, error="no binary") + + # Check for API key + has_key = bool( + os.environ.get("OPENAI_API_KEY") or os.environ.get("ANTHROPIC_API_KEY") + ) + if not has_key: + log.warning(" No API key found (OPENAI_API_KEY / ANTHROPIC_API_KEY)") + log.warning(" Skipping sample query – set an API key to enable this phase") + return PhaseResult("sample-query", "skip", 0, + error="no API key in environment", steps=steps) + + prompt = prompt or DEFAULT_SMOKE_PROMPT + answer_file = os.path.join(workspace, "smoke_answer.json") + + # Clean up previous run + if os.path.exists(answer_file): + os.remove(answer_file) + + env = os.environ.copy() + env["OPENCODE_EXPERIMENTAL_BASH_DEFAULT_TIMEOUT_MS"] = "120000" + + cmd = [ + binary, + "run", + "--model", model, + "--print-logs", + prompt, + ] + + log.info(f" Model: {model}") + log.info(f" Prompt: {prompt[:120]}{'...' 
if len(prompt) > 120 else ''}") + log.info(f" Answer: {answer_file}") + log.info(f" Command: {' '.join(cmd)}") + log.info(" Running opencode (output goes to opencode_exec.log) ...") + + exec_log_path = os.path.join(workspace, "opencode_exec.log") + + try: + st = time.monotonic() + with open(exec_log_path, "w") as log_f: + result = subprocess.run( + cmd, + stdout=log_f, + stderr=subprocess.STDOUT, + env=env, + text=True, + timeout=300, + check=False, + cwd=workspace, + ) + query_elapsed = time.monotonic() - st + + # Read and display the log + exec_output = "" + if os.path.exists(exec_log_path): + with open(exec_log_path, "r") as f: + exec_output = f.read() + if exec_output.strip(): + log.info(f" opencode output ({len(exec_output)} chars):") + for line in exec_output.strip().splitlines()[:50]: + log.debug(f" | {line}") + if len(exec_output.strip().splitlines()) > 50: + log.debug(f" ... ({len(exec_output.strip().splitlines()) - 50} more lines)") + + steps.append(StepLog("run-query", result.returncode == 0, + f"rc={result.returncode}", query_elapsed)) + + log.info(f" CLI exited with code {result.returncode} in {query_elapsed:.1f}s") + + if result.returncode != 0: + log.error(f" Output (last 1000 chars): {exec_output[-1000:]}") + steps.append(StepLog("cli-exit", False, exec_output[-1000:])) + except subprocess.TimeoutExpired: + # Read partial output for debugging + if os.path.exists(exec_log_path): + with open(exec_log_path, "r") as f: + partial = f.read() + log.error(f" Timed out. 
Partial output ({len(partial)} chars):") + for line in partial.strip().splitlines()[-30:]: + log.error(f" | {line}") + steps.append(StepLog("timeout-output", False, partial[-2000:])) + return PhaseResult("sample-query", "fail", time.monotonic() - t0, + error="query timed out (300s)", steps=steps) + except Exception as e: + return PhaseResult("sample-query", "fail", time.monotonic() - t0, + error=str(e), steps=steps) + + # Check answer file + if not os.path.exists(answer_file): + log.error(f" ✗ Answer file not created: {answer_file}") + # List workspace for debugging + try: + contents = os.listdir(workspace) + log.info(f" Workspace contents: {contents}") + steps.append(StepLog("list-workspace", True, str(contents))) + except Exception: + pass + return PhaseResult("sample-query", "fail", time.monotonic() - t0, + error=f"answer file not created: {answer_file}", steps=steps) + + # Read and validate + try: + with open(answer_file, "r") as f: + raw = f.read() + log.info(f" Answer file content: {raw.strip()[:200]}") + answer = json.loads(raw) + steps.append(StepLog("read-answer", True, json.dumps(answer))) + + if answer.get("smoke_test") == "ok": + log.info(" ✓ Smoke test value verified: smoke_test=ok") + steps.append(StepLog("verify-answer", True, "smoke_test=ok")) + else: + log.warning(f" Answer has unexpected content: {answer}") + steps.append(StepLog("verify-answer", True, f"unexpected: {answer}")) + except json.JSONDecodeError as e: + log.error(f" ✗ Answer is not valid JSON: {e}") + log.error(f" Raw content: {raw[:300]}") + steps.append(StepLog("read-answer", False, f"invalid JSON: {e}")) + return PhaseResult("sample-query", "fail", time.monotonic() - t0, + error=f"invalid JSON: {e}", steps=steps) + except Exception as e: + return PhaseResult("sample-query", "fail", time.monotonic() - t0, + error=str(e), steps=steps) + + elapsed = time.monotonic() - t0 + log.info(f" Phase 5: {_green('PASS')} ({elapsed:.1f}s)") + return PhaseResult("sample-query", "pass", elapsed, + 
details=json.dumps(answer), steps=steps) + + +# ── Summary ────────────────────────────────────────────────── + + +def print_summary(report: TestReport) -> None: + """Print a human-readable summary table.""" + print() + print(_bold("═" * 60)) + print(_bold(" OpenCode Install Test – Summary")) + print(_bold("═" * 60)) + + for phase in report.phases: + if phase.status == "pass": + icon = _green("PASS") + elif phase.status == "fail": + icon = _red("FAIL") + else: + icon = _yellow("SKIP") + + line = f" {icon} {phase.name:<20s} {phase.duration_s:6.1f}s" + if phase.details: + line += f" {_dim(phase.details[:40])}" + print(line) + if phase.error: + print(f" {_red('Error: ' + phase.error[:70])}") + + print(_bold("─" * 60)) + overall = _green("ALL PASSED") if report.overall == "pass" else _red("FAILED") + binary_info = report.opencode_binary or "not found" + print(f" Overall: {overall}") + print(f" Binary: {binary_info}") + total = sum(p.duration_s for p in report.phases) + print(f" Total: {total:.1f}s") + print(_bold("═" * 60)) + print() + + +# ── Entry Point ────────────────────────────────────────────── + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Test OpenCode CLI installation and basic functionality.", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=textwrap.dedent("""\ + examples: + %(prog)s # install-only test + %(prog)s --run-query # full test (needs API key) + %(prog)s --run-query --json # JSON report + %(prog)s -v # verbose logging + """), + ) + parser.add_argument( + "--run-query", action="store_true", + help="Run phase 5: execute a sample prompt (requires API key)", + ) + parser.add_argument( + "--model", default="anthropic/claude-sonnet-4-20250514", + help="Model name for sample query (default: %(default)s)", + ) + parser.add_argument( + "--prompt", default=None, + help="Custom prompt for sample query", + ) + parser.add_argument( + "--workspace", default="/workspace", + help="Working directory (default: 
/workspace)", + ) + parser.add_argument( + "-v", "--verbose", action="store_true", + help="Enable debug-level logging", + ) + parser.add_argument( + "--json", action="store_true", dest="json_output", + help="Print machine-readable JSON report at the end", + ) + parser.add_argument( + "--skip-deps", action="store_true", + help="Skip phase 1 (system deps) – useful if already installed", + ) + + args = parser.parse_args() + log = setup_logging(args.verbose) + + # Ensure workspace exists + os.makedirs(args.workspace, exist_ok=True) + + report = TestReport(environment=collect_environment()) + + log.info(_bold("OpenCode Agent – Installation & Smoke Test")) + log.info(f" Workspace: {args.workspace}") + log.info(f" Platform: {platform.platform()}") + log.info(f" Python: {sys.version.split()[0]}") + log.info("") + + # ── Phase 1 ── + if args.skip_deps: + report.add(PhaseResult("system-deps", "skip", 0, details="--skip-deps")) + else: + report.add(phase_system_deps(log)) + if report.overall == "fail": + log.error("Phase 1 failed – cannot continue") + print_summary(report) + if args.json_output: + print(json.dumps(asdict(report), indent=2, default=str)) + return 1 + + print() + + # ── Phase 2 ── + result2 = phase_install_cli(log) + report.add(result2) + binary_path = result2.details if result2.status == "pass" else "" + report.opencode_binary = binary_path + + if result2.status == "fail": + log.error("Phase 2 failed – cannot continue") + print_summary(report) + if args.json_output: + print(json.dumps(asdict(report), indent=2, default=str)) + return 1 + + print() + + # ── Phase 3 ── + report.add(phase_verify_binary(log, binary_path)) + print() + + # ── Phase 4 ── + report.add(phase_write_config(log, args.workspace)) + print() + + # ── Phase 5 ── + if args.run_query: + report.add(phase_sample_query(log, binary_path, args.model, + args.workspace, args.prompt)) + else: + log.info(_bold("Phase 5: Sample Query")) + log.info(" Skipped (use --run-query to enable)") + 
report.add(PhaseResult("sample-query", "skip", 0, details="use --run-query")) + + # ── Report ── + print_summary(report) + if args.json_output: + print(json.dumps(asdict(report), indent=2, default=str)) + + return 0 if report.overall == "pass" else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/test_opencode_install.sh b/scripts/test_opencode_install.sh new file mode 100755 index 0000000..7df33b5 --- /dev/null +++ b/scripts/test_opencode_install.sh @@ -0,0 +1,57 @@ +#!/usr/bin/env bash +# +# Run the OpenCode install test inside a clean Docker container. +# +# Usage: +# ./test_opencode_install.sh # install-only +# ./test_opencode_install.sh --run-query # full test (needs API key) +# ./test_opencode_install.sh --run-query --json # JSON report +# ./test_opencode_install.sh -v # verbose +# +# API keys are forwarded from the host environment: +# OPENAI_API_KEY=sk-... ./test_opencode_install.sh --run-query +# ANTHROPIC_API_KEY=... ./test_opencode_install.sh --run-query +# +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="${SCRIPT_DIR}/.." 
+IMAGE_NAME="opencode-install-test" + +# Assemble a minimal build context (avoids sending the entire repo to the daemon) +BUILD_CTX="$(mktemp -d)" +trap 'rm -rf "${BUILD_CTX}"' EXIT +cp "${SCRIPT_DIR}/test_opencode_install.py" "${BUILD_CTX}/" +cp "${REPO_ROOT}/agents/opencode_agent/opencode_setup.py" "${BUILD_CTX}/" + +echo "══════════════════════════════════════════════════════════" +echo " Building Docker image: ${IMAGE_NAME}" +echo "══════════════════════════════════════════════════════════" + +DOCKER_BUILDKIT=1 docker build -t "${IMAGE_NAME}" -f - "${BUILD_CTX}" <<'DOCKERFILE' +FROM python:3.12-slim + +# Minimal bootstrap – the test script installs everything else +RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates && \ + rm -rf /var/lib/apt/lists/* + +WORKDIR /workspace + +COPY test_opencode_install.py /test_opencode_install.py +COPY opencode_setup.py /opencode_setup.py + +# -u = unbuffered stdout so logs stream in real-time +ENTRYPOINT ["python", "-u", "/test_opencode_install.py"] +DOCKERFILE + +echo "" +echo "══════════════════════════════════════════════════════════" +echo " Running test container" +echo "══════════════════════════════════════════════════════════" +echo "" + +docker run --rm \ + --network=host \ + -e OPENAI_API_KEY="${OPENAI_API_KEY:-}" \ + -e ANTHROPIC_API_KEY="${ANTHROPIC_API_KEY:-}" \ + "${IMAGE_NAME}" "$@"