diff --git a/README.md b/README.md index 1a41a38..0741981 100644 --- a/README.md +++ b/README.md @@ -80,7 +80,7 @@ We also generate multimodal skin based on the rules of the same maze. Generated - [ ] Add Multimodal Environment Generation Pipelines. - [ ] Add Three-stage Verification Pipeline for both text and multimodal environments. - [ ] Add Learning Experiments Scripts. -- [ ] Add Coding Agents Option: Codex, Gemini Cli. +- [x] Add Coding Agents Option: Codex, Claude Agent SDK. - [ ] Add 3D Environment Generation Pipelines. ## Repository Layout @@ -129,6 +129,200 @@ python run_environment_skin_generation.py Cost summaries are automatically saved to `workspace/costs/`. +## Coding Agents + +AutoEnv supports multiple coding agent backends for environment code generation and fixing. The code agent is used in the pipeline's CodeFixNode, LevelGenNode, and MaxRewardNode stages. + +### Backend Options + +| Backend | Description | Best For | +| --------- | ---------------------------------------------------------------------------- | ----------------------------------- | +| `miniswe` | Default [mini-swe-agent](https://github.com/SWE-agent/mini-swe-agent) | General use, works with any LLM | +| `codex` | OpenAI [Codex CLI](https://github.com/openai/codex) | OpenAI API users, fast execution | +| `claude` | Anthropic [Claude Agent SDK](https://github.com/anthropics/claude-agent-sdk-python) | Anthropic API users, custom proxies | + +### Configuration + +Set the `code_agent_backend` field in `config/env_gen.yaml`: + +```yaml +# Code agent backend: "miniswe" (default), "codex", "claude" +code_agent_backend: "codex" # or "claude" or "miniswe" +``` + +### MiniSWE Agent (Default) + +The default agent is built on mini-swe-agent. It works with any LLM configured in `config/model_config.yaml`. + +```yaml +# config/env_gen.yaml +code_agent_backend: "miniswe" +model: "gpt-4o" # or any configured LLM +``` + +No additional setup is required beyond the standard model configuration. 
+ +### Codex Agent + +Uses OpenAI's official Codex CLI for code generation. + +#### Prerequisites + +```bash +# Install Codex CLI +npm install -g @openai/codex +# or +brew install --cask codex + +# Authenticate (recommended) +codex login + +# Verify installation +codex whoami +``` + +#### Authentication Options + +**Option 1: CLI Login (Recommended)** + +```bash +codex login # Opens browser for OAuth +``` + +**Option 2: Environment Variable** + +```bash +export OPENAI_API_KEY=your-api-key +``` + +**Option 3: Custom Base URL** (for proxies) + +```bash +export OPENAI_API_KEY=your-api-key +export OPENAI_BASE_URL=https://your-proxy.example.com/v1 +``` + +#### Configuration + +```yaml +# config/env_gen.yaml +code_agent_backend: "codex" +``` + +### Claude Agent + +Uses Anthropic's Claude Agent SDK for code generation. + +#### Prerequisites + +```bash +# Install Claude Agent SDK +pip install claude-agent-sdk +``` + +The Python SDK authenticates via environment variables (see Authentication Options below). + +#### Authentication Options + +**Option 1: Environment Variables (Recommended)** + +```bash +export ANTHROPIC_API_KEY=your-api-key +``` + +**Option 2: Custom Base URL** (for proxies) + +```bash +export ANTHROPIC_API_KEY=your-api-key +export ANTHROPIC_BASE_URL=https://your-proxy.example.com/api +``` + +#### Configuration + +```yaml +# config/env_gen.yaml +code_agent_backend: "claude" +``` + +### Generate Environment with Code Agent + +#### Quick Start + +```bash +# 1. Copy example config +cp config/env_gen_example.yaml config/env_gen.yaml + +# 2. Edit config (set code_agent_backend, theme, etc.) +vim config/env_gen.yaml + +# 3. 
Run environment generation +python run_environment_generation.py +``` + +#### Example Configuration + +```yaml +# config/env_gen.yaml +mode: "textual" +model: "gpt-4o" +concurrency: 1 +theme: "A strategic puzzle game with resource management" +envs_root_path: "workspace/envs" +code_agent_backend: "codex" # Use Codex for code generation +``` + +#### Background Execution (Recommended for Long Tasks) + +Code agents can take 10-30 minutes for complex environments. Run in background: + +```bash +# Run in background with logging +nohup python run_environment_generation.py > /tmp/autoenv_gen.log 2>&1 & + +# Monitor progress +tail -f /tmp/autoenv_gen.log + +# Check if complete +ls workspace/envs/*/done.txt +``` + +### Troubleshooting + +#### Codex CLI Issues + +```bash +# Check if Codex is installed +codex --version + +# Re-authenticate +codex logout && codex login + +# Check current user +codex whoami +``` + +#### Claude Agent Issues + +```bash +# Check if Python SDK is installed +pip show claude-agent-sdk + +# Verify API key +echo $ANTHROPIC_API_KEY + +# Test import +python -c "from claude_agent_sdk import query; print('SDK available')" +``` + +#### Timeout Issues + +For complex environments, increase timeout in `autoenv/coder.py`: + +```python +# Current default: 900 seconds (15 minutes) +agent = CodexAgent(timeout=1200) # 20 minutes +``` + ## Benchmarking AutoEnv-36 Evaluate agents on the 36 benchmark environments (scores for all; cost only for LLM branch). See `benchmarks/README.md` for details. 
class ClaudeCodeAgent(BaseAgent):
    """Coding agent that sends a prompt to Claude through the Claude Agent SDK.

    Each :meth:`run` call streams one SDK ``query``. The messages, session id
    and cost observed during that call are kept on the instance and exposed via
    :meth:`get_messages` and :meth:`get_session_info`.

    The agent is NOT safe for concurrent use: ``run`` temporarily mutates
    instance attributes when keyword overrides are supplied.
    """

    name: str = Field(default="claude_code", description="Agent name")
    description: str = Field(
        default="Claude Code agent for code generation and execution",
        description="Agent description"
    )

    # Claude Code specific settings
    # NOTE: Uses `max_turns` for conversation turns with Claude Code CLI (vs BaseAgent's generic `max_steps`)
    max_turns: int = Field(default=10, description="Maximum conversation turns")
    max_messages: int = Field(
        default=1000,
        description="Maximum number of messages to keep in history. Older messages are discarded."
    )
    cwd: Optional[Path] = Field(default=None, description="Working directory")
    allowed_tools: Optional[List[str]] = Field(
        default=None,
        description="Allowed tools (e.g., ['Read', 'Write', 'Bash'])"
    )
    permission_mode: str = Field(
        default="acceptEdits",
        description=(
            "Permission mode controlling agent's action execution behavior.\n"
            "- 'default': Interactive mode, requests confirmation before actions\n"
            "- 'acceptEdits': Auto-accepts code edits, still confirms dangerous operations\n"
            "- 'bypassPermissions': ⚠️ DANGEROUS - Bypasses all permission checks and executes "
            "actions without confirmation. Only use in fully trusted, isolated environments.\n"
            "- 'plan': Planning mode, generates action plans without executing"
        )
    )
    system_prompt_override: Optional[str] = Field(
        default=None,
        description="Override system prompt (only for non-interactive mode)"
    )
    append_system_prompt: Optional[str] = Field(
        default=None,
        description="Append to system prompt (only for non-interactive mode)"
    )
    api_base_url: Optional[str] = Field(
        default=None,
        description="Custom API base URL (e.g., for proxies or API gateways)"
    )
    api_key: Optional[str] = Field(
        default=None,
        description="API key for authentication. If not provided, uses ANTHROPIC_API_KEY env var"
    )
    env_vars: Optional[Dict[str, str]] = Field(
        default=None,
        description="Additional environment variables to pass to Claude CLI"
    )

    class Config:
        arbitrary_types_allowed = True

    # Private attributes for internal state.
    # The history cap lives in the public `max_messages` field; there is
    # deliberately no private duplicate that could drift out of sync with it.
    _messages: List[Any] = PrivateAttr(default_factory=list)
    _session_id: Optional[str] = PrivateAttr(default=None)
    _total_cost_usd: float = PrivateAttr(default=0.0)
    _current_prompt: Optional[str] = PrivateAttr(default=None)

    @model_validator(mode="after")
    def validate_claude_agent_available(self) -> "ClaudeCodeAgent":
        """Check the SDK import succeeded and normalise ``cwd`` / ``permission_mode``.

        Raises:
            ImportError: ``claude_agent_sdk`` could not be imported.
            FileNotFoundError: ``cwd`` does not exist.
            NotADirectoryError: ``cwd`` exists but is not a directory.
            ValueError: ``permission_mode`` is not one of the accepted values.
        """
        if not CLAUDE_AGENT_AVAILABLE:
            raise ImportError(
                "Claude Agent SDK is not installed. "
                "Install it with: pip install claude-agent-sdk\n"
                "Note: The Claude Code CLI is automatically bundled - no separate installation needed!"
            )

        # Default to the process working directory; always store a Path.
        self.cwd = Path.cwd() if self.cwd is None else Path(self.cwd)
        self._validate_cwd()
        self._validate_permission_mode(self.permission_mode)
        return self

    @staticmethod
    def _validate_permission_mode(mode: Any) -> None:
        """Raise ``ValueError`` unless *mode* is an accepted permission mode."""
        valid_modes = ["default", "acceptEdits", "bypassPermissions", "plan"]
        if mode not in valid_modes:
            raise ValueError(
                f"Invalid permission_mode: {mode}. "
                f"Must be one of: {valid_modes}"
            )

    def _validate_cwd(self) -> None:
        """Validate working directory exists and is a directory."""
        if not self.cwd.exists():
            raise FileNotFoundError(f"Working directory does not exist: {self.cwd}")
        if not self.cwd.is_dir():
            raise NotADirectoryError(f"Working directory path is not a directory: {self.cwd}")

    def _resolve_cwd_override(self, value: Any) -> Path:
        """Turn a ``cwd`` keyword override into a resolved path under the current cwd.

        Only relative paths without ``..`` components are accepted, so an
        override can only descend from the directory the agent was created with.

        Raises:
            ValueError: *value* has the wrong type, is absolute, or contains ``..``.
        """
        if not isinstance(value, (str, Path)):
            raise ValueError("cwd must be a string or pathlib.Path")
        relative = Path(value)

        if relative.is_absolute():
            raise ValueError(
                "For security reasons, cwd cannot be set to an absolute path via kwargs. "
                "Set cwd during agent initialization instead."
            )
        if ".." in relative.parts:
            raise ValueError("cwd cannot contain parent directory references ('..')")

        # Resolve against the cwd that is still in effect. This must happen
        # before the attribute is overwritten with the override.
        return (Path(self.cwd) / relative).resolve()

    def _create_options(self) -> ClaudeAgentOptions:
        """Create ClaudeAgentOptions from agent settings.

        ``system_prompt_override`` wins over ``append_system_prompt`` when both
        are set. ``api_key`` / ``api_base_url`` are written on top of
        ``env_vars`` as ``ANTHROPIC_API_KEY`` / ``ANTHROPIC_BASE_URL``.
        """
        options_dict = {
            "max_turns": self.max_turns,
            "cwd": str(self.cwd),
            "permission_mode": self.permission_mode,
        }

        if self.allowed_tools:
            options_dict["allowed_tools"] = self.allowed_tools
        if self.system_prompt_override:
            options_dict["system_prompt"] = self.system_prompt_override
        elif self.append_system_prompt:
            options_dict["append_system_prompt"] = self.append_system_prompt

        # Build environment variables for Claude CLI
        env_dict = {}
        if self.env_vars:
            env_dict.update(self.env_vars)
        if self.api_key:
            env_dict["ANTHROPIC_API_KEY"] = self.api_key
        # Claude CLI respects ANTHROPIC_BASE_URL
        if self.api_base_url:
            env_dict["ANTHROPIC_BASE_URL"] = self.api_base_url

        if env_dict:
            options_dict["env"] = env_dict

        return ClaudeAgentOptions(**options_dict)

    def _handle_sdk_error(self, e: Exception) -> str:
        """Convert a Claude SDK error into an ``"Error: ..."`` string.

        Args:
            e: The exception to handle.

        Returns:
            Formatted error message for SDK errors.

        Raises:
            Exception: *e* itself when it is not a ``ClaudeSDKError``, so that
                programming errors still surface to the caller.
        """
        # Subclasses are tested before the ClaudeSDKError base class.
        if isinstance(e, CLINotFoundError):
            return f"Error: Claude Code CLI not found: {str(e)}"
        if isinstance(e, ProcessError):
            exit_code = getattr(e, 'exit_code', 'unknown')
            return f"Error: Process failed with exit code {exit_code}: {str(e)}"
        if isinstance(e, CLIJSONDecodeError):
            return f"Error: Failed to parse Claude response: {str(e)}"
        if isinstance(e, ClaudeSDKError):
            return f"Error: Claude SDK error: {str(e)}"
        # Raise the object explicitly: a bare `raise` only works while an
        # exception is being handled and would fail with RuntimeError otherwise.
        raise e

    async def _process_query_stream(self, prompt: str, options: ClaudeAgentOptions) -> str:
        """Stream one SDK query and return the text of its result message.

        Every message is appended to the history (capped at ``max_messages``),
        the latest ``session_id`` is remembered, and the cost reported by
        result messages is added to the running total.

        Args:
            prompt: The prompt to send to Claude.
            options: Claude agent options.

        Returns:
            The result text, a description of the terminal subtype, or
            ``"No result received"`` when the stream had no result message.
        """
        result_text = ""
        async for message in query(prompt=prompt, options=options):
            self._messages.append(message)
            # Sliding window over the history, driven by the public field.
            overflow = len(self._messages) - self.max_messages
            if overflow > 0:
                del self._messages[:overflow]

            if hasattr(message, 'session_id'):
                self._session_id = message.session_id

            # A result message is recognised by class name or by `type == "result"`.
            is_result_message = (
                type(message).__name__ == "ResultMessage" or
                (hasattr(message, 'type') and message.type == "result")
            )
            if not is_result_message:
                continue

            # The cost attribute may be present but None; only add real numbers.
            cost = getattr(message, 'total_cost_usd', None)
            if cost is not None:
                self._total_cost_usd += cost

            if getattr(message, 'result', None):
                result_text = message.result
            elif hasattr(message, 'subtype'):
                subtype = message.subtype
                if subtype == "error_max_turns":
                    result_text = f"Error: Reached maximum turns ({self.max_turns})"
                elif subtype == "error_during_execution":
                    result_text = "Error: Execution failed"
                elif subtype == "success":
                    # `result` is absent or empty here, so report plain success.
                    result_text = "Execution completed successfully"
                else:
                    result_text = f"Completed with status: {subtype}"
            else:
                result_text = "Execution completed"

        return result_text if result_text else "No result received"

    async def step(self) -> str:
        """Re-run the prompt stored by the last :meth:`run` call, if any."""
        if not self._current_prompt:
            return "No prompt provided. Use run() method to execute tasks."

        try:
            options = self._create_options()
            return await self._process_query_stream(self._current_prompt, options)
        except Exception as e:
            return self._handle_sdk_error(e)

    async def run(self, request: Optional[str] = None, **kwargs) -> str:
        """Execute one request, optionally with temporary attribute overrides.

        Args:
            request: The task or prompt to execute.
            **kwargs: Temporary overrides for ``max_turns``, ``cwd``,
                ``permission_mode``, ``allowed_tools``, ``system_prompt_override``,
                ``append_system_prompt``, ``api_base_url``, ``api_key`` or
                ``env_vars``. A ``cwd`` override must be a relative path without
                ``..`` and is resolved against the agent's current ``cwd``.

        Returns:
            Result string from execution, or an ``"Error: ..."`` string when the
            request is empty, an override is rejected, or the SDK fails.

        Warning:
            Not safe for concurrent use on one instance: overrides mutate
            instance attributes for the duration of the call.

        Note:
            Every applied override is restored before returning, on both
            success and error paths. A restoration failure is logged and never
            replaces the execution result.
        """
        import logging  # local import: only used on the rare restoration-failure path

        if not request:
            return "Error: No request provided"

        self._current_prompt = request
        self._messages = []
        self._session_id = None
        self._total_cost_usd = 0.0

        # Whitelist of attributes that can be modified via kwargs
        modifiable_attrs = {
            'max_turns', 'cwd', 'permission_mode', 'allowed_tools',
            'system_prompt_override', 'append_system_prompt',
            'api_base_url', 'api_key', 'env_vars'
        }
        original_values: Dict[str, Any] = {}

        def restore_overrides() -> None:
            """Put back every attribute overridden so far (best effort)."""
            for attr, previous in original_values.items():
                try:
                    setattr(self, attr, previous)
                except Exception as exc:
                    logging.getLogger(__name__).warning(
                        "Failed to restore attribute %r after run(): %s", attr, exc
                    )

        for key, value in kwargs.items():
            if not hasattr(self, key):
                restore_overrides()
                return f"Error: Unknown attribute '{key}'"
            if key not in modifiable_attrs:
                restore_overrides()
                return f"Error: Attribute '{key}' cannot be modified via kwargs. Allowed: {sorted(modifiable_attrs)}"

            try:
                # Validate the value before touching instance state.
                if key == 'cwd':
                    value = self._resolve_cwd_override(value)
                elif key == 'permission_mode':
                    self._validate_permission_mode(value)

                original_values[key] = getattr(self, key)
                setattr(self, key, value)

                if key == 'cwd':
                    self._validate_cwd()
            except Exception as e:
                restore_overrides()
                return f"Error: Failed to set attribute '{key}': {str(e)}"

        try:
            options = self._create_options()
            return await self._process_query_stream(request, options)
        except Exception as e:
            return self._handle_sdk_error(e)
        finally:
            restore_overrides()

    async def __call__(self, **kwargs) -> str:
        """Run the agent; the prompt may be passed as ``request``, ``task`` or ``prompt``.

        All three aliases are removed from ``kwargs`` so that an unused alias is
        not mistaken for an attribute override. The first non-empty one wins, in
        that order.
        """
        aliases = [kwargs.pop(alias, None) for alias in ("request", "task", "prompt")]
        request = next((candidate for candidate in aliases if candidate), None)
        return await self.run(request=request, **kwargs)

    def get_session_info(self) -> Dict[str, Any]:
        """Get information about the current session."""
        return {
            "session_id": self._session_id,
            "total_cost_usd": self._total_cost_usd,
            "num_messages": len(self._messages),
            "cwd": str(self.cwd),
            "max_turns": self.max_turns,
            "permission_mode": self.permission_mode,
        }

    def get_messages(self) -> List[Any]:
        """Return a shallow copy of the messages kept from the current session."""
        return self._messages.copy()

    def reset(self) -> None:
        """Reset the agent state for a new session."""
        self._messages = []
        self._session_id = None
        self._total_cost_usd = 0.0
        self._current_prompt = None
import Optional +from typing import Optional, Literal import os from base.agent.base_agent import BaseAgent from autoenv.miniswe_agent import MiniSWEAutoEnvAgent -class ECodeAgent(BaseAgent): - """Agent that asks mini-swe-agent to generate levels inside Docker. - - Mounts the target env folder at /workspace. - - Mounts repo root at /repo and sets PYTHONPATH for imports. - - Uses a persistent pip cache to speed up runs. - - Instructs the assistant to fix scripts when needed, generate 100 levels to ./levels, validate, then finish. +class ECodeAgent(BaseAgent): + """Agent for code generation tasks in AutoEnv. + + Supports multiple backends via Strategy Pattern: + - miniswe (default): Uses MiniSWEAutoEnvAgent, works with any LLM + - codex: Uses OpenAI Codex CLI (requires OPENAI_API_KEY) + - claude: Uses Claude Code CLI (requires ANTHROPIC_API_KEY) + + All backends implement the same BaseAgent.run(request=...) interface. """ name: str = "coder" desc: str = "A minimal coder for AutoEnv-generated environments" + + # Backend selection: "miniswe" (default), "codex", or "claude" + backend: Literal["miniswe", "codex", "claude"] = "miniswe" - async def __call__(self, requirements: Optional[str] = None, cwds: Optional[str] = None, environment_type: Optional[str] = "local") -> str: - # Resolve paths - repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) - base_env = {"PYTHONPATH": os.pathsep.join([repo_root, os.environ.get("PYTHONPATH", "")]).strip(os.pathsep)} - - if environment_type == "docker": - agent = MiniSWEAutoEnvAgent( - llm=self.llm, # Pass the LLM instance from BaseAgent - mode="yolo", - step_limit=50, - environment_type="docker", - cwd = cwds, - env = base_env, - timeout = 900, - docker_image="python:3.11-slim", + def _create_agent(self, cwds: str, environment_type: str = "local") -> BaseAgent: + """Factory method to create the appropriate agent based on backend.""" + if self.backend == "codex": + from autoenv.codex_agent import CodexAgent + 
agent = CodexAgent(cwd=cwds, permission_mode="acceptEdits", timeout=900) # 15 min timeout for complex tasks + + elif self.backend == "claude": + from autoenv.claude_code_agent import ClaudeCodeAgent + # Check environment variables for custom API configuration + api_base_url = os.environ.get("ANTHROPIC_BASE_URL") + api_key = os.environ.get("ANTHROPIC_API_KEY") + agent = ClaudeCodeAgent( + cwd=cwds, + permission_mode="bypassPermissions", # Allow file operations without confirmation + api_base_url=api_base_url, + api_key=api_key, + max_turns=50 # Allow more turns for complex tasks ) - elif environment_type == "local": - agent = MiniSWEAutoEnvAgent( - llm=self.llm, # Pass the LLM instance from BaseAgent - mode="yolo", - step_limit=100, - environment_type="local", - cwd = cwds, - env = base_env, - timeout = 900, - ) - else: - raise ValueError(f"Unsupported environment_type: {environment_type}") + + else: # miniswe (default) + repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) + base_env = {"PYTHONPATH": os.pathsep.join([repo_root, os.environ.get("PYTHONPATH", "")]).strip(os.pathsep)} + + if environment_type == "docker": + agent = MiniSWEAutoEnvAgent( + llm=self.llm, # Pass the LLM instance from BaseAgent + mode="yolo", + step_limit=50, + environment_type="docker", + cwd = cwds, + env = base_env, + timeout = 900, + docker_image="python:3.11-slim", + ) + elif environment_type == "local": + agent = MiniSWEAutoEnvAgent( + llm=self.llm, # Pass the LLM instance from BaseAgent + mode="yolo", + step_limit=100, + environment_type="local", + cwd = cwds, + env = base_env, + timeout = 900, + ) + else: + raise ValueError(f"Unsupported environment_type: {environment_type}") + + return agent - return await agent.run(task=requirements) + async def __call__(self, requirements: Optional[str] = None, cwds: Optional[str] = None, environment_type: Optional[str] = "local") -> str: + """Execute code task with configured backend.""" + agent = self._create_agent(cwds, 
environment_type) + return await agent.run(request=requirements) # BaseAgent abstract methods async def step(self) -> str: diff --git a/autoenv/codex_agent.py b/autoenv/codex_agent.py new file mode 100644 index 0000000..267809f --- /dev/null +++ b/autoenv/codex_agent.py @@ -0,0 +1,433 @@ +from __future__ import annotations + +import asyncio +import json +import os +import subprocess +from pathlib import Path +from typing import Any, Dict, List, Optional + +from pydantic import Field, model_validator, PrivateAttr + +from base.agent.base_agent import BaseAgent + + +# Check if Codex CLI is available +def _check_codex_cli() -> bool: + """Check if Codex CLI (codex command) is installed.""" + try: + result = subprocess.run( + ["codex", "--version"], + capture_output=True, + timeout=5 + ) + return result.returncode == 0 + except (FileNotFoundError, subprocess.TimeoutExpired): + return False + + +# Lazily-evaluated flag indicating whether the Codex CLI is available. +CODEX_CLI_AVAILABLE: Optional[bool] = None + + +def is_codex_cli_available(force_recheck: bool = False) -> bool: + """Check if Codex CLI is available (cached after first call). + + Args: + force_recheck: If True, bypass cache and recheck CLI availability + + Returns: + True if Codex CLI is available, False otherwise + """ + global CODEX_CLI_AVAILABLE + if CODEX_CLI_AVAILABLE is None or force_recheck: + CODEX_CLI_AVAILABLE = _check_codex_cli() + return CODEX_CLI_AVAILABLE + + +class CodexAgent(BaseAgent): + """Codex agent for code generation using a Codex CLI tool.""" + + name: str = Field(default="codex", description="Agent name") + description: str = Field( + default="Codex agent for code generation and execution", + description="Agent description" + ) + + # Codex specific settings + # NOTE: Uses `max_turns` for conversation turns with Codex CLI (vs BaseAgent's generic `max_steps`). 
+ max_turns: int = Field(default=10, description="Maximum conversation turns") + cwd: Optional[Path] = Field(default=None, description="Working directory") + model: Optional[str] = Field( + default=None, + description="Model identifier for the Codex CLI. If not specified, uses the CLI's default model." + ) + permission_mode: str = Field( + default="acceptEdits", + description=( + "Permission mode controlling agent's action execution behavior.\n" + "- 'default': Interactive mode, requests confirmation before actions\n" + "- 'acceptEdits': Auto-accepts code edits, still confirms dangerous operations\n" + "- 'bypassPermissions': ⚠️ DANGEROUS - Bypasses all permission checks and executes " + "actions without confirmation. Only use in fully trusted, isolated environments.\n" + "- 'plan': Planning mode, generates action plans without executing" + ) + ) + timeout: int = Field( + default=300, + description="Timeout in seconds for CLI commands" + ) + api_key: Optional[str] = Field( + default=None, + description=( + "Optional API key for Codex CLI. If set, passed via OPENAI_API_KEY environment variable. " + "If not set, CLI uses its own login state (codex login)." + ) + ) + + class Config: + arbitrary_types_allowed = True + + # Private attributes for internal state + _messages: List[Any] = PrivateAttr(default_factory=list) + _session_id: Optional[str] = PrivateAttr(default=None) + _total_cost_usd: float = PrivateAttr(default=0.0) + _current_prompt: Optional[str] = PrivateAttr(default=None) + + @model_validator(mode="after") + def validate_codex_cli_available(self) -> "CodexAgent": + """Validate that Codex CLI is available.""" + # Force recheck to handle case where CLI was installed after module import + if not is_codex_cli_available(force_recheck=True): + raise ImportError( + "Codex CLI is not installed or not available on PATH. " + "Please install the 'codex' command-line tool required for this project " + "and ensure it is accessible in your PATH, then try again. 
" + "Refer to your project documentation for the correct installation instructions." + ) + + # Set default cwd if not provided + if self.cwd is None: + self.cwd = Path.cwd() + else: + self.cwd = Path(self.cwd) + + # Validate working directory exists + self._validate_cwd() + + # Validate permission mode + valid_modes = ["default", "acceptEdits", "bypassPermissions", "plan"] + if self.permission_mode not in valid_modes: + raise ValueError( + f"Invalid permission_mode: {self.permission_mode}. " + f"Must be one of: {valid_modes}" + ) + + return self + + def _validate_cwd(self) -> None: + """Validate working directory exists and is a directory.""" + if not self.cwd.exists(): + raise FileNotFoundError(f"Working directory does not exist: {self.cwd}") + if not self.cwd.is_dir(): + raise NotADirectoryError(f"Working directory path is not a directory: {self.cwd}") + + def _build_cli_command(self, prompt: str) -> List[str]: + """Build Codex CLI command.""" + cmd = ["codex", "exec"] + + if self.model: + cmd.extend(["-m", self.model]) + if self.cwd: + cmd.extend(["-C", str(self.cwd)]) + + sandbox_map = { + "default": "read-only", + "acceptEdits": "workspace-write", + "bypassPermissions": "danger-full-access", + "plan": "read-only" + } + cmd.extend(["-s", sandbox_map.get(self.permission_mode, "read-only")]) + + if self.permission_mode == "bypassPermissions": + cmd.append("--dangerously-bypass-approvals-and-sandbox") + elif self.permission_mode == "acceptEdits": + cmd.append("--full-auto") + + cmd.append("--json") + # Use "--" so prompt content (even starting with "--") is treated as positional, not CLI flags + cmd.append("--") + cmd.append(prompt) + + return cmd + + async def _run_cli_command(self, prompt: str) -> Dict[str, Any]: + """Run Codex CLI command and parse JSONL output.""" + # Validate cwd before use (handles case where cwd was modified via kwargs) + self._validate_cwd() + + cmd = self._build_cli_command(prompt) + + env = os.environ.copy() + if self.api_key: + 
env["OPENAI_API_KEY"] = self.api_key + + process = await asyncio.create_subprocess_exec( + *cmd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + cwd=str(self.cwd), + env=env + ) + + try: + stdout, stderr = await asyncio.wait_for( + process.communicate(), + timeout=self.timeout + ) + except asyncio.TimeoutError: + process.kill() + await process.wait() + raise TimeoutError( + f"Codex CLI command timed out after {self.timeout} seconds" + ) + + if process.returncode != 0: + error_msg = stderr.decode('utf-8') if stderr else "Unknown error" + raise RuntimeError( + f"Codex CLI command failed with code {process.returncode}: {error_msg}" + ) + + output = stdout.decode('utf-8').strip() + + if not output: + return {"result": "No output received", "session_id": None} + + try: + lines = output.split('\n') + result = {"result": ""} + thread_id = None + + for line in lines: + if not line.strip(): + continue + + try: + obj = json.loads(line) + + if obj.get("type") == "thread.started": + thread_id = obj.get("thread_id") + elif obj.get("type") == "item.completed": + item = obj.get("item", {}) + if item.get("type") == "agent_message": + text = item.get("text", "") + if text: + if result["result"]: + result["result"] += "\n\n" + result["result"] += text + elif obj.get("type") == "turn.completed": + usage = obj.get("usage", {}) + if usage: + result["usage"] = usage + + except json.JSONDecodeError: + # Skip invalid JSON lines (e.g., empty/partial) and continue parsing the stream + pass + + if thread_id: + result["session_id"] = thread_id + + if result["result"]: + result["result"] = result["result"].strip() + else: + result["result"] = "No result received" + + return result + + except Exception as e: + raise ValueError( + f"Failed to parse Codex CLI output as JSON: {e}. Raw output: {output!r}" + ) from e + + def _process_result(self, result: Dict[str, Any]) -> str: + """Process CLI command result and update internal state. 
async def run(self, request: Optional[str] = None, **kwargs) -> str:
    """Execute one request, optionally with temporary attribute overrides.

    Args:
        request: The task or prompt to execute.
        **kwargs: Temporary overrides for ``max_turns``, ``timeout``, ``cwd``,
            ``model`` or ``permission_mode``. A ``cwd`` override must be a
            relative path without ``..`` and is resolved against the agent's
            current ``cwd``.

    Returns:
        Result string from execution, or an error string when the request is
        empty, an override is rejected, or the CLI run fails with one of the
        known error types.

    Warning:
        Not safe for concurrent use on one instance: overrides mutate instance
        attributes for the duration of the call.

    Note:
        Every applied override is restored before returning, on both success
        and error paths. A restoration failure is logged and never replaces
        the execution result.
    """
    import logging  # local import: only used on the rare restoration-failure path

    if not request:
        return "Error: No request provided"

    self._current_prompt = request
    self._messages = []
    self._session_id = None
    self._total_cost_usd = 0.0

    # Whitelist of attributes that can be modified via kwargs
    modifiable_attrs = {
        'max_turns', 'timeout', 'cwd', 'model', 'permission_mode'
    }
    original_values = {}

    def restore_overrides() -> None:
        """Put back every attribute overridden so far (best effort)."""
        for attr, previous in original_values.items():
            try:
                setattr(self, attr, previous)
            except Exception as exc:
                logging.getLogger(__name__).warning(
                    "Failed to restore attribute %r after run(): %s", attr, exc
                )

    for key, value in kwargs.items():
        if key not in modifiable_attrs:
            restore_overrides()
            return f"Error: Attribute '{key}' cannot be modified via kwargs. Allowed: {sorted(modifiable_attrs)}"

        if not hasattr(self, key):
            restore_overrides()
            return f"Error: Unknown attribute '{key}'"

        try:
            # Validate the value before touching instance state.
            if key == 'cwd':
                if not isinstance(value, (str, Path)):
                    raise ValueError("cwd must be a string or pathlib.Path")
                relative = Path(value)

                # Security: only allow descending from the current cwd.
                if relative.is_absolute():
                    raise ValueError(
                        "For security reasons, cwd cannot be set to an absolute path via kwargs. "
                        "Set cwd during agent initialization instead."
                    )
                if ".." in relative.parts:
                    raise ValueError("cwd cannot contain parent directory references ('..')")

                # Resolve against the cwd still in effect, before it is overwritten.
                value = (Path(self.cwd) / relative).resolve()
            elif key == 'permission_mode':
                valid_modes = ["default", "acceptEdits", "bypassPermissions", "plan"]
                if value not in valid_modes:
                    raise ValueError(f"Invalid permission_mode: {value}. Must be one of: {valid_modes}")

            original_values[key] = getattr(self, key)
            setattr(self, key, value)

            if key == 'cwd':
                self._validate_cwd()
        except Exception as e:
            restore_overrides()
            return f"Error: Failed to set attribute '{key}': {str(e)}"

    try:
        result = await self._run_cli_command(request)
        return self._process_result(result)
    except (TimeoutError, RuntimeError, ValueError, FileNotFoundError, NotADirectoryError) as e:
        # Known CLI execution errors are reported as strings; anything else
        # propagates so programming errors stay visible.
        return f"Error during execution: {str(e)}"
    finally:
        restore_overrides()
async def __call__(self, **kwargs) -> str:
    """Execute the agent; the prompt may be given as request, task or prompt.

    Args:
        **kwargs: ``request``, ``task`` or ``prompt`` carries the prompt
            (first non-empty one wins, in that order). Everything else is
            forwarded to ``run()`` unchanged.

    Returns:
        Whatever ``run()`` returns.
    """
    # Pop every alias unconditionally. A short-circuiting `or` chain would
    # leave later aliases in kwargs, where run() receives them as overrides.
    candidates = [kwargs.pop(alias, None) for alias in ('request', 'task', 'prompt')]
    request = next((candidate for candidate in candidates if candidate), None)
    return await self.run(request=request, **kwargs)


def get_session_info(self) -> Dict[str, Any]:
    """Return a snapshot of the session state and agent settings."""
    return {
        "session_id": self._session_id,
        "total_cost_usd": self._total_cost_usd,
        "num_messages": len(self._messages),
        "cwd": str(self.cwd),
        "model": self.model,
        "max_turns": self.max_turns,
        "permission_mode": self.permission_mode,
    }


def get_messages(self) -> List[Any]:
    """Return a shallow copy of the recorded messages."""
    # A copy keeps callers from mutating the agent's own history list.
    return self._messages.copy()


def reset(self) -> None:
    """Clear messages, session id, accumulated cost and the stored prompt."""
    self._messages = []
    self._session_id = None
    self._total_cost_usd = 0.0
    self._current_prompt = None
code_agent = ECodeAgent(llm=AsyncLLM(self.llm.config), backend=ctx.code_agent_backend) task = ECODE_AGENT_CODE_FIX_PROMPT.format( env_id=ctx.env_id, @@ -224,7 +227,13 @@ async def execute(self, ctx: GeneratorContext) -> None: validator_checklist=VALIDATOR_CHECKLIST, ) ctx.code_fix_result = await code_agent(requirements=task, cwds=str(ctx.env_folder_path)) - print(f"[CodeFixNode] ✓ code fix completed") + + # Check for errors in result + if ctx.code_fix_result and ctx.code_fix_result.startswith("Error"): + print(f"[CodeFixNode] ⚠️ WARNING: {ctx.code_fix_result[:200]}...") + print(f"[CodeFixNode] Continuing despite error...") + else: + print(f"[CodeFixNode] ✓ code fix completed (backend={ctx.code_agent_backend})") class LevelGenNode(BaseNode): @@ -239,7 +248,7 @@ async def execute(self, ctx: GeneratorContext) -> None: ctx.error = "LevelGenNode requires env_folder_path" return - code_agent = ECodeAgent(llm=AsyncLLM(self.llm.config)) + code_agent = ECodeAgent(llm=AsyncLLM(self.llm.config), backend=ctx.code_agent_backend) task = ECODE_AGENT_LEVEL_GENERATION_PROMPT.format( env_id=ctx.env_id, @@ -247,7 +256,24 @@ async def execute(self, ctx: GeneratorContext) -> None: validator_checklist=VALIDATOR_CHECKLIST, ) ctx.level_gen_result = await code_agent(requirements=task, cwds=str(ctx.env_folder_path)) - print(f"[LevelGenNode] ✓ level generation completed") + + # Check for errors in result + if ctx.level_gen_result and str(ctx.level_gen_result).startswith("Error"): + print(f"[LevelGenNode] ⚠️ WARNING: {str(ctx.level_gen_result)[:200]}...") + print(f"[LevelGenNode] Continuing despite error...") + else: + # Verify levels directory was created with level files + levels_dir = ctx.env_folder_path / "levels" + if levels_dir.exists(): + level_files = list(levels_dir.glob("*.yaml")) + if len(level_files) >= 1: + print(f"[LevelGenNode] ✓ level generation completed - {len(level_files)} levels created") + if len(level_files) < 15: + print(f"[LevelGenNode] ⚠️ Note: Expected 15 levels, got 
{len(level_files)}") + else: + print(f"[LevelGenNode] ⚠️ WARNING: levels/ directory exists but contains no .yaml files!") + else: + print(f"[LevelGenNode] ⚠️ WARNING: levels/ directory NOT created!") class MaxRewardNode(BaseNode): @@ -262,14 +288,25 @@ async def execute(self, ctx: GeneratorContext) -> None: ctx.error = "MaxRewardNode requires env_folder_path" return - code_agent = ECodeAgent(llm=AsyncLLM(self.llm.config)) + code_agent = ECodeAgent(llm=AsyncLLM(self.llm.config), backend=ctx.code_agent_backend) task = ECODE_AGENT_CALCULATE_MAX_REWARD_PROMPT.format( env_id=ctx.env_id, workspace=ctx.env_folder_path, ) ctx.max_reward_result = await code_agent(requirements=task, cwds=str(ctx.env_folder_path)) - print(f"[MaxRewardNode] ✓ max reward calculation completed") + + # Check for errors in result + if ctx.max_reward_result and ctx.max_reward_result.startswith("Error"): + print(f"[MaxRewardNode] ⚠️ WARNING: {ctx.max_reward_result[:200]}...") + print(f"[MaxRewardNode] Continuing despite error...") + else: + # Verify level_max_rewards.json was created + rewards_file = ctx.env_folder_path / "level_max_rewards.json" + if rewards_file.exists(): + print(f"[MaxRewardNode] ✓ max reward calculation completed - level_max_rewards.json created") + else: + print(f"[MaxRewardNode] ⚠️ WARNING: level_max_rewards.json NOT created!") class ArchiveNode(BaseNode): diff --git a/autoenv/pipeline/generator/pipeline.py b/autoenv/pipeline/generator/pipeline.py index 987a63e..8b11bca 100644 --- a/autoenv/pipeline/generator/pipeline.py +++ b/autoenv/pipeline/generator/pipeline.py @@ -70,6 +70,7 @@ async def run( requirements: str, output_dir: Path | str | None = None, env_theme: str = "random", + code_agent_backend: str = "miniswe", ) -> GeneratorContext: """ Run generation pipeline. 
@@ -78,6 +79,7 @@ async def run( requirements: Environment requirements (string or .txt file path) output_dir: Output root directory, defaults to workspace/envs env_theme: Environment theme name + code_agent_backend: Code agent backend ("miniswe", "codex", "claude") Returns: GeneratorContext: Context containing generation results @@ -92,6 +94,7 @@ async def run( ctx = GeneratorContext( requirements=requirements, env_theme=env_theme, + code_agent_backend=code_agent_backend, ) if output_dir: ctx.envs_root_path = Path(output_dir) diff --git a/config/env_gen_example.yaml b/config/env_gen_example.yaml index 491498f..6a436cf 100644 --- a/config/env_gen_example.yaml +++ b/config/env_gen_example.yaml @@ -19,3 +19,11 @@ theme: "" # Output directory for generated environments envs_root_path: "workspace/envs" +# Code agent backend: "miniswe" (default), "codex", or "claude" +# - miniswe: Uses MiniSWE agent with LLM (recommended, works with any LLM) +# - codex: Uses OpenAI Codex CLI (requires OPENAI_API_KEY) +# - claude: Uses Claude Agent SDK (Python package) +# • Required: Environment variable ANTHROPIC_API_KEY +# • Optional: Custom base URL ANTHROPIC_BASE_URL (set when using proxy) +# • Recommended: Write vars to .env and `source .env` in shell +code_agent_backend: "miniswe" diff --git a/run_environment_generation.py b/run_environment_generation.py index a683560..d9d46fd 100644 --- a/run_environment_generation.py +++ b/run_environment_generation.py @@ -38,6 +38,7 @@ async def run_generation( output: str, mode: str = "textual", image_model: str | None = None, + code_agent_backend: str = "miniswe", ): """Run a single generation task.""" label = theme @@ -46,9 +47,9 @@ async def run_generation( theme = Path(theme).read_text(encoding="utf-8") # Step 1: Run generator pipeline - print(f"🚀 [{label}] Generating environment...") + print(f"🚀 [{label}] Generating environment (backend={code_agent_backend})...") gen_pipeline = GeneratorPipeline.create_default(llm_name=model) - gen_ctx = 
await gen_pipeline.run(requirements=theme, output_dir=output) + gen_ctx = await gen_pipeline.run(requirements=theme, output_dir=output, code_agent_backend=code_agent_backend) if not gen_ctx.success: print(f"❌ [{label}] Generation failed: {gen_ctx.error}") @@ -97,6 +98,7 @@ async def main(): mode = args.mode or cfg.get("mode") or "textual" image_model = cfg.get("image_model") concurrency = cfg.get("concurrency", 1) + code_agent_backend = cfg.get("code_agent_backend", "miniswe") Path(output).mkdir(parents=True, exist_ok=True) print(f"🔧 Config: {args.config}") @@ -104,6 +106,7 @@ async def main(): print(f"🎨 Image Model: {image_model}") print(f"📁 Output: {output}") print(f"📦 Mode: {mode}") + print(f"🔧 Code Agent: {code_agent_backend}") # Determine themes (priority: CLI --theme > themes_folder > theme) themes: list[str] = [] @@ -123,7 +126,7 @@ async def main(): async def task(t: str): async with sem: - await run_generation(t, model, output, mode, image_model) + await run_generation(t, model, output, mode, image_model, code_agent_backend) with CostMonitor() as monitor: await asyncio.gather(*[task(t) for t in themes])