diff --git a/console/src/pages/Settings/Security/components/RuleModal.tsx b/console/src/pages/Settings/Security/components/RuleModal.tsx index af64826aa..e6cd46fc8 100644 --- a/console/src/pages/Settings/Security/components/RuleModal.tsx +++ b/console/src/pages/Settings/Security/components/RuleModal.tsx @@ -19,6 +19,7 @@ const BUILTIN_TOOLS = [ "execute_python_code", "browser_use", "desktop_screenshot", + "view_image", "read_file", "write_file", "edit_file", diff --git a/console/src/pages/Settings/Security/index.tsx b/console/src/pages/Settings/Security/index.tsx index db8ac913f..7a8e37703 100644 --- a/console/src/pages/Settings/Security/index.tsx +++ b/console/src/pages/Settings/Security/index.tsx @@ -19,6 +19,7 @@ const BUILTIN_TOOLS = [ "execute_python_code", "browser_use", "desktop_screenshot", + "view_image", "read_file", "write_file", "edit_file", diff --git a/src/copaw/agents/model_factory.py b/src/copaw/agents/model_factory.py index 658e5c8f7..36d2b8cc4 100644 --- a/src/copaw/agents/model_factory.py +++ b/src/copaw/agents/model_factory.py @@ -321,7 +321,10 @@ def _create_formatter_instance( formatter_class = _create_file_block_support_formatter( base_formatter_class, ) - return formatter_class() + kwargs: dict[str, Any] = {} + if issubclass(base_formatter_class, OpenAIChatFormatter): + kwargs["promote_tool_result_images"] = True + return formatter_class(**kwargs) __all__ = [ diff --git a/src/copaw/agents/react_agent.py b/src/copaw/agents/react_agent.py index 30313f5d1..93fb95efc 100644 --- a/src/copaw/agents/react_agent.py +++ b/src/copaw/agents/react_agent.py @@ -37,6 +37,7 @@ read_file, send_file_to_user, set_user_timezone, + view_image, write_file, create_memory_search_tool, ) @@ -187,6 +188,7 @@ def _create_toolkit( "edit_file": edit_file, "browser_use": browser_use, "desktop_screenshot": desktop_screenshot, + "view_image": view_image, "send_file_to_user": send_file_to_user, "get_current_time": get_current_time, "set_user_timezone": set_user_timezone, diff --git a/src/copaw/agents/tools/__init__.py b/src/copaw/agents/tools/__init__.py index aaa00a48d..08ae8c0af 100644 --- a/src/copaw/agents/tools/__init__.py +++ b/src/copaw/agents/tools/__init__.py @@ -19,6 +19,7 @@ from .send_file import send_file_to_user from .browser_control import browser_use from .desktop_screenshot import desktop_screenshot +from .view_image import view_image from .memory_search import create_memory_search_tool from .get_current_time import get_current_time, set_user_timezone from .get_token_usage import get_token_usage @@ -36,6 +37,7 @@ "glob_search", "send_file_to_user", "desktop_screenshot", + "view_image", "browser_use", "create_memory_search_tool", "get_current_time", diff --git a/src/copaw/agents/tools/view_image.py b/src/copaw/agents/tools/view_image.py new file mode 100644 index 000000000..577765231 --- /dev/null +++ b/src/copaw/agents/tools/view_image.py @@ -0,0 +1,81 @@ +# -*- coding: utf-8 -*- +"""Load an image file into the LLM context for visual analysis.""" + +import mimetypes +import os +import unicodedata +from pathlib import Path + +from agentscope.message import ImageBlock, TextBlock +from agentscope.tool import ToolResponse + +_IMAGE_EXTENSIONS = { + ".png", + ".jpg", + ".jpeg", + ".gif", + ".webp", + ".bmp", + ".tiff", + ".tif", +} + + +async def view_image(image_path: str) -> ToolResponse: + """Load an image file into the LLM context so the model can see it. + + Use this after desktop_screenshot, browser_use, or any tool that + produces an image file path. + + Args: + image_path (`str`): + Path to the image file to view. + + Returns: + `ToolResponse`: + An ImageBlock the model can inspect, or an error message. + """ + image_path = unicodedata.normalize( + "NFC", + os.path.expanduser(image_path), + ) + resolved = Path(image_path).resolve() + + if not resolved.exists() or not resolved.is_file(): + return ToolResponse( + content=[ + TextBlock( + type="text", + text=f"Error: {image_path} does not exist or " + "is not a file.", + ), + ], + ) + + ext = resolved.suffix.lower() + mime, _ = mimetypes.guess_type(str(resolved)) + if ext not in _IMAGE_EXTENSIONS and ( + not mime or not mime.startswith("image/") + ): + return ToolResponse( + content=[ + TextBlock( + type="text", + text=f"Error: {resolved.name} is not a supported " + "image format.", + ), + ], + ) + + return ToolResponse( + content=[ + ImageBlock( + type="image", + source={"type": "url", "url": str(resolved)}, + ), + TextBlock( + type="text", + text=f"Image loaded: {resolved.name}", + ), + ], + ) diff --git a/src/copaw/app/channels/base.py b/src/copaw/app/channels/base.py index c4e466bc4..96a230d32 100644 --- a/src/copaw/app/channels/base.py +++ b/src/copaw/app/channels/base.py @@ -34,6 +34,7 @@ from .renderer import MessageRenderer, RenderStyle from .schema import ChannelType +from ...config.utils import load_config # Optional callback to enqueue payload (set by manager) EnqueueCallback = Optional[Callable[[Any], None]] @@ -99,10 +100,17 @@ def __init__( self.deny_message = deny_message or "" self.require_mention = require_mention self._enqueue: EnqueueCallback = None + cfg = load_config() + internal_tools = frozenset( + name + for name, tc in cfg.tools.builtin_tools.items() + if not tc.display_to_user + ) self._render_style = RenderStyle( show_tool_details=show_tool_details, filter_tool_messages=filter_tool_messages, filter_thinking=filter_thinking, + internal_tools=internal_tools, ) self._renderer = MessageRenderer(self._render_style) self._http: Optional[Any] = None diff --git a/src/copaw/app/channels/renderer.py b/src/copaw/app/channels/renderer.py index 035e7382b..0aa65e00f 100644 --- a/src/copaw/app/channels/renderer.py +++ b/src/copaw/app/channels/renderer.py @@ -44,6 +44,7 @@ class RenderStyle: use_emoji: bool = True filter_tool_messages: bool = False filter_thinking: bool = False + internal_tools: frozenset = frozenset() def _fmt_tool_call( @@ -192,11 +193,17 @@ def _parts_for_tool_output(content_list: list) -> List[_OutgoingPart]: ContentType.VIDEO, ContentType.FILE, ) - media_parts = [ - p - for p in block_parts - if getattr(p, "type", None) in media_types - ] + # Internal tools (e.g. view_image) produce + # media for the LLM, not the user — skip. + media_parts = ( + [] + if name in s.internal_tools + else [ + p + for p in block_parts + if getattr(p, "type", None) in media_types + ] + ) out.extend(media_parts) if not media_parts: out.append( @@ -265,6 +272,9 @@ def _parts_for_tool_output(content_list: list) -> List[_OutgoingPart]: if getattr(c, "type", None) != ContentType.DATA: continue data = getattr(c, "data", None) or {} + name = data.get("name") or "tool" + if name in s.internal_tools: + continue output = data.get("output", "") try: output = json.loads(output) diff --git a/src/copaw/config/config.py b/src/copaw/config/config.py index a51275817..0d62c6e26 100644 --- a/src/copaw/config/config.py +++ b/src/copaw/config/config.py @@ -409,66 +409,90 @@ class BuiltinToolConfig(BaseModel): name: str = Field(..., description="Tool function name") enabled: bool = Field(True, description="Whether the tool is enabled") description: str = Field(default="", description="Tool description") + display_to_user: bool = Field( + True, + description="Whether tool output is rendered to user channels", + ) + + +def _default_builtin_tools() -> Dict[str, BuiltinToolConfig]: + """Return a fresh copy of the canonical built-in tool definitions.""" + return { + "execute_shell_command": BuiltinToolConfig( + name="execute_shell_command", + enabled=True, + description="Execute shell commands", + ), + "read_file": BuiltinToolConfig( + name="read_file", + enabled=True, + description="Read file contents", + ), + "write_file": BuiltinToolConfig( + name="write_file", + enabled=True, + description="Write content to file", + ), + "edit_file": BuiltinToolConfig( + name="edit_file", + enabled=True, + description="Edit file using find-and-replace", + ), + "browser_use": BuiltinToolConfig( + name="browser_use", + enabled=True, + description="Browser automation and web interaction", + ), + "desktop_screenshot": BuiltinToolConfig( + name="desktop_screenshot", + enabled=True, + description="Capture desktop screenshots", + ), + "view_image": BuiltinToolConfig( + name="view_image", + enabled=True, + description="Load an image into LLM context " + "for visual analysis", + display_to_user=False, + ), + "send_file_to_user": BuiltinToolConfig( + name="send_file_to_user", + enabled=True, + description="Send files to user", + ), + "get_current_time": BuiltinToolConfig( + name="get_current_time", + enabled=True, + description="Get current date and time", + ), + "set_user_timezone": BuiltinToolConfig( + name="set_user_timezone", + enabled=True, + description="Set user timezone", + ), + "get_token_usage": BuiltinToolConfig( + name="get_token_usage", + enabled=True, + description="Get llm token usage", + ), + } class ToolsConfig(BaseModel): """Built-in tools management configuration.""" builtin_tools: Dict[str, BuiltinToolConfig] = Field( - default_factory=lambda: { - "execute_shell_command": BuiltinToolConfig( - name="execute_shell_command", - enabled=True, - description="Execute shell commands", - ), - "read_file": BuiltinToolConfig( - name="read_file", - enabled=True, - description="Read file contents", - ), - "write_file": BuiltinToolConfig( - name="write_file", - enabled=True, - description="Write content to file", - ), - "edit_file": BuiltinToolConfig( - name="edit_file", - enabled=True, - description="Edit file using find-and-replace", - ), - "browser_use": BuiltinToolConfig( - name="browser_use", - enabled=True, - description="Browser automation and web interaction", - ), - "desktop_screenshot": BuiltinToolConfig( - name="desktop_screenshot", - enabled=True, - description="Capture desktop screenshots", - ), - "send_file_to_user": BuiltinToolConfig( - name="send_file_to_user", - enabled=True, - description="Send files to user", - ), - "get_current_time": BuiltinToolConfig( - name="get_current_time", - enabled=True, - description="Get current date and time", - ), - "set_user_timezone": BuiltinToolConfig( - name="set_user_timezone", - enabled=True, - description="Set user timezone", - ), - "get_token_usage": BuiltinToolConfig( - name="get_token_usage", - enabled=True, - description="Get llm token usage", - ), - }, + default_factory=_default_builtin_tools, ) + @model_validator(mode="after") + def _merge_default_tools(self): + """Ensure new code-defined tools are present in saved configs.""" + for name, tc in _default_builtin_tools().items(): + if name not in self.builtin_tools: + self.builtin_tools[name] = tc + return self + class ToolGuardRuleConfig(BaseModel): """A single user-defined guard rule (stored in config.json)."""