diff --git a/backend/app/agent/factory/browser.py b/backend/app/agent/factory/browser.py index 385ae167..30093e17 100644 --- a/backend/app/agent/factory/browser.py +++ b/backend/app/agent/factory/browser.py @@ -23,6 +23,7 @@ from app.agent.prompt import BROWSER_SYS_PROMPT from app.agent.toolkit.human_toolkit import HumanToolkit from app.agent.toolkit.hybrid_browser_toolkit import HybridBrowserToolkit +from app.agent.toolkit.image_analysis_toolkit import ImageAnalysisToolkit # TODO: Remove NoteTakingToolkit and use TerminalToolkit instead from app.agent.toolkit.note_taking_toolkit import NoteTakingToolkit @@ -96,6 +97,13 @@ def browser_agent(options: Chat): working_directory=working_directory, ) note_toolkit = message_integration.register_toolkits(note_toolkit) + # Save reference before registering for toolkits_to_register_agent + image_analysis_toolkit_for_agent_registration = ImageAnalysisToolkit( + options.project_id + ) + image_analysis_toolkit = message_integration.register_toolkits( + image_analysis_toolkit_for_agent_registration + ) search_tools = SearchToolkit.get_can_use_tools(options.project_id) if search_tools: @@ -110,6 +118,7 @@ def browser_agent(options: Chat): *web_toolkit_custom.get_tools(), *terminal_toolkit, *note_toolkit.get_tools(), + *image_analysis_toolkit.get_tools(), *search_tools, ] @@ -135,7 +144,11 @@ def browser_agent(options: Chat): HumanToolkit.toolkit_name(), NoteTakingToolkit.toolkit_name(), TerminalToolkit.toolkit_name(), + ImageAnalysisToolkit.toolkit_name(), + ], + toolkits_to_register_agent=[ + web_toolkit_for_agent_registration, + image_analysis_toolkit_for_agent_registration, ], - toolkits_to_register_agent=[web_toolkit_for_agent_registration], enable_snapshot_clean=True, ) diff --git a/backend/app/agent/factory/developer.py b/backend/app/agent/factory/developer.py index 96359e05..d19ec578 100644 --- a/backend/app/agent/factory/developer.py +++ b/backend/app/agent/factory/developer.py @@ -21,6 +21,7 @@ from app.agent.listen_chat_agent import logger from app.agent.prompt import DEVELOPER_SYS_PROMPT from app.agent.toolkit.human_toolkit import HumanToolkit +from app.agent.toolkit.image_analysis_toolkit import ImageAnalysisToolkit # TODO: Remove NoteTakingToolkit and use TerminalToolkit instead from app.agent.toolkit.note_taking_toolkit import NoteTakingToolkit @@ -57,9 +58,17 @@ async def developer_agent(options: Chat): screenshot_toolkit = ScreenshotToolkit( options.project_id, working_directory=working_directory ) + # Save reference before registering for toolkits_to_register_agent + screenshot_toolkit_for_agent_registration = screenshot_toolkit screenshot_toolkit = message_integration.register_toolkits( screenshot_toolkit ) + image_analysis_toolkit = ImageAnalysisToolkit(options.project_id) + # Save reference before registering for toolkits_to_register_agent + image_analysis_toolkit_for_agent_registration = image_analysis_toolkit + image_analysis_toolkit = message_integration.register_toolkits( + image_analysis_toolkit + ) terminal_toolkit = TerminalToolkit( options.project_id, @@ -78,6 +87,7 @@ async def developer_agent(options: Chat): *web_deploy_toolkit.get_tools(), *terminal_toolkit.get_tools(), *screenshot_toolkit.get_tools(), + *image_analysis_toolkit.get_tools(), ] system_message = DEVELOPER_SYS_PROMPT.format( platform_system=platform.system(), @@ -99,5 +109,10 @@ async def developer_agent(options: Chat): TerminalToolkit.toolkit_name(), NoteTakingToolkit.toolkit_name(), WebDeployToolkit.toolkit_name(), + ImageAnalysisToolkit.toolkit_name(), + ], + toolkits_to_register_agent=[ + screenshot_toolkit_for_agent_registration, + image_analysis_toolkit_for_agent_registration, ], ) diff --git a/backend/app/agent/factory/document.py b/backend/app/agent/factory/document.py index 0d3b2c89..815d226c 100644 --- a/backend/app/agent/factory/document.py +++ b/backend/app/agent/factory/document.py @@ -23,6 +23,7 @@ from app.agent.toolkit.file_write_toolkit import FileToolkit from app.agent.toolkit.google_drive_mcp_toolkit import GoogleDriveMCPToolkit from app.agent.toolkit.human_toolkit import HumanToolkit +from app.agent.toolkit.image_analysis_toolkit import ImageAnalysisToolkit from app.agent.toolkit.markitdown_toolkit import MarkItDownToolkit # TODO: Remove NoteTakingToolkit and use TerminalToolkit instead @@ -68,6 +69,13 @@ async def document_agent(options: Chat): working_directory=working_directory, ) note_toolkit = message_integration.register_toolkits(note_toolkit) + # Save reference before registering for toolkits_to_register_agent + image_analysis_toolkit_for_agent_registration = ImageAnalysisToolkit( + options.project_id + ) + image_analysis_toolkit = message_integration.register_toolkits( + image_analysis_toolkit_for_agent_registration + ) terminal_toolkit = TerminalToolkit( options.project_id, @@ -92,6 +100,7 @@ async def document_agent(options: Chat): *excel_toolkit.get_tools(), *note_toolkit.get_tools(), *terminal_toolkit.get_tools(), + *image_analysis_toolkit.get_tools(), *google_drive_tools, ] system_message = DOCUMENT_SYS_PROMPT.format( @@ -117,6 +126,10 @@ async def document_agent(options: Chat): ExcelToolkit.toolkit_name(), NoteTakingToolkit.toolkit_name(), TerminalToolkit.toolkit_name(), + ImageAnalysisToolkit.toolkit_name(), GoogleDriveMCPToolkit.toolkit_name(), ], + toolkits_to_register_agent=[ + image_analysis_toolkit_for_agent_registration, + ], ) diff --git a/backend/app/agent/prompt.py b/backend/app/agent/prompt.py index ee1d782d..a43910bb 100644 --- a/backend/app/agent/prompt.py +++ b/backend/app/agent/prompt.py @@ -263,6 +263,13 @@ Your capabilities include: +- **Image Analysis (PRIORITY)**: When a task involves analyzing, describing, or + extracting information from images, screenshots, diagrams, or picture files, + you MUST use ImageAnalysisToolkit. Use `image_to_text` to extract text or + `ask_question_about_image` to understand visual content. These tools work + with image file paths (e.g., /path/to/image.png). Check task.additional_info + for image file paths. DO NOT say you cannot see images - you have full vision + capabilities through ImageAnalysisToolkit. - Document Reading: - Read and understand the content of various file formats including - PDF (.pdf) @@ -413,6 +420,13 @@ Your capabilities are extensive and powerful: +- **Image Analysis (PRIORITY)**: When a task involves analyzing, describing, or + extracting information from images, screenshots, or picture files, you MUST + use ImageAnalysisToolkit. Use `image_to_text` to extract text from images or + `ask_question_about_image` to understand visual content. These tools work + with image file paths (e.g., /path/to/image.png). Check task.additional_info + for image file paths. DO NOT say you cannot see images - you have full vision + capabilities through ImageAnalysisToolkit. - **Unrestricted Code Execution**: You can write and execute code in any language to solve a task. You MUST first save your code to a file (e.g., `script.py`) and then run it from the terminal (e.g., @@ -581,6 +595,13 @@ Your capabilities include: +- **Image Analysis (PRIORITY)**: When a task mentions analyzing, describing, or + extracting information from an image, screenshot, or picture file, you MUST + use ImageAnalysisToolkit FIRST before any web browsing. Use `image_to_text` + to extract text or `ask_question_about_image` to understand visual content. + These tools work with image file paths (e.g., /path/to/image.png). Check + task.additional_info for image file paths. DO NOT say you cannot see images + - you have full vision capabilities through ImageAnalysisToolkit. - Search and get information from the web using the search tools. - Use the rich browser related toolset to investigate websites. - Use the terminal tools to perform local operations. **IMPORTANT:** Before the diff --git a/backend/tests/app/agent/factory/test_browser.py b/backend/tests/app/agent/factory/test_browser.py index e52a5a46..a922c992 100644 --- a/backend/tests/app/agent/factory/test_browser.py +++ b/backend/tests/app/agent/factory/test_browser.py @@ -40,6 +40,7 @@ def test_browser_agent_creation(sample_chat_data): patch(f"{_mod}.HybridBrowserToolkit") as mock_browser_toolkit, patch(f"{_mod}.TerminalToolkit") as mock_terminal_toolkit, patch(f"{_mod}.NoteTakingToolkit") as mock_note_toolkit, + patch(f"{_mod}.ImageAnalysisToolkit") as mock_image_toolkit, patch(f"{_mod}.SearchToolkit") as mock_search_toolkit, patch(f"{_mod}.ToolkitMessageIntegration"), patch("uuid.uuid4") as mock_uuid, @@ -54,6 +55,7 @@ def test_browser_agent_creation(sample_chat_data): mock_terminal_toolkit.return_value = mock_terminal_instance mock_note_toolkit.return_value.get_tools.return_value = [] + mock_image_toolkit.return_value.get_tools.return_value = [] mock_search_instance = MagicMock() mock_search_instance.search_google = MagicMock() mock_search_toolkit.return_value = mock_search_instance diff --git a/backend/tests/app/agent/factory/test_developer.py b/backend/tests/app/agent/factory/test_developer.py index 0a4e55ce..12a49dc5 100644 --- a/backend/tests/app/agent/factory/test_developer.py +++ b/backend/tests/app/agent/factory/test_developer.py @@ -41,6 +41,7 @@ async def test_developer_agent_creation(sample_chat_data): patch(f"{_mod}.NoteTakingToolkit") as mock_note_toolkit, patch(f"{_mod}.WebDeployToolkit") as mock_web_toolkit, patch(f"{_mod}.ScreenshotToolkit") as mock_screenshot_toolkit, + patch(f"{_mod}.ImageAnalysisToolkit") as mock_image_toolkit, patch(f"{_mod}.TerminalToolkit") as mock_terminal_toolkit, patch(f"{_mod}.ToolkitMessageIntegration"), ): @@ -49,6 +50,7 @@ async def test_developer_agent_creation(sample_chat_data): mock_note_toolkit.return_value.get_tools.return_value = [] mock_web_toolkit.return_value.get_tools.return_value = [] mock_screenshot_toolkit.return_value.get_tools.return_value = [] + mock_image_toolkit.return_value.get_tools.return_value = [] mock_terminal_toolkit.return_value.get_tools.return_value = [] mock_agent = MagicMock() @@ -87,6 +89,7 @@ async def test_developer_agent_with_multiple_toolkits(sample_chat_data): patch(f"{_mod}.NoteTakingToolkit") as mock_note_toolkit, patch(f"{_mod}.WebDeployToolkit") as mock_web_toolkit, patch(f"{_mod}.ScreenshotToolkit") as mock_screenshot_toolkit, + patch(f"{_mod}.ImageAnalysisToolkit") as mock_image_toolkit, patch(f"{_mod}.TerminalToolkit") as mock_terminal_toolkit, patch(f"{_mod}.ToolkitMessageIntegration"), ): @@ -95,6 +98,7 @@ async def test_developer_agent_with_multiple_toolkits(sample_chat_data): mock_note_toolkit.return_value.get_tools.return_value = [] mock_web_toolkit.return_value.get_tools.return_value = [] mock_screenshot_toolkit.return_value.get_tools.return_value = [] + mock_image_toolkit.return_value.get_tools.return_value = [] mock_terminal_toolkit.return_value.get_tools.return_value = [] mock_agent = MagicMock() diff --git a/backend/tests/app/agent/factory/test_document.py b/backend/tests/app/agent/factory/test_document.py index 034a1696..9b4955a5 100644 --- a/backend/tests/app/agent/factory/test_document.py +++ b/backend/tests/app/agent/factory/test_document.py @@ -43,6 +43,7 @@ async def test_document_agent_creation(sample_chat_data): patch(f"{_mod}.MarkItDownToolkit") as mock_markdown_toolkit, patch(f"{_mod}.ExcelToolkit") as mock_excel_toolkit, patch(f"{_mod}.NoteTakingToolkit") as mock_note_toolkit, + patch(f"{_mod}.ImageAnalysisToolkit") as mock_image_toolkit, patch(f"{_mod}.TerminalToolkit") as mock_terminal_toolkit, patch(f"{_mod}.GoogleDriveMCPToolkit") as mock_gdrive_toolkit, patch(f"{_mod}.ToolkitMessageIntegration"), @@ -54,6 +55,7 @@ async def test_document_agent_creation(sample_chat_data): mock_markdown_toolkit.return_value.get_tools.return_value = [] mock_excel_toolkit.return_value.get_tools.return_value = [] mock_note_toolkit.return_value.get_tools.return_value = [] + mock_image_toolkit.return_value.get_tools.return_value = [] mock_terminal_toolkit.return_value.get_tools.return_value = [] mock_gdrive_toolkit.get_can_use_tools = AsyncMock(return_value=[])