Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion backend/app/agent/factory/browser.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from app.agent.prompt import BROWSER_SYS_PROMPT
from app.agent.toolkit.human_toolkit import HumanToolkit
from app.agent.toolkit.hybrid_browser_toolkit import HybridBrowserToolkit
from app.agent.toolkit.image_analysis_toolkit import ImageAnalysisToolkit

# TODO: Remove NoteTakingToolkit and use TerminalToolkit instead
from app.agent.toolkit.note_taking_toolkit import NoteTakingToolkit
Expand Down Expand Up @@ -96,6 +97,13 @@ def browser_agent(options: Chat):
working_directory=working_directory,
)
note_toolkit = message_integration.register_toolkits(note_toolkit)
# Keep the toolkit instance created here (before register_toolkits) so it
# can be passed to toolkits_to_register_agent below
image_analysis_toolkit_for_agent_registration = ImageAnalysisToolkit(
options.project_id
)
image_analysis_toolkit = message_integration.register_toolkits(
image_analysis_toolkit_for_agent_registration
)

search_tools = SearchToolkit.get_can_use_tools(options.project_id)
if search_tools:
Expand All @@ -110,6 +118,7 @@ def browser_agent(options: Chat):
*web_toolkit_custom.get_tools(),
*terminal_toolkit,
*note_toolkit.get_tools(),
*image_analysis_toolkit.get_tools(),
*search_tools,
]

Expand All @@ -135,7 +144,11 @@ def browser_agent(options: Chat):
HumanToolkit.toolkit_name(),
NoteTakingToolkit.toolkit_name(),
TerminalToolkit.toolkit_name(),
ImageAnalysisToolkit.toolkit_name(),
],
toolkits_to_register_agent=[
web_toolkit_for_agent_registration,
image_analysis_toolkit_for_agent_registration,
],
toolkits_to_register_agent=[web_toolkit_for_agent_registration],
enable_snapshot_clean=True,
)
15 changes: 15 additions & 0 deletions backend/app/agent/factory/developer.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from app.agent.listen_chat_agent import logger
from app.agent.prompt import DEVELOPER_SYS_PROMPT
from app.agent.toolkit.human_toolkit import HumanToolkit
from app.agent.toolkit.image_analysis_toolkit import ImageAnalysisToolkit

# TODO: Remove NoteTakingToolkit and use TerminalToolkit instead
from app.agent.toolkit.note_taking_toolkit import NoteTakingToolkit
Expand Down Expand Up @@ -57,9 +58,17 @@ async def developer_agent(options: Chat):
screenshot_toolkit = ScreenshotToolkit(
options.project_id, working_directory=working_directory
)
# Keep the original instance (before register_toolkits) so it can be
# passed to toolkits_to_register_agent below
screenshot_toolkit_for_agent_registration = screenshot_toolkit
screenshot_toolkit = message_integration.register_toolkits(
screenshot_toolkit
)
image_analysis_toolkit = ImageAnalysisToolkit(options.project_id)
# Keep the original instance (before register_toolkits) so it can be
# passed to toolkits_to_register_agent below
image_analysis_toolkit_for_agent_registration = image_analysis_toolkit
image_analysis_toolkit = message_integration.register_toolkits(
image_analysis_toolkit
)

terminal_toolkit = TerminalToolkit(
options.project_id,
Expand All @@ -78,6 +87,7 @@ async def developer_agent(options: Chat):
*web_deploy_toolkit.get_tools(),
*terminal_toolkit.get_tools(),
*screenshot_toolkit.get_tools(),
*image_analysis_toolkit.get_tools(),
]
system_message = DEVELOPER_SYS_PROMPT.format(
platform_system=platform.system(),
Expand All @@ -99,5 +109,10 @@ async def developer_agent(options: Chat):
TerminalToolkit.toolkit_name(),
NoteTakingToolkit.toolkit_name(),
WebDeployToolkit.toolkit_name(),
ImageAnalysisToolkit.toolkit_name(),
],
toolkits_to_register_agent=[
screenshot_toolkit_for_agent_registration,
image_analysis_toolkit_for_agent_registration,
],
)
13 changes: 13 additions & 0 deletions backend/app/agent/factory/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from app.agent.toolkit.file_write_toolkit import FileToolkit
from app.agent.toolkit.google_drive_mcp_toolkit import GoogleDriveMCPToolkit
from app.agent.toolkit.human_toolkit import HumanToolkit
from app.agent.toolkit.image_analysis_toolkit import ImageAnalysisToolkit
from app.agent.toolkit.markitdown_toolkit import MarkItDownToolkit

# TODO: Remove NoteTakingToolkit and use TerminalToolkit instead
Expand Down Expand Up @@ -68,6 +69,13 @@ async def document_agent(options: Chat):
working_directory=working_directory,
)
note_toolkit = message_integration.register_toolkits(note_toolkit)
# Keep the toolkit instance created here (before register_toolkits) so it
# can be passed to toolkits_to_register_agent below
image_analysis_toolkit_for_agent_registration = ImageAnalysisToolkit(
options.project_id
)
image_analysis_toolkit = message_integration.register_toolkits(
image_analysis_toolkit_for_agent_registration
)

terminal_toolkit = TerminalToolkit(
options.project_id,
Expand All @@ -92,6 +100,7 @@ async def document_agent(options: Chat):
*excel_toolkit.get_tools(),
*note_toolkit.get_tools(),
*terminal_toolkit.get_tools(),
*image_analysis_toolkit.get_tools(),
*google_drive_tools,
]
system_message = DOCUMENT_SYS_PROMPT.format(
Expand All @@ -117,6 +126,10 @@ async def document_agent(options: Chat):
ExcelToolkit.toolkit_name(),
NoteTakingToolkit.toolkit_name(),
TerminalToolkit.toolkit_name(),
ImageAnalysisToolkit.toolkit_name(),
GoogleDriveMCPToolkit.toolkit_name(),
],
toolkits_to_register_agent=[
image_analysis_toolkit_for_agent_registration,
],
)
21 changes: 21 additions & 0 deletions backend/app/agent/prompt.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,13 @@

<capabilities>
Your capabilities include:
- **Image Analysis (PRIORITY)**: When a task involves analyzing, describing, or
extracting information from images, screenshots, diagrams, or picture files,
you MUST use ImageAnalysisToolkit. Use `image_to_text` to extract text or
`ask_question_about_image` to understand visual content. These tools work
with image file paths (e.g., /path/to/image.png). Check task.additional_info
for image file paths. DO NOT say you cannot see images - you have full vision
capabilities through ImageAnalysisToolkit.
- Document Reading:
- Read and understand the content of various file formats including
- PDF (.pdf)
Expand Down Expand Up @@ -413,6 +420,13 @@

<capabilities>
Your capabilities are extensive and powerful:
- **Image Analysis (PRIORITY)**: When a task involves analyzing, describing, or
extracting information from images, screenshots, or picture files, you MUST
use ImageAnalysisToolkit. Use `image_to_text` to extract text from images or
`ask_question_about_image` to understand visual content. These tools work
with image file paths (e.g., /path/to/image.png). Check task.additional_info
for image file paths. DO NOT say you cannot see images - you have full vision
capabilities through ImageAnalysisToolkit.
- **Unrestricted Code Execution**: You can write and execute code in any
language to solve a task. You MUST first save your code to a file (e.g.,
`script.py`) and then run it from the terminal (e.g.,
Expand Down Expand Up @@ -581,6 +595,13 @@

<capabilities>
Your capabilities include:
- **Image Analysis (PRIORITY)**: When a task mentions analyzing, describing, or
extracting information from an image, screenshot, or picture file, you MUST
use ImageAnalysisToolkit FIRST before any web browsing. Use `image_to_text`
to extract text or `ask_question_about_image` to understand visual content.
These tools work with image file paths (e.g., /path/to/image.png). Check
task.additional_info for image file paths. DO NOT say you cannot see
images - you have full vision capabilities through ImageAnalysisToolkit.
- Search and get information from the web using the search tools.
- Use the rich browser related toolset to investigate websites.
- Use the terminal tools to perform local operations. **IMPORTANT:** Before the
Expand Down
2 changes: 2 additions & 0 deletions backend/tests/app/agent/factory/test_browser.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def test_browser_agent_creation(sample_chat_data):
patch(f"{_mod}.HybridBrowserToolkit") as mock_browser_toolkit,
patch(f"{_mod}.TerminalToolkit") as mock_terminal_toolkit,
patch(f"{_mod}.NoteTakingToolkit") as mock_note_toolkit,
patch(f"{_mod}.ImageAnalysisToolkit") as mock_image_toolkit,
patch(f"{_mod}.SearchToolkit") as mock_search_toolkit,
patch(f"{_mod}.ToolkitMessageIntegration"),
patch("uuid.uuid4") as mock_uuid,
Expand All @@ -54,6 +55,7 @@ def test_browser_agent_creation(sample_chat_data):
mock_terminal_toolkit.return_value = mock_terminal_instance

mock_note_toolkit.return_value.get_tools.return_value = []
mock_image_toolkit.return_value.get_tools.return_value = []
mock_search_instance = MagicMock()
mock_search_instance.search_google = MagicMock()
mock_search_toolkit.return_value = mock_search_instance
Expand Down
4 changes: 4 additions & 0 deletions backend/tests/app/agent/factory/test_developer.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ async def test_developer_agent_creation(sample_chat_data):
patch(f"{_mod}.NoteTakingToolkit") as mock_note_toolkit,
patch(f"{_mod}.WebDeployToolkit") as mock_web_toolkit,
patch(f"{_mod}.ScreenshotToolkit") as mock_screenshot_toolkit,
patch(f"{_mod}.ImageAnalysisToolkit") as mock_image_toolkit,
patch(f"{_mod}.TerminalToolkit") as mock_terminal_toolkit,
patch(f"{_mod}.ToolkitMessageIntegration"),
):
Expand All @@ -49,6 +50,7 @@ async def test_developer_agent_creation(sample_chat_data):
mock_note_toolkit.return_value.get_tools.return_value = []
mock_web_toolkit.return_value.get_tools.return_value = []
mock_screenshot_toolkit.return_value.get_tools.return_value = []
mock_image_toolkit.return_value.get_tools.return_value = []
mock_terminal_toolkit.return_value.get_tools.return_value = []

mock_agent = MagicMock()
Expand Down Expand Up @@ -87,6 +89,7 @@ async def test_developer_agent_with_multiple_toolkits(sample_chat_data):
patch(f"{_mod}.NoteTakingToolkit") as mock_note_toolkit,
patch(f"{_mod}.WebDeployToolkit") as mock_web_toolkit,
patch(f"{_mod}.ScreenshotToolkit") as mock_screenshot_toolkit,
patch(f"{_mod}.ImageAnalysisToolkit") as mock_image_toolkit,
patch(f"{_mod}.TerminalToolkit") as mock_terminal_toolkit,
patch(f"{_mod}.ToolkitMessageIntegration"),
):
Expand All @@ -95,6 +98,7 @@ async def test_developer_agent_with_multiple_toolkits(sample_chat_data):
mock_note_toolkit.return_value.get_tools.return_value = []
mock_web_toolkit.return_value.get_tools.return_value = []
mock_screenshot_toolkit.return_value.get_tools.return_value = []
mock_image_toolkit.return_value.get_tools.return_value = []
mock_terminal_toolkit.return_value.get_tools.return_value = []

mock_agent = MagicMock()
Expand Down
2 changes: 2 additions & 0 deletions backend/tests/app/agent/factory/test_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ async def test_document_agent_creation(sample_chat_data):
patch(f"{_mod}.MarkItDownToolkit") as mock_markdown_toolkit,
patch(f"{_mod}.ExcelToolkit") as mock_excel_toolkit,
patch(f"{_mod}.NoteTakingToolkit") as mock_note_toolkit,
patch(f"{_mod}.ImageAnalysisToolkit") as mock_image_toolkit,
patch(f"{_mod}.TerminalToolkit") as mock_terminal_toolkit,
patch(f"{_mod}.GoogleDriveMCPToolkit") as mock_gdrive_toolkit,
patch(f"{_mod}.ToolkitMessageIntegration"),
Expand All @@ -54,6 +55,7 @@ async def test_document_agent_creation(sample_chat_data):
mock_markdown_toolkit.return_value.get_tools.return_value = []
mock_excel_toolkit.return_value.get_tools.return_value = []
mock_note_toolkit.return_value.get_tools.return_value = []
mock_image_toolkit.return_value.get_tools.return_value = []
mock_terminal_toolkit.return_value.get_tools.return_value = []
mock_gdrive_toolkit.get_can_use_tools = AsyncMock(return_value=[])

Expand Down