Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 32 additions & 29 deletions app/cosight/agent/actor/instance/actor_agent_instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,40 +37,43 @@ def create_actor_instance(agent_instance_name, work_space_path):

def create_actor_template(template_name, work_space_path):
template_content = {
'template_name': template_name,
'template_version': 'v1',
"template_name": template_name,
"template_version": "v1",
"agent_type": "actor_agent",
'display_name_zh': '任务执行专家',
'display_name_en': 'Task Execution Expert',
'description_zh': '负责具体任务执行',
'description_en': 'Responsible for task execution',
"display_name_zh": "任务执行专家",
"display_name_en": "Task Execution Expert",
"description_zh": "负责具体任务执行",
"description_en": "Responsible for task execution",
"profile": [],
'service_name': 'execution_service',
'service_version': 'v1',
'default_replay_zh': '任务执行专家',
'default_replay_en': 'Task Execution Expert',
"service_name": "execution_service",
"service_version": "v1",
"default_replay_zh": "任务执行专家",
"default_replay_en": "Task Execution Expert",
"icon": "",
'skills': [execute_code_skill(work_space_path),
search_baidu_skill(),
mark_step_skill(),
browser_use_skill(),
file_saver_skill(),
file_read_skill(),
file_str_replace_skill(),
file_find_in_content_skill(),
ask_question_about_image_skill(),
extract_document_content_skill(),
create_html_report_skill(),
fetch_website_content_skill(),
# search_duckgo_skill(),
search_wiki_skill(),
audio_recognition_skill(),
ask_question_about_video_skill()],
"skills": [
execute_code_skill(work_space_path),
search_baidu_skill(),
mark_step_skill(),
browser_use_skill(),
check_browser_session_skill(),
file_saver_skill(),
file_read_skill(),
file_str_replace_skill(),
file_find_in_content_skill(),
ask_question_about_image_skill(),
extract_document_content_skill(),
create_html_report_skill(),
fetch_website_content_skill(),
# search_duckgo_skill(),
search_wiki_skill(),
audio_recognition_skill(),
ask_question_about_video_skill(),
],
# , terminate_skill(), browser_use_skill()
"organizations": [],
'knowledge': [],
'max_iteration': 20,
'business_type': {}
"knowledge": [],
"max_iteration": 20,
"business_type": {},
}
template_content['skills'].extend(register_mcp_tools())
load_search_skill(template_content)
Expand Down
28 changes: 28 additions & 0 deletions app/cosight/agent/actor/instance/actor_agent_skill.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,34 @@ def browser_use_skill():
)
}

def check_browser_session_skill():
    """Build the skill descriptor for the ``check_browser_session`` tool.

    Returns:
        dict: Skill metadata (name, type, bilingual display names and
        descriptions, semantic APIs) together with a ``SkillFunction`` whose
        parameter schema declares one required boolean argument,
        ``task_requires_browser``.
    """
    # Schema for the tool's single argument.
    parameter_schema = {
        "type": "object",
        "properties": {
            "task_requires_browser": {
                "type": "boolean",
                "description_zh": "当前任务是否需要浏览器交互。如果为 false 且浏览器会话存在,会自动关闭浏览器",
                "description_en": "Whether the current task requires browser interaction. If false and a browser session exists, it will be closed automatically",
            }
        },
        "required": ["task_requires_browser"],
    }

    # ``name`` is the dotted identifier the descriptor registers for this tool.
    skill_function = SkillFunction(
        id="2c44f9ad-be5c-4e6c-a9d8-1426b23828c1",
        name="app.cosight.browser_toolkit.check_browser_session",
        description_zh="检查浏览器会话状态并根据当前任务需求自动关闭不需要的会话",
        description_en="Check browser session status and auto-close if not needed for current task",
        parameters=parameter_schema,
    )

    return {
        "skill_name": "check_browser_session",
        "skill_type": "function",
        "display_name_zh": "检查浏览器会话状态",
        "display_name_en": "Check Browser Session Status",
        "description_zh": "检查当前浏览器会话是否存在,如果存在但当前任务不需要使用浏览器,则自动关闭浏览器会话",
        "description_en": "Check if a browser session exists. If it exists but the current task does not require browser interaction, automatically close the browser session",
        "semantic_apis": ["api_browser_management"],
        "function": skill_function,
    }


def fetch_website_content_skill():
return {
Expand Down
2 changes: 2 additions & 0 deletions app/cosight/agent/actor/prompt/actor_prompt.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ def actor_system_prompt(work_space_path: str):

# General Rules
1. You MUST plan extensively before each function call, and reflect extensively on the outcomes of the previous function calls. DO NOT do this entire process by making function calls only, as this can impair your ability to solve the problem and think insightfully.
2. At the beginning of each task, you MUST call check_browser_session tool. If the current task does not require using browser, please set task_requires_browser to False, for example: report generation/data analysis and so on do not need to use browser.

# Task Execution Rules:
1. For all output tasks (file generation and information gathering):
Expand Down Expand Up @@ -221,6 +222,7 @@ def actor_system_prompt_zh(work_space_path):

# 通用规则
1. 在每次函数调用前必须进行充分规划,并深入反思之前函数调用的结果。不要仅通过函数调用完成整个过程,这可能会影响你的问题解决能力和洞察力。
2. 在任务开始时,必须调用 check_browser_session 工具,如果当前任务不需要使用browser,请将task_requires_browser设置为False,例如:报告生成/数据分析等不需要使用browser,请将task_requires_browser设置为False。

# 任务执行规则:
1. 对于所有输出任务(文件生成和信息收集):
Expand Down
49 changes: 36 additions & 13 deletions app/cosight/agent/base/base_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from app.cosight.task.plan_report_manager import plan_report_event_manager
from app.common.logger_util import logger
from app.cosight.agent.base.tool_arg_mapping import FUNCTION_ARG_MAPPING
from app.cosight.tool.web_util import WebToolkit


class BaseAgent:
Expand Down Expand Up @@ -363,21 +364,43 @@ def _get_verification_steps(self, tool_name: str) -> list[str]:
# 未在清单中的工具:不返回任何步骤
return []

def execute(self, messages: List[Dict[str, Any]], step_index=None, max_iteration=10): #调试修改的10
for i in range(max_iteration):
logger.info(f'act agent call with tools message: {messages}')
response = self.llm.create_with_tools(messages, self.tools)
logger.info(f'act agent call with tools response: {response}')
def _cleanup_browser_session(self):
    """Close the shared browser session if one is still active.

    Best-effort cleanup: any exception raised while checking for or closing
    the session is logged (with traceback) and swallowed, so this method
    never propagates an error to its caller. Returns ``None``.
    """
    try:
        if not WebToolkit.has_active_browser_session():
            # Nothing to release.
            logger.info("没有活跃的浏览器会话需要清理")
            return

        # A session is still recorded: ask the toolkit to shut it down and
        # log whatever status message it reports.
        logger.info("检测到活跃的浏览器会话,准备关闭")
        outcome = WebToolkit.close_browser()
        logger.info(f"浏览器会话清理结果: {outcome}")
    except Exception as e:
        logger.error(f"清理浏览器会话时发生错误: {str(e)}", exc_info=True)

# Process initial response
result = self._process_response(response, messages, step_index)
logger.info(f'iter {i} for {self.agent_instance.instance_name} call tools result: {result}')
if result:
return result
def execute(
self, messages: List[Dict[str, Any]], step_index=None, max_iteration=10
):
try:
for i in range(max_iteration):
logger.info(f"act agent call with tools message: {messages}")
response = self.llm.create_with_tools(messages, self.tools)
logger.info(f"act agent call with tools response: {response}")

# Process initial response
result = self._process_response(response, messages, step_index)
logger.info(
f"iter {i} for {self.agent_instance.instance_name} call tools result: {result}"
)
if result:
return result

if max_iteration > 1:
return self._handle_max_iteration(messages, step_index)
return messages[-1].get("content")
if max_iteration > 1:
return self._handle_max_iteration(messages, step_index)
return messages[-1].get("content")
finally:
# 在所有step执行完成后,检查并关闭浏览器会话
self._cleanup_browser_session()

def _process_response(self, response, messages, step_index):
if not response.tool_calls:
Expand Down
125 changes: 90 additions & 35 deletions app/cosight/tool/web_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
from browser_use.llm import ChatOpenAI

from app.common.logger_util import logger
from config.config import get_browser_model_config

_browser_loop: Optional[asyncio.AbstractEventLoop] = None
_browser_loop_thread: Optional[threading.Thread] = None
Expand Down Expand Up @@ -59,7 +58,7 @@ def _env_float(name: str, default: float) -> float:
return default


def _env_int(name: str, default: Optional[int] = None) -> Optional[int]:
def _env_int(name: str, default: int = 1) -> int:
value = os.environ.get(name)
if value is None or not value.strip():
return default
Expand Down Expand Up @@ -130,7 +129,9 @@ async def create_browser_session():
server=proxy_url,
username=proxy_user,
password=proxy_password,
bypass=os.environ.get("BROWSER_PROXY_BYPASS", "localhost,127.0.0.1,*.internal"),
bypass=os.environ.get(
"BROWSER_PROXY_BYPASS", "localhost,127.0.0.1,*.internal"
),
)

profile = BrowserProfile(
Expand Down Expand Up @@ -175,22 +176,8 @@ class WebToolkit:
_shared_browser_session: Optional[BrowserSession] = None
_session_lock: Optional[asyncio.Lock] = None

def __init__(self, llm_config=None):
"""
初始化WebToolkit

Args:
llm_config: 可选的LLM配置,如果不提供则使用专门的浏览器自动化模型配置
"""
if llm_config is None:
# 使用专门的浏览器自动化模型配置
self.llm_config: dict = get_browser_model_config()
logger.info("使用专门的浏览器自动化模型配置")
else:
# 使用提供的配置
self.llm_config: dict = llm_config
logger.info("使用提供的LLM配置")

def __init__(self, llm_config):
self.llm_config: dict = llm_config
self._llm: Optional[ChatOpenAI] = None

@classmethod
Expand Down Expand Up @@ -219,6 +206,77 @@ async def _reset_browser_session(cls) -> None:
finally:
cls._shared_browser_session = None

@classmethod
def has_active_browser_session(cls) -> bool:
    """Report whether a shared browser session is currently recorded.

    Returns:
        bool: ``True`` when the class-level ``_shared_browser_session``
        attribute is not ``None``, ``False`` otherwise.
    """
    current_session = cls._shared_browser_session
    return current_session is not None

@classmethod
def check_browser_session(cls, task_requires_browser: bool) -> str:
    """Report the shared browser session state, closing it when unneeded.

    Args:
        task_requires_browser: Whether the current task needs browser
            interaction. When falsy and a session is recorded, the session
            is reset via ``_reset_browser_session``.

    Returns:
        str: A human-readable status message: no session exists, the session
        is ready for use, the session was closed, or closing failed (with
        the exception text). Failures are logged and returned, not raised.
    """
    if not cls.has_active_browser_session():
        logger.info("No active browser session exists")
        return "No active browser session exists"

    if task_requires_browser:
        # Keep the session alive for the task that is about to use it.
        logger.info("Browser session exists and is available for current task")
        return "Browser session exists and is ready for use in current task"

    logger.info(
        "Browser session exists but current task does not require it - auto-closing"
    )
    try:
        # The reset coroutine is handed to the module's browser-loop runner.
        _run_in_browser_loop(cls._reset_browser_session())
        logger.info("Browser session auto-closed successfully")
        status = "Browser session existed but was not needed for current task - closed successfully"
    except Exception as e:
        logger.error(f"Failed to auto-close browser session: {str(e)}", exc_info=True)
        status = f"Failed to auto-close browser session: {str(e)}"
    return status

@classmethod
def close_browser(cls) -> str:
    """Reset the shared browser session on explicit request.

    The reset is requested unconditionally; this method does not first check
    whether a session is currently recorded.

    Returns:
        str: ``"Browser session closed successfully"`` when the reset
        completes, otherwise a failure message containing the exception
        text. Failures are logged and returned, not raised.
    """
    logger.info("External request to close shared browser session")
    try:
        # The reset coroutine is handed to the module's browser-loop runner.
        _run_in_browser_loop(cls._reset_browser_session())
        logger.info("Shared browser session closed successfully by external request")
        status = "Browser session closed successfully"
    except Exception as e:
        logger.error(f"Failed to close browser session: {str(e)}", exc_info=True)
        status = f"Failed to close browser session: {str(e)}"
    return status

def browser_use(self, task_prompt: str):
r"""A powerful toolkit which can simulate the browser interaction to solve the task which needs multi-step actions.

Expand Down Expand Up @@ -247,35 +305,32 @@ async def inner_browser_use(self, task_prompt):
try:
browser_session = await self._get_shared_browser_session()
if self._llm is None:
llm_kwargs = {**self.llm_config}
llm_kwargs.setdefault("temperature", 0.0)
llm_kwargs["add_schema_to_system_prompt"] = _env_bool(
"ADD_SCHEMA_TO_SYSTEM_PROMPT",
llm_kwargs.get("add_schema_to_system_prompt", True),
self._llm = ChatOpenAI(
**self.llm_config,
max_completion_tokens=8192,
temperature=0.0,
add_schema_to_system_prompt=_env_bool(
"ADD_SCHEMA_TO_SYSTEM_PROMPT", True
),
)
self._llm = ChatOpenAI(**llm_kwargs)
# 创建agent,复用共享的browser session
agent_kwargs: dict[str, Any] = dict(

agent = Agent(
task=task_prompt,
browser_session=browser_session, # 使用共享的browser session
llm=self._llm,
use_vision=False,
max_actions_per_step=1,
max_actions_per_step=_env_int("MAX_ACTIONS_PER_STEP", 1),
directly_open_url=False,
flash_mode=_env_bool("FLASH_MODE", True),
include_tool_call_examples=True,
extend_system_message="""
ADDITIONAL INSTRUCTIONS:
YOU **MUST** FOLLOW THESE INSTRUCTIONS:
- Your answers **MUST NOT** contain any of the markdown code blocks such as ``` or ```json.
- **Directly** return the final answer as a plain text **without any additional formatting**.
""",
)

max_tokens_per_step = _env_int("MAX_TOKENS_PER_STEP")
if max_tokens_per_step is not None:
agent_kwargs["max_tokens_per_step"] = max_tokens_per_step

agent = Agent(**agent_kwargs)

# 运行agent
result = await agent.run()
final_result = result.final_result()
Expand All @@ -288,4 +343,4 @@ async def inner_browser_use(self, task_prompt):
await self._reset_browser_session()
return f"fail, because: {str(e)}"
# 注意:不要在这里关闭browser_session,因为它是共享的
# browser session会通过keep_alive=True保持活跃,供后续agent复用
# browser session会通过keep_alive=True保持活跃,供后续agent复用
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ ffmpeg-python==0.2.0
baidusearch==1.0.3
retry==0.9.2
googlesearch-python==1.3.0
browser-use==0.7.9
browser-use==0.8.1
xmltodict==0.14.2
soundfile==0.13.1
wikipedia==1.4.0
Expand Down