diff --git a/app/cosight/agent/actor/instance/actor_agent_instance.py b/app/cosight/agent/actor/instance/actor_agent_instance.py index acd45b7..2ab95f5 100644 --- a/app/cosight/agent/actor/instance/actor_agent_instance.py +++ b/app/cosight/agent/actor/instance/actor_agent_instance.py @@ -37,40 +37,43 @@ def create_actor_instance(agent_instance_name, work_space_path): def create_actor_template(template_name, work_space_path): template_content = { - 'template_name': template_name, - 'template_version': 'v1', + "template_name": template_name, + "template_version": "v1", "agent_type": "actor_agent", - 'display_name_zh': '任务执行专家', - 'display_name_en': 'Task Execution Expert', - 'description_zh': '负责具体任务执行', - 'description_en': 'Responsible for task execution', + "display_name_zh": "任务执行专家", + "display_name_en": "Task Execution Expert", + "description_zh": "负责具体任务执行", + "description_en": "Responsible for task execution", "profile": [], - 'service_name': 'execution_service', - 'service_version': 'v1', - 'default_replay_zh': '任务执行专家', - 'default_replay_en': 'Task Execution Expert', + "service_name": "execution_service", + "service_version": "v1", + "default_replay_zh": "任务执行专家", + "default_replay_en": "Task Execution Expert", "icon": "", - 'skills': [execute_code_skill(work_space_path), - search_baidu_skill(), - mark_step_skill(), - browser_use_skill(), - file_saver_skill(), - file_read_skill(), - file_str_replace_skill(), - file_find_in_content_skill(), - ask_question_about_image_skill(), - extract_document_content_skill(), - create_html_report_skill(), - fetch_website_content_skill(), - # search_duckgo_skill(), - search_wiki_skill(), - audio_recognition_skill(), - ask_question_about_video_skill()], + "skills": [ + execute_code_skill(work_space_path), + search_baidu_skill(), + mark_step_skill(), + browser_use_skill(), + check_browser_session_skill(), + file_saver_skill(), + file_read_skill(), + file_str_replace_skill(), + file_find_in_content_skill(), + ask_question_about_image_skill(), + extract_document_content_skill(), + create_html_report_skill(), + fetch_website_content_skill(), + # search_duckgo_skill(), + search_wiki_skill(), + audio_recognition_skill(), + ask_question_about_video_skill(), + ], # , terminate_skill(), browser_use_skill() "organizations": [], - 'knowledge': [], - 'max_iteration': 20, - 'business_type': {} + "knowledge": [], + "max_iteration": 20, + "business_type": {}, } template_content['skills'].extend(register_mcp_tools()) load_search_skill(template_content) diff --git a/app/cosight/agent/actor/instance/actor_agent_skill.py b/app/cosight/agent/actor/instance/actor_agent_skill.py index e417fc1..1ad0ebd 100644 --- a/app/cosight/agent/actor/instance/actor_agent_skill.py +++ b/app/cosight/agent/actor/instance/actor_agent_skill.py @@ -227,6 +227,34 @@ def browser_use_skill(): ) } +def check_browser_session_skill(): + return { + "skill_name": "check_browser_session", + "skill_type": "function", + "display_name_zh": "检查浏览器会话状态", + "display_name_en": "Check Browser Session Status", + "description_zh": "检查当前浏览器会话是否存在,如果存在但当前任务不需要使用浏览器,则自动关闭浏览器会话", + "description_en": "Check if a browser session exists. If it exists but the current task does not require browser interaction, automatically close the browser session", + "semantic_apis": ["api_browser_management"], + "function": SkillFunction( + id="2c44f9ad-be5c-4e6c-a9d8-1426b23828c1", + name="app.cosight.browser_toolkit.check_browser_session", + description_zh="检查浏览器会话状态并根据当前任务需求自动关闭不需要的会话", + description_en="Check browser session status and auto-close if not needed for current task", + parameters={ + "type": "object", + "properties": { + "task_requires_browser": { + "type": "boolean", + "description_zh": "当前任务是否需要浏览器交互。如果为 false 且浏览器会话存在,会自动关闭浏览器", + "description_en": "Whether the current task requires browser interaction. If false and a browser session exists, it will be closed automatically", + } + }, + "required": ["task_requires_browser"], + }, + ), + } + def fetch_website_content_skill(): return { diff --git a/app/cosight/agent/actor/prompt/actor_prompt.py b/app/cosight/agent/actor/prompt/actor_prompt.py index 3aab3d9..00b7f61 100644 --- a/app/cosight/agent/actor/prompt/actor_prompt.py +++ b/app/cosight/agent/actor/prompt/actor_prompt.py @@ -39,6 +39,7 @@ def actor_system_prompt(work_space_path: str): # General Rules 1. You MUST plan extensively before each function call, and reflect extensively on the outcomes of the previous function calls. DO NOT do this entire process by making function calls only, as this can impair your ability to solve the problem and think insightfully. +2. At the beginning of each task, you MUST call check_browser_session tool. If the current task does not require using browser, please set task_requires_browser to False, for example: report generation/data analysis and so on do not need to use browser. # Task Execution Rules: 1. For all output tasks (file generation and information gathering): @@ -221,6 +222,7 @@ def actor_system_prompt_zh(work_space_path): # 通用规则 1. 在每次函数调用前必须进行充分规划,并深入反思之前函数调用的结果。不要仅通过函数调用完成整个过程,这可能会影响你的问题解决能力和洞察力。 +2. 在任务开始时,必须调用 check_browser_session 工具,如果当前任务不需要使用browser,请将task_requires_browser设置为False,例如:报告生成/数据分析等不需要使用browser,请将task_requires_browser设置为False。 # 任务执行规则: 1. 对于所有输出任务(文件生成和信息收集): diff --git a/app/cosight/agent/base/base_agent.py b/app/cosight/agent/base/base_agent.py index 0ad3984..534726a 100644 --- a/app/cosight/agent/base/base_agent.py +++ b/app/cosight/agent/base/base_agent.py @@ -30,6 +30,7 @@ from app.cosight.task.plan_report_manager import plan_report_event_manager from app.common.logger_util import logger from app.cosight.agent.base.tool_arg_mapping import FUNCTION_ARG_MAPPING +from app.cosight.tool.web_util import WebToolkit class BaseAgent: @@ -363,21 +364,43 @@ def _get_verification_steps(self, tool_name: str) -> list[str]: # 未在清单中的工具:不返回任何步骤 return [] - def execute(self, messages: List[Dict[str, Any]], step_index=None, max_iteration=10): #调试修改的10 - for i in range(max_iteration): - logger.info(f'act agent call with tools message: {messages}') - response = self.llm.create_with_tools(messages, self.tools) - logger.info(f'act agent call with tools response: {response}') + def _cleanup_browser_session(self): + """在所有step执行完成后检查并关闭浏览器会话""" + try: + # 检查是否存在活跃的浏览器会话 + if WebToolkit.has_active_browser_session(): + logger.info("检测到活跃的浏览器会话,准备关闭") + # 关闭浏览器会话 + result = WebToolkit.close_browser() + logger.info(f"浏览器会话清理结果: {result}") + else: + logger.info("没有活跃的浏览器会话需要清理") + except Exception as e: + logger.error(f"清理浏览器会话时发生错误: {str(e)}", exc_info=True) - # Process initial response - result = self._process_response(response, messages, step_index) - logger.info(f'iter {i} for {self.agent_instance.instance_name} call tools result: {result}') - if result: - return result + def execute( + self, messages: List[Dict[str, Any]], step_index=None, max_iteration=10 + ): + try: + for i in range(max_iteration): + logger.info(f"act agent call with tools message: {messages}") + response = self.llm.create_with_tools(messages, self.tools) + logger.info(f"act agent call with tools response: {response}") + + # Process initial response + result = self._process_response(response, messages, step_index) + logger.info( + f"iter {i} for {self.agent_instance.instance_name} call tools result: {result}" + ) + if result: + return result - if max_iteration > 1: - return self._handle_max_iteration(messages, step_index) - return messages[-1].get("content") + if max_iteration > 1: + return self._handle_max_iteration(messages, step_index) + return messages[-1].get("content") + finally: + # 在所有step执行完成后,检查并关闭浏览器会话 + self._cleanup_browser_session() def _process_response(self, response, messages, step_index): if not response.tool_calls: diff --git a/app/cosight/tool/web_util.py b/app/cosight/tool/web_util.py index 1f841b9..f210a0a 100644 --- a/app/cosight/tool/web_util.py +++ b/app/cosight/tool/web_util.py @@ -24,7 +24,6 @@ from browser_use.llm import ChatOpenAI from app.common.logger_util import logger -from config.config import get_browser_model_config _browser_loop: Optional[asyncio.AbstractEventLoop] = None _browser_loop_thread: Optional[threading.Thread] = None @@ -59,7 +58,7 @@ def _env_float(name: str, default: float) -> float: return default -def _env_int(name: str, default: Optional[int] = None) -> Optional[int]: +def _env_int(name: str, default: int = 1) -> int: value = os.environ.get(name) if value is None or not value.strip(): return default @@ -130,7 +129,9 @@ async def create_browser_session(): server=proxy_url, username=proxy_user, password=proxy_password, - bypass=os.environ.get("BROWSER_PROXY_BYPASS", "localhost,127.0.0.1,*.internal"), + bypass=os.environ.get( + "BROWSER_PROXY_BYPASS", "localhost,127.0.0.1,*.internal" + ), ) profile = BrowserProfile( @@ -175,22 +176,8 @@ class WebToolkit: _shared_browser_session: Optional[BrowserSession] = None _session_lock: Optional[asyncio.Lock] = None - def __init__(self, llm_config=None): - """ - 初始化WebToolkit - - Args: - llm_config: 可选的LLM配置,如果不提供则使用专门的浏览器自动化模型配置 - """ - if llm_config is None: - # 使用专门的浏览器自动化模型配置 - self.llm_config: dict = get_browser_model_config() - logger.info("使用专门的浏览器自动化模型配置") - else: - # 使用提供的配置 - self.llm_config: dict = llm_config - logger.info("使用提供的LLM配置") - + def __init__(self, llm_config): + self.llm_config: dict = llm_config self._llm: Optional[ChatOpenAI] = None @classmethod @@ -219,6 +206,77 @@ async def _reset_browser_session(cls) -> None: finally: cls._shared_browser_session = None + @classmethod + def has_active_browser_session(cls) -> bool: + """Check if there is an active browser session. + + This method allows external agents to check if a browser session currently exists. + + Returns: + bool: True if there is an active browser session, False otherwise. + """ + return cls._shared_browser_session is not None + + @classmethod + def check_browser_session(cls, task_requires_browser: bool) -> str: + """Check browser session status and auto-close if not needed for current task. + + This method allows external actor_agent to: + 1. Check if a browser session currently exists + 2. Auto-close the browser if the current task does not require it + + Args: + task_requires_browser (bool): Whether the current task requires browser interaction. + If False and a browser session exists, it will be closed automatically. + + Returns: + str: A message indicating the browser session status and any actions taken. + """ + has_session = cls.has_active_browser_session() + + if not has_session: + logger.info("No active browser session exists") + return "No active browser session exists" + + if not task_requires_browser: + logger.info( + "Browser session exists but current task does not require it - auto-closing" + ) + try: + _run_in_browser_loop(cls._reset_browser_session()) + logger.info("Browser session auto-closed successfully") + return "Browser session existed but was not needed for current task - closed successfully" + except Exception as e: + logger.error( + f"Failed to auto-close browser session: {str(e)}", exc_info=True + ) + return f"Failed to auto-close browser session: {str(e)}" + else: + logger.info("Browser session exists and is available for current task") + return "Browser session exists and is ready for use in current task" + + @classmethod + def close_browser(cls) -> str: + """Close the shared browser session. + + This method allows external agents to explicitly close the browser when it's no longer needed. + For example, during the planning phase, if the agent determines that the current browser + session is no longer required, it can call this method to clean up resources. + + Returns: + str: A message indicating whether the browser was closed successfully or if there was no active session. + """ + logger.info("External request to close shared browser session") + try: + _run_in_browser_loop(cls._reset_browser_session()) + logger.info( + "Shared browser session closed successfully by external request" + ) + return "Browser session closed successfully" + except Exception as e: + logger.error(f"Failed to close browser session: {str(e)}", exc_info=True) + return f"Failed to close browser session: {str(e)}" + def browser_use(self, task_prompt: str): r"""A powerful toolkit which can simulate the browser interaction to solve the task which needs multi-step actions. @@ -247,35 +305,32 @@ async def inner_browser_use(self, task_prompt): try: browser_session = await self._get_shared_browser_session() if self._llm is None: - llm_kwargs = {**self.llm_config} - llm_kwargs.setdefault("temperature", 0.0) - llm_kwargs["add_schema_to_system_prompt"] = _env_bool( - "ADD_SCHEMA_TO_SYSTEM_PROMPT", - llm_kwargs.get("add_schema_to_system_prompt", True), + self._llm = ChatOpenAI( + **self.llm_config, + max_completion_tokens=8192, + temperature=0.0, + add_schema_to_system_prompt=_env_bool( + "ADD_SCHEMA_TO_SYSTEM_PROMPT", True + ), ) - self._llm = ChatOpenAI(**llm_kwargs) # 创建agent,复用共享的browser session - agent_kwargs: dict[str, Any] = dict( + + agent = Agent( task=task_prompt, browser_session=browser_session, # 使用共享的browser session llm=self._llm, use_vision=False, - max_actions_per_step=1, + max_actions_per_step=_env_int("MAX_ACTIONS_PER_STEP", 1), directly_open_url=False, flash_mode=_env_bool("FLASH_MODE", True), + include_tool_call_examples=True, extend_system_message=""" -ADDITIONAL INSTRUCTIONS: +YOU **MUST** FOLLOW THESE INSTRUCTIONS: - Your answers **MUST NOT** contain any of the markdown code blocks such as ``` or ```json. - **Directly** return the final answer as a plain text **without any additional formatting**. """, ) - max_tokens_per_step = _env_int("MAX_TOKENS_PER_STEP") - if max_tokens_per_step is not None: - agent_kwargs["max_tokens_per_step"] = max_tokens_per_step - - agent = Agent(**agent_kwargs) - # 运行agent result = await agent.run() final_result = result.final_result() @@ -288,4 +343,4 @@ async def inner_browser_use(self, task_prompt): await self._reset_browser_session() return f"fail, because: {str(e)}" # 注意:不要在这里关闭browser_session,因为它是共享的 - # browser session会通过keep_alive=True保持活跃,供后续agent复用 \ No newline at end of file + # browser session会通过keep_alive=True保持活跃,供后续agent复用 diff --git a/requirements.txt b/requirements.txt index 8e2c223..1cdf57b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,7 @@ ffmpeg-python==0.2.0 baidusearch==1.0.3 retry==0.9.2 googlesearch-python==1.3.0 -browser-use==0.7.9 +browser-use==0.8.1 xmltodict==0.14.2 soundfile==0.13.1 wikipedia==1.4.0