From 4374a11c3a4a8c7da3ed02d23d244ab936ad96aa Mon Sep 17 00:00:00 2001 From: Shriyansh Agnihotri Date: Thu, 28 Nov 2024 20:12:57 +0530 Subject: [PATCH] Major bugfix: 1) Solving tabindex issue on pressing backspace while entering text 2) removing prompt compression as it was causing hallucination 3) fixing bulk text fill to improve speed, especially in case of filling OTPs. --- .../core/agents/browser_nav_agent.py | 46 +++++++++---- .../core/memory/prompt_compressor.py | 6 +- .../core/tools/enter_text_using_selector.py | 64 +++++++++++-------- testzeus_hercules/utils/logger.py | 2 +- 4 files changed, 75 insertions(+), 43 deletions(-) diff --git a/testzeus_hercules/core/agents/browser_nav_agent.py b/testzeus_hercules/core/agents/browser_nav_agent.py index ef79463..38ca6a4 100644 --- a/testzeus_hercules/core/agents/browser_nav_agent.py +++ b/testzeus_hercules/core/agents/browser_nav_agent.py @@ -56,12 +56,20 @@ def __init__(self, model_config_list, llm_config_params: dict[str, Any], system_ system_message = "\n".join(system_prompt) else: system_message = system_prompt - logger.info(f"Using custom system prompt for BrowserNavAgent: {system_message}") - - system_message = system_message + "\n" + f"Today's date is {datetime.now().strftime('%d %B %Y')}" + logger.info( + f"Using custom system prompt for BrowserNavAgent: {system_message}" + ) + + system_message = ( + system_message + + "\n" + + f"Today's date is {datetime.now().strftime('%d %B %Y')}" + ) if user_ltm: # add the user LTM to the system prompt if it exists user_ltm = "\n" + user_ltm - system_message = Template(system_message).substitute(basic_user_information=user_ltm) + system_message = Template(system_message).substitute( + basic_user_information=user_ltm + ) logger.info(f"Browser nav agent using model: {model_config_list[0]['model']}") self.agent = autogen.ConversableAgent( name="browser_navigation_agent", @@ -71,7 +79,7 @@ def __init__(self, model_config_list, llm_config_params: dict[str, Any], system_ 
**llm_config_params, # unpack all the name value pairs in llm_config_params as is }, ) - # add_text_compressor(self.agent) + add_text_compressor(self.agent) self.__register_tools() def __get_ltm(self) -> str | None: @@ -94,25 +102,37 @@ def __register_tools(self) -> None: # self.agent.register_for_llm(description=LLM_PROMPTS["ENTER_TEXT_AND_CLICK_PROMPT"])(enter_text_and_click) # self.browser_nav_executor.register_for_execution()(enter_text_and_click) - self.agent.register_for_llm(description=LLM_PROMPTS["GET_DOM_WITH_CONTENT_TYPE_PROMPT"])(get_dom_with_content_type) + self.agent.register_for_llm( + description=LLM_PROMPTS["GET_DOM_WITH_CONTENT_TYPE_PROMPT"] + )(get_dom_with_content_type) self.browser_nav_executor.register_for_execution()(get_dom_with_content_type) - self.agent.register_for_llm(description=LLM_PROMPTS["CLICK_PROMPT"])(click_element) + self.agent.register_for_llm(description=LLM_PROMPTS["CLICK_PROMPT"])( + click_element + ) self.browser_nav_executor.register_for_execution()(click_element) self.agent.register_for_llm(description=LLM_PROMPTS["GET_URL_PROMPT"])(geturl) self.browser_nav_executor.register_for_execution()(geturl) - self.agent.register_for_llm(description=LLM_PROMPTS["BULK_ENTER_TEXT_PROMPT"])(bulk_enter_text) + self.agent.register_for_llm(description=LLM_PROMPTS["BULK_ENTER_TEXT_PROMPT"])( + bulk_enter_text + ) self.browser_nav_executor.register_for_execution()(bulk_enter_text) - self.agent.register_for_llm(description=LLM_PROMPTS["ENTER_TEXT_PROMPT"])(entertext) + self.agent.register_for_llm(description=LLM_PROMPTS["ENTER_TEXT_PROMPT"])( + entertext + ) self.browser_nav_executor.register_for_execution()(entertext) - self.agent.register_for_llm(description=LLM_PROMPTS["PRESS_KEY_COMBINATION_PROMPT"])(press_key_combination) + self.agent.register_for_llm( + description=LLM_PROMPTS["PRESS_KEY_COMBINATION_PROMPT"] + )(press_key_combination) self.browser_nav_executor.register_for_execution()(press_key_combination) - 
self.agent.register_for_llm(description=LLM_PROMPTS["EXTRACT_TEXT_FROM_PDF_PROMPT"])(extract_text_from_pdf) + self.agent.register_for_llm( + description=LLM_PROMPTS["EXTRACT_TEXT_FROM_PDF_PROMPT"] + )(extract_text_from_pdf) self.browser_nav_executor.register_for_execution()(extract_text_from_pdf) self.agent.register_for_llm(description=LLM_PROMPTS["HOVER_PROMPT"])(hover) @@ -162,7 +182,9 @@ def __load_additional_tools(self) -> None: elif tool_path.endswith(".py") and os.path.isfile(tool_path): # If the path is a specific .py file, load it directly - module_name = os.path.basename(tool_path)[:-3] # Strip .py extension + module_name = os.path.basename(tool_path)[ + :-3 + ] # Strip .py extension directory_path = os.path.dirname(tool_path).replace("/", ".") module_path = f"{directory_path}.{module_name}" importlib.import_module(module_path) diff --git a/testzeus_hercules/core/memory/prompt_compressor.py b/testzeus_hercules/core/memory/prompt_compressor.py index 159675d..b5bd18b 100644 --- a/testzeus_hercules/core/memory/prompt_compressor.py +++ b/testzeus_hercules/core/memory/prompt_compressor.py @@ -4,8 +4,8 @@ from autogen.agentchat.contrib.capabilities.transforms import TextMessageCompressor from testzeus_hercules.utils.logger import logger -TEXT_COMPRESSOR_LLM = LLMLingua() -TEXT_COMPRESSOR = TextMessageCompressor(text_compressor=TEXT_COMPRESSOR_LLM) +# TEXT_COMPRESSOR_LLM = LLMLingua() +# TEXT_COMPRESSOR = TextMessageCompressor(text_compressor=TEXT_COMPRESSOR_LLM) def add_text_compressor(agent: ConversableAgent) -> None: @@ -14,6 +14,8 @@ def add_text_compressor(agent: ConversableAgent) -> None: Args: agent (ConversableAgent): The agent that needs text compression in prompts """ + return + # removed the text compressor as it's making the prompt lossy and causing lots of hallucination. 
context_handling = transform_messages.TransformMessages( transforms=[TEXT_COMPRESSOR] ) diff --git a/testzeus_hercules/core/tools/enter_text_using_selector.py b/testzeus_hercules/core/tools/enter_text_using_selector.py index c4468ce..7d08f15 100644 --- a/testzeus_hercules/core/tools/enter_text_using_selector.py +++ b/testzeus_hercules/core/tools/enter_text_using_selector.py @@ -125,7 +125,9 @@ async def custom_fill_element(page: Page, selector: str, text_to_enter: str): ) logger.debug(f"custom_fill_element result: {result}") except Exception as e: - logger.error(f"Error in custom_fill_element, Selector: {selector}, Text: {text_to_enter}. Error: {str(e)}") + logger.error( + f"Error in custom_fill_element, Selector: {selector}, Text: {text_to_enter}. Error: {str(e)}" + ) raise @@ -249,18 +251,24 @@ def detect_dom_changes(changes: str): # type: ignore ) result = await do_entertext(page, query_selector, text_to_enter) - await asyncio.sleep(0.1) # sleep for 100ms to allow the mutation observer to detect changes + await asyncio.sleep( + 0.1 + ) # sleep for 100ms to allow the mutation observer to detect changes unsubscribe(detect_dom_changes) await browser_manager.take_screenshots(f"{function_name}_end", page) - await browser_manager.notify_user(result["summary_message"], message_type=MessageType.ACTION) + await browser_manager.notify_user( + result["summary_message"], message_type=MessageType.ACTION + ) if dom_changes_detected: return f"{result['detailed_message']}.\n As a consequence of this action, new elements have appeared in view: {dom_changes_detected}. This means that the action of entering text {text_to_enter} is not yet executed and needs further interaction. Get all_fields DOM to complete the interaction." 
return result["detailed_message"] -async def do_entertext(page: Page, selector: str, text_to_enter: str, use_keyboard_fill: bool = True): +async def do_entertext( + page: Page, selector: str, text_to_enter: str, use_keyboard_fill: bool = True +): """ Performs the text entry operation on a DOM or Shadow DOM element. @@ -359,18 +367,20 @@ async def find_element_in_shadow_dom(page: Page, selector: str): if use_keyboard_fill: await elem.focus() - await asyncio.sleep(0.1) + await asyncio.sleep(0.05) await press_key_combination("Control+A") - await asyncio.sleep(0.1) - await press_key_combination("Backspace") - await asyncio.sleep(0.1) + await asyncio.sleep(0.05) + await press_key_combination("Delete") + await asyncio.sleep(0.05) logger.debug(f"Focused element with selector {selector} to enter text") await page.keyboard.type(text_to_enter, delay=1) else: await custom_fill_element(page, selector, text_to_enter) await elem.focus() - logger.info(f'Success. Text "{text_to_enter}" set successfully in the element with selector {selector}') + logger.info( + f'Success. Text "{text_to_enter}" set successfully in the element with selector {selector}' + ) success_msg = f'Success. Text "{text_to_enter}" set successfully in the element with selector {selector}' return { "summary_message": success_msg, @@ -385,46 +395,44 @@ async def find_element_in_shadow_dom(page: Page, selector: str): async def bulk_enter_text( entries: Annotated[ - List[dict[str, str]], - "List of objects, each containing 'query_selector' and 'text'.", + List[EnterTextEntry], + "List of EnterTextEntry objects. An object containing 'query_selector' (DOM selector query using mmid attribute e.g. 
[mmid='114']) and 'text' (text to enter on the element).", ] # noqa: UP006 ) -> Annotated[ - List[dict[str, str]], - "List of dictionaries, each containing 'query_selector' and the result of the operation.", + List[str], + "List of results from the entertext operation for each entry.", ]: # noqa: UP006 """ Enters text into multiple DOM elements using a bulk operation. This function enters text into multiple DOM elements using a bulk operation. - It takes a list of dictionaries, where each dictionary contains a 'query_selector' and 'text' pair. + It takes a list of EnterTextEntry objects, where each contains 'query_selector' and 'text' attributes. The function internally calls the 'entertext' function to perform the text entry operation for each entry. Args: - entries: List of objects, each containing 'query_selector' and 'text'. + entries: List of EnterTextEntry objects. Returns: - List of dictionaries, each containing 'query_selector' and the result of the operation. + List of results from the entertext operation for each entry. Example: entries = [ - {"query_selector": "#username", "text": "test_user"}, - {"query_selector": "#password", "text": "test_password"} + EnterTextEntry(query_selector="#username", text="test_user"), + EnterTextEntry(query_selector="#password", text="test_password") ] results = await bulk_enter_text(entries) Note: - - Each entry in the 'entries' list should be a dictionary with 'query_selector' and 'text' keys. - - The result is a list of dictionaries, where each dictionary contains the 'query_selector' and the result of the operation. + - Each entry in the 'entries' list should be an instance of EnterTextEntry. + - The result is a list of strings returned by the 'entertext' function for each entry. 
""" add_event(EventType.INTERACTION, EventData(detail="bulk_enter_text")) - results: List[dict[str, str]] = [] # noqa: UP006 + results: List[str] = [] # noqa: UP006 logger.info("Executing bulk Enter Text Command") for entry in entries: - query_selector = entry["query_selector"] - text_to_enter = entry["text"] - logger.info(f"Entering text: {text_to_enter} in element with selector: {query_selector}") - result = await entertext(EnterTextEntry(query_selector=query_selector, text=text_to_enter)) - - results.append({"query_selector": query_selector, "result": result}) - + logger.info( + f"Entering text: {entry['text']} in element with selector: {entry['query_selector']}" + ) + result = await entertext(entry) + results.append(result) return results diff --git a/testzeus_hercules/utils/logger.py b/testzeus_hercules/utils/logger.py index b9a680a..74243d1 100644 --- a/testzeus_hercules/utils/logger.py +++ b/testzeus_hercules/utils/logger.py @@ -37,7 +37,7 @@ def configure_logger(level: str = "INFO") -> None: http_loggers = ["openai", "autogen"] for http_logger in http_loggers: lib_logger = logging.getLogger(http_logger) - lib_logger.setLevel(logging.DEBUG) + lib_logger.setLevel(logging.INFO) lib_logger.handlers = [] # Clear any existing handlers lib_logger.addHandler(handler) # Add the same handler