Skip to content

Commit

Permalink
Major bugfix:
Browse files Browse the repository at this point in the history
1) Solving tabindex issue on pressing backspace while entering text
2) removing prompt compression as it was causing hallucination
3) fixing bulk text fill to improve speed, especially when filling OTPs.
  • Loading branch information
shriyanshagnihotri committed Nov 28, 2024
1 parent b10ff8e commit 4374a11
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 43 deletions.
46 changes: 34 additions & 12 deletions testzeus_hercules/core/agents/browser_nav_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,20 @@ def __init__(self, model_config_list, llm_config_params: dict[str, Any], system_
system_message = "\n".join(system_prompt)
else:
system_message = system_prompt
logger.info(f"Using custom system prompt for BrowserNavAgent: {system_message}")

system_message = system_message + "\n" + f"Today's date is {datetime.now().strftime('%d %B %Y')}"
logger.info(
f"Using custom system prompt for BrowserNavAgent: {system_message}"
)

system_message = (
system_message
+ "\n"
+ f"Today's date is {datetime.now().strftime('%d %B %Y')}"
)
if user_ltm: # add the user LTM to the system prompt if it exists
user_ltm = "\n" + user_ltm
system_message = Template(system_message).substitute(basic_user_information=user_ltm)
system_message = Template(system_message).substitute(
basic_user_information=user_ltm
)
logger.info(f"Browser nav agent using model: {model_config_list[0]['model']}")
self.agent = autogen.ConversableAgent(
name="browser_navigation_agent",
Expand All @@ -71,7 +79,7 @@ def __init__(self, model_config_list, llm_config_params: dict[str, Any], system_
**llm_config_params, # unpack all the name value pairs in llm_config_params as is
},
)
# add_text_compressor(self.agent)
add_text_compressor(self.agent)
self.__register_tools()

def __get_ltm(self) -> str | None:
Expand All @@ -94,25 +102,37 @@ def __register_tools(self) -> None:
# self.agent.register_for_llm(description=LLM_PROMPTS["ENTER_TEXT_AND_CLICK_PROMPT"])(enter_text_and_click)
# self.browser_nav_executor.register_for_execution()(enter_text_and_click)

self.agent.register_for_llm(description=LLM_PROMPTS["GET_DOM_WITH_CONTENT_TYPE_PROMPT"])(get_dom_with_content_type)
self.agent.register_for_llm(
description=LLM_PROMPTS["GET_DOM_WITH_CONTENT_TYPE_PROMPT"]
)(get_dom_with_content_type)
self.browser_nav_executor.register_for_execution()(get_dom_with_content_type)

self.agent.register_for_llm(description=LLM_PROMPTS["CLICK_PROMPT"])(click_element)
self.agent.register_for_llm(description=LLM_PROMPTS["CLICK_PROMPT"])(
click_element
)
self.browser_nav_executor.register_for_execution()(click_element)

self.agent.register_for_llm(description=LLM_PROMPTS["GET_URL_PROMPT"])(geturl)
self.browser_nav_executor.register_for_execution()(geturl)

self.agent.register_for_llm(description=LLM_PROMPTS["BULK_ENTER_TEXT_PROMPT"])(bulk_enter_text)
self.agent.register_for_llm(description=LLM_PROMPTS["BULK_ENTER_TEXT_PROMPT"])(
bulk_enter_text
)
self.browser_nav_executor.register_for_execution()(bulk_enter_text)

self.agent.register_for_llm(description=LLM_PROMPTS["ENTER_TEXT_PROMPT"])(entertext)
self.agent.register_for_llm(description=LLM_PROMPTS["ENTER_TEXT_PROMPT"])(
entertext
)
self.browser_nav_executor.register_for_execution()(entertext)

self.agent.register_for_llm(description=LLM_PROMPTS["PRESS_KEY_COMBINATION_PROMPT"])(press_key_combination)
self.agent.register_for_llm(
description=LLM_PROMPTS["PRESS_KEY_COMBINATION_PROMPT"]
)(press_key_combination)
self.browser_nav_executor.register_for_execution()(press_key_combination)

self.agent.register_for_llm(description=LLM_PROMPTS["EXTRACT_TEXT_FROM_PDF_PROMPT"])(extract_text_from_pdf)
self.agent.register_for_llm(
description=LLM_PROMPTS["EXTRACT_TEXT_FROM_PDF_PROMPT"]
)(extract_text_from_pdf)
self.browser_nav_executor.register_for_execution()(extract_text_from_pdf)

self.agent.register_for_llm(description=LLM_PROMPTS["HOVER_PROMPT"])(hover)
Expand Down Expand Up @@ -162,7 +182,9 @@ def __load_additional_tools(self) -> None:

elif tool_path.endswith(".py") and os.path.isfile(tool_path):
# If the path is a specific .py file, load it directly
module_name = os.path.basename(tool_path)[:-3] # Strip .py extension
module_name = os.path.basename(tool_path)[
:-3
] # Strip .py extension
directory_path = os.path.dirname(tool_path).replace("/", ".")
module_path = f"{directory_path}.{module_name}"
importlib.import_module(module_path)
Expand Down
6 changes: 4 additions & 2 deletions testzeus_hercules/core/memory/prompt_compressor.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
from autogen.agentchat.contrib.capabilities.transforms import TextMessageCompressor
from testzeus_hercules.utils.logger import logger

TEXT_COMPRESSOR_LLM = LLMLingua()
TEXT_COMPRESSOR = TextMessageCompressor(text_compressor=TEXT_COMPRESSOR_LLM)
# TEXT_COMPRESSOR_LLM = LLMLingua()
# TEXT_COMPRESSOR = TextMessageCompressor(text_compressor=TEXT_COMPRESSOR_LLM)


def add_text_compressor(agent: ConversableAgent) -> None:
Expand All @@ -14,6 +14,8 @@ def add_text_compressor(agent: ConversableAgent) -> None:
Args:
agent (ConversableAgent): The agent that needs text compression in prompts
"""
return
# Removed the text compressor because it makes the prompt lossy and causes a lot of hallucination.
context_handling = transform_messages.TransformMessages(
transforms=[TEXT_COMPRESSOR]
)
Expand Down
64 changes: 36 additions & 28 deletions testzeus_hercules/core/tools/enter_text_using_selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,9 @@ async def custom_fill_element(page: Page, selector: str, text_to_enter: str):
)
logger.debug(f"custom_fill_element result: {result}")
except Exception as e:
logger.error(f"Error in custom_fill_element, Selector: {selector}, Text: {text_to_enter}. Error: {str(e)}")
logger.error(
f"Error in custom_fill_element, Selector: {selector}, Text: {text_to_enter}. Error: {str(e)}"
)
raise


Expand Down Expand Up @@ -249,18 +251,24 @@ def detect_dom_changes(changes: str): # type: ignore
)

result = await do_entertext(page, query_selector, text_to_enter)
await asyncio.sleep(0.1) # sleep for 100ms to allow the mutation observer to detect changes
await asyncio.sleep(
0.1
) # sleep for 100ms to allow the mutation observer to detect changes
unsubscribe(detect_dom_changes)

await browser_manager.take_screenshots(f"{function_name}_end", page)

await browser_manager.notify_user(result["summary_message"], message_type=MessageType.ACTION)
await browser_manager.notify_user(
result["summary_message"], message_type=MessageType.ACTION
)
if dom_changes_detected:
return f"{result['detailed_message']}.\n As a consequence of this action, new elements have appeared in view: {dom_changes_detected}. This means that the action of entering text {text_to_enter} is not yet executed and needs further interaction. Get all_fields DOM to complete the interaction."
return result["detailed_message"]


async def do_entertext(page: Page, selector: str, text_to_enter: str, use_keyboard_fill: bool = True):
async def do_entertext(
page: Page, selector: str, text_to_enter: str, use_keyboard_fill: bool = True
):
"""
Performs the text entry operation on a DOM or Shadow DOM element.
Expand Down Expand Up @@ -359,18 +367,20 @@ async def find_element_in_shadow_dom(page: Page, selector: str):

if use_keyboard_fill:
await elem.focus()
await asyncio.sleep(0.1)
await asyncio.sleep(0.05)
await press_key_combination("Control+A")
await asyncio.sleep(0.1)
await press_key_combination("Backspace")
await asyncio.sleep(0.1)
await asyncio.sleep(0.05)
await press_key_combination("Delete")
await asyncio.sleep(0.05)
logger.debug(f"Focused element with selector {selector} to enter text")
await page.keyboard.type(text_to_enter, delay=1)
else:
await custom_fill_element(page, selector, text_to_enter)

await elem.focus()
logger.info(f'Success. Text "{text_to_enter}" set successfully in the element with selector {selector}')
logger.info(
f'Success. Text "{text_to_enter}" set successfully in the element with selector {selector}'
)
success_msg = f'Success. Text "{text_to_enter}" set successfully in the element with selector {selector}'
return {
"summary_message": success_msg,
Expand All @@ -385,46 +395,44 @@ async def find_element_in_shadow_dom(page: Page, selector: str):

async def bulk_enter_text(
entries: Annotated[
List[dict[str, str]],
"List of objects, each containing 'query_selector' and 'text'.",
List[EnterTextEntry],
"List of EnterTextEntry objects. An object containing 'query_selector' (DOM selector query using mmid attribute e.g. [mmid='114']) and 'text' (text to enter on the element).",
] # noqa: UP006
) -> Annotated[
List[dict[str, str]],
"List of dictionaries, each containing 'query_selector' and the result of the operation.",
List[str],
"List of results from the entertext operation for each entry.",
]: # noqa: UP006
"""
Enters text into multiple DOM elements using a bulk operation.
This function enters text into multiple DOM elements using a bulk operation.
It takes a list of dictionaries, where each dictionary contains a 'query_selector' and 'text' pair.
It takes a list of EnterTextEntry objects, where each contains 'query_selector' and 'text' attributes.
The function internally calls the 'entertext' function to perform the text entry operation for each entry.
Args:
entries: List of objects, each containing 'query_selector' and 'text'.
entries: List of EnterTextEntry objects.
Returns:
List of dictionaries, each containing 'query_selector' and the result of the operation.
List of results from the entertext operation for each entry.
Example:
entries = [
{"query_selector": "#username", "text": "test_user"},
{"query_selector": "#password", "text": "test_password"}
EnterTextEntry(query_selector="#username", text="test_user"),
EnterTextEntry(query_selector="#password", text="test_password")
]
results = await bulk_enter_text(entries)
Note:
- Each entry in the 'entries' list should be a dictionary with 'query_selector' and 'text' keys.
- The result is a list of dictionaries, where each dictionary contains the 'query_selector' and the result of the operation.
- Each entry in the 'entries' list should be an instance of EnterTextEntry.
- The result is a list of strings returned by the 'entertext' function for each entry.
"""
add_event(EventType.INTERACTION, EventData(detail="bulk_enter_text"))
results: List[dict[str, str]] = [] # noqa: UP006
results: List[str] = [] # noqa: UP006
logger.info("Executing bulk Enter Text Command")
for entry in entries:
query_selector = entry["query_selector"]
text_to_enter = entry["text"]
logger.info(f"Entering text: {text_to_enter} in element with selector: {query_selector}")
result = await entertext(EnterTextEntry(query_selector=query_selector, text=text_to_enter))

results.append({"query_selector": query_selector, "result": result})

logger.info(
f"Entering text: {entry['text']} in element with selector: {entry['query_selector']}"
)
result = await entertext(entry)
results.append(result)
return results
2 changes: 1 addition & 1 deletion testzeus_hercules/utils/logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def configure_logger(level: str = "INFO") -> None:
http_loggers = ["openai", "autogen"]
for http_logger in http_loggers:
lib_logger = logging.getLogger(http_logger)
lib_logger.setLevel(logging.DEBUG)
lib_logger.setLevel(logging.INFO)
lib_logger.handlers = [] # Clear any existing handlers
lib_logger.addHandler(handler) # Add the same handler

Expand Down

0 comments on commit 4374a11

Please sign in to comment.