47 changes: 47 additions & 0 deletions .env.example
@@ -0,0 +1,47 @@
# =============================================================================
# Autonomous Browser AI Agent - Environment Configuration
# =============================================================================
# Copy this file to .env and fill in your API keys
# At least ONE LLM provider is required for the agent to function
# =============================================================================

# -----------------------------------------------------------------------------
# AWS Bedrock Configuration (Claude, Titan, etc.)
# -----------------------------------------------------------------------------
# AWS_ACCESS_KEY_ID=your_access_key
# AWS_SECRET_ACCESS_KEY=your_secret_key
# AWS_REGION=us-east-1
# BEDROCK_MODEL_ID=anthropic.claude-3-sonnet-20240229-v1:0

# -----------------------------------------------------------------------------
# Google Gemini Configuration
# -----------------------------------------------------------------------------
# GEMINI_API_KEY=your_gemini_api_key
# GEMINI_MODEL=gemini-1.5-pro

# -----------------------------------------------------------------------------
# OpenAI Configuration
# -----------------------------------------------------------------------------
# OPENAI_API_KEY=your_openai_api_key
# OPENAI_MODEL=gpt-4-turbo

# -----------------------------------------------------------------------------
# LLM Provider Selection (bedrock, gemini, openai)
# The orchestrator, planner, and executor can use different models
# -----------------------------------------------------------------------------
# ORCHESTRATOR_PROVIDER=bedrock
# PLANNER_PROVIDER=bedrock
# EXECUTOR_PROVIDER=bedrock
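#
# Example (illustrative, not a recommendation): plan with Gemini while
# executing with OpenAI:
# PLANNER_PROVIDER=gemini
# EXECUTOR_PROVIDER=openai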

# -----------------------------------------------------------------------------
# Agent Configuration
# -----------------------------------------------------------------------------
# MAX_PLANNING_STEPS=10
# MAX_RETRIES=3
# PLANNING_LOOKAHEAD=4

# -----------------------------------------------------------------------------
# Browser Configuration
# -----------------------------------------------------------------------------
# BROWSER_HEADLESS=true
# BROWSER_TIMEOUT=30
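
For reviewers: a minimal sketch of how these variables might be consumed, assuming python-dotenv (added to pyproject.toml in this PR). The `get_provider_name` helper is illustrative and not part of the diff.

# Illustrative only; variable names match .env.example, the helper is hypothetical.
import os

from dotenv import load_dotenv

load_dotenv()  # read .env from the working directory

def get_provider_name(role: str, default: str = "bedrock") -> str:
    """Return the configured provider for a role, e.g. ORCHESTRATOR_PROVIDER."""
    return os.getenv(f"{role.upper()}_PROVIDER", default)

planner_provider = get_provider_name("planner")
headless = os.getenv("BROWSER_HEADLESS", "true").lower() == "true"
timeout_seconds = int(os.getenv("BROWSER_TIMEOUT", "30"))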
10 changes: 8 additions & 2 deletions pyproject.toml
@@ -10,15 +10,21 @@ requires-python = ">=3.10,<3.13"
 dependencies = [
     "playwright>=1.48.0,<2.0.0",
     "pytest>=8.2.0,<9.0.0",
-    "pytest-asyncio>=0.23.6,<0.24.0"
+    "pytest-asyncio>=0.23.6,<0.24.0",
+    "python-dotenv>=1.0.0",
+    "langchain>=0.3.0",
+    "langchain-aws>=0.2.0",
+    "langchain-google-genai>=2.0.0",
+    "langchain-openai>=0.2.0",
+    "pydantic>=2.0.0",
 ]

 [build-system]
 requires = ["hatchling"]
 build-backend = "hatchling.build"

 [tool.hatch.build.targets.wheel]
-packages = ["src/agent", "src/browser", "src/config", "src/controller", "src/examples"]
+packages = ["src/agent", "src/browser", "src/config", "src/controller", "src/examples", "src/llm"]

 [tool.hatch.build.targets.sdist]
 include = [
261 changes: 261 additions & 0 deletions src/agent/evaluator.py
@@ -0,0 +1,261 @@
"""Evaluator Agent - evaluates step results and triggers re-planning.

The evaluator analyzes execution results to determine:
1. Did the step succeed?
2. Is the overall task complete?
3. Should we re-plan?
"""

from __future__ import annotations

import json
from typing import Any, Dict, Optional, TYPE_CHECKING

if TYPE_CHECKING:
    from browser.dom_analyzer import PageStructure
    from llm.base import BaseLLMProvider


EVALUATION_SCHEMA = {
    "type": "object",
    "properties": {
        "success": {
            "type": "boolean",
            "description": "Whether the step executed successfully",
        },
        "task_complete": {
            "type": "boolean",
            "description": "Whether the overall task is now complete",
        },
        "confidence": {
            "type": "number",
            "description": "Confidence level 0-1 in the evaluation",
        },
        "error": {
            "type": "string",
            "description": "Error description if step failed",
        },
        "result": {
            "type": "string",
            "description": "Extracted result if task is complete",
        },
        "should_replan": {
            "type": "boolean",
            "description": "Whether the planner should create a new plan",
        },
        "replan_reason": {
            "type": "string",
            "description": "Why re-planning is needed",
        },
        "next_action_hint": {
            "type": "string",
            "description": "Suggestion for what to do next",
        },
    },
    "required": ["success", "task_complete"],
}
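
# Illustrative payload conforming to EVALUATION_SCHEMA (hypothetical values,
# not produced anywhere in this diff):
# {
#     "success": True,
#     "task_complete": False,
#     "confidence": 0.8,
#     "should_replan": True,
#     "replan_reason": "A cookie banner now covers the target element",
#     "next_action_hint": "Dismiss the banner before retrying the click",
# }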


class EvaluatorAgent:
    """Evaluates execution results and provides feedback for the orchestrator.

    The evaluator:
    1. Checks if an individual step succeeded
    2. Determines if the overall task is complete
    3. Decides whether re-planning is needed
    4. Provides hints for the next action
    """

    SYSTEM_PROMPT = """You are an evaluation agent for a browser automation system.

After each step is executed, you must evaluate:

1. SUCCESS: Did the action complete without errors? Check:
   - Was the element found?
   - Did the action execute?
   - Did the page respond as expected?

2. TASK COMPLETION: Is the user's original task now complete? Consider:
   - Did we achieve the stated goal?
   - Do we have the information/result requested?
   - Are there remaining sub-goals?

3. RE-PLANNING: Should we create a new plan? Consider:
   - Did the page change unexpectedly?
   - Are there new elements we didn't account for?
   - Is the current plan still valid?

Be precise and conservative - only mark task_complete=true when you're confident
the original goal has been achieved."""

    def __init__(self, llm: Optional["BaseLLMProvider"] = None):
        """Initialize evaluator.

        Args:
            llm: LLM provider for intelligent evaluation
        """
        self.llm = llm

    async def evaluate(
        self,
        step: Dict[str, Any],
        result: Dict[str, Any],
        page: Any = None,
        task: str = "",
        remaining_steps: int = 0,
        page_structure: Optional["PageStructure"] = None,
    ) -> Dict[str, Any]:
        """Evaluate a step result.

        Args:
            step: The step that was executed
            result: Result from the executor
            page: Current page state
            task: Original task description
            remaining_steps: Steps remaining in current plan
            page_structure: Current DOM structure

        Returns:
            Evaluation result dictionary
        """
        # Quick check for obvious failures
        if not result.get("ok"):
            return {
                "success": False,
                "task_complete": False,
                "error": result.get("error", "Unknown error"),
                "should_replan": True,
                "replan_reason": f"Step failed: {result.get('error')}",
            }

        # If no LLM, use simple heuristics
        if not self.llm:
            return self._simple_evaluation(step, result, remaining_steps)

        # Use LLM for intelligent evaluation
        return await self._llm_evaluation(
            step, result, page, task, remaining_steps, page_structure
        )
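
    # Illustrative call from the orchestrator loop (hypothetical step/result
    # shapes; the real dictionaries come from the planner and executor):
    #
    #     evaluation = await evaluator.evaluate(
    #         step={"type": "click", "selector": "#submit"},
    #         result={"ok": True},
    #         task="Submit the signup form",
    #         remaining_steps=2,
    #     )
    #     if evaluation.get("should_replan"):
    #         ...  # ask the planner for a new plan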

    def _simple_evaluation(
        self,
        step: Dict[str, Any],
        result: Dict[str, Any],
        remaining_steps: int,
    ) -> Dict[str, Any]:
        """Simple rule-based evaluation."""
        step_type = step.get("type")

        # Extract text steps might complete the task
        if step_type == "extract_text" and result.get("result"):
            extracted = result.get("result", "")
            if extracted and len(extracted) > 10:
                # We got meaningful content
                if remaining_steps == 0:
                    return {
                        "success": True,
                        "task_complete": True,
                        "result": extracted,
                        "confidence": 0.7,
                    }

        # Default: step succeeded, continue with plan
        return {
            "success": True,
            "task_complete": remaining_steps == 0,
            "confidence": 0.5,
        }

    async def _llm_evaluation(
        self,
        step: Dict[str, Any],
        result: Dict[str, Any],
        page: Any,
        task: str,
        remaining_steps: int,
        page_structure: Optional["PageStructure"],
    ) -> Dict[str, Any]:
        """Use LLM for intelligent evaluation."""

        # Get current DOM context
        dom_context = ""
        if page_structure:
            dom_context = page_structure.to_prompt_context()
        elif page:
            from browser.dom_analyzer import DOMAnalyzer
            analyzer = DOMAnalyzer()
            structure = await analyzer.analyze(page)
            dom_context = structure.to_prompt_context()

        # default=str guards against non-serializable values the executor
        # may attach to its result dictionary.
        prompt = f"""{self.SYSTEM_PROMPT}

## ORIGINAL TASK
{task}

## STEP EXECUTED
{json.dumps(step, indent=2, default=str)}

## EXECUTION RESULT
{json.dumps({k: v for k, v in result.items() if k != 'page'}, indent=2, default=str)}

## CURRENT PAGE STATE
{dom_context}

## REMAINING PLANNED STEPS
{remaining_steps}

Evaluate this step and provide your assessment."""

        try:
            evaluation = await self.llm.complete_json(prompt, EVALUATION_SCHEMA)
            return evaluation
        except Exception as e:
            # Fail open on LLM errors: assume the step succeeded but the task
            # is not complete, so the orchestrator keeps executing the plan.
            return {
                "success": True,
                "task_complete": False,
                "error": f"Evaluation failed: {e}",
            }

    async def check_task_completion(
        self,
        task: str,
        execution_log: list,
        page: Any = None,
    ) -> Dict[str, Any]:
        """Check if the overall task is complete based on execution history.

        Called at the end of execution or when steps are exhausted.
        """
        if not self.llm:
            # Simple check: did any step extract meaningful content?
            for entry in execution_log:
                result = entry.get("result", {})
                if result.get("ok") and result.get("result"):
                    return {
                        "complete": True,
                        "result": result.get("result"),
                    }
            return {"complete": False, "reason": "No meaningful result extracted"}

        prompt = f"""Review this task execution and determine if the task was completed:

Task: {task}

Execution log:
{json.dumps(execution_log, indent=2, default=str)}

Was the task completed successfully? What was the final result?"""

        try:
            result = await self.llm.complete_json(prompt, {
                "type": "object",
                "properties": {
                    "complete": {"type": "boolean"},
                    "result": {"type": "string"},
                    "reason": {"type": "string"},
                },
            })
            return result
        except Exception:
            return {"complete": False, "reason": "Evaluation failed"}