diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..28f833a --- /dev/null +++ b/.dockerignore @@ -0,0 +1,60 @@ +# Git +.git +.gitignore + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +*.egg-info/ +.eggs/ +dist/ +build/ +*.egg + +# Virtual environments +.venv/ +venv/ +ENV/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo + +# Testing +.pytest_cache/ +.coverage +htmlcov/ +.tox/ +.nox/ + +# Playwright +playwright-report/ +test-results/ + +# Environment +.env +.env.local +.env.*.local + +# Output +output/ +screenshots/ +*.png +*.jpg + +# Docs (keep in image if needed) +# docs/ + +# Tests (not needed in production image) +tests/ + +# Misc +*.log +*.tmp +.DS_Store +Thumbs.db diff --git a/.gitignore b/.gitignore index 2a995dd..ebd6240 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,8 @@ __pycache__/ .vscode/ playwright-report/ test-results/ + +# Environment variables (NEVER commit!) +.env +.env.local +.env.*.local diff --git a/Dockerfile b/Dockerfile index 9ae1570..def264d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,32 +1,86 @@ -FROM Comingsoon -# FROM python:3.11-slim AS builder - -# WORKDIR /app - -# RUN apt-get update && apt-get install -y --no-install-recommends \ -# wget \ -# gnupg \ -# ca-certificates \ -# fonts-liberation \ -# libasound2 \ -# libatk-bridge2.0-0 \ -# libatk1.0-0 \ -# libatspi2.0-0 \ -# libcups2 \ -# libdbus-1-3 \ -# libdrm2 \ -# libgbm1 \ -# libgtk-3-0 \ -# libnspr4 \ -# libnss3 \ -# libwayland-client0 \ -# libxcomposite1 \ -# libxdamage1 \ -# libxfixes3 \ -# libxkbcommon0 \ -# libxrandr2 \ -# xdg-utils \ -# && rm -rf /var/lib/apt/lists/* -# COPY pyproject.toml . 
- -# RUN uv sync +# Multi-stage Dockerfile for Autonomous Browser AI Agent +# Stage 1: Build dependencies +FROM python:3.11-slim AS builder + +WORKDIR /app + +# Install build dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Install uv +RUN curl -LsSf https://astral.sh/uv/install.sh | sh +ENV PATH="/root/.local/bin:$PATH" + +# Copy dependency files +COPY pyproject.toml uv.lock ./ + +# Install Python dependencies +RUN uv sync --frozen --no-dev + +# Stage 2: Runtime +FROM python:3.11-slim AS runtime + +WORKDIR /app + +# Install Playwright dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + # Playwright dependencies + libnss3 \ + libnspr4 \ + libatk1.0-0 \ + libatk-bridge2.0-0 \ + libcups2 \ + libdrm2 \ + libdbus-1-3 \ + libatspi2.0-0 \ + libxcomposite1 \ + libxdamage1 \ + libxfixes3 \ + libxrandr2 \ + libgbm1 \ + libxkbcommon0 \ + libasound2 \ + libpango-1.0-0 \ + libcairo2 \ + # Fonts + fonts-liberation \ + fonts-noto-color-emoji \ + # Utilities + wget \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +# Install uv +COPY --from=builder /root/.local/bin/uv /usr/local/bin/uv + +# Copy virtual environment from builder +COPY --from=builder /app/.venv /app/.venv +ENV PATH="/app/.venv/bin:$PATH" +ENV VIRTUAL_ENV="/app/.venv" + +# Copy source code +COPY src/ ./src/ +COPY pyproject.toml ./ + +# Create non-root user for security +RUN useradd -m -u 1000 agent +RUN chown -R agent:agent /app +USER agent + +# Install Playwright browsers as the runtime user so they land in that user's cache (~/.cache/ms-playwright) +RUN playwright install chromium + +# Environment variables (override at runtime) +ENV PYTHONUNBUFFERED=1 +ENV PYTHONDONTWRITEBYTECODE=1 +ENV BROWSER_HEADLESS=true + +# Default command +ENTRYPOINT ["python", "-m", "src"] +CMD ["--help"] + +# Example usage: +# docker build -t browser-agent . 
+# docker run -e GEMINI_API_KEY=xxx browser-agent --url "https://example.com" --task "extract title" diff --git a/README.md b/README.md index 6cbb278..5227811 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,16 @@ # 🤖 Autonomous Browser AI Agent -An intelligent browser automation agent built with **Playwright** and a modular **agent-controller-browser** architecture. Plan tasks in natural language, execute them via browser actions, and collect results — all autonomously. +An intelligent **multi-agent browser automation system** powered by LLMs (Gemini, OpenAI, AWS Bedrock). The agent can understand natural language tasks, plan multi-step browser actions, execute them autonomously, and self-correct when things go wrong. ## ✨ Features -- **Browser Automation**: Full Playwright integration (goto, click, fill, extract text, screenshot, etc.) +- **Multi-Agent Architecture**: Orchestrator → Planner → Executor → Evaluator loop +- **LLM Integration**: AWS Bedrock (Claude), Google Gemini, OpenAI support +- **DOM-Aware Planning**: Intelligent element detection and selector generation +- **Self-Correction**: Automatic re-planning on failures with retry logic +- **Browser Automation**: Full Playwright integration (navigate, click, fill, extract, screenshot) - **Safety Controls**: URL scheme filtering, loop detection, max-step limits -- **Modular Architecture**: Agent → Controller → Browser layers for testability -- **Human-like Behavior**: Configurable random delays to reduce bot detection -- **CLI & API**: Run from command line or integrate into your Python code -- **Extensible Planner**: Pluggable LLM interface for intelligent task planning +- **Human-like Behavior**: Configurable delays to reduce bot detection ## 🚀 Quick Start @@ -20,54 +21,174 @@ An intelligent browser automation agent built with **Playwright** and a modular git clone https://github.com/Kaangml/autonomous_browser_ai_agent.git cd autonomous_browser_ai_agent -# Install dependencies with 
uv (recommended) +# Install dependencies with uv uv sync # Install Playwright browsers uv run playwright install chromium + +# Copy environment template and add your API key +cp .env.example .env +# Edit .env and add your GEMINI_API_KEY (or other provider) ``` -### Run from CLI +### Run Your First Task ```bash -# Extract text from a webpage +# Simple CLI usage uv run python -m src --url "https://example.com" --task "extract the page title" -# Run with visible browser window -uv run python -m src --url "https://example.com" --task "extract the page title" --no-headless +# With visible browser +uv run python -m src --url "https://example.com" --task "extract content" --no-headless -# Output as JSON -uv run python -m src --url "https://example.com" --task "read the main content" --json +# JSON output +uv run python -m src --url "https://example.com" --task "get the heading" --json ``` ### Run Examples ```bash -# Wikipedia example: extract featured article +# Multi-agent example with Gemini +uv run python -m src.examples.example_multiagent + +# Wikipedia extraction uv run python -m src.examples.example_wikipedia -# DuckDuckGo search example +# DuckDuckGo search uv run python -m src.examples.example_search ``` -## πŸ“– Usage in Python +## πŸ—οΈ Architecture + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ ORCHESTRATOR β”‚ +β”‚ Coordinates the multi-agent workflow β”‚ +β”‚ Plan β†’ Execute β†’ Evaluate β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β–Ό β–Ό β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚PLANNER 
β”‚ β”‚EXECUTORβ”‚ β”‚ EVALUATOR β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ - DOM β”‚ β”‚ - Run β”‚ β”‚ - Check β”‚ +β”‚ awareβ”‚ β”‚ stepsβ”‚ β”‚ success β”‚ +β”‚ - LLM β”‚ β”‚ - Retryβ”‚ β”‚ - Trigger β”‚ +β”‚ plan β”‚ β”‚ logicβ”‚ β”‚ replan β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β–Ό + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ BROWSER β”‚ + β”‚ CONTROLLER β”‚ + β”‚ β”‚ + β”‚ - Playwright β”‚ + β”‚ - Safety β”‚ + β”‚ - Actions β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) for detailed documentation. + +## βš™οΈ Configuration + +### Environment Variables (.env) + +```bash +# LLM Provider (choose one) +GEMINI_API_KEY=your_key_here +GEMINI_MODEL=gemini-2.0-flash + +# Or OpenAI +# OPENAI_API_KEY=your_key_here +# OPENAI_MODEL=gpt-4-turbo + +# Or AWS Bedrock +# AWS_ACCESS_KEY_ID=your_key +# AWS_SECRET_ACCESS_KEY=your_secret +# AWS_REGION=us-east-1 +# BEDROCK_MODEL_ID=anthropic.claude-3-sonnet-20240229-v1:0 +``` + +### Browser Settings ```python -import asyncio from browser.browser_config import BrowserConfigManager + +config = BrowserConfigManager.load_from_settings() +# config.config.headless = False # Show browser +# config.config.timeout = 30 # Timeout in seconds +# config.config.human_delay_min = 0.5 # Min delay between actions +``` + +## πŸ“– Python API + +### Using the Multi-Agent System + +```python +import asyncio +from dotenv import load_dotenv +load_dotenv() + +from llm.factory import get_llm_provider +from agent.planner import PlannerAgent +from agent.executor import ExecutorAgent from browser.browser import BrowserManager +from browser.browser_config import BrowserConfigManager from browser.actions import BrowserActions from controller.browser_controller import BrowserController async def main(): - # Setup - config = BrowserConfigManager() - config.config.headless = True + # Setup LLM and agents + llm = 
get_llm_provider() + planner = PlannerAgent(llm=llm) + # Setup browser + config = BrowserConfigManager.load_from_settings() browser = BrowserManager(config) + await browser.start() + actions = BrowserActions(browser) controller = BrowserController(actions) + executor = ExecutorAgent(controller=controller) + + try: + # Plan the task + steps = await planner.plan("Go to example.com and extract the title") + print(f"Plan: {len(steps)} steps") + + # Execute each step + page = None + for step in steps: + result = await executor.execute(step, page) + if result.get("page"): + page = result["page"] + print(f"{step['type']}: {result.get('ok')}") + + finally: + await browser.close() + +asyncio.run(main()) +``` + +### Low-Level Controller Usage +```python +import asyncio +from browser.browser_config import BrowserConfigManager +from browser.browser import BrowserManager +from browser.actions import BrowserActions +from controller.browser_controller import BrowserController + +async def main(): + config = BrowserConfigManager.load_from_settings() + browser = BrowserManager(config) + await browser.start() + + actions = BrowserActions(browser) + controller = BrowserController(actions) + try: # Navigate result = await controller.execute_action({ @@ -81,7 +202,7 @@ async def main(): "type": "extract_text", "args": {"page": page, "selector": "h1"} }) - print(text["result"]) + print(text["result"]) # "Example Domain" finally: await browser.close() @@ -89,24 +210,32 @@ async def main(): asyncio.run(main()) ``` -## πŸ—οΈ Architecture +## πŸ“‹ Supported Actions -``` -src/ -β”œβ”€β”€ agent/ # Task planning and reasoning -β”‚ β”œβ”€β”€ agent.py # Main agent class (plan β†’ execute β†’ reflect) -β”‚ β”œβ”€β”€ planner.py # LLM-based task decomposition -β”‚ └── memory.py # Short-term memory -β”œβ”€β”€ browser/ # Playwright automation layer -β”‚ β”œβ”€β”€ browser.py # Browser lifecycle management -β”‚ β”œβ”€β”€ actions.py # High-level actions (click, fill, extract, etc.) 
-│ └── utils.py # Retry logic, human delays, URL normalization -├── controller/ # Action orchestration -│ └── browser_controller.py # Maps agent actions to browser calls -├── config/ # Configuration management -└── examples/ # Working examples +| Action | Description | Args | +|--------|-------------|------| +| `goto` | Navigate to URL | `url` | +| `click` | Click element | `page`, `selector` | +| `fill` | Type into input | `page`, `selector`, `text` | +| `extract_text` | Get element text | `page`, `selector` | +| `links` | Get all links | `page`, `selector?` | +| `screenshot` | Capture page | `page`, `full_page?` | +| `scroll` | Scroll page | `page`, `selector?` | +| `wait` | Wait for element | `page`, `selector` | + +## 🐳 Docker + +```bash +# Build the image +docker build -t browser-agent . + +# Run with your API key +docker run -e GEMINI_API_KEY=your_key browser-agent \ + --url "https://example.com" --task "extract the title" ``` +See [Dockerfile](Dockerfile) for details. 
+ ## 🧪 Testing ```bash @@ -116,46 +245,35 @@ uv run pytest # Run with verbose output uv run pytest -v -# Run specific test file -uv run pytest tests/browser/test_actions.py -``` - -## 📋 Supported Actions - -| Action | Description | Args | -|--------|-------------|------| -| `goto` | Navigate to URL | `url` | -| `click` | Click element | `page`, `selector` | -| `fill` | Type into input | `page`, `selector`, `text` | -| `extract_text` | Get element text | `page`, `selector` | -| `links` | Get all links | `page`, `selector` (optional) | -| `screenshot` | Capture page | `page`, `full_page` (optional) | +# Run specific test module +uv run pytest tests/agent/ -v -## ⚙️ Configuration +# Run with coverage +uv run pytest --cov=src +``` -Browser behavior can be customized via `BrowserConfigManager`: +## 📚 Documentation -```python -config = BrowserConfigManager() -config.config.headless = False # Show browser window -config.config.timeout = 30 # Timeout in seconds -config.config.viewport_width = 1920 # Browser width -config.config.viewport_height = 1080 # Browser height -config.config.human_delay_min = 0.5 # Min delay between actions -config.config.human_delay_max = 1.5 # Max delay between actions -config.config.channel = "chrome" # Use Chrome instead of Chromium -``` +- [QUICKSTART.md](docs/QUICKSTART.md) - Step-by-step getting started guide +- [ARCHITECTURE.md](docs/ARCHITECTURE.md) - Detailed system architecture +- [ROADMAP.md](docs/ROADMAP.md) - Development roadmap +- [DEV_NOTES.md](docs/DEV_NOTES.md) - Developer notes ## 🗺️ Roadmap -See [docs/ROADMAP.md](docs/ROADMAP.md) for the development roadmap. 
+### Completed ✅ +- [x] Multi-agent LLM system (Orchestrator, Planner, Executor, Evaluator) +- [x] LLM provider abstraction (Bedrock, Gemini, OpenAI) +- [x] DOM-aware intelligent planning +- [x] Retry logic and error handling +- [x] Mock provider for testing -### Planned Features +### Planned 📋 - [ ] Persistent memory (SQLite/vector DB) -- [ ] Real LLM integration (OpenAI, Anthropic, Bedrock) -- [ ] Job queue and workflow management -- [ ] Retry policies with exponential backoff -- [ ] Structured logging and metrics +- [ ] Job queue and workflow scheduling +- [ ] Web UI for task management +- [ ] Browser extension integration +- [ ] Multi-tab support ## 📄 License @@ -163,4 +281,4 @@ MIT ## 🤝 Contributing -Contributions welcome! Please read the roadmap first, then open a PR. +Contributions welcome! Please read the [ROADMAP.md](docs/ROADMAP.md) first, then open a PR. diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..d4cc359 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,35 @@ +version: '3.8' + +services: + browser-agent: + build: + context: . + dockerfile: Dockerfile + environment: + # LLM Provider - uncomment and set one + - GEMINI_API_KEY=${GEMINI_API_KEY:-} + - GEMINI_MODEL=${GEMINI_MODEL:-gemini-2.0-flash} + # - OPENAI_API_KEY=${OPENAI_API_KEY:-} + # - OPENAI_MODEL=${OPENAI_MODEL:-gpt-4-turbo} + # Browser settings + - BROWSER_HEADLESS=true + - BROWSER_TIMEOUT=30 + volumes: + # Mount screenshots output + - ./output:/app/output + # Run a specific task + command: [ "--url", "https://example.com", "--task", "extract the page title" ] + + # Development mode with source code mounted + dev: + build: + context: . 
+ dockerfile: Dockerfile + environment: + - GEMINI_API_KEY=${GEMINI_API_KEY:-} + - GEMINI_MODEL=${GEMINI_MODEL:-gemini-2.0-flash} + - BROWSER_HEADLESS=true + volumes: + - ./src:/app/src:ro + - ./output:/app/output + command: [ "--help" ] diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md new file mode 100644 index 0000000..69f462b --- /dev/null +++ b/docs/ARCHITECTURE.md @@ -0,0 +1,555 @@ +# πŸ—οΈ Architecture Guide + +This document describes the multi-agent architecture of the Autonomous Browser AI Agent. + +## Overview + +The system uses a **multi-agent architecture** where different AI agents collaborate to accomplish browser automation tasks. Each agent has a specialized role: + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ USER TASK β”‚ +β”‚ "Go to Wikipedia and extract the featured article" β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ ORCHESTRATOR β”‚ +β”‚ β”‚ +β”‚ β€’ Receives high-level task from user β”‚ +β”‚ β€’ Coordinates the Plan β†’ Execute β†’ Evaluate loop β”‚ +β”‚ β€’ Handles retries and re-planning on failures β”‚ +β”‚ β€’ Decides when task is complete β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + 
β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ β”‚ β”‚ + β–Ό β–Ό β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ PLANNER β”‚ β”‚ EXECUTOR β”‚ β”‚ EVALUATOR β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β€’ Analyzes DOM β”‚ β”‚ β€’ Runs browser β”‚ β”‚ β€’ Checks step β”‚ +β”‚ β€’ Creates multi-β”‚ β”‚ actions β”‚ β”‚ success β”‚ +β”‚ step plan β”‚ β”‚ β€’ Handles retry β”‚ β”‚ β€’ Detects β”‚ +β”‚ β€’ Uses LLM for β”‚ β”‚ on failure β”‚ β”‚ failures β”‚ +β”‚ intelligence β”‚ β”‚ β€’ Returns β”‚ β”‚ β€’ Triggers β”‚ +β”‚ β”‚ β”‚ results β”‚ β”‚ re-planning β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ β”‚ β”‚ + β”‚ β–Ό β”‚ + β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ + β”‚ β”‚ BROWSER β”‚ β”‚ + β”‚ β”‚ CONTROLLER β”‚ β”‚ + β”‚ β”‚ β”‚ β”‚ + β”‚ β”‚ β€’ Execute actionβ”‚ β”‚ + β”‚ β”‚ β€’ Safety checks β”‚ β”‚ + β”‚ β”‚ β€’ URL filtering β”‚ β”‚ + β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ + β”‚ β”‚ β”‚ + β”‚ β–Ό β”‚ + β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ + └────────────▢│ PLAYWRIGHT β”‚β—€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ BROWSER β”‚ + β”‚ β”‚ + β”‚ β€’ Chromium β”‚ + β”‚ β€’ Page actions β”‚ + β”‚ β€’ DOM access β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +## Components + +### 1. Orchestrator (`src/agent/orchestrator.py`) + +The **Orchestrator** is the main entry point. It coordinates the entire workflow: + +```python +class Orchestrator: + async def execute_task(self, task: str, page: Page = None) -> TaskResult: + """ + Main loop: + 1. Create plan using Planner + 2. 
Execute steps using Executor + 3. Evaluate results using Evaluator + 4. Re-plan if needed + 5. Return final result + """ +``` + +**Key Responsibilities:** +- Initialize and coordinate all agents +- Manage task state (pending, planning, executing, evaluating, complete, failed) +- Handle max retries and max steps limits +- Aggregate execution logs + +**Configuration:** +```python +orchestrator = Orchestrator( + llm=provider, + planner=planner, + executor=executor, + evaluator=evaluator, + max_retries=3, # Max re-planning attempts + max_steps=20, # Max total steps before giving up +) +``` + +### 2. Planner (`src/agent/planner.py`) + +The **Planner** creates multi-step plans using LLM intelligence: + +```python +class PlannerAgent: + async def plan( + self, + task: str, + page: Page = None, + page_structure: PageStructure = None, + ) -> List[Dict[str, Any]]: + """ + 1. Analyze current page DOM (if available) + 2. Generate plan using LLM + 3. Parse plan into executable steps + 4. Fallback to deterministic planner on error + """ +``` + +**Key Features:** +- **DOM-Aware**: Analyzes page structure to find correct selectors +- **Lookahead**: Plans 3-4 steps ahead considering future states +- **Fallback**: Uses deterministic planner when LLM fails +- **Metadata**: Includes reason, expected outcome, and fallback for each step + +**Example Plan Output:** +```python +[ + { + "type": "goto", + "args": {"url": "https://wikipedia.org"}, + "metadata": { + "reason": "Navigate to Wikipedia homepage", + "expected_outcome": "Page loads with search box visible", + "fallback": "Try https://en.wikipedia.org" + } + }, + { + "type": "fill", + "args": {"selector": "#searchInput", "text": "Python programming"}, + "metadata": { + "reason": "Enter search query", + "expected_outcome": "Text appears in search box" + } + }, + { + "type": "click", + "args": {"selector": "#searchButton"}, + "metadata": { + "reason": "Submit search", + "expected_outcome": "Navigate to search results" + } + } +] +``` 
+ +### 3. Executor (`src/agent/executor.py`) + +The **Executor** runs individual browser actions: + +```python +class ExecutorAgent: + async def execute( + self, + step: Dict[str, Any], + page: Page = None, + ) -> Dict[str, Any]: + """ + 1. Validate step has required fields + 2. Pre-validate selector exists (optional) + 3. Execute action via BrowserController + 4. Return structured result + """ + + async def execute_with_retry( + self, + step: Dict[str, Any], + page: Page = None, + max_retries: int = 3, + ) -> Dict[str, Any]: + """Execute with automatic retries on failure.""" +``` + +**Key Features:** +- **Validation**: Checks action type and required arguments +- **Retry Logic**: Automatic retries with configurable attempts +- **Error Handling**: Catches exceptions and returns structured errors +- **DOM Analysis**: Optional pre-validation of selectors + +### 4. Evaluator (`src/agent/evaluator.py`) + +The **Evaluator** assesses step results and decides next actions: + +```python +class EvaluatorAgent: + async def evaluate( + self, + step: Dict[str, Any], + result: Dict[str, Any], + page: Page = None, + task: str = "", + remaining_steps: int = 0, + ) -> Dict[str, Any]: + """ + Returns: + { + "success": bool, + "task_complete": bool, + "should_replan": bool, + "replan_reason": str, + "confidence": float + } + """ + + async def check_task_completion( + self, + task: str, + execution_log: list, + ) -> Dict[str, Any]: + """Check if overall task is complete.""" +``` + +**Key Features:** +- **Simple Mode**: Rule-based evaluation when no LLM +- **LLM Mode**: Intelligent evaluation with context understanding +- **Re-plan Triggers**: Detects when plan needs adjustment +- **Confidence Scoring**: Indicates certainty of evaluation + +### 5. 
DOM Analyzer (`src/browser/dom_analyzer.py`) + +The **DOM Analyzer** extracts page structure for intelligent planning: + +```python +class DOMAnalyzer: + async def analyze(self, page: Page) -> PageStructure: + """ + Extract: + - URL, title + - Interactive elements (buttons, links, inputs) + - Forms with their fields + - Main content structure + """ + +@dataclass +class PageStructure: + url: str + title: str + elements: List[InteractiveElement] + forms: List[FormInfo] + + def to_prompt_context(self) -> str: + """Format for LLM prompt.""" +``` + +**Example Output:** +``` +URL: https://wikipedia.org +Title: Wikipedia + +INTERACTIVE ELEMENTS: +- input#searchInput: text input, name="search" +- button#searchButton: button "Search" +- a.main-link: link "Main Page" + +FORMS: +- form#searchform: search form + - input#searchInput (text) + - button#searchButton (submit) +``` + +### 6. Browser Controller (`src/controller/browser_controller.py`) + +The **Browser Controller** maps actions to Playwright calls: + +```python +class BrowserController: + async def execute_action(self, action: Dict[str, Any]) -> Dict[str, Any]: + """ + Supported actions: + - goto: Navigate to URL + - click: Click element + - fill: Type into input + - extract_text: Get element text + - links: Get all links + - screenshot: Capture page + - scroll: Scroll page + - wait: Wait for element + """ +``` + +**Safety Features:** +- URL scheme filtering (only http/https allowed) +- Loop detection (prevents infinite action loops) +- Max steps limit +- Timeout handling + +### 7. 
LLM Providers (`src/llm/`) + +Abstraction layer for different LLM backends: + +```python +class BaseLLMProvider(ABC): + @abstractmethod + async def complete(self, prompt: str, **kwargs) -> LLMResponse: + """Generate text completion.""" + + @abstractmethod + async def chat(self, messages: List[Message], **kwargs) -> LLMResponse: + """Generate chat response.""" + + async def complete_json(self, prompt: str, schema: Dict) -> Dict: + """Generate JSON response matching schema.""" +``` + +**Available Providers:** +- `GeminiProvider`: Google Gemini (via langchain-google-genai) +- `OpenAIProvider`: OpenAI GPT models (via langchain-openai) +- `BedrockProvider`: AWS Bedrock Claude (via langchain-aws) +- `MockLLMProvider`: For testing without API calls + +**Factory Pattern:** +```python +from llm.factory import get_llm_provider, get_provider_for_role + +# Auto-detect from environment +provider = get_llm_provider() + +# Specific provider +provider = get_llm_provider(LLMProvider.GEMINI) + +# Role-based (different models for different agents) +planner_llm = get_provider_for_role("planner") +executor_llm = get_provider_for_role("executor") +``` + +## Data Flow + +### 1. Task Execution Flow + +``` +User Task β†’ Orchestrator + β”‚ + β”œβ”€β”€β–Ά Planner.plan(task, page) + β”‚ β”‚ + β”‚ β–Ό + β”‚ [DOM Analysis] ──▢ LLM ──▢ Step List + β”‚ + β”œβ”€β”€β–Ά For each step: + β”‚ β”‚ + β”‚ β”œβ”€β”€β–Ά Executor.execute(step, page) + β”‚ β”‚ β”‚ + β”‚ β”‚ β–Ό + β”‚ β”‚ BrowserController ──▢ Playwright + β”‚ β”‚ β”‚ + β”‚ β”‚ β–Ό + β”‚ β”‚ Result {ok, result, error} + β”‚ β”‚ + β”‚ β”œβ”€β”€β–Ά Evaluator.evaluate(step, result) + β”‚ β”‚ β”‚ + β”‚ β”‚ β–Ό + β”‚ β”‚ {success, task_complete, should_replan} + β”‚ β”‚ + β”‚ └──▢ If should_replan: goto Planner + β”‚ + └──▢ TaskResult {success, steps_executed, final_result} +``` + +### 2. 
Re-planning Flow + +``` +Step Failed + β”‚ + β–Ό +Evaluator detects failure + β”‚ + β–Ό +should_replan = True + β”‚ + β–Ό +Orchestrator increments retry_count + β”‚ + β”œβ”€β”€β–Ά If retry_count < max_retries: + β”‚ β”‚ + β”‚ β–Ό + β”‚ Planner.plan(task, page, executed_steps) + β”‚ β”‚ + β”‚ β–Ό + β”‚ New plan considering what already happened + β”‚ + └──▢ If retry_count >= max_retries: + β”‚ + β–Ό + TaskResult(success=False, error="Max retries exceeded") +``` + +## Directory Structure + +``` +src/ +β”œβ”€β”€ agent/ +β”‚ β”œβ”€β”€ __init__.py +β”‚ β”œβ”€β”€ orchestrator.py # Main coordinator +β”‚ β”œβ”€β”€ planner.py # LLM-based planning +β”‚ β”œβ”€β”€ executor.py # Action execution +β”‚ β”œβ”€β”€ evaluator.py # Result evaluation +β”‚ β”œβ”€β”€ agent.py # Legacy simple agent +β”‚ β”œβ”€β”€ memory.py # Short-term memory +β”‚ └── tools.py # Action definitions +β”‚ +β”œβ”€β”€ browser/ +β”‚ β”œβ”€β”€ __init__.py +β”‚ β”œβ”€β”€ browser.py # Browser lifecycle +β”‚ β”œβ”€β”€ browser_config.py # Configuration +β”‚ β”œβ”€β”€ actions.py # High-level actions +β”‚ β”œβ”€β”€ dom_analyzer.py # DOM extraction +β”‚ └── utils.py # Utilities +β”‚ +β”œβ”€β”€ controller/ +β”‚ β”œβ”€β”€ __init__.py +β”‚ β”œβ”€β”€ browser_controller.py # Action mapping +β”‚ β”œβ”€β”€ task_manager.py # Task queue (future) +β”‚ └── workflow.py # Workflow engine (future) +β”‚ +β”œβ”€β”€ llm/ +β”‚ β”œβ”€β”€ __init__.py +β”‚ β”œβ”€β”€ base.py # Base provider class +β”‚ β”œβ”€β”€ factory.py # Provider factory +β”‚ β”œβ”€β”€ gemini_provider.py +β”‚ β”œβ”€β”€ openai_provider.py +β”‚ β”œβ”€β”€ bedrock_provider.py +β”‚ └── mock_provider.py # For testing +β”‚ +β”œβ”€β”€ config/ +β”‚ β”œβ”€β”€ __init__.py +β”‚ β”œβ”€β”€ settings.py # Global settings +β”‚ └── llm_config.py # LLM configuration +β”‚ +└── examples/ + β”œβ”€β”€ example_wikipedia.py + β”œβ”€β”€ example_search.py + └── example_multiagent.py +``` + +## Configuration + +### Environment Variables + +```bash +# LLM Provider +GEMINI_API_KEY=xxx 
+GEMINI_MODEL=gemini-2.0-flash + +# Or OpenAI +OPENAI_API_KEY=xxx +OPENAI_MODEL=gpt-4-turbo + +# Or Bedrock +AWS_ACCESS_KEY_ID=xxx +AWS_SECRET_ACCESS_KEY=xxx +AWS_REGION=us-east-1 +BEDROCK_MODEL_ID=anthropic.claude-3-sonnet-20240229-v1:0 + +# Role-based providers (optional) +ORCHESTRATOR_PROVIDER=gemini +PLANNER_PROVIDER=gemini +EXECUTOR_PROVIDER=gemini + +# Agent settings +MAX_PLANNING_STEPS=10 +MAX_RETRIES=3 +PLANNING_LOOKAHEAD=4 + +# Browser settings +BROWSER_HEADLESS=true +BROWSER_TIMEOUT=30 +``` + +### Programmatic Configuration + +```python +from config.llm_config import LLMConfig, get_llm_config + +config = get_llm_config() +print(config.gemini.is_configured) +print(config.get_available_providers()) +``` + +## Extending the System + +### Adding a New LLM Provider + +1. Create provider class in `src/llm/`: + +```python +from llm.base import BaseLLMProvider, LLMResponse, Message + +class MyProvider(BaseLLMProvider): + def __init__(self, api_key: str, model: str = "default"): + self._api_key = api_key + self._model = model + + async def complete(self, prompt: str, **kwargs) -> LLMResponse: + # Implementation + pass + + async def chat(self, messages: List[Message], **kwargs) -> LLMResponse: + # Implementation + pass +``` + +2. Add to factory in `src/llm/factory.py` +3. Add configuration in `src/config/llm_config.py` + +### Adding a New Browser Action + +1. Add method to `src/browser/actions.py`: + +```python +class BrowserActions: + async def my_action(self, page: Page, arg1: str) -> Any: + # Implementation + pass +``` + +2. Add handler in `src/controller/browser_controller.py`: + +```python +if typ == "my_action": + result = await self.browser_actions.my_action(page, args["arg1"]) + return {"ok": True, "result": result} +``` + +3. 
Update planner schema in `src/agent/planner.py` + +## Testing + +The codebase uses pytest with async support: + +```bash +# All tests +uv run pytest + +# Specific module +uv run pytest tests/agent/ + +# With coverage +uv run pytest --cov=src + +# Verbose +uv run pytest -v +``` + +Key test files: +- `tests/agent/test_orchestrator.py` - Orchestrator tests +- `tests/agent/test_planner_agent.py` - Planner tests +- `tests/agent/test_executor.py` - Executor tests +- `tests/agent/test_evaluator.py` - Evaluator tests +- `tests/llm/test_mock_provider.py` - Mock provider tests diff --git a/docs/QUICKSTART.md b/docs/QUICKSTART.md new file mode 100644 index 0000000..f38eb5e --- /dev/null +++ b/docs/QUICKSTART.md @@ -0,0 +1,232 @@ +# 🚀 Quick Start Guide + +This guide will get you up and running with the Autonomous Browser AI Agent in under 5 minutes. + +## Prerequisites + +- **Python 3.11+** +- **uv** (Python package manager) - [Install uv](https://docs.astral.sh/uv/getting-started/installation/) +- **API Key** for at least one LLM provider (Gemini recommended for quick start) + +## Step 1: Clone and Install + +```bash +# Clone the repository +git clone https://github.com/Kaangml/autonomous_browser_ai_agent.git +cd autonomous_browser_ai_agent + +# Install all dependencies +uv sync + +# Install Playwright browsers (required for browser automation) +uv run playwright install chromium +``` + +## Step 2: Configure Your LLM Provider + +Copy the environment template: + +```bash +cp .env.example .env +``` + +Edit `.env` and add your API key. The easiest option is **Google Gemini**: + +```bash +# .env +GEMINI_API_KEY=your_gemini_api_key_here +GEMINI_MODEL=gemini-2.0-flash +``` + +### Get a Gemini API Key + +1. Go to [Google AI Studio](https://aistudio.google.com/app/apikey) +2. Click "Create API Key" +3. 
Copy the key and paste it in `.env` + +### Alternative: OpenAI + +```bash +OPENAI_API_KEY=your_openai_api_key_here +OPENAI_MODEL=gpt-4-turbo +``` + +### Alternative: AWS Bedrock + +```bash +AWS_ACCESS_KEY_ID=your_access_key +AWS_SECRET_ACCESS_KEY=your_secret_key +AWS_REGION=us-east-1 +BEDROCK_MODEL_ID=anthropic.claude-3-sonnet-20240229-v1:0 +``` + +## Step 3: Verify Installation + +Run the tests to make sure everything is working: + +```bash +uv run pytest tests/ -v --tb=short +``` + +You should see all tests passing (68+ tests). + +## Step 4: Run Your First Task + +### Option A: CLI + +```bash +# Extract text from a webpage +uv run python -m src --url "https://example.com" --task "extract the page title" +``` + +Expected output: +``` +Task: extract the page title +URL: https://example.com +Result: Example Domain +``` + +### Option B: Python Script + +Create a file `my_first_task.py`: + +```python +import asyncio +from dotenv import load_dotenv +load_dotenv() + +from llm.factory import get_llm_provider +from agent.planner import PlannerAgent +from agent.executor import ExecutorAgent +from browser.browser import BrowserManager +from browser.browser_config import BrowserConfigManager +from browser.actions import BrowserActions +from controller.browser_controller import BrowserController + +async def main(): + # Initialize LLM + llm = get_llm_provider() + print(f"Using LLM: {type(llm).__name__}") + + # Create planner and executor + planner = PlannerAgent(llm=llm) + + # Setup browser + config = BrowserConfigManager.load_from_settings() + browser = BrowserManager(config) + await browser.start() + + actions = BrowserActions(browser) + controller = BrowserController(actions) + executor = ExecutorAgent(controller=controller) + + try: + # Define your task + task = "Go to wikipedia.org and extract the featured article title" + print(f"\n📋 Task: {task}") + + # Let the LLM plan the steps + steps = await planner.plan(task) + print(f"\n🔧 Plan ({len(steps)} steps):") + for 
i, step in enumerate(steps, 1): + print(f" {i}. {step['type']}: {step['args']}") + + # Execute the plan + print("\n🚀 Executing...") + page = None + for step in steps: + result = await executor.execute(step, page) + status = "✅" if result.get("ok") else "❌" + print(f" {status} {step['type']}") + + if result.get("page"): + page = result["page"] + + if result.get("result") and step["type"] == "extract_text": + print(f"\n📄 Result: {result['result'][:200]}...") + + print("\n✅ Done!") + + finally: + await browser.close() + +if __name__ == "__main__": + asyncio.run(main()) +``` + +Run it: + +```bash +uv run python my_first_task.py +``` + +## Step 5: Try More Examples + +### Wikipedia Example + +```bash +uv run python -m src.examples.example_wikipedia +``` + +### Search Example + +```bash +uv run python -m src.examples.example_search +``` + +### Multi-Agent Example + +```bash +uv run python -m src.examples.example_multiagent +``` + +## Common Issues + +### "GEMINI_API_KEY not configured" + +Make sure your `.env` file exists and contains the API key: + +```bash +cat .env | grep GEMINI +``` + +### "Playwright browsers not installed" + +Run the install command: + +```bash +uv run playwright install chromium +``` + +### "Import errors" + +Make sure you're running from the project root and using `uv run`: + +```bash +cd autonomous_browser_ai_agent +uv run python -m src --help +``` + +### "API key was reported as leaked" + +Your API key was exposed (e.g., pushed to GitHub). Generate a new one and update `.env`. + +## Next Steps + +- 📖 Read the [Architecture Guide](ARCHITECTURE.md) to understand how the system works +- 🔧 Check [ROADMAP.md](ROADMAP.md) for planned features +- 🐳 Try the [Docker setup](../Dockerfile) for containerized deployment +- 🧪 Write your own tasks and explore the capabilities! 
+ +## Tips for Writing Good Tasks + +✅ **Good tasks:** +- "Go to example.com and extract the main heading" +- "Search Google for 'python tutorial' and get the first 3 result titles" +- "Navigate to wikipedia.org, search for 'artificial intelligence', and extract the first paragraph" + +❌ **Too vague:** +- "Get information" (what information? from where?) +- "Do something on this page" (what exactly?) + +The more specific your task, the better the LLM can plan the steps! diff --git a/docs/ROADMAP.md b/docs/ROADMAP.md index aa32135..8547c2d 100644 --- a/docs/ROADMAP.md +++ b/docs/ROADMAP.md @@ -1,148 +1,197 @@ ---- - -# 🚀 Autonomous Browser AI Agent +# 🗺️ Development Roadmap -## Development Direction (Faz 1 → Faz 2 Yol Haritası) +## Project Vision -Bu dosya, repoyu inceledikten sonra Faz 1 mevcut durumunu, Faz 2 için yol haritasını ve VS Code üzerinde nasıl ilerleyeceğinizi net bir şekilde açıklamak için hazırlandı. Aşağıdaki içeriği README altına veya /docs içinde `DEVELOPMENT_DIRECTION.md` olarak kullanabilirsiniz. +Build an intelligent, autonomous browser agent that can: +- Understand natural language tasks +- Plan multi-step browser automations +- Execute and self-correct using LLM intelligence +- Work with minimal human intervention --- -## 1) Projenin Genel Amacı - -Bu proje, web tarayıcısını akıllı bir yazılım ajanı ile kontrol eden, görev odaklı, plan üretebilen ve kendini yöneten bir autonomous browser agent oluşturmayı hedefler. Sistem; `controller`, `browser`, `agent`, `config` katmanlarına ayrılmıştır. 
Bu modüler yapı sayesinde: - -- Test edilebilir -- Genişletilebilir -- Yeni görev tiplerine adapte edilebilir -- Model veya tarayıcı kitaplığı kolayca değiştirilebilir +## Completed Phases ✅ + +### Phase 1: Foundation (Completed) +- [x] Project structure and architecture +- [x] Basic browser automation skeleton +- [x] Agent-Controller-Browser layered design +- [x] Configuration management + +### Phase 2: Browser Engine (Completed) +- [x] Full Playwright integration +- [x] Browser lifecycle management (start, stop, restart) +- [x] Core actions: goto, click, fill, extract_text, screenshot, scroll +- [x] Safety controls: URL scheme filtering, timeout handling +- [x] Human-like behavior: configurable delays +- [x] Stealth mode support + +### Phase 3: Controller & Basic Agent (Completed) +- [x] BrowserController for action mapping +- [x] Loop detection and max-step limits +- [x] Simple deterministic planner +- [x] CLI interface (`python -m src`) +- [x] Working examples (Wikipedia, search) +- [x] Test suite (68+ tests) + +### Phase 4: Multi-Agent LLM System (Completed) +- [x] **LLM Provider Abstraction** + - BaseLLMProvider interface + - Google Gemini provider (langchain-google-genai) + - OpenAI provider (langchain-openai) + - AWS Bedrock provider (langchain-aws) + - MockLLMProvider for testing + - Factory pattern for provider selection + +- [x] **Multi-Agent Architecture** + - Orchestrator: Coordinates plan→execute→evaluate loop + - PlannerAgent: DOM-aware multi-step planning + - ExecutorAgent: Action execution with retry logic + - EvaluatorAgent: Result analysis and re-plan triggers + +- [x] **DOM Analysis** + - DOMAnalyzer for page structure extraction + - Interactive element detection + - Form analysis + - Selector generation for LLM context + +- [x] **Configuration** + - .env-based API key management + - Role-based provider selection + - LLMConfig with environment loading + +- [x] **Documentation** + - Updated README with multi-agent examples + - 
QUICKSTART.md for getting started + - ARCHITECTURE.md for system design + - Working Dockerfile --- -## 2) Faz 1 — Mevcut Durum Değerlendirmesi - -Faz 1 incelendiğinde aşağıdaki temel parçalar hazır: - -### 2.1 Mimari - -- `agent/` → Ajan zekâsı, reasoning pipeline -- `controller/` → Tarayıcı komutlarının orkestrasyonu -- `browser/` → Web automation iskeleti (şu an skeleton) -- `config/` → Ajan parametreleri, görev tanımları, model yönlendirmeleri - -Yapı, nümerik olarak genişlemeye uygun ve ölçeklenebilir. +## Current Phase 🚧 -### 2.2 Temel Akış (Flow) +### Phase 5: Production Readiness -Task → Agent reasoning → Controller → Browser executes → Result → Agent feedback loop +#### 5.1 Persistent Memory +- [ ] Short-term memory (conversation context) +- [ ] Long-term memory (SQLite or vector DB) +- [ ] Task history and learning from past executions +- [ ] Element selector caching -### 2.3 Prompt Yapısı +#### 5.2 Enhanced Error Handling +- [ ] Detailed error classification +- [ ] Automatic recovery strategies +- [ ] Fallback selector chains +- [ ] Network error handling -- System prompt -- Task prompt -- Action-output formatı - -### 2.4 Kodun Durumu / Eksikler - -Faz 1 temelde tamamlanmış gözükse de aşağıda eksiklikler var (Faz 2 hedefleri): - -- Browser hâlen dummy -- Controller gerçek aksiyon üretmiyor -- Agent reasoning tek adımlı -- Memory yok -- Tools eksik +#### 5.3 Logging & Monitoring +- [ ] Structured logging (JSON format) +- [ ] Execution metrics and timing +- [ ] Debug mode with screenshots +- [ ] Trace export for debugging --- -## 3) Faz 2 — TODO Roadmap - -Aşağıdaki alt başlıklar Faz 2 kapsamındaki hedeflerdir. VSCode üzerinde bir feature branch açıp adım adım ilerlemeniz önerilir. 
- -### Faz 2.1 — Browser Engine’in Tamamlanması - -Browser katmanını Playwright ile etkinleştirin ve temel eylemleri uygulayın: - -- Playwright entegrasyonu - - Browser launch, Context, Page - - Stealth mode, Headless toggle -- Temel aksiyonlar - - `goto(url)`, `click(selector)`, `type(selector, text)`, `wait_for(selector)` - - `extract_text(selector)`, `extract_all_links()`, `screenshot()` -- Error management - - Retry wrapper, timeout policy +## Future Phases 📋 + +### Phase 6: Advanced Features + +#### 6.1 Multi-Tab Support +- [ ] Tab management (open, close, switch) +- [ ] Cross-tab data passing +- [ ] Parallel execution + +#### 6.2 Authentication & Sessions +- [ ] Cookie management +- [ ] Session persistence +- [ ] OAuth flow handling +- [ ] 2FA support (TOTP) + +#### 6.3 Advanced Extraction +- [ ] Table extraction to structured data +- [ ] Form auto-fill with validation +- [ ] File download management +- [ ] PDF extraction + +### Phase 7: Integration & Deployment + +#### 7.1 Web UI +- [ ] Task management dashboard +- [ ] Real-time execution viewer +- [ ] Result export (JSON, CSV) +- [ ] Task scheduling + +#### 7.2 API Server +- [ ] REST API for task submission +- [ ] WebSocket for real-time updates +- [ ] Authentication and rate limiting +- [ ] Webhook callbacks + +#### 7.3 Job Queue +- [ ] Redis-based task queue +- [ ] Worker pool management +- [ ] Priority scheduling +- [ ] Retry policies + +### Phase 8: Enterprise Features + +#### 8.1 Scaling +- [ ] Kubernetes deployment +- [ ] Horizontal scaling +- [ ] Browser pool management +- [ ] Load balancing + +#### 8.2 Security +- [ ] Audit logging +- [ ] Role-based access control +- [ ] Secrets management +- [ ] VPN/proxy support + +#### 8.3 Compliance +- [ ] GDPR data handling +- [ ] Data retention policies +- [ ] Export/delete capabilities -### Faz 2.2 — Controller’ın Tamamlanması - -Controller sorumlulukları: - -- Agent tarafından üretilen aksiyonları al ve browser metoduna çevir -- Sonuçları 
geri ilet - -Yapılacaklar: - -- Action parser: Agent çıktısını JSON → method mapping -- Execution pipeline: Komut al → Browser’a ilet → Completion → Controller response -- Safety layer: URL filter, infinite loop detection, max step control - -### Faz 2.3 — Agent Reasoning Geliştirme - -- Multi-step reasoning: Plan → Execute → Reflect -- Tool-based reasoning: `browser.goto`, `browser.click`, `browser.type`, `browser.extract_text`, `browser.links` -- Self-correction: Ajan aldığı hataya göre planını güncelleyecek - -### Faz 2.4 — Config ve Prompt Geliştirme - -- Dynamic task config dosyası (YAML) — her görev için ayrı tanım - -```yaml -task: - name: "linkedin profile extraction" - target_url: "https://linkedin.com/..." - goal: "Extract basic info" - constraints: - - "No login" - - "Max 10 actions" -``` - -- System prompt genişletmesi: Kurallar, format, reasoning tarzı -- Global ayarlar: Model, timeout, max steps, debug mode - -### Faz 2.5 — Faz 2 Sonu: İlk Çalışan Senaryo +--- -Hedef senaryo (başarı kriteri): +## Technical Debt & Improvements -``` -Git Google'a, "Kaangml GitHub" ara, çıkan ilk linki aç, repository açıklamasını oku ve metni JSON olarak döndür. -``` +### Code Quality +- [ ] Increase test coverage to 90%+ +- [ ] Add integration tests with real browser +- [ ] Performance benchmarking +- [ ] Memory leak detection -Bu senaryo başarılı şekilde çalışırsa Faz 2 tamamlanmış kabul edilecektir. +### Developer Experience +- [ ] Better error messages +- [ ] Development mode with hot reload +- [ ] Plugin system for custom actions +- [ ] VS Code extension for task authoring --- -## 4) VS Code üzerinde çalışma önerileri - -1. Yeni bir branch açın (ör. `feature/phase2-browser-agent`). -2. Taskları sırayla çözün, her ana değişiklik için ayrı commit yapın. 
- -Örnek commit akışı: - -```bash -git checkout -b feature/phase2-browser-agent -git add browser/* -git commit -m "Browser engine: goto, click, type added" -``` +## Contributing -Copilot kullanırken örnek komutlar: +Contributions are welcome! Please: -``` -BrowserController için execute_action(action) fonksiyonunu yaz. action.type → browser metodu maplensin. -Playwright tabanlı async wrapper oluştur, tüm browser fonksiyonlarını tek yerden yönet. -``` +1. Check this roadmap for planned features +2. Open an issue to discuss your idea +3. Fork and create a feature branch +4. Submit a PR with tests -Test dosyası oluşturun: `tests/test_browser.py` ve eylemlerinizin gerçekten çalıştığından emin olun. +Priority areas for contribution: +- New LLM providers +- Additional browser actions +- Documentation improvements +- Test coverage --- -## 5) Sonuç +## Version History -Bu doküman Faz 1 durumu, mimari tercihleri, Faz 2 için teknik adımlar ve bir TODO listesi içerir. Artık proje Faz 2 geliştirme dönemine geçmeye uygundur. +| Version | Date | Highlights | +|---------|------|------------| +| 0.1.0 | 2024-11 | Initial structure, basic browser | +| 0.2.0 | 2024-11 | Playwright integration, controller | +| 0.3.0 | 2024-11 | CLI, examples, tests | +| 0.4.0 | 2024-11 | Multi-agent LLM system, Gemini/OpenAI/Bedrock | diff --git a/src/agent/.gitkeep b/src/agent/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/src/examples/example_multiagent.py b/src/examples/example_multiagent.py new file mode 100644 index 0000000..8c6d94b --- /dev/null +++ b/src/examples/example_multiagent.py @@ -0,0 +1,124 @@ +"""Multi-Agent Browser Automation Example. + +This example demonstrates the full multi-agent system: +1. Planner creates a multi-step plan using LLM +2. Executor runs each step +3. 
Results are collected and displayed + +Requirements: +- Set GEMINI_API_KEY in .env (or another LLM provider) +- Run: uv run python -m src.examples.example_multiagent +""" + +import asyncio +from dotenv import load_dotenv + +load_dotenv() + +from llm.factory import get_llm_provider +from agent.planner import PlannerAgent +from agent.executor import ExecutorAgent +from browser.browser import BrowserManager +from browser.browser_config import BrowserConfigManager +from browser.actions import BrowserActions +from controller.browser_controller import BrowserController + + +async def main(): + print("🤖 Multi-Agent Browser Automation Example") + print("=" * 50) + + # Initialize LLM provider + try: + llm = get_llm_provider() + print(f"✅ LLM Provider: {type(llm).__name__}") + except Exception as e: + print(f"❌ Failed to initialize LLM: {e}") + print(" Make sure you have set an API key in .env") + return + + # Create agents + planner = PlannerAgent(llm=llm) + + # Setup browser + config = BrowserConfigManager.load_from_settings() + browser = BrowserManager(config) + await browser.start() + + actions = BrowserActions(browser) + controller = BrowserController(actions) + executor = ExecutorAgent(controller=controller) + + # Define the task + task = "Go to example.com and extract the main heading and first paragraph" + + print(f"\n📋 Task: {task}") + print("-" * 50) + + try: + # Phase 1: Planning + print("\n🔧 Phase 1: Planning...") + steps = await planner.plan(task) + + print(f" Generated {len(steps)} steps:") + for i, step in enumerate(steps, 1): + args_str = ", ".join(f"{k}={v}" for k, v in step.get("args", {}).items()) + print(f" {i}. 
{step['type']}({args_str})") + + # Phase 2: Execution + print("\n🚀 Phase 2: Executing...") + page = None + results = [] + + for i, step in enumerate(steps, 1): + result = await executor.execute(step, page) + status = "✅" if result.get("ok") else "❌" + print(f" {status} Step {i}: {step['type']}") + + # Update page reference + if result.get("page"): + page = result["page"] + + # Collect results + if result.get("ok") and result.get("result"): + if result["result"] not in ("navigated", "clicked", "filled"): + results.append({ + "step": i, + "type": step["type"], + "data": result["result"] + }) + + # Stop on error + if not result.get("ok"): + print(f" Error: {result.get('error')}") + break + + # Phase 3: Results + print("\n📄 Results:") + print("-" * 50) + + if results: + for r in results: + data = str(r["data"]) + if len(data) > 200: + data = data[:200] + "..." + print(f"Step {r['step']} ({r['type']}):") + print(f" {data}") + print() + else: + print(" No data extracted") + + print("✅ Task completed!") + + except Exception as e: + print(f"\n❌ Error: {e}") + import traceback + traceback.print_exc() + + finally: + await browser.close() + print("\n🔒 Browser closed") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/src/llm/gemini_provider.py b/src/llm/gemini_provider.py index 4b90074..81b9a56 100644 --- a/src/llm/gemini_provider.py +++ b/src/llm/gemini_provider.py @@ -14,7 +14,7 @@ class GeminiProvider(BaseLLMProvider): def __init__( self, api_key: str, - model: str = "gemini-1.5-pro", + model: str = "gemini-2.0-flash", ): self._api_key = api_key self._model = model