diff --git a/.gitignore b/.gitignore index 9f67e1c..2a995dd 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,3 @@ __pycache__/ .vscode/ playwright-report/ test-results/ -docs/ diff --git a/README.md b/README.md index ea8dd82..6cbb278 100644 --- a/README.md +++ b/README.md @@ -1,28 +1,166 @@ -# Autonomous Browser AI Agent (Bedrock) +# 🤖 Autonomous Browser AI Agent +An intelligent browser automation agent built with **Playwright** and a modular **agent-controller-browser** architecture. Plan tasks in natural language, execute them via browser actions, and collect results — all autonomously. -## Proje yapısı (ASCII) +## ✨ Features -```text +- **Browser Automation**: Full Playwright integration (goto, click, fill, extract text, screenshot, etc.) +- **Safety Controls**: URL scheme filtering, loop detection, max-step limits +- **Modular Architecture**: Agent → Controller → Browser layers for testability +- **Human-like Behavior**: Configurable random delays to reduce bot detection +- **CLI & API**: Run from command line or integrate into your Python code +- **Extensible Planner**: Pluggable LLM interface for intelligent task planning + +## 🚀 Quick Start + +### Installation + +```bash +# Clone the repository +git clone https://github.com/Kaangml/autonomous_browser_ai_agent.git +cd autonomous_browser_ai_agent + +# Install dependencies with uv (recommended) +uv sync + +# Install Playwright browsers +uv run playwright install chromium +``` + +### Run from CLI + +```bash +# Extract text from a webpage +uv run python -m src --url "https://example.com" --task "extract the page title" + +# Run with visible browser window +uv run python -m src --url "https://example.com" --task "extract the page title" --no-headless + +# Output as JSON +uv run python -m src --url "https://example.com" --task "read the main content" --json +``` + +### Run Examples + +```bash +# Wikipedia example: extract featured article +uv run python -m src.examples.example_wikipedia + +# DuckDuckGo search 
example +uv run python -m src.examples.example_search +``` + +## 📖 Usage in Python + +```python +import asyncio +from browser.browser_config import BrowserConfigManager +from browser.browser import BrowserManager +from browser.actions import BrowserActions +from controller.browser_controller import BrowserController + +async def main(): + # Setup + config = BrowserConfigManager() + config.config.headless = True + + browser = BrowserManager(config) + actions = BrowserActions(browser) + controller = BrowserController(actions) + + try: + # Navigate + result = await controller.execute_action({ + "type": "goto", + "args": {"url": "https://example.com"} + }) + page = result["page"] + + # Extract text + text = await controller.execute_action({ + "type": "extract_text", + "args": {"page": page, "selector": "h1"} + }) + print(text["result"]) + + finally: + await browser.close() + +asyncio.run(main()) +``` + +## 🏗️ Architecture + +``` src/ -├── agent/ -│ ├── agent.py -│ ├── memory.py -│ ├── planning.py -│ ├── prompt_templates.py -│ └── tools.py -├── browser/ -│ ├── actions.py -│ ├── browser_config.py -│ ├── browser.py -│ └── utils.py -├── config/ -│ └── settings.py -├── controller/ -│ ├── logger.py -│ ├── task_manager.py -│ └── workflow.py -└── examples/ - ├── example_login_automation.py - └── example_scrape_google.py +├── agent/ # Task planning and reasoning +│ ├── agent.py # Main agent class (plan → execute → reflect) +│ ├── planner.py # LLM-based task decomposition +│ └── memory.py # Short-term memory +├── browser/ # Playwright automation layer +│ ├── browser.py # Browser lifecycle management +│ ├── actions.py # High-level actions (click, fill, extract, etc.) 
+│ └── utils.py # Retry logic, human delays, URL normalization +├── controller/ # Action orchestration +│ └── browser_controller.py # Maps agent actions to browser calls +├── config/ # Configuration management +└── examples/ # Working examples ``` + +## 🧪 Testing + +```bash +# Run all tests +uv run pytest + +# Run with verbose output +uv run pytest -v + +# Run specific test file +uv run pytest tests/browser/test_actions.py +``` + +## 📋 Supported Actions + +| Action | Description | Args | +|--------|-------------|------| +| `goto` | Navigate to URL | `url` | +| `click` | Click element | `page`, `selector` | +| `fill` | Type into input | `page`, `selector`, `text` | +| `extract_text` | Get element text | `page`, `selector` | +| `links` | Get all links | `page`, `selector` (optional) | +| `screenshot` | Capture page | `page`, `full_page` (optional) | + +## ⚙️ Configuration + +Browser behavior can be customized via `BrowserConfigManager`: + +```python +config = BrowserConfigManager() +config.config.headless = False # Show browser window +config.config.timeout = 30 # Timeout in seconds +config.config.viewport_width = 1920 # Browser width +config.config.viewport_height = 1080 # Browser height +config.config.human_delay_min = 0.5 # Min delay between actions +config.config.human_delay_max = 1.5 # Max delay between actions +config.config.channel = "chrome" # Use Chrome instead of Chromium +``` + +## 🗺️ Roadmap + +See [docs/ROADMAP.md](docs/ROADMAP.md) for the development roadmap. + +### Planned Features +- [ ] Persistent memory (SQLite/vector DB) +- [ ] Real LLM integration (OpenAI, Anthropic, Bedrock) +- [ ] Job queue and workflow management +- [ ] Retry policies with exponential backoff +- [ ] Structured logging and metrics + +## 📄 License + +MIT + +## 🤝 Contributing + +Contributions welcome! Please read the roadmap first, then open a PR. 
diff --git a/docs/DEV_NOTES.md b/docs/DEV_NOTES.md new file mode 100644 index 0000000..6e7ab90 --- /dev/null +++ b/docs/DEV_NOTES.md @@ -0,0 +1,94 @@ +# agent.md — in-repo agent memory / plan + +This file is the agent's on-disk memory and work-plan to implement Faz 2 features. Keep this short and actionable — it will be updated as I complete tasks. + +## Current understanding (from `docs/faz1_2.md`) +- Faz 2 focuses on completing the Browser engine, Controller, Agent reasoning and config improvements. +- Browser (Playwright) needs robust actions: goto, click, type/fill, wait_for, extract_text, extract_all_links, screenshot, error handling/retries. +- Controller must convert agent action JSON into browser calls and provide safety checks (URL filters, loop detection, max steps). +- Agent needs multi-step reasoning (Plan → Execute → Reflect), tool-based reasoning, and self-correction. + +## Priority (what I'll implement first) +1. Browser actions (Faz 2.1): make sure all core actions exist and are tested. +2. Unit tests for browser actions (pytest style) — file-level tests under `tests/browser`. +3. Controller small improvements / scaffolding (Faz 2.2) — map agent JSON to BrowserActions. +4. Agent reasoning upgrade (Faz 2.3) — design plan + unit tests for planning flow. + +## First task breakdown (concrete) +- Add `extract_all_links(page)` to `src/browser/actions.py`. +- Add `screenshot(page)` to `src/browser/actions.py`. +- Add tests for both functions in `tests/browser/test_actions.py`. + +After each file-level change: +- Add/update a unit test in `tests/browser/` (pytest async tests where needed). +- Run tests locally to ensure everything passes. + +## Testing rules (my memory) +- Every implemented action must have a unit test that covers success and core happy path. +- Use small Dummy classes in tests for `Page`, `BrowserManager` to avoid needing an actual browser. +- Tests must be pytest compatible and located under `tests/browser`. 
+ +## Notes / constraints +- Keep changes small and incremental — test after each file. +- Use `BrowserUtils.retry` for network-sensitive operations. + +## Next steps after tests +- Expand controller mapping using a lightweight JSON action format and tests. +- Then update agent logic to use multi-step reasoning and enable tools. + +## Recent updates (done) + +- Planner: `src/agent/planner.py` was added. The agent can now use an injected LLM planner (mockable) to create multi-step plans. +- Memory: improved in-memory workflow; tests added for planner and agent execution. +- Playwright configs: `BrowserConfig` now supports `channel`, and `human_delay_min/max` to add human-like timing jitter for automation to reduce bot detection. + +## Faz 2 — status summary ✅ + +All Faz 2 goals are implemented and tested locally: + +- Browser actions: goto, click, fill, wait_for, extract_text, extract_all_links, screenshot, and helper utilities (normalize_url, human_delay, retry). +- Controller: `BrowserController` with execute_action/execute_sequence, safety checks (URL scheme filter, max steps, loop detection). +- Agent: simple LLMPlanner interface and async planning pipeline; agent executes steps with injected planner + short-term memory skeleton. +- Tests: comprehensive unit tests added across browser, controller and agent layers; e2e Playwright tests included (fixture based), with an optional headful test guarded by RUN_HEADFUL for local debugging. + +Local test result (most recent run): 29 passed, 1 skipped. + +Notes: +- Playwright headful tests can be flaky against public sites (Google blocks automated clients); use local fixtures or less-restrictive targets in CI. +- Changes are committed locally in logical groups (agent, browser, controller, e2e, ci), but pushing to remote failed due to repository auth; you'll need to run `git push` with the correct credentials or switch accounts if you want me to push from here. 
+ +--- + +# Faz 3 — next phase (high level goals) 🔭 + +Faz 3 upgrades will move the project from a local POC into a more production-ready, resilient agent. + +Planned work (concrete, test-driven) + +1) Persistent/smart memory (short + long term) 💾 + - Add a persistent store (SQLite) for short-term memory and a pluggable vector DB adapter for longer-term semantic memory (optional: FAISS / Milvus / SQLite+annlite). + - Write migrations, test dataset fixtures, and unit tests for reads/writes and eviction policies. + - Optionally add minimal embedding + retrieval using a mock embedding provider to allow offline tests. + +2) LLM planning contract and adapter 🧭 + - Implement a strict planner contract requiring structured JSON output (schema) with steps and tool names to reduce hallucinations. + - Add an LLM adapter interface to swap providers (mock tests for CI + sample provider configs). + - Add unit tests that assert the planner always returns a valid plan or a clear, recoverable error response. + +3) Controller hardening & workflow engine 🔧 + - Add a persistent job queue + state machine for long-running workflows (states: pending, running, succeeded, failed, retrying). + - Implement retry/backoff policies (max attempts, exponential backoff) and durable input/output logs for observability. + - Add safe scheduling and re-entrancy guarantees (so controllers can resume from failure). + +4) Monitoring, instrumentation & CI ✅ + - Add metrics and structured logs for each step (execution time, step result, errors) and unit tests to verify logging of key events. + - Harden CI to include unit + e2e fixture tests, and avoid unreliable public-site headful tests in CI. + +5) Release & collaboration tasks + - Push local branch to remote (requires correct auth). If you'd like I can attempt the push again after you switch accounts or provide push access. + - Open a PR from the feature branch with Faz 2 changes and Faz 3 follow-up work split into smaller PRs. 
+ +If you'd like, I can start with the memory persistence work (SQLite + tests) next. + + +(Agent memory file — update this as steps complete.) diff --git a/docs/ROADMAP.md b/docs/ROADMAP.md new file mode 100644 index 0000000..aa32135 --- /dev/null +++ b/docs/ROADMAP.md @@ -0,0 +1,148 @@ +--- + +# 🚀 Autonomous Browser AI Agent + +## Development Direction (Faz 1 → Faz 2 Yol Haritası) + +Bu dosya, repoyu inceledikten sonra Faz 1 mevcut durumunu, Faz 2 için yol haritasını ve VS Code üzerinde nasıl ilerleyeceğinizi net bir şekilde açıklamak için hazırlandı. Aşağıdaki içeriği README altına veya /docs içinde `ROADMAP.md` olarak kullanabilirsiniz. + +--- + +## 1) Projenin Genel Amacı + +Bu proje, web tarayıcısını akıllı bir yazılım ajanı ile kontrol eden, görev odaklı, plan üretebilen ve kendini yöneten bir autonomous browser agent oluşturmayı hedefler. Sistem; `controller`, `browser`, `agent`, `config` katmanlarına ayrılmıştır. Bu modüler yapı sayesinde: + +- Test edilebilir +- Genişletilebilir +- Yeni görev tiplerine adapte edilebilir +- Model veya tarayıcı kitaplığı kolayca değiştirilebilir + +--- + +## 2) Faz 1 — Mevcut Durum Değerlendirmesi + +Faz 1 incelendiğinde aşağıdaki temel parçalar hazır: + +### 2.1 Mimari + +- `agent/` → Ajan zekâsı, reasoning pipeline +- `controller/` → Tarayıcı komutlarının orkestrasyonu +- `browser/` → Web automation iskeleti (şu an skeleton) +- `config/` → Ajan parametreleri, görev tanımları, model yönlendirmeleri + +Yapı, modüler olarak genişlemeye uygun ve ölçeklenebilir. 
+ +### 2.2 Temel Akış (Flow) + +Task → Agent reasoning → Controller → Browser executes → Result → Agent feedback loop + +### 2.3 Prompt Yapısı + +- System prompt +- Task prompt +- Action-output formatı + +### 2.4 Kodun Durumu / Eksikler + +Faz 1 temelde tamamlanmış gözükse de aşağıda eksiklikler var (Faz 2 hedefleri): + +- Browser hâlen dummy +- Controller gerçek aksiyon üretmiyor +- Agent reasoning tek adımlı +- Memory yok +- Tools eksik + +--- + +## 3) Faz 2 — TODO Roadmap + +Aşağıdaki alt başlıklar Faz 2 kapsamındaki hedeflerdir. VSCode üzerinde bir feature branch açıp adım adım ilerlemeniz önerilir. + +### Faz 2.1 — Browser Engine’in Tamamlanması + +Browser katmanını Playwright ile etkinleştirin ve temel eylemleri uygulayın: + +- Playwright entegrasyonu + - Browser launch, Context, Page + - Stealth mode, Headless toggle +- Temel aksiyonlar + - `goto(url)`, `click(selector)`, `type(selector, text)`, `wait_for(selector)` + - `extract_text(selector)`, `extract_all_links()`, `screenshot()` +- Error management + - Retry wrapper, timeout policy + +### Faz 2.2 — Controller’ın Tamamlanması + +Controller sorumlulukları: + +- Agent tarafından üretilen aksiyonları al ve browser metoduna çevir +- Sonuçları geri ilet + +Yapılacaklar: + +- Action parser: Agent çıktısını JSON → method mapping +- Execution pipeline: Komut al → Browser’a ilet → Completion → Controller response +- Safety layer: URL filter, infinite loop detection, max step control + +### Faz 2.3 — Agent Reasoning Geliştirme + +- Multi-step reasoning: Plan → Execute → Reflect +- Tool-based reasoning: `browser.goto`, `browser.click`, `browser.type`, `browser.extract_text`, `browser.links` +- Self-correction: Ajan aldığı hataya göre planını güncelleyecek + +### Faz 2.4 — Config ve Prompt Geliştirme + +- Dynamic task config dosyası (YAML) — her görev için ayrı tanım + +```yaml +task: + name: "linkedin profile extraction" + target_url: "https://linkedin.com/..." 
+ goal: "Extract basic info" + constraints: + - "No login" + - "Max 10 actions" +``` + +- System prompt genişletmesi: Kurallar, format, reasoning tarzı +- Global ayarlar: Model, timeout, max steps, debug mode + +### Faz 2.5 — Faz 2 Sonu: İlk Çalışan Senaryo + +Hedef senaryo (başarı kriteri): + +``` +Git Google'a, "Kaangml GitHub" ara, çıkan ilk linki aç, repository açıklamasını oku ve metni JSON olarak döndür. +``` + +Bu senaryo başarılı şekilde çalışırsa Faz 2 tamamlanmış kabul edilecektir. + +--- + +## 4) VS Code üzerinde çalışma önerileri + +1. Yeni bir branch açın (ör. `feature/phase2-browser-agent`). +2. Taskları sırayla çözün, her ana değişiklik için ayrı commit yapın. + +Örnek commit akışı: + +```bash +git checkout -b feature/phase2-browser-agent +git add browser/* +git commit -m "Browser engine: goto, click, type added" +``` + +Copilot kullanırken örnek komutlar: + +``` +BrowserController için execute_action(action) fonksiyonunu yaz. action.type → browser metodu maplensin. +Playwright tabanlı async wrapper oluştur, tüm browser fonksiyonlarını tek yerden yönet. +``` + +Test dosyası oluşturun: `tests/test_browser.py` ve eylemlerinizin gerçekten çalıştığından emin olun. + +--- + +## 5) Sonuç + +Bu doküman Faz 1 durumu, mimari tercihleri, Faz 2 için teknik adımlar ve bir TODO listesi içerir. Artık proje Faz 2 geliştirme dönemine geçmeye uygundur. 
def __getattr__(name):
    """Resolve one of the package's public names on first attribute access.

    The module that defines the requested object is imported only when the
    name is actually looked up, so importing the package does not import
    the agent, browser or controller modules up front.

    Args:
        name: Attribute name being looked up on the package.

    Returns:
        The class exported under ``name``.

    Raises:
        AttributeError: If ``name`` is not one of the lazily exported names.
    """
    if name == "Agent":
        from agent.agent import Agent as exported
    elif name == "BrowserManager":
        from browser.browser import BrowserManager as exported
    elif name == "BrowserConfigManager":
        from browser.browser_config import BrowserConfigManager as exported
    elif name == "BrowserActions":
        from browser.actions import BrowserActions as exported
    elif name == "BrowserController":
        from controller.browser_controller import BrowserController as exported
    else:
        # Unknown names must fail the same way a normal missing attribute does.
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    return exported
# Step types that operate on an already-open page and therefore need a
# ``page`` argument when the planner did not supply one.
_PAGE_ACTIONS = frozenset({"extract_text", "click", "fill", "links", "screenshot"})


async def run_agent(url: str, task: str, headless: bool = True) -> dict:
    """Run the browser agent on a given URL with a task description.

    The agent plans the task, then the planned steps are executed one by
    one. The page returned by a successful step is injected into later
    page-based steps that do not carry their own ``page`` argument.
    Execution stops at the first failing step, and the browser is always
    closed afterwards.

    Args:
        url: Target URL; it is embedded in the task text given to the agent.
        task: Natural-language description of what to do on the page.
        headless: Whether to launch the browser without a visible window.

    Returns:
        A dict with the keys ``ok``, ``task``, ``steps`` and ``results``.
        ``ok`` is True only when at least one step ran and every executed
        step reported ``ok``.
    """
    # Project imports are deferred so that ``--help`` and argument errors
    # do not require the browser stack to be importable.
    from browser.browser_config import BrowserConfigManager, BrowserConfig
    from browser.browser import BrowserManager
    from browser.actions import BrowserActions
    from controller.browser_controller import BrowserController
    from agent.agent import Agent

    # Create browser config directly
    browser_config = BrowserConfig(
        headless=headless,
        viewport={"width": 1920, "height": 1080},
        timeout=30,
        user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
        stealth=True,
        channel=None,
        human_delay_min=0.1,
        human_delay_max=0.3,
    )
    config = BrowserConfigManager(browser_config)
    browser_manager = BrowserManager(config)
    actions = BrowserActions(browser_manager)
    controller = BrowserController(actions)

    # Setup agent with controller as tools
    agent = Agent(name="CLI-Agent")
    agent.tools = controller

    try:
        # Create the task with URL context
        full_task = f"Open {url} and {task}"
        agent.receive_task(full_task)

        steps = await agent.plan_task(full_task)

        results = []
        current_page = None

        for step in steps:
            # Reuse the page opened by an earlier step unless the step
            # already names one explicitly.
            if current_page and step.get("type") in _PAGE_ACTIONS:
                step.setdefault("args", {}).setdefault("page", current_page)

            result = await agent.execute_step(step)
            agent.evaluate_result(result)
            results.append(result)

            # Stop on first error
            if not result.get("ok"):
                break

            # Track the page returned by the step (e.g. from goto)
            if result.get("page"):
                current_page = result["page"]

        return {
            # ``all([])`` is True, so an empty plan must be rejected
            # explicitly: nothing was executed, hence nothing succeeded.
            "ok": bool(results) and all(r.get("ok") for r in results),
            "task": full_task,
            "steps": steps,
            "results": results,
        }

    finally:
        await browser_manager.close()


def _build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the agent CLI."""
    parser = argparse.ArgumentParser(
        prog="autonomous-browser-agent",
        description="Run an autonomous browser agent to perform web tasks",
    )
    parser.add_argument(
        "--url",
        "-u",
        required=True,
        help="Target URL to navigate to",
    )
    parser.add_argument(
        "--task",
        "-t",
        required=True,
        help='Task description (e.g., "extract the page title")',
    )
    # Both switches write the same destination so that --headless is a real
    # option instead of a flag that is always True and never read.
    parser.add_argument(
        "--headless",
        dest="headless",
        action="store_true",
        help="Run browser in headless mode (default: True)",
    )
    parser.add_argument(
        "--no-headless",
        dest="headless",
        action="store_false",
        help="Run browser with visible window",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        help="Output result as JSON",
    )
    parser.set_defaults(headless=True)
    return parser


def _clean_result(obj):
    """Return a copy of ``obj`` with non-serializable parts filtered out.

    ``page`` keys are dropped from every dict, lists are cleaned
    recursively, and ``bytes`` values are replaced by an empty string.
    Any other value is returned unchanged.
    """
    if isinstance(obj, dict):
        return {k: _clean_result(v) for k, v in obj.items() if k != "page"}
    if isinstance(obj, list):
        return [_clean_result(i) for i in obj]
    if isinstance(obj, bytes):
        return ""
    return obj


def _print_report(result: dict) -> None:
    """Print the human-readable summary of a ``run_agent`` result."""
    print(f"\n{'='*60}")
    print(f"Task: {result['task']}")
    print(f"Status: {'✅ Success' if result['ok'] else '❌ Failed'}")
    print(f"{'='*60}")

    # zip() stops at the shorter sequence, so steps that were never executed
    # (after an early failure) are not listed.
    for i, (step, res) in enumerate(zip(result["steps"], result["results"]), 1):
        status = "✓" if res.get("ok") else "✗"
        step_type = step.get("type", "unknown")
        print(f" {i}. [{status}] {step_type}: {step.get('args', {})}")
        if res.get("result") and step_type == "extract_text":
            text = res["result"]
            if len(text) > 200:
                text = text[:200] + "..."
            print(f" → {text}")
        if res.get("error"):
            print(f" ⚠ Error: {res['error']}")

    print()


def main():
    """Parse the command line, run the agent and exit with a status code.

    Exit codes: 0 when the run succeeded, 1 when it failed or raised,
    130 when interrupted from the keyboard.
    """
    args = _build_parser().parse_args()

    try:
        result = asyncio.run(run_agent(args.url, args.task, headless=args.headless))

        if args.json:
            print(json.dumps(_clean_result(result), indent=2, ensure_ascii=False))
        else:
            _print_report(result)

        sys.exit(0 if result["ok"] else 1)

    except KeyboardInterrupt:
        print("\nInterrupted")
        sys.exit(130)
    except Exception as e:
        # Top-level boundary: report the failure and exit non-zero.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
async def main():
    """Search DuckDuckGo for a term and print the first result titles.

    The flow is: open the search page, fill the query box, submit the form,
    wait for the result list, then read the result titles from the page.
    Every controller action is checked; the first failing action is
    reported and ends the run. The browser is always closed at the end.
    """
    search_query = "autonomous browser agent"

    browser_config = BrowserConfig(
        headless=True,
        viewport={"width": 1920, "height": 1080},
        timeout=30,
        user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
        stealth=True,
        human_delay_min=0.1,  # small delays for realism
        human_delay_max=0.3,
    )
    config = BrowserConfigManager(browser_config)

    browser = BrowserManager(config)
    actions = BrowserActions(browser)
    controller = BrowserController(actions)

    try:
        # 1. Go to DuckDuckGo
        print(f"Searching DuckDuckGo for: '{search_query}'...")
        result = await controller.execute_action({
            "type": "goto",
            "args": {"url": "https://duckduckgo.com"}
        })

        if not result["ok"]:
            print(f"Failed: {result.get('error')}")
            return

        page = result["page"]

        # 2-3. Fill the search box, then click the submit button. Each
        # outcome is checked so that a failed action is reported directly
        # instead of surfacing later as a timeout while waiting for results.
        form_actions = (
            {
                "type": "fill",
                "args": {"page": page, "selector": 'input[name="q"]', "text": search_query}
            },
            {
                "type": "click",
                "args": {"page": page, "selector": 'button[type="submit"]'}
            },
        )
        for action in form_actions:
            outcome = await controller.execute_action(action)
            if not outcome.get("ok"):
                print(f"Failed to {action['type']}: {outcome.get('error')}")
                return

        # 4. Wait for results to load
        await page.wait_for_selector('[data-testid="result"]', timeout=10000)

        # 5. Extract result titles
        titles = await page.eval_on_selector_all(
            '[data-testid="result-title-a"]',
            "nodes => nodes.map(n => n.innerText)"
        )

        print(f"\n✅ Found {len(titles)} results:\n")
        for i, title in enumerate(titles[:5], 1):
            print(f" {i}. {title}")
        print()

    except Exception as e:
        # Example script boundary: report the problem instead of a traceback.
        print(f"Error during search: {e}")

    finally:
        await browser.close()
async def main():
    """Print the featured article title and a few links from Wikipedia.

    Opens the English Wikipedia main page, reads the bold text of the
    featured-article box (falling back to the page title when that fails),
    then lists the first five links found in the same box. The browser is
    closed when the run ends, whether or not it succeeded.
    """
    # 1. Build the browser stack from an explicit configuration.
    manager = BrowserManager(
        BrowserConfigManager(
            BrowserConfig(
                headless=True,
                viewport={"width": 1920, "height": 1080},
                timeout=30,
                user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
                stealth=True,
            )
        )
    )
    controller = BrowserController(BrowserActions(manager))

    async def run(kind, **args):
        # Wrap the keyword arguments in the action dict the controller takes.
        return await controller.execute_action({"type": kind, "args": args})

    try:
        # 2. Open the main page; nothing else can work without it.
        print("Navigating to Wikipedia...")
        landing = await run("goto", url="https://en.wikipedia.org/wiki/Main_Page")
        if not landing["ok"]:
            print(f"Failed to navigate: {landing.get('error')}")
            return
        page = landing["page"]

        # 3. Featured article heading, with the page title as a fallback.
        print("Extracting featured article...")
        featured = await run("extract_text", page=page, selector="#mp-tfa b")
        if not featured["ok"]:
            fallback = await run("extract_text", page=page, selector="title")
            if fallback["ok"]:
                print(f"\n✅ Page title: {fallback['result']}\n")
            else:
                # Report the error of the primary extraction attempt.
                print(f"Failed: {featured.get('error')}")
        else:
            print(f"\n✅ Today's Featured Article: {featured['result']}\n")

        # 4. Bonus: a sample of the links inside the featured-article box.
        print("Extracting links from main content...")
        harvested = await run("links", page=page, selector="#mp-tfa")
        if harvested["ok"]:
            all_links = harvested["result"]
            print(f"Found {len(all_links)} links. First 5:")
            for href in all_links[:5]:
                print(f" - {href}")

    finally:
        await manager.close()