diff --git a/harbor_cookbook/recipes/computer-use-windows/README.md b/harbor_cookbook/recipes/computer-use-windows/README.md index 79e9eb9..aabd224 100644 --- a/harbor_cookbook/recipes/computer-use-windows/README.md +++ b/harbor_cookbook/recipes/computer-use-windows/README.md @@ -1,42 +1,24 @@ # computer-use-windows -Computer-use task where an agent interacts with a Windows virtual desktop to solve a multi-step GUI challenge. Requires the [Mascobot/harbor fork](https://github.com/Mascobot/harbor) which adds Windows desktop support via Daytona. +Computer-use task on a remote Windows desktop. A companion MCP server creates a [Daytona](https://daytona.io) Windows sandbox, deploys a multi-step tkinter challenge, and proxies computer-use tools (screenshot, click, type) to the agent. Same tool set as `computer-use-ubuntu`, backed by the Daytona Computer Use API. -## How it works - -Harbor creates a Daytona sandbox from a pre-built Windows snapshot (`windows-base`). A setup script deploys a tkinter challenge application and launches it on the desktop. CUA agents (anthropic-cua, openai-cua) interact with the Windows desktop through Harbor's `DaytonaWindowsDesktopInterface`, which executes pyautogui commands on the sandbox via the Daytona SDK. - -The challenge presents a multi-step GUI task requiring the agent to click buttons, type text, and read the result from the screen. The task cannot be solved without genuine GUI interaction. - -## Prerequisites - -- [Mascobot/harbor fork](https://github.com/Mascobot/harbor) (not upstream harbor-framework/harbor) -- Daytona API key (`DAYTONA_API_KEY`) -- `windows-base` snapshot in your Daytona account (Windows Computer Use private alpha) -- Anthropic or OpenAI API key for the CUA agent +> **Note:** Daytona Windows Computer Use is currently in early preview. Access requires a beta account at [win.trydaytona.com](https://win.trydaytona.com/) with the `windows-base` snapshot available. 
## Run ```bash harbor run -p harbor_cookbook/recipes/computer-use-windows \ - --agent anthropic-cua --model anthropic/claude-sonnet-4-6 \ - --environment-type daytona \ - --environment-kwarg windows_snapshot=windows-base \ - --environment-kwarg windows_setup_script=harbor_cookbook/recipes/computer-use-windows/scripts/setup.py \ - --environment-kwarg skip_osworld_setup=true + --agent claude-code --model anthropic/claude-sonnet-4-6 \ + --env-file .env -y ``` -## Docker oracle test - -The oracle test validates the test/solution pipeline on Docker (no Windows desktop needed): - -```bash -harbor trials start -p harbor_cookbook/recipes/computer-use-windows --agent oracle +Your `.env` needs: +``` +DAYTONA_API_KEY=your_key_here +DAYTONA_API_URL=https://win.trydaytona.com/api ``` ## Limitations -- Requires the Mascobot/harbor fork with CUA agent and Windows desktop support -- Requires the `daytona` environment provider with a Windows snapshot -- Windows Computer Use is currently in Daytona private alpha -- Standard agents (claude-code, codex) cannot interact with the desktop — use CUA agents +- Requires internet access (MCP server connects to the Daytona API) +- Windows sandbox takes ~30-60s to start after creation diff --git a/harbor_cookbook/recipes/computer-use-windows/environment/desktop/Dockerfile b/harbor_cookbook/recipes/computer-use-windows/environment/desktop/Dockerfile new file mode 100644 index 0000000..02a6315 --- /dev/null +++ b/harbor_cookbook/recipes/computer-use-windows/environment/desktop/Dockerfile @@ -0,0 +1,14 @@ +FROM python:3.12-slim + +COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ + +ENV UV_PRERELEASE=allow + +WORKDIR /app + +COPY server.py . +COPY challenge.py . 
+
+EXPOSE 8000
+
+CMD ["uv", "run", "server.py"]
diff --git a/harbor_cookbook/recipes/computer-use-windows/environment/desktop/server.py b/harbor_cookbook/recipes/computer-use-windows/environment/desktop/server.py
new file mode 100644
index 0000000..4f21743
--- /dev/null
+++ b/harbor_cookbook/recipes/computer-use-windows/environment/desktop/server.py
@@ -0,0 +1,293 @@
+# /// script
+# requires-python = ">=3.12"
+# dependencies = ["fastmcp", "daytona==0.131.0a1"]
+# ///
+"""MCP server exposing computer-use tools backed by a Daytona Windows sandbox.
+
+On startup the server creates a Windows sandbox from the ``windows-base``
+snapshot, deploys a tkinter challenge application, and launches it on the
+desktop. Every MCP tool call is proxied to the sandbox via the Daytona
+Computer Use API.
+
+If the sandbox cannot be created (e.g. missing credentials), the server
+still starts so that Docker healthchecks pass and oracle tests work.
+"""
+
+import atexit
+import base64
+import functools
+import logging
+import os
+import time
+
+from fastmcp import FastMCP
+from fastmcp.utilities.types import Image
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger(__name__)
+
+DEFAULT_API_URL = "https://win.trydaytona.com/api"
+WIN_APP_DIR = "C:/Users/Administrator/harbor"
+# Substring expected in the challenge window's title.
+CHALLENGE_WINDOW_MARKER = "Harbor"
+SCROLL_DIRECTIONS = ("up", "down")
+
+sandbox = None
+
+
+def _challenge_window_open() -> bool:
+    """Return True if an open window's title contains the challenge marker."""
+    listing = sandbox.computer_use.display.get_windows()
+    titles = [w.title for w in listing.windows] if listing.windows else []
+    log.info("Open windows: %s", titles)
+    return any(CHALLENGE_WINDOW_MARKER in (t or "") for t in titles)
+
+
+def _setup_sandbox() -> None:
+    """Create the Windows sandbox and deploy the challenge app.
+
+    Sets the module-level ``sandbox`` as soon as the sandbox is created, then
+    uploads and launches the challenge. Errors propagate to the caller.
+    """
+    global sandbox
+
+    # Imported here so an import failure is handled by the caller's
+    # try/except like any other setup error.
+    from daytona import CreateSandboxFromSnapshotParams, Daytona, DaytonaConfig
+
+    # `or` rather than a .get() default: an empty value also falls back.
+    api_url = os.environ.get("DAYTONA_API_URL") or DEFAULT_API_URL
+    log.info("Connecting to Daytona at %s", api_url)
+    daytona = Daytona(
+        DaytonaConfig(
+            api_key=os.environ["DAYTONA_API_KEY"],
+            api_url=api_url,
+        )
+    )
+
+    log.info("Creating Windows sandbox from 'windows-base' snapshot …")
+    sandbox = daytona.create(
+        CreateSandboxFromSnapshotParams(snapshot="windows-base"),
+        timeout=120,
+    )
+    log.info("Sandbox created: %s (state=%s)", sandbox.id, sandbox.state)
+
+    def _cleanup():
+        try:
+            log.info("Deleting sandbox %s …", sandbox.id)
+            daytona.delete(sandbox)
+        except Exception as exc:
+            log.warning("Sandbox cleanup failed: %s", exc)
+
+    atexit.register(_cleanup)
+
+    try:
+        sandbox.computer_use.start()
+        log.info("computer_use.start() succeeded")
+    except Exception as exc:
+        log.info("computer_use.start() skipped: %s", exc)
+
+    sandbox.process.exec(f"mkdir {WIN_APP_DIR}", timeout=5)
+
+    log.info("Deploying challenge app …")
+    # Read as bytes: no locale-dependent decoding or newline translation.
+    challenge_path = os.path.join(os.path.dirname(__file__) or ".", "challenge.py")
+    with open(challenge_path, "rb") as f:
+        challenge_source = f.read()
+
+    sandbox.fs.upload_file(challenge_source, f"{WIN_APP_DIR}/challenge.py")
+
+    # DETACHED_PROCESS flag so the GUI outlives the launcher
+    launcher = (
+        "import subprocess\n"
+        "subprocess.Popen(\n"
+        f"    ['python', r'{WIN_APP_DIR}/challenge.py'],\n"
+        "    creationflags=0x00000008,\n"
+        ")\n"
+        "print('launched')\n"
+    )
+    sandbox.fs.upload_file(launcher.encode(), f"{WIN_APP_DIR}/launch.py")
+
+    launch_cmd = f"python {WIN_APP_DIR}/launch.py"
+    log.info("Launching challenge app on desktop …")
+    r = sandbox.process.exec(launch_cmd, timeout=15)
+    log.info("Launch result: %s", r.result)
+    time.sleep(5)
+
+    # Best-effort check: relaunch once if the window is missing, then look
+    # again so the log shows whether the retry worked.
+    try:
+        if not _challenge_window_open():
+            log.warning("Challenge window not found, retrying …")
+            sandbox.process.exec(launch_cmd, timeout=15)
+            time.sleep(5)
+            if not _challenge_window_open():
+                log.warning("Challenge window still not found after retry")
+    except Exception as exc:
+        log.warning("Could not verify window list: %s", exc)
+
+    log.info("Setup complete")
+
+
+try:
+    _setup_sandbox()
+except Exception as exc:
+    # `sandbox` is None only when creation itself failed; a later failure
+    # leaves the created sandbox in place, so tools may still respond.
+    log.warning(
+        "Sandbox setup failed (sandbox %s): %s",
+        "created but not fully set up" if sandbox else "unavailable",
+        exc,
+        exc_info=True,
+    )
+
+log.info("Starting MCP server (sandbox=%s)", "ready" if sandbox else "unavailable")
+
+mcp = FastMCP("computer-use")
+
+
+def requires_sandbox(fn):
+    """Decorator that raises if the Windows sandbox is unavailable."""
+
+    @functools.wraps(fn)
+    def wrapper(*args, **kwargs):
+        if sandbox is None:
+            raise RuntimeError(
+                "Windows sandbox is not available. "
+                "Check DAYTONA_API_KEY and DAYTONA_API_URL."
+            )
+        return fn(*args, **kwargs)
+
+    return wrapper
+
+
+@mcp.tool()
+@requires_sandbox
+def screenshot() -> Image:
+    """Take a screenshot of the Windows desktop and return it as an image."""
+    resp = sandbox.computer_use.screenshot.take_full_screen()
+    return Image(data=base64.b64decode(resp.screenshot), format="png")
+
+
+@mcp.tool()
+@requires_sandbox
+def mouse_move(x: int, y: int) -> str:
+    """Move the mouse cursor to the given (x, y) pixel coordinate."""
+    result = sandbox.computer_use.mouse.move(x, y)
+    return f"Moved mouse to ({result.x}, {result.y})"
+
+
+@mcp.tool()
+@requires_sandbox
+def left_click(x: int, y: int) -> str:
+    """Left-click at the given (x, y) pixel coordinate."""
+    sandbox.computer_use.mouse.click(x, y)
+    return f"Left clicked at ({x}, {y})"
+
+
+@mcp.tool()
+@requires_sandbox
+def right_click(x: int, y: int) -> str:
+    """Right-click at the given (x, y) pixel coordinate."""
+    sandbox.computer_use.mouse.click(x, y, button="right")
+    return f"Right clicked at ({x}, {y})"
+
+
+@mcp.tool()
+@requires_sandbox
+def middle_click(x: int, y: int) -> str:
+    """Middle-click at the given (x, y) pixel coordinate."""
+    sandbox.computer_use.mouse.click(x, y, button="middle")
+    return f"Middle clicked at ({x}, {y})"
+
+
+@mcp.tool()
+@requires_sandbox
+def double_click(x: int, y: int) -> str:
+    """Double-click at the given (x, y) pixel coordinate."""
+    sandbox.computer_use.mouse.click(x, y, double=True)
+    return f"Double clicked at ({x}, {y})"
+
+
+@mcp.tool()
+@requires_sandbox
+def triple_click(x: int, y: int) -> str:
+    """Triple-click at the given (x, y) coordinate (e.g. to select a line)."""
+    # NOTE(review): three separate click requests; whether the desktop treats
+    # them as one triple-click depends on round-trip latency — confirm.
+    for _ in range(3):
+        sandbox.computer_use.mouse.click(x, y)
+    return f"Triple clicked at ({x}, {y})"
+
+
+@mcp.tool()
+@requires_sandbox
+def left_click_drag(start_x: int, start_y: int, end_x: int, end_y: int) -> str:
+    """Click and drag from (start_x, start_y) to (end_x, end_y)."""
+    sandbox.computer_use.mouse.drag(start_x, start_y, end_x, end_y)
+    return f"Dragged from ({start_x}, {start_y}) to ({end_x}, {end_y})"
+
+
+@mcp.tool()
+@requires_sandbox
+def scroll(x: int, y: int, direction: str, clicks: int = 3) -> str:
+    """Scroll at (x, y). direction is 'up' or 'down'. clicks controls amount."""
+    # Normalise and validate locally rather than forwarding bad input.
+    direction = direction.strip().lower()
+    if direction not in SCROLL_DIRECTIONS:
+        raise ValueError(f"direction must be 'up' or 'down', got {direction!r}")
+    sandbox.computer_use.mouse.scroll(x, y, direction, clicks)
+    return f"Scrolled {direction} {clicks} clicks at ({x}, {y})"
+
+
+@mcp.tool()
+@requires_sandbox
+def cursor_position() -> str:
+    """Return the current (x, y) position of the mouse cursor."""
+    pos = sandbox.computer_use.mouse.get_position()
+    return f"Cursor position: ({pos.x}, {pos.y})"
+
+
+@mcp.tool()
+@requires_sandbox
+def type_text(text: str) -> str:
+    """Type the given text on the Windows desktop."""
+    sandbox.computer_use.keyboard.type(text)
+    return f"Typed: {text}"
+
+
+@mcp.tool()
+@requires_sandbox
+def press_key(key: str) -> str:
+    """Press a key or key combination (e.g. 'Return', 'ctrl+c', 'alt+F4')."""
+    # A bare "+" is the plus key itself, not a combination separator.
+    if "+" in key and key != "+":
+        sandbox.computer_use.keyboard.hotkey(key)
+    else:
+        sandbox.computer_use.keyboard.press(key)
+    return f"Pressed: {key}"
+
+
+@mcp.tool()
+@requires_sandbox
+def hold_key(key: str, duration: float = 0.5) -> str:
+    """Hold a key down for the given duration in seconds."""
+    # Validate first so a bad duration cannot fail after the key was sent.
+    if duration < 0:
+        raise ValueError(f"duration must be non-negative, got {duration}")
+    # NOTE(review): this sends a single press() and then sleeps; whether the
+    # key stays down for `duration` depends on press() semantics — confirm.
+    sandbox.computer_use.keyboard.press(key)
+    time.sleep(duration)
+    return f"Held {key} for {duration}s"
+
+
+@mcp.tool()
+def wait(seconds: float = 1.0) -> str:
+    """Wait for the specified number of seconds."""
+    time.sleep(seconds)
+    return f"Waited {seconds} seconds"
+
+
+if __name__ == "__main__":
+    mcp.run(transport="streamable-http", host="0.0.0.0", port=8000)
diff --git a/harbor_cookbook/recipes/computer-use-windows/environment/docker-compose.yaml b/harbor_cookbook/recipes/computer-use-windows/environment/docker-compose.yaml
new file mode 100644
index 0000000..58a1dae
--- /dev/null
+++ b/harbor_cookbook/recipes/computer-use-windows/environment/docker-compose.yaml
@@ -0,0 +1,24 @@
+# This file is merged on top of Harbor's base docker-compose config.
+# The `main` service is automatically configured by Harbor with the build
+# context, image, command, volumes, and resource limits.
+# You only need to specify overrides for `main` and define additional services.
+services:
+  main:
+    depends_on:
+      desktop:
+        condition: service_healthy
+
+  desktop:
+    build:
+      context: ./desktop
+    expose:
+      - "8000"
+    environment:
+      - DAYTONA_API_KEY
+      - DAYTONA_API_URL
+    healthcheck:
+      test: ["CMD", "python", "-c", "import socket; s=socket.create_connection(('localhost',8000),timeout=2); s.close()"]
+      interval: 2s
+      timeout: 5s
+      retries: 15
+      start_period: 180s  # port 8000 opens only after sandbox setup; the create call alone may take 120s
diff --git a/harbor_cookbook/recipes/computer-use-windows/instruction.md b/harbor_cookbook/recipes/computer-use-windows/instruction.md
index d4529cd..c429ea7 100644
--- a/harbor_cookbook/recipes/computer-use-windows/instruction.md
+++ b/harbor_cookbook/recipes/computer-use-windows/instruction.md
@@ -7,11 +7,6 @@ Your task:
 1. Take screenshots and interact with the application on the desktop
 2. The application has a multi-step challenge — you will need to navigate through it
 3. Find the secret code revealed at the end of the challenge
-4. Write **only the secret code value** to `C:\app\secret.txt`
-
-You can open a command prompt or PowerShell and run:
-```
-echo SECRET_CODE_HERE > C:\app\secret.txt
-```
+4. Write **only the secret code value** to `/app/secret.txt`
 
 Write the value exactly as displayed, with no additional formatting or whitespace.
diff --git a/harbor_cookbook/recipes/computer-use-windows/scripts/setup.py b/harbor_cookbook/recipes/computer-use-windows/scripts/setup.py
deleted file mode 100644
index 45641fb..0000000
--- a/harbor_cookbook/recipes/computer-use-windows/scripts/setup.py
+++ /dev/null
@@ -1,144 +0,0 @@
-"""Setup script for the computer-use-windows recipe.
-
-Uploaded to the Windows sandbox and executed during environment startup.
-Deploys the challenge app and launches it on the desktop.
-""" - -import os -import subprocess -import textwrap - -CHALLENGE_DIR = r"C:\harbor_challenge" -CHALLENGE_PY = os.path.join(CHALLENGE_DIR, "challenge.py") -APP_DIR = r"C:\app" - -CHALLENGE_SOURCE = textwrap.dedent('''\ - """Multi-step tkinter challenge that requires genuine GUI interaction.""" - - import tkinter as tk - - - class ChallengeApp: - SECRET_CODE = "HARBOR-CU-2025-VERIFIED" - PASSPHRASE = "open sesame" - - def __init__(self, root: tk.Tk) -> None: - self.root = root - self.root.title("Harbor Challenge") - self.root.geometry("800x500") - self.root.resizable(False, False) - - self.frame = tk.Frame(self.root) - self.frame.pack(expand=True) - - self.show_welcome() - - def clear(self) -> None: - for widget in self.frame.winfo_children(): - widget.destroy() - - def show_welcome(self) -> None: - self.clear() - tk.Label( - self.frame, - text="Welcome to the Harbor Challenge", - font=("Helvetica", 26), - ).pack(pady=(60, 30)) - tk.Label( - self.frame, - text="Click the button below to begin.", - font=("Helvetica", 18), - ).pack(pady=10) - tk.Button( - self.frame, - text="Start Challenge", - font=("Helvetica", 18), - width=20, - command=self.show_passphrase, - ).pack(pady=40) - - def show_passphrase(self) -> None: - self.clear() - tk.Label( - self.frame, - text="Enter the passphrase to continue:", - font=("Helvetica", 22), - ).pack(pady=(60, 10)) - tk.Label( - self.frame, - text=f"The passphrase is: {self.PASSPHRASE}", - font=("Helvetica", 18), - fg="blue", - ).pack(pady=10) - - self.entry = tk.Entry(self.frame, font=("Helvetica", 18), width=30) - self.entry.pack(pady=20) - self.entry.focus_set() - - self.error_label = tk.Label( - self.frame, text="", font=("Helvetica", 14), fg="red" - ) - self.error_label.pack(pady=5) - - tk.Button( - self.frame, - text="Submit", - font=("Helvetica", 18), - width=20, - command=self.check_passphrase, - ).pack(pady=10) - - def check_passphrase(self) -> None: - if self.entry.get().strip().lower() == self.PASSPHRASE: - 
self.show_secret() - else: - self.error_label.config(text="Wrong passphrase. Try again.") - - def show_secret(self) -> None: - self.clear() - tk.Label( - self.frame, - text="Congratulations!", - font=("Helvetica", 26), - fg="green", - ).pack(pady=(80, 30)) - tk.Label( - self.frame, - text=f"SECRET_CODE: {self.SECRET_CODE}", - font=("Helvetica", 22), - ).pack(pady=20) - - - if __name__ == "__main__": - root = tk.Tk() - ChallengeApp(root) - root.mainloop() -''') - - -def main(): - # Create directories - os.makedirs(CHALLENGE_DIR, exist_ok=True) - os.makedirs(APP_DIR, exist_ok=True) - - # Write challenge app - with open(CHALLENGE_PY, "w") as f: - f.write(CHALLENGE_SOURCE) - print(f"Wrote challenge app to {CHALLENGE_PY}") - - # Install pyautogui (needed for CUA desktop interface) - subprocess.run( - ["python", "-m", "pip", "install", "pyautogui", "Pillow"], - check=False, - ) - - # Launch challenge app in the background - subprocess.Popen( - ["python", CHALLENGE_PY], - creationflags=0x00000008, # DETACHED_PROCESS on Windows - ) - print("Launched challenge app on desktop") - - -if __name__ == "__main__": - main() diff --git a/harbor_cookbook/recipes/computer-use-windows/task.toml b/harbor_cookbook/recipes/computer-use-windows/task.toml index 0fa6880..73997f1 100644 --- a/harbor_cookbook/recipes/computer-use-windows/task.toml +++ b/harbor_cookbook/recipes/computer-use-windows/task.toml @@ -7,9 +7,17 @@ timeout_sec = 600.0 timeout_sec = 600.0 [environment] -build_timeout_sec = 1800.0 -cpus = 4 -memory_mb = 8192 -storage_mb = 65536 +build_timeout_sec = 600.0 +cpus = 1 +memory_mb = 2048 +storage_mb = 10240 allow_internet = true -os_type = "windows" + +[environment.env] +DAYTONA_API_KEY = "${DAYTONA_API_KEY}" +DAYTONA_API_URL = "${DAYTONA_API_URL:-https://win.trydaytona.com/api}" + +[[environment.mcp_servers]] +name = "desktop" +transport = "streamable-http" +url = "http://desktop:8000/mcp" diff --git a/harbor_cookbook/recipes/computer-use-windows/tests/test.py 
b/harbor_cookbook/recipes/computer-use-windows/tests/test.py deleted file mode 100644 index a277a70..0000000 --- a/harbor_cookbook/recipes/computer-use-windows/tests/test.py +++ /dev/null @@ -1,48 +0,0 @@ -"""Tests that the agent completed the multi-step GUI challenge.""" - -import os -import sys - -SECRET_VALUE = "HARBOR-CU-2025-VERIFIED" - -# Support both Windows (Daytona) and Linux (Docker oracle) paths -SECRET_PATHS = [r"C:\app\secret.txt", "/app/secret.txt"] -REWARD_PATHS = [r"C:\logs\verifier\reward.txt", "/logs/verifier/reward.txt"] - - -def find_existing(paths): - for p in paths: - if os.path.exists(p): - return p - return paths[0] - - -def run_tests(): - errors = [] - - secret_path = find_existing(SECRET_PATHS) - if not os.path.exists(secret_path): - errors.append(f"secret.txt was not created at any of {SECRET_PATHS}") - else: - with open(secret_path) as f: - content = f.read().strip() - if content != SECRET_VALUE: - errors.append(f"secret.txt contains '{content}', expected '{SECRET_VALUE}'") - - reward = "1" if not errors else "0" - - reward_path = find_existing(REWARD_PATHS) - os.makedirs(os.path.dirname(reward_path), exist_ok=True) - with open(reward_path, "w") as f: - f.write(reward) - - if errors: - for e in errors: - print(f"FAIL: {e}", file=sys.stderr) - sys.exit(1) - else: - print("PASS: secret.txt matches expected value") - - -if __name__ == "__main__": - run_tests() diff --git a/harbor_cookbook/recipes/registry.json b/harbor_cookbook/recipes/registry.json index 5c13d38..fb667ae 100644 --- a/harbor_cookbook/recipes/registry.json +++ b/harbor_cookbook/recipes/registry.json @@ -38,7 +38,7 @@ }, { "name": "computer-use-windows", - "description": "Computer-use task on a Windows virtual desktop requiring genuine GUI interaction via CUA agents.", + "description": "Computer-use task on a remote Daytona Windows desktop requiring genuine GUI interaction via MCP tools.", "path": "recipes/computer-use-windows" } ]