From 7dad1484898cc7319c36a18e7634334c02fa1560 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 11 May 2026 10:26:03 +0000 Subject: [PATCH] Add osaurus.macos-use v3.0.2 --- plugins/osaurus.macos-use.json | 100 ++++++++++++++++++++------------- 1 file changed, 60 insertions(+), 40 deletions(-) diff --git a/plugins/osaurus.macos-use.json b/plugins/osaurus.macos-use.json index 0491717..6263225 100644 --- a/plugins/osaurus.macos-use.json +++ b/plugins/osaurus.macos-use.json @@ -10,84 +10,88 @@ "capabilities": { "tools": [ { - "name": "open_application", - "description": "Launch (if needed) and prepare an app for backgrounded driving. By default `background: true` — the app's window is NOT raised and the user's foreground app is untouched. Returns pid, name, bundleId, and an initial snapshot (`mode`-shaped) so the next step can act immediately." + "description": "Launch (if needed) and prepare an app for backgrounded driving. By default `background: true` — the app's window is NOT raised and the user's foreground app is untouched. Returns pid, name, bundleId, and an initial snapshot (`mode`-shaped) so the next step can act immediately.", + "name": "open_application" }, { - "name": "list_apps", - "description": "List all running GUI apps (regular activation policy) with pid, name, bundleId, active, hidden. Use this before open_application when you want to attach to something already running without bringing it forward." + "widget": true, + "description": "List all running GUI apps (regular activation policy) with pid, name, bundleId, active, hidden. Use this before open_application when you want to attach to something already running without bringing it forward.", + "name": "list_apps" }, { - "name": "list_windows", - "description": "List all windows for a pid with their CGWindowID, title, focused/minimized flags, and bounds. 
Use the returned `windowId` with `take_screenshot` (windowId arg) and `get_ui_elements` (windowId arg via find_elements) to address one specific window without raising it." + "widget": true, + "description": "List all windows for a pid with their CGWindowID, title, focused/minimized flags, and bounds. Use the returned `windowId` with `take_screenshot` (windowId arg) and `get_ui_elements` (windowId arg via find_elements) to address one specific window without raising it.", + "name": "list_windows" }, { - "name": "get_ui_elements", - "description": "Capture the accessibility tree for a pid. Returns either a TraversalResult (mode='ax') or an SOMResult (mode='som'/'vision') with AX tree + screenshot + an `elements[]` array carrying both the snapshot id ('s7-12') and an `elementIndex` (1, 2, 3, …) for vision-first agents that prefer numeric indexing." + "description": "Capture the accessibility tree for a pid. Returns either a TraversalResult (mode='ax') or an SOMResult (mode='som'/'vision') with AX tree + screenshot + an `elements[]` array carrying both the snapshot id ('s7-12') and an `elementIndex` (1, 2, 3, …) for vision-first agents that prefer numeric indexing.", + "name": "get_ui_elements" }, { - "name": "find_elements", - "description": "Server-side search for elements by label/value/placeholder text and/or role. Cheaper than scanning a get_ui_elements result by hand. Returns a TraversalResult; matched elements are cached and immediately usable with click_element, set_value, etc." + "description": "Server-side search for elements by label/value/placeholder text and/or role. Cheaper than scanning a get_ui_elements result by hand. Returns a TraversalResult; matched elements are cached and immediately usable with click_element, set_value, etc.", + "name": "find_elements" }, { - "name": "get_active_window", - "description": "Returns the currently active window's pid, app name, title, and bounds. 
Useful when you don't yet have a pid and want to discover the foreground app." + "widget": true, + "description": "Returns the currently active window's pid, app name, title, and bounds. Useful when you don't yet have a pid and want to discover the foreground app.", + "name": "get_active_window" }, { - "name": "click_element", - "description": "Click an element by its snapshot id ('s7-12'). Always tries AXPress first (fully backgrounded, no cursor warp). Falls back to a per-pid SkyLight click at the element's center. Final fallback is the HID tap which moves the user's cursor — hits a small set of canvas/Blender/Unity-style apps. NOTE on Chromium right-click: the renderer-IPC layer coerces synthetic right-clicks on web content to left-clicks. AXShowMenu (which click_element prefers) is the only reliable right-click path for those targets." + "description": "Click an element by its snapshot id ('s7-12'). Always tries AXPress first (fully backgrounded, no cursor warp). Falls back to a per-pid SkyLight click at the element's center. Final fallback is the HID tap which moves the user's cursor — hits a small set of canvas/Blender/Unity-style apps. NOTE on Chromium right-click: the renderer-IPC layer coerces synthetic right-clicks on web content to left-clicks. AXShowMenu (which click_element prefers) is the only reliable right-click path for those targets.", + "name": "click_element" }, { - "name": "click", - "description": "Click at raw screen coordinates. With `pid`, routes per-pid (SkyLight when available — no cursor warp). Without `pid`, falls back to the global HID tap which warps the cursor. Prefer click_element whenever you have a snapshot id." + "description": "Click at raw screen coordinates. With `pid`, routes per-pid (SkyLight when available — no cursor warp). Without `pid`, falls back to the global HID tap which warps the cursor. 
Prefer click_element whenever you have a snapshot id.", + "name": "click" }, { - "name": "type_text", - "description": "Type text into the focused element. With `id` (a snapshot id), focuses that element first AND clears it (replace=true by default). With `pid` (or implicitly, the pid derived from `id` or the most-recent snapshot), keystrokes are routed per-pid via CGEvent.postToPid — the user can keep typing in their own app. Without any pid hint, falls back to the HID tap (visible to the user). If the snapshot id is stale, returns 'stale: true' and the agent should re-observe." + "description": "Type text into the focused element. With `id` (a snapshot id), focuses that element first AND clears it (replace=true by default). With `pid` (or implicitly, the pid derived from `id` or the most-recent snapshot), keystrokes are routed per-pid via CGEvent.postToPid — the user can keep typing in their own app. Without any pid hint, falls back to the HID tap (visible to the user). If the snapshot id is stale, returns 'stale: true' and the agent should re-observe.", + "name": "type_text" }, { - "name": "set_value", - "description": "Directly set a text field's value via accessibility (kAXValueAttribute). Instant and replaces existing content. Preferred over type_text for forms when the field is AX-editable. REQUIRES a recent snapshot id; if 'stale: true' is returned, observe again." + "description": "Directly set a text field's value via accessibility (kAXValueAttribute). Instant and replaces existing content. Preferred over type_text for forms when the field is AX-editable. REQUIRES a recent snapshot id; if 'stale: true' is returned, observe again.", + "name": "set_value" }, { - "name": "clear_field", - "description": "Clear a text field by snapshot id. Tries set_value(\"\") first, falls back to focus + Cmd+A + delete (routed per-pid)." + "description": "Clear a text field by snapshot id. 
Tries set_value(\"\") first, falls back to focus + Cmd+A + delete (routed per-pid).", + "name": "clear_field" }, { - "name": "press_key", - "description": "Press a keyboard key with optional modifiers. With `pid` (or the most-recent snapshot's pid), routes per-pid so the keystroke lands in that app without affecting the user's frontmost window." + "description": "Press a keyboard key with optional modifiers. With `pid` (or the most-recent snapshot's pid), routes per-pid so the keystroke lands in that app without affecting the user's frontmost window.", + "name": "press_key" }, { - "name": "scroll", - "description": "Scroll in a direction. With `pid`, routes per-pid (no cursor warp). Without `pid`, optionally moves the global cursor first (legacy behavior)." + "description": "Scroll in a direction. With `pid`, routes per-pid (no cursor warp). Without `pid`, optionally moves the global cursor first (legacy behavior).", + "name": "scroll" }, { - "name": "drag", - "description": "Drag from one screen coordinate to another. NOTE: drag is one operation that genuinely needs the cursor to move (most drag-receivers key on the global mouse position) so it ALWAYS warps the cursor. The mouse button is always released even on errors so a stuck-down mouse cannot happen." + "description": "Drag from one screen coordinate to another. NOTE: drag is one operation that genuinely needs the cursor to move (most drag-receivers key on the global mouse position) so it ALWAYS warps the cursor. The mouse button is always released even on errors so a stuck-down mouse cannot happen.", + "name": "drag" }, { - "name": "act_and_observe", - "description": "Run a single action and immediately re-observe in one call. Eliminates the 'forgot to re-observe after navigation' failure mode. Returns { action, snapshot? | som? }." + "description": "Run a single action and immediately re-observe in one call. Eliminates the 'forgot to re-observe after navigation' failure mode. Returns { action, snapshot? 
| som? }.", + "name": "act_and_observe" }, { - "name": "take_screenshot", - "description": "Capture a screenshot. Defaults: jpeg, quality 0.7, scale 0.5. Pass `windowId` to capture exactly one window (works for occluded / off-Space windows). Set 'annotate: true' (with `pid` or `windowId`) to overlay element ids from the most recent snapshot." + "description": "Capture a screenshot. Defaults: jpeg, quality 0.7, scale 0.5. Pass `windowId` to capture exactly one window (works for occluded / off-Space windows). Set 'annotate: true' (with `pid` or `windowId`) to overlay element ids from the most recent snapshot.", + "name": "take_screenshot" }, { - "name": "list_displays", - "description": "List all connected displays with positions and dimensions." + "widget": true, + "description": "List all connected displays with positions and dimensions.", + "name": "list_displays" }, { - "name": "start_automation_session", - "description": "Record a session title and optional step count. v0.4 removed the on-screen HUD and the global Esc-cancel monitor — backgrounded automations are invisible to the user and there's nothing to interrupt. The session tools remain as a side-effect-free telemetry channel: callers can read back state via the same response shape." + "description": "Record a session title and optional step count. v0.4 removed the on-screen HUD and the global Esc-cancel monitor — backgrounded automations are invisible to the user and there's nothing to interrupt. The session tools remain as a side-effect-free telemetry channel: callers can read back state via the same response shape.", + "name": "start_automation_session" }, { - "name": "update_automation_session", - "description": "Update title/narration/step counter on the current session. Side-effect-free; no UI change." + "description": "Update title/narration/step counter on the current session. 
Side-effect-free; no UI change.", + "name": "update_automation_session" }, { - "name": "end_automation_session", - "description": "Reset the session record. Optional, since session state is purely informational now." + "description": "Reset the session record. Optional, since session state is purely informational now.", + "name": "end_automation_session" } ], "skills": [ @@ -213,6 +217,22 @@ } } ] + }, + { + "version": "3.0.2", + "release_date": "2026-05-11", + "artifacts": [ + { + "os": "macos", + "arch": "arm64", + "url": "https://github.com/osaurus-ai/osaurus-macos-use/releases/download/3.0.2/osaurus.macos-use-3.0.2-macos-arm64.zip", + "sha256": "3139cc89b2ffe1c2e490c77a109e0c18031cc06fa93724fa80ec42be4a807394", + "min_macos": "13.0", + "minisign": { + "signature": "untrusted comment: signature from minisign secret key\nRWRh6pAWTUF1v53Sc5qV/gmL9rjTDfNeBmmZOrFpq2v71klWuE00aztyvmu0ppCh0W/oSKhP5O3qakinp2NY2v7gL3zztc1GZgs=\ntrusted comment: timestamp:1778495138\tfile:osaurus.macos-use-3.0.2-macos-arm64.zip\npl5lyz7bkBhFRrASiwLDbk5hj0fgrDZkqY9qljK1iItZtz5q/v1+s80HMW3IBBLIBwOD2UXYAnWCX3ELZCw+Bw==" + } + } + ] } ], "skill": "---\nname: osaurus-macos-use\ndescription: Drive macOS apps in the background. Use when the user asks you to interact with native Mac apps, automate UI tasks, browse the web in Safari, fill forms, navigate menus, or perform any on-screen action — without taking the user's screen or cursor away from them.\nmetadata:\n author: osaurus\n version: \"3.0.0\"\n---\n\n# Osaurus macOS Use\n\nA backgrounded computer-use driver. Open apps, observe the UI, click buttons, type into fields, navigate menus, browse the web — **all while the user keeps working in the foreground**. 
Cursor never moves, focus never changes, Spaces never follow.\n\nBuilt on the cua-driver recipe: SkyLight `SLEventPostToPid` for cursor-warp-free routing, yabai-style `focusWithoutRaise` for AppKit-active flips, the (-1,-1) Chromium primer click for renderer-IPC user-activation gates, and a per-pid `CGEvent.postToPid` fallback for the rest.\n\n## The Contract\n\nFive rules. Follow them in order, every time.\n\n1. **Discover** with `list_apps()` if the target is already running, or skip to step 2 to launch fresh.\n2. **Open** with `open_application` (defaults to `background: true` — the app is NOT raised). It returns the app `pid` AND a starting capture (default `mode: \"som\"` — AX tree + screenshot + numeric `elementIndex` per element). **Read the capture before doing anything else.**\n3. **Locate** with `find_elements({ pid, text, role? })` whenever you know what you're looking for. Faster, cheaper, more reliable than scanning a `get_ui_elements` result by hand.\n4. **Act** with `click_element`, `set_value`, `type_text`, `clear_field`, `press_key`. Always pass element ids in the `s{snapshot}-{n}` format (e.g. `\"s7-12\"`). For raw-coordinate `click`/`scroll`/`type_text`/`press_key`, **always pass `pid`** — that's what keeps routing per-pid (no cursor warp).\n5. **Re-observe** only when the result tells you to:\n - `\"stale\": true` → call `get_ui_elements` (or `find_elements`) again, then retry.\n - `\"removed\": true` → element is gone; observe and find a new one.\n - `delta.focusedWindow` changed in a way you didn't expect → observe again.\n - Otherwise → keep going.\n\nIf you'd rather not think about re-observing, use `act_and_observe` — runs an action and returns a fresh capture in one call.\n\n## Capture modes (cua-style)\n\n`open_application`, `get_ui_elements`, and `act_and_observe` accept `mode`:\n\n- **`som`** (default) — AX tree + annotated screenshot + per-element `elementIndex`. 
Best for vision-first agents that ground on pixels.\n- **`ax`** — tree only. Fastest. **No Screen Recording permission needed.** Best for AppKit/SwiftUI apps with rich AX trees.\n- **`vision`** — screenshot only. Smallest payload for VLMs that don't need the tree.\n\nIn `som` mode, every element is addressable two ways: by snapshot id (`\"s7-12\"`) AND by `elementIndex` (1, 2, 3, …). Use whichever your model prefers — both resolve to the same element.\n\n## Routing chain\n\nYou don't usually need to think about this, but it's useful when debugging:\n\n1. **AXPress / AXShowMenu / AXValue** — `click_element` etc. try this first. Fully backgrounded.\n2. **`SLEventPostToPid`** — SkyLight private framework, loaded at runtime. Trusted by Chromium renderers, no cursor warp.\n3. **`CGEvent.postToPid`** — public CoreGraphics. Works for almost everything except Chromium web content.\n4. **HID tap** — last resort. **This is the only path that warps the user's cursor.** Auto-falls-back here for canvas/Blender/Unity.\n\n`drag` is the one operation that ALWAYS uses the HID tap (drop receivers key on the global cursor position).\n\n## Canonical Recipe\n\n```\n1. list_apps()\n → { apps: [..., { pid: 1234, name: \"Safari\", bundleId: \"com.apple.Safari\", active: false }, ...] }\n\n2. open_application({ identifier: \"Safari\", mode: \"som\" })\n → { pid: 1234, som: { snapshot: { snapshotId: 1, ... }, image: { mimeType: \"image/jpeg\", data: \"...\" }, elements: [{ elementIndex: 1, id: \"s1-3\", role: \"textfield\", label: \"Address\" }, ...] } }\n\n3. press_key({ key: \"l\", modifiers: [\"command\"], pid: 1234 })\n → { success: true, delta: { focusedElement: { role: \"textfield\", label: \"Address\" } } }\n\n4. type_text({ text: \"https://example.com\", pid: 1234 })\n5. press_key({ key: \"return\", pid: 1234 })\n\n6. 
find_elements({ pid: 1234, text: \"More information\", role: \"link\" })\n → { snapshotId: 2, elements: [{ id: \"s2-3\", role: \"link\", label: \"More information...\" }] }\n\n7. click_element({ id: \"s2-3\" })\n → { success: true, delta: { focusedWindow: \"IANA-managed Reserved Domains\" } }\n```\n\nThe user never sees Safari come forward. Their own app stays focused throughout.\n\n## Snapshot Ids and the Cache\n\n- Element ids look like `s7-12`. The `s7` is the snapshot they came from.\n- The plugin keeps the **last 2 snapshots** in cache. Ids from older snapshots return `\"stale\": true`.\n- Each call to `get_ui_elements`, `find_elements`, or `open_application` (with default `observe: true`) **starts a new snapshot** and bumps the counter.\n- This means: if you call `find_elements` twice in a row, the ids from the first call become stale on the third call, not the second.\n\nIf you ever see a result with `\"stale\": true`, the fix is always the same: re-observe and retry with the new id.\n\n## Tool Reference\n\n### Discovery / observation\n\n| Tool | When to use |\n| ----------------------- | ---------------------------------------------------------------------------------------------------------------------------------------- |\n| `list_apps` | List all running GUI apps. Use before `open_application` if the target is already up. |\n| `list_windows({ pid })` | Per-pid window list with `windowId`. Pass the windowId to `take_screenshot` (windowId arg) to read a specific window without raising it. |\n| `open_application` | First step for a fresh app. Defaults to `background: true` — never raises. Returns an initial capture in your chosen `mode`. |\n| `get_ui_elements` | Capture by pid. `mode: \"som\"` (default) returns tree + screenshot + elementIndex; `\"ax\"` is tree only; `\"vision\"` is screenshot only. |\n| `find_elements` | Server-side search by text and/or role. 
**Prefer this over `get_ui_elements` whenever you know what you're looking for.** |\n| `get_active_window` | The user's frontmost window (mostly for figuring out where they are). |\n\n### Element actions (snapshot id required)\n\n| Tool | When to use |\n| --------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |\n| `click_element` | Click by snapshot id. `button: \"right\"` and `doubleClick: true` supported. Tries AXPress first → SkyLight per-pid → HID tap. |\n| `set_value` | Replace a field's value instantly via AX. Best for forms. |\n| `clear_field` | Empty a text field. Use before `type_text` if you want to replace, not append. |\n| `type_text` | Keystroke-by-keystroke typing. Pass `id` to focus first; `replace: true` (default) clears the field. Routed per-pid via the element's pid. |\n\n### Coordinate / keyboard actions\n\n| Tool | When to use |\n| ----------- | ----------------------------------------------------------------------------------------------------------------------------------- |\n| `press_key` | Keyboard shortcuts. Pass `pid` to route per-pid (no foreground steal) AND get a focus delta back. |\n| `click` | Coordinate click. **Always pass `pid`** to keep routing backgrounded. Without it, falls back to the HID tap which warps the cursor. |\n| `scroll` | Direction + amount. Pass `pid` to route per-pid. |\n| `drag` | Coordinate drag. Always uses the HID tap; drop receivers need the cursor to track. |\n\n### Combined / utility\n\n| Tool | When to use |\n| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------- |\n| `act_and_observe` | Run any element action AND get a fresh capture (`mode`-shaped) in one call. |\n| `take_screenshot` | When you need pixels. `windowId` captures exactly one window (works for occluded / off-Space). 
`annotate: true` overlays element ids. |\n| `list_displays` | Multi-monitor setups only. |\n\n### Session telemetry\n\n| Tool | When to use |\n| ------------------------------------- | ------------------------------------------------------------------------------------------------- |\n| `start_automation_session({ title })` | Optional. Records a title for the agent transcript. **No HUD, no Esc cancel, no UI side effect.** |\n| `update_automation_session` | Update title / narration / step counter. |\n| `end_automation_session` | Reset the record. Optional. |\n\n## Tips That Actually Matter\n\n1. **Pass `pid` to every coordinate- or keyboard-based action** (`click`, `scroll`, `type_text`, `press_key`). Without it, the call can fall back to the global HID tap, which is visible to the user and (for clicks) warps the cursor.\n2. **`set_value` first, `type_text` if it fails.** `set_value` is instant and correct for most fields. `type_text` (focus + clear + type) is the fallback for search fields, password fields, anything that needs per-keystroke events.\n3. **Use `windowId` from `list_windows` when capturing a specific window.** It works for windows that are occluded, hidden, or on a different Space.\n4. **Check `truncated: true` in any snapshot.** If true, raise `maxElements` or use `find_elements`.\n5. **`focusedWindowOnly: true` is your friend.** Cheap re-observation when the action only changed the focused window.\n6. **Keyboard shortcuts beat menu navigation.** `press_key({ key: \"s\", modifiers: [\"command\"], pid: 1234 })` is one tool call; clicking File > Save is three.\n7. 
**Roles can be passed in any case, with or without the `AX` prefix.** `\"button\"`, `\"Button\"`, and `\"AXButton\"` all work.\n\n## Troubleshooting\n\n| Symptom | Fix |\n| ---------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |\n| `\"stale\": true` | Re-observe (any of `get_ui_elements`, `find_elements`, `open_application`) and retry with the fresh id. |\n| `\"removed\": true` | The element is gone from the UI. Re-observe and find a new one. |\n| Cursor moves during a click | You called `click` without `pid`, OR the target is a canvas/game app filtering per-pid routes. Pass `pid`; for canvas apps, accept the warp (it's by design). |\n| Right-click on a Chrome web page does a left-click instead | Chromium coerces synthetic right-clicks at the renderer-IPC layer. Use `click_element` on an AX-addressable target so `AXShowMenu` fires instead. |\n| `error` mentions \"not a valid snapshot id\" | You passed a v0.2 integer id. Use the `s{snapshot}-{n}` strings (e.g. `\"s7-12\"`) returned by the new tools. |\n| Empty `elements: []` from `get_ui_elements` | Check `truncated`; lower `maxDepth`; broaden the search with `interactiveOnly: false`; or use `find_elements`. |\n| `set_value` returns \"not editable\" | Fall back to `type_text` with the element id (auto-focuses and replaces). |\n| App won't open / wrong app focused | Try the bundle id (`com.apple.Safari`) instead of name. Use `get_active_window` to confirm. |\n| No elements at all | The host app needs Accessibility permission in System Settings > Privacy & Security > Accessibility. |\n\n## Reference\n\nFor per-app recipes (Safari URL bar, web forms, tab management) and the full keyboard shortcut catalog, see [REFERENCE.md](REFERENCE.md). Don't load it unless you need it — most automation only needs the contract above.\n\n## Limitations\n\n- **Canvas apps** (Figma, games, Blender, Unity): per-pid event routes are filtered. 
Driver auto-falls back to the HID tap, which warps the cursor.\n- **Chromium right-click on web content**: coerced to left-click. Use AX paths instead.\n- **Highly dynamic web apps**: re-observe more often; prefer `find_elements` over caching ids across navigation.\n- **Iframes**: AX coverage varies. Safari is most reliable.",