NNTin · NNTin · Mar 18, 2026 · Mar 18, 2026 · Mar 18, 2026 · Mar 18, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -26,7 +26,7 @@ concurrency:
 jobs:
   ci:
     runs-on: ubuntu-latest
-    timeout-minutes: 15
+    timeout-minutes: 25
 
     steps:
       - name: Checkout
@@ -54,6 +54,12 @@ jobs:
         working-directory: webview-ui
         run: npm ci
 
+      - name: Install Playwright Dependencies
+        id: install_playwright_deps
+        if: always() && steps.install_root.outcome == 'success'
+        run: npx playwright install --with-deps chromium
+        continue-on-error: true
+
       # --- Quality Checks (blocking) ---
 
       - name: Type Check
@@ -75,6 +81,13 @@ jobs:
         run: npm run lint
         continue-on-error: true
 
+      - name: Webview Tests
+        id: webview_test
+        if: always() && steps.install_webview.outcome == 'success'
+        working-directory: webview-ui
+        run: npm test
+        continue-on-error: true
+
       - name: Format Check
         id: format_check
         if: always() && steps.install_root.outcome == 'success'
@@ -91,6 +104,12 @@ jobs:
           cd webview-ui && npm run build
         continue-on-error: true
 
+      - name: E2E Tests
+        id: e2e_test
+        if: always() && steps.build.outcome == 'success' && steps.install_playwright_deps.outcome == 'success'
+        run: npm run e2e
+        continue-on-error: true
+
       # --- Advisory Checks (non-blocking) ---
 
       - name: Audit Root Dependencies
@@ -115,11 +134,14 @@ jobs:
           SETUP_NODE: ${{ steps.setup_node.outcome }}
           INSTALL_ROOT: ${{ steps.install_root.outcome }}
           INSTALL_WEBVIEW: ${{ steps.install_webview.outcome }}
+          INSTALL_PLAYWRIGHT_DEPS: ${{ steps.install_playwright_deps.outcome }}
           TYPE_CHECK: ${{ steps.type_check.outcome }}
           ROOT_LINT: ${{ steps.root_lint.outcome }}
           WEBVIEW_LINT: ${{ steps.webview_lint.outcome }}
+          WEBVIEW_TEST: ${{ steps.webview_test.outcome }}
           FORMAT_CHECK: ${{ steps.format_check.outcome }}
           BUILD: ${{ steps.build.outcome }}
+          E2E_TEST: ${{ steps.e2e_test.outcome }}
           AUDIT_ROOT: ${{ steps.audit_root.outcome }}
           AUDIT_WEBVIEW: ${{ steps.audit_webview.outcome }}
         run: |
@@ -135,11 +157,14 @@ jobs:
             echo "| Setup Node | $(status "$SETUP_NODE") |"
             echo "| Install root deps | $(status "$INSTALL_ROOT") |"
             echo "| Install webview deps | $(status "$INSTALL_WEBVIEW") |"
+            echo "| Install Playwright deps | $(status "$INSTALL_PLAYWRIGHT_DEPS") |"
             echo "| **Type check** | $(status "$TYPE_CHECK") |"
             echo "| **Root lint** | $(status "$ROOT_LINT") |"
             echo "| **Webview lint** | $(status "$WEBVIEW_LINT") |"
+            echo "| **Webview tests** | $(status "$WEBVIEW_TEST") |"
             echo "| **Format check** | $(status "$FORMAT_CHECK") |"
             echo "| **Build** | $(status "$BUILD") |"
+            echo "| **E2E tests** | $(status "$E2E_TEST") |"
             echo "| Audit root _(advisory)_ | $(status "$AUDIT_ROOT") |"
             echo "| Audit webview _(advisory)_ | $(status "$AUDIT_WEBVIEW") |"
           } >> "$GITHUB_STEP_SUMMARY"
@@ -153,16 +178,19 @@ jobs:
           SETUP_NODE: ${{ steps.setup_node.outcome }}
           INSTALL_ROOT: ${{ steps.install_root.outcome }}
           INSTALL_WEBVIEW: ${{ steps.install_webview.outcome }}
+          INSTALL_PLAYWRIGHT_DEPS: ${{ steps.install_playwright_deps.outcome }}
           TYPE_CHECK: ${{ steps.type_check.outcome }}
           ROOT_LINT: ${{ steps.root_lint.outcome }}
           WEBVIEW_LINT: ${{ steps.webview_lint.outcome }}
+          WEBVIEW_TEST: ${{ steps.webview_test.outcome }}
           FORMAT_CHECK: ${{ steps.format_check.outcome }}
           BUILD: ${{ steps.build.outcome }}
+          E2E_TEST: ${{ steps.e2e_test.outcome }}
         run: |
           failed=0
           for step in CHECKOUT SETUP_NODE INSTALL_ROOT INSTALL_WEBVIEW \
-                      TYPE_CHECK ROOT_LINT WEBVIEW_LINT FORMAT_CHECK \
-                      BUILD; do
+                      INSTALL_PLAYWRIGHT_DEPS TYPE_CHECK ROOT_LINT WEBVIEW_LINT \
+                      WEBVIEW_TEST FORMAT_CHECK BUILD E2E_TEST; do
             eval "val=\$$step"
             if [ "$val" != "success" ]; then
               echo "::error::$step failed"

diff --git a/.gitignore b/.gitignore
@@ -17,6 +17,10 @@ Thumbs.db
 .vscode-test/
 /.idea
 
+# E2E test artifacts
+test-results/
+playwright-report/
+
 # Build artifacts
 *.vsix
 *.map

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -94,6 +94,52 @@ These conventions are enforced by custom ESLint rules (`eslint-rules/pixel-agent
 
 These rules are set to `warn` — they won't block your PR but will flag violations for cleanup.
 
+## End-to-End Tests
+
+The `e2e/` directory contains Playwright tests that launch a real VS Code instance with the extension loaded in development mode.
+
+### Running e2e tests locally
+
+```bash
+# Build the extension first (tests load the compiled output)
+npm run build
+
+# Headless (default — uses xvfb-run on Linux)
+npm run e2e
+
+# Headed (shows the VS Code window)
+npm run e2e:headed
+
+# Step-by-step debug mode
+npm run e2e:debug
+```
+
+On the first run, `@vscode/test-electron` will download a stable VS Code release into `.vscode-test/` (≈200 MB). Subsequent runs reuse the cache.
+
+### Artifacts
+
+All test artifacts are written to `test-results/e2e/`:
+
+| Path | Contents |
+|---|---|
+| `test-results/e2e/videos/<test-name>/` | `.webm` screen recording for every test |
+| `test-results/e2e/html/` | Playwright HTML report (`npx playwright show-report test-results/e2e/html`) |
+| `test-results/e2e/*.png` | Final screenshots saved on failure |
+
+On failure, the test output prints the path to the video for that run.
+
+### Mock claude
+
+Tests never invoke the real `claude` CLI. Instead, the bash script at `e2e/fixtures/mock-claude` is copied as `claude` into an isolated `bin/` directory, and that directory is prepended to `PATH` before VS Code starts.
+
+The mock:
+1. Parses `--session-id <uuid>` from its arguments.
+2. Appends a line to `$HOME/.claude-mock/invocations.log` so tests can assert it was called.
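+3. Creates `$HOME/.claude/projects/<project-hash>/<session-id>.jsonl` with a minimal init line so the extension's file-watcher can detect the session. Despite the name, `<project-hash>` is not a hash: it is the mock's working directory with characters outside `[a-zA-Z0-9-]` turned into `-`.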
+4. Sleeps for 30 s (keeps the terminal alive) then exits.
+
+Each test runs with an isolated `HOME` and `--user-data-dir`, so no test state leaks between runs or into your real VS Code profile.
+
 ## Submitting a Pull Request
 
 1. Fork the repo and create a feature branch from `main`

diff --git a/e2e/fixtures/mock-claude b/e2e/fixtures/mock-claude
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+# Mock 'claude' executable for Pixel Agents e2e tests.
+#
+# Behaviour:
+#   1. Parses --session-id <id> from args.
+#   2. Appends an invocation record to $HOME/.claude-mock/invocations.log.
+#   3. Creates the expected JSONL file under $HOME/.claude/projects/<hash>/<id>.jsonl
+#      using the same path-hash algorithm as agentManager.ts
+#      (replace every non-[a-zA-Z0-9-] char with '-', one for one).
+#   4. Writes a minimal valid JSONL line so the extension file-watcher can proceed.
+#   5. Stays alive for up to 30 s (tests can kill it once assertions pass).
+
+set -euo pipefail
+
+# Install the signal handler before doing any work so a SIGTERM/SIGINT that
+# arrives early still results in a clean exit. SLEEP_PID is initialised to an
+# empty string so the handler is safe under `set -u` before the sleep starts.
+SLEEP_PID=""
+trap 'if [ -n "$SLEEP_PID" ]; then kill "$SLEEP_PID" 2>/dev/null || true; fi; exit 0' SIGTERM SIGINT
+
+# The value following the last --session-id flag wins.
+SESSION_ID=""
+PREV=""
+for arg in "$@"; do
+  if [ "$PREV" = "--session-id" ]; then
+    SESSION_ID="$arg"
+  fi
+  PREV="$arg"
+done
+
+# Record the invocation so tests can assert the mock was called.
+LOG_DIR="${HOME}/.claude-mock"
+mkdir -p "$LOG_DIR"
+echo "$(date -Iseconds) session-id=${SESSION_ID} cwd=$(pwd) args=$*" >> "${LOG_DIR}/invocations.log"
+
+if [ -n "$SESSION_ID" ]; then
+  CWD="$(pwd)"
+  # Replicate agentManager.ts: workspacePath.replace(/[^a-zA-Z0-9-]/g, '-')
+  # NOTE: no `-s` here. Squeezing would collapse runs of '-' (including ones
+  # already present in the path), whereas the regex replaces character by
+  # character, so the directory names would stop matching.
+  DIR_NAME="$(printf '%s' "$CWD" | tr -c 'a-zA-Z0-9-' '-')"
+  PROJECT_DIR="${HOME}/.claude/projects/${DIR_NAME}"
+  mkdir -p "$PROJECT_DIR"
+  JSONL_FILE="${PROJECT_DIR}/${SESSION_ID}.jsonl"
+
+  # Write a minimal system init line so the extension watcher sees the file.
+  printf '{"type":"system","subtype":"init","content":"mock-claude-ready"}\n' >> "$JSONL_FILE"
+fi
+
+# Stay alive so the VS Code terminal doesn't immediately close.
+sleep 30 &
+SLEEP_PID=$!
+
+# `|| true` keeps `set -e` from aborting when wait is interrupted by a signal.
+wait "$SLEEP_PID" || true
diff --git a/e2e/global-setup.ts b/e2e/global-setup.ts
@@ -0,0 +1,18 @@
+import { downloadAndUnzipVSCode } from '@vscode/test-electron';
+import fs from 'fs';
+import path from 'path';
+
+export const VSCODE_CACHE_DIR = path.join(__dirname, '../.vscode-test');
+export const VSCODE_PATH_FILE = path.join(VSCODE_CACHE_DIR, 'vscode-executable.txt');
+
+/**
+ * Ensures a stable VS Code build is present in `VSCODE_CACHE_DIR` and records
+ * the resolved executable path in `VSCODE_PATH_FILE`, which
+ * `e2e/helpers/launch.ts` reads when starting a test session.
+ */
+export default async function globalSetup(): Promise<void> {
+  console.log('[e2e] Ensuring VS Code is downloaded...');
+
+  const downloadOptions = { version: 'stable', cachePath: VSCODE_CACHE_DIR };
+  const executablePath = await downloadAndUnzipVSCode(downloadOptions);
+  console.log(`[e2e] VS Code executable: ${executablePath}`);
+
+  // Make sure the cache directory exists before writing the path file into it.
+  fs.mkdirSync(VSCODE_CACHE_DIR, { recursive: true });
+  fs.writeFileSync(VSCODE_PATH_FILE, executablePath, 'utf8');
+}
diff --git a/e2e/helpers/launch.ts b/e2e/helpers/launch.ts
@@ -0,0 +1,132 @@
+import { _electron as electron } from '@playwright/test';
+import type { ElectronApplication, Page } from '@playwright/test';
+import fs from 'fs';
+import os from 'os';
+import path from 'path';
+
+const REPO_ROOT = path.join(__dirname, '../..');
+const VSCODE_PATH_FILE = path.join(REPO_ROOT, '.vscode-test/vscode-executable.txt');
+const MOCK_CLAUDE_PATH = path.join(REPO_ROOT, 'e2e/fixtures/mock-claude');
+const ARTIFACTS_DIR = path.join(REPO_ROOT, 'test-results/e2e');
+
+/** Handles and paths describing one VS Code instance started by `launchVSCode`. */
+export interface VSCodeSession {
+  /** Electron application handle returned by `electron.launch`. */
+  app: ElectronApplication;
+  /** First window reported by the Electron application. */
+  window: Page;
+  /** Workspace directory opened in VS Code (also the process working directory). */
+  workspaceDir: string;
+  /** Isolated directory passed to VS Code as `HOME` for this session. */
+  tmpHome: string;
+  /** Path of the mock's invocation log: `<tmpHome>/.claude-mock/invocations.log`. */
+  mockLogFile: string;
+  /** Closes the application and deletes the session's temp directory tree; errors are ignored. */
+  cleanup: () => Promise<void>;
+}
+
+/**
+ * Launch VS Code with the Pixel Agents extension loaded in development mode.
+ *
+ * Uses an isolated temp HOME and injects the mock `claude` binary at the
+ * front of PATH so no real Claude CLI is needed.
+ *
+ * @param testTitle - Test name; lower-cased and sanitised to form the per-test
+ *   video directory under `test-results/e2e/videos/`.
+ * @returns The running session. Callers should await `cleanup()` when done to
+ *   close VS Code and delete the temp directory tree.
+ * @throws If the VS Code executable path file cannot be read, or if preparing
+ *   the environment or launching fails. In the latter case the temp directory
+ *   tree is removed before the error is rethrown.
+ */
+export async function launchVSCode(testTitle: string): Promise<VSCodeSession> {
+  const vscodePath = fs.readFileSync(VSCODE_PATH_FILE, 'utf8').trim();
+
+  // --- Isolated temp directories ---
+  const tmpBase = fs.mkdtempSync(path.join(os.tmpdir(), 'pixel-e2e-'));
+  const tmpHome = path.join(tmpBase, 'home');
+  const workspaceDir = path.join(tmpBase, 'workspace');
+  const userDataDir = path.join(tmpBase, 'userdata');
+  const mockBinDir = path.join(tmpBase, 'bin');
+  const mockLogFile = path.join(tmpHome, '.claude-mock', 'invocations.log');
+
+  // Declared ahead of `cleanup` so the closure can never observe it uninitialised.
+  let app: ElectronApplication | undefined;
+
+  const cleanup = async (): Promise<void> => {
+    try {
+      await app?.close();
+    } catch {
+      // ignore close errors
+    }
+    try {
+      fs.rmSync(tmpBase, { recursive: true, force: true });
+    } catch {
+      // ignore cleanup errors
+    }
+  };
+
+  // Everything after mkdtemp runs inside the try so that any failure (missing
+  // fixture, unwritable artifacts dir, launch timeout, ...) still removes tmpBase.
+  try {
+    for (const dir of [tmpHome, workspaceDir, userDataDir, mockBinDir]) {
+      fs.mkdirSync(dir, { recursive: true });
+    }
+
+    // Copy mock-claude into the isolated bin dir under the name 'claude'
+    const mockDest = path.join(mockBinDir, 'claude');
+    fs.copyFileSync(MOCK_CLAUDE_PATH, mockDest);
+    fs.chmodSync(mockDest, 0o755);
+
+    // --- Video output dir ---
+    const safeTitle = testTitle.replace(/[^a-z0-9]+/gi, '-').toLowerCase();
+    const videoDir = path.join(ARTIFACTS_DIR, 'videos', safeTitle);
+    fs.mkdirSync(videoDir, { recursive: true });
+
+    // --- Environment for VS Code process ---
+    // Copy only defined variables instead of casting process.env, whose values
+    // are typed as `string | undefined`.
+    const env: Record<string, string> = {};
+    for (const [key, value] of Object.entries(process.env)) {
+      if (value !== undefined) {
+        env[key] = value;
+      }
+    }
+    env['HOME'] = tmpHome;
+    // Prepend mock bin so 'claude' resolves to our mock
+    env['PATH'] =
+      `${mockBinDir}${path.delimiter}${process.env['PATH'] ?? '/usr/local/bin:/usr/bin:/bin'}`;
+    // Prevent VS Code from trying to talk to real accounts / telemetry
+    env['VSCODE_TELEMETRY_DISABLED'] = '1';
+
+    // --- VS Code launch args ---
+    const args = [
+      // Load our extension in dev mode (this overrides the installed version)
+      `--extensionDevelopmentPath=${REPO_ROOT}`,
+      // Disable all other extensions so tests are isolated
+      '--disable-extensions',
+      // Isolated user-data (settings, state, etc.)
+      `--user-data-dir=${userDataDir}`,
+      // Skip interactive prompts
+      '--disable-workspace-trust',
+      '--skip-release-notes',
+      '--skip-welcome',
+      '--no-sandbox',
+      // Open the workspace folder
+      workspaceDir,
+    ];
+
+    app = await electron.launch({
+      executablePath: vscodePath,
+      args,
+      env,
+      cwd: workspaceDir,
+      recordVideo: {
+        dir: videoDir,
+        size: { width: 1280, height: 800 },
+      },
+      timeout: 60_000,
+    });
+
+    // Electron can expose the window before the page lifecycle events settle.
+    // The test waits for `.monaco-workbench`, so returning the window here is
+    // more reliable than waiting on `domcontentloaded` in CI.
+    const window = await app.firstWindow();
+
+    return { app, window, tmpHome, workspaceDir, mockLogFile, cleanup };
+  } catch (error) {
+    await cleanup();
+    throw error;
+  }
+}
+
+/**
+ * Blocks until the VS Code shell has rendered, so tests can start interacting.
+ *
+ * @param window - Page for the VS Code window to watch.
+ * @returns Resolves once `.monaco-workbench` is found; rejects if it has not
+ *   appeared within 60 seconds.
+ */
+export async function waitForWorkbench(window: Page): Promise<void> {
+  // The workbench root element only exists once the shell is ready.
+  const workbenchSelector = '.monaco-workbench';
+  const waitOptions = { timeout: 60_000 };
+  await window.waitForSelector(workbenchSelector, waitOptions);
+}