From ce04c6c52b784a0cfb2d99758391ee871b1e9de0 Mon Sep 17 00:00:00 2001
From: Rod Boev <rod.boev@gmail.com>
Date: Fri, 13 Feb 2026 02:31:23 -0500
Subject: [PATCH 1/9] chore: add proper-lockfile for spawn coordination

---
 package.json | 1 +
 1 file changed, 1 insertion(+)
diff --git a/package.json b/package.json
index a8fec4c20..d7804c818 100644
--- a/package.json
+++ b/package.json
@@ -103,6 +103,7 @@
     "express": "^4.18.2",
     "glob": "^11.0.3",
     "handlebars": "^4.7.8",
+    "proper-lockfile": "^4.1.2",
     "react": "^18.3.1",
     "react-dom": "^18.3.1",
     "yaml": "^2.8.2",

From f6b4d8152fb76d31d51c6c0a2de2349ce8730806 Mon Sep 17 00:00:00 2001
From: Rod Boev <rod.boev@gmail.com>
Date: Fri, 13 Feb 2026 02:32:12 -0500
Subject: [PATCH 2/9] feat: add filesystem mutex for worker spawn coordination

Uses proper-lockfile (per-port lock file) so only one process at a time
can attempt to spawn a worker daemon. Lock-losers get null and fall back
to waiting for port health instead.
---
 .../infrastructure/singleton-manager.ts       | 61 +++++++++++++++++++
 .../infrastructure/singleton-manager.test.ts  | 55 +++++++++++++++++
 2 files changed, 116 insertions(+)
 create mode 100644 src/services/infrastructure/singleton-manager.ts
 create mode 100644 tests/infrastructure/singleton-manager.test.ts

diff --git a/src/services/infrastructure/singleton-manager.ts b/src/services/infrastructure/singleton-manager.ts
new file mode 100644
index 000000000..1df0e2880
--- /dev/null
+++ b/src/services/infrastructure/singleton-manager.ts
@@ -0,0 +1,61 @@
+/**
+ * Filesystem-based mutex for worker spawn coordination.
+ *
+ * Prevents TOCTOU race: only one process at a time can attempt to spawn
+ * a worker daemon. Concurrent hook invocations that lose the lock race
+ * fall back to waiting for port health.
+ *
+ * Uses proper-lockfile (same library npm uses for package-lock coordination).
+ * Lock location: ~/.claude-mem/worker-spawn-<port>.lock
+ */
+
+import path from 'path';
+import { homedir } from 'os';
+import { mkdirSync, writeFileSync, existsSync } from 'fs';
+import lockfile from 'proper-lockfile';
+import { logger } from '../../utils/logger.js';
+
+const DATA_DIR = path.join(homedir(), '.claude-mem');
+
+/**
+ * Acquire the spawn lock and execute the callback.
+ * Returns null if the lock is already held (another process is spawning).
+ * Lock path is per-port to support multiple worker instances.
+ */
+export async function acquireSpawnLock<T>(
+  fn: () => Promise<T>,
+  port: number = 37777
+): Promise<T | null> {
+  const lockPath = path.join(DATA_DIR, `worker-spawn-${port}.lock`);
+
+  // Ensure lock file exists (proper-lockfile needs the file to exist)
+  mkdirSync(DATA_DIR, { recursive: true });
+  if (!existsSync(lockPath)) {
+    writeFileSync(lockPath, '');
+  }
+
+  let release: (() => Promise<void>) | null = null;
+  try {
+    release = await lockfile.lock(lockPath, {
+      realpath: false,
+      retries: 0,       // Don't wait — if locked, fall back immediately
+      stale: 30_000     // Auto-release after 30s (handles crashed processes)
+    });
+  } catch (err: any) {
+    if (err.code === 'ELOCKED') {
+      logger.info('SYSTEM', 'Spawn lock held by another process — waiting for port health instead');
+      return null;
+    }
+    // Non-lock errors (filesystem, permissions) — log and proceed without lock
+    logger.warn('SYSTEM', 'Failed to acquire spawn lock, proceeding without', { error: err.message });
+    return fn();
+  }
+
+  try {
+    return await fn();
+  } finally {
+    if (release) {
+      try { await release(); } catch {}
+    }
+  }
+}
diff --git a/tests/infrastructure/singleton-manager.test.ts b/tests/infrastructure/singleton-manager.test.ts
new file mode 100644
index 000000000..c8ab650cd
--- /dev/null
+++ b/tests/infrastructure/singleton-manager.test.ts
@@ -0,0 +1,55 @@
+import { describe, it, expect, afterEach } from 'bun:test';
+import { acquireSpawnLock } from '../../src/services/infrastructure/singleton-manager.js';
+import path from 'path';
+import os from 'os';
+import fs from 'fs';
+
+// Default port lock path — tests use default port 37777
+const LOCK_PATH = path.join(os.homedir(), '.claude-mem', 'worker-spawn-37777.lock');
+
+afterEach(() => {
+  // Clean up lock file if test fails
+  try { fs.unlinkSync(LOCK_PATH); } catch {}
+});
+
+describe('singleton-manager', () => {
+  it('acquires lock and releases it via callback', async () => {
+    const result = await acquireSpawnLock(async () => {
+      // Lock should exist while we're inside
+      expect(fs.existsSync(LOCK_PATH)).toBe(true);
+      return 'spawned';
+    });
+    expect(result).toBe('spawned');
+  });
+
+  it('returns null when lock is already held', async () => {
+    // Simulate held lock by acquiring it first
+    const lockfile = await import('proper-lockfile');
+    const release = await lockfile.default.lock(LOCK_PATH, {
+      realpath: false,
+      retries: 0
+    });
+
+    try {
+      const result = await acquireSpawnLock(async () => {
+        return 'should-not-reach';
+      });
+      // Should return null (lock not acquired)
+      expect(result).toBeNull();
+    } finally {
+      await release();
+    }
+  });
+
+  it('releases lock even if callback throws', async () => {
+    try {
+      await acquireSpawnLock(async () => {
+        throw new Error('boom');
+      });
+    } catch {}
+
+    // Lock should be released — second acquire should succeed
+    const result = await acquireSpawnLock(async () => 'ok');
+    expect(result).toBe('ok');
+  });
+});

From f6f8605ec165475c8d4d2fe4f08bd1059f06203c Mon Sep 17 00:00:00 2001
From: Rod Boev <rod.boev@gmail.com>
Date: Fri, 13 Feb 2026 02:34:58 -0500
Subject: [PATCH 3/9] feat: wrap worker spawn in filesystem mutex to prevent
 TOCTOU race

Only the lock holder can spawn a worker. Lock-losers get null and fall
back to waiting for port health. Double-check pattern inside the lock
re-verifies health before spawning.
---
 src/services/worker-service.ts | 54 +++++++++++++++++++++++-----------
 1 file changed, 37 insertions(+), 17 deletions(-)

diff --git a/src/services/worker-service.ts b/src/services/worker-service.ts
index f2498d3a0..514c55e1a 100644
--- a/src/services/worker-service.ts
+++ b/src/services/worker-service.ts
@@ -18,6 +18,7 @@ import { HOOK_TIMEOUTS } from '../shared/hook-constants.js';
 import { SettingsDefaultsManager } from '../shared/SettingsDefaultsManager.js';
 import { getAuthMethodDescription } from '../shared/EnvManager.js';
 import { logger } from '../utils/logger.js';
+import { acquireSpawnLock } from './infrastructure/singleton-manager.js';
 
 // Windows: avoid repeated spawn popups when startup fails (issue #921)
 const WINDOWS_SPAWN_COOLDOWN_MS = 2 * 60 * 1000;
@@ -858,28 +859,47 @@ async function ensureWorkerStarted(port: number): Promise<boolean> {
     return false;
   }
 
-  // Spawn new worker daemon
-  logger.info('SYSTEM', 'Starting worker daemon');
-  markWorkerSpawnAttempted();
-  const pid = spawnDaemon(__filename, port);
-  if (pid === undefined) {
-    logger.error('SYSTEM', 'Failed to spawn worker daemon');
-    return false;
-  }
+  // Acquire spawn lock — only one process at a time can attempt to spawn
+  const spawnResult = await acquireSpawnLock(async () => {
+    // Re-check health inside lock (another process may have spawned while we waited)
+    if (await waitForHealth(port, 500)) {
+      logger.info('SYSTEM', 'Worker became healthy while acquiring lock');
+      return true;
+    }
+
+    // Spawn new worker daemon
+    logger.info('SYSTEM', 'Starting worker daemon');
+    markWorkerSpawnAttempted();
+    const pid = spawnDaemon(__filename, port);
+    if (pid === undefined) {
+      logger.error('SYSTEM', 'Failed to spawn worker daemon');
+      return false;
+    }
 
-  // PID file is written by the worker itself after listen() succeeds
-  // This is race-free and works correctly on Windows where cmd.exe PID is useless
+    const healthy = await waitForHealth(port, getPlatformTimeout(HOOK_TIMEOUTS.POST_SPAWN_WAIT));
+    if (!healthy) {
+      removePidFile();
+      logger.error('SYSTEM', 'Worker failed to start (health check timeout)');
+      return false;
+    }
+
+    clearWorkerSpawnAttempted();
+    logger.info('SYSTEM', 'Worker started successfully');
+    return true;
+  }, port);
 
-  const healthy = await waitForHealth(port, getPlatformTimeout(HOOK_TIMEOUTS.POST_SPAWN_WAIT));
-  if (!healthy) {
-    removePidFile();
-    logger.error('SYSTEM', 'Worker failed to start (health check timeout)');
+  // If lock was held by another process, wait for port health
+  if (spawnResult === null) {
+    const healthy = await waitForHealth(port, getPlatformTimeout(HOOK_TIMEOUTS.POST_SPAWN_WAIT));
+    if (healthy) {
+      logger.info('SYSTEM', 'Worker started by another process');
+      return true;
+    }
+    logger.error('SYSTEM', 'Worker not healthy after waiting for other spawner');
     return false;
   }
 
-  clearWorkerSpawnAttempted();
-  logger.info('SYSTEM', 'Worker started successfully');
-  return true;
+  return spawnResult;
 }
 
 // ============================================================================

From 79388c9eb15da8357870522a4526ef1b45154ec6 Mon Sep 17 00:00:00 2001
From: Rod Boev <rod.boev@gmail.com>
Date: Fri, 13 Feb 2026 02:35:33 -0500
Subject: [PATCH 4/9] fix: exit immediately on EADDRINUSE instead of spawning
 zombie subprocesses

Workers that lose the port race now exit(0) before entering
initializeBackground(), preventing each race-loser from spawning
its own tree of chroma-mcp subprocesses.
---
 src/services/worker-service.ts                | 11 +++++++++-
 .../infrastructure/worker-eaddrinuse.test.ts  | 21 +++++++++++++++++++
 2 files changed, 31 insertions(+), 1 deletion(-)
 create mode 100644 tests/infrastructure/worker-eaddrinuse.test.ts

diff --git a/src/services/worker-service.ts b/src/services/worker-service.ts
index 514c55e1a..7d334c2ee 100644
--- a/src/services/worker-service.ts
+++ b/src/services/worker-service.ts
@@ -336,7 +336,16 @@ export class WorkerService {
     const host = getWorkerHost();
 
     // Start HTTP server FIRST - make port available immediately
-    await this.server.listen(port, host);
+    // If port is already bound, exit immediately (another worker won the race)
+    try {
+      await this.server.listen(port, host);
+    } catch (error: any) {
+      if (error.code === 'EADDRINUSE') {
+        logger.info('SYSTEM', 'Port already bound by another worker — exiting', { port });
+        process.exit(0);
+      }
+      throw error;
+    }
 
     // Worker writes its own PID - reliable on all platforms
     // This happens after listen() succeeds, ensuring the worker is actually ready
diff --git a/tests/infrastructure/worker-eaddrinuse.test.ts b/tests/infrastructure/worker-eaddrinuse.test.ts
new file mode 100644
index 000000000..aa9960bb5
--- /dev/null
+++ b/tests/infrastructure/worker-eaddrinuse.test.ts
@@ -0,0 +1,21 @@
+import { describe, it, expect } from 'bun:test';
+import net from 'net';
+
+describe('EADDRINUSE detection concept', () => {
+  it('net.createServer throws EADDRINUSE when port is taken', async () => {
+    // Bind a port
+    const server1 = net.createServer();
+    await new Promise<void>((resolve) => server1.listen(0, resolve));
+    const port = (server1.address() as net.AddressInfo).port;
+
+    // Try to bind same port
+    const server2 = net.createServer();
+    const error = await new Promise<any>((resolve) => {
+      server2.on('error', resolve);
+      server2.listen(port);
+    });
+
+    expect(error.code).toBe('EADDRINUSE');
+    server1.close();
+  });
+});

From 0000942d9a10e605cb896da6409fecb84b64a387 Mon Sep 17 00:00:00 2001
From: Rod Boev <rod.boev@gmail.com>
Date: Fri, 13 Feb 2026 02:36:04 -0500
Subject: [PATCH 5/9] =?UTF-8?q?fix:=20remove=20redundant=20start=20command?=
 =?UTF-8?q?s=20from=20hooks=20=E2=80=94=20halves=20spawn=20attempts=20per?=
 =?UTF-8?q?=20event?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The hook command already calls ensureWorkerStarted() internally (line 1009
of worker-service.ts). The separate start command doubled the spawn
attempt for every hook event.
---
 plugin/hooks/hooks.json | 20 --------------------
 1 file changed, 20 deletions(-)

diff --git a/plugin/hooks/hooks.json b/plugin/hooks/hooks.json
index efdcb2fb4..6f8e60287 100644
--- a/plugin/hooks/hooks.json
+++ b/plugin/hooks/hooks.json
@@ -22,11 +22,6 @@
             "command": "node \"${CLAUDE_PLUGIN_ROOT}/scripts/smart-install.js\"",
             "timeout": 300
           },
-          {
-            "type": "command",
-            "command": "node \"${CLAUDE_PLUGIN_ROOT}/scripts/bun-runner.js\" \"${CLAUDE_PLUGIN_ROOT}/scripts/worker-service.cjs\" start",
-            "timeout": 60
-          },
           {
             "type": "command",
             "command": "node \"${CLAUDE_PLUGIN_ROOT}/scripts/bun-runner.js\" \"${CLAUDE_PLUGIN_ROOT}/scripts/worker-service.cjs\" hook claude-code context",
@@ -38,11 +33,6 @@
     "UserPromptSubmit": [
       {
         "hooks": [
-          {
-            "type": "command",
-            "command": "node \"${CLAUDE_PLUGIN_ROOT}/scripts/bun-runner.js\" \"${CLAUDE_PLUGIN_ROOT}/scripts/worker-service.cjs\" start",
-            "timeout": 60
-          },
           {
             "type": "command",
             "command": "node \"${CLAUDE_PLUGIN_ROOT}/scripts/bun-runner.js\" \"${CLAUDE_PLUGIN_ROOT}/scripts/worker-service.cjs\" hook claude-code session-init",
@@ -55,11 +45,6 @@
       {
         "matcher": "*",
         "hooks": [
-          {
-            "type": "command",
-            "command": "node \"${CLAUDE_PLUGIN_ROOT}/scripts/bun-runner.js\" \"${CLAUDE_PLUGIN_ROOT}/scripts/worker-service.cjs\" start",
-            "timeout": 60
-          },
           {
             "type": "command",
             "command": "node \"${CLAUDE_PLUGIN_ROOT}/scripts/bun-runner.js\" \"${CLAUDE_PLUGIN_ROOT}/scripts/worker-service.cjs\" hook claude-code observation",
@@ -71,11 +56,6 @@
     "Stop": [
       {
         "hooks": [
-          {
-            "type": "command",
-            "command": "node \"${CLAUDE_PLUGIN_ROOT}/scripts/bun-runner.js\" \"${CLAUDE_PLUGIN_ROOT}/scripts/worker-service.cjs\" start",
-            "timeout": 60
-          },
           {
             "type": "command",
             "command": "node \"${CLAUDE_PLUGIN_ROOT}/scripts/bun-runner.js\" \"${CLAUDE_PLUGIN_ROOT}/scripts/worker-service.cjs\" hook claude-code summarize",

From 9f0c02499373fb59d204d41da2f8d291061d7b0e Mon Sep 17 00:00:00 2001
From: Rod Boev <rod.boev@gmail.com>
Date: Fri, 13 Feb 2026 02:38:01 -0500
Subject: [PATCH 6/9] fix: add ChromaSync connection mutex, circuit breaker,
 and safe state reset

Prevents concurrent ensureConnection() calls from spawning multiple
chroma-mcp subprocesses. Circuit breaker stops retry storms after 3
consecutive failures. Safe reset closes transport before nulling
reference to prevent orphaned subprocesses. Fixed close() early-return
bug where error handlers could skip subprocess cleanup.
---
 src/services/sync/ChromaSync.ts      | 131 ++++++++++++++++-----------
 tests/sync/chroma-sync-mutex.test.ts |  93 +++++++++++++++++++
 2 files changed, 173 insertions(+), 51 deletions(-)
 create mode 100644 tests/sync/chroma-sync-mutex.test.ts

diff --git a/src/services/sync/ChromaSync.ts b/src/services/sync/ChromaSync.ts
index 95bb10189..b7321692d 100644
--- a/src/services/sync/ChromaSync.ts
+++ b/src/services/sync/ChromaSync.ts
@@ -90,6 +90,15 @@ export class ChromaSync {
   // MCP SDK's StdioClientTransport uses shell:false and no detached flag, so console is inherited.
   private readonly disabled: boolean = false;
 
+  // Connection mutex — coalesces concurrent ensureConnection() calls onto single spawn
+  private connectionPromise: Promise<void> | null = null;
+
+  // Circuit breaker — stops retry storms after repeated failures
+  private consecutiveFailures: number = 0;
+  private circuitOpenUntil: number = 0;
+  private static readonly MAX_FAILURES = 3;
+  private static readonly CIRCUIT_OPEN_MS = 60_000;
+
   constructor(project: string) {
     this.project = project;
     this.collectionName = `cm__${project}`;
@@ -186,15 +195,49 @@ export class ChromaSync {
       return;
     }
 
+    // Circuit breaker: stop retrying after repeated failures
+    if (Date.now() < this.circuitOpenUntil) {
+      throw new Error('Chroma circuit breaker open — connection disabled for 60s after repeated failures');
+    }
+
+    // Connection mutex: coalesce concurrent callers onto single spawn
+    if (this.connectionPromise) {
+      return this.connectionPromise;
+    }
+
+    // Capture reference to detect if a newer call replaced us.
+    // Without this, a concurrent caller's promise could be cleared by
+    // an older caller's finally{} block — a subtle race condition.
+    const p = this.connectionPromise = this._doConnect();
+    try {
+      await p;
+      this.consecutiveFailures = 0; // Reset on success
+    } catch (error) {
+      this.consecutiveFailures++;
+      if (this.consecutiveFailures >= ChromaSync.MAX_FAILURES) {
+        this.circuitOpenUntil = Date.now() + ChromaSync.CIRCUIT_OPEN_MS;
+        logger.warn('CHROMA_SYNC', 'Circuit breaker tripped — disabling Chroma for 60s', {
+          failures: this.consecutiveFailures
+        });
+      }
+      throw error;
+    } finally {
+      // Only clear if still the same promise (newer call may have replaced it)
+      if (this.connectionPromise === p) {
+        this.connectionPromise = null;
+      }
+    }
+  }
+
+  /**
+   * Internal connection logic — called only by ensureConnection() mutex
+   */
+  private async _doConnect(): Promise<void> {
     logger.info('CHROMA_SYNC', 'Connecting to Chroma MCP server...', { project: this.project });
 
     try {
-      // Use Python 3.13 by default to avoid onnxruntime compatibility issues with Python 3.14+
-      // See: https://github.com/thedotmack/claude-mem/issues/170 (Python 3.14 incompatibility)
       const settings = SettingsDefaultsManager.loadFromFile(USER_SETTINGS_PATH);
       const pythonVersion = settings.CLAUDE_MEM_PYTHON_VERSION;
-
-      // Get combined SSL certificate bundle for Zscaler/corporate proxy environments
       const combinedCertPath = this.getCombinedCertPath();
 
       const transportOptions: any = {
@@ -208,7 +251,6 @@ export class ChromaSync {
         stderr: 'ignore'
       };
 
-      // Add SSL certificate environment variables for corporate proxy/Zscaler environments
       if (combinedCertPath) {
         transportOptions.env = {
           ...process.env,
@@ -221,13 +263,8 @@ export class ChromaSync {
         });
       }
 
-      // Note: windowsHide is not needed here because the worker daemon starts with
-      // -WindowStyle Hidden, so child processes inherit the hidden console.
-      // The MCP SDK ignores custom windowsHide anyway (overridden internally).
-
       this.transport = new StdioClientTransport(transportOptions);
 
-      // Empty capabilities object: this client only calls Chroma tools, doesn't expose any
       this.client = new Client({
         name: 'claude-mem-chroma-sync',
         version: packageVersion
@@ -240,6 +277,13 @@ export class ChromaSync {
 
       logger.info('CHROMA_SYNC', 'Connected to Chroma MCP server', { project: this.project });
     } catch (error) {
+      // Safe cleanup: close transport before nulling reference
+      if (this.transport) {
+        try { await this.transport.close(); } catch {}
+      }
+      this.transport = null;
+      this.client = null;
+      this.connected = false;
       logger.error('CHROMA_SYNC', 'Failed to connect to Chroma MCP server', { project: this.project }, error as Error);
       throw new Error(`Chroma connection failed: ${error instanceof Error ? error.message : String(error)}`);
     }
@@ -278,19 +322,7 @@ export class ChromaSync {
         errorMessage.includes('MCP error -32000');
 
       if (isConnectionError) {
-        // FIX: Close transport to kill subprocess before resetting state
-        // Without this, old chroma-mcp processes leak as zombies
-        if (this.transport) {
-          try {
-            await this.transport.close();
-          } catch (closeErr) {
-            logger.debug('CHROMA_SYNC', 'Transport close error (expected if already dead)', {}, closeErr as Error);
-          }
-        }
-        // Reset connection state so next call attempts reconnect
-        this.connected = false;
-        this.client = null;
-        this.transport = null;
+        await this.safeResetConnection();
         logger.error('CHROMA_SYNC', 'Connection lost during collection check',
           { collection: this.collectionName }, error as Error);
         throw new Error(`Chroma connection lost: ${errorMessage}`);
@@ -948,18 +980,7 @@ export class ChromaSync {
         errorMessage.includes('MCP error -32000');
 
       if (isConnectionError) {
-        // FIX: Close transport to kill subprocess before resetting state
-        if (this.transport) {
-          try {
-            await this.transport.close();
-          } catch (closeErr) {
-            logger.debug('CHROMA_SYNC', 'Transport close error (expected if already dead)', {}, closeErr as Error);
-          }
-        }
-        // Reset connection state so next call attempts reconnect
-        this.connected = false;
-        this.client = null;
-        this.transport = null;
+        await this.safeResetConnection();
         logger.error('CHROMA_SYNC', 'Connection lost during query',
           { project: this.project, query }, error as Error);
         throw new Error(`Chroma query failed - connection lost: ${errorMessage}`);
@@ -1019,26 +1040,34 @@ export class ChromaSync {
   /**
    * Close the Chroma client connection and cleanup subprocess
    */
-  async close(): Promise<void> {
-    if (!this.connected && !this.client && !this.transport) {
-      return;
-    }
-
-    // Close client first
-    if (this.client) {
-      await this.client.close();
-    }
-
-    // Explicitly close transport to kill subprocess
-    if (this.transport) {
-      await this.transport.close();
-    }
+  /**
+   * Safely reset connection state — closes transport BEFORE nulling reference.
+   * Prevents orphaned chroma-mcp subprocesses.
+   */
+  private async safeResetConnection(): Promise<void> {
+    const t = this.transport;
+    const c = this.client;
+    this.connected = false;
+    this.client = null;
+    this.transport = null;
+    if (c) { try { await c.close(); } catch {} }
+    if (t) { try { await t.close(); } catch {} }
+  }
 
-    logger.info('CHROMA_SYNC', 'Chroma client and subprocess closed', { project: this.project });
+  async close(): Promise<void> {
+    // Always attempt cleanup — don't skip based on state flags alone.
+    // Error handlers may have set connected=false while subprocess still runs.
+    const t = this.transport;
+    const c = this.client;
 
-    // Always reset state
     this.connected = false;
     this.client = null;
     this.transport = null;
+    this.connectionPromise = null;
+
+    if (c) { try { await c.close(); } catch {} }
+    if (t) { try { await t.close(); } catch {} }
+
+    logger.info('CHROMA_SYNC', 'Chroma client and subprocess closed', { project: this.project });
   }
 }
diff --git a/tests/sync/chroma-sync-mutex.test.ts b/tests/sync/chroma-sync-mutex.test.ts
new file mode 100644
index 000000000..6acfbba51
--- /dev/null
+++ b/tests/sync/chroma-sync-mutex.test.ts
@@ -0,0 +1,93 @@
+import { describe, it, expect } from 'bun:test';
+
+describe('ChromaSync connection mutex', () => {
+  it('connectionPromise field coalesces concurrent calls', async () => {
+    // Simulate the mutex pattern
+    let connectCount = 0;
+    let connectionPromise: Promise<void> | null = null;
+
+    async function doConnect(): Promise<void> {
+      connectCount++;
+      await new Promise(r => setTimeout(r, 50));
+    }
+
+    async function ensureConnection(): Promise<void> {
+      if (connectionPromise) return connectionPromise;
+      const p = connectionPromise = doConnect();
+      try { await p; }
+      finally {
+        if (connectionPromise === p) connectionPromise = null;
+      }
+    }
+
+    // Fire 10 concurrent calls
+    await Promise.all(Array(10).fill(null).map(() => ensureConnection()));
+
+    // Only 1 actual connection should have been made
+    expect(connectCount).toBe(1);
+  });
+
+  it('promise memoization race: newer caller not cleared by older finally', async () => {
+    let connectCount = 0;
+    let connectionPromise: Promise<void> | null = null;
+
+    async function doConnect(delay: number): Promise<void> {
+      connectCount++;
+      await new Promise(r => setTimeout(r, delay));
+    }
+
+    async function ensureConnection(delay: number): Promise<void> {
+      if (connectionPromise) return connectionPromise;
+      const p = connectionPromise = doConnect(delay);
+      try { await p; }
+      finally {
+        // Only clear if still the same promise
+        if (connectionPromise === p) connectionPromise = null;
+      }
+    }
+
+    // First call starts a slow connection
+    const p1 = ensureConnection(100);
+    // Wait for first to complete, then start a second
+    await p1;
+    const p2 = ensureConnection(50);
+    await p2;
+
+    // Both should complete independently
+    expect(connectCount).toBe(2);
+  });
+
+  it('circuit breaker stops retries after MAX_FAILURES', async () => {
+    let attempts = 0;
+    let consecutiveFailures = 0;
+    let circuitOpenUntil = 0;
+    const MAX_FAILURES = 3;
+    const CIRCUIT_OPEN_MS = 60_000;
+
+    function isCircuitOpen(): boolean {
+      return Date.now() < circuitOpenUntil;
+    }
+
+    async function tryConnect(): Promise<boolean> {
+      if (isCircuitOpen()) return false;
+      attempts++;
+      consecutiveFailures++;
+      if (consecutiveFailures >= MAX_FAILURES) {
+        circuitOpenUntil = Date.now() + CIRCUIT_OPEN_MS;
+      }
+      return false; // simulate failure
+    }
+
+    // 3 failures should trip the breaker
+    await tryConnect();
+    await tryConnect();
+    await tryConnect();
+    expect(attempts).toBe(3);
+    expect(isCircuitOpen()).toBe(true);
+
+    // 4th attempt should be blocked by circuit breaker
+    const blocked = await tryConnect();
+    expect(blocked).toBe(false);
+    expect(attempts).toBe(3); // no new attempt
+  });
+});

From 2f092cbbc4f431da04803dafd25f833d7c853096 Mon Sep 17 00:00:00 2001
From: Rod Boev <rod.boev@gmail.com>
Date: Fri, 13 Feb 2026 02:38:59 -0500
Subject: [PATCH 7/9] fix: lower orphan age to 5min and add count-based chroma
 reaper

Age-based cleanup (was 30min) completely missed spawn storms where all
processes are <5min old. Count-based reaper kills excess chroma-mcp
regardless of age, keeping only the 2 newest.
---
 src/services/infrastructure/ProcessManager.ts | 50 ++++++++++++++++++-
 1 file changed, 49 insertions(+), 1 deletion(-)

diff --git a/src/services/infrastructure/ProcessManager.ts b/src/services/infrastructure/ProcessManager.ts
index 7013892ec..40b8a05f2 100644
--- a/src/services/infrastructure/ProcessManager.ts
+++ b/src/services/infrastructure/ProcessManager.ts
@@ -31,7 +31,7 @@ const ORPHAN_PROCESS_PATTERNS = [
 ];
 
 // Only kill processes older than this to avoid killing the current session
-const ORPHAN_MAX_AGE_MINUTES = 30;
+const ORPHAN_MAX_AGE_MINUTES = 5;
 
 export interface PidInfo {
   pid: number;
@@ -337,6 +337,54 @@ export async function cleanupOrphanedProcesses(): Promise<void> {
   }
 
   logger.info('SYSTEM', 'Orphaned processes cleaned up', { count: pidsToKill.length });
+
+  // Also check for excess chroma-mcp by count (catches recent spawn storms)
+  await cleanupExcessChromaProcesses();
+}
+
+/**
+ * Kill excess chroma-mcp processes by count, regardless of age.
+ * Keeps the newest MAX_CHROMA_PROCESSES alive, kills the rest.
+ * This catches spawn storms where all processes are < ORPHAN_MAX_AGE_MINUTES old.
+ */
+const MAX_CHROMA_PROCESSES = 2;
+
+export async function cleanupExcessChromaProcesses(): Promise<void> {
+  if (process.platform === 'win32') return;
+
+  try {
+    const { stdout } = await execAsync(
+      'ps -eo pid,etimes,command | grep chroma-mcp | grep -v grep | sort -k2 -n || true'
+    );
+
+    if (!stdout.trim()) return;
+
+    const lines = stdout.trim().split('\n');
+    if (lines.length <= MAX_CHROMA_PROCESSES) return;
+
+    // etimes = elapsed seconds, sort -k2 -n ascending = newest first (smallest etimes)
+    // Keep the newest (start of list), kill the oldest (end of list)
+    const toKill = lines.slice(MAX_CHROMA_PROCESSES);
+
+    for (const line of toKill) {
+      const match = line.trim().match(/^(\d+)/);
+      if (!match) continue;
+      const pid = parseInt(match[1], 10);
+      if (pid <= 0 || pid === process.pid) continue;
+      try {
+        process.kill(pid, 'SIGKILL');
+        logger.info('SYSTEM', 'Killed excess chroma-mcp process', { pid });
+      } catch {}
+    }
+
+    logger.info('SYSTEM', 'Excess chroma cleanup complete', {
+      found: lines.length,
+      killed: toKill.length,
+      kept: MAX_CHROMA_PROCESSES
+    });
+  } catch (error) {
+    logger.debug('SYSTEM', 'Excess chroma cleanup failed (non-critical)', {}, error as Error);
+  }
 }
 
 /**

From d6c8d6df835d31badf8890f157e2b88029fa2e28 Mon Sep 17 00:00:00 2001
From: Rod Boev <rod.boev@gmail.com>
Date: Fri, 13 Feb 2026 02:41:56 -0500
Subject: [PATCH 8/9] chore: add @types/proper-lockfile for type safety

---
 package.json | 1 +
 1 file changed, 1 insertion(+)

diff --git a/package.json b/package.json
index d7804c818..3371de540 100644
--- a/package.json
+++ b/package.json
@@ -114,6 +114,7 @@
     "@types/dompurify": "^3.0.5",
     "@types/express": "^4.17.21",
     "@types/node": "^20.0.0",
+    "@types/proper-lockfile": "^4.1.4",
     "@types/react": "^18.3.5",
     "@types/react-dom": "^18.3.0",
     "esbuild": "^0.27.2",

From 0b77f8f89bb8f873501368730e93dacbbbefa7f4 Mon Sep 17 00:00:00 2001
From: Rod Boev <rod.boev@gmail.com>
Date: Fri, 13 Feb 2026 02:46:01 -0500
Subject: [PATCH 9/9] refactor: address Gemini peer review findings

- DRY: close() now delegates to safeResetConnection() instead of
  duplicating the capture-null-close pattern
- Reduce stale lock timeout from 30s to 10s (spawn should complete
  well within 10s; shorter timeout = faster recovery from crashes)
- Add comment clarifying Windows gap in count-based cleanup
---
 src/services/infrastructure/ProcessManager.ts    |  6 +++++-
 src/services/infrastructure/singleton-manager.ts |  2 +-
 src/services/sync/ChromaSync.ts                  | 13 +------------
 3 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/src/services/infrastructure/ProcessManager.ts b/src/services/infrastructure/ProcessManager.ts
index 40b8a05f2..c3b1887f2 100644
--- a/src/services/infrastructure/ProcessManager.ts
+++ b/src/services/infrastructure/ProcessManager.ts
@@ -350,7 +350,11 @@ export async function cleanupOrphanedProcesses(): Promise<void> {
 const MAX_CHROMA_PROCESSES = 2;
 
 export async function cleanupExcessChromaProcesses(): Promise<void> {
-  if (process.platform === 'win32') return;
+  if (process.platform === 'win32') {
+    // Windows cleanup relies on age-based cleanupOrphanedProcesses() above
+    // which uses PowerShell Get-CimInstance. Count-based cleanup not yet implemented.
+    return;
+  }
 
   try {
     const { stdout } = await execAsync(
diff --git a/src/services/infrastructure/singleton-manager.ts b/src/services/infrastructure/singleton-manager.ts
index 1df0e2880..3d5da1eda 100644
--- a/src/services/infrastructure/singleton-manager.ts
+++ b/src/services/infrastructure/singleton-manager.ts
@@ -39,7 +39,7 @@ export async function acquireSpawnLock<T>(
     release = await lockfile.lock(lockPath, {
       realpath: false,
       retries: 0,       // Don't wait — if locked, fall back immediately
-      stale: 30_000     // Auto-release after 30s (handles crashed processes)
+      stale: 10_000     // Auto-release after 10s (handles crashed processes)
     });
   } catch (err: any) {
     if (err.code === 'ELOCKED') {
diff --git a/src/services/sync/ChromaSync.ts b/src/services/sync/ChromaSync.ts
index b7321692d..4d0000285 100644
--- a/src/services/sync/ChromaSync.ts
+++ b/src/services/sync/ChromaSync.ts
@@ -1055,19 +1055,8 @@ export class ChromaSync {
   }
 
   async close(): Promise<void> {
-    // Always attempt cleanup — don't skip based on state flags alone.
-    // Error handlers may have set connected=false while subprocess still runs.
-    const t = this.transport;
-    const c = this.client;
-
-    this.connected = false;
-    this.client = null;
-    this.transport = null;
+    await this.safeResetConnection();
     this.connectionPromise = null;
-
-    if (c) { try { await c.close(); } catch {} }
-    if (t) { try { await t.close(); } catch {} }
-
     logger.info('CHROMA_SYNC', 'Chroma client and subprocess closed', { project: this.project });
   }
 }