From ce04c6c52b784a0cfb2d99758391ee871b1e9de0 Mon Sep 17 00:00:00 2001 From: Rod Boev Date: Fri, 13 Feb 2026 02:31:23 -0500 Subject: [PATCH 1/9] chore: add proper-lockfile for spawn coordination --- package.json | 1 + 1 file changed, 1 insertion(+) diff --git a/package.json b/package.json index a8fec4c20..d7804c818 100644 --- a/package.json +++ b/package.json @@ -103,6 +103,7 @@ "express": "^4.18.2", "glob": "^11.0.3", "handlebars": "^4.7.8", + "proper-lockfile": "^4.1.2", "react": "^18.3.1", "react-dom": "^18.3.1", "yaml": "^2.8.2", From f6b4d8152fb76d31d51c6c0a2de2349ce8730806 Mon Sep 17 00:00:00 2001 From: Rod Boev Date: Fri, 13 Feb 2026 02:32:12 -0500 Subject: [PATCH 2/9] feat: add filesystem mutex for worker spawn coordination Uses proper-lockfile (per-port lock file) so only one process at a time can attempt to spawn a worker daemon. Lock-losers get null and fall back to waiting for port health instead. --- .../infrastructure/singleton-manager.ts | 61 +++++++++++++++++++ .../infrastructure/singleton-manager.test.ts | 55 +++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 src/services/infrastructure/singleton-manager.ts create mode 100644 tests/infrastructure/singleton-manager.test.ts diff --git a/src/services/infrastructure/singleton-manager.ts b/src/services/infrastructure/singleton-manager.ts new file mode 100644 index 000000000..1df0e2880 --- /dev/null +++ b/src/services/infrastructure/singleton-manager.ts @@ -0,0 +1,61 @@ +/** + * Filesystem-based mutex for worker spawn coordination. + * + * Prevents TOCTOU race: only one process at a time can attempt to spawn + * a worker daemon. Concurrent hook invocations that lose the lock race + * fall back to waiting for port health. + * + * Uses proper-lockfile (same library npm uses for package-lock coordination). + * Lock location: ~/.claude-mem/worker-spawn-.lock + */ + +import path from 'path'; +import { homedir } from 'os'; +import { mkdirSync, writeFileSync, existsSync } from 'fs'; +import lockfile from 'proper-lockfile'; +import { logger } from '../../utils/logger.js'; + +const DATA_DIR = path.join(homedir(), '.claude-mem'); + +/** + * Acquire the spawn lock and execute the callback. + * Returns null if the lock is already held (another process is spawning). + * Lock path is per-port to support multiple worker instances. + */ +export async function acquireSpawnLock( + fn: () => Promise, + port: number = 37777 +): Promise { + const lockPath = path.join(DATA_DIR, `worker-spawn-${port}.lock`); + + // Ensure lock file exists (proper-lockfile needs the file to exist) + mkdirSync(DATA_DIR, { recursive: true }); + if (!existsSync(lockPath)) { + writeFileSync(lockPath, ''); + } + + let release: (() => Promise) | null = null; + try { + release = await lockfile.lock(lockPath, { + realpath: false, + retries: 0, // Don't wait — if locked, fall back immediately + stale: 30_000 // Auto-release after 30s (handles crashed processes) + }); + } catch (err: any) { + if (err.code === 'ELOCKED') { + logger.info('SYSTEM', 'Spawn lock held by another process — waiting for port health instead'); + return null; + } + // Non-lock errors (filesystem, permissions) — log and proceed without lock + logger.warn('SYSTEM', 'Failed to acquire spawn lock, proceeding without', { error: err.message }); + return fn(); + } + + try { + return await fn(); + } finally { + if (release) { + try { await release(); } catch {} + } + } +} diff --git a/tests/infrastructure/singleton-manager.test.ts b/tests/infrastructure/singleton-manager.test.ts new file mode 100644 index 000000000..c8ab650cd --- /dev/null +++ b/tests/infrastructure/singleton-manager.test.ts @@ -0,0 +1,55 @@ +import { describe, it, expect, afterEach } from 'bun:test'; +import { acquireSpawnLock } from '../../src/services/infrastructure/singleton-manager.js'; +import path from 'path'; +import os from 'os'; +import fs from 'fs'; + +// Default port lock path — tests use default port 37777 +const LOCK_PATH = path.join(os.homedir(), '.claude-mem', 'worker-spawn-37777.lock'); + +afterEach(() => { + // Clean up lock file if test fails + try { fs.unlinkSync(LOCK_PATH); } catch {} +}); + +describe('singleton-manager', () => { + it('acquires lock and releases it via callback', async () => { + const result = await acquireSpawnLock(async () => { + // Lock should exist while we're inside + expect(fs.existsSync(LOCK_PATH)).toBe(true); + return 'spawned'; + }); + expect(result).toBe('spawned'); + }); + + it('returns null when lock is already held', async () => { + // Simulate held lock by acquiring it first + const lockfile = await import('proper-lockfile'); + const release = await lockfile.default.lock(LOCK_PATH, { + realpath: false, + retries: 0 + }); + + try { + const result = await acquireSpawnLock(async () => { + return 'should-not-reach'; + }); + // Should return null (lock not acquired) + expect(result).toBeNull(); + } finally { + await release(); + } + }); + + it('releases lock even if callback throws', async () => { + try { + await acquireSpawnLock(async () => { + throw new Error('boom'); + }); + } catch {} + + // Lock should be released — second acquire should succeed + const result = await acquireSpawnLock(async () => 'ok'); + expect(result).toBe('ok'); + }); +}); From f6f8605ec165475c8d4d2fe4f08bd1059f06203c Mon Sep 17 00:00:00 2001 From: Rod Boev Date: Fri, 13 Feb 2026 02:34:58 -0500 Subject: [PATCH 3/9] feat: wrap worker spawn in filesystem mutex to prevent TOCTOU race Only the lock holder can spawn a worker. Lock-losers get null and fall back to waiting for port health. Double-check pattern inside the lock re-verifies health before spawning. --- src/services/worker-service.ts | 54 +++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 17 deletions(-) diff --git a/src/services/worker-service.ts b/src/services/worker-service.ts index f2498d3a0..514c55e1a 100644 --- a/src/services/worker-service.ts +++ b/src/services/worker-service.ts @@ -18,6 +18,7 @@ import { HOOK_TIMEOUTS } from '../shared/hook-constants.js'; import { SettingsDefaultsManager } from '../shared/SettingsDefaultsManager.js'; import { getAuthMethodDescription } from '../shared/EnvManager.js'; import { logger } from '../utils/logger.js'; +import { acquireSpawnLock } from './infrastructure/singleton-manager.js'; // Windows: avoid repeated spawn popups when startup fails (issue #921) const WINDOWS_SPAWN_COOLDOWN_MS = 2 * 60 * 1000; @@ -858,28 +859,47 @@ async function ensureWorkerStarted(port: number): Promise { return false; } - // Spawn new worker daemon - logger.info('SYSTEM', 'Starting worker daemon'); - markWorkerSpawnAttempted(); - const pid = spawnDaemon(__filename, port); - if (pid === undefined) { - logger.error('SYSTEM', 'Failed to spawn worker daemon'); - return false; - } + // Acquire spawn lock — only one process at a time can attempt to spawn + const spawnResult = await acquireSpawnLock(async () => { + // Re-check health inside lock (another process may have spawned while we waited) + if (await waitForHealth(port, 500)) { + logger.info('SYSTEM', 'Worker became healthy while acquiring lock'); + return true; + } + + // Spawn new worker daemon + logger.info('SYSTEM', 'Starting worker daemon'); + markWorkerSpawnAttempted(); + const pid = spawnDaemon(__filename, port); + if (pid === undefined) { + logger.error('SYSTEM', 'Failed to spawn worker daemon'); + return false; + } - // PID file is written by the worker itself after listen() succeeds - // This is race-free and works correctly on Windows where cmd.exe PID is useless + const healthy = await waitForHealth(port, getPlatformTimeout(HOOK_TIMEOUTS.POST_SPAWN_WAIT)); + if (!healthy) { + removePidFile(); + logger.error('SYSTEM', 'Worker failed to start (health check timeout)'); + return false; + } + + clearWorkerSpawnAttempted(); + logger.info('SYSTEM', 'Worker started successfully'); + return true; + }, port); - const healthy = await waitForHealth(port, getPlatformTimeout(HOOK_TIMEOUTS.POST_SPAWN_WAIT)); - if (!healthy) { - removePidFile(); - logger.error('SYSTEM', 'Worker failed to start (health check timeout)'); + // If lock was held by another process, wait for port health + if (spawnResult === null) { + const healthy = await waitForHealth(port, getPlatformTimeout(HOOK_TIMEOUTS.POST_SPAWN_WAIT)); + if (healthy) { + logger.info('SYSTEM', 'Worker started by another process'); + return true; + } + logger.error('SYSTEM', 'Worker not healthy after waiting for other spawner'); return false; } - clearWorkerSpawnAttempted(); - logger.info('SYSTEM', 'Worker started successfully'); - return true; + return spawnResult; } // ============================================================================ From 79388c9eb15da8357870522a4526ef1b45154ec6 Mon Sep 17 00:00:00 2001 From: Rod Boev Date: Fri, 13 Feb 2026 02:35:33 -0500 Subject: [PATCH 4/9] fix: exit immediately on EADDRINUSE instead of spawning zombie subprocesses Workers that lose the port race now exit(0) before entering initializeBackground(), preventing each race-loser from spawning its own tree of chroma-mcp subprocesses. --- src/services/worker-service.ts | 11 +++++++++- .../infrastructure/worker-eaddrinuse.test.ts | 21 +++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 tests/infrastructure/worker-eaddrinuse.test.ts diff --git a/src/services/worker-service.ts b/src/services/worker-service.ts index 514c55e1a..7d334c2ee 100644 --- a/src/services/worker-service.ts +++ b/src/services/worker-service.ts @@ -336,7 +336,16 @@ export class WorkerService { const host = getWorkerHost(); // Start HTTP server FIRST - make port available immediately - await this.server.listen(port, host); + // If port is already bound, exit immediately (another worker won the race) + try { + await this.server.listen(port, host); + } catch (error: any) { + if (error.code === 'EADDRINUSE') { + logger.info('SYSTEM', 'Port already bound by another worker — exiting', { port }); + process.exit(0); + } + throw error; + } // Worker writes its own PID - reliable on all platforms // This happens after listen() succeeds, ensuring the worker is actually ready diff --git a/tests/infrastructure/worker-eaddrinuse.test.ts b/tests/infrastructure/worker-eaddrinuse.test.ts new file mode 100644 index 000000000..aa9960bb5 --- /dev/null +++ b/tests/infrastructure/worker-eaddrinuse.test.ts @@ -0,0 +1,21 @@ +import { describe, it, expect } from 'bun:test'; +import net from 'net'; + +describe('EADDRINUSE detection concept', () => { + it('net.createServer throws EADDRINUSE when port is taken', async () => { + // Bind a port + const server1 = net.createServer(); + await new Promise((resolve) => server1.listen(0, resolve)); + const port = (server1.address() as net.AddressInfo).port; + + // Try to bind same port + const server2 = net.createServer(); + const error = await new Promise((resolve) => { + server2.on('error', resolve); + server2.listen(port); + }); + + expect(error.code).toBe('EADDRINUSE'); + server1.close(); + }); +}); From 0000942d9a10e605cb896da6409fecb84b64a387 Mon Sep 17 00:00:00 2001 From: Rod Boev Date: Fri, 13 Feb 2026 02:36:04 -0500 Subject: [PATCH 5/9] =?UTF-8?q?fix:=20remove=20redundant=20start=20command?= =?UTF-8?q?s=20from=20hooks=20=E2=80=94=20halves=20spawn=20attempts=20per?= =?UTF-8?q?=20event?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hook command already calls ensureWorkerStarted() internally (line 1009 of worker-service.ts). The separate start command doubled the spawn attempt for every hook event. --- plugin/hooks/hooks.json | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/plugin/hooks/hooks.json b/plugin/hooks/hooks.json index efdcb2fb4..6f8e60287 100644 --- a/plugin/hooks/hooks.json +++ b/plugin/hooks/hooks.json @@ -22,11 +22,6 @@ "command": "node \"${CLAUDE_PLUGIN_ROOT}/scripts/smart-install.js\"", "timeout": 300 }, - { - "type": "command", - "command": "node \"${CLAUDE_PLUGIN_ROOT}/scripts/bun-runner.js\" \"${CLAUDE_PLUGIN_ROOT}/scripts/worker-service.cjs\" start", - "timeout": 60 - }, { "type": "command", "command": "node \"${CLAUDE_PLUGIN_ROOT}/scripts/bun-runner.js\" \"${CLAUDE_PLUGIN_ROOT}/scripts/worker-service.cjs\" hook claude-code context", @@ -38,11 +33,6 @@ "UserPromptSubmit": [ { "hooks": [ - { - "type": "command", - "command": "node \"${CLAUDE_PLUGIN_ROOT}/scripts/bun-runner.js\" \"${CLAUDE_PLUGIN_ROOT}/scripts/worker-service.cjs\" start", - "timeout": 60 - }, { "type": "command", "command": "node \"${CLAUDE_PLUGIN_ROOT}/scripts/bun-runner.js\" \"${CLAUDE_PLUGIN_ROOT}/scripts/worker-service.cjs\" hook claude-code session-init", @@ -55,11 +45,6 @@ { "matcher": "*", "hooks": [ - { - "type": "command", - "command": "node \"${CLAUDE_PLUGIN_ROOT}/scripts/bun-runner.js\" \"${CLAUDE_PLUGIN_ROOT}/scripts/worker-service.cjs\" start", - "timeout": 60 - }, { "type": "command", "command": "node \"${CLAUDE_PLUGIN_ROOT}/scripts/bun-runner.js\" \"${CLAUDE_PLUGIN_ROOT}/scripts/worker-service.cjs\" hook claude-code observation", @@ -71,11 +56,6 @@ "Stop": [ { "hooks": [ - { - "type": "command", - "command": "node \"${CLAUDE_PLUGIN_ROOT}/scripts/bun-runner.js\" \"${CLAUDE_PLUGIN_ROOT}/scripts/worker-service.cjs\" start", - "timeout": 60 - }, { "type": "command", "command": "node \"${CLAUDE_PLUGIN_ROOT}/scripts/bun-runner.js\" \"${CLAUDE_PLUGIN_ROOT}/scripts/worker-service.cjs\" hook claude-code summarize", From 9f0c02499373fb59d204d41da2f8d291061d7b0e Mon Sep 17 00:00:00 2001 From: Rod Boev Date: Fri, 13 Feb 2026 02:38:01 -0500 Subject: [PATCH 6/9] fix: add ChromaSync connection mutex, circuit breaker, and safe state reset Prevents concurrent ensureConnection() calls from spawning multiple chroma-mcp subprocesses. Circuit breaker stops retry storms after 3 consecutive failures. Safe reset closes transport before nulling reference to prevent orphaned subprocesses. Fixed close() early-return bug where error handlers could skip subprocess cleanup. --- src/services/sync/ChromaSync.ts | 131 ++++++++++++++++----------- tests/sync/chroma-sync-mutex.test.ts | 93 +++++++++++++++++++ 2 files changed, 173 insertions(+), 51 deletions(-) create mode 100644 tests/sync/chroma-sync-mutex.test.ts diff --git a/src/services/sync/ChromaSync.ts b/src/services/sync/ChromaSync.ts index 95bb10189..b7321692d 100644 --- a/src/services/sync/ChromaSync.ts +++ b/src/services/sync/ChromaSync.ts @@ -90,6 +90,15 @@ export class ChromaSync { // MCP SDK's StdioClientTransport uses shell:false and no detached flag, so console is inherited. private readonly disabled: boolean = false; + // Connection mutex — coalesces concurrent ensureConnection() calls onto single spawn + private connectionPromise: Promise | null = null; + + // Circuit breaker — stops retry storms after repeated failures + private consecutiveFailures: number = 0; + private circuitOpenUntil: number = 0; + private static readonly MAX_FAILURES = 3; + private static readonly CIRCUIT_OPEN_MS = 60_000; + constructor(project: string) { this.project = project; this.collectionName = `cm__${project}`; @@ -186,15 +195,49 @@ export class ChromaSync { return; } + // Circuit breaker: stop retrying after repeated failures + if (Date.now() < this.circuitOpenUntil) { + throw new Error('Chroma circuit breaker open — connection disabled for 60s after repeated failures'); + } + + // Connection mutex: coalesce concurrent callers onto single spawn + if (this.connectionPromise) { + return this.connectionPromise; + } + + // Capture reference to detect if a newer call replaced us. + // Without this, a concurrent caller's promise could be cleared by + // an older caller's finally{} block — a subtle race condition. + const p = this.connectionPromise = this._doConnect(); + try { + await p; + this.consecutiveFailures = 0; // Reset on success + } catch (error) { + this.consecutiveFailures++; + if (this.consecutiveFailures >= ChromaSync.MAX_FAILURES) { + this.circuitOpenUntil = Date.now() + ChromaSync.CIRCUIT_OPEN_MS; + logger.warn('CHROMA_SYNC', 'Circuit breaker tripped — disabling Chroma for 60s', { + failures: this.consecutiveFailures + }); + } + throw error; + } finally { + // Only clear if still the same promise (newer call may have replaced it) + if (this.connectionPromise === p) { + this.connectionPromise = null; + } + } + } + + /** + * Internal connection logic — called only by ensureConnection() mutex + */ + private async _doConnect(): Promise { logger.info('CHROMA_SYNC', 'Connecting to Chroma MCP server...', { project: this.project }); try { - // Use Python 3.13 by default to avoid onnxruntime compatibility issues with Python 3.14+ - // See: https://github.com/thedotmack/claude-mem/issues/170 (Python 3.14 incompatibility) const settings = SettingsDefaultsManager.loadFromFile(USER_SETTINGS_PATH); const pythonVersion = settings.CLAUDE_MEM_PYTHON_VERSION; - - // Get combined SSL certificate bundle for Zscaler/corporate proxy environments const combinedCertPath = this.getCombinedCertPath(); const transportOptions: any = { @@ -208,7 +251,6 @@ export class ChromaSync { stderr: 'ignore' }; - // Add SSL certificate environment variables for corporate proxy/Zscaler environments if (combinedCertPath) { transportOptions.env = { ...process.env, @@ -221,13 +263,8 @@ export class ChromaSync { }); } - // Note: windowsHide is not needed here because the worker daemon starts with - // -WindowStyle Hidden, so child processes inherit the hidden console. - // The MCP SDK ignores custom windowsHide anyway (overridden internally). - this.transport = new StdioClientTransport(transportOptions); - // Empty capabilities object: this client only calls Chroma tools, doesn't expose any this.client = new Client({ name: 'claude-mem-chroma-sync', version: packageVersion @@ -240,6 +277,13 @@ export class ChromaSync { logger.info('CHROMA_SYNC', 'Connected to Chroma MCP server', { project: this.project }); } catch (error) { + // Safe cleanup: close transport before nulling reference + if (this.transport) { + try { await this.transport.close(); } catch {} + } + this.transport = null; + this.client = null; + this.connected = false; logger.error('CHROMA_SYNC', 'Failed to connect to Chroma MCP server', { project: this.project }, error as Error); throw new Error(`Chroma connection failed: ${error instanceof Error ? error.message : String(error)}`); } @@ -278,19 +322,7 @@ export class ChromaSync { errorMessage.includes('MCP error -32000'); if (isConnectionError) { - // FIX: Close transport to kill subprocess before resetting state - // Without this, old chroma-mcp processes leak as zombies - if (this.transport) { - try { - await this.transport.close(); - } catch (closeErr) { - logger.debug('CHROMA_SYNC', 'Transport close error (expected if already dead)', {}, closeErr as Error); - } - } - // Reset connection state so next call attempts reconnect - this.connected = false; - this.client = null; - this.transport = null; + await this.safeResetConnection(); logger.error('CHROMA_SYNC', 'Connection lost during collection check', { collection: this.collectionName }, error as Error); throw new Error(`Chroma connection lost: ${errorMessage}`); @@ -948,18 +980,7 @@ export class ChromaSync { errorMessage.includes('MCP error -32000'); if (isConnectionError) { - // FIX: Close transport to kill subprocess before resetting state - if (this.transport) { - try { - await this.transport.close(); - } catch (closeErr) { - logger.debug('CHROMA_SYNC', 'Transport close error (expected if already dead)', {}, closeErr as Error); - } - } - // Reset connection state so next call attempts reconnect - this.connected = false; - this.client = null; - this.transport = null; + await this.safeResetConnection(); logger.error('CHROMA_SYNC', 'Connection lost during query', { project: this.project, query }, error as Error); throw new Error(`Chroma query failed - connection lost: ${errorMessage}`); @@ -1019,26 +1040,34 @@ export class ChromaSync { /** * Close the Chroma client connection and cleanup subprocess */ - async close(): Promise { - if (!this.connected && !this.client && !this.transport) { - return; - } - - // Close client first - if (this.client) { - await this.client.close(); - } - - // Explicitly close transport to kill subprocess - if (this.transport) { - await this.transport.close(); - } + /** + * Safely reset connection state — closes transport BEFORE nulling reference. + * Prevents orphaned chroma-mcp subprocesses. + */ + private async safeResetConnection(): Promise { + const t = this.transport; + const c = this.client; + this.connected = false; + this.client = null; + this.transport = null; + if (c) { try { await c.close(); } catch {} } + if (t) { try { await t.close(); } catch {} } + } - logger.info('CHROMA_SYNC', 'Chroma client and subprocess closed', { project: this.project }); + async close(): Promise { + // Always attempt cleanup — don't skip based on state flags alone. + // Error handlers may have set connected=false while subprocess still runs. + const t = this.transport; + const c = this.client; - // Always reset state this.connected = false; this.client = null; this.transport = null; + this.connectionPromise = null; + + if (c) { try { await c.close(); } catch {} } + if (t) { try { await t.close(); } catch {} } + + logger.info('CHROMA_SYNC', 'Chroma client and subprocess closed', { project: this.project }); } } diff --git a/tests/sync/chroma-sync-mutex.test.ts b/tests/sync/chroma-sync-mutex.test.ts new file mode 100644 index 000000000..6acfbba51 --- /dev/null +++ b/tests/sync/chroma-sync-mutex.test.ts @@ -0,0 +1,93 @@ +import { describe, it, expect } from 'bun:test'; + +describe('ChromaSync connection mutex', () => { + it('connectionPromise field coalesces concurrent calls', async () => { + // Simulate the mutex pattern + let connectCount = 0; + let connectionPromise: Promise | null = null; + + async function doConnect(): Promise { + connectCount++; + await new Promise(r => setTimeout(r, 50)); + } + + async function ensureConnection(): Promise { + if (connectionPromise) return connectionPromise; + const p = connectionPromise = doConnect(); + try { await p; } + finally { + if (connectionPromise === p) connectionPromise = null; + } + } + + // Fire 10 concurrent calls + await Promise.all(Array(10).fill(null).map(() => ensureConnection())); + + // Only 1 actual connection should have been made + expect(connectCount).toBe(1); + }); + + it('promise memoization race: newer caller not cleared by older finally', async () => { + let connectCount = 0; + let connectionPromise: Promise | null = null; + + async function doConnect(delay: number): Promise { + connectCount++; + await new Promise(r => setTimeout(r, delay)); + } + + async function ensureConnection(delay: number): Promise { + if (connectionPromise) return connectionPromise; + const p = connectionPromise = doConnect(delay); + try { await p; } + finally { + // Only clear if still the same promise + if (connectionPromise === p) connectionPromise = null; + } + } + + // First call starts a slow connection + const p1 = ensureConnection(100); + // Wait for first to complete, then start a second + await p1; + const p2 = ensureConnection(50); + await p2; + + // Both should complete independently + expect(connectCount).toBe(2); + }); + + it('circuit breaker stops retries after MAX_FAILURES', async () => { + let attempts = 0; + let consecutiveFailures = 0; + let circuitOpenUntil = 0; + const MAX_FAILURES = 3; + const CIRCUIT_OPEN_MS = 60_000; + + function isCircuitOpen(): boolean { + return Date.now() < circuitOpenUntil; + } + + async function tryConnect(): Promise { + if (isCircuitOpen()) return false; + attempts++; + consecutiveFailures++; + if (consecutiveFailures >= MAX_FAILURES) { + circuitOpenUntil = Date.now() + CIRCUIT_OPEN_MS; + } + return false; // simulate failure + } + + // 3 failures should trip the breaker + await tryConnect(); + await tryConnect(); + await tryConnect(); + expect(attempts).toBe(3); + expect(isCircuitOpen()).toBe(true); + + // 4th attempt should be blocked by circuit breaker + const blocked = await tryConnect(); + expect(blocked).toBe(false); + expect(attempts).toBe(3); // no new attempt + }); +}); From 2f092cbbc4f431da04803dafd25f833d7c853096 Mon Sep 17 00:00:00 2001 From: Rod Boev Date: Fri, 13 Feb 2026 02:38:59 -0500 Subject: [PATCH 7/9] fix: lower orphan age to 5min and add count-based chroma reaper Age-based cleanup (was 30min) completely missed spawn storms where all processes are <5min old. Count-based reaper kills excess chroma-mcp regardless of age, keeping only the 2 newest. --- src/services/infrastructure/ProcessManager.ts | 50 ++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/src/services/infrastructure/ProcessManager.ts b/src/services/infrastructure/ProcessManager.ts index 7013892ec..40b8a05f2 100644 --- a/src/services/infrastructure/ProcessManager.ts +++ b/src/services/infrastructure/ProcessManager.ts @@ -31,7 +31,7 @@ const ORPHAN_PROCESS_PATTERNS = [ ]; // Only kill processes older than this to avoid killing the current session -const ORPHAN_MAX_AGE_MINUTES = 30; +const ORPHAN_MAX_AGE_MINUTES = 5; export interface PidInfo { pid: number; @@ -337,6 +337,54 @@ export async function cleanupOrphanedProcesses(): Promise { } logger.info('SYSTEM', 'Orphaned processes cleaned up', { count: pidsToKill.length }); + + // Also check for excess chroma-mcp by count (catches recent spawn storms) + await cleanupExcessChromaProcesses(); +} + +/** + * Kill excess chroma-mcp processes by count, regardless of age. + * Keeps the newest MAX_CHROMA_PROCESSES alive, kills the rest. + * This catches spawn storms where all processes are < ORPHAN_MAX_AGE_MINUTES old. + */ +const MAX_CHROMA_PROCESSES = 2; + +export async function cleanupExcessChromaProcesses(): Promise { + if (process.platform === 'win32') return; + + try { + const { stdout } = await execAsync( + 'ps -eo pid,etimes,command | grep chroma-mcp | grep -v grep | sort -k2 -n || true' + ); + + if (!stdout.trim()) return; + + const lines = stdout.trim().split('\n'); + if (lines.length <= MAX_CHROMA_PROCESSES) return; + + // etimes = elapsed seconds, sort -k2 -n ascending = newest first (smallest etimes) + // Keep the newest (start of list), kill the oldest (end of list) + const toKill = lines.slice(MAX_CHROMA_PROCESSES); + + for (const line of toKill) { + const match = line.trim().match(/^(\d+)/); + if (!match) continue; + const pid = parseInt(match[1], 10); + if (pid <= 0 || pid === process.pid) continue; + try { + process.kill(pid, 'SIGKILL'); + logger.info('SYSTEM', 'Killed excess chroma-mcp process', { pid }); + } catch {} + } + + logger.info('SYSTEM', 'Excess chroma cleanup complete', { + found: lines.length, + killed: toKill.length, + kept: MAX_CHROMA_PROCESSES + }); + } catch (error) { + logger.debug('SYSTEM', 'Excess chroma cleanup failed (non-critical)', {}, error as Error); + } } /** From d6c8d6df835d31badf8890f157e2b88029fa2e28 Mon Sep 17 00:00:00 2001 From: Rod Boev Date: Fri, 13 Feb 2026 02:41:56 -0500 Subject: [PATCH 8/9] chore: add @types/proper-lockfile for type safety --- package.json | 1 + 1 file changed, 1 insertion(+) diff --git a/package.json b/package.json index d7804c818..3371de540 100644 --- a/package.json +++ b/package.json @@ -114,6 +114,7 @@ "@types/dompurify": "^3.0.5", "@types/express": "^4.17.21", "@types/node": "^20.0.0", + "@types/proper-lockfile": "^4.1.4", "@types/react": "^18.3.5", "@types/react-dom": "^18.3.0", "esbuild": "^0.27.2", From 0b77f8f89bb8f873501368730e93dacbbbefa7f4 Mon Sep 17 00:00:00 2001 From: Rod Boev Date: Fri, 13 Feb 2026 02:46:01 -0500 Subject: [PATCH 9/9] refactor: address Gemini peer review findings - DRY: close() now delegates to safeResetConnection() instead of duplicating the capture-null-close pattern - Reduce stale lock timeout from 30s to 10s (spawn should complete well within 10s; shorter timeout = faster recovery from crashes) - Add comment clarifying Windows gap in count-based cleanup --- src/services/infrastructure/ProcessManager.ts | 6 +++++- src/services/infrastructure/singleton-manager.ts | 2 +- src/services/sync/ChromaSync.ts | 13 +------------ 3 files changed, 7 insertions(+), 14 deletions(-) diff --git a/src/services/infrastructure/ProcessManager.ts b/src/services/infrastructure/ProcessManager.ts index 40b8a05f2..c3b1887f2 100644 --- a/src/services/infrastructure/ProcessManager.ts +++ b/src/services/infrastructure/ProcessManager.ts @@ -350,7 +350,11 @@ export async function cleanupOrphanedProcesses(): Promise { const MAX_CHROMA_PROCESSES = 2; export async function cleanupExcessChromaProcesses(): Promise { - if (process.platform === 'win32') return; + if (process.platform === 'win32') { + // Windows cleanup relies on age-based cleanupOrphanedProcesses() above + // which uses PowerShell Get-CimInstance. Count-based cleanup not yet implemented. + return; + } try { const { stdout } = await execAsync( diff --git a/src/services/infrastructure/singleton-manager.ts b/src/services/infrastructure/singleton-manager.ts index 1df0e2880..3d5da1eda 100644 --- a/src/services/infrastructure/singleton-manager.ts +++ b/src/services/infrastructure/singleton-manager.ts @@ -39,7 +39,7 @@ export async function acquireSpawnLock( release = await lockfile.lock(lockPath, { realpath: false, retries: 0, // Don't wait — if locked, fall back immediately - stale: 30_000 // Auto-release after 30s (handles crashed processes) + stale: 10_000 // Auto-release after 10s (handles crashed processes) }); } catch (err: any) { if (err.code === 'ELOCKED') { diff --git a/src/services/sync/ChromaSync.ts b/src/services/sync/ChromaSync.ts index b7321692d..4d0000285 100644 --- a/src/services/sync/ChromaSync.ts +++ b/src/services/sync/ChromaSync.ts @@ -1055,19 +1055,8 @@ export class ChromaSync { } async close(): Promise { - // Always attempt cleanup — don't skip based on state flags alone. - // Error handlers may have set connected=false while subprocess still runs. - const t = this.transport; - const c = this.client; - - this.connected = false; - this.client = null; - this.transport = null; + await this.safeResetConnection(); this.connectionPromise = null; - - if (c) { try { await c.close(); } catch {} } - if (t) { try { await t.close(); } catch {} } - logger.info('CHROMA_SYNC', 'Chroma client and subprocess closed', { project: this.project }); } }