diff --git a/src/services/infrastructure/HealthMonitor.ts b/src/services/infrastructure/HealthMonitor.ts index 3f262774d..4619b99c5 100644 --- a/src/services/infrastructure/HealthMonitor.ts +++ b/src/services/infrastructure/HealthMonitor.ts @@ -150,6 +150,22 @@ export async function getRunningWorkerVersion(port: number): Promise { + try { + const response = await fetch(`http://127.0.0.1:${port}/api/health`); + if (!response.ok) return null; + const data = await response.json() as { pid?: number }; + return data.pid ?? null; + } catch { + return null; + } +} + export interface VersionCheckResult { matches: boolean; pluginVersion: string; diff --git a/src/services/worker-service.ts b/src/services/worker-service.ts index 02dd4f328..0020bbd8a 100644 --- a/src/services/worker-service.ts +++ b/src/services/worker-service.ts @@ -88,7 +88,8 @@ import { waitForReadiness, waitForPortFree, httpShutdown, - checkVersionMatch + checkVersionMatch, + getHealthPid } from './infrastructure/HealthMonitor.js'; import { performGracefulShutdown } from './infrastructure/GracefulShutdown.js'; @@ -903,6 +904,29 @@ async function ensureWorkerStarted(port: number): Promise { // Clean stale PID file first (cheap: 1 fs read + 1 signal-0 check) cleanStalePidFile(); + // Cross-check: if PID file survived cleanup (process appears alive), + // verify the health endpoint's PID matches. Catches PID reuse by unrelated + // processes and zombie PID files after OOM/sleep/wake (#1231). 
+ const pidInfo = readPidFile(); + if (pidInfo) { + const healthPid = await getHealthPid(port); + if (healthPid !== null && healthPid !== pidInfo.pid) { + logger.info('SYSTEM', 'PID file is stale: health endpoint reports different PID', { + pidFilePid: pidInfo.pid, + healthPid, + port + }); + removePidFile(); + } else if (healthPid === null && !await waitForHealth(port, 1000)) { + // PID file says alive but health endpoint unreachable — stale + logger.info('SYSTEM', 'PID file is stale: process alive but not responding to health checks', { + pid: pidInfo.pid, + port + }); + removePidFile(); + } + } + // Check if worker is already running and healthy if (await waitForHealth(port, 1000)) { const versionCheck = await checkVersionMatch(port); diff --git a/tests/infrastructure/health-monitor.test.ts b/tests/infrastructure/health-monitor.test.ts index f58120512..1ba6ed157 100644 --- a/tests/infrastructure/health-monitor.test.ts +++ b/tests/infrastructure/health-monitor.test.ts @@ -4,7 +4,8 @@ import { waitForHealth, waitForPortFree, getInstalledPluginVersion, - checkVersionMatch + checkVersionMatch, + getHealthPid } from '../../src/services/infrastructure/index.js'; describe('HealthMonitor', () => { @@ -183,6 +184,47 @@ describe('HealthMonitor', () => { }); }); + describe('getHealthPid (#1231)', () => { + it('should return PID from health endpoint', async () => { + global.fetch = mock(() => Promise.resolve({ + ok: true, + json: () => Promise.resolve({ pid: 12345, status: 'ok' }) + } as Response)); + + const pid = await getHealthPid(37777); + + expect(pid).toBe(12345); + expect(global.fetch).toHaveBeenCalledWith('http://127.0.0.1:37777/api/health'); + }); + + it('should return null when worker is not responding', async () => { + global.fetch = mock(() => Promise.reject(new Error('ECONNREFUSED'))); + + const pid = await getHealthPid(39999); + + expect(pid).toBeNull(); + }); + + it('should return null when health returns non-ok', async () => { + global.fetch = mock(() => 
Promise.resolve({ ok: false, status: 503 } as Response)); + + const pid = await getHealthPid(37777); + + expect(pid).toBeNull(); + }); + + it('should return null when response has no pid field', async () => { + global.fetch = mock(() => Promise.resolve({ + ok: true, + json: () => Promise.resolve({ status: 'ok' }) + } as Response)); + + const pid = await getHealthPid(37777); + + expect(pid).toBeNull(); + }); + }); + describe('waitForPortFree', () => { it('should return true immediately when port is already free', async () => { global.fetch = mock(() => Promise.reject(new Error('ECONNREFUSED'))); diff --git a/tests/infrastructure/process-manager.test.ts b/tests/infrastructure/process-manager.test.ts index 8733f0b11..27d197bc5 100644 --- a/tests/infrastructure/process-manager.test.ts +++ b/tests/infrastructure/process-manager.test.ts @@ -16,6 +16,7 @@ import { spawnDaemon, resolveWorkerRuntimePath, runOneTimeChromaMigration, + getHealthPid, type PidInfo } from '../../src/services/infrastructure/index.js'; @@ -518,4 +519,58 @@ describe('ProcessManager', () => { expect(existsSync(path.join(testDataDir, '.chroma-cleaned-v10.3'))).toBe(true); }); }); + + describe('stale PID detection integration (#1231)', () => { + const originalFetch = global.fetch; + + afterEach(() => { + global.fetch = originalFetch; + }); + + it('should detect stale PID when health reports different PID', async () => { + // Simulate: PID file holds the current process's PID, but the health endpoint reports a different PID + writePidFile({ pid: process.pid, port: 37777, startedAt: new Date().toISOString() }); + + // Mock health endpoint returning a DIFFERENT PID + global.fetch = (() => Promise.resolve({ + ok: true, + json: () => Promise.resolve({ pid: process.pid + 1, status: 'ok' }) + } as Response)) as typeof fetch; + + const healthPid = await getHealthPid(37777); + const pidInfo = readPidFile(); + + expect(pidInfo).not.toBeNull(); + expect(healthPid).not.toBeNull(); + expect(healthPid).not.toBe(pidInfo!.pid); + }); + + 
it('should detect stale PID when health endpoint is unreachable', async () => { + // Simulate: PID file exists but worker is dead (no health response) + writePidFile({ pid: process.pid, port: 37777, startedAt: new Date().toISOString() }); + + global.fetch = (() => Promise.reject(new Error('ECONNREFUSED'))) as typeof fetch; + + const healthPid = await getHealthPid(37777); + const pidInfo = readPidFile(); + + expect(pidInfo).not.toBeNull(); + expect(healthPid).toBeNull(); + }); + + it('should confirm healthy when PIDs match', async () => { + writePidFile({ pid: process.pid, port: 37777, startedAt: new Date().toISOString() }); + + global.fetch = (() => Promise.resolve({ + ok: true, + json: () => Promise.resolve({ pid: process.pid, status: 'ok' }) + } as Response)) as typeof fetch; + + const healthPid = await getHealthPid(37777); + const pidInfo = readPidFile(); + + expect(pidInfo).not.toBeNull(); + expect(healthPid).toBe(pidInfo!.pid); + }); + }); });