Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions src/services/infrastructure/HealthMonitor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,22 @@ export async function getRunningWorkerVersion(port: number): Promise<string | nu
}
}

/**
 * Fetch the PID reported by a running worker's /api/health endpoint.
 *
 * Used by ensureWorkerStarted to cross-check the PID file against the worker
 * that is actually answering on the port.
 *
 * @param port - Local TCP port the worker's HTTP server listens on.
 * @returns The reported PID when the endpoint answers with an OK status and a
 *   positive integer `pid`; `null` on any failure (network error, non-OK
 *   status, unparsable body, or a missing/malformed `pid`).
 */
export async function getHealthPid(port: number): Promise<number | null> {
  try {
    const response = await fetch(`http://127.0.0.1:${port}/api/health`);
    if (!response.ok) return null;

    // The body is untrusted JSON: keep it as `unknown` and narrow at runtime
    // rather than asserting a shape. Otherwise a string/float/negative `pid`
    // would be returned as a "number" and strictly compared against the PID
    // file, making a healthy worker look like a PID mismatch.
    const data: unknown = await response.json();
    if (typeof data !== 'object' || data === null) return null;

    const pid = (data as { pid?: unknown }).pid;
    return typeof pid === 'number' && Number.isInteger(pid) && pid > 0 ? pid : null;
  } catch {
    // Best-effort probe: an unreachable or misbehaving worker means "no PID".
    return null;
  }
}

export interface VersionCheckResult {
matches: boolean;
pluginVersion: string;
Expand Down
26 changes: 25 additions & 1 deletion src/services/worker-service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,8 @@ import {
waitForReadiness,
waitForPortFree,
httpShutdown,
checkVersionMatch
checkVersionMatch,
getHealthPid
} from './infrastructure/HealthMonitor.js';
import { performGracefulShutdown } from './infrastructure/GracefulShutdown.js';

Expand Down Expand Up @@ -903,6 +904,29 @@ async function ensureWorkerStarted(port: number): Promise<boolean> {
// Clean stale PID file first (cheap: 1 fs read + 1 signal-0 check)
cleanStalePidFile();

// Cross-check: if PID file survived cleanup (process appears alive),
// verify the health endpoint's PID matches. Catches PID reuse by unrelated
// processes and zombie PID files after OOM/sleep/wake (#1231).
const pidInfo = readPidFile();
if (pidInfo) {
const healthPid = await getHealthPid(port);
if (healthPid !== null && healthPid !== pidInfo.pid) {
logger.info('SYSTEM', 'PID file is stale: health endpoint reports different PID', {
pidFilePid: pidInfo.pid,
healthPid,
port
});
removePidFile();
} else if (healthPid === null && !await waitForHealth(port, 1000)) {
// PID file says alive but health endpoint unreachable — stale
logger.info('SYSTEM', 'PID file is stale: process alive but not responding to health checks', {
pid: pidInfo.pid,
port
});
removePidFile();
}
}

// Check if worker is already running and healthy
if (await waitForHealth(port, 1000)) {
const versionCheck = await checkVersionMatch(port);
Expand Down
44 changes: 43 additions & 1 deletion tests/infrastructure/health-monitor.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ import {
waitForHealth,
waitForPortFree,
getInstalledPluginVersion,
checkVersionMatch
checkVersionMatch,
getHealthPid
} from '../../src/services/infrastructure/index.js';

describe('HealthMonitor', () => {
Expand Down Expand Up @@ -183,6 +184,47 @@ describe('HealthMonitor', () => {
});
});

describe('getHealthPid (#1231)', () => {
  // Swap global fetch for a mock that resolves with the given fake response.
  const stubFetchResolving = (fake: object): void => {
    global.fetch = mock(() => Promise.resolve(fake as Response));
  };

  // Build a fake 200 response whose JSON body is `payload`.
  const okJson = (payload: object): object => ({
    ok: true,
    json: () => Promise.resolve(payload)
  });

  it('should return PID from health endpoint', async () => {
    stubFetchResolving(okJson({ pid: 12345, status: 'ok' }));

    const reported = await getHealthPid(37777);

    // The PID is passed through and the loopback health URL is the one probed.
    expect(reported).toBe(12345);
    expect(global.fetch).toHaveBeenCalledWith('http://127.0.0.1:37777/api/health');
  });

  it('should return null when worker is not responding', async () => {
    // A rejected fetch models a refused connection.
    global.fetch = mock(() => Promise.reject(new Error('ECONNREFUSED')));

    const reported = await getHealthPid(39999);

    expect(reported).toBeNull();
  });

  it('should return null when health returns non-ok', async () => {
    stubFetchResolving({ ok: false, status: 503 });

    const reported = await getHealthPid(37777);

    expect(reported).toBeNull();
  });

  it('should return null when response has no pid field', async () => {
    stubFetchResolving(okJson({ status: 'ok' }));

    const reported = await getHealthPid(37777);

    expect(reported).toBeNull();
  });
});

describe('waitForPortFree', () => {
it('should return true immediately when port is already free', async () => {
global.fetch = mock(() => Promise.reject(new Error('ECONNREFUSED')));
Expand Down
55 changes: 55 additions & 0 deletions tests/infrastructure/process-manager.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import {
spawnDaemon,
resolveWorkerRuntimePath,
runOneTimeChromaMigration,
getHealthPid,
type PidInfo
} from '../../src/services/infrastructure/index.js';

Expand Down Expand Up @@ -518,4 +519,58 @@ describe('ProcessManager', () => {
expect(existsSync(path.join(testDataDir, '.chroma-cleaned-v10.3'))).toBe(true);
});
});

describe('stale PID detection integration (#1231)', () => {
  // Keep the real fetch so each case can replace it and have it restored afterwards.
  const savedFetch = global.fetch;

  afterEach(() => {
    global.fetch = savedFetch;
  });

  // Write a PID file that records the current test process on port 37777.
  const seedPidFile = (): void => {
    writePidFile({ pid: process.pid, port: 37777, startedAt: new Date().toISOString() });
  };

  // Make the health endpoint answer OK and report `pid`.
  const stubHealthReporting = (pid: number): void => {
    global.fetch = (() => Promise.resolve({
      ok: true,
      json: () => Promise.resolve({ pid, status: 'ok' })
    } as Response)) as typeof fetch;
  };

  it('should detect stale PID when health reports different PID', async () => {
    // Simulate: PID file records this process's PID, while the health
    // endpoint reports a different one (process.pid + 1).
    seedPidFile();
    stubHealthReporting(process.pid + 1);

    const reportedPid = await getHealthPid(37777);
    const recorded = readPidFile();

    expect(recorded).not.toBeNull();
    expect(reportedPid).not.toBeNull();
    expect(reportedPid).not.toBe(recorded!.pid);
  });

  it('should detect stale PID when health endpoint is unreachable', async () => {
    // Simulate: PID file exists but nothing answers the health request.
    seedPidFile();
    global.fetch = (() => Promise.reject(new Error('ECONNREFUSED'))) as typeof fetch;

    const reportedPid = await getHealthPid(37777);
    const recorded = readPidFile();

    expect(recorded).not.toBeNull();
    expect(reportedPid).toBeNull();
  });

  it('should confirm healthy when PIDs match', async () => {
    // Simulate: PID file and health endpoint agree on this process's PID.
    seedPidFile();
    stubHealthReporting(process.pid);

    const reportedPid = await getHealthPid(37777);
    const recorded = readPidFile();

    expect(recorded).not.toBeNull();
    expect(reportedPid).toBe(recorded!.pid);
  });
});
});
Loading