Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion src/services/worker-service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ import { SettingsRoutes } from './worker/http/routes/SettingsRoutes.js';
import { LogsRoutes } from './worker/http/routes/LogsRoutes.js';

// Process management for zombie cleanup (Issue #737)
import { startOrphanReaper, reapOrphanedProcesses } from './worker/ProcessRegistry.js';
import { startOrphanReaper, reapSystemOrphansOnly } from './worker/ProcessRegistry.js';

/**
* Build JSON status output for hook framework communication.
Expand Down Expand Up @@ -309,6 +309,16 @@ export class WorkerService {
this.resolveInitialization();
logger.info('SYSTEM', 'Background initialization complete');

// Defensive cleanup: reap only OS-level orphans (ppid=1) from previous daemon run (Issue #1007)
try {
const startupReaped = await reapSystemOrphansOnly();
if (startupReaped > 0) {
logger.info('SYSTEM', `Reaped ${startupReaped} system orphan(s) from previous run`, { reaped: startupReaped });
}
} catch (err) {
logger.warn('SYSTEM', 'Startup reap of system orphans failed (non-fatal)', {}, err as Error);
}

// Start orphan reaper to clean up zombie processes (Issue #737)
this.stopOrphanReaper = startOrphanReaper(() => {
const activeIds = new Set<number>();
Expand Down
52 changes: 48 additions & 4 deletions src/services/worker/ProcessRegistry.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,16 @@
* Fixes Issue #737: Claude haiku subprocesses don't terminate properly,
* causing zombie process accumulation (user reported 155 processes / 51GB RAM).
*
* Root causes:
* 1. SDK's SpawnedProcess interface hides subprocess PIDs
* 2. deleteSession() doesn't verify subprocess exit before cleanup
* 3. abort() is fire-and-forget with no confirmation
* Issue #1007: Unbounded spawning can exhaust memory/swap over multi-day daemon
* runs. We enforce a max concurrent subprocess limit and reap orphans at startup.
*
* Solution:
* - Use SDK's spawnClaudeCodeProcess option to capture PIDs
* - Track all spawned processes with session association
* - Enforce MAX_CONCURRENT_CLAUDE_SUBPROCESSES; fail fast when at cap
* - Verify exit on session deletion with timeout + SIGKILL escalation
* - Safety net orphan reaper runs every 5 minutes
* - Defensive reap at startup targets system orphans only (ppid=1); it never touches the registry
*/

import { spawn, exec, ChildProcess } from 'child_process';
Expand All @@ -32,6 +32,16 @@ interface TrackedProcess {
// PID Registry - tracks spawned Claude subprocesses
const processRegistry = new Map<number, TrackedProcess>();

/**
 * Maximum concurrent Claude subprocesses (safety limit to prevent resource
 * exhaustion, Issue #1007).
 *
 * The PID-capturing spawn wrapper compares the registry size against this
 * value and throws instead of spawning once the limit is reached.
 */
export const MAX_CONCURRENT_CLAUDE_SUBPROCESSES = 50;

/**
 * Report how many subprocesses the registry is tracking right now.
 * Used for cap enforcement.
 *
 * @returns The number of entries currently held in the process registry.
 */
export function getActiveProcessCount(): number {
  const { size: trackedCount } = processRegistry;
  return trackedCount;
}

/**
* Register a spawned process in the registry
*/
Expand Down Expand Up @@ -156,6 +166,30 @@ async function killSystemOrphans(): Promise<number> {
}
}

/**
 * Startup-safe reap that targets OS-level orphans (ppid=1) exclusively.
 *
 * The process registry is never consulted or modified here, so subprocesses
 * that are still in flight are left alone.
 *
 * @returns The count reported by the system-orphan kill pass.
 */
export async function reapSystemOrphansOnly(): Promise<number> {
  const reapedCount = await killSystemOrphans();
  return reapedCount;
}

/**
 * Remove from the registry every tracked process that has already terminated.
 * Prevents cap enforcement from being stuck when exit events were missed.
 *
 * A process counts as terminated only once Node has recorded an exit code or a
 * terminating signal for it. `ChildProcess.killed` is deliberately NOT used:
 * it only reports that `kill()` successfully delivered a signal, not that the
 * process is gone, so relying on it would drop still-running processes from
 * tracking (and from the concurrency count) while they continue to hold
 * resources.
 */
export function pruneExitedProcesses(): void {
  // Collect first, then unregister, keeping iteration separate from removal.
  const exitedPids: number[] = [];
  for (const [pid, info] of processRegistry) {
    const { exitCode, signalCode } = info.process;
    // exitCode is set on a normal exit; signalCode is set when the process was
    // terminated by a signal (in which case exitCode stays null).
    // `!= null` on signalCode also tolerates objects that lack the property.
    if (exitCode !== null || signalCode != null) {
      exitedPids.push(pid);
    }
  }
  for (const pid of exitedPids) {
    unregisterProcess(pid);
  }
}

/**
* Reap orphaned processes - both registry-tracked and system-level
*/
Expand Down Expand Up @@ -199,6 +233,16 @@ export function createPidCapturingSpawn(sessionDbId: number) {
env?: NodeJS.ProcessEnv;
signal?: AbortSignal;
}) => {
pruneExitedProcesses();
if (processRegistry.size >= MAX_CONCURRENT_CLAUDE_SUBPROCESSES) {
logger.warn('PROCESS', `Refusing to spawn: at safety limit (${MAX_CONCURRENT_CLAUDE_SUBPROCESSES} concurrent Claude subprocesses)`, {
sessionDbId,
current: processRegistry.size
});
throw new Error(
`Maximum concurrent Claude subprocesses reached (${MAX_CONCURRENT_CLAUDE_SUBPROCESSES}). Please try again later or restart the worker.`
);
}
const child = spawn(spawnOptions.command, spawnOptions.args, {
cwd: spawnOptions.cwd,
env: spawnOptions.env,
Expand Down
Loading