Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions src/services/worker-service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,16 @@ export class WorkerService {
this.resolveInitialization();
logger.info('SYSTEM', 'Background initialization complete');

// Defensive cleanup: reap stale Claude subprocesses from previous daemon run (Issue #1007)
try {
const startupReaped = await reapOrphanedProcesses(new Set());
if (startupReaped > 0) {
logger.info('SYSTEM', `Reaped ${startupReaped} orphaned process(es) from previous run`, { reaped: startupReaped });
}
} catch (err) {
logger.warn('SYSTEM', 'Startup reap of orphaned processes failed (non-fatal)', {}, err as Error);
}

// Start orphan reaper to clean up zombie processes (Issue #737)
this.stopOrphanReaper = startOrphanReaper(() => {
const activeIds = new Set<number>();
Expand Down
27 changes: 23 additions & 4 deletions src/services/worker/ProcessRegistry.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,16 @@
* Fixes Issue #737: Claude haiku subprocesses don't terminate properly,
* causing zombie process accumulation (user reported 155 processes / 51GB RAM).
*
* Root causes:
* 1. SDK's SpawnedProcess interface hides subprocess PIDs
* 2. deleteSession() doesn't verify subprocess exit before cleanup
* 3. abort() is fire-and-forget with no confirmation
* Issue #1007: Unbounded spawning can exhaust memory/swap over multi-day daemon
* runs. We enforce a max concurrent subprocess limit and reap orphans at startup.
*
* Solution:
* - Use SDK's spawnClaudeCodeProcess option to capture PIDs
* - Track all spawned processes with session association
* - Enforce MAX_CONCURRENT_CLAUDE_SUBPROCESSES; fail fast when at cap
* - Verify exit on session deletion with timeout + SIGKILL escalation
* - Safety net orphan reaper runs every 5 minutes
* - Defensive reap of system orphans on daemon startup
*/

import { spawn, exec, ChildProcess } from 'child_process';
Expand All @@ -32,6 +32,16 @@ interface TrackedProcess {
// PID Registry - tracks spawned Claude subprocesses
const processRegistry = new Map<number, TrackedProcess>();

/**
 * Maximum concurrent Claude subprocesses (safety limit to prevent resource exhaustion, Issue #1007).
 *
 * The PID-capturing spawn path compares the registry size against this value and,
 * once the registry holds this many entries, logs a warning and throws instead of
 * spawning another subprocess.
 */
export const MAX_CONCURRENT_CLAUDE_SUBPROCESSES = 50;

/**
 * Report how many subprocesses the registry is tracking right now.
 * Intended for cap enforcement.
 *
 * @returns The number of entries currently held in the process registry.
 */
export function getActiveProcessCount(): number {
  const { size: trackedCount } = processRegistry;
  return trackedCount;
}

/**
* Register a spawned process in the registry
*/
Expand Down Expand Up @@ -199,6 +209,15 @@ export function createPidCapturingSpawn(sessionDbId: number) {
env?: NodeJS.ProcessEnv;
signal?: AbortSignal;
}) => {
if (processRegistry.size >= MAX_CONCURRENT_CLAUDE_SUBPROCESSES) {
logger.warn('PROCESS', `Refusing to spawn: at safety limit (${MAX_CONCURRENT_CLAUDE_SUBPROCESSES} concurrent Claude subprocesses)`, {
sessionDbId,
current: processRegistry.size
});
throw new Error(
`Maximum concurrent Claude subprocesses reached (${MAX_CONCURRENT_CLAUDE_SUBPROCESSES}). Please try again later or restart the worker.`
);
}
const child = spawn(spawnOptions.command, spawnOptions.args, {
cwd: spawnOptions.cwd,
env: spawnOptions.env,
Expand Down
Loading