From c2f30357268c1808838efe29aa05c28e7daff2f3 Mon Sep 17 00:00:00 2001 From: Rasmus Widing Date: Wed, 4 Mar 2026 17:24:21 +0200 Subject: [PATCH 1/4] perf: process kill retry, PID jitter, daemon runtime - Add process.wait() after kill() to verify termination and prevent zombie processes - Add +/-20% random jitter to PID file polling interval to prevent thundering herd when multiple kild create commands run simultaneously - Switch daemon from multi-threaded tokio runtime to current_thread since it's I/O-bound with low concurrency (PTY reads + IPC) Closes #478 --- crates/kild-core/src/process/operations.rs | 15 ++++++++++----- crates/kild-core/src/process/pid_file.rs | 17 ++++++++++++++--- crates/kild-daemon/src/main.rs | 13 +++++++++---- 3 files changed, 33 insertions(+), 12 deletions(-) diff --git a/crates/kild-core/src/process/operations.rs b/crates/kild-core/src/process/operations.rs index f68af5bb..412e175f 100644 --- a/crates/kild-core/src/process/operations.rs +++ b/crates/kild-core/src/process/operations.rs @@ -103,14 +103,19 @@ pub fn kill_process( }); } - if process.kill() { - Ok(()) - } else { - Err(ProcessError::KillFailed { + if !process.kill() { + return Err(ProcessError::KillFailed { pid, message: "Process kill signal failed".to_string(), - }) + }); } + + // Verify the process actually terminated. process.kill() sends + // the signal but the process may linger briefly. Poll with a + // bounded timeout to avoid orphaning zombies. + process.wait(); + + Ok(()) } None => Err(ProcessError::NotFound { pid }), } diff --git a/crates/kild-core/src/process/pid_file.rs b/crates/kild-core/src/process/pid_file.rs index e97d6d23..85e17c43 100644 --- a/crates/kild-core/src/process/pid_file.rs +++ b/crates/kild-core/src/process/pid_file.rs @@ -41,9 +41,11 @@ pub fn ensure_pid_dir(kild_dir: &Path) -> Result { /// /// The PID file is written by `echo $$ > file && exec cmd` before the /// agent process starts, so it typically appears within milliseconds. -/// Polls at 100ms intervals with a 3s timeout. +/// Polls at ~100ms intervals (with +/-20% jitter to prevent thundering +/// herd when multiple `kild create` commands run simultaneously) with +/// a 3s timeout. pub fn read_pid_file_with_retry(pid_file: &Path) -> Result, ProcessError> { - const POLL_INTERVAL: Duration = Duration::from_millis(100); + const BASE_INTERVAL_MS: u64 = 100; const MAX_WAIT: Duration = Duration::from_secs(3); let start = std::time::Instant::now(); @@ -82,7 +84,16 @@ pub fn read_pid_file_with_retry(pid_file: &Path) -> Result, ProcessE path = %pid_file.display() ); } - std::thread::sleep(POLL_INTERVAL); + + // Add +/-20% jitter to prevent thundering herd + let jitter_range = BASE_INTERVAL_MS / 5; // 20ms + let nanos = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .subsec_nanos(); + let jitter = (nanos as u64 % (jitter_range * 2 + 1)) as i64 - jitter_range as i64; + let interval_ms = (BASE_INTERVAL_MS as i64 + jitter) as u64; + std::thread::sleep(Duration::from_millis(interval_ms)); } // Timeout reached — surface errors encountered during polling diff --git a/crates/kild-daemon/src/main.rs b/crates/kild-daemon/src/main.rs index 5e9422d9..01fd945a 100644 --- a/crates/kild-daemon/src/main.rs +++ b/crates/kild-daemon/src/main.rs @@ -29,10 +29,15 @@ fn run() -> Result<(), Box> { e })?; - let rt = tokio::runtime::Runtime::new().map_err(|e| { - error!(event = "daemon.runtime_init_failed", error = %e); - e - })?; + // The daemon is I/O-bound with low concurrency (PTY reads + IPC), + // so a single-threaded runtime avoids the overhead of a thread pool. + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .map_err(|e| { + error!(event = "daemon.runtime_init_failed", error = %e); + e + })?; rt.block_on(async { kild_daemon::run_server(config).await.map_err(|e| { From ec40bc2f9125bd7f14ffa0aac53574ae493fae17 Mon Sep 17 00:00:00 2001 From: Rasmus Widing Date: Wed, 4 Mar 2026 18:01:16 +0200 Subject: [PATCH 2/4] fix: bounded kill wait, PID-based jitter, revert daemon runtime - Replace process.wait() with bounded 500ms poll loop to avoid blocking indefinitely on uninterruptible sleep - Use PID-based jitter instead of SystemTime nanos which are correlated across simultaneous launches - Revert current_thread runtime: daemon multiplexes PTYs + IPC across multiple sessions, single-threaded would stall all sessions on any slow operation --- crates/kild-core/src/process/operations.rs | 13 +++++++++---- crates/kild-core/src/process/pid_file.rs | 9 +++------ crates/kild-daemon/src/main.rs | 13 ++++--------- 3 files changed, 16 insertions(+), 19 deletions(-) diff --git a/crates/kild-core/src/process/operations.rs b/crates/kild-core/src/process/operations.rs index 412e175f..fa8b8362 100644 --- a/crates/kild-core/src/process/operations.rs +++ b/crates/kild-core/src/process/operations.rs @@ -110,10 +110,15 @@ pub fn kill_process( }); } - // Verify the process actually terminated. process.kill() sends - // the signal but the process may linger briefly. Poll with a - // bounded timeout to avoid orphaning zombies. - process.wait(); + // Best-effort wait: give the process up to 500ms to exit after + // SIGKILL. Avoids blocking indefinitely on uninterruptible sleep. + let deadline = std::time::Instant::now() + std::time::Duration::from_millis(500); + while std::time::Instant::now() < deadline { + if !is_process_running(pid).unwrap_or(false) { + break; + } + std::thread::sleep(std::time::Duration::from_millis(10)); + } Ok(()) } diff --git a/crates/kild-core/src/process/pid_file.rs b/crates/kild-core/src/process/pid_file.rs index 85e17c43..4df24e71 100644 --- a/crates/kild-core/src/process/pid_file.rs +++ b/crates/kild-core/src/process/pid_file.rs @@ -85,13 +85,10 @@ pub fn read_pid_file_with_retry(pid_file: &Path) -> Result, ProcessE ); } - // Add +/-20% jitter to prevent thundering herd + // Add +/-20% jitter using PID to decorrelate simultaneous launches let jitter_range = BASE_INTERVAL_MS / 5; // 20ms - let nanos = std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap_or_default() - .subsec_nanos(); - let jitter = (nanos as u64 % (jitter_range * 2 + 1)) as i64 - jitter_range as i64; + let pid_jitter = (std::process::id() as u64) % (jitter_range * 2 + 1); + let jitter = pid_jitter as i64 - jitter_range as i64; let interval_ms = (BASE_INTERVAL_MS as i64 + jitter) as u64; std::thread::sleep(Duration::from_millis(interval_ms)); } diff --git a/crates/kild-daemon/src/main.rs b/crates/kild-daemon/src/main.rs index 01fd945a..5e9422d9 100644 --- a/crates/kild-daemon/src/main.rs +++ b/crates/kild-daemon/src/main.rs @@ -29,15 +29,10 @@ fn run() -> Result<(), Box> { e })?; - // The daemon is I/O-bound with low concurrency (PTY reads + IPC), - // so a single-threaded runtime avoids the overhead of a thread pool. - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .map_err(|e| { - error!(event = "daemon.runtime_init_failed", error = %e); - e - })?; + let rt = tokio::runtime::Runtime::new().map_err(|e| { + error!(event = "daemon.runtime_init_failed", error = %e); + e + })?; rt.block_on(async { kild_daemon::run_server(config).await.map_err(|e| { From c87a5df95fa3b8e42bfaade43226b8dc0309f3b5 Mon Sep 17 00:00:00 2001 From: Rasmus Widing Date: Tue, 17 Mar 2026 13:22:28 +0200 Subject: [PATCH 3/4] fix: address review feedback on kill wait loop and PID jitter - Reuse existing `system` in kill wait loop instead of allocating a new System::new() on each of up to 50 poll iterations - Add debug log when kill wait times out (process didn't exit within 500ms after SIGKILL) - Hoist constant jitter calculation above the polling loop - Align docstring to use "decorrelate" instead of "thundering herd" --- crates/kild-core/src/process/operations.rs | 15 +++++++++++++-- crates/kild-core/src/process/pid_file.rs | 19 ++++++++++--------- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/crates/kild-core/src/process/operations.rs b/crates/kild-core/src/process/operations.rs index fa8b8362..410828d6 100644 --- a/crates/kild-core/src/process/operations.rs +++ b/crates/kild-core/src/process/operations.rs @@ -111,15 +111,26 @@ pub fn kill_process( } // Best-effort wait: give the process up to 500ms to exit after - // SIGKILL. Avoids blocking indefinitely on uninterruptible sleep. + // SIGKILL. Reuses the existing `system` to avoid repeated allocations. + let mut exited = false; let deadline = std::time::Instant::now() + std::time::Duration::from_millis(500); while std::time::Instant::now() < deadline { - if !is_process_running(pid).unwrap_or(false) { + system.refresh_processes(ProcessesToUpdate::Some(&[pid_obj]), true); + if system.process(pid_obj).is_none() { + exited = true; break; } std::thread::sleep(std::time::Duration::from_millis(10)); } + if !exited { + debug!( + event = "core.process.kill_wait_timeout", + pid = pid, + message = "process did not exit within 500ms after SIGKILL" + ); + } + Ok(()) } None => Err(ProcessError::NotFound { pid }), diff --git a/crates/kild-core/src/process/pid_file.rs b/crates/kild-core/src/process/pid_file.rs index 4df24e71..e83e2a54 100644 --- a/crates/kild-core/src/process/pid_file.rs +++ b/crates/kild-core/src/process/pid_file.rs @@ -41,13 +41,19 @@ pub fn ensure_pid_dir(kild_dir: &Path) -> Result { /// /// The PID file is written by `echo $$ > file && exec cmd` before the /// agent process starts, so it typically appears within milliseconds. -/// Polls at ~100ms intervals (with +/-20% jitter to prevent thundering -/// herd when multiple `kild create` commands run simultaneously) with -/// a 3s timeout. +/// Polls at ~100ms intervals (with +/-20% PID-based jitter to decorrelate +/// simultaneous `kild create` launches) with a 3s timeout. pub fn read_pid_file_with_retry(pid_file: &Path) -> Result, ProcessError> { const BASE_INTERVAL_MS: u64 = 100; const MAX_WAIT: Duration = Duration::from_secs(3); + // Compute jitter once — deterministic per-process, varies across concurrent launches + let jitter_range = BASE_INTERVAL_MS / 5; // 20ms + let pid_jitter = (std::process::id() as u64) % (jitter_range * 2 + 1); + let jitter = pid_jitter as i64 - jitter_range as i64; + // poll_interval is in [80, 120] ms — cannot underflow + let poll_interval = Duration::from_millis((BASE_INTERVAL_MS as i64 + jitter) as u64); + let start = std::time::Instant::now(); let mut last_error: Option = None; @@ -85,12 +91,7 @@ pub fn read_pid_file_with_retry(pid_file: &Path) -> Result, ProcessE ); } - // Add +/-20% jitter using PID to decorrelate simultaneous launches - let jitter_range = BASE_INTERVAL_MS / 5; // 20ms - let pid_jitter = (std::process::id() as u64) % (jitter_range * 2 + 1); - let jitter = pid_jitter as i64 - jitter_range as i64; - let interval_ms = (BASE_INTERVAL_MS as i64 + jitter) as u64; - std::thread::sleep(Duration::from_millis(interval_ms)); + std::thread::sleep(poll_interval); } // Timeout reached — surface errors encountered during polling From 600973b60f7c3d9f4110a23a2e171c419af7c0aa Mon Sep 17 00:00:00 2001 From: Rasmus Widing Date: Tue, 17 Mar 2026 13:24:21 +0200 Subject: [PATCH 4/4] refactor: simplify kill wait loop and jitter arithmetic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace exited flag with early return on process exit - Use elapsed() pattern consistent with read_pid_file_with_retry - Remove unnecessary signed-integer casts in jitter computation — stays in u64 since BASE_INTERVAL_MS (100) > JITTER_RANGE_MS (20) --- crates/kild-core/src/process/operations.rs | 20 ++++++++------------ crates/kild-core/src/process/pid_file.rs | 11 +++++------ 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/crates/kild-core/src/process/operations.rs b/crates/kild-core/src/process/operations.rs index 410828d6..cdd9a4d3 100644 --- a/crates/kild-core/src/process/operations.rs +++ b/crates/kild-core/src/process/operations.rs @@ -112,24 +112,20 @@ pub fn kill_process( // Best-effort wait: give the process up to 500ms to exit after // SIGKILL. Reuses the existing `system` to avoid repeated allocations. - let mut exited = false; - let deadline = std::time::Instant::now() + std::time::Duration::from_millis(500); - while std::time::Instant::now() < deadline { + let start = std::time::Instant::now(); + while start.elapsed() < std::time::Duration::from_millis(500) { system.refresh_processes(ProcessesToUpdate::Some(&[pid_obj]), true); if system.process(pid_obj).is_none() { - exited = true; - break; + return Ok(()); } std::thread::sleep(std::time::Duration::from_millis(10)); } - if !exited { - debug!( - event = "core.process.kill_wait_timeout", - pid = pid, - message = "process did not exit within 500ms after SIGKILL" - ); - } + debug!( + event = "core.process.kill_wait_timeout", + pid = pid, + message = "process did not exit within 500ms after SIGKILL" + ); Ok(()) } diff --git a/crates/kild-core/src/process/pid_file.rs b/crates/kild-core/src/process/pid_file.rs index e83e2a54..b14ef9f5 100644 --- a/crates/kild-core/src/process/pid_file.rs +++ b/crates/kild-core/src/process/pid_file.rs @@ -47,12 +47,11 @@ pub fn read_pid_file_with_retry(pid_file: &Path) -> Result, ProcessE const BASE_INTERVAL_MS: u64 = 100; const MAX_WAIT: Duration = Duration::from_secs(3); - // Compute jitter once — deterministic per-process, varies across concurrent launches - let jitter_range = BASE_INTERVAL_MS / 5; // 20ms - let pid_jitter = (std::process::id() as u64) % (jitter_range * 2 + 1); - let jitter = pid_jitter as i64 - jitter_range as i64; - // poll_interval is in [80, 120] ms — cannot underflow - let poll_interval = Duration::from_millis((BASE_INTERVAL_MS as i64 + jitter) as u64); + // Compute jitter once — deterministic per-process, varies across concurrent launches. + // Maps PID to [0, 40], subtracts 20 → poll_interval in [80, 120] ms (no underflow). + const JITTER_RANGE_MS: u64 = BASE_INTERVAL_MS / 5; // 20ms + let pid_offset = (std::process::id() as u64) % (JITTER_RANGE_MS * 2 + 1); + let poll_interval = Duration::from_millis(BASE_INTERVAL_MS + pid_offset - JITTER_RANGE_MS); let start = std::time::Instant::now(); let mut last_error: Option = None;