diff --git a/crates/kild-core/src/process/operations.rs b/crates/kild-core/src/process/operations.rs index f68af5bb..cdd9a4d3 100644 --- a/crates/kild-core/src/process/operations.rs +++ b/crates/kild-core/src/process/operations.rs @@ -103,14 +103,31 @@ pub fn kill_process( }); } - if process.kill() { - Ok(()) - } else { - Err(ProcessError::KillFailed { + if !process.kill() { + return Err(ProcessError::KillFailed { pid, message: "Process kill signal failed".to_string(), - }) + }); + } + + // Best-effort wait: give the process up to 500ms to exit after + // SIGKILL. Reuses the existing `system` to avoid repeated allocations. + let start = std::time::Instant::now(); + while start.elapsed() < std::time::Duration::from_millis(500) { + system.refresh_processes(ProcessesToUpdate::Some(&[pid_obj]), true); + if system.process(pid_obj).is_none() { + return Ok(()); + } + std::thread::sleep(std::time::Duration::from_millis(10)); } + + debug!( + event = "core.process.kill_wait_timeout", + pid = pid, + message = "process did not exit within 500ms after SIGKILL" + ); + + Ok(()) } None => Err(ProcessError::NotFound { pid }), } diff --git a/crates/kild-core/src/process/pid_file.rs b/crates/kild-core/src/process/pid_file.rs index e97d6d23..b14ef9f5 100644 --- a/crates/kild-core/src/process/pid_file.rs +++ b/crates/kild-core/src/process/pid_file.rs @@ -41,11 +41,18 @@ pub fn ensure_pid_dir(kild_dir: &Path) -> Result { /// /// The PID file is written by `echo $$ > file && exec cmd` before the /// agent process starts, so it typically appears within milliseconds. -/// Polls at 100ms intervals with a 3s timeout. +/// Polls at ~100ms intervals (with +/-20% PID-based jitter to decorrelate +/// simultaneous `kild create` launches) with a 3s timeout. pub fn read_pid_file_with_retry(pid_file: &Path) -> Result, ProcessError> { - const POLL_INTERVAL: Duration = Duration::from_millis(100); + const BASE_INTERVAL_MS: u64 = 100; const MAX_WAIT: Duration = Duration::from_secs(3); + // Compute jitter once — deterministic per-process, varies across concurrent launches. + // Maps PID to [0, 40], subtracts 20 → poll_interval in [80, 120] ms (no underflow). + const JITTER_RANGE_MS: u64 = BASE_INTERVAL_MS / 5; // 20ms + let pid_offset = (std::process::id() as u64) % (JITTER_RANGE_MS * 2 + 1); + let poll_interval = Duration::from_millis(BASE_INTERVAL_MS + pid_offset - JITTER_RANGE_MS); + let start = std::time::Instant::now(); let mut last_error: Option = None; @@ -82,7 +89,8 @@ pub fn read_pid_file_with_retry(pid_file: &Path) -> Result, ProcessE path = %pid_file.display() ); } - std::thread::sleep(POLL_INTERVAL); + + std::thread::sleep(poll_interval); } // Timeout reached — surface errors encountered during polling