From 81f4a212b212d7528fb958a2c52c67b3f53b2cee Mon Sep 17 00:00:00 2001 From: Forketyfork Date: Tue, 24 Feb 2026 15:36:44 +0100 Subject: [PATCH 1/2] fix(runtime): preserve agent session capture during quit Issue: Quitting Architect with multiple active agent sessions could miss resume IDs because trailing PTY output arrived after sessions were marked dead, and shutdown debug tail logging printed control-heavy terminal data into the launcher shell. Solution: Keep draining PTY output for spawned sessions even after process exit so late bytes are still processed. Add a bounded post-worker drain-until-quiet pass before UUID extraction to capture trailing shutdown output deterministically. Remove raw terminal-tail debug logging to avoid control-sequence noise in the launcher shell while retaining concise capture diagnostics. --- docs/ARCHITECTURE.md | 2 +- src/app/runtime.zig | 63 ++++++++++++++++++++++++++++++++++++++++++- src/session/state.zig | 14 +++++++++- 3 files changed, 76 insertions(+), 3 deletions(-) diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 2da3252..f3d0478 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -477,7 +477,7 @@ Renderer draws attention border / story overlay ### ADR-014: Agent Session Detection, Persistence, and Resumption - **Decision:** Architect detects running AI agents at quit time, captures their session UUIDs, persists them in `persistence.toml`, and automatically resumes them on next launch. The quit-time teardown runs asynchronously on a background worker thread while the main thread keeps rendering terminal updates. -- **Context:** To persist an agent's session ID for resumption on next launch, Architect must capture the session UUID that the agent prints to the PTY during graceful shutdown. 
The quit sequence is: detect running agent via macOS `sysctl`/process inspection → start a background teardown worker → worker launches one teardown task per detected agent session in parallel; each task injects `Ctrl+C` twice (all supported agents), waits, retries once, and finally sends SIGTERM as last resort → main thread continues polling PTY output/rendering terminals, so users can see agents stopping in real time → a full-screen `quit_blocking_overlay` blocks all input and renders a shimmering gray veil while teardown is in progress → Architect extracts UUIDs only from PTY bytes captured after shutdown begins (not full history) and persists successful captures to `persistence.toml`. +- **Context:** To persist an agent's session ID for resumption on next launch, Architect must capture the session UUID that the agent prints to the PTY during graceful shutdown. The quit sequence is: detect running agent via macOS `sysctl`/process inspection → start a background teardown worker → worker launches one teardown task per detected agent session in parallel; each task injects `Ctrl+C` twice (all supported agents), waits, retries once, and finally sends SIGTERM as last resort → main thread continues polling PTY output/rendering terminals (including post-exit PTY drain while sessions are still allocated), so users can see agents stopping in real time and trailing output is not dropped → a full-screen `quit_blocking_overlay` blocks all input and renders a shimmering gray veil while teardown is in progress → after worker completion, runtime performs a bounded drain-until-quiet pass over all affected PTYs to capture trailing output that arrived after the worker reported done → Architect extracts UUIDs only from PTY bytes captured after shutdown begins (not full history) and persists successful captures to `persistence.toml`. 
- **Agent detection strategy:** `session/state.detectForegroundAgent()` reads the foreground process-group leader's process image name (`kp_proc.p_comm`) via `sysctl KERN_PROC_PID`. If `p_comm` is `"claude"`, `"codex"`, or `"gemini"`, the agent is identified directly. If `p_comm` is `"node"`, `KERN_PROCARGS2` is read to inspect `argv[1]`; if the script path contains `"claude"`, `"codex"`, or `"gemini"`, the corresponding agent is matched. This uniform approach covers both direct binaries and Node.js-wrapped agents. - **Resume-command injection:** On next launch, `app/runtime.zig` reads the persisted `agent_type` and `agent_session_id` from `persistence.toml`. If both are present, it appends the resume command (e.g., `claude --resume <uuid>`) to `session.pending_write` immediately after spawning the shell. The shell reads this input once it is ready, so no timing synchronization is needed. - **Layer boundary:** `app/runtime.zig` owns quit orchestration (worker lifecycle, PTY exit signaling by fd, persistence timing) and UI blocking state. `session/state.zig` owns agent detection and session metadata access. `app/terminal_history.zig` owns text analysis (UUID extraction). UI components (`ui/components/quit_blocking_overlay.zig`) own the visual/input lock behavior. 
diff --git a/src/app/runtime.zig b/src/app/runtime.zig index 8bba3ca..7cd84b4 100644 --- a/src/app/runtime.zig +++ b/src/app/runtime.zig @@ -511,6 +511,9 @@ fn handleQuitRequest( const quit_primary_wait_ms: u64 = 2500; const quit_retry_wait_ms: u64 = 2500; const quit_term_wait_ms: u64 = 500; +const quit_capture_drain_poll_ns: u64 = 20 * std.time.ns_per_ms; +const quit_capture_drain_quiet_ns: i128 = 250 * @as(i128, std.time.ns_per_ms); +const quit_capture_drain_max_ns: i128 = 2500 * @as(i128, std.time.ns_per_ms); const QuitTeardownTask = struct { session_idx: usize, @@ -653,6 +656,47 @@ fn foregroundPgrp(slave_path_z: [:0]const u8, shell_pid: posix.pid_t) ?posix.pid return fg_pgrp; } +fn drainQuitCaptureOutput(tasks: []const QuitTeardownTask, sessions: []const *SessionState) void { + if (tasks.len == 0) return; + + var last_capture_lengths: [grid_layout.max_terminals]usize = [_]usize{0} ** grid_layout.max_terminals; + for (tasks, 0..) |task, idx| { + last_capture_lengths[idx] = sessions[task.session_idx].quitCaptureBytes().len; + } + + const start_ns = std.time.nanoTimestamp(); + var last_growth_ns = start_ns; + + while (true) { + var saw_growth = false; + for (tasks, 0..) 
|task, idx| { + const session = sessions[task.session_idx]; + session.processOutput() catch |err| { + log.warn("quit teardown: session {d} post-worker output drain failed: {}", .{ task.session_idx, err }); + }; + const new_len = session.quitCaptureBytes().len; + if (new_len > last_capture_lengths[idx]) { + saw_growth = true; + } + last_capture_lengths[idx] = new_len; + } + + const now_ns = std.time.nanoTimestamp(); + if (saw_growth) { + last_growth_ns = now_ns; + } + + if (!shouldContinueQuitCaptureDrain(start_ns, last_growth_ns, now_ns)) break; + std.Thread.sleep(quit_capture_drain_poll_ns); + } +} + +fn shouldContinueQuitCaptureDrain(start_ns: i128, last_growth_ns: i128, now_ns: i128) bool { + const quiet_elapsed = now_ns - last_growth_ns; + const total_elapsed = now_ns - start_ns; + return quiet_elapsed < quit_capture_drain_quiet_ns and total_elapsed < quit_capture_drain_max_ns; +} + fn startQuitFlow( quit_state: *QuitTeardownState, sessions: []*SessionState, @@ -2482,6 +2526,7 @@ pub fn run() !void { if (quit_teardown.active) { quit_blocking_overlay_component.setActive(false); quit_teardown.join(); + drainQuitCaptureOutput(quit_teardown.tasks[0..quit_teardown.task_count], sessions[0..]); for (quit_teardown.tasks[0..quit_teardown.task_count]) |task| { const session = sessions[task.session_idx]; session.stopQuitCapture(); @@ -2492,7 +2537,6 @@ pub fn run() !void { session.agent_kind = null; const text = session.quitCaptureBytes(); log.debug("quit teardown: session {d} extracted {d} bytes of terminal text", .{ task.session_idx, text.len }); - log.debug("quit teardown: session {d} terminal text tail: {s}", .{ task.session_idx, text[@max(0, text.len -| 1000)..] 
}); if (terminal_history.extractLastUuid(text)) |uuid| { log.info("quit teardown: session {d} captured session id: {s}", .{ task.session_idx, uuid }); session.agent_kind = task.agent_kind; @@ -2550,6 +2594,23 @@ test "markTeardownComplete returns true only once" { try std.testing.expect(!markTeardownComplete(&done)); } +test "shouldContinueQuitCaptureDrain stops after quiet window" { + const start_ns: i128 = 0; + const last_growth_ns: i128 = 0; + const at_quiet_boundary = quit_capture_drain_quiet_ns; + try std.testing.expect(!shouldContinueQuitCaptureDrain(start_ns, last_growth_ns, at_quiet_boundary)); + + const just_before_quiet = quit_capture_drain_quiet_ns - 1; + try std.testing.expect(shouldContinueQuitCaptureDrain(start_ns, last_growth_ns, just_before_quiet)); +} + +test "shouldContinueQuitCaptureDrain stops after max window" { + const start_ns: i128 = 0; + const recent_growth_ns = quit_capture_drain_max_ns - 1; + const at_max_boundary = quit_capture_drain_max_ns; + try std.testing.expect(!shouldContinueQuitCaptureDrain(start_ns, recent_growth_ns, at_max_boundary)); +} + const TestSwapError = error{InitFailed}; const TestResource = struct { diff --git a/src/session/state.zig b/src/session/state.zig index f3b4a24..23bec50 100644 --- a/src/session/state.zig +++ b/src/session/state.zig @@ -455,7 +455,7 @@ pub const SessionState = struct { } pub fn processOutput(self: *SessionState) ProcessOutputError!void { - if (!self.spawned or self.dead) return; + if (!shouldProcessOutput(self.spawned, self.dead)) return; const shell = &(self.shell orelse return); const stream = &(self.stream orelse return); @@ -484,6 +484,11 @@ pub const SessionState = struct { } } + fn shouldProcessOutput(spawned: bool, dead: bool) bool { + _ = dead; + return spawned; + } + /// Try to flush any queued stdin data; preserves ordering relative to new input. 
pub fn flushPendingWrites(self: *SessionState) !void { if (self.pending_write.items.len == 0) return; @@ -842,6 +847,13 @@ test "pending write shrinks when empty and over threshold" { try std.testing.expect(buf.capacity <= pending_write_shrink_threshold); } +test "shouldProcessOutput keeps draining after process exit" { + try std.testing.expect(!SessionState.shouldProcessOutput(false, false)); + try std.testing.expect(!SessionState.shouldProcessOutput(false, true)); + try std.testing.expect(SessionState.shouldProcessOutput(true, false)); + try std.testing.expect(SessionState.shouldProcessOutput(true, true)); +} + test "AgentKind.fromComm recognises known agent names" { try std.testing.expectEqual(AgentKind.claude, AgentKind.fromComm("claude").?); try std.testing.expectEqual(AgentKind.codex, AgentKind.fromComm("codex").?); From 96d18c868148dba6ce286e326c39130d2f5ef182 Mon Sep 17 00:00:00 2001 From: Forketyfork Date: Tue, 24 Feb 2026 15:49:49 +0100 Subject: [PATCH 2/2] fix(session): avoid dead PTY reads outside quit capture Address PR review comment: https://github.com/forketyfork/architect/pull/243#discussion_r2847569529 Limit post-exit output draining to sessions with active quit capture, and treat PTY EIO as terminal EOF during reads. This preserves quit-time UUID capture while avoiding runtime failures when dead sessions are polled in normal loops. --- docs/ARCHITECTURE.md | 2 +- src/session/state.zig | 29 +++++++++++++++++------------ 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index f3d0478..e883530 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -477,7 +477,7 @@ Renderer draws attention border / story overlay ### ADR-014: Agent Session Detection, Persistence, and Resumption - **Decision:** Architect detects running AI agents at quit time, captures their session UUIDs, persists them in `persistence.toml`, and automatically resumes them on next launch. 
The quit-time teardown runs asynchronously on a background worker thread while the main thread keeps rendering terminal updates. -- **Context:** To persist an agent's session ID for resumption on next launch, Architect must capture the session UUID that the agent prints to the PTY during graceful shutdown. The quit sequence is: detect running agent via macOS `sysctl`/process inspection → start a background teardown worker → worker launches one teardown task per detected agent session in parallel; each task injects `Ctrl+C` twice (all supported agents), waits, retries once, and finally sends SIGTERM as last resort → main thread continues polling PTY output/rendering terminals (including post-exit PTY drain while sessions are still allocated), so users can see agents stopping in real time and trailing output is not dropped → a full-screen `quit_blocking_overlay` blocks all input and renders a shimmering gray veil while teardown is in progress → after worker completion, runtime performs a bounded drain-until-quiet pass over all affected PTYs to capture trailing output that arrived after the worker reported done → Architect extracts UUIDs only from PTY bytes captured after shutdown begins (not full history) and persists successful captures to `persistence.toml`. +- **Context:** To persist an agent's session ID for resumption on next launch, Architect must capture the session UUID that the agent prints to the PTY during graceful shutdown. 
The quit sequence is: detect running agent via macOS `sysctl`/process inspection → start a background teardown worker → worker launches one teardown task per detected agent session in parallel; each task injects `Ctrl+C` twice (all supported agents), waits, retries once, and finally sends SIGTERM as last resort → main thread continues polling PTY output/rendering terminals (including post-exit PTY drain only for sessions with active quit capture while they are still allocated), so users can see agents stopping in real time and trailing output is not dropped → a full-screen `quit_blocking_overlay` blocks all input and renders a shimmering gray veil while teardown is in progress → after worker completion, runtime performs a bounded drain-until-quiet pass over all affected PTYs to capture trailing output that arrived after the worker reported done → Architect extracts UUIDs only from PTY bytes captured after shutdown begins (not full history) and persists successful captures to `persistence.toml`. - **Agent detection strategy:** `session/state.detectForegroundAgent()` reads the foreground process-group leader's process image name (`kp_proc.p_comm`) via `sysctl KERN_PROC_PID`. If `p_comm` is `"claude"`, `"codex"`, or `"gemini"`, the agent is identified directly. If `p_comm` is `"node"`, `KERN_PROCARGS2` is read to inspect `argv[1]`; if the script path contains `"claude"`, `"codex"`, or `"gemini"`, the corresponding agent is matched. This uniform approach covers both direct binaries and Node.js-wrapped agents. - **Resume-command injection:** On next launch, `app/runtime.zig` reads the persisted `agent_type` and `agent_session_id` from `persistence.toml`. If both are present, it appends the resume command (e.g., `claude --resume <uuid>`) to `session.pending_write` immediately after spawning the shell. The shell reads this input once it is ready, so no timing synchronization is needed. 
- **Layer boundary:** `app/runtime.zig` owns quit orchestration (worker lifecycle, PTY exit signaling by fd, persistence timing) and UI blocking state. `session/state.zig` owns agent detection and session metadata access. `app/terminal_history.zig` owns text analysis (UUID extraction). UI components (`ui/components/quit_blocking_overlay.zig`) own the visual/input lock behavior. diff --git a/src/session/state.zig b/src/session/state.zig index 23bec50..db71599 100644 --- a/src/session/state.zig +++ b/src/session/state.zig @@ -455,15 +455,18 @@ pub const SessionState = struct { } pub fn processOutput(self: *SessionState) ProcessOutputError!void { - if (!shouldProcessOutput(self.spawned, self.dead)) return; + if (!shouldProcessOutput(self.spawned, self.dead, self.quit_capture_active)) return; const shell = &(self.shell orelse return); const stream = &(self.stream orelse return); while (true) { - const n = shell.read(&self.output_buf) catch |err| { - if (err == error.WouldBlock) return; - return err; + const n = shell.read(&self.output_buf) catch |err| switch (err) { + error.WouldBlock => return, + // Linux PTYs can report EIO after the slave side closes. + // Treat it as terminal EOF so normal dead sessions don't fail the runtime loop. + error.InputOutput => return, + else => return err, }; if (n == 0) return; @@ -484,9 +487,10 @@ pub const SessionState = struct { } } - fn shouldProcessOutput(spawned: bool, dead: bool) bool { - _ = dead; - return spawned; + fn shouldProcessOutput(spawned: bool, dead: bool, quit_capture_active: bool) bool { + if (!spawned) return false; + if (!dead) return true; + return quit_capture_active; } /// Try to flush any queued stdin data; preserves ordering relative to new input. 
@@ -847,11 +851,12 @@ test "pending write shrinks when empty and over threshold" { try std.testing.expect(buf.capacity <= pending_write_shrink_threshold); } -test "shouldProcessOutput keeps draining after process exit" { - try std.testing.expect(!SessionState.shouldProcessOutput(false, false)); - try std.testing.expect(!SessionState.shouldProcessOutput(false, true)); - try std.testing.expect(SessionState.shouldProcessOutput(true, false)); - try std.testing.expect(SessionState.shouldProcessOutput(true, true)); +test "shouldProcessOutput drains dead sessions only during quit capture" { + try std.testing.expect(!SessionState.shouldProcessOutput(false, false, false)); + try std.testing.expect(!SessionState.shouldProcessOutput(false, true, false)); + try std.testing.expect(SessionState.shouldProcessOutput(true, false, false)); + try std.testing.expect(!SessionState.shouldProcessOutput(true, true, false)); + try std.testing.expect(SessionState.shouldProcessOutput(true, true, true)); } test "AgentKind.fromComm recognises known agent names" {