diff --git a/Sources/TerminalController.swift b/Sources/TerminalController.swift index aa983480e..92b2b91f3 100644 --- a/Sources/TerminalController.swift +++ b/Sources/TerminalController.swift @@ -49,6 +49,20 @@ class TerminalController { private nonisolated(unsafe) var listenerStartInProgress = false private nonisolated let listenerStateLock = NSLock() private var clientHandlers: [Int32: Thread] = [:] + /// Dispatches accepted socket connections onto GCD instead of one `NSThread` per client. + /// This queue is **concurrent**: many `async` blocks may be waiting; the semaphore below caps + /// how many run `handleClient` at once (so a connection flood still queues work, but avoids + /// unbounded thread creation when handlers stall on the main actor). + private nonisolated let clientQueue = DispatchQueue( + label: "com.cmuxterm.socket-clients", + qos: .utility, + attributes: .concurrent + ) + /// Max concurrent `handleClient` executions (tunable). Chosen as a generous ceiling for normal + /// CLI/automation parallelism while bounding worst-case thread/memory growth. + private nonisolated let clientConcurrencyLimit = DispatchSemaphore(value: 64) + /// How long a connection waits for a handler slot before we reject with "Server busy". + private nonisolated let clientHandlerSlotWait: DispatchTimeInterval = .seconds(30) private var tabManager: TabManager? private var accessMode: SocketControlMode = .cmuxOnly private let myPid = getpid() @@ -1512,9 +1526,22 @@ class TerminalController { // the time a new thread starts the peer may already be gone. let peerPid = getPeerPid(clientSocket) - // Handle client in new thread - Thread.detachNewThread { [weak self] in - self?.handleClient(clientSocket, peerPid: peerPid) + // Handle client on bounded queue to prevent unbounded thread growth + // when the main thread is temporarily unresponsive. + clientQueue.async { [weak self] in + guard let self else { + close(clientSocket) + return + } + guard self.clientConcurrencyLimit.wait(timeout: .now() + self.clientHandlerSlotWait) == .success else { + // All handler slots full — reject this connection to apply backpressure. + let msg = "ERROR: Server busy\n" + msg.withCString { ptr in _ = write(clientSocket, ptr, strlen(ptr)) } + close(clientSocket) + return + } + defer { self.clientConcurrencyLimit.signal() } + self.handleClient(clientSocket, peerPid: peerPid) } } } diff --git a/docs/pr-bound-socket-thread-pool.md b/docs/pr-bound-socket-thread-pool.md new file mode 100644 index 000000000..85db208f3 --- /dev/null +++ b/docs/pr-bound-socket-thread-pool.md @@ -0,0 +1,37 @@ +## Summary + +Replace unbounded `Thread.detachNewThread` for socket client handlers with a +bounded concurrent `DispatchQueue` and semaphore, preventing runaway thread +accumulation when the main thread is temporarily unresponsive. + +## Problem + +Each incoming socket connection (e.g. `cmux claude-hook` calls from Claude Code +hooks) spawns a new `NSThread` via `Thread.detachNewThread`. When the main +thread stalls — for example during a heavy SwiftUI render pass — handler threads +that call `@MainActor`-isolated methods block on `DispatchQueue.main.sync`. +Because thread creation is unbounded, these blocked threads pile up indefinitely. + +In a real incident this produced **150+ blocked threads** and a **10.93 GB** +memory footprint, making the app permanently unresponsive and unrecoverable +without a force kill. + +## Fix + +- Add a private concurrent `DispatchQueue` (`com.cmuxterm.socket-clients`) for + client handler dispatch. +- Gate handler execution behind a `DispatchSemaphore(value: 64)` to cap + concurrent handlers. +- Handlers that cannot acquire a slot within 30 seconds return `ERROR: Server busy` + and close the connection, applying backpressure to callers instead of + accumulating memory. +- The accept loop's own `Thread.detachNewThread` (single long-lived thread) is + unchanged. + +## Test plan + +- [ ] Verify normal `cmux` CLI commands still work (ping, report_*, etc.) +- [ ] Simulate main thread stall with a debug sleep and confirm thread count + stays bounded +- [ ] Confirm "Server busy" response when all slots are exhausted +- [ ] Run existing socket tests (`tests_v2/`) against a tagged debug build