diff --git a/Type4Me/Audio/AudioCaptureEngine.swift b/Type4Me/Audio/AudioCaptureEngine.swift index 2397dea..c587b11 100644 --- a/Type4Me/Audio/AudioCaptureEngine.swift +++ b/Type4Me/Audio/AudioCaptureEngine.swift @@ -1,5 +1,14 @@ @preconcurrency import AVFoundation +protocol AudioCapturing: AnyObject, Sendable { + var onAudioChunk: ((Data) -> Void)? { get set } + var onAudioLevel: ((Float) -> Void)? { get set } + + func warmUp() + func start() throws + func stop() +} + enum AudioCaptureError: Error, LocalizedError { case converterCreationFailed case microphonePermissionDenied @@ -17,7 +26,7 @@ enum AudioCaptureError: Error, LocalizedError { } } -final class AudioCaptureEngine: NSObject, @unchecked Sendable, AVCaptureAudioDataOutputSampleBufferDelegate { +final class AudioCaptureEngine: NSObject, @unchecked Sendable, AVCaptureAudioDataOutputSampleBufferDelegate, AudioCapturing { // MARK: - Static properties @@ -128,11 +137,16 @@ final class AudioCaptureEngine: NSObject, @unchecked Sendable, AVCaptureAudioDat } func stop() { - captureSession?.stopRunning() - captureSession = nil - converter = nil - levelCounter = 0 - flushRemaining() + outputQueue.sync { + // Drain any pending AVCapture callbacks on the delegate queue before + // flushing the tail buffer, otherwise the last spoken frames can be + // stranded behind stop() and never reach the session pipeline. 
+ captureSession?.stopRunning() + captureSession = nil + converter = nil + levelCounter = 0 + flushRemaining() + } NSLog("[Audio] Capture session stopped") } diff --git a/Type4Me/Session/RecognitionSession.swift b/Type4Me/Session/RecognitionSession.swift index cbc9fa0..3f57434 100644 --- a/Type4Me/Session/RecognitionSession.swift +++ b/Type4Me/Session/RecognitionSession.swift @@ -2,6 +2,8 @@ import AppKit import os actor RecognitionSession { + private static let stopCaptureGracePeriod: Duration = .milliseconds(350) + private static let stopSoundRestoreDelay: Duration = .milliseconds(50) // MARK: - State @@ -30,9 +32,9 @@ actor RecognitionSession { // MARK: - Dependencies - private let audioEngine = AudioCaptureEngine() - private let injectionEngine = TextInjectionEngine() - let historyStore = HistoryStore() + private let audioEngine: any AudioCapturing + private let injectionEngine: TextInjectionEngine + let historyStore: HistoryStore private var asrClient: (any SpeechRecognizer)? private let logger = Logger( @@ -40,6 +42,16 @@ actor RecognitionSession { category: "RecognitionSession" ) + init( + audioEngine: any AudioCapturing = AudioCaptureEngine(), + injectionEngine: TextInjectionEngine = TextInjectionEngine(), + historyStore: HistoryStore = HistoryStore() + ) { + self.audioEngine = audioEngine + self.injectionEngine = injectionEngine + self.historyStore = historyStore + } + /// Return the appropriate LLM client for the currently selected provider. private func currentLLMClient() -> any LLMClient { let provider = KeychainService.selectedLLMProvider @@ -368,15 +380,23 @@ actor RecognitionSession { } let stopT0 = ContinuousClock.now - SystemVolumeManager.restore() // Restore before stop sound plays - try? 
await Task.sleep(for: .milliseconds(50)) // Let OS apply volume change - SoundFeedback.playStop() state = .finishing + // Give CoreAudio a brief grace window to deliver frames that were spoken + // just before the stop hotkey event but have not yet surfaced as callbacks. + try? await Task.sleep(for: Self.stopCaptureGracePeriod) + // Stop capture first so flushRemaining() can emit the tail audio chunk. audioEngine.stop() - audioEngine.onAudioChunk = nil await finishAudioChunkPipeline() + audioEngine.onAudioChunk = nil + audioEngine.onAudioLevel = nil + + // Stop feedback should play only after the microphone is fully stopped, + // otherwise the end sound itself can mask the user's trailing syllables. + SystemVolumeManager.restore() + try? await Task.sleep(for: Self.stopSoundRestoreDelay) + SoundFeedback.playStop() DebugFileLogger.log("stop: audio stopped +\(ContinuousClock.now - stopT0)") guard sessionGeneration == myGeneration else { DebugFileLogger.log("stopRecording: zombie after audio pipeline, bailing") @@ -391,12 +411,14 @@ actor RecognitionSession { let provider = KeychainService.selectedASRProvider let canEarlyLLM = ASRProviderRegistry.capabilities(for: provider).isStreaming var earlyLLMTask: Task? + var earlyLLMInputText: String? if needsLLM && canEarlyLLM { var earlyText = currentTranscript.composedText .trimmingCharacters(in: .whitespacesAndNewlines) earlyText = SnippetStorage.applyEffective(to: earlyText) DebugFileLogger.log("stop: needsLLM=true mode=\(currentMode.name) text=\(earlyText.count)chars specMatch=\(earlyText == speculativeLLMText)") if !earlyText.isEmpty { + earlyLLMInputText = earlyText if earlyText == speculativeLLMText, let specTask = speculativeLLMTask { // Speculative LLM matches — reuse (may already be done!) earlyLLMTask = specTask @@ -426,55 +448,50 @@ actor RecognitionSession { } } - // ASR teardown: streaming providers can skip endAudio in LLM modes since - // we already have text. Batch providers (e.g. 
OpenAI REST) MUST await endAudio - // because that's where the actual recognition happens. + // ASR teardown: correctness matters more than latency here. Always await + // final ASR teardown so trailing words that were captured before the stop + // hotkey still have a chance to become part of the final transcript. + // Batch providers (e.g. OpenAI REST) MUST await endAudio because that's + // where the actual recognition happens. let providerIsStreaming = ASRProviderRegistry.capabilities(for: provider).isStreaming if let client = asrClient { - if needsLLM && earlyLLMTask != nil && providerIsStreaming { - // Fast path (streaming only): just disconnect, skip the 2-3s finalization. - eventConsumptionTask?.cancel() - await client.disconnect() - DebugFileLogger.log("stop: ASR fast-disconnect +\(ContinuousClock.now - stopT0)") - } else { - // Full teardown: batch providers get a longer timeout for the HTTP round-trip. - let endAudioTimeout: Duration = providerIsStreaming ? .seconds(3) : .seconds(60) - do { - try await withThrowingTaskGroup(of: Void.self) { group in - group.addTask { try await client.endAudio() } - group.addTask { - try await Task.sleep(for: endAudioTimeout) - throw CancellationError() - } - try await group.next() - group.cancelAll() + // Full teardown: batch providers get a longer timeout for the HTTP round-trip. + let endAudioTimeout: Duration = providerIsStreaming ? .seconds(3) : .seconds(60) + do { + try await withThrowingTaskGroup(of: Void.self) { group in + group.addTask { try await client.endAudio() } + group.addTask { + try await Task.sleep(for: endAudioTimeout) + throw CancellationError() } - } catch { - NSLog("[Session] endAudio timed out or failed: %@", String(describing: error)) - DebugFileLogger.log("endAudio timeout/error: \(error)") + try await group.next() + group.cancelAll() } - let drainTimeout: Duration = providerIsStreaming ? 
.seconds(2) : .seconds(5) - if let task = eventConsumptionTask { - let streamDrained = await withTaskGroup(of: Bool.self) { group in - group.addTask { - await task.value - return true - } - group.addTask { - try? await Task.sleep(for: drainTimeout) - return false - } - let first = await group.next() ?? true - group.cancelAll() - return first + } catch { + NSLog("[Session] endAudio timed out or failed: %@", String(describing: error)) + DebugFileLogger.log("endAudio timeout/error: \(error)") + } + let drainTimeout: Duration = providerIsStreaming ? .seconds(2) : .seconds(5) + if let task = eventConsumptionTask { + let streamDrained = await withTaskGroup(of: Bool.self) { group in + group.addTask { + await task.value + return true } - if !streamDrained { - task.cancel() - DebugFileLogger.log("event stream drain timeout; eventConsumptionTask cancelled") + group.addTask { + try? await Task.sleep(for: drainTimeout) + return false } + let first = await group.next() ?? true + group.cancelAll() + return first + } + if !streamDrained { + task.cancel() + DebugFileLogger.log("event stream drain timeout; eventConsumptionTask cancelled") } - await client.disconnect() } + await client.disconnect() } eventConsumptionTask = nil asrClient = nil @@ -500,7 +517,8 @@ actor RecognitionSession { // LLM post-processing: prefer early result (fired at stop time), // fall back to synchronous call for very short recordings where // no streaming text was available yet. 
- if let earlyTask = earlyLLMTask { + if let earlyTask = earlyLLMTask, + earlyLLMInputText == finalText { state = .postProcessing DebugFileLogger.log("stop: awaiting early LLM result +\(ContinuousClock.now - stopT0)") let earlyResult = await earlyTask.value @@ -518,6 +536,9 @@ actor RecognitionSession { } } else if needsLLM { state = .postProcessing + if let earlyLLMInputText, earlyLLMInputText != finalText { + DebugFileLogger.log("stop: final transcript changed after stop, discarding stale early LLM result") + } if let llmConfig = KeychainService.loadLLMConfig() { DebugFileLogger.log("stop: sync LLM firing mode=\(currentMode.name) model=\(llmConfig.model) with \(finalText.count) chars") do { @@ -716,6 +737,22 @@ actor RecognitionSession { audioChunkSenderTask = nil } + func prepareForStopTesting( + client: any SpeechRecognizer, + mode: ProcessingMode = .direct, + transcript: RecognitionTranscript = .empty + ) { + asrClient = client + currentMode = mode + currentTranscript = transcript + state = .recording + + let chunkContinuation = setupAudioChunkPipeline() + audioEngine.onAudioChunk = { data in + chunkContinuation.yield(data) + } + } + private func markReadyIfNeeded() { guard !hasEmittedReadyForCurrentSession else { return } hasEmittedReadyForCurrentSession = true @@ -804,9 +841,9 @@ actor RecognitionSession { resetSpeculativeLLM() audioEngine.stop() + await finishAudioChunkPipeline(timeout: .milliseconds(100)) audioEngine.onAudioChunk = nil audioEngine.onAudioLevel = nil - await finishAudioChunkPipeline(timeout: .milliseconds(100)) if let client = asrClient { Task { await client.disconnect() } // fire-and-forget: don't block reset on WebSocket teardown diff --git a/Type4MeTests/RecognitionSessionTests.swift b/Type4MeTests/RecognitionSessionTests.swift index 3bf0ccf..5e65aec 100644 --- a/Type4MeTests/RecognitionSessionTests.swift +++ b/Type4MeTests/RecognitionSessionTests.swift @@ -50,4 +50,105 @@ final class RecognitionSessionTests: XCTestCase { let mode = 
await session.currentModeForTesting() XCTAssertEqual(mode.id, ProcessingMode.directId) } + + func testStopRecordingWaitsForTailChunkBeforeEndAudio() async throws { + KeychainService.selectedASRProvider = .volcano + + let tempDB = URL(fileURLWithPath: NSTemporaryDirectory()) + .appendingPathComponent(UUID().uuidString) + .appendingPathExtension("sqlite") + let audioEngine = FakeAudioEngine(tailChunk: Data([1, 2, 3, 4])) + let recognizer = FakeSpeechRecognizer() + let session = RecognitionSession( + audioEngine: audioEngine, + historyStore: HistoryStore(path: tempDB.path) + ) + + await session.prepareForStopTesting(client: recognizer) + + let stopTask = Task { + await session.stopRecording() + } + + try await Task.sleep(for: .milliseconds(100)) + let didCallEndAudio = await recognizer.didCallEndAudio + let sentAudioBeforeResume = await recognizer.sentAudio + XCTAssertFalse(didCallEndAudio) + XCTAssertEqual(sentAudioBeforeResume, []) + + await recognizer.resumeSendAudio() + await stopTask.value + + let sentAudioAfterResume = await recognizer.sentAudio + let callOrder = await recognizer.callOrder + XCTAssertEqual(sentAudioAfterResume, [Data([1, 2, 3, 4])]) + XCTAssertEqual(callOrder, ["sendAudio", "endAudio", "disconnect"]) + } +} + +private final class FakeAudioEngine: AudioCapturing, @unchecked Sendable { + var onAudioChunk: ((Data) -> Void)? + var onAudioLevel: ((Float) -> Void)? + + private let tailChunk: Data + + init(tailChunk: Data) { + self.tailChunk = tailChunk + } + + func warmUp() {} + func start() throws {} + + func stop() { + onAudioChunk?(tailChunk) + } +} + +private actor FakeSpeechRecognizer: SpeechRecognizer { + private var sendContinuation: CheckedContinuation<Void, Never>? 
+ private var shouldResumeImmediately = false + private(set) var sentAudio: [Data] = [] + private(set) var callOrder: [String] = [] + + var didCallEndAudio: Bool { + callOrder.contains("endAudio") + } + + var events: AsyncStream<ASREvent> { + let (stream, continuation) = AsyncStream<ASREvent>.makeStream() + continuation.finish() + return stream + } + + func connect(config: any ASRProviderConfig, options: ASRRequestOptions) async throws {} + + func sendAudio(_ data: Data) async throws { + callOrder.append("sendAudio") + if shouldResumeImmediately { + shouldResumeImmediately = false + sentAudio.append(data) + return + } + await withCheckedContinuation { continuation in + sendContinuation = continuation + } + sentAudio.append(data) + } + + func endAudio() async throws { + callOrder.append("endAudio") + } + + func disconnect() async { + callOrder.append("disconnect") + } + + func resumeSendAudio() { + if let sendContinuation { + sendContinuation.resume() + self.sendContinuation = nil + } else { + shouldResumeImmediately = true + } + } }