diff --git a/App/osaurus.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved b/App/osaurus.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved index 74c2599bc..4a16d5c29 100644 --- a/App/osaurus.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved +++ b/App/osaurus.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved @@ -42,8 +42,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/FluidInference/FluidAudio.git", "state" : { - "revision" : "d302273d49ef4d8914b27f20d342be482e8810f1", - "version" : "0.14.1" + "revision" : "ce59fb14b8b8978b196f6a34282e20ea6762d164", + "version" : "0.14.5" } }, { diff --git a/Packages/OsaurusCore/Managers/TTSService.swift b/Packages/OsaurusCore/Managers/TTSService.swift index df6665b04..35d1cfd89 100644 --- a/Packages/OsaurusCore/Managers/TTSService.swift +++ b/Packages/OsaurusCore/Managers/TTSService.swift @@ -61,6 +61,7 @@ public final class TTSService: ObservableObject { // MARK: - Private state private var manager: PocketTtsManager? + private var managerLanguage: TTSLanguage? private var playbackTask: Task? private var initTask: Task? @@ -84,7 +85,11 @@ public final class TTSService: ObservableObject { /// True when the model is fully loaded and ready to synthesize. public var isModelReady: Bool { - if case .ready = modelState { return true } + if case .ready = modelState, + managerLanguage == TTSConfigurationStore.load().language + { + return true + } return false } @@ -98,7 +103,7 @@ public final class TTSService: ObservableObject { } guard isModelReady else { - if Self.pocketTtsModelsExistOnDisk() { + if Self.pocketTtsModelsExistOnDisk(language: TTSConfigurationStore.load().language) { // Models already downloaded; just load them into memory. ensureModelLoaded() } else { @@ -119,7 +124,7 @@ public final class TTSService: ObservableObject { /// `activeSpeakCallId` so the row spinner runs until audio drains public func startToolPlayback(text: String, messageId: UUID, callId: String, voiceOverride: String? = nil) throws { guard isModelReady else { - if Self.pocketTtsModelsExistOnDisk() { + if Self.pocketTtsModelsExistOnDisk(language: TTSConfigurationStore.load().language) { ensureModelLoaded() } else { NotificationCenter.default.post(name: .openTTSSettingsRequested, object: nil) @@ -150,16 +155,22 @@ public final class TTSService: ObservableObject { /// Begin a background download/initialize. Safe to call multiple times. public func ensureModelLoaded() { - if case .ready = modelState { return } + let config = TTSConfigurationStore.load() + if case .ready = modelState, managerLanguage == config.language { return } if initTask != nil { return } + stop() + manager = nil + managerLanguage = nil modelState = .downloading(fraction: nil) - let voice = TTSConfigurationStore.load().voice + let voice = config.voice + let language = config.language initTask = Task { [weak self] in do { // Route through the downloader explicitly so we get progress callbacks. // When models are already cached this returns nearly instantly. _ = try await PocketTtsResourceDownloader.ensureModels( + language: language.fluidAudioLanguage, directory: nil, progressHandler: { progress in Task { @MainActor in @@ -176,11 +187,15 @@ public final class TTSService: ObservableObject { } ) - let mgr = PocketTtsManager(defaultVoice: voice) + let mgr = PocketTtsManager( + defaultVoice: voice, + language: language.fluidAudioLanguage + ) try await mgr.initialize() await MainActor.run { guard let self else { return } self.manager = mgr + self.managerLanguage = language self.modelState = .ready self.initTask = nil } @@ -198,24 +213,35 @@ public final class TTSService: ObservableObject { /// Call this on app launch and when returning to the settings tab. /// If models are already present, transitions to `.ready` after a fast local load. public func refreshModelState() { - if case .ready = modelState { return } + let language = TTSConfigurationStore.load().language + if case .ready = modelState, managerLanguage == language { return } if initTask != nil { return } - if Self.pocketTtsModelsExistOnDisk() { + if Self.pocketTtsModelsExistOnDisk(language: language) { ensureModelLoaded() } else { + stop() + manager = nil + managerLanguage = nil modelState = .notReady } } - private static func pocketTtsModelsExistOnDisk() -> Bool { - let home = FileManager.default.homeDirectoryForCurrentUser - let repoDir = - home + nonisolated static func pocketTtsModelCacheDirectory( + language: TTSLanguage, + homeDirectory: URL = FileManager.default.homeDirectoryForCurrentUser + ) -> URL { + homeDirectory .appendingPathComponent(".cache", isDirectory: true) .appendingPathComponent("fluidaudio", isDirectory: true) .appendingPathComponent("Models", isDirectory: true) - .appendingPathComponent("pocket-tts", isDirectory: true) + .appendingPathComponent(Repo.pocketTts.folderName, isDirectory: true) + .appendingPathComponent("v2", isDirectory: true) + .appendingPathComponent(language.rawValue, isDirectory: true) + } + + private static func pocketTtsModelsExistOnDisk(language: TTSLanguage) -> Bool { + let repoDir = pocketTtsModelCacheDirectory(language: language) let required = ModelNames.PocketTTS.requiredModels let fm = FileManager.default return required.allSatisfy { fm.fileExists(atPath: repoDir.appendingPathComponent($0).path) } @@ -224,15 +250,17 @@ public final class TTSService: ObservableObject { // MARK: - Playback private func startPlayback(text: String, messageId: UUID, voiceOverride: String? = nil) { - do { - try configureEngineIfNeeded() - } catch { - modelState = .failed(error.localizedDescription) + let config = TTSConfigurationStore.load() + guard let manager, managerLanguage == config.language else { playingMessageId = nil + ensureModelLoaded() return } - guard let manager else { + do { + try configureEngineIfNeeded() + } catch { + modelState = .failed(error.localizedDescription) playingMessageId = nil return } @@ -241,7 +269,6 @@ public final class TTSService: ObservableObject { pendingBufferCount = 0 playerNode.play() - let config = TTSConfigurationStore.load() let trimmedOverride = voiceOverride?.trimmingCharacters(in: .whitespacesAndNewlines) let voice = (trimmedOverride?.isEmpty == false ? trimmedOverride! : config.voice) let temperature = Float(config.temperature) @@ -332,6 +359,25 @@ public final class TTSService: ObservableObject { } } +private extension TTSLanguage { + /// FluidAudio binds language to `PocketTtsManager`, while Osaurus + /// persists raw IDs for migration stability. + var fluidAudioLanguage: PocketTtsLanguage { + switch self { + case .english: return .english + case .french24L: return .french24L + case .german: return .german + case .german24L: return .german24L + case .italian: return .italian + case .italian24L: return .italian24L + case .portuguese: return .portuguese + case .portuguese24L: return .portuguese24L + case .spanish: return .spanish + case .spanish24L: return .spanish24L + } + } +} + /// built-in PocketTTS voices (kyutai/pocket-tts on HuggingFace). shared by /// the TTS settings tab and the per-agent voice picker. public enum PocketTTSVoiceCatalog { diff --git a/Packages/OsaurusCore/Models/Voice/TTSConfiguration.swift b/Packages/OsaurusCore/Models/Voice/TTSConfiguration.swift index 5fcf785d8..d467985dc 100644 --- a/Packages/OsaurusCore/Models/Voice/TTSConfiguration.swift +++ b/Packages/OsaurusCore/Models/Voice/TTSConfiguration.swift @@ -7,11 +7,46 @@ import Foundation +/// Stable app-owned IDs for PocketTTS language packs so persisted +/// preferences do not depend on FluidAudio's Swift symbol names. +public enum TTSLanguage: String, Codable, CaseIterable, Identifiable, Sendable { + case english + case french24L = "french_24l" + case german + case german24L = "german_24l" + case italian + case italian24L = "italian_24l" + case portuguese + case portuguese24L = "portuguese_24l" + case spanish + case spanish24L = "spanish_24l" + + public var id: String { rawValue } + + public var displayName: String { + switch self { + case .english: return "English" + case .french24L: return "French (24-layer)" + case .german: return "German" + case .german24L: return "German (24-layer)" + case .italian: return "Italian" + case .italian24L: return "Italian (24-layer)" + case .portuguese: return "Portuguese" + case .portuguese24L: return "Portuguese (24-layer)" + case .spanish: return "Spanish" + case .spanish24L: return "Spanish (24-layer)" + } + } +} + /// Configuration settings for PocketTTS text-to-speech. public struct TTSConfiguration: Codable, Equatable, Sendable { /// Master enable toggle. When false, speaker buttons are hidden from message cells. public var enabled: Bool + /// PocketTTS language pack used for synthesis. + public var language: TTSLanguage + /// PocketTTS voice identifier. public var voice: String @@ -24,6 +59,13 @@ public struct TTSConfiguration: Codable, Equatable, Sendable { let container = try decoder.container(keyedBy: CodingKeys.self) let defaults = TTSConfiguration.default self.enabled = try container.decodeIfPresent(Bool.self, forKey: .enabled) ?? defaults.enabled + if let rawLanguage = try container.decodeIfPresent(String.self, forKey: .language), + let language = TTSLanguage(rawValue: rawLanguage) + { + self.language = language + } else { + self.language = defaults.language + } self.voice = try container.decodeIfPresent(String.self, forKey: .voice) ?? defaults.voice self.temperature = try container.decodeIfPresent(Double.self, forKey: .temperature) ?? defaults.temperature @@ -31,10 +73,12 @@ public struct TTSConfiguration: Codable, Equatable, Sendable { public init( enabled: Bool = true, + language: TTSLanguage = .english, voice: String = TTSConfiguration.defaultVoice, temperature: Double = 0.7 ) { self.enabled = enabled + self.language = language self.voice = voice self.temperature = temperature } diff --git a/Packages/OsaurusCore/Package.swift b/Packages/OsaurusCore/Package.swift index 3b06b5ef8..8ccd3f12b 100644 --- a/Packages/OsaurusCore/Package.swift +++ b/Packages/OsaurusCore/Package.swift @@ -36,11 +36,10 @@ let package = Package( url: "https://github.com/osaurus-ai/vmlx-swift", revision: "4356ef1985344757a4326dd08ba27b5cbff230ab" ), - // FluidAudio 0.14.3 added a breaking `language:` parameter to TTS - // calls that osaurus's `TTSService` doesn't pass. Pinning to the - // last working version until osaurus catches up. Bumping requires - // a paired osaurus-side TTSService update. - .package(url: "https://github.com/FluidInference/FluidAudio.git", "0.14.0" ..< "0.14.2"), + // FluidAudio 0.14.3 introduced language-bound PocketTTS managers. + // Osaurus persists the selected language and reloads PocketTTS when + // that language pack changes. + .package(url: "https://github.com/FluidInference/FluidAudio.git", "0.14.3" ..< "0.15.0"), // Pinned by commit (was `branch: "main"`) — same reasoning as the // consolidated vmlx-swift pin above. .package( diff --git a/Packages/OsaurusCore/Tests/Voice/TTSConfigurationTests.swift b/Packages/OsaurusCore/Tests/Voice/TTSConfigurationTests.swift new file mode 100644 index 000000000..a13829c5c --- /dev/null +++ b/Packages/OsaurusCore/Tests/Voice/TTSConfigurationTests.swift @@ -0,0 +1,95 @@ +// +// TTSConfigurationTests.swift +// osaurusTests +// + +import Foundation +import Testing + +@testable import OsaurusCore + +struct TTSConfigurationTests { + @Test func decodeLegacyConfigDefaultsToEnglish() throws { + let json = """ + { + "enabled": true, + "voice": "alba", + "temperature": 0.7 + } + """ + + let decoded = try JSONDecoder().decode( + TTSConfiguration.self, + from: Data(json.utf8) + ) + + #expect(decoded.enabled) + #expect(decoded.voice == "alba") + #expect(decoded.temperature == 0.7) + #expect(decoded.language == .english) + } + + @Test func decodeUnknownLanguageFallsBackToEnglish() throws { + let json = """ + { + "enabled": true, + "language": "klingon", + "voice": "michael", + "temperature": 0.6 + } + """ + + let decoded = try JSONDecoder().decode( + TTSConfiguration.self, + from: Data(json.utf8) + ) + + #expect(decoded.language == .english) + #expect(decoded.voice == "michael") + } + + @Test func roundTripPersistsSelectedLanguageRawValue() throws { + let config = TTSConfiguration( + enabled: true, + language: .french24L, + voice: "eve", + temperature: 0.55 + ) + + let data = try JSONEncoder().encode(config) + let object = try #require(JSONSerialization.jsonObject(with: data) as? [String: Any]) + let decoded = try JSONDecoder().decode(TTSConfiguration.self, from: data) + + #expect(object["language"] as? String == "french_24l") + #expect(decoded == config) + } + + @Test func languageCatalogIncludesPocketTTSLanguagePacks() { + let languages = TTSLanguage.allCases + + #expect(languages.first == .english) + #expect(languages.contains(.french24L)) + #expect(languages.contains(.german)) + #expect(languages.contains(.german24L)) + #expect(languages.contains(.italian)) + #expect(languages.contains(.italian24L)) + #expect(languages.contains(.portuguese)) + #expect(languages.contains(.portuguese24L)) + #expect(languages.contains(.spanish)) + #expect(languages.contains(.spanish24L)) + #expect(TTSLanguage.french24L.displayName == "French (24-layer)") + } + + @Test func pocketTTSCacheDirectoryMatchesFluidAudioLayout() { + let home = URL(fileURLWithPath: "/tmp/osaurus-home", isDirectory: true) + let directory = TTSService.pocketTtsModelCacheDirectory( + language: .french24L, + homeDirectory: home + ) + + #expect( + directory.path + == "/tmp/osaurus-home/.cache/fluidaudio/Models/pocket-tts/v2/french_24l" + ) + } +} diff --git a/Packages/OsaurusCore/Views/Voice/TTSModeSettingsTab.swift b/Packages/OsaurusCore/Views/Voice/TTSModeSettingsTab.swift index 55f7031ac..f13c50988 100644 --- a/Packages/OsaurusCore/Views/Voice/TTSModeSettingsTab.swift +++ b/Packages/OsaurusCore/Views/Voice/TTSModeSettingsTab.swift @@ -3,7 +3,7 @@ // osaurus // // Settings UI for text-to-speech (PocketTTS). -// Toggle TTS, pick voice/temperature, download model, preview. +// Toggle TTS, pick language/voice/temperature, download model, preview. // import SwiftUI @@ -21,6 +21,14 @@ struct TTSModeSettingsTab: View { PocketTTSVoiceCatalog.displayName(for: voice) } + private func displayName(for language: TTSLanguage) -> String { + language.displayName + } + + private var languageMenuOptions: [TTSLanguage] { + TTSLanguage.allCases + } + private var voiceMenuOptions: [String] { let builtIn = PocketTTSVoiceCatalog.availableVoices let current = config.voice.trimmingCharacters(in: .whitespacesAndNewlines) @@ -72,6 +80,7 @@ struct TTSModeSettingsTab: View { } .onReceive(NotificationCenter.default.publisher(for: .ttsConfigurationChanged)) { _ in loadSettings() + ttsService.refreshModelState() } } @@ -122,7 +131,7 @@ struct TTSModeSettingsTab: View { .foregroundColor(theme.accentColor) Text( - "Powered by FluidAudio PocketTTS. English only. Streams audio as it's synthesized.", + "Powered by FluidAudio PocketTTS. Pick the language pack that matches the reply text.", bundle: .module ) .font(.system(size: 12)) @@ -190,7 +199,7 @@ struct TTSModeSettingsTab: View { private var modelStatusText: String { switch ttsService.modelState { - case .notReady: return L("Not downloaded — about 700 MB") + case .notReady: return L("Selected language pack not downloaded") case .downloading(let fraction): if let fraction { return String(format: "%@ %d%%", L("Downloading"), Int(fraction * 100)) @@ -274,6 +283,25 @@ struct TTSModeSettingsTab: View { .font(.system(size: 14, weight: .semibold)) .foregroundColor(theme.primaryText) + HStack { + Text("Language", bundle: .module) + .font(.system(size: 12)) + .foregroundColor(theme.secondaryText) + Spacer() + Picker("", selection: $config.language) { + ForEach(languageMenuOptions) { language in + Text(displayName(for: language)).tag(language) + } + } + .labelsHidden() + .pickerStyle(MenuPickerStyle()) + .frame(maxWidth: 180) + .onChange(of: config.language) { _, _ in + saveSettings() + ttsService.refreshModelState() + } + } + HStack { Text("Voice", bundle: .module) .font(.system(size: 12)) diff --git a/osaurus.xcworkspace/xcshareddata/swiftpm/Package.resolved b/osaurus.xcworkspace/xcshareddata/swiftpm/Package.resolved index 6fd24bb02..c0bb7ae25 100644 --- a/osaurus.xcworkspace/xcshareddata/swiftpm/Package.resolved +++ b/osaurus.xcworkspace/xcshareddata/swiftpm/Package.resolved @@ -42,8 +42,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/FluidInference/FluidAudio.git", "state" : { - "revision" : "d302273d49ef4d8914b27f20d342be482e8810f1", - "version" : "0.14.1" + "revision" : "ce59fb14b8b8978b196f6a34282e20ea6762d164", + "version" : "0.14.5" } }, {