Skip to content
Open

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

84 changes: 65 additions & 19 deletions Packages/OsaurusCore/Managers/TTSService.swift
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ public final class TTSService: ObservableObject {
// MARK: - Private state

private var manager: PocketTtsManager?
private var managerLanguage: TTSLanguage?
private var playbackTask: Task<Void, Never>?
private var initTask: Task<Void, Never>?

Expand All @@ -84,7 +85,11 @@ public final class TTSService: ObservableObject {

/// True when the model is fully loaded and ready to synthesize.
///
/// Readiness requires both `modelState == .ready` and a loaded manager whose
/// language equals the language in the stored TTS configuration. A manager
/// that was loaded for a different language pack is reported as not ready.
public var isModelReady: Bool {
    // `.ready` on its own is not sufficient: returning early on the state alone
    // would skip the language comparison and accept a stale manager.
    guard case .ready = modelState else { return false }
    return managerLanguage == TTSConfigurationStore.load().language
}

Expand All @@ -98,7 +103,7 @@ public final class TTSService: ObservableObject {
}

guard isModelReady else {
if Self.pocketTtsModelsExistOnDisk() {
if Self.pocketTtsModelsExistOnDisk(language: TTSConfigurationStore.load().language) {
// Models already downloaded; just load them into memory.
ensureModelLoaded()
} else {
Expand All @@ -119,7 +124,7 @@ public final class TTSService: ObservableObject {
/// `activeSpeakCallId` so the row spinner runs until audio drains
public func startToolPlayback(text: String, messageId: UUID, callId: String, voiceOverride: String? = nil) throws {
guard isModelReady else {
if Self.pocketTtsModelsExistOnDisk() {
if Self.pocketTtsModelsExistOnDisk(language: TTSConfigurationStore.load().language) {
ensureModelLoaded()
} else {
NotificationCenter.default.post(name: .openTTSSettingsRequested, object: nil)
Expand Down Expand Up @@ -150,16 +155,22 @@ public final class TTSService: ObservableObject {

/// Begin a background download/initialize. Safe to call multiple times.
public func ensureModelLoaded() {
if case .ready = modelState { return }
let config = TTSConfigurationStore.load()
if case .ready = modelState, managerLanguage == config.language { return }
if initTask != nil { return }

stop()
manager = nil
managerLanguage = nil
modelState = .downloading(fraction: nil)
let voice = TTSConfigurationStore.load().voice
let voice = config.voice
let language = config.language
initTask = Task { [weak self] in
do {
// Route through the downloader explicitly so we get progress callbacks.
// When models are already cached this returns nearly instantly.
_ = try await PocketTtsResourceDownloader.ensureModels(
language: language.fluidAudioLanguage,
directory: nil,
progressHandler: { progress in
Task { @MainActor in
Expand All @@ -176,11 +187,15 @@ public final class TTSService: ObservableObject {
}
)

let mgr = PocketTtsManager(defaultVoice: voice)
let mgr = PocketTtsManager(
defaultVoice: voice,
language: language.fluidAudioLanguage
)
try await mgr.initialize()
await MainActor.run {
guard let self else { return }
self.manager = mgr
self.managerLanguage = language
self.modelState = .ready
self.initTask = nil
}
Expand All @@ -198,24 +213,35 @@ public final class TTSService: ObservableObject {
/// Call this on app launch and when returning to the settings tab.
/// If models are already present, transitions to `.ready` after a fast local load.
/// When the models for the configured language are not on disk, any loaded
/// manager is dropped and the state is reset to `.notReady`.
public func refreshModelState() {
    let language = TTSConfigurationStore.load().language

    // `.ready` only counts when the loaded manager matches the configured
    // language; otherwise fall through so the right language pack is loaded.
    if case .ready = modelState, managerLanguage == language { return }

    // A load is already in flight; do not start a second one.
    guard initTask == nil else { return }

    guard Self.pocketTtsModelsExistOnDisk(language: language) else {
        // Nothing usable on disk for this language: stop playback and discard
        // whatever manager is currently held.
        stop()
        manager = nil
        managerLanguage = nil
        modelState = .notReady
        return
    }

    ensureModelLoaded()
}

private static func pocketTtsModelsExistOnDisk() -> Bool {
let home = FileManager.default.homeDirectoryForCurrentUser
let repoDir =
home
/// Builds the directory where PocketTTS model files for `language` are looked up.
///
/// The result has the shape
/// `<home>/.cache/fluidaudio/Models/<repo folder>/v2/<language raw value>`.
///
/// - Parameters:
///   - language: Language pack whose raw value names the innermost folder.
///   - homeDirectory: Root the path is resolved against. Defaults to the
///     current user's home directory; injectable so the layout can be checked
///     without touching the real home directory.
/// - Returns: The language-specific model cache directory URL.
nonisolated static func pocketTtsModelCacheDirectory(
    language: TTSLanguage,
    homeDirectory: URL = FileManager.default.homeDirectoryForCurrentUser
) -> URL {
    homeDirectory
        .appendingPathComponent(".cache", isDirectory: true)
        .appendingPathComponent("fluidaudio", isDirectory: true)
        .appendingPathComponent("Models", isDirectory: true)
        // The repo folder must appear exactly once. NOTE(review): assumes
        // `Repo.pocketTts.folderName` resolves to "pocket-tts" — confirm.
        .appendingPathComponent(Repo.pocketTts.folderName, isDirectory: true)
        .appendingPathComponent("v2", isDirectory: true)
        .appendingPathComponent(language.rawValue, isDirectory: true)
}

private static func pocketTtsModelsExistOnDisk(language: TTSLanguage) -> Bool {
let repoDir = pocketTtsModelCacheDirectory(language: language)
let required = ModelNames.PocketTTS.requiredModels
let fm = FileManager.default
return required.allSatisfy { fm.fileExists(atPath: repoDir.appendingPathComponent($0).path) }
Expand All @@ -224,15 +250,17 @@ public final class TTSService: ObservableObject {
// MARK: - Playback

private func startPlayback(text: String, messageId: UUID, voiceOverride: String? = nil) {
do {
try configureEngineIfNeeded()
} catch {
modelState = .failed(error.localizedDescription)
let config = TTSConfigurationStore.load()
guard let manager, managerLanguage == config.language else {
playingMessageId = nil
ensureModelLoaded()
return
}

guard let manager else {
do {
try configureEngineIfNeeded()
} catch {
modelState = .failed(error.localizedDescription)
playingMessageId = nil
return
}
Expand All @@ -241,7 +269,6 @@ public final class TTSService: ObservableObject {
pendingBufferCount = 0
playerNode.play()

let config = TTSConfigurationStore.load()
let trimmedOverride = voiceOverride?.trimmingCharacters(in: .whitespacesAndNewlines)
let voice = (trimmedOverride?.isEmpty == false ? trimmedOverride! : config.voice)
let temperature = Float(config.temperature)
Expand Down Expand Up @@ -332,6 +359,25 @@ public final class TTSService: ObservableObject {
}
}

private extension TTSLanguage {
    /// The `PocketTtsLanguage` case corresponding to this app-owned language ID.
    ///
    /// Osaurus persists its own raw IDs for migration stability, whereas
    /// FluidAudio binds the language to `PocketTtsManager`; this is the bridge
    /// between the two. The switch is exhaustive on purpose so a newly added
    /// `TTSLanguage` case fails to compile until it is mapped here.
    var fluidAudioLanguage: PocketTtsLanguage {
        switch self {
        case .english: .english
        case .french24L: .french24L
        case .german: .german
        case .german24L: .german24L
        case .italian: .italian
        case .italian24L: .italian24L
        case .portuguese: .portuguese
        case .portuguese24L: .portuguese24L
        case .spanish: .spanish
        case .spanish24L: .spanish24L
        }
    }
}

/// built-in PocketTTS voices (kyutai/pocket-tts on HuggingFace). shared by
/// the TTS settings tab and the per-agent voice picker.
public enum PocketTTSVoiceCatalog {
Expand Down
44 changes: 44 additions & 0 deletions Packages/OsaurusCore/Models/Voice/TTSConfiguration.swift
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,46 @@

import Foundation

/// App-owned identifiers for the PocketTTS language packs.
///
/// The raw values are what gets persisted, so stored preferences stay valid
/// even if FluidAudio renames its Swift symbols. Case order is significant:
/// it determines the order of `allCases`.
public enum TTSLanguage: String, Codable, CaseIterable, Identifiable, Sendable {
    case english = "english"
    case french24L = "french_24l"
    case german = "german"
    case german24L = "german_24l"
    case italian = "italian"
    case italian24L = "italian_24l"
    case portuguese = "portuguese"
    case portuguese24L = "portuguese_24l"
    case spanish = "spanish"
    case spanish24L = "spanish_24l"

    /// `Identifiable` conformance; the persisted raw value doubles as the ID.
    public var id: String { rawValue }

    /// Human-readable label for this language pack.
    public var displayName: String {
        switch self {
        case .english: "English"
        case .french24L: "French (24-layer)"
        case .german: "German"
        case .german24L: "German (24-layer)"
        case .italian: "Italian"
        case .italian24L: "Italian (24-layer)"
        case .portuguese: "Portuguese"
        case .portuguese24L: "Portuguese (24-layer)"
        case .spanish: "Spanish"
        case .spanish24L: "Spanish (24-layer)"
        }
    }
}

/// Configuration settings for PocketTTS text-to-speech.
public struct TTSConfiguration: Codable, Equatable, Sendable {
/// Master enable toggle. When false, speaker buttons are hidden from message cells.
public var enabled: Bool

/// PocketTTS language pack used for synthesis.
public var language: TTSLanguage

/// PocketTTS voice identifier.
public var voice: String

Expand All @@ -24,17 +59,26 @@ public struct TTSConfiguration: Codable, Equatable, Sendable {
let container = try decoder.container(keyedBy: CodingKeys.self)
let defaults = TTSConfiguration.default
self.enabled = try container.decodeIfPresent(Bool.self, forKey: .enabled) ?? defaults.enabled
if let rawLanguage = try container.decodeIfPresent(String.self, forKey: .language),
let language = TTSLanguage(rawValue: rawLanguage)
{
self.language = language
} else {
self.language = defaults.language
}
self.voice = try container.decodeIfPresent(String.self, forKey: .voice) ?? defaults.voice
self.temperature =
try container.decodeIfPresent(Double.self, forKey: .temperature) ?? defaults.temperature
}

/// Creates a configuration from explicit values; every argument has a default.
///
/// - Parameters:
///   - enabled: Master enable toggle. Defaults to `true`.
///   - language: PocketTTS language pack used for synthesis. Defaults to `.english`.
///   - voice: PocketTTS voice identifier. Defaults to `TTSConfiguration.defaultVoice`.
///   - temperature: Value stored in `temperature`. Defaults to `0.7`.
public init(
enabled: Bool = true,
language: TTSLanguage = .english,
voice: String = TTSConfiguration.defaultVoice,
temperature: Double = 0.7
) {
self.enabled = enabled
self.language = language
self.voice = voice
self.temperature = temperature
}
Expand Down
9 changes: 4 additions & 5 deletions Packages/OsaurusCore/Package.swift
Original file line number Diff line number Diff line change
Expand Up @@ -302,11 +302,10 @@ let package = Package(
url: "https://github.com/osaurus-ai/swift-transformers",
revision: "087a66b17e482220b94909c5cf98688383ae481a"
),
// FluidAudio 0.14.3 added a breaking `language:` parameter to TTS
// calls that osaurus's `TTSService` doesn't pass. Pinning to the
// last working version until osaurus catches up. Bumping requires
// a paired osaurus-side TTSService update.
.package(url: "https://github.com/FluidInference/FluidAudio.git", "0.14.0" ..< "0.14.2"),
// FluidAudio 0.14.3 introduced language-bound PocketTTS managers.
// Osaurus now persists the selected language and reloads PocketTTS
// when that language pack changes.
.package(url: "https://github.com/FluidInference/FluidAudio.git", "0.14.3" ..< "0.15.0"),
// Pinned by commit (was `branch: "main"`) — same reasoning as
// vmlx-swift-lm above.
.package(
Expand Down
95 changes: 95 additions & 0 deletions Packages/OsaurusCore/Tests/Voice/TTSConfigurationTests.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
//
// TTSConfigurationTests.swift
// osaurusTests
//

import Foundation
import Testing

@testable import OsaurusCore

/// Covers `TTSConfiguration` decoding and encoding, the `TTSLanguage` catalog,
/// and the PocketTTS model cache path built by `TTSService`.
struct TTSConfigurationTests {
    /// Decodes a `TTSConfiguration` from a JSON string.
    private func decodeConfiguration(from json: String) throws -> TTSConfiguration {
        try JSONDecoder().decode(TTSConfiguration.self, from: Data(json.utf8))
    }

    /// A payload written before the `language` key existed decodes as English.
    @Test func decodeLegacyConfigDefaultsToEnglish() throws {
        let legacy = try decodeConfiguration(
            from: """
                {
                    "enabled": true,
                    "voice": "alba",
                    "temperature": 0.7
                }
                """
        )

        #expect(legacy.enabled)
        #expect(legacy.voice == "alba")
        #expect(legacy.temperature == 0.7)
        #expect(legacy.language == .english)
    }

    /// An unrecognized language ID falls back to English without failing the
    /// decode or disturbing the other fields.
    @Test func decodeUnknownLanguageFallsBackToEnglish() throws {
        let unknown = try decodeConfiguration(
            from: """
                {
                    "enabled": true,
                    "language": "klingon",
                    "voice": "michael",
                    "temperature": 0.6
                }
                """
        )

        #expect(unknown.language == .english)
        #expect(unknown.voice == "michael")
    }

    /// Encoding writes the raw language ID, and decoding restores an equal value.
    @Test func roundTripPersistsSelectedLanguageRawValue() throws {
        let original = TTSConfiguration(
            enabled: true,
            language: .french24L,
            voice: "eve",
            temperature: 0.55
        )

        let encoded = try JSONEncoder().encode(original)
        let payload = try #require(JSONSerialization.jsonObject(with: encoded) as? [String: Any])
        let restored = try JSONDecoder().decode(TTSConfiguration.self, from: encoded)

        #expect(payload["language"] as? String == "french_24l")
        #expect(restored == original)
    }

    /// English comes first and every other language pack is present.
    @Test func languageCatalogIncludesPocketTTSLanguagePacks() {
        let catalog = TTSLanguage.allCases
        let expectedPacks: [TTSLanguage] = [
            .french24L,
            .german,
            .german24L,
            .italian,
            .italian24L,
            .portuguese,
            .portuguese24L,
            .spanish,
            .spanish24L,
        ]

        #expect(catalog.first == .english)
        for pack in expectedPacks {
            #expect(catalog.contains(pack))
        }
        #expect(TTSLanguage.french24L.displayName == "French (24-layer)")
    }

    /// The cache directory is resolved under the injected home directory.
    @Test func pocketTTSCacheDirectoryMatchesFluidAudioLayout() {
        let fakeHome = URL(fileURLWithPath: "/tmp/osaurus-home", isDirectory: true)
        let cacheDirectory = TTSService.pocketTtsModelCacheDirectory(
            language: .french24L,
            homeDirectory: fakeHome
        )

        #expect(
            cacheDirectory.path
                == "/tmp/osaurus-home/.cache/fluidaudio/Models/pocket-tts/v2/french_24l"
        )
    }
}
Loading
Loading