diff --git a/app/src/components/Generation/FloatingGenerateBox.tsx b/app/src/components/Generation/FloatingGenerateBox.tsx index d4ab574d..8a8512f9 100644 --- a/app/src/components/Generation/FloatingGenerateBox.tsx +++ b/app/src/components/Generation/FloatingGenerateBox.tsx @@ -13,7 +13,7 @@ import { } from '@/components/ui/select'; import { Textarea } from '@/components/ui/textarea'; import { useToast } from '@/components/ui/use-toast'; -import { LANGUAGE_OPTIONS } from '@/lib/constants/languages'; +import { getLanguageOptionsForEngine } from '@/lib/constants/languages'; import { useGenerationForm } from '@/lib/hooks/useGenerationForm'; import { useProfile, useProfiles } from '@/lib/hooks/useProfiles'; import { useAddStoryItem, useStory } from '@/lib/hooks/useStories'; @@ -381,25 +381,30 @@ export function FloatingGenerateBox({ ( - - - - - )} + render={({ field }) => { + const engineLangs = getLanguageOptionsForEngine( + form.watch('engine') || 'qwen', + ); + return ( + + + + + ); + }} /> @@ -409,13 +414,19 @@ export function FloatingGenerateBox({ ? 'luxtts' : form.watch('engine') === 'chatterbox' ? 'chatterbox' - : `qwen:${form.watch('modelSize') || '1.7B'}` + : form.watch('engine') === 'chatterbox_turbo' + ? 
'chatterbox_turbo' + : `qwen:${form.watch('modelSize') || '1.7B'}` } onValueChange={(value) => { if (value === 'luxtts') { form.setValue('engine', 'luxtts'); + form.setValue('language', 'en'); } else if (value === 'chatterbox') { form.setValue('engine', 'chatterbox'); + } else if (value === 'chatterbox_turbo') { + form.setValue('engine', 'chatterbox_turbo'); + form.setValue('language', 'en'); } else { const [, modelSize] = value.split(':'); form.setValue('engine', 'qwen'); @@ -441,6 +452,12 @@ export function FloatingGenerateBox({ Chatterbox + + Chatterbox Turbo + diff --git a/app/src/components/Generation/GenerationForm.tsx b/app/src/components/Generation/GenerationForm.tsx index 26fd13e3..a3c96cbc 100644 --- a/app/src/components/Generation/GenerationForm.tsx +++ b/app/src/components/Generation/GenerationForm.tsx @@ -19,7 +19,7 @@ import { SelectValue, } from '@/components/ui/select'; import { Textarea } from '@/components/ui/textarea'; -import { LANGUAGE_OPTIONS } from '@/lib/constants/languages'; +import { getLanguageOptionsForEngine } from '@/lib/constants/languages'; import { useGenerationForm } from '@/lib/hooks/useGenerationForm'; import { useProfile } from '@/lib/hooks/useProfiles'; import { useUIStore } from '@/stores/uiStore'; @@ -109,13 +109,19 @@ export function GenerationForm() { ? 'luxtts' : form.watch('engine') === 'chatterbox' ? 'chatterbox' - : `qwen:${form.watch('modelSize') || '1.7B'}` + : form.watch('engine') === 'chatterbox_turbo' + ? 
'chatterbox_turbo' + : `qwen:${form.watch('modelSize') || '1.7B'}` } onValueChange={(value) => { if (value === 'luxtts') { form.setValue('engine', 'luxtts'); + form.setValue('language', 'en'); } else if (value === 'chatterbox') { form.setValue('engine', 'chatterbox'); + } else if (value === 'chatterbox_turbo') { + form.setValue('engine', 'chatterbox_turbo'); + form.setValue('language', 'en'); } else { const [, modelSize] = value.split(':'); form.setValue('engine', 'qwen'); @@ -133,40 +139,46 @@ export function GenerationForm() { Qwen3-TTS 0.6B LuxTTS Chatterbox + Chatterbox Turbo {form.watch('engine') === 'luxtts' ? 'Fast, English-focused' : form.watch('engine') === 'chatterbox' - ? 'Multilingual, incl. Hebrew' - : 'Multi-language, two sizes'} + ? '23 languages, incl. Hebrew' + : form.watch('engine') === 'chatterbox_turbo' + ? 'English, [laugh] [cough] tags' + : 'Multi-language, two sizes'} ( - - Language - - - - )} + render={({ field }) => { + const engineLangs = getLanguageOptionsForEngine(form.watch('engine') || 'qwen'); + return ( + + Language + + + + ); + }} /> = { + qwen: ['zh', 'en', 'ja', 'ko', 'de', 'fr', 'ru', 'pt', 'es', 'it'], + luxtts: ['en'], + chatterbox: [ + 'ar', + 'da', + 'de', + 'el', + 'en', + 'es', + 'fi', + 'fr', + 'he', + 'hi', + 'it', + 'ja', + 'ko', + 'ms', + 'nl', + 'no', + 'pl', + 'pt', + 'ru', + 'sv', + 'sw', + 'tr', + 'zh', + ], + chatterbox_turbo: ['en'], +} as const; -export const LANGUAGE_CODES = Object.keys(SUPPORTED_LANGUAGES) as LanguageCode[]; +/** Helper: get language options for a given engine. */ +export function getLanguageOptionsForEngine(engine: string) { + const codes = ENGINE_LANGUAGES[engine] ?? 
ENGINE_LANGUAGES.qwen; + return codes.map((code) => ({ + value: code, + label: ALL_LANGUAGES[code], + })); +} +// ── Backwards-compatible exports used elsewhere ────────────────────── +export const SUPPORTED_LANGUAGES = ALL_LANGUAGES; +export const LANGUAGE_CODES = Object.keys(ALL_LANGUAGES) as LanguageCode[]; export const LANGUAGE_OPTIONS = LANGUAGE_CODES.map((code) => ({ value: code, - label: SUPPORTED_LANGUAGES[code], + label: ALL_LANGUAGES[code], })); diff --git a/app/src/lib/hooks/useGenerationForm.ts b/app/src/lib/hooks/useGenerationForm.ts index ec5b9d6a..5a83ce41 100644 --- a/app/src/lib/hooks/useGenerationForm.ts +++ b/app/src/lib/hooks/useGenerationForm.ts @@ -16,7 +16,7 @@ const generationSchema = z.object({ seed: z.number().int().optional(), modelSize: z.enum(['1.7B', '0.6B']).optional(), instruct: z.string().max(500).optional(), - engine: z.enum(['qwen', 'luxtts', 'chatterbox']).optional(), + engine: z.enum(['qwen', 'luxtts', 'chatterbox', 'chatterbox_turbo']).optional(), }); export type GenerationFormValues = z.infer; @@ -75,15 +75,19 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) { ? 'luxtts' : engine === 'chatterbox' ? 'chatterbox-tts' - : `qwen-tts-${data.modelSize}`; + : engine === 'chatterbox_turbo' + ? 'chatterbox-turbo' + : `qwen-tts-${data.modelSize}`; const displayName = engine === 'luxtts' ? 'LuxTTS' : engine === 'chatterbox' ? 'Chatterbox TTS' - : data.modelSize === '1.7B' - ? 'Qwen TTS 1.7B' - : 'Qwen TTS 0.6B'; + : engine === 'chatterbox_turbo' + ? 'Chatterbox Turbo' + : data.modelSize === '1.7B' + ? 
'Qwen TTS 1.7B' + : 'Qwen TTS 0.6B'; try { const modelStatus = await apiClient.getModelStatus(); diff --git a/backend/backends/__init__.py b/backend/backends/__init__.py index a7b4d54c..f120e6ec 100644 --- a/backend/backends/__init__.py +++ b/backend/backends/__init__.py @@ -122,6 +122,7 @@ def is_loaded(self) -> bool: "qwen": "Qwen TTS", "luxtts": "LuxTTS", "chatterbox": "Chatterbox TTS", + "chatterbox_turbo": "Chatterbox Turbo", } @@ -171,6 +172,9 @@ def get_tts_backend_for_engine(engine: str) -> TTSBackend: elif engine == "chatterbox": from .chatterbox_backend import ChatterboxTTSBackend backend = ChatterboxTTSBackend() + elif engine == "chatterbox_turbo": + from .chatterbox_turbo_backend import ChatterboxTurboTTSBackend + backend = ChatterboxTurboTTSBackend() else: raise ValueError(f"Unknown TTS engine: {engine}. Supported: {list(TTS_ENGINES.keys())}") diff --git a/backend/backends/chatterbox_turbo_backend.py b/backend/backends/chatterbox_turbo_backend.py new file mode 100644 index 00000000..16bb5d70 --- /dev/null +++ b/backend/backends/chatterbox_turbo_backend.py @@ -0,0 +1,307 @@ +""" +Chatterbox Turbo TTS backend implementation. + +Wraps ChatterboxTurboTTS from chatterbox-tts for fast, English-only +voice cloning with paralinguistic tag support ([laugh], [cough], etc.). +Forces CPU on macOS due to known MPS tensor issues. +""" + +import asyncio +import logging +import platform +import threading +from pathlib import Path +from typing import ClassVar, List, Optional, Tuple + +import numpy as np + +from . 
import TTSBackend +from ..utils.audio import normalize_audio, load_audio +from ..utils.progress import get_progress_manager +from ..utils.tasks import get_task_manager + +logger = logging.getLogger(__name__) + +CHATTERBOX_TURBO_HF_REPO = "ResembleAI/chatterbox-turbo" + +# Files that must be present for the turbo model +_TURBO_WEIGHT_FILES = [ + "t3_turbo_v1.safetensors", + "s3gen_meanflow.safetensors", + "ve.safetensors", +] + + +class ChatterboxTurboTTSBackend: + """Chatterbox Turbo TTS backend — fast, English-only, with paralinguistic tags.""" + + # Class-level lock for torch.load monkey-patching + _load_lock: ClassVar[threading.Lock] = threading.Lock() + + def __init__(self): + self.model = None + self.model_size = "default" + self._device = None + self._model_load_lock = asyncio.Lock() + + def _get_device(self) -> str: + """Get the best available device. Forces CPU on macOS (MPS issue).""" + if platform.system() == "Darwin": + return "cpu" + try: + import torch + + if torch.cuda.is_available(): + return "cuda" + except ImportError: + pass + return "cpu" + + def is_loaded(self) -> bool: + return self.model is not None + + def _get_model_path(self, model_size: str = "default") -> str: + return CHATTERBOX_TURBO_HF_REPO + + def _is_model_cached(self, model_size: str = "default") -> bool: + """Check if the Chatterbox Turbo model is cached locally.""" + try: + from huggingface_hub import constants as hf_constants + + repo_cache = Path(hf_constants.HF_HUB_CACHE) / ( + "models--" + CHATTERBOX_TURBO_HF_REPO.replace("/", "--") + ) + + if not repo_cache.exists(): + return False + + blobs_dir = repo_cache / "blobs" + if blobs_dir.exists() and any(blobs_dir.glob("*.incomplete")): + return False + + # Check for turbo weight files + snapshots_dir = repo_cache / "snapshots" + if snapshots_dir.exists(): + for fname in _TURBO_WEIGHT_FILES: + if not any(snapshots_dir.rglob(fname)): + return False + return True + + return False + except Exception as e: + logger.warning(f"Error 
checking Chatterbox Turbo cache: {e}") + return False + + async def load_model(self, model_size: str = "default") -> None: + """Load the Chatterbox Turbo model.""" + if self.model is not None: + return + async with self._model_load_lock: + if self.model is not None: + return + await asyncio.to_thread(self._load_model_sync) + + def _load_model_sync(self): + """Synchronous model loading.""" + from ..utils.hf_progress import HFProgressTracker, create_hf_progress_callback + + progress_manager = get_progress_manager() + task_manager = get_task_manager() + model_name = "chatterbox-turbo" + + is_cached = self._is_model_cached() + + # Set up HF progress tracking (intercepts tqdm for file-level progress) + progress_callback = create_hf_progress_callback(model_name, progress_manager) + tracker = HFProgressTracker(progress_callback, filter_non_downloads=is_cached) + tracker_context = tracker.patch_download() + tracker_context.__enter__() + + if not is_cached: + task_manager.start_download(model_name) + progress_manager.update_progress( + model_name=model_name, + current=0, + total=0, + filename="Connecting to HuggingFace...", + status="downloading", + ) + + try: + device = self._get_device() + self._device = device + + logger.info(f"Loading Chatterbox Turbo TTS on {device}...") + + import torch + from huggingface_hub import snapshot_download + from chatterbox.tts_turbo import ChatterboxTurboTTS + + # Download model files ourselves so we can pass token=None + # (upstream from_pretrained passes token=True which requires + # a stored HF token even though the repo is public). + try: + local_path = snapshot_download( + repo_id=CHATTERBOX_TURBO_HF_REPO, + token=None, + allow_patterns=[ + "*.safetensors", "*.json", "*.txt", "*.pt", "*.model", + ], + ) + finally: + tracker_context.__exit__(None, None, None) + + # Monkey-patch torch.load for CPU loading. The model's .pt files + # were saved on CUDA; from_local() doesn't pass map_location + # so loading on CPU fails without this. 
+ if device == "cpu": + _orig_torch_load = torch.load + + def _patched_load(*args, **kwargs): + kwargs.setdefault("map_location", "cpu") + return _orig_torch_load(*args, **kwargs) + + with ChatterboxTurboTTSBackend._load_lock: + torch.load = _patched_load + try: + self.model = ChatterboxTurboTTS.from_local( + local_path, device, + ) + finally: + torch.load = _orig_torch_load + else: + self.model = ChatterboxTurboTTS.from_local( + local_path, device, + ) + + if not is_cached: + progress_manager.mark_complete(model_name) + task_manager.complete_download(model_name) + + logger.info("Chatterbox Turbo TTS loaded successfully") + + except ImportError as e: + logger.error( + "chatterbox-tts package not found. " + "Install with: pip install chatterbox-tts" + ) + if not is_cached: + progress_manager.mark_error(model_name, str(e)) + task_manager.error_download(model_name, str(e)) + raise + except Exception as e: + logger.error(f"Failed to load Chatterbox Turbo: {e}") + if not is_cached: + progress_manager.mark_error(model_name, str(e)) + task_manager.error_download(model_name, str(e)) + raise + + def unload_model(self) -> None: + """Unload model to free memory.""" + if self.model is not None: + device = self._device + del self.model + self.model = None + self._device = None + if device == "cuda": + import torch + + torch.cuda.empty_cache() + logger.info("Chatterbox Turbo unloaded") + + async def create_voice_prompt( + self, + audio_path: str, + reference_text: str, + use_cache: bool = True, + ) -> Tuple[dict, bool]: + """ + Create voice prompt from reference audio. + + Chatterbox Turbo processes reference audio at generation time, so the + prompt just stores the file path. 
+ """ + voice_prompt = { + "ref_audio": str(audio_path), + "ref_text": reference_text, + } + return voice_prompt, False + + async def combine_voice_prompts( + self, + audio_paths: List[str], + reference_texts: List[str], + ) -> Tuple[np.ndarray, str]: + """Combine multiple reference samples.""" + combined_audio = [] + for path in audio_paths: + audio, _sr = load_audio(path) + audio = normalize_audio(audio) + combined_audio.append(audio) + + mixed = np.concatenate(combined_audio) + mixed = normalize_audio(mixed) + combined_text = " ".join(reference_texts) + return mixed, combined_text + + async def generate( + self, + text: str, + voice_prompt: dict, + language: str = "en", + seed: Optional[int] = None, + instruct: Optional[str] = None, + ) -> Tuple[np.ndarray, int]: + """ + Generate audio using Chatterbox Turbo TTS. + + Supports paralinguistic tags in text: [laugh], [cough], [chuckle], etc. + + Args: + text: Text to synthesize (may include paralinguistic tags) + voice_prompt: Dict with ref_audio path + language: Ignored (Turbo is English-only) + seed: Random seed for reproducibility + instruct: Unused (protocol compatibility) + + Returns: + Tuple of (audio_array, sample_rate) + """ + await self.load_model() + + ref_audio = voice_prompt.get("ref_audio") + if ref_audio and not Path(ref_audio).exists(): + logger.warning(f"Reference audio not found: {ref_audio}") + ref_audio = None + + def _generate_sync(): + import torch + + if seed is not None: + torch.manual_seed(seed) + + logger.info("[Chatterbox Turbo] Generating (English)") + + wav = self.model.generate( + text, + audio_prompt_path=ref_audio, + temperature=0.8, + top_k=1000, + top_p=0.95, + repetition_penalty=1.2, + ) + + # Convert tensor -> numpy + if isinstance(wav, torch.Tensor): + audio = wav.squeeze().cpu().numpy().astype(np.float32) + else: + audio = np.asarray(wav, dtype=np.float32) + + sample_rate = ( + getattr(self.model, "sr", None) + or getattr(self.model, "sample_rate", 24000) + ) + + return audio, 
sample_rate + + return await asyncio.to_thread(_generate_sync) diff --git a/backend/main.py b/backend/main.py index 3d2ec359..d0be9a54 100644 --- a/backend/main.py +++ b/backend/main.py @@ -699,6 +699,29 @@ async def download_chatterbox_background(): ) await tts_model.load_model() + elif engine == "chatterbox_turbo": + if not tts_model._is_model_cached(): + model_name = "chatterbox-turbo" + + async def download_chatterbox_turbo_background(): + try: + await tts_model.load_model() + except Exception as e: + task_manager.error_download(model_name, str(e)) + + task_manager.start_download(model_name) + asyncio.create_task(download_chatterbox_turbo_background()) + + raise HTTPException( + status_code=202, + detail={ + "message": "Chatterbox Turbo model is being downloaded. Please wait and try again.", + "model_name": model_name, + "downloading": True, + }, + ) + + await tts_model.load_model() # Create voice prompt from profile voice_prompt = await profiles.create_voice_prompt_for_profile( @@ -717,7 +740,7 @@ async def download_chatterbox_background(): ) # Trim trailing silence/hallucination for Chatterbox output - if engine == "chatterbox": + if engine in ("chatterbox", "chatterbox_turbo"): from .utils.audio import trim_tts_output audio = trim_tts_output(audio, sample_rate) @@ -798,6 +821,13 @@ async def stream_speech( detail="Chatterbox model is not downloaded yet. Use /generate to trigger a download.", ) await tts_model.load_model() + elif engine == "chatterbox_turbo": + if not tts_model._is_model_cached(): + raise HTTPException( + status_code=400, + detail="Chatterbox Turbo model is not downloaded yet. 
Use /generate to trigger a download.", + ) + await tts_model.load_model() voice_prompt = await profiles.create_voice_prompt_for_profile( data.profile_id, db, engine=engine, @@ -812,7 +842,7 @@ async def stream_speech( ) # Trim trailing silence/hallucination for Chatterbox output - if engine == "chatterbox": + if engine in ("chatterbox", "chatterbox_turbo"): from .utils.audio import trim_tts_output audio = trim_tts_output(audio, sample_rate) @@ -1433,6 +1463,15 @@ def check_chatterbox_loaded(): except Exception: return False + # Check if Chatterbox Turbo backend is loaded + def check_chatterbox_turbo_loaded(): + try: + from .backends import get_tts_backend_for_engine + backend = get_tts_backend_for_engine("chatterbox_turbo") + return backend.is_loaded() + except Exception: + return False + model_configs = [ { "model_name": "qwen-tts-1.7B", @@ -1462,6 +1501,13 @@ def check_chatterbox_loaded(): "model_size": "default", "check_loaded": check_chatterbox_loaded, }, + { + "model_name": "chatterbox-turbo", + "display_name": "Chatterbox Turbo (English, Tags)", + "hf_repo_id": "ResembleAI/chatterbox-turbo", + "model_size": "default", + "check_loaded": check_chatterbox_turbo_loaded, + }, { "model_name": "whisper-base", "display_name": "Whisper Base", @@ -1668,6 +1714,10 @@ async def trigger_model_download(request: models.ModelDownloadRequest): "model_size": "default", "load_func": lambda: get_tts_backend_for_engine("chatterbox").load_model(), }, + "chatterbox-turbo": { + "model_size": "default", + "load_func": lambda: get_tts_backend_for_engine("chatterbox_turbo").load_model(), + }, "whisper-base": { "model_size": "base", "load_func": lambda: transcribe.get_whisper_model().load_model("base"), @@ -1790,6 +1840,11 @@ async def delete_model(model_name: str): "model_size": "default", "model_type": "chatterbox", }, + "chatterbox-turbo": { + "hf_repo_id": "ResembleAI/chatterbox-turbo", + "model_size": "default", + "model_type": "chatterbox_turbo", + }, "whisper-base": { 
"hf_repo_id": "openai/whisper-base", "model_size": "base", @@ -1834,6 +1889,11 @@ async def delete_model(model_name: str): chatterbox = get_tts_backend_for_engine("chatterbox") if chatterbox.is_loaded(): chatterbox.unload_model() + elif config["model_type"] == "chatterbox_turbo": + from .backends import get_tts_backend_for_engine + turbo = get_tts_backend_for_engine("chatterbox_turbo") + if turbo.is_loaded(): + turbo.unload_model() elif config["model_type"] == "whisper": whisper_model = transcribe.get_whisper_model() if whisper_model.is_loaded() and whisper_model.model_size == config["model_size"]: diff --git a/backend/models.py b/backend/models.py index c46ded58..80d69495 100644 --- a/backend/models.py +++ b/backend/models.py @@ -11,7 +11,7 @@ class VoiceProfileCreate(BaseModel): """Request model for creating a voice profile.""" name: str = Field(..., min_length=1, max_length=100) description: Optional[str] = Field(None, max_length=500) - language: str = Field(default="en", pattern="^(zh|en|ja|ko|de|fr|ru|pt|es|it|he)$") + language: str = Field(default="en", pattern="^(zh|en|ja|ko|de|fr|ru|pt|es|it|he|ar|da|el|fi|hi|ms|nl|no|pl|sv|sw|tr)$") class VoiceProfileResponse(BaseModel): @@ -57,7 +57,7 @@ class GenerationRequest(BaseModel): seed: Optional[int] = Field(None, ge=0) model_size: Optional[str] = Field(default="1.7B", pattern="^(1\\.7B|0\\.6B)$") instruct: Optional[str] = Field(None, max_length=500) - engine: Optional[str] = Field(default="qwen", pattern="^(qwen|luxtts|chatterbox)$") + engine: Optional[str] = Field(default="qwen", pattern="^(qwen|luxtts|chatterbox|chatterbox_turbo)$") class GenerationResponse(BaseModel): diff --git a/docs/plans/PROJECT_STATUS.md b/docs/plans/PROJECT_STATUS.md index 57cafbf6..d47dfebf 100644 --- a/docs/plans/PROJECT_STATUS.md +++ b/docs/plans/PROJECT_STATUS.md @@ -1,6 +1,6 @@ # Voicebox Project Status & Roadmap -> Last updated: 2026-03-12 | Current version: **v0.1.13** | 13.1k stars | 176 open issues | 28 open PRs +> Last 
updated: 2026-03-13 | Current version: **v0.1.13** | 13.1k stars | ~176 open issues | 25 open PRs --- @@ -30,14 +30,18 @@ │ │ HTTP :17493 │ │ ┌──────────────────────▼────────────────────────┐ │ │ │ FastAPI Backend (backend/) │ │ -│ │ ┌─────────────┐ ┌───────────┐ ┌─────────┐ │ │ -│ │ │ TTSBackend │ │ STTBackend│ │ Profiles│ │ │ -│ │ │ (Protocol) │ │ (Whisper) │ │ History │ │ │ -│ │ │ ┌────────┐ │ └───────────┘ │ Stories │ │ │ -│ │ │ │PyTorch │ │ └─────────┘ │ │ -│ │ │ │or MLX │ │ │ │ -│ │ │ └────────┘ │ │ │ -│ │ └─────────────┘ │ │ +│ │ ┌─────────────────────────────────────────┐ │ │ +│ │ │ TTSBackend Protocol │ │ │ +│ │ │ ┌──────────┐ ┌───────┐ ┌───────────┐ │ │ │ +│ │ │ │ Qwen3-TTS│ │LuxTTS │ │Chatterbox │ │ │ │ +│ │ │ │(Py/MLX) │ │ │ │(MTL+Turbo)│ │ │ │ +│ │ │ └──────────┘ └───────┘ └───────────┘ │ │ │ +│ │ └─────────────────────────────────────────┘ │ │ +│ │ ┌───────────┐ ┌─────────┐ │ │ +│ │ │ STTBackend│ │ Profiles│ │ │ +│ │ │ (Whisper) │ │ History │ │ │ +│ │ └───────────┘ │ Stories │ │ │ +│ │ └─────────┘ │ │ │ └───────────────────────────────────────────────┘ │ └─────────────────────────────────────────────────────┘ ``` @@ -46,131 +50,180 @@ | Layer | File | Purpose | |-------|------|---------| -| Backend entry | `backend/main.py` | FastAPI app, all API routes (~1700 lines) | +| Backend entry | `backend/main.py` | FastAPI app, all API routes (~2100 lines) | | TTS protocol | `backend/backends/__init__.py:14-81` | `TTSBackend` Protocol definition | -| TTS factory | `backend/backends/__init__.py:118-137` | Singleton backend selection (MLX vs PyTorch) | +| TTS factory | `backend/backends/__init__.py:138-178` | Thread-safe engine registry (double-checked locking) | | PyTorch TTS | `backend/backends/pytorch_backend.py` | Qwen3-TTS via `qwen_tts` package | | MLX TTS | `backend/backends/mlx_backend.py` | Qwen3-TTS via `mlx_audio.tts` | +| LuxTTS | `backend/backends/luxtts_backend.py` | LuxTTS — fast, CPU-friendly | +| Chatterbox MTL | 
`backend/backends/chatterbox_backend.py` | Chatterbox Multilingual — 23 languages | +| Chatterbox Turbo | `backend/backends/chatterbox_turbo_backend.py` | Chatterbox Turbo — English, paralinguistic tags | | Platform detect | `backend/platform_detect.py` | Apple Silicon → MLX, else → PyTorch | | API types | `backend/models.py` | Pydantic request/response models | +| HF progress | `backend/utils/hf_progress.py` | HFProgressTracker (tqdm patching for download progress) | +| Audio utils | `backend/utils/audio.py` | `trim_tts_output()`, normalize, load/save audio | | Frontend API | `app/src/lib/api/client.ts` | Hand-written fetch wrapper | | Frontend types | `app/src/lib/api/types.ts` | TypeScript API types | | Generation form | `app/src/components/Generation/GenerationForm.tsx` | TTS generation UI | -| Model manager | `app/src/components/ServerSettings/ModelManagement.tsx` | Model download/status UI | +| Floating gen box | `app/src/components/Generation/FloatingGenerateBox.tsx` | Compact generation UI | +| Model manager | `app/src/components/ServerSettings/ModelManagement.tsx` | Model download/status/progress UI | +| GPU acceleration | `app/src/components/ServerSettings/GpuAcceleration.tsx` | CUDA backend swap UI | | Gen form hook | `app/src/lib/hooks/useGenerationForm.ts` | Form validation + submission | +| Language constants | `app/src/lib/constants/languages.ts` | Per-engine language maps | ### How TTS Generation Works (Current Flow) ``` POST /generate 1. Look up voice profile from DB - 2. Check model cache → if missing, trigger background download, return HTTP 202 - 3. Load model (lazy): tts_backend.load_model(model_size) - 4. Create voice prompt: profiles.create_voice_prompt_for_profile() + 2. Resolve engine from request (qwen | luxtts | chatterbox | chatterbox_turbo) + 3. Get backend: get_tts_backend_for_engine(engine) # thread-safe singleton per engine + 4. Check model cache → if missing, trigger background download, return HTTP 202 + 5. 
Load model (lazy): tts_backend.load_model(model_size) + 6. Create voice prompt: profiles.create_voice_prompt_for_profile(engine=engine) → tts_backend.create_voice_prompt(audio_path, reference_text) - 5. Generate: tts_backend.generate(text, voice_prompt, language, seed, instruct) - 6. Save WAV → data/generations/{id}.wav - 7. Insert history record in SQLite - 8. Return GenerationResponse + 7. Generate: tts_backend.generate(text, voice_prompt, language, seed, instruct) + 8. Post-process: trim_tts_output() for Chatterbox engines + 9. Save WAV → data/generations/{id}.wav + 10. Insert history record in SQLite + 11. Return GenerationResponse ``` --- ## Current State -### What's Shipped (v0.1.13) +### What's Shipped (v0.1.13 + recent merges) +**Core TTS:** - Qwen3-TTS voice cloning (1.7B and 0.6B models) - MLX backend for Apple Silicon, PyTorch for everything else +- Multi-engine TTS architecture with thread-safe backend registry (PR #254) +- LuxTTS integration — fast, CPU-friendly English TTS (PR #254) +- Chatterbox Multilingual TTS — 23 languages including Hebrew (PR #257) +- Delivery instructions (instruct parameter, Qwen only) +- Single flat model dropdown (Qwen 1.7B, Qwen 0.6B, LuxTTS, Chatterbox, Chatterbox Turbo) + +**Infrastructure:** +- CUDA backend swap via binary download and restart (PR #252) +- GPU acceleration settings UI - Voice profiles with multi-sample support - Stories editor (multi-track DAW timeline) - Whisper transcription (base, small, medium, large variants) -- Model management UI with download progress (SSE) +- Model management UI with inline download progress bars (HFProgressTracker) +- Download cancel/clear UI with error panel (PR #238) - Generation history with caching - Streaming generation endpoint (MLX only) -- Delivery instructions (instruct parameter) - -### What's NOT Shipped But Has Code - -| Feature | Branch | Status | -|---------|--------|--------| -| External provider binaries (CUDA split) | `external-provider-binaries` | PR #33, 
significant work done, stale since Feb | -| Dual server binaries | `feat/dual-server-binaries` | Branch exists, no PR | -| Multi-sample fix | `fix-multi-sample` | Branch exists, no PR | -| Model download notification fix | `fix-dl-notification-...` | Branch exists, no PR | - -### Hardcoded Qwen3-TTS Assumptions - -These are the specific coupling points that block multi-model support: - -| Location | What's Hardcoded | -|----------|-----------------| -| `backend/models.py:58` | `model_size` regex: `^(1\.7B\|0\.6B)$` | -| `backend/main.py:611` | Default: `model_size or "1.7B"` | -| `backend/main.py:1322-1365` | Model status list (2 Qwen + 4 Whisper) | -| `backend/main.py:1523-1548` | Download trigger map | -| `backend/main.py:1597-1628` | Delete map | -| `backend/backends/pytorch_backend.py:65-68` | HF repo ID map | -| `backend/backends/mlx_backend.py:41-44` | MLX repo ID map | -| `backend/backends/__init__.py:118-137` | Single global TTS backend | -| `app/src/lib/hooks/useGenerationForm.ts:17` | `modelSize: z.enum(['1.7B', '0.6B'])` | -| `app/src/lib/hooks/useGenerationForm.ts:70-71` | `modelName = "qwen-tts-${data.modelSize}"` | -| `app/src/components/Generation/GenerationForm.tsx:140-141` | Hardcoded "Qwen TTS" labels | -| `app/src/components/ServerSettings/ModelManagement.tsx:166-213` | Filters by `qwen-tts` and `whisper` prefix | -| `backend/utils/cache.py` | Voice prompt cache uses `torch.save()` | +- Duplicate profile name validation (PR #175) +- Linux NVIDIA GBM buffer + WebKitGTK microphone fix (PR #210) + +### What's In-Flight + +| Feature | Branch/PR | Status | +|---------|-----------|--------| +| Chatterbox Turbo + per-engine language lists | `feat/chatterbox-turbo` / PR #258 | Open, ready for review | + +### TTS Engine Comparison + +| Engine | Model Name | Languages | Size | Key Features | +|--------|-----------|-----------|------|-------------| +| Qwen3-TTS 1.7B | `qwen-tts-1.7B` | 10 (zh, en, ja, ko, de, fr, ru, pt, es, it) | ~3.5 GB | Instruct mode, 
highest quality | +| Qwen3-TTS 0.6B | `qwen-tts-0.6B` | 10 | ~1.2 GB | Lighter, faster | +| LuxTTS | `luxtts` | English | ~300 MB | CPU-friendly, 48 kHz, fast | +| Chatterbox | `chatterbox-tts` | 23 (incl. Hebrew, Arabic, Hindi, etc.) | ~3.2 GB | Zero-shot cloning, multilingual | +| Chatterbox Turbo | `chatterbox-turbo` | English | ~1.5 GB | Paralinguistic tags ([laugh], [cough]), 350M params, low latency | + +### Multi-Engine Architecture (Shipped) + +The singleton TTS backend blocker described in the previous version of this doc has been **resolved**. The architecture now supports: + +- **Thread-safe backend registry** (`_tts_backends` dict + `_tts_backends_lock`) with double-checked locking +- **Per-engine backend instances** — each engine gets its own singleton, loaded lazily +- **Engine field on GenerationRequest** — frontend sends `engine: 'qwen' | 'luxtts' | 'chatterbox' | 'chatterbox_turbo'` +- **Per-engine language filtering** — `ENGINE_LANGUAGES` map in frontend, backend regex accepts all languages +- **Per-engine voice prompts** — `create_voice_prompt_for_profile()` dispatches to the correct backend +- **Trim post-processing** — `trim_tts_output()` for Chatterbox engines (cuts trailing silence/hallucination) + +### Known Limitations + +- **HF XET progress**: Large files downloaded via `hf-xet` (HuggingFace's new transfer backend) report `n=0` in tqdm updates. Progress bars may appear stuck for large `.safetensors` files even though the download is proceeding. This is a known upstream limitation. +- **Chatterbox Turbo upstream token bug**: `from_pretrained()` passes `token=os.getenv("HF_TOKEN") or True` which fails without a stored HF token. Our backend works around this by calling `snapshot_download(token=None)` + `from_local()`. +- **chatterbox-tts must install with `--no-deps`**: It pins `numpy<1.26`, `torch==2.6.0`, `transformers==4.46.3` — all incompatible with our stack (Python 3.12, torch 2.10, transformers 4.57.3). 
Sub-deps listed explicitly in `requirements.txt`. +- **Streaming generation** only works for Qwen on MLX. Other engines use the non-streaming `/generate` endpoint. +- **dicta-onnx** (Hebrew diacritization) not included — upstream Chatterbox bug requires `model_path` arg but calls `Dicta()` with none. Hebrew works fine without it. --- ## Open PRs — Triage & Analysis +### Recently Merged (Since Last Update) + +| PR | Title | Merged | +|----|-------|--------| +| **#257** | feat: Chatterbox TTS engine with multilingual voice cloning | 2026-03-13 | +| **#254** | feat: LuxTTS integration — multi-engine TTS support | 2026-03-13 | +| **#252** | feat: CUDA backend swap via binary download and restart | 2026-03-13 | +| **#238** | Download cancel/clear UI, fixed model downloading | 2026-03-13 | +| **#250** | docs: align local API port examples | 2026-03-13 | +| **#210** | fix: Linux NVIDIA GBM buffer crash | 2026-03-13 | +| **#175** | Fix #134: duplicate profile name validation | 2026-03-13 | + +### In-Flight (Our Work) + +| PR | Title | Status | Notes | +|----|-------|--------|-------| +| **#258** | feat: Chatterbox Turbo engine + per-engine language lists | Open | Ready for review. Adds Turbo engine + dynamic language dropdown. 
| + ### Merge-Ready / Near-Ready (Bug Fixes & Small Features) | PR | Title | Risk | Notes | |----|-------|------|-------| -| **#250** | docs: align local API port examples | None | Docs-only | | **#230** | docs: fix README grammar | None | Docs-only | | **#243** | a11y: screen reader and keyboard improvements | Low | Accessibility, no backend changes | -| **#175** | Fix #134: duplicate profile name validation | Low | Simple validation | | **#178** | Fix #168 #140: generation error handling | Low | Error handling improvements | | **#152** | Fix: prevent crashes when HuggingFace unreachable | Medium | Monkey-patches HF hub; solves real offline bug (#150, #151) | | **#218** | fix: unify qwen tts cache dir on Windows | Low | Windows-specific path fix | | **#214** | fix: panic on launch from tokio::spawn | Low | Rust-side Tauri fix | -| **#210** | fix: Linux NVIDIA GBM buffer crash | Low | Linux-specific, narrowly scoped | | **#88** | security: restrict CORS to known local origins | Low | Security hardening | +| **#133** | feat: network access toggle | Low | Wires up existing plumbing | ### Significant Feature PRs -| PR | Title | Complexity | Dependencies | Notes | -|----|-------|-----------|--------------|-------| -| **#97** | fix: pass language parameter to TTS models | Medium | None | **Critical bug** — language param was silently dropped. Adds `LANGUAGE_CODE_TO_NAME` mapping to both backends. Should be high priority. | -| **#133** | feat: network access toggle | Low | None | Wires up existing plumbing (`--host 0.0.0.0`). Clean, small. | -| **#238** | download cancel/clear UI + error panel | Medium | None | Adds cancel buttons, VS Code-style Problems panel, fixes whisper-large repo. Quality-of-life win. | -| **#99** | feat: chunked TTS with quality selector | Medium | None | Solves the 500-char/2048-token limit. Sentence-aware splitting, crossfade concat, 44.1kHz upsampling. Addresses #191, #203, #69, #111. 
| -| **#154** | feat: Audiobook tab | Medium | Depends on #99 concepts | Full audiobook workflow — chunked gen, preview, auto-save to Stories. New route + tab. | -| **#91** | fix: CoreAudio device enumeration | Medium | None | macOS audio device handling. | +| PR | Title | Complexity | Notes | +|----|-------|-----------|-------| +| **#253** | Enhance speech tokenizer with 48kHz version | Medium | Qwen tokenizer upgrade | +| **#97** | fix: pass language parameter to TTS models | Medium | May be partially obsoleted by multi-engine work — needs review | +| **#99** | feat: chunked TTS with quality selector | Medium | Solves 500-char limit. Addresses #191, #203, #69, #111. | +| **#154** | feat: Audiobook tab | Medium | Full audiobook workflow. Depends on #99 concepts. | +| **#91** | fix: CoreAudio device enumeration | Medium | macOS audio device handling | ### Architectural PRs (Need Careful Review) | PR | Title | Complexity | Notes | |----|-------|-----------|-------| -| **#33** | CUDA GPU Support — External Provider Binaries | **Very High** | The big one. Splits monolithic backend into main app + downloadable provider executables (PyTorch CPU, CUDA). New provider management system, CI/CD for R2 uploads, provider settings UI. Created Feb 1, significant codebase. **This is the foundation for multi-model support** but is currently Qwen-only. | -| **#225** | feat: custom HuggingFace model support | High | Adds `custom_models.py`, `custom:` model IDs, frontend model grouping (Built-in vs Custom). **Takes a different approach than #33** — keeps single backend but allows arbitrary HF repos. These two PRs may conflict architecturally. | -| **#194** | feat: Hebrew + Chatterbox TTS | High | **First non-Qwen TTS model.** Adds `ChatterboxTTSBackend` alongside existing backends. Routes by language (`he` → Chatterbox, else → Qwen). Adds Hebrew Whisper models. Includes a lot of cleanup. Important precedent for multi-model. 
| -| **#195** | feat: per-profile LoRA fine-tuning | **Very High** | Depends on #194. Training pipeline, adapter management, SSE progress, 15 new API endpoints. New DB tables. Forces PyTorch even on MLX systems for adapter inference. | -| **#161** | feat: Docker + web deployment | High | 3-stage Dockerfile, SPA serving from FastAPI, docker-compose. Implements the Docker deployment plan. | -| **#124** | Add Dockerfiles + docker-compose + docs | Medium | Earlier, simpler Docker attempt. Overlaps with #161. | -| **#123** | added docker | Low | Minimal Docker PR. Overlaps with #161 and #124. | -| **#227** | fix: harden input validation & file safety | Medium | Follow-up to #225. Atomic writes, threading locks, input validation. Good hardening but coupled to the custom models feature. | +| **#225** | feat: custom HuggingFace model support | High | Arbitrary HF repo loading. May need rework given multi-engine arch is now shipped. | +| **#194** | feat: Hebrew + Chatterbox TTS | High | **Superseded** by PR #257 which shipped Chatterbox multilingual (23 langs incl. Hebrew). May be closeable. | +| **#195** | feat: per-profile LoRA fine-tuning | Very High | Training pipeline, adapter management, 15 new endpoints. Depends on #194 (now superseded). | +| **#161** | feat: Docker + web deployment | High | 3-stage Dockerfile, SPA serving. Independent of TTS engine work. 
| +| **#124** / **#123** | Docker (simpler attempts) | Low-Medium | Overlap with #161 | +| **#227** | fix: harden input validation & file safety | Medium | Coupled to #225 (custom models) | ### PRs That Need Author Action / Are Stale | PR | Title | Notes | |----|-------|-------| -| **#237** | fix: bundle qwen_tts source files in PyInstaller | Solves #212 but needs review for build system impact | +| **#237** | fix: bundle qwen_tts source files in PyInstaller | Build system, needs review | | **#215** | Update prerequisites with Tauri deps | Branch is `main` — will have conflicts | | **#89** | Linux Support | Branch is `main` — will have conflicts. Broad scope. | | **#83** | Update download links for v0.1.12 | Outdated (we're on v0.1.13) | +### PRs Likely Superseded + +| PR | Superseded By | Notes | +|----|--------------|-------| +| **#194** (Hebrew + Chatterbox) | PR #257 (merged) | #257 ships Chatterbox multilingual with 23 languages including Hebrew. #194 took a different approach (route by language). Can likely be closed. | +| **#33** (External provider binaries) | PR #252 (merged) | #252 shipped CUDA backend swap. #33's broader provider architecture may still have value but needs reassessment. | + --- ## Open Issues — Categorized @@ -186,15 +239,15 @@ The single most reported category. Users on Windows with NVIDIA GPUs frequently **Key issues:** #239, #222, #220, #217, #208, #198, #192, #167, #164, #141, #130, #127 -**Fix path:** PR #33 (external provider binaries) is designed to solve this. Ship a small main app, let users download the CUDA provider separately. +**Fix path:** PR #252 (CUDA backend swap) is now merged. Users can download the CUDA binary separately from the GPU acceleration settings. Many of these issues may now be resolvable — needs triage to confirm. ### Model Downloads (20 issues) -Second most reported. Users get stuck downloads, can't resume, no cancel button, no offline fallback. +Second most reported. 
Users get stuck downloads, can't resume, no offline fallback. **Key issues:** #249, #240, #221, #216, #212, #181, #180, #159, #150, #149, #145, #143, #135, #134 -**Fix path:** PR #238 (cancel/clear UI), PR #152 (offline crash fix). Resume support not yet addressed. +**Fix path:** PR #238 (cancel/clear UI) is now merged. PR #152 (offline crash fix) still open. Inline progress bars now show for all engines. Resume support not yet addressed. ### Language Requests (18 issues) @@ -202,7 +255,7 @@ Strong demand for: Hindi (#245), Indonesian (#247), Dutch (#236), Hebrew (#199), **Key issues:** #247, #245, #236, #211, #205, #199, #189, #188, #187, #183, #179, #162 -**Fix path:** PR #97 (pass language param — currently silently dropped!) is the prerequisite. Qwen3-TTS already supports many languages; the bug is that the language code isn't forwarded. Multi-model (#194 Chatterbox for Hebrew) expands coverage further. +**Fix path:** Chatterbox Multilingual (merged via #257) now supports 23 languages including many of the requested ones: Arabic, Danish, German, Greek, Finnish, Hebrew, Hindi, Dutch, Norwegian, Polish, Swedish, Swahili, Turkish. Per-engine language filtering (PR #258) ensures the UI shows correct options. Several of these issues may be closeable. ### New Model Requests (5 explicit issues) @@ -214,7 +267,7 @@ Strong demand for: Hindi (#245), Indonesian (#247), Dutch (#236), Hebrew (#199), | #132 | LavaSR (transcription) | | #76 | (General model expansion) | -Community is also vocally requesting: LuxTTS, Chatterbox, XTTS-v2, Fish Speech, CosyVoice, Kokoro on social media and in issue comments. +Community also requests: XTTS-v2, Fish Speech, CosyVoice, Kokoro. The multi-engine architecture is now in place, making new model integration significantly easier. 
### Long-Form / Chunking (5 issues) @@ -255,17 +308,14 @@ Notable requests: | Document | Target Version | Status | Relevance | |----------|---------------|--------|-----------| -| `TTS_PROVIDER_ARCHITECTURE.md` | v0.1.13 | **Partially implemented** in PR #33 | Core architecture for multi-model + CUDA distribution | -| `EXTERNAL_PROVIDERS.md` | v0.2.0 | **Not started** | Remote server support. API path inconsistency with provider arch doc (`/v1/` vs `/tts/`) | -| `MLX_AUDIO.md` | — | **Shipped** (the only one) | MLX backend is live. 0.6B MLX model still missing. | -| `DOCKER_DEPLOYMENT.md` | v0.2.0 | **PR exists** (#161) | Waiting on review. No official images published. | -| `OPENAI_SUPPORT.md` | v0.2.0 | **Not started** | OpenAI-compatible API layer. Linked to issue #10. Low complexity. | - -### Cross-Document Conflicts - -1. **API path inconsistency:** Provider arch uses `/tts/generate`, External providers uses `/v1/generate`, OpenAI compat uses `/v1/audio/speech`. Need to reconcile. -2. **Docker vs. Provider split:** Docker doc assumes monolithic backend. Provider arch splits into separate binaries. Need to decide: does Docker run the monolith or individual providers? -3. **Version targeting:** Provider arch targets v0.1.13 (current!) but isn't merged. Everything else targets v0.2.0. 
+| `TTS_PROVIDER_ARCHITECTURE.md` | v0.1.13 | **Partially superseded** by multi-engine arch + CUDA swap | Core concepts implemented differently than planned | +| `CUDA_BACKEND_SWAP.md` | — | **Shipped** (PR #252) | CUDA binary download + backend restart | +| `CUDA_BACKEND_SWAP_FINAL.md` | — | **Shipped** (PR #252) | Final implementation plan | +| `EXTERNAL_PROVIDERS.md` | v0.2.0 | **Not started** | Remote server support | +| `MLX_AUDIO.md` | — | **Shipped** | MLX backend is live | +| `DOCKER_DEPLOYMENT.md` | v0.2.0 | **PR exists** (#161) | Waiting on review | +| `OPENAI_SUPPORT.md` | v0.2.0 | **Not started** | OpenAI-compatible API layer | +| `PR33_CUDA_PROVIDER_REVIEW.md` | — | **Reference** | Analysis of the original provider approach | --- @@ -273,135 +323,95 @@ Notable requests: ### Models Worth Supporting (2026 SOTA) -| Model | Cloning | Speed | Sample Rate | Languages | VRAM | Integration Ease | Repo | -|-------|---------|-------|-------------|-----------|------|-----------------|------| -| **LuxTTS** | 3s zero-shot | 150x RT, CPU ok | 48 kHz | English-first | <1 GB | Easy | `ysharma3501/LuxTTS` | -| **Chatterbox** | 5s zero-shot | Sub-200ms streaming | 24-48 kHz | 23+ | Low | Medium | `resemble-ai/chatterbox` | -| **XTTS-v2** | 6s zero-shot | Fast mid-GPU | 24 kHz | 17+ | Medium | Medium | `coqui/XTTS-v2` | -| **Fish Speech** | 10-30s few-shot | Real-time | 24-44 kHz | 50+ | Medium | Medium | `fishaudio/fish-speech` | -| **CosyVoice2-0.5B** | 3-10s zero-shot | Very fast | 24 kHz | Multilingual | Low | Easy | Alibaba HF org | -| **Kokoro-82M** | 3s instant | CPU realtime | 24 kHz | English | Tiny | Medium | Kokoro repo | - -### What's Needed Architecturally for Multi-Model - -The current codebase assumes one TTS model family (Qwen3-TTS). 
Adding any new model requires:
+| Model | Cloning | Speed | Sample Rate | Languages | VRAM | Integration Ease | Status |
+|-------|---------|-------|-------------|-----------|------|-----------------|--------|
+| **Qwen3-TTS** | 10s zero-shot | Medium | 24 kHz | 10 | Medium | **Shipped** | v0.1.13 |
+| **LuxTTS** | 3s zero-shot | 150x RT, CPU ok | 48 kHz | English | <1 GB | **Shipped** | PR #254 |
+| **Chatterbox MTL** | 5s zero-shot | Medium | 24 kHz | 23 | Medium | **Shipped** | PR #257 |
+| **Chatterbox Turbo** | 5s zero-shot | Fast | 24 kHz | English | Low | **PR #258** | In review |
+| **XTTS-v2** | 6s zero-shot | Mid-GPU | 24 kHz | 17+ | Medium | Ready | Multi-engine arch in place |
+| **Fish Speech** | 10-30s few-shot | Real-time | 24-44 kHz | 50+ | Medium | Ready | Multi-engine arch in place |
+| **CosyVoice2-0.5B** | 3-10s zero-shot | Very fast | 24 kHz | Multilingual | Low | Ready | Multi-engine arch in place |
+| **Kokoro-82M** | 3s instant | CPU realtime | 24 kHz | English | Tiny | Ready | Multi-engine arch in place |
 
-1. **Model type concept** — A `model_type` field (e.g. `qwen`, `luxtts`, `chatterbox`) alongside `model_size`. The `GenerationRequest` schema, frontend form, and all model config dicts need updating.
+### Adding a New Engine (Now Straightforward)
 
-2. **Multiple backend instances** — The singleton `get_tts_backend()` needs to become a registry. Different models have different voice prompt formats, different inference APIs, different sample rates.
+With the multi-engine architecture shipped, adding a new TTS engine requires:
 
-3. **Voice prompt format abstraction** — Qwen uses `torch.save()`-serialized tensors. LuxTTS uses `encode_prompt()` returning its own format. Chatterbox uses audio-path-based cloning. The cache system (`backend/utils/cache.py`) needs to handle heterogeneous formats.
+1. **Create `backend/backends/{engine}_backend.py`** — implement `TTSBackend` protocol (~200-300 lines)
+2. 
**Register in `backend/backends/__init__.py`** — add to `TTS_ENGINES` dict + factory function +3. **Update `backend/models.py`** — add engine name to regex +4. **Update `backend/main.py`** — add engine cases in generate, stream, model-status, download, delete (5 dispatch points) +5. **Update frontend** — add to engine union type, form schema, model dropdown, language map (5-6 files) -4. **Sample rate normalization** — Qwen outputs 24 kHz. LuxTTS outputs 48 kHz. The Stories editor and audio pipeline need to handle mixed rates. - -5. **Per-model capabilities** — Not all models support `instruct` (delivery instructions), not all support streaming, not all support the same languages. The UI needs to adapt. - -### PR #194 as Precedent - -The Hebrew/Chatterbox PR (#194) is the first attempt at multi-model. It takes a pragmatic approach: route by language (`he` → Chatterbox, else → Qwen). This works for one extra model but doesn't scale — what happens when you want Chatterbox for English too? - -### PR #225 as Alternative Approach - -The custom HuggingFace models PR (#225) takes a different angle: let users register arbitrary HF repos and attempt to load them through the existing Qwen backend. This is flexible but fragile — it assumes all models have the same API as Qwen3-TTS. - -### PR #33 as Foundation - -The external provider binaries PR (#33) has the most robust architecture for multi-model, since each provider is a separate process with its own dependencies. But it's complex, currently Qwen-only, and has been stale since early February. +Total effort: **~1 day** for a well-documented model with a PyPI package. --- ## Architectural Bottlenecks -### 1. Single Backend Singleton - -**File:** `backend/backends/__init__.py:118-137` +### ~~1. Single Backend Singleton~~ — RESOLVED -The entire TTS system runs through one global `_tts_backend` instance. You literally cannot have two models loaded. This is the #1 blocker for multi-model support. 
+The singleton TTS backend was replaced with a thread-safe per-engine registry in PR #254. Multiple engines can now be loaded simultaneously. -### 2. `main.py` is 1700+ Lines +### 2. `main.py` is 2100+ Lines -All API routes, all model configs, all business logic in one file. Three separate hardcoded model config dicts that must stay in sync. Any multi-model change touches this file heavily. +All API routes, all model configs, all business logic in one file. Five separate dispatch points for each engine. Any new engine touches this file in 5 places. A model config registry pattern would reduce duplication. -### 3. Model Config is Scattered +### 3. Model Config is Scattered (Improved) -Model identifiers, HF repo IDs, display names, and download logic are duplicated across: -- `main.py` (3 separate dicts) -- `pytorch_backend.py` (HF repo map) -- `mlx_backend.py` (MLX repo map) -- `GenerationForm.tsx` (UI labels) -- `useGenerationForm.ts` (validation schema) -- `ModelManagement.tsx` (prefix filters) - -There is no single source of truth for "what models does Voicebox support." +Model identifiers are still duplicated across `main.py` (3 dicts), backend files, frontend components, and the languages constant. However, the pattern is now consistent and well-understood. A centralized model registry would help but isn't blocking. ### 4. Voice Prompt Cache Assumes PyTorch Tensors -`backend/utils/cache.py` uses `torch.save()` / `torch.load()` for caching voice prompts. Models that don't use PyTorch tensors (LuxTTS, MLX-native models) can't use this cache. +`backend/utils/cache.py` uses `torch.save()` / `torch.load()`. LuxTTS and Chatterbox backends work around this by storing reference audio paths instead of tensors in their voice prompt dicts. Not ideal but functional. -### 5. Frontend Assumes Qwen Model Sizes +### 5. ~~Frontend Assumes Qwen Model Sizes~~ — RESOLVED -The generation form schema (`useGenerationForm.ts:17`) validates `model_size` as `'1.7B' | '0.6B'`. 
The model management UI filters by string prefix `qwen-tts`. Adding any model requires touching 3-4 frontend files. +The generation form now uses a flat model dropdown with engine-based routing. Per-engine language filtering is in place. Model size is only sent for Qwen. --- ## Recommended Priorities -### Tier 1 — Ship Now (Bug Fixes & Critical Improvements) - -These PRs fix real user pain with low risk. Can be reviewed and merged quickly. - -| Priority | PR | Impact | Effort | -|----------|-----|--------|--------| -| 1 | **#97** — Pass language param to TTS | Fixes all non-English generation (18 language issues) | Low | -| 2 | **#238** — Download cancel/clear UI | Addresses 20 download-related issues | Low | -| 3 | **#152** — Offline mode crash fix | Fixes #150, #151 | Low | -| 4 | **#99** — Chunked TTS + quality selector | Removes 500-char limit, addresses 5 issues | Medium | -| 5 | **#218** — Windows HF cache dir fix | Windows-specific pain | Low | -| 6 | **#175, #178** — Profile validation + error handling | Small fixes | Low | -| 7 | **#250, #230** — Docs fixes | Zero risk | None | -| 8 | **#133** — Network access toggle | Wires up existing code | Low | -| 9 | **#88** — CORS restriction | Security improvement | Low | -| 10 | **#214** — Tauri window close panic fix | Stability | Low | - -### Tier 2 — Next Release (v0.2.0 Foundations) - -These require more review but unlock major capabilities. - -| Priority | Item | Impact | Effort | Dependencies | -|----------|------|--------|--------|-------------| -| 1 | **PR #33** — External provider binaries | Solves GPU distribution (19 issues), foundation for multi-model | Very High | Needs rebase, thorough review | -| 2 | **Multi-model abstraction layer** | Required before adding LuxTTS/Chatterbox/etc. 
| High | Informed by #33, #194, #225 | -| 3 | **PR #161** — Docker deployment | Server/headless users | Medium | Independent of #33 | -| 4 | **PR #194** — Hebrew + Chatterbox | First non-Qwen model, language expansion | High | Should align with multi-model abstraction | -| 5 | **PR #154** — Audiobook tab | Significant feature for long-form users | Medium | Benefits from #99 (chunking) | +### Tier 1 — Ship Now (Low Risk) + +| Priority | PR/Item | Impact | Effort | +|----------|---------|--------|--------| +| 1 | **#258** — Chatterbox Turbo + per-engine languages | Paralinguistic tags, proper language filtering | Review only | +| 2 | **#152** — Offline mode crash fix | Fixes #150, #151 | Low | +| 3 | **#99** — Chunked TTS + quality selector | Removes 500-char limit, addresses 5 issues | Medium | +| 4 | **#218** — Windows HF cache dir fix | Windows-specific pain | Low | +| 5 | **#178** — Generation error handling | Error UX | Low | +| 6 | **#230** — Docs fixes | Zero risk | None | +| 7 | **#133** — Network access toggle | Wires up existing code | Low | +| 8 | **#88** — CORS restriction | Security improvement | Low | +| 9 | **#214** — Tauri window close panic fix | Stability | Low | +| 10 | Triage GPU issues | Many may be resolved by CUDA swap (#252) | Low | +| 11 | Close superseded PRs | #194 (superseded by #257), #83 (outdated) | None | + +### Tier 2 — Next Release (v0.2.0) + +| Priority | Item | Impact | Effort | +|----------|------|--------|--------| +| 1 | **#253** — 48kHz speech tokenizer | Quality improvement | Medium | +| 2 | **#161** — Docker deployment | Server/headless users | Medium | +| 3 | **#154** — Audiobook tab | Long-form users | Medium | +| 4 | **Model config registry** | Reduce 5-dispatch-point duplication in main.py | Medium | +| 5 | **#225** — Custom HuggingFace models | User-supplied models | High (needs rework for multi-engine) | ### Tier 3 — Future (v0.3.0+) | Item | Notes | |------|-------| -| LuxTTS integration | 48 kHz, low VRAM, but needs 
multi-model arch first | -| XTTS-v2 / Fish Speech | Multilingual powerhouses | +| XTTS-v2 / Fish Speech / CosyVoice | Multi-engine arch is ready; just needs backend implementation | | OpenAI-compatible API (plan doc exists) | Low effort once API is stable | -| LoRA fine-tuning (PR #195) | Complex, depends on #194 | -| External/remote providers (plan doc exists) | Depends on provider architecture | +| LoRA fine-tuning (PR #195) | Complex, needs rework for multi-engine | +| External/remote providers | Depends on use case demand | | GGUF support (#226) | Depends on model ecosystem maturity | | Queue system (#234) | Batch generation | -| Real-time streaming synthesis | MLX-only currently, needs PyTorch path | - -### Decision Point: Multi-Model Architecture - -Before adding any new TTS model, a decision is needed on *how*: - -**Option A — Provider Binary Split (PR #33 approach)** -Each model family is a separate executable/process. Most isolated, most flexible, but most complex. Solves the CUDA distribution problem simultaneously. - -**Option B — In-Process Model Registry** -Keep everything in one process but replace the singleton with a registry that can instantiate multiple `TTSBackend` implementations. Simpler, but doesn't solve binary size / CUDA distribution. - -**Option C — Hybrid (Recommended)** -Use Option B for lightweight models (LuxTTS, Kokoro — small, CPU-friendly) that can coexist in-process. Use Option A for heavy models (CUDA Qwen3-TTS, Fish Speech) that need their own process/dependencies. The provider architecture from PR #33 becomes the escape hatch for heavy models, while light models are built-in. - -This matches how PR #194 already works (Chatterbox loaded in-process alongside Qwen) while keeping the door open for PR #33's provider split. 
+| Streaming for non-MLX engines | Currently MLX-only | +| Kokoro-82M | Tiny model, great for CPU-only machines | --- @@ -409,24 +419,20 @@ This matches how PR #194 already works (Chatterbox loaded in-process alongside Q | Branch | PR | Status | Notes | |--------|-----|--------|-------| -| `external-provider-binaries` | #33 | Open, stale | Major architecture work | -| `feat/dual-server-binaries` | — | No PR | Related to provider split? | +| `feat/chatterbox-turbo` | #258 | Open | Chatterbox Turbo + per-engine languages | +| `feat/chatterbox` | #257 | **Merged** | Chatterbox Multilingual | +| `feat/luxtts` | #254 | **Merged** | LuxTTS + multi-engine arch | +| `external-provider-binaries` | #33 | Superseded by #252 | Original CUDA provider approach | +| `feat/dual-server-binaries` | — | No PR | Related to provider split | | `fix-multi-sample` | — | No PR | Voice profile multi-sample fix | | `fix-dl-notification-...` | — | No PR | Model download UX | -| `improvements` | — | No PR | Unknown scope | -| `stories` | — | No PR | Stories editor work? | -| `windows-server-shutdown` | — | No PR | Windows lifecycle | -| `model-dl-fix` | — | No PR | Model download fix | -| `channels` | — | No PR | Audio channels | -| `audio-export-entitlement-fix` | — | No PR | macOS entitlements | -| `better-docs` | — | No PR | Documentation | --- ## Quick Reference: API Endpoints
-All current endpoints (v0.1.13) +All current endpoints | Endpoint | Method | Purpose | |----------|--------|---------| @@ -437,20 +443,21 @@ This matches how PR #194 already works (Chatterbox loaded in-process alongside Q | `/profiles/{id}/avatar` | POST, GET, DELETE | Avatar management | | `/profiles/{id}/export` | GET | Export profile as ZIP | | `/profiles/import` | POST | Import profile from ZIP | -| `/generate` | POST | Generate speech | -| `/generate/stream` | POST | Stream speech (SSE) | +| `/generate` | POST | Generate speech (engine param selects TTS backend) | +| `/generate/stream` | POST | Stream speech (MLX only) | | `/history` | GET | List generation history | | `/history/{id}` | GET, DELETE | Get/delete generation | | `/history/{id}/export` | GET | Export generation ZIP | | `/history/{id}/export-audio` | GET | Export audio only | | `/transcribe` | POST | Transcribe audio (Whisper) | -| `/models/status` | GET | All model statuses | +| `/models/status` | GET | All model statuses (Qwen, LuxTTS, Chatterbox, Chatterbox Turbo, Whisper) | | `/models/download` | POST | Trigger model download | +| `/models/download/cancel` | POST | Cancel/dismiss download | | `/models/{name}` | DELETE | Delete downloaded model | | `/models/load` | POST | Load model into memory | | `/models/unload` | POST | Unload model | | `/models/progress/{name}` | GET | SSE download progress | -| `/tasks/active` | GET | Active downloads/generations | +| `/tasks/active` | GET | Active downloads/generations (with inline progress) | | `/stories` | POST, GET | Create/list stories | | `/stories/{id}` | GET, PUT, DELETE | Story CRUD | | `/stories/{id}/items` | POST, GET | Story items CRUD | @@ -458,5 +465,8 @@ This matches how PR #194 already works (Chatterbox loaded in-process alongside Q | `/channels` | POST, GET | Audio channel CRUD | | `/channels/{id}` | PUT, DELETE | Channel update/delete | | `/cache/clear` | POST | Clear voice prompt cache | +| `/server/cuda/status` | GET | CUDA binary 
availability | +| `/server/cuda/download` | POST | Download CUDA binary | +| `/server/cuda/switch` | POST | Switch to CUDA backend |