diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 1312ca3..8b2311f 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -63,6 +63,7 @@ jobs: pip install -r backend/requirements.txt pip install --no-deps chatterbox-tts pip install --no-deps hume-tada + git clone --recursive --depth 1 https://github.com/FunAudioLLM/CosyVoice.git backend/vendors/CosyVoice - name: Install MLX dependencies (Apple Silicon only) if: matrix.backend == 'mlx' @@ -190,6 +191,7 @@ jobs: pip install -r backend/requirements.txt pip install --no-deps chatterbox-tts pip install --no-deps hume-tada + git clone --recursive --depth 1 https://github.com/FunAudioLLM/CosyVoice.git backend/vendors/CosyVoice - name: Install PyTorch with CUDA 12.6 run: | diff --git a/.gitignore b/.gitignore index 130a7aa..285881a 100644 --- a/.gitignore +++ b/.gitignore @@ -59,6 +59,9 @@ tauri/src-tauri/gen/partial.plist # Windows artifacts nul +# Vendored source clones (fetched at setup time) +backend/vendors/ + # Temporary tmp/ temp/ diff --git a/Dockerfile b/Dockerfile index 1ad85e5..d2cfb8a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -39,6 +39,7 @@ RUN pip install --no-cache-dir --prefix=/install --no-deps chatterbox-tts RUN pip install --no-cache-dir --prefix=/install --no-deps hume-tada RUN pip install --no-cache-dir --prefix=/install \ git+https://github.com/QwenLM/Qwen3-TTS.git +RUN git clone --recursive --depth 1 https://github.com/FunAudioLLM/CosyVoice.git /build/CosyVoice # === Stage 3: Runtime === @@ -62,6 +63,9 @@ COPY --from=backend-builder /install /usr/local # Copy backend application code COPY --chown=voicebox:voicebox backend/ /app/backend/ +# Copy CosyVoice source from builder stage +COPY --from=backend-builder --chown=voicebox:voicebox /build/CosyVoice/ /app/backend/vendors/CosyVoice/ + # Copy built frontend from frontend stage COPY --from=frontend --chown=voicebox:voicebox /build/web/dist /app/frontend/ diff --git 
/**
 * Model sizes offered by each multi-size engine. The first entry of every
 * list is the default used when the form holds no size, or holds a size that
 * belongs to a different engine.
 */
const ENGINE_MODEL_SIZES: ReadonlyMap<string, readonly string[]> = new Map([
  ['qwen', ['1.7B', '0.6B']],
  ['tada', ['1B', '3B']],
  ['cosyvoice', ['v2', 'v3']],
]);

/**
 * Map the form's `engine` / `modelSize` pair onto the `<Select>` option value.
 *
 * Engines with several checkpoints are encoded as `engine:size`; every other
 * engine is returned unchanged.
 *
 * A size that is missing, empty, or not valid for the engine (for example a
 * stale `'1.7B'` left over while `engine` is already `'cosyvoice'`) resolves
 * to the engine's default size, so the result always matches one of the
 * `ENGINE_OPTIONS` entries instead of rendering an empty select.
 *
 * @param engine - Engine key stored in the form (e.g. `'qwen'`, `'cosyvoice'`).
 * @param modelSize - Size/variant stored in the form, if any.
 * @returns The option value to select.
 */
function getSelectValue(engine: string, modelSize?: string): string {
  const sizes = ENGINE_MODEL_SIZES.get(engine);
  if (sizes === undefined) return engine;

  const size = modelSize !== undefined && sizes.includes(modelSize) ? modelSize : sizes[0];
  return `${engine}:${size}`;
}
'en'); + } } else { form.setValue('engine', value as GenerationFormValues['engine']); form.setValue('modelSize', undefined as unknown as '1.7B' | '0.6B'); diff --git a/app/src/components/ServerSettings/ModelManagement.tsx b/app/src/components/ServerSettings/ModelManagement.tsx index c415306..618be27 100644 --- a/app/src/components/ServerSettings/ModelManagement.tsx +++ b/app/src/components/ServerSettings/ModelManagement.tsx @@ -66,6 +66,10 @@ const MODEL_DESCRIPTIONS: Record = { 'HumeAI TADA 1B — English speech-language model built on Llama 3.2 1B. Generates 700s+ of coherent audio with synchronized text-acoustic alignment.', 'tada-3b-ml': 'HumeAI TADA 3B Multilingual — built on Llama 3.2 3B. Supports 10 languages with high-fidelity voice cloning via text-acoustic dual alignment.', + 'cosyvoice2-0.5b': + 'CosyVoice2 0.5B by Alibaba. Multilingual TTS with instruct support for emotions, speed, volume, and dialects. 9 languages with zero-shot voice cloning.', + 'cosyvoice3-0.5b': + 'Fun-CosyVoice3 0.5B by Alibaba. Improved robustness, prosody, and Chinese dialect support over CosyVoice2. Best quality for in-the-wild speech generation.', 'whisper-base': 'Smallest Whisper model (74M parameters). Fast transcription with moderate accuracy.', 'whisper-small': @@ -390,14 +394,7 @@ export function ModelManagement() { setDetailOpen(true); }; - const voiceModels = - modelStatus?.models.filter( - (m) => - m.model_name.startsWith('qwen-tts') || - m.model_name.startsWith('luxtts') || - m.model_name.startsWith('chatterbox') || - m.model_name.startsWith('tada'), - ) ?? []; + const voiceModels = modelStatus?.models.filter((m) => !m.model_name.startsWith('whisper')) ?? []; const whisperModels = modelStatus?.models.filter((m) => m.model_name.startsWith('whisper')) ?? 
[]; // Build sections diff --git a/app/src/lib/api/types.ts b/app/src/lib/api/types.ts index aa85d00..4b18f60 100644 --- a/app/src/lib/api/types.ts +++ b/app/src/lib/api/types.ts @@ -42,8 +42,8 @@ export interface GenerationRequest { text: string; language: LanguageCode; seed?: number; - model_size?: '1.7B' | '0.6B' | '1B' | '3B'; - engine?: 'qwen' | 'luxtts' | 'chatterbox' | 'chatterbox_turbo' | 'tada'; + model_size?: '1.7B' | '0.6B' | '1B' | '3B' | 'v2' | 'v3'; + engine?: 'qwen' | 'luxtts' | 'chatterbox' | 'chatterbox_turbo' | 'tada' | 'cosyvoice'; instruct?: string; max_chunk_chars?: number; crossfade_ms?: number; diff --git a/app/src/lib/constants/languages.ts b/app/src/lib/constants/languages.ts index a0d233a..b7bf9bd 100644 --- a/app/src/lib/constants/languages.ts +++ b/app/src/lib/constants/languages.ts @@ -67,6 +67,7 @@ export const ENGINE_LANGUAGES: Record = { ], chatterbox_turbo: ['en'], tada: ['en', 'ar', 'zh', 'de', 'es', 'fr', 'it', 'ja', 'pl', 'pt'], + cosyvoice: ['zh', 'en', 'ja', 'ko', 'de', 'fr', 'ru', 'es', 'it'], } as const; /** Helper: get language options for a given engine. 
*/ diff --git a/app/src/lib/hooks/useGenerationForm.ts b/app/src/lib/hooks/useGenerationForm.ts index 8e73ce0..d59f7bc 100644 --- a/app/src/lib/hooks/useGenerationForm.ts +++ b/app/src/lib/hooks/useGenerationForm.ts @@ -15,9 +15,11 @@ const generationSchema = z.object({ text: z.string().min(1, '').max(50000), language: z.enum(LANGUAGE_CODES as [LanguageCode, ...LanguageCode[]]), seed: z.number().int().optional(), - modelSize: z.enum(['1.7B', '0.6B', '1B', '3B']).optional(), + modelSize: z.enum(['1.7B', '0.6B', '1B', '3B', 'v2', 'v3']).optional(), instruct: z.string().max(500).optional(), - engine: z.enum(['qwen', 'luxtts', 'chatterbox', 'chatterbox_turbo', 'tada']).optional(), + engine: z + .enum(['qwen', 'luxtts', 'chatterbox', 'chatterbox_turbo', 'tada', 'cosyvoice']) + .optional(), }); export type GenerationFormValues = z.infer; @@ -83,7 +85,11 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) { ? data.modelSize === '3B' ? 'tada-3b-ml' : 'tada-1b' - : `qwen-tts-${data.modelSize}`; + : engine === 'cosyvoice' + ? data.modelSize === 'v3' + ? 'cosyvoice3-0.5b' + : 'cosyvoice2-0.5b' + : `qwen-tts-${data.modelSize}`; const displayName = engine === 'luxtts' ? 'LuxTTS' @@ -95,9 +101,13 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) { ? data.modelSize === '3B' ? 'TADA 3B Multilingual' : 'TADA 1B' - : data.modelSize === '1.7B' - ? 'Qwen TTS 1.7B' - : 'Qwen TTS 0.6B'; + : engine === 'cosyvoice' + ? data.modelSize === 'v3' + ? 'CosyVoice3 0.5B' + : 'CosyVoice2 0.5B' + : data.modelSize === '1.7B' + ? 
'Qwen TTS 1.7B' + : 'Qwen TTS 0.6B'; // Check if model needs downloading try { @@ -112,7 +122,7 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) { console.error('Failed to check model status:', error); } - const hasModelSizes = engine === 'qwen' || engine === 'tada'; + const hasModelSizes = engine === 'qwen' || engine === 'tada' || engine === 'cosyvoice'; const effectsChain = options.getEffectsChain?.(); // This now returns immediately with status="generating" const result = await generation.mutateAsync({ @@ -122,7 +132,8 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) { seed: data.seed, model_size: hasModelSizes ? data.modelSize : undefined, engine, - instruct: engine === 'qwen' ? data.instruct || undefined : undefined, + instruct: + engine === 'qwen' || engine === 'cosyvoice' ? data.instruct || undefined : undefined, max_chunk_chars: maxChunkChars, crossfade_ms: crossfadeMs, normalize: normalizeAudio, diff --git a/backend/backends/__init__.py b/backend/backends/__init__.py index a4f5113..d49f520 100644 --- a/backend/backends/__init__.py +++ b/backend/backends/__init__.py @@ -167,6 +167,7 @@ def is_loaded(self) -> bool: "chatterbox": "Chatterbox TTS", "chatterbox_turbo": "Chatterbox Turbo", "tada": "TADA", + "cosyvoice": "CosyVoice", } @@ -278,6 +279,26 @@ def _get_non_qwen_tts_configs() -> list[ModelConfig]: size_mb=8000, languages=["en", "ar", "zh", "de", "es", "fr", "it", "ja", "pl", "pt"], ), + ModelConfig( + model_name="cosyvoice2-0.5b", + display_name="CosyVoice2 0.5B (Multilingual, Instruct)", + engine="cosyvoice", + hf_repo_id="FunAudioLLM/CosyVoice2-0.5B", + model_size="v2", + size_mb=4600, + supports_instruct=True, + languages=["zh", "en", "ja", "ko", "de", "fr", "ru", "es", "it"], + ), + ModelConfig( + model_name="cosyvoice3-0.5b", + display_name="CosyVoice3 0.5B (Best Quality)", + engine="cosyvoice", + hf_repo_id="FunAudioLLM/Fun-CosyVoice3-0.5B-2512", + model_size="v3", + size_mb=4600, + 
supports_instruct=True, + languages=["zh", "en", "ja", "ko", "de", "fr", "ru", "es", "it"], + ), ] @@ -362,7 +383,7 @@ async def load_engine_model(engine: str, model_size: str = "default") -> None: backend = get_tts_backend_for_engine(engine) if engine == "qwen": await backend.load_model_async(model_size) - elif engine == "tada": + elif engine in ("tada", "cosyvoice"): await backend.load_model(model_size) else: await backend.load_model() @@ -379,7 +400,7 @@ async def ensure_model_cached_or_raise(engine: str, model_size: str = "default") cfg = c break - if engine in ("qwen", "tada"): + if engine in ("qwen", "tada", "cosyvoice"): if not backend._is_model_cached(model_size): raise HTTPException( status_code=400, @@ -454,6 +475,9 @@ def get_model_load_func(config: ModelConfig): if config.engine == "qwen": return lambda: tts.get_tts_model().load_model(config.model_size) + if config.engine in ("tada", "cosyvoice"): + return lambda: get_tts_backend_for_engine(config.engine).load_model(config.model_size) + return lambda: get_tts_backend_for_engine(config.engine).load_model() @@ -515,6 +539,10 @@ def get_tts_backend_for_engine(engine: str) -> TTSBackend: from .hume_backend import HumeTadaBackend backend = HumeTadaBackend() + elif engine == "cosyvoice": + from .cosyvoice_backend import CosyVoiceTTSBackend + + backend = CosyVoiceTTSBackend() else: raise ValueError(f"Unknown TTS engine: {engine}. Supported: {list(TTS_ENGINES.keys())}") diff --git a/backend/backends/cosyvoice_backend.py b/backend/backends/cosyvoice_backend.py new file mode 100644 index 0000000..62c49f9 --- /dev/null +++ b/backend/backends/cosyvoice_backend.py @@ -0,0 +1,433 @@ +""" +CosyVoice2 / CosyVoice3 TTS backend implementation. + +Wraps the upstream FunAudioLLM/CosyVoice library for zero-shot voice cloning +with instruct support (emotions, speed, volume, dialects). The CosyVoice repo +is cloned at setup time (``just setup-python``) and added to ``sys.path`` at +import time. 
def _ensure_cosyvoice_on_path() -> None:
    """Make the ``cosyvoice`` and ``matcha`` packages importable.

    Two layouts are supported:

    * **Source checkout** - the upstream repo is cloned to
      ``backend/vendors/CosyVoice``. The repo root and, when present, its
      ``third_party/Matcha-TTS`` directory are prepended to ``sys.path``.
    * **Frozen (PyInstaller) build** - the ``cosyvoice`` and ``matcha``
      package directories are bundled directly under ``sys._MEIPASS``, so no
      ``vendors`` checkout exists. The bundle root is put on ``sys.path``
      instead of raising.

    Raises:
        RuntimeError: If neither a vendored checkout nor a bundled
            ``cosyvoice`` package can be found.
    """
    backend_dir = Path(__file__).resolve().parent.parent  # backend/
    cosyvoice_root = backend_dir / "vendors" / "CosyVoice"

    if not cosyvoice_root.exists():
        # Frozen builds ship the packages at the bundle root rather than
        # under backend/vendors/, so the checkout test above always fails
        # there. Accept the bundled layout before giving up.
        bundle_root = getattr(sys, "_MEIPASS", None)
        if bundle_root and os.path.isdir(os.path.join(bundle_root, "cosyvoice")):
            if bundle_root not in sys.path:
                sys.path.insert(0, bundle_root)
            return

        raise RuntimeError(
            f"CosyVoice source not found at {cosyvoice_root}. "
            "Run `just setup-python` to clone it."
        )

    cosyvoice_str = str(cosyvoice_root)
    matcha_str = str(cosyvoice_root / "third_party" / "Matcha-TTS")

    if cosyvoice_str not in sys.path:
        sys.path.insert(0, cosyvoice_str)
    # Matcha-TTS is a git submodule; it is only added when it was checked out.
    if os.path.isdir(matcha_str) and matcha_str not in sys.path:
        sys.path.insert(0, matcha_str)
types.ModuleType("matcha.utils.utils") + fake_utils_utils.extras = _noop # type: ignore[attr-defined] + fake_utils_utils.get_metric_value = _noop # type: ignore[attr-defined] + fake_utils_utils.task_wrapper = lambda fn: fn # type: ignore[attr-defined] + + sys.modules["matcha.utils.pylogger"] = fake_pylogger + sys.modules["matcha.utils.logging_utils"] = fake_logging_utils + sys.modules["matcha.utils.rich_utils"] = fake_rich_utils + sys.modules["matcha.utils.instantiators"] = fake_instantiators + sys.modules["matcha.utils.utils"] = fake_utils_utils + + # ── cosyvoice.dataset.processor (training data pipeline) ────── + # Referenced 12 times in cosyvoice2.yaml / cosyvoice3.yaml via + # !name: tags. Imports pyarrow, pyworld, whisper at module level. + fake_dataset = types.ModuleType("cosyvoice.dataset") + fake_dataset.__path__ = [] # type: ignore[attr-defined] + fake_processor = types.ModuleType("cosyvoice.dataset.processor") + for _fn in ( + "parquet_opener", "tokenize", "filter", "resample", "truncate", + "compute_fbank", "compute_whisper_fbank", "compute_f0", + "parse_embedding", "shuffle", "sort", "batch", "padding", + ): + setattr(fake_processor, _fn, _noop) + + sys.modules.setdefault("cosyvoice.dataset", fake_dataset) + sys.modules["cosyvoice.dataset.processor"] = fake_processor + + + + +def _patch_modelscope_to_hf() -> None: + """ + Monkey-patch ``modelscope.snapshot_download`` → ``huggingface_hub.snapshot_download`` + so that CosyVoice's ``__init__`` downloads from HuggingFace instead of ModelScope. + + Also passes ``token=None`` to avoid HF auth prompts on public repos. + """ + import types + from huggingface_hub import snapshot_download as hf_snapshot_download + + def _hf_download(model_id, **kwargs): + kwargs.pop("revision", None) + kwargs.pop("model_version", None) + return hf_snapshot_download(model_id, token=None, **kwargs) + + # Create a fake "modelscope" module so ``from modelscope import snapshot_download`` works. 
class CosyVoiceTTSBackend:
    """CosyVoice2 / CosyVoice3 TTS backend for voice cloning with instruct support.

    An instance holds at most one loaded model at a time; ``load_model``
    unloads the current variant before loading a different one.
    """

    # The import shims / monkey-patches applied in ``_load_model_sync`` are
    # process-wide, so they are guarded by a class-level lock and run once.
    _import_lock: ClassVar[threading.Lock] = threading.Lock()
    _patched: ClassVar[bool] = False

    def __init__(self):
        self.model = None
        self._variant: Optional[str] = None  # "v2" or "v3"
        self._device: Optional[str] = None
        # Serialises concurrent ``load_model`` calls on this instance.
        self._model_load_lock = asyncio.Lock()

    def _get_device(self) -> str:
        """Return the torch device to load on."""
        # CosyVoice has no MPS support — force CPU on macOS
        return get_torch_device(force_cpu_on_mac=True)

    def is_loaded(self) -> bool:
        """Return ``True`` when a model is currently held in memory."""
        return self.model is not None

    def _get_model_path(self, model_size: str = "v2") -> str:
        """Return the HuggingFace repo id for ``model_size`` (unknown sizes map to v2)."""
        return COSYVOICE_HF_REPOS.get(model_size, COSYVOICE_HF_REPOS["v2"])

    def _is_model_cached(self, model_size: str = "v2") -> bool:
        """Return whether every required file of the variant is in the local cache.

        Unknown ``model_size`` values are treated as ``"v2"``.
        """
        variant = model_size if model_size in COSYVOICE_HF_REPOS else "v2"
        repo = COSYVOICE_HF_REPOS[variant]
        required = _REQUIRED_FILES[variant]
        return is_model_cached(repo, required_files=required)

    async def load_model(self, model_size: str = "v2") -> None:
        """Load a CosyVoice model variant.

        A no-op when the requested variant is already loaded. When a
        different variant is loaded it is unloaded first.

        Args:
            model_size: ``"v2"`` for CosyVoice2-0.5B or ``"v3"`` for
                CosyVoice3-0.5B. Any other value falls back to ``"v2"``.
        """
        variant = model_size if model_size in COSYVOICE_HF_REPOS else "v2"

        # Fast path: nothing to do, avoid taking the lock.
        if self.model is not None and self._variant == variant:
            return

        async with self._model_load_lock:
            # Re-check: another task may have finished loading while we waited.
            if self.model is not None and self._variant == variant:
                return
            # Unload previous variant if switching
            if self.model is not None:
                self.unload_model()
            await asyncio.to_thread(self._load_model_sync, variant)

    def _load_model_sync(self, variant: str) -> None:
        """Blocking model load; run in a worker thread by ``load_model``.

        Args:
            variant: ``"v2"`` or ``"v3"`` (already normalised by the caller).
        """
        generation = "2" if variant == "v2" else "3"
        model_name = f"cosyvoice{generation}-0.5b"
        is_cached = self._is_model_cached(variant)

        with model_load_progress(model_name, is_cached):
            device = self._get_device()
            self._device = device
            hf_repo = COSYVOICE_HF_REPOS[variant]
            logger.info(
                "Loading CosyVoice %s (%s) on %s...",
                generation,
                hf_repo,
                device,
            )

            # 1. Ensure cosyvoice source is on sys.path
            _ensure_cosyvoice_on_path()

            # 2. Patch imports (thread-safe, once)
            with CosyVoiceTTSBackend._import_lock:
                if not CosyVoiceTTSBackend._patched:
                    _shim_training_only_modules()
                    _patch_modelscope_to_hf()
                    _patch_torchaudio_load()
                    CosyVoiceTTSBackend._patched = True

            # 3. Patch torch.load to force map_location on CPU.
            #    The patch is process-wide and only lives for the duration of
            #    the constructor call below.
            import torch

            if device == "cpu":
                _orig_torch_load = torch.load

                def _patched_load(*args, **kwargs):
                    kwargs.setdefault("map_location", "cpu")
                    return _orig_torch_load(*args, **kwargs)

                torch.load = _patched_load

            try:
                if variant == "v2":
                    from cosyvoice.cli.cosyvoice import CosyVoice2

                    model = CosyVoice2(hf_repo)
                else:
                    from cosyvoice.cli.cosyvoice import CosyVoice3

                    model = CosyVoice3(hf_repo)
            finally:
                # Restore original torch.load
                if device == "cpu":
                    torch.load = _orig_torch_load

            self.model = model
            self._variant = variant

        logger.info("CosyVoice %s loaded successfully", generation)

    def unload_model(self) -> None:
        """Drop the loaded model (if any) and release cached CUDA memory."""
        if self.model is not None:
            device = self._device
            self.model = None
            self._variant = None
            self._device = None
            if device == "cuda":
                import torch

                torch.cuda.empty_cache()
            logger.info("CosyVoice unloaded")

    async def create_voice_prompt(
        self,
        audio_path: str,
        reference_text: str,
        use_cache: bool = True,
    ) -> Tuple[dict, bool]:
        """
        Create voice prompt from reference audio.

        The reference is handed to the model as a file path at generation
        time, so this only records the path and transcript.

        Args:
            audio_path: Path to the reference recording.
            reference_text: Transcript of the reference recording.
            use_cache: Accepted for interface compatibility; unused here.

        Returns:
            ``(voice_prompt, was_cached)`` where ``voice_prompt`` has the keys
            ``ref_audio`` and ``ref_text`` and ``was_cached`` is always ``False``.
        """
        voice_prompt = {
            "ref_audio": str(audio_path),
            "ref_text": reference_text,
        }
        return voice_prompt, False

    async def combine_voice_prompts(
        self,
        audio_paths: List[str],
        reference_texts: List[str],
    ) -> Tuple[np.ndarray, str]:
        """Combine several reference samples via the shared helper in ``.base``."""
        return await _combine_voice_prompts(audio_paths, reference_texts)

    async def generate(
        self,
        text: str,
        voice_prompt: dict,
        language: str = "en",
        seed: Optional[int] = None,
        instruct: Optional[str] = None,
    ) -> Tuple[np.ndarray, int]:
        """
        Generate audio using CosyVoice instruct2 (with cloning) or zero-shot.

        If ``instruct`` is provided, uses ``inference_instruct2()``;
        otherwise uses ``inference_zero_shot()``. Both clone the voice from
        the reference recording named in ``voice_prompt``.

        Args:
            text: Text to synthesize.
            voice_prompt: Dict with ``ref_audio`` path and ``ref_text``.
            language: Language code. Only logged here; kept for protocol
                compatibility.
            seed: Random seed for reproducibility (passed to
                ``torch.manual_seed``).
            instruct: Instruct text for style control, e.g.
                ``"Read with a happy tone, slowly."``.

        Returns:
            Tuple of (audio_array, sample_rate). When the model yields no
            chunks, one second of silence is returned.

        Raises:
            ValueError: If ``voice_prompt`` names no reference audio or the
                file does not exist.
        """
        ref_audio = voice_prompt.get("ref_audio")
        ref_text = voice_prompt.get("ref_text", "")

        # Every inference call below needs a prompt wav. An empty path cannot
        # be loaded, so fail up front with a clear message instead of loading
        # the model and failing later inside the upstream frontend.
        if not ref_audio:
            raise ValueError(
                "CosyVoice requires reference audio: voice_prompt has no 'ref_audio'"
            )
        if not Path(ref_audio).exists():
            raise ValueError(f"CosyVoice reference audio not found: {ref_audio}")

        await self.load_model(self._variant or "v2")

        def _generate_sync():
            import torch

            if seed is not None:
                torch.manual_seed(seed)

            if instruct:
                # instruct2: text + instruct + reference audio → cloned + styled
                logger.info("[CosyVoice] instruct2: lang=%s instruct=%s", language, instruct[:60])
                chunks = self.model.inference_instruct2(
                    tts_text=text,
                    instruct_text=instruct,
                    prompt_wav=ref_audio,
                    stream=False,
                    speed=1.0,
                )
            else:
                # zero-shot voice cloning
                logger.info("[CosyVoice] zero_shot: lang=%s", language)
                chunks = self.model.inference_zero_shot(
                    tts_text=text,
                    prompt_text=ref_text,
                    prompt_wav=ref_audio,
                    stream=False,
                    speed=1.0,
                )

            # The inference calls are generators; drain them fully.
            audio_chunks = [chunk["tts_speech"] for chunk in chunks]

            if not audio_chunks:
                return np.zeros(COSYVOICE_SAMPLE_RATE, dtype=np.float32), COSYVOICE_SAMPLE_RATE

            # Concatenate all chunks along the last axis and flatten.
            full_audio = torch.cat(audio_chunks, dim=-1)
            audio_np = full_audio.squeeze().cpu().numpy().astype(np.float32)

            return audio_np, COSYVOICE_SAMPLE_RATE

        return await asyncio.to_thread(_generate_sync)
+ cosyvoice_vendor = backend_dir / "vendors" / "CosyVoice" + if cosyvoice_vendor.exists(): + args.extend([ + "--add-data", + f"{cosyvoice_vendor / 'cosyvoice'}{os.pathsep}cosyvoice", + "--add-data", + f"{cosyvoice_vendor / 'third_party' / 'Matcha-TTS' / 'matcha'}{os.pathsep}matcha", + ]) + # Add CUDA-specific hidden imports if cuda: logger.info("Building with CUDA support") diff --git a/backend/models.py b/backend/models.py index 4dd2b36..d898101 100644 --- a/backend/models.py +++ b/backend/models.py @@ -66,9 +66,9 @@ class GenerationRequest(BaseModel): text: str = Field(..., min_length=1, max_length=50000) language: str = Field(default="en", pattern="^(zh|en|ja|ko|de|fr|ru|pt|es|it|he|ar|da|el|fi|hi|ms|nl|no|pl|sv|sw|tr)$") seed: Optional[int] = Field(None, ge=0) - model_size: Optional[str] = Field(default="1.7B", pattern="^(1\\.7B|0\\.6B|1B|3B)$") + model_size: Optional[str] = Field(default="1.7B", pattern="^(1\\.7B|0\\.6B|1B|3B|v2|v3)$") instruct: Optional[str] = Field(None, max_length=500) - engine: Optional[str] = Field(default="qwen", pattern="^(qwen|luxtts|chatterbox|chatterbox_turbo|tada)$") + engine: Optional[str] = Field(default="qwen", pattern="^(qwen|luxtts|chatterbox|chatterbox_turbo|tada|cosyvoice)$") max_chunk_chars: int = Field( default=800, ge=100, le=5000, description="Max characters per chunk for long text splitting" ) diff --git a/backend/requirements.txt b/backend/requirements.txt index d77f97b..d5ff724 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -40,6 +40,16 @@ pyloudnorm # provides the only class TADA uses: Snake1d.) 
torchaudio +# CosyVoice2/3 sub-dependencies (the cosyvoice source is cloned at +# setup time into backend/vendors/CosyVoice — no PyPI package exists) +hyperpyyaml>=1.2.0 +onnxruntime>=1.18.0 +openai-whisper>=20231117 +tiktoken +einops +inflect +matplotlib + # Audio processing librosa>=0.10.0 soundfile>=0.12.0 diff --git a/backend/server.py b/backend/server.py index bc6a81b..2bba5d1 100644 --- a/backend/server.py +++ b/backend/server.py @@ -39,6 +39,11 @@ def _is_writable(stream): _espeak_data = os.path.join(_meipass, 'piper_phonemize', 'espeak-ng-data') if os.path.isdir(_espeak_data): os.environ.setdefault('ESPEAK_DATA_PATH', _espeak_data) + # CosyVoice source + Matcha-TTS are bundled as --add-data into _MEIPASS. + # Add them to sys.path so ``from cosyvoice...`` and ``from matcha...`` + # resolve at runtime. + if os.path.isdir(os.path.join(_meipass, 'cosyvoice')): + sys.path.insert(0, _meipass) # Fast path: handle --version before any heavy imports so the Rust # version check doesn't block for 30+ seconds loading torch etc. diff --git a/justfile b/justfile index a1d4013..7b8e5e7 100644 --- a/justfile +++ b/justfile @@ -48,6 +48,12 @@ setup-python: {{ pip }} install --no-deps chatterbox-tts # HumeAI TADA pins torch>=2.7,<2.8 which conflicts with our torch>=2.1 {{ pip }} install --no-deps hume-tada + # CosyVoice: clone source into backend/vendors/ (no PyPI package exists) + if [ ! -d "{{ backend_dir }}/vendors/CosyVoice" ]; then + echo "Cloning CosyVoice source..." + mkdir -p {{ backend_dir }}/vendors + git clone --recursive --depth 1 https://github.com/FunAudioLLM/CosyVoice.git {{ backend_dir }}/vendors/CosyVoice + fi # Apple Silicon: install MLX backend if [ "$(uname -m)" = "arm64" ] && [ "$(uname)" = "Darwin" ]; then echo "Detected Apple Silicon — installing MLX dependencies..." 
@@ -77,6 +83,11 @@ setup-python: & "{{ pip }}" install -r {{ backend_dir }}/requirements.txt & "{{ pip }}" install --no-deps chatterbox-tts & "{{ pip }}" install --no-deps hume-tada + if (-not (Test-Path "{{ backend_dir }}/vendors/CosyVoice")) { \ + Write-Host "Cloning CosyVoice source..."; \ + New-Item -ItemType Directory -Force -Path "{{ backend_dir }}/vendors" | Out-Null; \ + git clone --recursive --depth 1 https://github.com/FunAudioLLM/CosyVoice.git "{{ backend_dir }}/vendors/CosyVoice"; \ + } & "{{ pip }}" install git+https://github.com/QwenLM/Qwen3-TTS.git & "{{ pip }}" install pyinstaller ruff pytest pytest-asyncio -q Write-Host "Python environment ready."