diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 1312ca3..8b2311f 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -63,6 +63,7 @@ jobs:
           pip install -r backend/requirements.txt
           pip install --no-deps chatterbox-tts
           pip install --no-deps hume-tada
+          git clone --recursive --depth 1 https://github.com/FunAudioLLM/CosyVoice.git backend/vendors/CosyVoice
 
       - name: Install MLX dependencies (Apple Silicon only)
         if: matrix.backend == 'mlx'
@@ -190,6 +191,7 @@ jobs:
           pip install -r backend/requirements.txt
           pip install --no-deps chatterbox-tts
           pip install --no-deps hume-tada
+          git clone --recursive --depth 1 https://github.com/FunAudioLLM/CosyVoice.git backend/vendors/CosyVoice
 
       - name: Install PyTorch with CUDA 12.6
         run: |
diff --git a/.gitignore b/.gitignore
index 130a7aa..285881a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -59,6 +59,9 @@ tauri/src-tauri/gen/partial.plist
 # Windows artifacts
 nul
 
+# Vendored source clones (fetched at setup time)
+backend/vendors/
+
 # Temporary
 tmp/
 temp/
diff --git a/Dockerfile b/Dockerfile
index 1ad85e5..d2cfb8a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -39,6 +39,7 @@ RUN pip install --no-cache-dir --prefix=/install --no-deps chatterbox-tts
 RUN pip install --no-cache-dir --prefix=/install --no-deps hume-tada
 RUN pip install --no-cache-dir --prefix=/install \
     git+https://github.com/QwenLM/Qwen3-TTS.git
+RUN git clone --recursive --depth 1 https://github.com/FunAudioLLM/CosyVoice.git /build/CosyVoice
 
 
 # === Stage 3: Runtime ===
@@ -62,6 +63,9 @@ COPY --from=backend-builder /install /usr/local
 # Copy backend application code
 COPY --chown=voicebox:voicebox backend/ /app/backend/
 
+# Copy CosyVoice source from builder stage
+COPY --from=backend-builder --chown=voicebox:voicebox /build/CosyVoice/ /app/backend/vendors/CosyVoice/
+
 # Copy built frontend from frontend stage
 COPY --from=frontend --chown=voicebox:voicebox /build/web/dist /app/frontend/
 
diff --git a/app/src/components/Generation/EngineModelSelector.tsx b/app/src/components/Generation/EngineModelSelector.tsx
index 4382d3f..efef063 100644
--- a/app/src/components/Generation/EngineModelSelector.tsx
+++ b/app/src/components/Generation/EngineModelSelector.tsx
@@ -22,6 +22,8 @@ const ENGINE_OPTIONS = [
   { value: 'chatterbox_turbo', label: 'Chatterbox Turbo' },
   { value: 'tada:1B', label: 'TADA 1B' },
   { value: 'tada:3B', label: 'TADA 3B Multilingual' },
+  { value: 'cosyvoice:v2', label: 'CosyVoice2 0.5B' },
+  { value: 'cosyvoice:v3', label: 'CosyVoice3 0.5B' },
 ] as const;
 
 const ENGINE_DESCRIPTIONS: Record<string, string> = {
@@ -30,6 +32,7 @@ const ENGINE_DESCRIPTIONS: Record<string, string> = {
   chatterbox: '23 languages, incl. Hebrew',
   chatterbox_turbo: 'English, [laugh] [cough] tags',
   tada: 'HumeAI, 700s+ coherent audio',
+  cosyvoice: 'Alibaba, instruct + cloning',
 };
 
 /** Engines that only support English and should force language to 'en' on select. */
@@ -38,6 +41,7 @@ const ENGLISH_ONLY_ENGINES = new Set(['luxtts', 'chatterbox_turbo']);
 function getSelectValue(engine: string, modelSize?: string): string {
   if (engine === 'qwen') return `qwen:${modelSize || '1.7B'}`;
   if (engine === 'tada') return `tada:${modelSize || '1B'}`;
+  if (engine === 'cosyvoice') return `cosyvoice:${modelSize || 'v2'}`;
   return engine;
 }
 
@@ -66,6 +70,15 @@ function handleEngineChange(form: UseFormReturn<GenerationFormValues>, value: st
         form.setValue('language', available[0]?.value ?? 'en');
       }
     }
+  } else if (value.startsWith('cosyvoice:')) {
+    const [, modelSize] = value.split(':');
+    form.setValue('engine', 'cosyvoice');
+    form.setValue('modelSize', modelSize as 'v2' | 'v3');
+    const currentLang = form.getValues('language');
+    const available = getLanguageOptionsForEngine('cosyvoice');
+    if (!available.some((l) => l.value === currentLang)) {
+      form.setValue('language', available[0]?.value ?? 'en');
+    }
   } else {
     form.setValue('engine', value as GenerationFormValues['engine']);
     form.setValue('modelSize', undefined as unknown as '1.7B' | '0.6B');
diff --git a/app/src/components/ServerSettings/ModelManagement.tsx b/app/src/components/ServerSettings/ModelManagement.tsx
index c415306..618be27 100644
--- a/app/src/components/ServerSettings/ModelManagement.tsx
+++ b/app/src/components/ServerSettings/ModelManagement.tsx
@@ -66,6 +66,10 @@ const MODEL_DESCRIPTIONS: Record<string, string> = {
     'HumeAI TADA 1B — English speech-language model built on Llama 3.2 1B. Generates 700s+ of coherent audio with synchronized text-acoustic alignment.',
   'tada-3b-ml':
     'HumeAI TADA 3B Multilingual — built on Llama 3.2 3B. Supports 10 languages with high-fidelity voice cloning via text-acoustic dual alignment.',
+  'cosyvoice2-0.5b':
+    'CosyVoice2 0.5B by Alibaba. Multilingual TTS with instruct support for emotions, speed, volume, and dialects. 9 languages with zero-shot voice cloning.',
+  'cosyvoice3-0.5b':
+    'Fun-CosyVoice3 0.5B by Alibaba. Improved robustness, prosody, and Chinese dialect support over CosyVoice2. Best quality for in-the-wild speech generation.',
   'whisper-base':
     'Smallest Whisper model (74M parameters). Fast transcription with moderate accuracy.',
   'whisper-small':
@@ -390,14 +394,7 @@ export function ModelManagement() {
     setDetailOpen(true);
   };
 
-  const voiceModels =
-    modelStatus?.models.filter(
-      (m) =>
-        m.model_name.startsWith('qwen-tts') ||
-        m.model_name.startsWith('luxtts') ||
-        m.model_name.startsWith('chatterbox') ||
-        m.model_name.startsWith('tada'),
-    ) ?? [];
+  const voiceModels = modelStatus?.models.filter((m) => !m.model_name.startsWith('whisper')) ?? [];
   const whisperModels = modelStatus?.models.filter((m) => m.model_name.startsWith('whisper')) ?? [];
 
   // Build sections
diff --git a/app/src/lib/api/types.ts b/app/src/lib/api/types.ts
index aa85d00..4b18f60 100644
--- a/app/src/lib/api/types.ts
+++ b/app/src/lib/api/types.ts
@@ -42,8 +42,8 @@ export interface GenerationRequest {
   text: string;
   language: LanguageCode;
   seed?: number;
-  model_size?: '1.7B' | '0.6B' | '1B' | '3B';
-  engine?: 'qwen' | 'luxtts' | 'chatterbox' | 'chatterbox_turbo' | 'tada';
+  model_size?: '1.7B' | '0.6B' | '1B' | '3B' | 'v2' | 'v3';
+  engine?: 'qwen' | 'luxtts' | 'chatterbox' | 'chatterbox_turbo' | 'tada' | 'cosyvoice';
   instruct?: string;
   max_chunk_chars?: number;
   crossfade_ms?: number;
diff --git a/app/src/lib/constants/languages.ts b/app/src/lib/constants/languages.ts
index a0d233a..b7bf9bd 100644
--- a/app/src/lib/constants/languages.ts
+++ b/app/src/lib/constants/languages.ts
@@ -67,6 +67,7 @@ export const ENGINE_LANGUAGES: Record<string, readonly LanguageCode[]> = {
   ],
   chatterbox_turbo: ['en'],
   tada: ['en', 'ar', 'zh', 'de', 'es', 'fr', 'it', 'ja', 'pl', 'pt'],
+  cosyvoice: ['zh', 'en', 'ja', 'ko', 'de', 'fr', 'ru', 'es', 'it'],
 } as const;
 
 /** Helper: get language options for a given engine. */
diff --git a/app/src/lib/hooks/useGenerationForm.ts b/app/src/lib/hooks/useGenerationForm.ts
index 8e73ce0..d59f7bc 100644
--- a/app/src/lib/hooks/useGenerationForm.ts
+++ b/app/src/lib/hooks/useGenerationForm.ts
@@ -15,9 +15,11 @@ const generationSchema = z.object({
   text: z.string().min(1, '').max(50000),
   language: z.enum(LANGUAGE_CODES as [LanguageCode, ...LanguageCode[]]),
   seed: z.number().int().optional(),
-  modelSize: z.enum(['1.7B', '0.6B', '1B', '3B']).optional(),
+  modelSize: z.enum(['1.7B', '0.6B', '1B', '3B', 'v2', 'v3']).optional(),
   instruct: z.string().max(500).optional(),
-  engine: z.enum(['qwen', 'luxtts', 'chatterbox', 'chatterbox_turbo', 'tada']).optional(),
+  engine: z
+    .enum(['qwen', 'luxtts', 'chatterbox', 'chatterbox_turbo', 'tada', 'cosyvoice'])
+    .optional(),
 });
 
 export type GenerationFormValues = z.infer<typeof generationSchema>;
@@ -83,7 +85,11 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) {
                 ? data.modelSize === '3B'
                   ? 'tada-3b-ml'
                   : 'tada-1b'
-                : `qwen-tts-${data.modelSize}`;
+                : engine === 'cosyvoice'
+                  ? data.modelSize === 'v3'
+                    ? 'cosyvoice3-0.5b'
+                    : 'cosyvoice2-0.5b'
+                  : `qwen-tts-${data.modelSize}`;
       const displayName =
         engine === 'luxtts'
           ? 'LuxTTS'
@@ -95,9 +101,13 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) {
                 ? data.modelSize === '3B'
                   ? 'TADA 3B Multilingual'
                   : 'TADA 1B'
-                : data.modelSize === '1.7B'
-                  ? 'Qwen TTS 1.7B'
-                  : 'Qwen TTS 0.6B';
+                : engine === 'cosyvoice'
+                  ? data.modelSize === 'v3'
+                    ? 'CosyVoice3 0.5B'
+                    : 'CosyVoice2 0.5B'
+                  : data.modelSize === '1.7B'
+                    ? 'Qwen TTS 1.7B'
+                    : 'Qwen TTS 0.6B';
 
       // Check if model needs downloading
       try {
@@ -112,7 +122,7 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) {
         console.error('Failed to check model status:', error);
       }
 
-      const hasModelSizes = engine === 'qwen' || engine === 'tada';
+      const hasModelSizes = engine === 'qwen' || engine === 'tada' || engine === 'cosyvoice';
       const effectsChain = options.getEffectsChain?.();
       // This now returns immediately with status="generating"
       const result = await generation.mutateAsync({
@@ -122,7 +132,8 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) {
         seed: data.seed,
         model_size: hasModelSizes ? data.modelSize : undefined,
         engine,
-        instruct: engine === 'qwen' ? data.instruct || undefined : undefined,
+        instruct:
+          engine === 'qwen' || engine === 'cosyvoice' ? data.instruct || undefined : undefined,
         max_chunk_chars: maxChunkChars,
         crossfade_ms: crossfadeMs,
         normalize: normalizeAudio,
diff --git a/backend/backends/__init__.py b/backend/backends/__init__.py
index a4f5113..d49f520 100644
--- a/backend/backends/__init__.py
+++ b/backend/backends/__init__.py
@@ -167,6 +167,7 @@ def is_loaded(self) -> bool:
     "chatterbox": "Chatterbox TTS",
     "chatterbox_turbo": "Chatterbox Turbo",
     "tada": "TADA",
+    "cosyvoice": "CosyVoice",
 }
 
 
@@ -278,6 +279,26 @@ def _get_non_qwen_tts_configs() -> list[ModelConfig]:
             size_mb=8000,
             languages=["en", "ar", "zh", "de", "es", "fr", "it", "ja", "pl", "pt"],
         ),
+        ModelConfig(
+            model_name="cosyvoice2-0.5b",
+            display_name="CosyVoice2 0.5B (Multilingual, Instruct)",
+            engine="cosyvoice",
+            hf_repo_id="FunAudioLLM/CosyVoice2-0.5B",
+            model_size="v2",
+            size_mb=4600,
+            supports_instruct=True,
+            languages=["zh", "en", "ja", "ko", "de", "fr", "ru", "es", "it"],
+        ),
+        ModelConfig(
+            model_name="cosyvoice3-0.5b",
+            display_name="CosyVoice3 0.5B (Best Quality)",
+            engine="cosyvoice",
+            hf_repo_id="FunAudioLLM/Fun-CosyVoice3-0.5B-2512",
+            model_size="v3",
+            size_mb=4600,
+            supports_instruct=True,
+            languages=["zh", "en", "ja", "ko", "de", "fr", "ru", "es", "it"],
+        ),
     ]
 
 
@@ -362,7 +383,7 @@ async def load_engine_model(engine: str, model_size: str = "default") -> None:
     backend = get_tts_backend_for_engine(engine)
     if engine == "qwen":
         await backend.load_model_async(model_size)
-    elif engine == "tada":
+    elif engine in ("tada", "cosyvoice"):
         await backend.load_model(model_size)
     else:
         await backend.load_model()
@@ -379,7 +400,7 @@ async def ensure_model_cached_or_raise(engine: str, model_size: str = "default")
             cfg = c
             break
 
-    if engine in ("qwen", "tada"):
+    if engine in ("qwen", "tada", "cosyvoice"):
         if not backend._is_model_cached(model_size):
             raise HTTPException(
                 status_code=400,
@@ -454,6 +475,9 @@ def get_model_load_func(config: ModelConfig):
     if config.engine == "qwen":
         return lambda: tts.get_tts_model().load_model(config.model_size)
 
+    if config.engine in ("tada", "cosyvoice"):
+        return lambda: get_tts_backend_for_engine(config.engine).load_model(config.model_size)
+
     return lambda: get_tts_backend_for_engine(config.engine).load_model()
 
 
@@ -515,6 +539,10 @@ def get_tts_backend_for_engine(engine: str) -> TTSBackend:
             from .hume_backend import HumeTadaBackend
 
             backend = HumeTadaBackend()
+        elif engine == "cosyvoice":
+            from .cosyvoice_backend import CosyVoiceTTSBackend
+
+            backend = CosyVoiceTTSBackend()
         else:
             raise ValueError(f"Unknown TTS engine: {engine}. Supported: {list(TTS_ENGINES.keys())}")
 
diff --git a/backend/backends/cosyvoice_backend.py b/backend/backends/cosyvoice_backend.py
new file mode 100644
index 0000000..62c49f9
--- /dev/null
+++ b/backend/backends/cosyvoice_backend.py
@@ -0,0 +1,433 @@
+"""
+CosyVoice2 / CosyVoice3 TTS backend implementation.
+
+Wraps the upstream FunAudioLLM/CosyVoice library for zero-shot voice cloning
+with instruct support (emotions, speed, volume, dialects).  The CosyVoice repo
+is cloned at setup time (``just setup-python``) and added to ``sys.path`` at
+import time.
+
+Model variants:
+    - CosyVoice2-0.5B: ``inference_instruct2()`` for 9-language cloning + instruct
+    - Fun-CosyVoice3-0.5B: improved robustness, prosody, and Chinese dialects
+
+Both variants share a single ``cosyvoice`` engine key; the ``model_size``
+field selects which HuggingFace checkpoint to download.
+"""
+
+import asyncio
+import logging
+import os
+import sys
+import threading
+from pathlib import Path
+from typing import ClassVar, List, Optional, Tuple
+
+import numpy as np
+
+from . import TTSBackend
+from .base import (
+    is_model_cached,
+    get_torch_device,
+    combine_voice_prompts as _combine_voice_prompts,
+    model_load_progress,
+)
+
+logger = logging.getLogger(__name__)
+
+# ── HuggingFace repos ─────────────────────────────────────────────────
+
+COSYVOICE_HF_REPOS = dict(
+    v2="FunAudioLLM/CosyVoice2-0.5B",
+    v3="FunAudioLLM/Fun-CosyVoice3-0.5B-2512",
+)
+
+# Files that must be present for CosyVoice2 / CosyVoice3
+_REQUIRED_FILES = {
+    key: ["llm.pt", "flow.pt", "hift.pt", f"cosyvoice{key[1:]}.yaml", "campplus.onnx"]
+    for key in COSYVOICE_HF_REPOS
+}
+
+# Model name → variant key
+_MODEL_NAME_TO_VARIANT = {
+    f"cosyvoice{key[1:]}-0.5b": key
+    for key in COSYVOICE_HF_REPOS
+}
+
+# Default sample rate (both models produce 24 kHz audio)
+COSYVOICE_SAMPLE_RATE = 24_000
+
+
+def _ensure_cosyvoice_on_path() -> None:
+    """Make the ``cosyvoice`` and ``matcha`` packages importable via ``sys.path``.
+
+    Frozen builds ship both packages at the root of ``sys._MEIPASS``; otherwise the
+    ``backend/vendors/CosyVoice`` clone is used, and RuntimeError is raised if it is absent.
+    """
+    bundle_root = getattr(sys, "_MEIPASS", None)
+    if bundle_root and os.path.isdir(os.path.join(bundle_root, "cosyvoice")):
+        roots = [bundle_root]  # frozen build: nothing lives under backend/vendors/
+    else:
+        cosyvoice_root = Path(__file__).resolve().parent.parent / "vendors" / "CosyVoice"
+        if not cosyvoice_root.exists():
+            raise RuntimeError(f"CosyVoice source not found at {cosyvoice_root}. Run `just setup-python` to clone it.")
+        matcha_root = cosyvoice_root / "third_party" / "Matcha-TTS"
+        roots = [str(cosyvoice_root)] + ([str(matcha_root)] if matcha_root.is_dir() else [])
+    for entry in roots:
+        if entry not in sys.path:
+            sys.path.insert(0, entry)
+
+
+def _shim_training_only_modules() -> None:
+    """
+    Register stand-in modules in ``sys.modules`` for training-only imports.
+
+    The CosyVoice YAML configs are parsed with ``hyperpyyaml``, which
+    imports the target of every ``!name:`` / ``!new:`` tag.  That drags in
+    ``cosyvoice.dataset.processor`` and, with it, training dependencies
+    such as ``pyarrow`` and ``pyworld``.  Some ``matcha.utils.*`` modules
+    likewise import ``lightning.pytorch`` when loaded.
+
+    The stubs below are installed before those imports happen so the
+    inference code paths load without the training packages.  The
+    ``matcha.utils.*`` and ``cosyvoice.dataset.processor`` entries always
+    overwrite; ``cosyvoice.dataset`` is only added when absent.
+    """
+    import types
+    import logging as _logging
+
+    def _noop(*a, **kw):
+        return None
+
+    def _empty_list(*a, **kw):
+        return []
+
+    def get_pylogger(name: str = __name__) -> _logging.Logger:
+        return _logging.getLogger(name)
+
+    def _make_stub(module_name: str, **attrs) -> types.ModuleType:
+        stub = types.ModuleType(module_name)
+        for attr_name, attr_value in attrs.items():
+            setattr(stub, attr_name, attr_value)
+        return stub
+
+    # matcha.utils submodules whose real versions import lightning.
+    matcha_stubs = (
+        _make_stub("matcha.utils.pylogger", get_pylogger=get_pylogger),
+        _make_stub("matcha.utils.logging_utils", log_hyperparameters=_noop),
+        _make_stub("matcha.utils.rich_utils", enforce_tags=_noop, print_config_tree=_noop),
+        _make_stub(
+            "matcha.utils.instantiators",
+            instantiate_callbacks=_empty_list,
+            instantiate_loggers=_empty_list,
+        ),
+        _make_stub(
+            "matcha.utils.utils",
+            extras=_noop,
+            get_metric_value=_noop,
+            task_wrapper=lambda fn: fn,
+        ),
+    )
+    for stub in matcha_stubs:
+        sys.modules[stub.__name__] = stub
+
+    # Training data pipeline referenced by the YAML configs via !name: tags.
+    processor_fns = (
+        "parquet_opener", "tokenize", "filter", "resample", "truncate",
+        "compute_fbank", "compute_whisper_fbank", "compute_f0",
+        "parse_embedding", "shuffle", "sort", "batch", "padding",
+    )
+    dataset_stub = _make_stub("cosyvoice.dataset", __path__=[])
+    processor_stub = _make_stub("cosyvoice.dataset.processor", **dict.fromkeys(processor_fns, _noop))
+
+    sys.modules.setdefault("cosyvoice.dataset", dataset_stub)
+    sys.modules["cosyvoice.dataset.processor"] = processor_stub
+
+
+
+
+def _patch_modelscope_to_hf() -> None:
+    """
+    Register a stand-in ``modelscope`` module that downloads from HuggingFace.
+
+    Its ``snapshot_download`` forwards to ``huggingface_hub.snapshot_download``
+    with ``token=None`` and without the ``revision`` / ``model_version``
+    keyword arguments, so ``from modelscope import snapshot_download`` works.
+    """
+    import types
+    from huggingface_hub import snapshot_download as hf_snapshot_download
+
+    def _hf_download(model_id, **kwargs):
+        forwarded = {k: v for k, v in kwargs.items() if k not in ("revision", "model_version")}
+        return hf_snapshot_download(model_id, token=None, **forwarded)
+
+    # Overwrites whatever ``modelscope`` entry sys.modules held before.
+    shim = types.ModuleType("modelscope")
+    shim.snapshot_download = _hf_download
+    sys.modules["modelscope"] = shim
+
+
+def _patch_torchaudio_load() -> None:
+    """
+    Swap ``torchaudio.load`` for a reader built on ``soundfile``.
+
+    torchaudio >= 2.9 unconditionally delegates ``load`` to TorchCodec and
+    ignores the ``backend`` parameter, so CosyVoice's call
+    ``torchaudio.load(wav, backend='soundfile')`` fails unless
+    ``torchcodec`` is installed.  The replacement keeps the
+    ``(Tensor, sample_rate)`` return shape; ``normalize``, ``format``,
+    ``buffer_size`` and ``backend`` are accepted but ignored.
+    """
+    import torch
+    import torchaudio
+    import soundfile as sf
+
+    def _sf_load(uri, frame_offset=0, num_frames=-1, normalize=True,
+                 channels_first=True, format=None, buffer_size=4096,
+                 backend=None):
+        # A negative frame count means "read through to the end of the file".
+        stop_frame = frame_offset + num_frames if num_frames >= 0 else None
+        samples, sample_rate = sf.read(
+            uri, start=frame_offset, stop=stop_frame, dtype="float32", always_2d=True
+        )
+        waveform = torch.from_numpy(samples)  # (frames, channels)
+        return (waveform.T if channels_first else waveform), sample_rate
+
+    torchaudio.load = _sf_load
+
+
+class CosyVoiceTTSBackend:
+    """CosyVoice2 / CosyVoice3 TTS backend for voice cloning with instruct support."""
+
+    # Class-level lock + flag so the import patches are applied once per process
+    _import_lock: ClassVar[threading.Lock] = threading.Lock()
+    _patched: ClassVar[bool] = False
+
+    def __init__(self):
+        """Start with no model loaded; ``load_model`` fills in the state below."""
+        self.model = None
+        self._variant: Optional[str] = None  # "v2" or "v3"
+        self._device: Optional[str] = None
+        self._model_load_lock = asyncio.Lock()
+
+    def _get_device(self) -> str:
+        # CosyVoice has no MPS support — force CPU on macOS
+        return get_torch_device(force_cpu_on_mac=True)
+
+    def is_loaded(self) -> bool:
+        """Return True while a model instance is held."""
+        return self.model is not None
+
+    def _get_model_path(self, model_size: str = "v2") -> str:
+        """Return the HuggingFace repo id for ``model_size`` (unknown keys map to v2)."""
+        return COSYVOICE_HF_REPOS.get(model_size, COSYVOICE_HF_REPOS["v2"])
+
+    def _is_model_cached(self, model_size: str = "v2") -> bool:
+        """Ask ``is_model_cached`` about the variant's repo and required files (unknown keys map to v2)."""
+        variant = model_size if model_size in COSYVOICE_HF_REPOS else "v2"
+        repo = COSYVOICE_HF_REPOS[variant]
+        required = _REQUIRED_FILES[variant]
+        return is_model_cached(repo, required_files=required)
+
+    async def load_model(self, model_size: str = "v2") -> None:
+        """Load a CosyVoice model variant.
+
+        Args:
+            model_size: ``"v2"`` for CosyVoice2-0.5B or ``"v3"`` for CosyVoice3-0.5B.
+        """
+        variant = model_size if model_size in COSYVOICE_HF_REPOS else "v2"
+
+        # If already loaded with the right variant, skip
+        if self.model is not None and self._variant == variant:
+            return
+
+        async with self._model_load_lock:
+            if self.model is not None and self._variant == variant:
+                return
+            # Unload previous variant if switching
+            if self.model is not None:
+                self.unload_model()
+            await asyncio.to_thread(self._load_model_sync, variant)
+
+    def _load_model_sync(self, variant: str) -> None:
+        """Blocking load of ``variant``; ``load_model`` runs it in a worker thread.
+
+        Puts the CosyVoice source on ``sys.path``, applies the one-time import
+        patches, then builds the upstream model object and stores it on ``self``.
+        """
+        version = "2" if variant == "v2" else "3"
+        model_name = f"cosyvoice{version}-0.5b"
+        is_cached = self._is_model_cached(variant)
+
+        with model_load_progress(model_name, is_cached):
+            device = self._get_device()
+            self._device = device
+            hf_repo = COSYVOICE_HF_REPOS[variant]
+            logger.info("Loading CosyVoice %s (%s) on %s...", version, hf_repo, device)
+
+            # 1. Ensure cosyvoice source is on sys.path
+            _ensure_cosyvoice_on_path()
+
+            # 2. Patch imports (thread-safe, once)
+            with CosyVoiceTTSBackend._import_lock:
+                if not CosyVoiceTTSBackend._patched:
+                    _shim_training_only_modules()
+                    _patch_modelscope_to_hf()
+                    _patch_torchaudio_load()
+                    CosyVoiceTTSBackend._patched = True
+
+            # 3. Patch torch.load to force map_location on CPU
+            import torch
+
+            if device == "cpu":
+                _orig_torch_load = torch.load
+
+                def _patched_load(*args, **kwargs):
+                    kwargs.setdefault("map_location", "cpu")
+                    return _orig_torch_load(*args, **kwargs)
+
+                torch.load = _patched_load
+
+            try:
+                if variant == "v2":
+                    from cosyvoice.cli.cosyvoice import CosyVoice2
+
+                    model = CosyVoice2(hf_repo)
+                else:
+                    from cosyvoice.cli.cosyvoice import CosyVoice3
+
+                    model = CosyVoice3(hf_repo)
+            finally:
+                # Restore original torch.load
+                if device == "cpu":
+                    torch.load = _orig_torch_load
+
+            self.model = model
+            self._variant = variant
+
+        logger.info("CosyVoice %s loaded successfully", version)
+
+    def unload_model(self) -> None:
+        """Unload model to free memory."""
+        if self.model is not None:
+            device = self._device
+            del self.model
+            self.model = None
+            self._variant = None
+            self._device = None
+            if device == "cuda":
+                import torch
+
+                torch.cuda.empty_cache()
+            logger.info("CosyVoice unloaded")
+
+    async def create_voice_prompt(
+        self,
+        audio_path: str,
+        reference_text: str,
+        use_cache: bool = True,
+    ) -> Tuple[dict, bool]:
+        """
+        Create voice prompt from reference audio.
+
+        CosyVoice processes the reference at generation time via
+        ``frontend_zero_shot`` / ``frontend_instruct2``, so we just
+        store the path + text for later use.
+        """
+        voice_prompt = {
+            "ref_audio": str(audio_path),
+            "ref_text": reference_text,
+        }
+        return voice_prompt, False
+
+    async def combine_voice_prompts(
+        self,
+        audio_paths: List[str],
+        reference_texts: List[str],
+    ) -> Tuple[np.ndarray, str]:
+        """Delegate to the shared ``combine_voice_prompts`` helper."""
+        return await _combine_voice_prompts(audio_paths, reference_texts)
+
+    async def generate(
+        self,
+        text: str,
+        voice_prompt: dict,
+        language: str = "en",
+        seed: Optional[int] = None,
+        instruct: Optional[str] = None,
+    ) -> Tuple[np.ndarray, int]:
+        """
+        Generate audio using CosyVoice instruct2 (with cloning) or zero-shot.
+
+        If ``instruct`` is provided, uses ``inference_instruct2()`` which
+        supports emotion, speed, volume, and dialect control.
+        Otherwise falls back to ``inference_zero_shot()``.
+
+        Args:
+            text: Text to synthesize.
+            voice_prompt: Dict with ``ref_audio`` path and ``ref_text``.
+            language: BCP-47 language code (unused by CosyVoice directly,
+                      but kept for protocol compatibility).
+            seed: Random seed for reproducibility.
+            instruct: Instruct text for style control, e.g.
+                      ``"Read with a happy tone, slowly."``.
+
+        Returns:
+            Tuple of (audio_array, sample_rate).
+
+        Raises:
+            ValueError: If ``voice_prompt`` has no ``ref_audio`` or the file
+                it names does not exist.
+        """
+        ref_audio = voice_prompt.get("ref_audio")
+        ref_text = voice_prompt.get("ref_text", "")
+
+        # Both inference calls below are given the reference clip path, so reject a
+        # missing one up front instead of handing upstream an empty path.
+        if not ref_audio or not Path(ref_audio).exists():
+            raise ValueError(f"CosyVoice requires an existing reference audio file, got {ref_audio!r}")
+
+        await self.load_model(self._variant or "v2")
+
+        def _generate_sync():
+            import torch
+
+            if seed is not None:
+                torch.manual_seed(seed)
+
+            # Collect all chunks from the generator
+            audio_chunks = []
+
+            if instruct:
+                # instruct2: text + instruct + reference audio → cloned + styled
+                logger.info("[CosyVoice] instruct2: lang=%s instruct=%s", language, instruct[:60])
+                for chunk in self.model.inference_instruct2(
+                    tts_text=text,
+                    instruct_text=instruct,
+                    prompt_wav=ref_audio,
+                    stream=False,
+                    speed=1.0,
+                ):
+                    audio_chunks.append(chunk["tts_speech"])
+            else:
+                # zero-shot voice cloning
+                logger.info("[CosyVoice] zero_shot: lang=%s", language)
+                for chunk in self.model.inference_zero_shot(
+                    tts_text=text,
+                    prompt_text=ref_text,
+                    prompt_wav=ref_audio,
+                    stream=False,
+                    speed=1.0,
+                ):
+                    audio_chunks.append(chunk["tts_speech"])
+
+            # Concatenate all chunks
+            if not audio_chunks:
+                # Nothing was yielded: fall back to one second of silence
+                return np.zeros(COSYVOICE_SAMPLE_RATE, dtype=np.float32), COSYVOICE_SAMPLE_RATE
+
+            full_audio = torch.cat(audio_chunks, dim=-1)
+            audio_np = full_audio.squeeze().cpu().numpy().astype(np.float32)
+
+            return audio_np, COSYVOICE_SAMPLE_RATE
+
+        return await asyncio.to_thread(_generate_sync)
diff --git a/backend/build_binary.py b/backend/build_binary.py
index 901514f..05b1ebd 100644
--- a/backend/build_binary.py
+++ b/backend/build_binary.py
@@ -228,9 +228,40 @@ def build_server(cuda=False):
             "torchaudio",
             "--collect-submodules",
             "tada",
+            # CosyVoice2/3 — Alibaba TTS with instruct + cloning
+            "--hidden-import",
+            "backend.backends.cosyvoice_backend",
+            # hyperpyyaml dynamically instantiates classes from YAML —
+            # needs source files and the ruamel.yaml backend
+            "--collect-all",
+            "hyperpyyaml",
+            # onnxruntime ships native shared libraries + provider plugins
+            "--collect-all",
+            "onnxruntime",
+            "--copy-metadata",
+            "onnxruntime",
+            # openai-whisper ships mel filter assets and uses tiktoken
+            "--collect-all",
+            "whisper",
+            "--collect-all",
+            "tiktoken",
+            # einops used by CosyVoice flow/decoder
+            "--hidden-import",
+            "einops",
         ]
     )
 
+    # Bundle the vendored CosyVoice source tree for frozen builds.
+    # The clone lives at backend/vendors/CosyVoice/ at build time.
+    cosyvoice_vendor = backend_dir / "vendors" / "CosyVoice"
+    if cosyvoice_vendor.exists():
+        args.extend([
+            "--add-data",
+            f"{cosyvoice_vendor / 'cosyvoice'}{os.pathsep}cosyvoice",
+            "--add-data",
+            f"{cosyvoice_vendor / 'third_party' / 'Matcha-TTS' / 'matcha'}{os.pathsep}matcha",
+        ])
+
     # Add CUDA-specific hidden imports
     if cuda:
         logger.info("Building with CUDA support")
diff --git a/backend/models.py b/backend/models.py
index 4dd2b36..d898101 100644
--- a/backend/models.py
+++ b/backend/models.py
@@ -66,9 +66,9 @@ class GenerationRequest(BaseModel):
     text: str = Field(..., min_length=1, max_length=50000)
     language: str = Field(default="en", pattern="^(zh|en|ja|ko|de|fr|ru|pt|es|it|he|ar|da|el|fi|hi|ms|nl|no|pl|sv|sw|tr)$")
     seed: Optional[int] = Field(None, ge=0)
-    model_size: Optional[str] = Field(default="1.7B", pattern="^(1\\.7B|0\\.6B|1B|3B)$")
+    model_size: Optional[str] = Field(default="1.7B", pattern="^(1\\.7B|0\\.6B|1B|3B|v2|v3)$")
     instruct: Optional[str] = Field(None, max_length=500)
-    engine: Optional[str] = Field(default="qwen", pattern="^(qwen|luxtts|chatterbox|chatterbox_turbo|tada)$")
+    engine: Optional[str] = Field(default="qwen", pattern="^(qwen|luxtts|chatterbox|chatterbox_turbo|tada|cosyvoice)$")
     max_chunk_chars: int = Field(
         default=800, ge=100, le=5000, description="Max characters per chunk for long text splitting"
     )
diff --git a/backend/requirements.txt b/backend/requirements.txt
index d77f97b..d5ff724 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -40,6 +40,16 @@ pyloudnorm
 # provides the only class TADA uses: Snake1d.)
 torchaudio
 
+# CosyVoice2/3 sub-dependencies (the cosyvoice source is cloned at
+# setup time into backend/vendors/CosyVoice — no PyPI package exists)
+hyperpyyaml>=1.2.0
+onnxruntime>=1.18.0
+openai-whisper>=20231117
+tiktoken
+einops
+inflect
+matplotlib
+
 # Audio processing
 librosa>=0.10.0
 soundfile>=0.12.0
diff --git a/backend/server.py b/backend/server.py
index bc6a81b..2bba5d1 100644
--- a/backend/server.py
+++ b/backend/server.py
@@ -39,6 +39,11 @@ def _is_writable(stream):
     _espeak_data = os.path.join(_meipass, 'piper_phonemize', 'espeak-ng-data')
     if os.path.isdir(_espeak_data):
         os.environ.setdefault('ESPEAK_DATA_PATH', _espeak_data)
+    # CosyVoice source + Matcha-TTS are bundled as --add-data into _MEIPASS.
+    # Add them to sys.path so ``from cosyvoice...`` and ``from matcha...``
+    # resolve at runtime.
+    if os.path.isdir(os.path.join(_meipass, 'cosyvoice')):
+        sys.path.insert(0, _meipass)
 
 # Fast path: handle --version before any heavy imports so the Rust
 # version check doesn't block for 30+ seconds loading torch etc.
diff --git a/justfile b/justfile
index a1d4013..7b8e5e7 100644
--- a/justfile
+++ b/justfile
@@ -48,6 +48,12 @@ setup-python:
     {{ pip }} install --no-deps chatterbox-tts
     # HumeAI TADA pins torch>=2.7,<2.8 which conflicts with our torch>=2.1
     {{ pip }} install --no-deps hume-tada
+    # CosyVoice: clone source into backend/vendors/ (no PyPI package exists)
+    if [ ! -d "{{ backend_dir }}/vendors/CosyVoice" ]; then
+        echo "Cloning CosyVoice source..."
+        mkdir -p {{ backend_dir }}/vendors
+        git clone --recursive --depth 1 https://github.com/FunAudioLLM/CosyVoice.git {{ backend_dir }}/vendors/CosyVoice
+    fi
     # Apple Silicon: install MLX backend
     if [ "$(uname -m)" = "arm64" ] && [ "$(uname)" = "Darwin" ]; then
         echo "Detected Apple Silicon — installing MLX dependencies..."
@@ -77,6 +83,11 @@ setup-python:
     & "{{ pip }}" install -r {{ backend_dir }}/requirements.txt
     & "{{ pip }}" install --no-deps chatterbox-tts
     & "{{ pip }}" install --no-deps hume-tada
+    if (-not (Test-Path "{{ backend_dir }}/vendors/CosyVoice")) { \
+        Write-Host "Cloning CosyVoice source..."; \
+        New-Item -ItemType Directory -Force -Path "{{ backend_dir }}/vendors" | Out-Null; \
+        git clone --recursive --depth 1 https://github.com/FunAudioLLM/CosyVoice.git "{{ backend_dir }}/vendors/CosyVoice"; \
+    }
     & "{{ pip }}" install git+https://github.com/QwenLM/Qwen3-TTS.git
     & "{{ pip }}" install pyinstaller ruff pytest pytest-asyncio -q
     Write-Host "Python environment ready."