jamiepine · jamiepine · Mar 17, 2026 · Mar 18, 2026 · coderabbitai · Mar 17, 2026
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -63,6 +63,7 @@ jobs:
           pip install -r backend/requirements.txt
           pip install --no-deps chatterbox-tts
           pip install --no-deps hume-tada
+          git clone --recursive --depth 1 https://github.com/FunAudioLLM/CosyVoice.git backend/vendors/CosyVoice
 
       - name: Install MLX dependencies (Apple Silicon only)
         if: matrix.backend == 'mlx'
@@ -190,6 +191,7 @@ jobs:
           pip install -r backend/requirements.txt
           pip install --no-deps chatterbox-tts
           pip install --no-deps hume-tada
+          git clone --recursive --depth 1 https://github.com/FunAudioLLM/CosyVoice.git backend/vendors/CosyVoice
 
       - name: Install PyTorch with CUDA 12.6
         run: |

diff --git a/.gitignore b/.gitignore
@@ -59,6 +59,9 @@ tauri/src-tauri/gen/partial.plist
 # Windows artifacts
 nul
 
+# Vendored source clones (fetched at setup time)
+backend/vendors/
+
 # Temporary
 tmp/
 temp/

diff --git a/Dockerfile b/Dockerfile
@@ -39,6 +39,7 @@ RUN pip install --no-cache-dir --prefix=/install --no-deps chatterbox-tts
 RUN pip install --no-cache-dir --prefix=/install --no-deps hume-tada
 RUN pip install --no-cache-dir --prefix=/install \
     git+https://github.com/QwenLM/Qwen3-TTS.git
+RUN git clone --recursive --depth 1 https://github.com/FunAudioLLM/CosyVoice.git /build/CosyVoice && rm -rf /build/CosyVoice/.git
 
 
 # === Stage 3: Runtime ===
@@ -62,6 +63,9 @@ COPY --from=backend-builder /install /usr/local
 # Copy backend application code
 COPY --chown=voicebox:voicebox backend/ /app/backend/
 
+# Copy CosyVoice source from builder stage
+COPY --from=backend-builder --chown=voicebox:voicebox /build/CosyVoice/ /app/backend/vendors/CosyVoice/
+
 # Copy built frontend from frontend stage
 COPY --from=frontend --chown=voicebox:voicebox /build/web/dist /app/frontend/
 

diff --git a/app/src/components/Generation/EngineModelSelector.tsx b/app/src/components/Generation/EngineModelSelector.tsx
@@ -22,6 +22,8 @@ const ENGINE_OPTIONS = [
   { value: 'chatterbox_turbo', label: 'Chatterbox Turbo' },
   { value: 'tada:1B', label: 'TADA 1B' },
   { value: 'tada:3B', label: 'TADA 3B Multilingual' },
+  { value: 'cosyvoice:v2', label: 'CosyVoice2 0.5B' },
+  { value: 'cosyvoice:v3', label: 'CosyVoice3 0.5B' },
 ] as const;
 
 const ENGINE_DESCRIPTIONS: Record<string, string> = {
@@ -30,6 +32,7 @@ const ENGINE_DESCRIPTIONS: Record<string, string> = {
   chatterbox: '23 languages, incl. Hebrew',
   chatterbox_turbo: 'English, [laugh] [cough] tags',
   tada: 'HumeAI, 700s+ coherent audio',
+  cosyvoice: 'Alibaba, instruct + cloning',
 };
 
 /** Engines that only support English and should force language to 'en' on select. */
@@ -38,6 +41,7 @@ const ENGLISH_ONLY_ENGINES = new Set(['luxtts', 'chatterbox_turbo']);
 function getSelectValue(engine: string, modelSize?: string): string {
   if (engine === 'qwen') return `qwen:${modelSize || '1.7B'}`;
   if (engine === 'tada') return `tada:${modelSize || '1B'}`;
+  if (engine === 'cosyvoice') return `cosyvoice:${modelSize || 'v2'}`;
   return engine;
 }
 
@@ -66,6 +70,15 @@ function handleEngineChange(form: UseFormReturn<GenerationFormValues>, value: st
         form.setValue('language', available[0]?.value ?? 'en');
       }
     }
+  } else if (value.startsWith('cosyvoice:')) {
+    const [, modelSize] = value.split(':');
+    form.setValue('engine', 'cosyvoice');
+    form.setValue('modelSize', modelSize as 'v2' | 'v3');
+    const currentLang = form.getValues('language');
+    const available = getLanguageOptionsForEngine('cosyvoice');
+    if (!available.some((l) => l.value === currentLang)) {
+      form.setValue('language', available[0]?.value ?? 'en');
+    }
   } else {
     form.setValue('engine', value as GenerationFormValues['engine']);
     form.setValue('modelSize', undefined as unknown as '1.7B' | '0.6B');

diff --git a/app/src/components/ServerSettings/ModelManagement.tsx b/app/src/components/ServerSettings/ModelManagement.tsx
@@ -66,6 +66,10 @@ const MODEL_DESCRIPTIONS: Record<string, string> = {
     'HumeAI TADA 1B — English speech-language model built on Llama 3.2 1B. Generates 700s+ of coherent audio with synchronized text-acoustic alignment.',
   'tada-3b-ml':
     'HumeAI TADA 3B Multilingual — built on Llama 3.2 3B. Supports 10 languages with high-fidelity voice cloning via text-acoustic dual alignment.',
+  'cosyvoice2-0.5b':
+    'CosyVoice2 0.5B by Alibaba. Multilingual TTS with instruct support for emotions, speed, volume, and dialects. 9 languages with zero-shot voice cloning.',
+  'cosyvoice3-0.5b':
+    'Fun-CosyVoice3 0.5B by Alibaba. Improved robustness, prosody, and Chinese dialect support over CosyVoice2. Best quality for in-the-wild speech generation.',
   'whisper-base':
     'Smallest Whisper model (74M parameters). Fast transcription with moderate accuracy.',
   'whisper-small':
@@ -390,14 +394,7 @@ export function ModelManagement() {
     setDetailOpen(true);
   };
 
-  const voiceModels =
-    modelStatus?.models.filter(
-      (m) =>
-        m.model_name.startsWith('qwen-tts') ||
-        m.model_name.startsWith('luxtts') ||
-        m.model_name.startsWith('chatterbox') ||
-        m.model_name.startsWith('tada'),
-    ) ?? [];
+  const voiceModels = modelStatus?.models.filter((m) => !m.model_name.startsWith('whisper')) ?? [];
   const whisperModels = modelStatus?.models.filter((m) => m.model_name.startsWith('whisper')) ?? [];
 
   // Build sections

diff --git a/app/src/lib/api/types.ts b/app/src/lib/api/types.ts
@@ -42,8 +42,8 @@ export interface GenerationRequest {
   text: string;
   language: LanguageCode;
   seed?: number;
-  model_size?: '1.7B' | '0.6B' | '1B' | '3B';
-  engine?: 'qwen' | 'luxtts' | 'chatterbox' | 'chatterbox_turbo' | 'tada';
+  model_size?: '1.7B' | '0.6B' | '1B' | '3B' | 'v2' | 'v3';
+  engine?: 'qwen' | 'luxtts' | 'chatterbox' | 'chatterbox_turbo' | 'tada' | 'cosyvoice';
   instruct?: string;
   max_chunk_chars?: number;
   crossfade_ms?: number;

diff --git a/app/src/lib/constants/languages.ts b/app/src/lib/constants/languages.ts
@@ -67,6 +67,7 @@ export const ENGINE_LANGUAGES: Record<string, readonly LanguageCode[]> = {
   ],
   chatterbox_turbo: ['en'],
   tada: ['en', 'ar', 'zh', 'de', 'es', 'fr', 'it', 'ja', 'pl', 'pt'],
+  cosyvoice: ['zh', 'en', 'ja', 'ko', 'de', 'fr', 'ru', 'es', 'it'],
 } as const;
 
 /** Helper: get language options for a given engine. */

diff --git a/app/src/lib/hooks/useGenerationForm.ts b/app/src/lib/hooks/useGenerationForm.ts
@@ -15,9 +15,11 @@ const generationSchema = z.object({
   text: z.string().min(1, '').max(50000),
   language: z.enum(LANGUAGE_CODES as [LanguageCode, ...LanguageCode[]]),
   seed: z.number().int().optional(),
-  modelSize: z.enum(['1.7B', '0.6B', '1B', '3B']).optional(),
+  modelSize: z.enum(['1.7B', '0.6B', '1B', '3B', 'v2', 'v3']).optional(),
   instruct: z.string().max(500).optional(),
-  engine: z.enum(['qwen', 'luxtts', 'chatterbox', 'chatterbox_turbo', 'tada']).optional(),
+  engine: z
+    .enum(['qwen', 'luxtts', 'chatterbox', 'chatterbox_turbo', 'tada', 'cosyvoice'])
+    .optional(),
 });
 
 export type GenerationFormValues = z.infer<typeof generationSchema>;
@@ -83,7 +85,11 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) {
                 ? data.modelSize === '3B'
                   ? 'tada-3b-ml'
                   : 'tada-1b'
-                : `qwen-tts-${data.modelSize}`;
+                : engine === 'cosyvoice'
+                  ? data.modelSize === 'v3'
+                    ? 'cosyvoice3-0.5b'
+                    : 'cosyvoice2-0.5b'
+                  : `qwen-tts-${data.modelSize}`;
       const displayName =
         engine === 'luxtts'
           ? 'LuxTTS'
@@ -95,9 +101,13 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) {
                 ? data.modelSize === '3B'
                   ? 'TADA 3B Multilingual'
                   : 'TADA 1B'
-                : data.modelSize === '1.7B'
-                  ? 'Qwen TTS 1.7B'
-                  : 'Qwen TTS 0.6B';
+                : engine === 'cosyvoice'
+                  ? data.modelSize === 'v3'
+                    ? 'CosyVoice3 0.5B'
+                    : 'CosyVoice2 0.5B'
+                  : data.modelSize === '1.7B'
+                    ? 'Qwen TTS 1.7B'
+                    : 'Qwen TTS 0.6B';
 
       // Check if model needs downloading
       try {
@@ -112,7 +122,7 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) {
         console.error('Failed to check model status:', error);
       }
 
-      const hasModelSizes = engine === 'qwen' || engine === 'tada';
+      const hasModelSizes = engine === 'qwen' || engine === 'tada' || engine === 'cosyvoice';
       const effectsChain = options.getEffectsChain?.();
       // This now returns immediately with status="generating"
       const result = await generation.mutateAsync({
@@ -122,7 +132,8 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) {
         seed: data.seed,
         model_size: hasModelSizes ? data.modelSize : undefined,
         engine,
-        instruct: engine === 'qwen' ? data.instruct || undefined : undefined,
+        instruct:
+          engine === 'qwen' || engine === 'cosyvoice' ? data.instruct || undefined : undefined,
         max_chunk_chars: maxChunkChars,
         crossfade_ms: crossfadeMs,
         normalize: normalizeAudio,

diff --git a/backend/backends/__init__.py b/backend/backends/__init__.py
@@ -167,6 +167,7 @@ def is_loaded(self) -> bool:
     "chatterbox": "Chatterbox TTS",
     "chatterbox_turbo": "Chatterbox Turbo",
     "tada": "TADA",
+    "cosyvoice": "CosyVoice",
 }
 
 
@@ -278,6 +279,26 @@ def _get_non_qwen_tts_configs() -> list[ModelConfig]:
             size_mb=8000,
             languages=["en", "ar", "zh", "de", "es", "fr", "it", "ja", "pl", "pt"],
         ),
+        ModelConfig(
+            model_name="cosyvoice2-0.5b",
+            display_name="CosyVoice2 0.5B (Multilingual, Instruct)",
+            engine="cosyvoice",
+            hf_repo_id="FunAudioLLM/CosyVoice2-0.5B",
+            model_size="v2",
+            size_mb=4600,
+            supports_instruct=True,
+            languages=["zh", "en", "ja", "ko", "de", "fr", "ru", "es", "it"],
+        ),
+        ModelConfig(
+            model_name="cosyvoice3-0.5b",
+            display_name="CosyVoice3 0.5B (Best Quality)",
+            engine="cosyvoice",
+            hf_repo_id="FunAudioLLM/Fun-CosyVoice3-0.5B-2512",
+            model_size="v3",
+            size_mb=4600,
+            supports_instruct=True,
+            languages=["zh", "en", "ja", "ko", "de", "fr", "ru", "es", "it"],
+        ),
     ]
 
 
@@ -362,7 +383,7 @@ async def load_engine_model(engine: str, model_size: str = "default") -> None:
     backend = get_tts_backend_for_engine(engine)
     if engine == "qwen":
         await backend.load_model_async(model_size)
-    elif engine == "tada":
+    elif engine in ("tada", "cosyvoice"):
         await backend.load_model(model_size)
     else:
         await backend.load_model()
@@ -379,7 +400,7 @@ async def ensure_model_cached_or_raise(engine: str, model_size: str = "default")
             cfg = c
             break
 
-    if engine in ("qwen", "tada"):
+    if engine in ("qwen", "tada", "cosyvoice"):
         if not backend._is_model_cached(model_size):
             raise HTTPException(
                 status_code=400,
@@ -454,6 +475,9 @@ def get_model_load_func(config: ModelConfig):
     if config.engine == "qwen":
         return lambda: tts.get_tts_model().load_model(config.model_size)
 
+    if config.engine in ("tada", "cosyvoice"):
+        return lambda: get_tts_backend_for_engine(config.engine).load_model(config.model_size)
+
     return lambda: get_tts_backend_for_engine(config.engine).load_model()
 
 
@@ -515,6 +539,10 @@ def get_tts_backend_for_engine(engine: str) -> TTSBackend:
             from .hume_backend import HumeTadaBackend
 
             backend = HumeTadaBackend()
+        elif engine == "cosyvoice":
+            from .cosyvoice_backend import CosyVoiceTTSBackend
+
+            backend = CosyVoiceTTSBackend()
         else:
             raise ValueError(f"Unknown TTS engine: {engine}. Supported: {list(TTS_ENGINES.keys())}")