Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ jobs:
pip install -r backend/requirements.txt
pip install --no-deps chatterbox-tts
pip install --no-deps hume-tada
git clone --recursive --depth 1 https://github.com/FunAudioLLM/CosyVoice.git backend/vendors/CosyVoice
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
test -d backend/vendors && echo "backend/vendors exists" || echo "backend/vendors missing"
rg -n "backend/vendors/CosyVoice" .github/workflows/release.yml

Repository: jamiepine/voicebox

Length of output: 318


🏁 Script executed:

cat -n .gitignore | grep -i "backend/vendors"

Repository: jamiepine/voicebox

Length of output: 85


🏁 Script executed:

# Check if justfile exists and contains backend/vendors directory creation
if [ -f justfile ]; then
  cat -n justfile | grep -A 3 -B 3 "backend/vendors"
else
  echo "justfile not found"
fi

Repository: jamiepine/voicebox

Length of output: 497


🏁 Script executed:

# Check the workflow file around lines 66 and 194 for context
sed -n '50,75p' .github/workflows/release.yml
echo "---"
sed -n '178,210p' .github/workflows/release.yml

Repository: jamiepine/voicebox

Length of output: 2329


🏁 Script executed:

# Verify git clone behavior with missing parent directory
git clone --help 2>&1 | grep -i "parent\|directory" | head -5

Repository: jamiepine/voicebox

Length of output: 118


Create backend/vendors before cloning.

Fresh checkouts do not contain backend/vendors (it is ignored by .gitignore), and git clone ... backend/vendors/CosyVoice will fail when the parent path is missing. This breaks both release jobs before the server build starts.

🛠️ Cross-platform fix for both clone sites
          pip install --no-deps chatterbox-tts
          pip install --no-deps hume-tada
+         python -c "from pathlib import Path; Path('backend/vendors').mkdir(parents=True, exist_ok=True)"
          git clone --recursive --depth 1 https://github.com/FunAudioLLM/CosyVoice.git backend/vendors/CosyVoice

Also applies to: 194-194

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In @.github/workflows/release.yml at line 66, the git clone command "git clone
--recursive --depth 1 https://github.com/FunAudioLLM/CosyVoice.git
backend/vendors/CosyVoice" will fail when its parent directory doesn't exist—add
a step to create the parent directory before cloning (e.g., run a cross-platform
create like mkdir -p backend/vendors or the equivalent Windows-aware command)
and apply the same change to the other clone occurrence of the same command;
update the workflow so the directory creation runs immediately before each git
clone of backend/vendors/CosyVoice.


- name: Install MLX dependencies (Apple Silicon only)
if: matrix.backend == 'mlx'
Expand Down Expand Up @@ -190,6 +191,7 @@ jobs:
pip install -r backend/requirements.txt
pip install --no-deps chatterbox-tts
pip install --no-deps hume-tada
git clone --recursive --depth 1 https://github.com/FunAudioLLM/CosyVoice.git backend/vendors/CosyVoice

- name: Install PyTorch with CUDA 12.6
run: |
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ tauri/src-tauri/gen/partial.plist
# Windows artifacts
nul

# Vendored source clones (fetched at setup time)
backend/vendors/

# Temporary
tmp/
temp/
Expand Down
4 changes: 4 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ RUN pip install --no-cache-dir --prefix=/install --no-deps chatterbox-tts
RUN pip install --no-cache-dir --prefix=/install --no-deps hume-tada
RUN pip install --no-cache-dir --prefix=/install \
git+https://github.com/QwenLM/Qwen3-TTS.git
RUN git clone --recursive --depth 1 https://github.com/FunAudioLLM/CosyVoice.git /build/CosyVoice


# === Stage 3: Runtime ===
Expand All @@ -62,6 +63,9 @@ COPY --from=backend-builder /install /usr/local
# Copy backend application code
COPY --chown=voicebox:voicebox backend/ /app/backend/

# Copy CosyVoice source from builder stage
COPY --from=backend-builder --chown=voicebox:voicebox /build/CosyVoice/ /app/backend/vendors/CosyVoice/

# Copy built frontend from frontend stage
COPY --from=frontend --chown=voicebox:voicebox /build/web/dist /app/frontend/

Expand Down
13 changes: 13 additions & 0 deletions app/src/components/Generation/EngineModelSelector.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ const ENGINE_OPTIONS = [
{ value: 'chatterbox_turbo', label: 'Chatterbox Turbo' },
{ value: 'tada:1B', label: 'TADA 1B' },
{ value: 'tada:3B', label: 'TADA 3B Multilingual' },
{ value: 'cosyvoice:v2', label: 'CosyVoice2 0.5B' },
{ value: 'cosyvoice:v3', label: 'CosyVoice3 0.5B' },
] as const;

const ENGINE_DESCRIPTIONS: Record<string, string> = {
Expand All @@ -30,6 +32,7 @@ const ENGINE_DESCRIPTIONS: Record<string, string> = {
chatterbox: '23 languages, incl. Hebrew',
chatterbox_turbo: 'English, [laugh] [cough] tags',
tada: 'HumeAI, 700s+ coherent audio',
cosyvoice: 'Alibaba, instruct + cloning',
};

/** Engines that only support English and should force language to 'en' on select. */
Expand All @@ -38,6 +41,7 @@ const ENGLISH_ONLY_ENGINES = new Set(['luxtts', 'chatterbox_turbo']);
/**
 * Compose the combined select value for an engine and optional model size.
 *
 * Engines that come in several sizes (`qwen`, `tada`, `cosyvoice`) are encoded
 * as `engine:size`; when `modelSize` is missing or empty a per-engine default
 * is substituted. Every other engine is returned unchanged and `modelSize`
 * is ignored.
 *
 * @param engine - Engine identifier.
 * @param modelSize - Optional size/version suffix for multi-size engines.
 * @returns The value string, e.g. `qwen:1.7B`, `cosyvoice:v2`, or the bare engine name.
 */
function getSelectValue(engine: string, modelSize?: string): string {
  switch (engine) {
    case 'qwen':
      // `||` (not `??`) so an empty string also falls back to the default.
      return `qwen:${modelSize || '1.7B'}`;
    case 'tada':
      return `tada:${modelSize || '1B'}`;
    case 'cosyvoice':
      return `cosyvoice:${modelSize || 'v2'}`;
    default:
      return engine;
  }
}

Expand Down Expand Up @@ -66,6 +70,15 @@ function handleEngineChange(form: UseFormReturn<GenerationFormValues>, value: st
form.setValue('language', available[0]?.value ?? 'en');
}
}
} else if (value.startsWith('cosyvoice:')) {
const [, modelSize] = value.split(':');
form.setValue('engine', 'cosyvoice');
form.setValue('modelSize', modelSize as 'v2' | 'v3');
const currentLang = form.getValues('language');
const available = getLanguageOptionsForEngine('cosyvoice');
if (!available.some((l) => l.value === currentLang)) {
form.setValue('language', available[0]?.value ?? 'en');
}
} else {
form.setValue('engine', value as GenerationFormValues['engine']);
form.setValue('modelSize', undefined as unknown as '1.7B' | '0.6B');
Expand Down
13 changes: 5 additions & 8 deletions app/src/components/ServerSettings/ModelManagement.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,10 @@ const MODEL_DESCRIPTIONS: Record<string, string> = {
'HumeAI TADA 1B — English speech-language model built on Llama 3.2 1B. Generates 700s+ of coherent audio with synchronized text-acoustic alignment.',
'tada-3b-ml':
'HumeAI TADA 3B Multilingual — built on Llama 3.2 3B. Supports 10 languages with high-fidelity voice cloning via text-acoustic dual alignment.',
'cosyvoice2-0.5b':
'CosyVoice2 0.5B by Alibaba. Multilingual TTS with instruct support for emotions, speed, volume, and dialects. 9 languages with zero-shot voice cloning.',
'cosyvoice3-0.5b':
'Fun-CosyVoice3 0.5B by Alibaba. Improved robustness, prosody, and Chinese dialect support over CosyVoice2. Best quality for in-the-wild speech generation.',
'whisper-base':
'Smallest Whisper model (74M parameters). Fast transcription with moderate accuracy.',
'whisper-small':
Expand Down Expand Up @@ -390,14 +394,7 @@ export function ModelManagement() {
setDetailOpen(true);
};

const voiceModels =
modelStatus?.models.filter(
(m) =>
m.model_name.startsWith('qwen-tts') ||
m.model_name.startsWith('luxtts') ||
m.model_name.startsWith('chatterbox') ||
m.model_name.startsWith('tada'),
) ?? [];
const voiceModels = modelStatus?.models.filter((m) => !m.model_name.startsWith('whisper')) ?? [];
const whisperModels = modelStatus?.models.filter((m) => m.model_name.startsWith('whisper')) ?? [];

// Build sections
Expand Down
4 changes: 2 additions & 2 deletions app/src/lib/api/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ export interface GenerationRequest {
text: string;
language: LanguageCode;
seed?: number;
model_size?: '1.7B' | '0.6B' | '1B' | '3B';
engine?: 'qwen' | 'luxtts' | 'chatterbox' | 'chatterbox_turbo' | 'tada';
model_size?: '1.7B' | '0.6B' | '1B' | '3B' | 'v2' | 'v3';
engine?: 'qwen' | 'luxtts' | 'chatterbox' | 'chatterbox_turbo' | 'tada' | 'cosyvoice';
instruct?: string;
max_chunk_chars?: number;
crossfade_ms?: number;
Expand Down
1 change: 1 addition & 0 deletions app/src/lib/constants/languages.ts
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ export const ENGINE_LANGUAGES: Record<string, readonly LanguageCode[]> = {
],
chatterbox_turbo: ['en'],
tada: ['en', 'ar', 'zh', 'de', 'es', 'fr', 'it', 'ja', 'pl', 'pt'],
cosyvoice: ['zh', 'en', 'ja', 'ko', 'de', 'fr', 'ru', 'es', 'it'],
} as const;

/** Helper: get language options for a given engine. */
Expand Down
27 changes: 19 additions & 8 deletions app/src/lib/hooks/useGenerationForm.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,11 @@ const generationSchema = z.object({
text: z.string().min(1, '').max(50000),
language: z.enum(LANGUAGE_CODES as [LanguageCode, ...LanguageCode[]]),
seed: z.number().int().optional(),
modelSize: z.enum(['1.7B', '0.6B', '1B', '3B']).optional(),
modelSize: z.enum(['1.7B', '0.6B', '1B', '3B', 'v2', 'v3']).optional(),
instruct: z.string().max(500).optional(),
engine: z.enum(['qwen', 'luxtts', 'chatterbox', 'chatterbox_turbo', 'tada']).optional(),
engine: z
.enum(['qwen', 'luxtts', 'chatterbox', 'chatterbox_turbo', 'tada', 'cosyvoice'])
.optional(),
});

export type GenerationFormValues = z.infer<typeof generationSchema>;
Expand Down Expand Up @@ -83,7 +85,11 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) {
? data.modelSize === '3B'
? 'tada-3b-ml'
: 'tada-1b'
: `qwen-tts-${data.modelSize}`;
: engine === 'cosyvoice'
? data.modelSize === 'v3'
? 'cosyvoice3-0.5b'
: 'cosyvoice2-0.5b'
: `qwen-tts-${data.modelSize}`;
const displayName =
engine === 'luxtts'
? 'LuxTTS'
Expand All @@ -95,9 +101,13 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) {
? data.modelSize === '3B'
? 'TADA 3B Multilingual'
: 'TADA 1B'
: data.modelSize === '1.7B'
? 'Qwen TTS 1.7B'
: 'Qwen TTS 0.6B';
: engine === 'cosyvoice'
? data.modelSize === 'v3'
? 'CosyVoice3 0.5B'
: 'CosyVoice2 0.5B'
: data.modelSize === '1.7B'
? 'Qwen TTS 1.7B'
: 'Qwen TTS 0.6B';

// Check if model needs downloading
try {
Expand All @@ -112,7 +122,7 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) {
console.error('Failed to check model status:', error);
}

const hasModelSizes = engine === 'qwen' || engine === 'tada';
const hasModelSizes = engine === 'qwen' || engine === 'tada' || engine === 'cosyvoice';
const effectsChain = options.getEffectsChain?.();
// This now returns immediately with status="generating"
const result = await generation.mutateAsync({
Expand All @@ -122,7 +132,8 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) {
seed: data.seed,
model_size: hasModelSizes ? data.modelSize : undefined,
engine,
instruct: engine === 'qwen' ? data.instruct || undefined : undefined,
instruct:
engine === 'qwen' || engine === 'cosyvoice' ? data.instruct || undefined : undefined,
max_chunk_chars: maxChunkChars,
crossfade_ms: crossfadeMs,
normalize: normalizeAudio,
Expand Down
32 changes: 30 additions & 2 deletions backend/backends/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ def is_loaded(self) -> bool:
"chatterbox": "Chatterbox TTS",
"chatterbox_turbo": "Chatterbox Turbo",
"tada": "TADA",
"cosyvoice": "CosyVoice",
}


Expand Down Expand Up @@ -278,6 +279,26 @@ def _get_non_qwen_tts_configs() -> list[ModelConfig]:
size_mb=8000,
languages=["en", "ar", "zh", "de", "es", "fr", "it", "ja", "pl", "pt"],
),
ModelConfig(
model_name="cosyvoice2-0.5b",
display_name="CosyVoice2 0.5B (Multilingual, Instruct)",
engine="cosyvoice",
hf_repo_id="FunAudioLLM/CosyVoice2-0.5B",
model_size="v2",
size_mb=4600,
supports_instruct=True,
languages=["zh", "en", "ja", "ko", "de", "fr", "ru", "es", "it"],
),
ModelConfig(
model_name="cosyvoice3-0.5b",
display_name="CosyVoice3 0.5B (Best Quality)",
engine="cosyvoice",
hf_repo_id="FunAudioLLM/Fun-CosyVoice3-0.5B-2512",
model_size="v3",
size_mb=4600,
supports_instruct=True,
languages=["zh", "en", "ja", "ko", "de", "fr", "ru", "es", "it"],
),
]


Expand Down Expand Up @@ -362,7 +383,7 @@ async def load_engine_model(engine: str, model_size: str = "default") -> None:
backend = get_tts_backend_for_engine(engine)
if engine == "qwen":
await backend.load_model_async(model_size)
elif engine == "tada":
elif engine in ("tada", "cosyvoice"):
await backend.load_model(model_size)
else:
await backend.load_model()
Expand All @@ -379,7 +400,7 @@ async def ensure_model_cached_or_raise(engine: str, model_size: str = "default")
cfg = c
break

if engine in ("qwen", "tada"):
if engine in ("qwen", "tada", "cosyvoice"):
if not backend._is_model_cached(model_size):
raise HTTPException(
status_code=400,
Expand Down Expand Up @@ -454,6 +475,9 @@ def get_model_load_func(config: ModelConfig):
if config.engine == "qwen":
return lambda: tts.get_tts_model().load_model(config.model_size)

if config.engine in ("tada", "cosyvoice"):
return lambda: get_tts_backend_for_engine(config.engine).load_model(config.model_size)

return lambda: get_tts_backend_for_engine(config.engine).load_model()


Expand Down Expand Up @@ -515,6 +539,10 @@ def get_tts_backend_for_engine(engine: str) -> TTSBackend:
from .hume_backend import HumeTadaBackend

backend = HumeTadaBackend()
elif engine == "cosyvoice":
from .cosyvoice_backend import CosyVoiceTTSBackend

backend = CosyVoiceTTSBackend()
else:
raise ValueError(f"Unknown TTS engine: {engine}. Supported: {list(TTS_ENGINES.keys())}")

Expand Down
Loading