diff --git a/CHANGELOG.md b/CHANGELOG.md index b7116d39..e760812b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -62,6 +62,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Self-documenting help system with `make help` - Colored output for better readability - Supports parallel development server execution +- **48kHz speech tokenizer opt-in setting** - UI toggle in the Models tab (Advanced Settings) to enable the higher-quality 48kHz speech tokenizer on the PyTorch backend. Disabled by default; changes take effect from the next generation. Settings are read and updated through the `GET /settings` and `PATCH /settings` API endpoints and persisted to `data/settings.json`. ### Changed - **README** - Added Makefile reference and updated Quick Start with Makefile-based setup instructions alongside manual setup diff --git a/app/src/components/ServerSettings/ModelManagement.tsx b/app/src/components/ServerSettings/ModelManagement.tsx index 4a5fd439..724b97f9 100644 --- a/app/src/components/ServerSettings/ModelManagement.tsx +++ b/app/src/components/ServerSettings/ModelManagement.tsx @@ -1,5 +1,6 @@ import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query'; import { Download, Loader2, Trash2 } from 'lucide-react'; +import { Checkbox } from '@/components/ui/checkbox'; import { useCallback, useState } from 'react'; import { AlertDialog, @@ -24,6 +25,36 @@ export function ModelManagement() { const [downloadingModel, setDownloadingModel] = useState(null); const [downloadingDisplayName, setDownloadingDisplayName] = useState(null); + const { data: appSettings } = useQuery({ + queryKey: ['appSettings'], + queryFn: () => apiClient.getSettings(), + }); + + const { data: health } = useQuery({ + queryKey: ['health'], + queryFn: () => apiClient.getHealth(), + }); + + const settingsMutation = useMutation({ + mutationFn: (data: { use_48k_speech_tokenizer: boolean }) => apiClient.updateSettings(data), + onSuccess: (updated) => { + queryClient.setQueryData(['appSettings'], updated); 
+ toast({ + title: 'Setting updated', + description: updated.use_48k_speech_tokenizer + ? '48kHz speech tokenizer enabled. It will apply from your next generation.' + : '48kHz speech tokenizer disabled. It will apply from your next generation.', + }); + }, + onError: (error: Error) => { + toast({ + title: 'Failed to update setting', + description: error.message, + variant: 'destructive', + }); + }, + }); + const { data: modelStatus, isLoading } = useQuery({ queryKey: ['modelStatus'], queryFn: async () => { @@ -215,6 +246,34 @@ export function ModelManagement() { ) : null} + + {health?.backend_type === 'pytorch' && ( +
+

Advanced Settings

+
+ { + settingsMutation.mutate({ use_48k_speech_tokenizer: checked }); + }} + disabled={settingsMutation.isPending} + /> +
+ +

+ Enables a higher quality 48kHz speech tokenizer. Changes take + effect from your next generation. +

+
+
+
+ )} {/* Delete Confirmation Dialog */} diff --git a/app/src/lib/api/client.ts b/app/src/lib/api/client.ts index c5b079b2..e4213603 100644 --- a/app/src/lib/api/client.ts +++ b/app/src/lib/api/client.ts @@ -24,6 +24,7 @@ import type { StoryItemMove, StoryItemTrim, StoryItemSplit, + AppSettings, } from './types'; class ApiClient { @@ -495,6 +496,18 @@ class ApiClient { }); } + // App Settings + async getSettings(): Promise { + return this.request('/settings'); + } + + async updateSettings(data: Partial): Promise { + return this.request('/settings', { + method: 'PATCH', + body: JSON.stringify(data), + }); + } + async exportStoryAudio(storyId: string): Promise { const url = `${this.getBaseUrl()}/stories/${storyId}/export-audio`; const response = await fetch(url); diff --git a/app/src/lib/api/types.ts b/app/src/lib/api/types.ts index 131c1be5..64c30e8a 100644 --- a/app/src/lib/api/types.ts +++ b/app/src/lib/api/types.ts @@ -79,6 +79,7 @@ export interface HealthResponse { model_size?: string; gpu_available: boolean; vram_used_mb?: number; + backend_type?: string; } export interface ModelProgress { @@ -202,3 +203,7 @@ export interface StoryItemTrim { export interface StoryItemSplit { split_time_ms: number; } + +export interface AppSettings { + use_48k_speech_tokenizer: boolean; +} diff --git a/backend/README.md b/backend/README.md index 57163467..acce819d 100644 --- a/backend/README.md +++ b/backend/README.md @@ -230,6 +230,34 @@ Manually load TTS model. #### `POST /models/unload` Unload TTS model to free memory. +### Settings + +#### `GET /settings` +Get current application settings. + +**Response:** +```json +{ + "use_48k_speech_tokenizer": false +} +``` + +#### `PATCH /settings` +Partially update application settings. Only provided fields are updated. + +**Request:** +```json +{ + "use_48k_speech_tokenizer": true +} +``` + +**Response:** Updated settings object (same shape as `GET /settings`). 
+ +Settings are persisted to `data/settings.json` and survive server restarts. + +> **Note:** `use_48k_speech_tokenizer` is only applied by the PyTorch backend. It has no effect on the MLX backend. + ## Database Schema ### profiles diff --git a/backend/backends/pytorch_backend.py b/backend/backends/pytorch_backend.py index d0cba11a..eb714c27 100644 --- a/backend/backends/pytorch_backend.py +++ b/backend/backends/pytorch_backend.py @@ -9,6 +9,8 @@ from pathlib import Path from . import TTSBackend, STTBackend +from .. import config +from ..models import AppSettings from ..utils.cache import get_cache_key, get_cached_voice_prompt, cache_voice_prompt from ..utils.audio import normalize_audio, load_audio from ..utils.progress import get_progress_manager @@ -24,6 +26,7 @@ def __init__(self, model_size: str = "1.7B"): self.model_size = model_size self.device = self._get_device() self._current_model_size = None + self._use_48k_speech_tokenizer = False def _get_device(self) -> str: """Get the best available device.""" @@ -121,22 +124,28 @@ async def load_model_async(self, model_size: Optional[str] = None): """ if model_size is None: model_size = self.model_size - - # If already loaded with correct size, return - if self.model is not None and self._current_model_size == model_size: + + requested_48k = AppSettings(**config.load_app_settings()).use_48k_speech_tokenizer + + # If already loaded with correct size and same 48k setting, return + if ( + self.model is not None + and self._current_model_size == model_size + and self._use_48k_speech_tokenizer == requested_48k + ): return - - # Unload existing model if different size requested - if self.model is not None and self._current_model_size != model_size: + + # Unload existing model if reload is needed + if self.model is not None: self.unload_model() - + # Run blocking load in thread pool - await asyncio.to_thread(self._load_model_sync, model_size) - + await asyncio.to_thread(self._load_model_sync, model_size, requested_48k) + 
# Alias for compatibility load_model = load_model_async - def _load_model_sync(self, model_size: str): + def _load_model_sync(self, model_size: str, use_48k_speech_tokenizer: bool = False): """Synchronous model loading.""" try: progress_manager = get_progress_manager() @@ -195,6 +204,14 @@ def _load_model_sync(self, model_size: str): device_map=self.device, torch_dtype=torch.bfloat16, ) + + # optionally replace speech tokenizer with 48kHz version for better audio quality + if use_48k_speech_tokenizer: + from qwen_tts import Qwen3TTSTokenizer + self.model.model.speech_tokenizer = Qwen3TTSTokenizer.from_pretrained( + "takuma104/Qwen3-TTS-Tokenizer-12Hz-48kHz" + ) + self._use_48k_speech_tokenizer = use_48k_speech_tokenizer finally: # Exit the patch context tracker_context.__exit__(None, None, None) diff --git a/backend/config.py b/backend/config.py index b5c64825..31b78c7f 100644 --- a/backend/config.py +++ b/backend/config.py @@ -4,6 +4,7 @@ Handles data directory configuration for production bundling. """ +import json import os from pathlib import Path @@ -66,3 +67,24 @@ def get_models_dir() -> Path: path = _data_dir / "models" path.mkdir(parents=True, exist_ok=True) return path + +def get_settings_path() -> Path: + """Get app settings file path.""" + return _data_dir / "settings.json" + +def load_app_settings() -> dict: + """Load app settings from JSON file. 
Returns empty dict if file not found.""" + path = get_settings_path() + if path.exists(): + try: + return json.loads(path.read_text(encoding="utf-8")) + except (json.JSONDecodeError, OSError) as exc: + print(f"[config] Failed to load settings from {path}: {exc}") + return {} + return {} + +def save_app_settings(data: dict) -> None: + """Save app settings to JSON file.""" + path = get_settings_path() + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(data, indent=2), encoding="utf-8") diff --git a/backend/main.py b/backend/main.py index e218d237..8d7779a7 100644 --- a/backend/main.py +++ b/backend/main.py @@ -1764,6 +1764,26 @@ def _get_gpu_status() -> str: return "None (CPU only)" +@app.get("/settings", response_model=models.AppSettings) +async def get_settings(): + """Return current application settings.""" + return models.AppSettings(**config.load_app_settings()) + + +@app.patch("/settings", response_model=models.AppSettings) +async def update_settings(update: models.AppSettingsUpdate): + """Partially update application settings.""" + data = config.load_app_settings() + if data == {} and config.get_settings_path().exists(): + raise HTTPException(status_code=500, detail="Failed to read settings") + + patch = update.model_dump(exclude_none=True) + data.update(patch) + validated = models.AppSettings(**data) + config.save_app_settings(validated.model_dump()) + return validated + + @app.on_event("startup") async def startup_event(): """Run on application startup.""" diff --git a/backend/models.py b/backend/models.py index 59e45405..8b2bcbb4 100644 --- a/backend/models.py +++ b/backend/models.py @@ -299,3 +299,13 @@ class StoryItemTrim(BaseModel): class StoryItemSplit(BaseModel): """Request model for splitting a story item.""" split_time_ms: int = Field(..., ge=0) # Time within the clip to split at (relative to clip start) + + +class AppSettings(BaseModel): + """Application settings.""" + use_48k_speech_tokenizer: bool = False + + +class 
AppSettingsUpdate(BaseModel): + """Partial update model for application settings.""" + use_48k_speech_tokenizer: Optional[bool] = None diff --git a/data/settings.json b/data/settings.json new file mode 100644 index 00000000..28b0f704 --- /dev/null +++ b/data/settings.json @@ -0,0 +1,3 @@ +{ + "use_48k_speech_tokenizer": false +} \ No newline at end of file