From 700ca6e7b1f9ff69832465e85c91e175bd4ef906 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Fri, 6 Mar 2026 00:11:08 +0900 Subject: [PATCH 1/5] feat: update speech tokenizer to 48kHz version for improved audio quality --- backend/backends/pytorch_backend.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/backend/backends/pytorch_backend.py b/backend/backends/pytorch_backend.py index d0cba11a..da35fd74 100644 --- a/backend/backends/pytorch_backend.py +++ b/backend/backends/pytorch_backend.py @@ -195,6 +195,12 @@ def _load_model_sync(self, model_size: str): device_map=self.device, torch_dtype=torch.bfloat16, ) + + # replace speech tokenizer to 48kHz version for better audio quality + from qwen_tts import Qwen3TTSTokenizer + self.model.model.speech_tokenizer = Qwen3TTSTokenizer.from_pretrained( + "takuma104/Qwen3-TTS-Tokenizer-12Hz-48kHz" + ) finally: # Exit the patch context tracker_context.__exit__(None, None, None) From 09f22e2809cc9ce6073c332027d61cb81a4fdbb8 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Fri, 13 Mar 2026 00:30:53 +0900 Subject: [PATCH 2/5] feat: add 48kHz speech tokenizer settings and update API for configuration management --- .../ServerSettings/ModelManagement.tsx | 45 +++++++++++++++++++ app/src/lib/api/client.ts | 13 ++++++ app/src/lib/api/types.ts | 4 ++ backend/backends/pytorch_backend.py | 14 +++--- backend/config.py | 20 +++++++++ backend/main.py | 17 +++++++ backend/models.py | 10 +++++ data/settings.json | 3 ++ 8 files changed, 121 insertions(+), 5 deletions(-) create mode 100644 data/settings.json diff --git a/app/src/components/ServerSettings/ModelManagement.tsx b/app/src/components/ServerSettings/ModelManagement.tsx index 4a5fd439..f213c45e 100644 --- a/app/src/components/ServerSettings/ModelManagement.tsx +++ b/app/src/components/ServerSettings/ModelManagement.tsx @@ -1,5 +1,6 @@ import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query'; import { Download, Loader2, Trash2 } from 'lucide-react'; +import { Checkbox } from '@/components/ui/checkbox'; import { useCallback, useState } from 'react'; import { AlertDialog, @@ -24,6 +25,24 @@ export function ModelManagement() { const [downloadingModel, setDownloadingModel] = useState(null); const [downloadingDisplayName, setDownloadingDisplayName] = useState(null); + const { data: appSettings } = useQuery({ + queryKey: ['appSettings'], + queryFn: () => apiClient.getSettings(), + }); + + const settingsMutation = useMutation({ + mutationFn: (data: { use_48k_speech_tokenizer: boolean }) => apiClient.updateSettings(data), + onSuccess: (updated) => { + queryClient.setQueryData(['appSettings'], updated); + toast({ + title: 'Setting updated', + description: updated.use_48k_speech_tokenizer + ? '48kHz speech tokenizer enabled. It will apply from your next generation.' + : '48kHz speech tokenizer disabled. It will apply from your next generation.', + }); + }, + }); + const { data: modelStatus, isLoading } = useQuery({ queryKey: ['modelStatus'], queryFn: async () => { @@ -215,6 +234,32 @@ export function ModelManagement() { ) : null} + +
+

Advanced Settings

+
+ { + settingsMutation.mutate({ use_48k_speech_tokenizer: checked }); + }} + disabled={settingsMutation.isPending} + /> +
+ +

+ Enables a higher quality 48kHz speech tokenizer. Changes take + effect from your next generation. +

+
+
+
{/* Delete Confirmation Dialog */} diff --git a/app/src/lib/api/client.ts b/app/src/lib/api/client.ts index c5b079b2..e4213603 100644 --- a/app/src/lib/api/client.ts +++ b/app/src/lib/api/client.ts @@ -24,6 +24,7 @@ import type { StoryItemMove, StoryItemTrim, StoryItemSplit, + AppSettings, } from './types'; class ApiClient { @@ -495,6 +496,18 @@ class ApiClient { }); } + // App Settings + async getSettings(): Promise { + return this.request('/settings'); + } + + async updateSettings(data: Partial): Promise { + return this.request('/settings', { + method: 'PATCH', + body: JSON.stringify(data), + }); + } + async exportStoryAudio(storyId: string): Promise { const url = `${this.getBaseUrl()}/stories/${storyId}/export-audio`; const response = await fetch(url); diff --git a/app/src/lib/api/types.ts b/app/src/lib/api/types.ts index 131c1be5..7241e7e6 100644 --- a/app/src/lib/api/types.ts +++ b/app/src/lib/api/types.ts @@ -202,3 +202,7 @@ export interface StoryItemTrim { export interface StoryItemSplit { split_time_ms: number; } + +export interface AppSettings { + use_48k_speech_tokenizer: boolean; +} diff --git a/backend/backends/pytorch_backend.py b/backend/backends/pytorch_backend.py index da35fd74..4bc5f7a6 100644 --- a/backend/backends/pytorch_backend.py +++ b/backend/backends/pytorch_backend.py @@ -9,6 +9,8 @@ from pathlib import Path from . import TTSBackend, STTBackend +from .. import config +from ..models import AppSettings from ..utils.cache import get_cache_key, get_cached_voice_prompt, cache_voice_prompt from ..utils.audio import normalize_audio, load_audio from ..utils.progress import get_progress_manager @@ -196,11 +198,13 @@ def _load_model_sync(self, model_size: str): torch_dtype=torch.bfloat16, ) - # replace speech tokenizer to 48kHz version for better audio quality - from qwen_tts import Qwen3TTSTokenizer - self.model.model.speech_tokenizer = Qwen3TTSTokenizer.from_pretrained( - "takuma104/Qwen3-TTS-Tokenizer-12Hz-48kHz" - ) + # optionally replace speech tokenizer with 48kHz version for better audio quality + settings = AppSettings(**config.load_app_settings()) + if settings.use_48k_speech_tokenizer: + from qwen_tts import Qwen3TTSTokenizer + self.model.model.speech_tokenizer = Qwen3TTSTokenizer.from_pretrained( + "takuma104/Qwen3-TTS-Tokenizer-12Hz-48kHz" + ) finally: # Exit the patch context tracker_context.__exit__(None, None, None) diff --git a/backend/config.py b/backend/config.py index b5c64825..0c2ad5d4 100644 --- a/backend/config.py +++ b/backend/config.py @@ -4,6 +4,7 @@ Handles data directory configuration for production bundling. """ +import json import os from pathlib import Path @@ -66,3 +67,22 @@ def get_models_dir() -> Path: path = _data_dir / "models" path.mkdir(parents=True, exist_ok=True) return path + +def get_settings_path() -> Path: + """Get app settings file path.""" + return _data_dir / "settings.json" + +def load_app_settings() -> dict: + """Load app settings from JSON file. Returns empty dict if file not found.""" + path = get_settings_path() + if path.exists(): + try: + return json.loads(path.read_text(encoding="utf-8")) + except Exception: + return {} + return {} + +def save_app_settings(data: dict) -> None: + """Save app settings to JSON file.""" + path = get_settings_path() + path.write_text(json.dumps(data, indent=2), encoding="utf-8") diff --git a/backend/main.py b/backend/main.py index e218d237..3d1898f3 100644 --- a/backend/main.py +++ b/backend/main.py @@ -1764,6 +1764,23 @@ def _get_gpu_status() -> str: return "None (CPU only)" +@app.get("/settings", response_model=models.AppSettings) +async def get_settings(): + """Return current application settings.""" + return models.AppSettings(**config.load_app_settings()) + + +@app.patch("/settings", response_model=models.AppSettings) +async def update_settings(update: models.AppSettingsUpdate): + """Partially update application settings.""" + current = models.AppSettings(**config.load_app_settings()) + data = current.model_dump() + patch = update.model_dump(exclude_none=True) + data.update(patch) + config.save_app_settings(data) + return models.AppSettings(**data) + + @app.on_event("startup") async def startup_event(): """Run on application startup.""" diff --git a/backend/models.py b/backend/models.py index 59e45405..8b2bcbb4 100644 --- a/backend/models.py +++ b/backend/models.py @@ -299,3 +299,13 @@ class StoryItemTrim(BaseModel): class StoryItemSplit(BaseModel): """Request model for splitting a story item.""" split_time_ms: int = Field(..., ge=0) # Time within the clip to split at (relative to clip start) + + +class AppSettings(BaseModel): + """Application settings.""" + use_48k_speech_tokenizer: bool = False + + +class AppSettingsUpdate(BaseModel): + """Partial update model for application settings.""" + use_48k_speech_tokenizer: Optional[bool] = None diff --git a/data/settings.json b/data/settings.json new file mode 100644 index 00000000..28b0f704 --- /dev/null +++ b/data/settings.json @@ -0,0 +1,3 @@ +{ + "use_48k_speech_tokenizer": false +} \ No newline at end of file From 5e917d9bbca7a21d73f006918bc2fecd25f87e7a Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Fri, 13 Mar 2026 00:45:15 +0900 Subject: [PATCH 3/5] feat: add 48kHz speech tokenizer opt-in setting and update settings API documentation --- CHANGELOG.md | 1 + backend/README.md | 26 ++++++++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b7116d39..6b9bea56 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -58,6 +58,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - **Makefile** - Comprehensive development workflow automation with commands for setup, development, building, testing, and code quality checks +- **48kHz speech tokenizer opt-in setting** - UI toggle in the Models tab (Advanced Settings) to enable the higher-quality 48kHz speech tokenizer on the PyTorch backend. Disabled by default; changes take effect from the next generation. Settings are persisted via `GET /settings` and `PATCH /settings` API endpoints. - Includes Python version detection and compatibility warnings - Self-documenting help system with `make help` - Colored output for better readability diff --git a/backend/README.md b/backend/README.md index 57163467..1b6a428c 100644 --- a/backend/README.md +++ b/backend/README.md @@ -230,6 +230,32 @@ Manually load TTS model. #### `POST /models/unload` Unload TTS model to free memory. +### Settings + +#### `GET /settings` +Get current application settings. + +**Response:** +```json +{ + "use_48k_speech_tokenizer": false +} +``` + +#### `PATCH /settings` +Partially update application settings. Only provided fields are updated. + +**Request:** +```json +{ + "use_48k_speech_tokenizer": true +} +``` + +**Response:** Updated settings object (same shape as `GET /settings`). + +Settings are persisted to `data/settings.json` and survive server restarts. + ## Database Schema ### profiles From 721c126789629d7f2506c5deb8bd7769d73fd926 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Fri, 13 Mar 2026 02:20:48 +0900 Subject: [PATCH 4/5] fixes for coderabbit --- CHANGELOG.md | 2 +- .../ServerSettings/ModelManagement.tsx | 60 ++++++++++++------- app/src/lib/api/types.ts | 1 + backend/README.md | 2 + backend/backends/pytorch_backend.py | 22 ++++--- backend/config.py | 4 +- backend/main.py | 11 ++-- 7 files changed, 66 insertions(+), 36 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6b9bea56..e760812b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -58,11 +58,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - **Makefile** - Comprehensive development workflow automation with commands for setup, development, building, testing, and code quality checks -- **48kHz speech tokenizer opt-in setting** - UI toggle in the Models tab (Advanced Settings) to enable the higher-quality 48kHz speech tokenizer on the PyTorch backend. Disabled by default; changes take effect from the next generation. Settings are persisted via `GET /settings` and `PATCH /settings` API endpoints. - Includes Python version detection and compatibility warnings - Self-documenting help system with `make help` - Colored output for better readability - Supports parallel development server execution +- **48kHz speech tokenizer opt-in setting** - UI toggle in the Models tab (Advanced Settings) to enable the higher-quality 48kHz speech tokenizer on the PyTorch backend. Disabled by default; changes take effect from the next generation. Settings are persisted via `GET /settings` and `PATCH /settings` API endpoints. ### Changed - **README** - Added Makefile reference and updated Quick Start with Makefile-based setup instructions alongside manual setup diff --git a/app/src/components/ServerSettings/ModelManagement.tsx b/app/src/components/ServerSettings/ModelManagement.tsx index f213c45e..724b97f9 100644 --- a/app/src/components/ServerSettings/ModelManagement.tsx +++ b/app/src/components/ServerSettings/ModelManagement.tsx @@ -30,6 +30,11 @@ export function ModelManagement() { queryFn: () => apiClient.getSettings(), }); + const { data: health } = useQuery({ + queryKey: ['health'], + queryFn: () => apiClient.getHealth(), + }); + const settingsMutation = useMutation({ mutationFn: (data: { use_48k_speech_tokenizer: boolean }) => apiClient.updateSettings(data), onSuccess: (updated) => { @@ -41,6 +46,13 @@ export function ModelManagement() { : '48kHz speech tokenizer disabled. It will apply from your next generation.', }); }, + onError: (error: Error) => { + toast({ + title: 'Failed to update setting', + description: error.message, + variant: 'destructive', + }); + }, }); const { data: modelStatus, isLoading } = useQuery({ @@ -235,31 +247,33 @@ export function ModelManagement() { ) : null} -
-

Advanced Settings

-
- { - settingsMutation.mutate({ use_48k_speech_tokenizer: checked }); - }} - disabled={settingsMutation.isPending} - /> -
- -

- Enables a higher quality 48kHz speech tokenizer. Changes take - effect from your next generation. -

+ {health?.backend_type === 'pytorch' && ( +
+

Advanced Settings

+
+ { + settingsMutation.mutate({ use_48k_speech_tokenizer: checked }); + }} + disabled={settingsMutation.isPending} + /> +
+ +

+ Enables a higher quality 48kHz speech tokenizer. Changes take + effect from your next generation. +

+
-
+ )} {/* Delete Confirmation Dialog */} diff --git a/app/src/lib/api/types.ts b/app/src/lib/api/types.ts index 7241e7e6..64c30e8a 100644 --- a/app/src/lib/api/types.ts +++ b/app/src/lib/api/types.ts @@ -79,6 +79,7 @@ export interface HealthResponse { model_size?: string; gpu_available: boolean; vram_used_mb?: number; + backend_type?: string; } export interface ModelProgress { diff --git a/backend/README.md b/backend/README.md index 1b6a428c..acce819d 100644 --- a/backend/README.md +++ b/backend/README.md @@ -256,6 +256,8 @@ Partially update application settings. Only provided fields are updated. Settings are persisted to `data/settings.json` and survive server restarts. +> **Note:** `use_48k_speech_tokenizer` is only applied by the PyTorch backend. It has no effect on the MLX backend. + ## Database Schema ### profiles diff --git a/backend/backends/pytorch_backend.py b/backend/backends/pytorch_backend.py index 4bc5f7a6..8116d301 100644 --- a/backend/backends/pytorch_backend.py +++ b/backend/backends/pytorch_backend.py @@ -26,6 +26,7 @@ def __init__(self, model_size: str = "1.7B"): self.model_size = model_size self.device = self._get_device() self._current_model_size = None + self._use_48k_speech_tokenizer = False def _get_device(self) -> str: """Get the best available device.""" @@ -123,15 +124,21 @@ async def load_model_async(self, model_size: Optional[str] = None): """ if model_size is None: model_size = self.model_size - - # If already loaded with correct size, return - if self.model is not None and self._current_model_size == model_size: + + requested_48k = AppSettings(**config.load_app_settings()).use_48k_speech_tokenizer + + # If already loaded with correct size and same 48k setting, return + if ( + self.model is not None + and self._current_model_size == model_size + and self._use_48k_speech_tokenizer == requested_48k + ): return - - # Unload existing model if different size requested - if self.model is not None and self._current_model_size != model_size: + + # Unload existing model if reload is needed + if self.model is not None: self.unload_model() - + # Run blocking load in thread pool await asyncio.to_thread(self._load_model_sync, model_size) @@ -205,6 +212,7 @@ def _load_model_sync(self, model_size: str): self.model.model.speech_tokenizer = Qwen3TTSTokenizer.from_pretrained( "takuma104/Qwen3-TTS-Tokenizer-12Hz-48kHz" ) + self._use_48k_speech_tokenizer = settings.use_48k_speech_tokenizer finally: # Exit the patch context tracker_context.__exit__(None, None, None) diff --git a/backend/config.py b/backend/config.py index 0c2ad5d4..31b78c7f 100644 --- a/backend/config.py +++ b/backend/config.py @@ -78,11 +78,13 @@ def load_app_settings() -> dict: if path.exists(): try: return json.loads(path.read_text(encoding="utf-8")) - except Exception: + except (json.JSONDecodeError, OSError) as exc: + print(f"[config] Failed to load settings from {path}: {exc}") return {} return {} def save_app_settings(data: dict) -> None: """Save app settings to JSON file.""" path = get_settings_path() + path.parent.mkdir(parents=True, exist_ok=True) path.write_text(json.dumps(data, indent=2), encoding="utf-8") diff --git a/backend/main.py b/backend/main.py index 3d1898f3..8d7779a7 100644 --- a/backend/main.py +++ b/backend/main.py @@ -1773,12 +1773,15 @@ async def get_settings(): @app.patch("/settings", response_model=models.AppSettings) async def update_settings(update: models.AppSettingsUpdate): """Partially update application settings.""" - current = models.AppSettings(**config.load_app_settings()) - data = current.model_dump() + data = config.load_app_settings() + if data == {} and config.get_settings_path().exists(): + raise HTTPException(status_code=500, detail="Failed to read settings") + patch = update.model_dump(exclude_none=True) data.update(patch) - config.save_app_settings(data) - return models.AppSettings(**data) + validated = models.AppSettings(**data) + config.save_app_settings(validated.model_dump()) + return validated @app.on_event("startup") From b51526324740ac07180631eb504180e524adb6d2 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Fri, 13 Mar 2026 02:55:59 +0900 Subject: [PATCH 5/5] remove `config.load_app_settings()` in `_load_model_sync()` --- backend/backends/pytorch_backend.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/backend/backends/pytorch_backend.py b/backend/backends/pytorch_backend.py index 8116d301..eb714c27 100644 --- a/backend/backends/pytorch_backend.py +++ b/backend/backends/pytorch_backend.py @@ -140,12 +140,12 @@ async def load_model_async(self, model_size: Optional[str] = None): self.unload_model() # Run blocking load in thread pool - await asyncio.to_thread(self._load_model_sync, model_size) - + await asyncio.to_thread(self._load_model_sync, model_size, requested_48k) + # Alias for compatibility load_model = load_model_async - def _load_model_sync(self, model_size: str): + def _load_model_sync(self, model_size: str, use_48k_speech_tokenizer: bool = False): """Synchronous model loading.""" try: progress_manager = get_progress_manager() @@ -206,13 +206,12 @@ def _load_model_sync(self, model_size: str): ) # optionally replace speech tokenizer with 48kHz version for better audio quality - settings = AppSettings(**config.load_app_settings()) - if settings.use_48k_speech_tokenizer: + if use_48k_speech_tokenizer: from qwen_tts import Qwen3TTSTokenizer self.model.model.speech_tokenizer = Qwen3TTSTokenizer.from_pretrained( "takuma104/Qwen3-TTS-Tokenizer-12Hz-48kHz" ) - self._use_48k_speech_tokenizer = settings.use_48k_speech_tokenizer + self._use_48k_speech_tokenizer = use_48k_speech_tokenizer finally: # Exit the patch context tracker_context.__exit__(None, None, None)