Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Self-documenting help system with `make help`
- Colored output for better readability
- Supports parallel development server execution
- **48kHz speech tokenizer opt-in setting** - UI toggle in the Models tab (Advanced Settings) to enable the higher-quality 48kHz speech tokenizer on the PyTorch backend. Disabled by default; changes take effect from the next generation. Settings are persisted via `GET /settings` and `PATCH /settings` API endpoints.

### Changed
- **README** - Added Makefile reference and updated Quick Start with Makefile-based setup instructions alongside manual setup
Expand Down
59 changes: 59 additions & 0 deletions app/src/components/ServerSettings/ModelManagement.tsx
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query';
import { Download, Loader2, Trash2 } from 'lucide-react';
import { Checkbox } from '@/components/ui/checkbox';
import { useCallback, useState } from 'react';
import {
AlertDialog,
Expand All @@ -24,6 +25,36 @@ export function ModelManagement() {
const [downloadingModel, setDownloadingModel] = useState<string | null>(null);
const [downloadingDisplayName, setDownloadingDisplayName] = useState<string | null>(null);

const { data: appSettings } = useQuery({
queryKey: ['appSettings'],
queryFn: () => apiClient.getSettings(),
});

const { data: health } = useQuery({
queryKey: ['health'],
queryFn: () => apiClient.getHealth(),
});

const settingsMutation = useMutation({
mutationFn: (data: { use_48k_speech_tokenizer: boolean }) => apiClient.updateSettings(data),
onSuccess: (updated) => {
queryClient.setQueryData(['appSettings'], updated);
toast({
title: 'Setting updated',
description: updated.use_48k_speech_tokenizer
? '48kHz speech tokenizer enabled. It will apply from your next generation.'
: '48kHz speech tokenizer disabled. It will apply from your next generation.',
});
},
onError: (error: Error) => {
toast({
title: 'Failed to update setting',
description: error.message,
variant: 'destructive',
});
},
});

const { data: modelStatus, isLoading } = useQuery({
queryKey: ['modelStatus'],
queryFn: async () => {
Expand Down Expand Up @@ -215,6 +246,34 @@ export function ModelManagement() {

</div>
) : null}

{health?.backend_type === 'pytorch' && (
<div className="mt-6 pt-6 border-t">
<h3 className="text-sm font-semibold mb-3 text-muted-foreground">Advanced Settings</h3>
<div className="flex items-start space-x-3">
<Checkbox
id="use48kTokenizer"
checked={appSettings?.use_48k_speech_tokenizer ?? false}
onCheckedChange={(checked: boolean) => {
settingsMutation.mutate({ use_48k_speech_tokenizer: checked });
}}
disabled={settingsMutation.isPending}
/>
<div className="space-y-1">
<label
htmlFor="use48kTokenizer"
className="text-sm font-medium leading-none peer-disabled:cursor-not-allowed peer-disabled:opacity-70 cursor-pointer"
>
Use 48kHz speech tokenizer (experimental)
</label>
<p className="text-sm text-muted-foreground">
Enables a higher quality 48kHz speech tokenizer. Changes take
effect from your next generation.
</p>
</div>
</div>
</div>
)}
</CardContent>

{/* Delete Confirmation Dialog */}
Expand Down
13 changes: 13 additions & 0 deletions app/src/lib/api/client.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import type {
StoryItemMove,
StoryItemTrim,
StoryItemSplit,
AppSettings,
} from './types';

class ApiClient {
Expand Down Expand Up @@ -495,6 +496,18 @@ class ApiClient {
});
}

// App Settings
/** Fetch the current application settings via `GET /settings`. */
async getSettings(): Promise<AppSettings> {
return this.request<AppSettings>('/settings');
}

/**
 * Apply a partial settings patch via `PATCH /settings`.
 * Only the fields present in `data` are changed; returns the updated settings.
 */
async updateSettings(data: Partial<AppSettings>): Promise<AppSettings> {
const init = { method: 'PATCH', body: JSON.stringify(data) };
return this.request<AppSettings>('/settings', init);
}

async exportStoryAudio(storyId: string): Promise<Blob> {
const url = `${this.getBaseUrl()}/stories/${storyId}/export-audio`;
const response = await fetch(url);
Expand Down
5 changes: 5 additions & 0 deletions app/src/lib/api/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ export interface HealthResponse {
model_size?: string;
gpu_available: boolean;
vram_used_mb?: number;
backend_type?: string;
}

export interface ModelProgress {
Expand Down Expand Up @@ -202,3 +203,7 @@ export interface StoryItemTrim {
export interface StoryItemSplit {
split_time_ms: number;
}

/** Application settings persisted by the backend (`GET`/`PATCH /settings`). */
export interface AppSettings {
/** Opt-in 48kHz speech tokenizer; applied by the PyTorch backend only (see backend README note). */
use_48k_speech_tokenizer: boolean;
}
28 changes: 28 additions & 0 deletions backend/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,34 @@ Manually load TTS model.
#### `POST /models/unload`
Unload TTS model to free memory.

### Settings

#### `GET /settings`
Get current application settings.

**Response:**
```json
{
"use_48k_speech_tokenizer": false
}
```

#### `PATCH /settings`
Partially update application settings. Only provided fields are updated.

**Request:**
```json
{
"use_48k_speech_tokenizer": true
}
```

**Response:** Updated settings object (same shape as `GET /settings`).

Settings are persisted to `data/settings.json` and survive server restarts.

> **Note:** `use_48k_speech_tokenizer` is only applied by the PyTorch backend. It has no effect on the MLX backend.

## Database Schema

### profiles
Expand Down
37 changes: 27 additions & 10 deletions backend/backends/pytorch_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
from pathlib import Path

from . import TTSBackend, STTBackend
from .. import config
from ..models import AppSettings
from ..utils.cache import get_cache_key, get_cached_voice_prompt, cache_voice_prompt
from ..utils.audio import normalize_audio, load_audio
from ..utils.progress import get_progress_manager
Expand All @@ -24,6 +26,7 @@ def __init__(self, model_size: str = "1.7B"):
self.model_size = model_size
self.device = self._get_device()
self._current_model_size = None
self._use_48k_speech_tokenizer = False

def _get_device(self) -> str:
"""Get the best available device."""
Expand Down Expand Up @@ -121,22 +124,28 @@ async def load_model_async(self, model_size: Optional[str] = None):
"""
if model_size is None:
model_size = self.model_size

# If already loaded with correct size, return
if self.model is not None and self._current_model_size == model_size:

requested_48k = AppSettings(**config.load_app_settings()).use_48k_speech_tokenizer

# If already loaded with correct size and same 48k setting, return
if (
self.model is not None
and self._current_model_size == model_size
and self._use_48k_speech_tokenizer == requested_48k
):
return
# Unload existing model if different size requested
if self.model is not None and self._current_model_size != model_size:

# Unload existing model if reload is needed
if self.model is not None:
self.unload_model()

# Run blocking load in thread pool
await asyncio.to_thread(self._load_model_sync, model_size)
await asyncio.to_thread(self._load_model_sync, model_size, requested_48k)

# Alias for compatibility
load_model = load_model_async

def _load_model_sync(self, model_size: str):
def _load_model_sync(self, model_size: str, use_48k_speech_tokenizer: bool = False):
"""Synchronous model loading."""
try:
progress_manager = get_progress_manager()
Expand Down Expand Up @@ -195,6 +204,14 @@ def _load_model_sync(self, model_size: str):
device_map=self.device,
torch_dtype=torch.bfloat16,
)

# optionally replace speech tokenizer with 48kHz version for better audio quality
if use_48k_speech_tokenizer:
from qwen_tts import Qwen3TTSTokenizer
self.model.model.speech_tokenizer = Qwen3TTSTokenizer.from_pretrained(
"takuma104/Qwen3-TTS-Tokenizer-12Hz-48kHz"
)
self._use_48k_speech_tokenizer = use_48k_speech_tokenizer
Comment on lines +208 to +214
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Namespace cached voice prompts by tokenizer mode.

This adds a second speech-tokenizer variant, but create_voice_prompt() still caches prompts by only (audio_path, reference_text) at Line 280. After switching this flag, the model reloads and can then immediately reuse a prompt generated under the old tokenizer, so the “next generation” does not actually reflect the new mode and the cached tensors may not match the new tokenizer. Include use_48k_speech_tokenizer (or a tokenizer ID) in the cache key, or invalidate prompt cache entries when the mode changes.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@backend/backends/pytorch_backend.py` around lines 208 - 214, The prompt cache
is keyed only by (audio_path, reference_text) in create_voice_prompt(), which
leads to reuse of prompts across speech_tokenizer variants; update the cache key
to include the tokenizer mode (e.g., self._use_48k_speech_tokenizer or a
tokenizer identifier such as model.model.speech_tokenizer.__class__.__name__ /
tokenizer ID) or clear/invalidate relevant entries when
self._use_48k_speech_tokenizer is changed (where you set it in the model load
block), so create_voice_prompt() returns prompts generated with the currently
active tokenizer.

finally:
# Exit the patch context
tracker_context.__exit__(None, None, None)
Expand Down
22 changes: 22 additions & 0 deletions backend/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
Handles data directory configuration for production bundling.
"""

import json
import os
from pathlib import Path

Expand Down Expand Up @@ -66,3 +67,24 @@ def get_models_dir() -> Path:
path = _data_dir / "models"
path.mkdir(parents=True, exist_ok=True)
return path

def get_settings_path() -> Path:
    """Return the path of the persisted app-settings JSON file."""
    settings_file = _data_dir.joinpath("settings.json")
    return settings_file

def load_app_settings() -> dict:
    """Load app settings from JSON file. Returns empty dict if file not found."""
    path = get_settings_path()
    if not path.exists():
        # No settings file yet — behave as if nothing has been configured.
        return {}
    try:
        return json.loads(path.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, OSError) as exc:
        # Corrupt or unreadable file: log and fall back to an empty dict.
        print(f"[config] Failed to load settings from {path}: {exc}")
        return {}

def save_app_settings(data: dict) -> None:
    """Save app settings to JSON file.

    Writes atomically: the JSON is first written to a temp file in the
    same directory, then moved into place with os.replace(), so a crash
    mid-write can never leave a truncated/corrupt settings.json behind.
    """
    path = get_settings_path()
    path.parent.mkdir(parents=True, exist_ok=True)
    # Same directory as the target so os.replace() stays on one filesystem.
    tmp_path = path.with_suffix(".json.tmp")
    tmp_path.write_text(json.dumps(data, indent=2), encoding="utf-8")
    os.replace(tmp_path, path)
20 changes: 20 additions & 0 deletions backend/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -1764,6 +1764,26 @@ def _get_gpu_status() -> str:
return "None (CPU only)"


@app.get("/settings", response_model=models.AppSettings)
async def get_settings():
    """Return current application settings.

    A missing (or unreadable) settings file yields model defaults,
    because load_app_settings() returns {} in those cases.
    """
    stored = config.load_app_settings()
    return models.AppSettings(**stored)


@app.patch("/settings", response_model=models.AppSettings)
async def update_settings(update: models.AppSettingsUpdate):
    """Partially update application settings.

    Loads persisted settings, applies only the fields the caller provided,
    validates the merged result, persists it, and returns it.
    """
    path = config.get_settings_path()
    data = config.load_app_settings()
    # load_app_settings() returns {} for a missing file, a read/parse
    # failure, OR a legitimately-empty settings object. Only report a read
    # failure when the file exists and holds something other than an empty
    # object — otherwise a valid `{}` settings file would wrongly 500 here.
    if data == {} and path.exists():
        try:
            raw = path.read_text(encoding="utf-8").strip()
        except OSError:
            raw = None
        if raw not in ("", "{}"):
            raise HTTPException(status_code=500, detail="Failed to read settings")

    patch = update.model_dump(exclude_none=True)
    data.update(patch)
    validated = models.AppSettings(**data)
    config.save_app_settings(validated.model_dump())
    return validated


@app.on_event("startup")
async def startup_event():
"""Run on application startup."""
Expand Down
10 changes: 10 additions & 0 deletions backend/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,3 +299,13 @@ class StoryItemTrim(BaseModel):
class StoryItemSplit(BaseModel):
"""Request model for splitting a story item."""
split_time_ms: int = Field(..., ge=0) # Time within the clip to split at (relative to clip start)


class AppSettings(BaseModel):
    """Application settings persisted to data/settings.json (GET/PATCH /settings)."""
    # Opt-in 48kHz speech tokenizer; only the PyTorch backend applies it.
    use_48k_speech_tokenizer: bool = False


class AppSettingsUpdate(BaseModel):
    """Partial update model for application settings.

    Every field defaults to None; PATCH /settings applies only the fields
    the caller actually provided (None values are excluded from the merge).
    """
    use_48k_speech_tokenizer: Optional[bool] = None
3 changes: 3 additions & 0 deletions data/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"use_48k_speech_tokenizer": false
}