Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Self-documenting help system with `make help`
- Colored output for better readability
- Supports parallel development server execution
- **48kHz speech tokenizer opt-in setting** - UI toggle in the Models tab (Advanced Settings) to enable the higher-quality 48kHz speech tokenizer on the PyTorch backend. Disabled by default; changes take effect from the next generation. Settings are persisted to `data/settings.json` and exposed via the `GET /settings` and `PATCH /settings` API endpoints.

### Changed
- **README** - Added Makefile reference and updated Quick Start with Makefile-based setup instructions alongside manual setup
Expand Down
59 changes: 59 additions & 0 deletions app/src/components/ServerSettings/ModelManagement.tsx
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query';
import { Download, Loader2, Trash2 } from 'lucide-react';
import { Checkbox } from '@/components/ui/checkbox';
import { useCallback, useState } from 'react';
import {
AlertDialog,
Expand All @@ -24,6 +25,36 @@ export function ModelManagement() {
const [downloadingModel, setDownloadingModel] = useState<string | null>(null);
const [downloadingDisplayName, setDownloadingDisplayName] = useState<string | null>(null);

const { data: appSettings } = useQuery({
queryKey: ['appSettings'],
queryFn: () => apiClient.getSettings(),
});

const { data: health } = useQuery({
queryKey: ['health'],
queryFn: () => apiClient.getHealth(),
});

const settingsMutation = useMutation({
mutationFn: (data: { use_48k_speech_tokenizer: boolean }) => apiClient.updateSettings(data),
onSuccess: (updated) => {
queryClient.setQueryData(['appSettings'], updated);
toast({
title: 'Setting updated',
description: updated.use_48k_speech_tokenizer
? '48kHz speech tokenizer enabled. It will apply from your next generation.'
: '48kHz speech tokenizer disabled. It will apply from your next generation.',
});
},
onError: (error: Error) => {
toast({
title: 'Failed to update setting',
description: error.message,
variant: 'destructive',
});
},
});

const { data: modelStatus, isLoading } = useQuery({
queryKey: ['modelStatus'],
queryFn: async () => {
Expand Down Expand Up @@ -215,6 +246,34 @@ export function ModelManagement() {

</div>
) : null}

{health?.backend_type === 'pytorch' && (
<div className="mt-6 pt-6 border-t">
<h3 className="text-sm font-semibold mb-3 text-muted-foreground">Advanced Settings</h3>
<div className="flex items-start space-x-3">
<Checkbox
id="use48kTokenizer"
checked={appSettings?.use_48k_speech_tokenizer ?? false}
onCheckedChange={(checked: boolean) => {
settingsMutation.mutate({ use_48k_speech_tokenizer: checked });
}}
disabled={settingsMutation.isPending}
/>
<div className="space-y-1">
<label
htmlFor="use48kTokenizer"
className="text-sm font-medium leading-none peer-disabled:cursor-not-allowed peer-disabled:opacity-70 cursor-pointer"
>
Use 48kHz speech tokenizer (experimental)
</label>
<p className="text-sm text-muted-foreground">
Enables a higher quality 48kHz speech tokenizer. Changes take
effect from your next generation.
</p>
</div>
</div>
</div>
)}
</CardContent>

{/* Delete Confirmation Dialog */}
Expand Down
13 changes: 13 additions & 0 deletions app/src/lib/api/client.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import type {
StoryItemMove,
StoryItemTrim,
StoryItemSplit,
AppSettings,
} from './types';

class ApiClient {
Expand Down Expand Up @@ -495,6 +496,18 @@ class ApiClient {
});
}

// App Settings
/** Fetch the current persisted application settings (`GET /settings`). */
async getSettings(): Promise<AppSettings> {
  return this.request<AppSettings>('/settings');
}

/**
 * Partially update application settings (`PATCH /settings`).
 * Only the fields present in `data` are changed on the server;
 * resolves with the full updated settings object.
 */
async updateSettings(data: Partial<AppSettings>): Promise<AppSettings> {
  return this.request<AppSettings>('/settings', {
    method: 'PATCH',
    body: JSON.stringify(data),
  });
}

async exportStoryAudio(storyId: string): Promise<Blob> {
const url = `${this.getBaseUrl()}/stories/${storyId}/export-audio`;
const response = await fetch(url);
Expand Down
5 changes: 5 additions & 0 deletions app/src/lib/api/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ export interface HealthResponse {
model_size?: string;
gpu_available: boolean;
vram_used_mb?: number;
backend_type?: string;
}

export interface ModelProgress {
Expand Down Expand Up @@ -202,3 +203,7 @@ export interface StoryItemTrim {
/** Request body for splitting a story item. */
export interface StoryItemSplit {
  /** Split position in milliseconds, relative to the clip's own start. */
  split_time_ms: number;
}

/** Application settings returned by `GET /settings` and `PATCH /settings`. */
export interface AppSettings {
  /**
   * Opt-in to the higher-quality 48kHz speech tokenizer.
   * Applied by the PyTorch backend only; takes effect from the next generation.
   */
  use_48k_speech_tokenizer: boolean;
}
28 changes: 28 additions & 0 deletions backend/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,34 @@ Manually load TTS model.
#### `POST /models/unload`
Unload TTS model to free memory.

### Settings

#### `GET /settings`
Get current application settings.

**Response:**
```json
{
"use_48k_speech_tokenizer": false
}
```

#### `PATCH /settings`
Partially update application settings. Only provided fields are updated.

**Request:**
```json
{
"use_48k_speech_tokenizer": true
}
```

**Response:** Updated settings object (same shape as `GET /settings`).

Settings are persisted to `data/settings.json` and survive server restarts.

> **Note:** `use_48k_speech_tokenizer` is only applied by the PyTorch backend. It has no effect on the MLX backend.

## Database Schema

### profiles
Expand Down
32 changes: 25 additions & 7 deletions backend/backends/pytorch_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
from pathlib import Path

from . import TTSBackend, STTBackend
from .. import config
from ..models import AppSettings
from ..utils.cache import get_cache_key, get_cached_voice_prompt, cache_voice_prompt
from ..utils.audio import normalize_audio, load_audio
from ..utils.progress import get_progress_manager
Expand All @@ -24,6 +26,7 @@ def __init__(self, model_size: str = "1.7B"):
self.model_size = model_size
self.device = self._get_device()
self._current_model_size = None
self._use_48k_speech_tokenizer = False

def _get_device(self) -> str:
"""Get the best available device."""
Expand Down Expand Up @@ -121,15 +124,21 @@ async def load_model_async(self, model_size: Optional[str] = None):
"""
if model_size is None:
model_size = self.model_size

# If already loaded with correct size, return
if self.model is not None and self._current_model_size == model_size:

requested_48k = AppSettings(**config.load_app_settings()).use_48k_speech_tokenizer

# If already loaded with correct size and same 48k setting, return
if (
self.model is not None
and self._current_model_size == model_size
and self._use_48k_speech_tokenizer == requested_48k
):
return
# Unload existing model if different size requested
if self.model is not None and self._current_model_size != model_size:

# Unload existing model if reload is needed
if self.model is not None:
self.unload_model()

# Run blocking load in thread pool
await asyncio.to_thread(self._load_model_sync, model_size)
Copy link
Contributor

@coderabbitai coderabbitai bot Mar 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

Guard reloads against in-flight inference.

This new reload path can call unload_model() while another request is still inside create_voice_clone_prompt() or generate_voice_clone() on the same self.model. Toggling the checkbox during concurrent requests can therefore tear down the live model underneath an active generation and produce nondeterministic failures. Please serialize model unload/reload against prompt creation and generation, or defer the swap until no active operations are using the model.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@backend/backends/pytorch_backend.py` around lines 138 - 143, The reload path
currently calls unload_model() and _load_model_sync while other coroutines may
be using self.model (e.g., create_voice_clone_prompt() and
generate_voice_clone()), so add serialization: introduce an instance-level
asyncio.Lock or a read-write pattern (e.g., an asyncio.Lock named
model_swap_lock or an active_ops counter with an asyncio.Event) and use it to
prevent unloading during in-flight inference; acquire the lock (or increment
active_ops) at the start of create_voice_clone_prompt() and
generate_voice_clone() and release (or decrement) when done, and in the reload
branch (the code that calls unload_model() and await
asyncio.to_thread(self._load_model_sync, ...)) acquire the swap lock or wait for
active_ops==0 before calling unload_model() so model teardown is deferred until
no active operations are using self.model.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The same issue exists in the original code and is unrelated to this PR.

_generate_sync directly references self.model (L376), which is the core of the problem.

Race condition scenario (pre-existing):

  1. Request A calls generate()load_model_async() sees model is loaded, passes through → starts await asyncio.to_thread(_generate_sync)
  2. The event loop switches to Request B (e.g., model size change) → load_model_async() detects size mismatch → unload_model() sets self.model = None / del self.model → begins loading the new model
  3. Request A's _generate_sync thread resumes and calls self.model.generate_voice_clone(...) → crashes because self.model is now None

Why this rarely occurs in practice:

  • In typical usage, a model size change and an in-flight generation request almost never happen concurrently
  • FastAPI route handlers may also serialize requests in practice

In short, this is a pre-existing latent design issue that predates this PR. The 48kHz tokenizer toggle makes it marginally more likely to occur (since toggling the setting while a generation is running is a new possible trigger), but addressing it is out of scope for this PR.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The PR description image shows a dark-themed "Advanced Settings" UI panel with a gold/tan checkmark checkbox for "Use 48kHz speech tokenizer (experimental)", accompanied by the description "Enables a higher quality 48kHz speech tokenizer. Changes take effect from your next generation."


@takuma104, you're right — the race condition is a pre-existing latent issue rooted in _generate_sync holding a direct reference to self.model without any lock, and this PR doesn't change that fundamental design. The 48kHz toggle is a marginal incremental exposure at most. I'll withdraw the critical flag for this PR.

That said, the issue is worth tracking so it doesn't get forgotten. Would you like me to open a follow-up GitHub issue to capture the model_swap_lock / active-ops guard work for a future PR?


✏️ Learnings added
Learnt from: takuma104
Repo: jamiepine/voicebox PR: 253
File: backend/backends/pytorch_backend.py:0-0
Timestamp: 2026-03-12T18:05:24.715Z
Learning: In `backend/backends/pytorch_backend.py` (PyTorchTTSBackend), the model reload/unload race condition (unloading self.model while _generate_sync or _create_prompt_sync threads hold a direct reference to it) is a pre-existing latent design issue that predates the 48kHz tokenizer PR. It is not introduced by the 48kHz toggle changes. Fixing it requires an asyncio.Lock or active-ops counter to serialize model swap against in-flight inference, and is tracked as a future follow-up.

Note: Learnings are effective only in the context of similar code segments. To apply general coding guidelines, please configure review instructions. You can manage existing learnings in the app.


Expand Down Expand Up @@ -195,6 +204,15 @@ def _load_model_sync(self, model_size: str):
device_map=self.device,
torch_dtype=torch.bfloat16,
)

# optionally replace speech tokenizer with 48kHz version for better audio quality
settings = AppSettings(**config.load_app_settings())
if settings.use_48k_speech_tokenizer:
from qwen_tts import Qwen3TTSTokenizer
self.model.model.speech_tokenizer = Qwen3TTSTokenizer.from_pretrained(
"takuma104/Qwen3-TTS-Tokenizer-12Hz-48kHz"
)
self._use_48k_speech_tokenizer = settings.use_48k_speech_tokenizer
finally:
# Exit the patch context
tracker_context.__exit__(None, None, None)
Expand Down
22 changes: 22 additions & 0 deletions backend/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
Handles data directory configuration for production bundling.
"""

import json
import os
from pathlib import Path

Expand Down Expand Up @@ -66,3 +67,24 @@ def get_models_dir() -> Path:
path = _data_dir / "models"
path.mkdir(parents=True, exist_ok=True)
return path

def get_settings_path() -> Path:
    """Return the path of the JSON file that stores persisted app settings."""
    settings_file = _data_dir / "settings.json"
    return settings_file

def load_app_settings() -> dict:
    """Read persisted app settings from disk.

    Returns the parsed JSON object, or an empty dict when the settings file
    is absent or unreadable (parse/IO failures are logged, never raised).
    """
    settings_path = get_settings_path()
    if not settings_path.exists():
        return {}
    try:
        raw = settings_path.read_text(encoding="utf-8")
        return json.loads(raw)
    except (json.JSONDecodeError, OSError) as err:
        print(f"[config] Failed to load settings from {settings_path}: {err}")
        return {}

def save_app_settings(data: dict) -> None:
    """Persist app settings to the JSON settings file.

    Writes atomically: the payload is first written to a sibling temp file
    and then moved into place with ``os.replace``, so a crash mid-write
    cannot leave a truncated/corrupt ``settings.json`` behind (which would
    otherwise make ``load_app_settings`` silently fall back to defaults).

    Args:
        data: JSON-serializable mapping of settings to persist.
    """
    path = get_settings_path()
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = path.with_name(path.name + ".tmp")
    tmp_path.write_text(json.dumps(data, indent=2), encoding="utf-8")
    # Atomic on POSIX and Windows; replaces any existing settings file.
    os.replace(tmp_path, path)
20 changes: 20 additions & 0 deletions backend/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -1764,6 +1764,26 @@ def _get_gpu_status() -> str:
return "None (CPU only)"


@app.get("/settings", response_model=models.AppSettings)
async def get_settings():
    """Expose the persisted application settings (defaults fill any gaps)."""
    stored = config.load_app_settings()
    settings = models.AppSettings(**stored)
    return settings


@app.patch("/settings", response_model=models.AppSettings)
async def update_settings(update: models.AppSettingsUpdate):
    """Partially update application settings.

    Only fields explicitly provided in the request body are changed; the
    merged result is validated and written back to the settings file.
    """
    data = config.load_app_settings()
    # load_app_settings() returns {} both when the file is missing and when
    # it is unreadable; if the file exists but nothing was loaded, treat it
    # as a read failure instead of silently clobbering stored settings.
    # NOTE(review): a settings.json that legitimately contains `{}` would
    # also trip this 500 — confirm that trade-off is acceptable.
    if data == {} and config.get_settings_path().exists():
        raise HTTPException(status_code=500, detail="Failed to read settings")

    # Drop unset/None fields so a partial PATCH leaves other settings intact.
    patch = update.model_dump(exclude_none=True)
    data.update(patch)
    # Re-validate the merged dict so invalid values never reach disk.
    validated = models.AppSettings(**data)
    config.save_app_settings(validated.model_dump())
    return validated


@app.on_event("startup")
async def startup_event():
"""Run on application startup."""
Expand Down
10 changes: 10 additions & 0 deletions backend/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,3 +299,13 @@ class StoryItemTrim(BaseModel):
class StoryItemSplit(BaseModel):
    """Request model for splitting a story item."""
    # Non-negative split point, measured from the clip's own start rather
    # than the story timeline.
    split_time_ms: int = Field(..., ge=0)  # Time within the clip to split at (relative to clip start)


class AppSettings(BaseModel):
    """Application settings persisted to data/settings.json."""
    # When True, the PyTorch backend swaps in the 48kHz speech tokenizer on
    # its next model (re)load; other backends ignore this flag.
    use_48k_speech_tokenizer: bool = False


class AppSettingsUpdate(BaseModel):
    """Partial update model for application settings.

    Every field is optional; None means "leave unchanged" — the PATCH
    handler strips None fields via model_dump(exclude_none=True).
    """
    use_48k_speech_tokenizer: Optional[bool] = None
3 changes: 3 additions & 0 deletions data/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"use_48k_speech_tokenizer": false
}