From a552347245556533844c6ee372d458f599663703 Mon Sep 17 00:00:00 2001 From: Yizuki_Ame Date: Mon, 16 Mar 2026 23:05:55 +0800 Subject: [PATCH 1/3] feat: add TTS Model ID configuration UI Add a Model ID input field to the TTS provider settings dialog, allowing users to customize the model used for text-to-speech generation. Changes: - Add Model ID input to tts-settings.tsx with conditional rendering (only shown for providers that support model IDs: OpenAI, GLM, Qwen) - Import DEFAULT_TTS_MODELS constant for placeholder and conditional logic - Add ttsModelId i18n keys for Chinese and English locales - Add modelId field support in audio constants (DEFAULT_TTS_MODELS map) - Add modelId to TTSProviderConfig type and settings store - Wire modelId through TTS API route and provider implementations The field shows the default model as placeholder text and persists user-specified model IDs to the settings store. --- app/api/generate/tts/route.ts | 4 +++- components/settings/audio-settings.tsx | 25 ++++++++++++++++++++++++- components/settings/tts-settings.tsx | 20 +++++++++++++++++++- lib/audio/constants.ts | 13 +++++++++++++ lib/audio/tts-providers.ts | 11 ++++++----- lib/audio/types.ts | 7 +++---- lib/export/latex-to-omml.ts | 4 +++- lib/hooks/use-scene-generator.ts | 1 + lib/i18n/settings.ts | 2 ++ lib/store/settings.ts | 3 ++- 10 files changed, 76 insertions(+), 14 deletions(-) diff --git a/app/api/generate/tts/route.ts b/app/api/generate/tts/route.ts index 4ae820c7..73fe6c55 100644 --- a/app/api/generate/tts/route.ts +++ b/app/api/generate/tts/route.ts @@ -21,7 +21,7 @@ export const maxDuration = 30; export async function POST(req: NextRequest) { try { const body = await req.json(); - const { text, audioId, ttsProviderId, ttsVoice, ttsSpeed, ttsApiKey, ttsBaseUrl } = body as { + const { text, audioId, ttsProviderId, ttsVoice, ttsSpeed, ttsApiKey, ttsBaseUrl, ttsModelId } = body as { text: string; audioId: string; ttsProviderId: TTSProviderId; @@ -29,6 +29,7 @@ export async function POST(req: NextRequest) { ttsSpeed?: number; ttsApiKey?: string; ttsBaseUrl?: string; + ttsModelId?: string; }; // Validate required fields @@ -56,6 +57,7 @@ export async function POST(req: NextRequest) { speed: ttsSpeed ?? 1.0, apiKey, baseUrl, + modelId: ttsModelId || undefined, }; log.info( diff --git a/components/settings/audio-settings.tsx b/components/settings/audio-settings.tsx index 9a65ef80..d95de195 100644 --- a/components/settings/audio-settings.tsx +++ b/components/settings/audio-settings.tsx @@ -20,6 +20,7 @@ import { getTTSVoices, ASR_PROVIDERS, getASRSupportedLanguages, + DEFAULT_TTS_MODELS, } from '@/lib/audio/constants'; import type { TTSProviderId, ASRProviderId } from '@/lib/audio/types'; import { Volume2, Mic, MicOff, Loader2, CheckCircle2, XCircle, Eye, EyeOff } from 'lucide-react'; @@ -112,7 +113,7 @@ export function AudioSettings({ onSave }: AudioSettingsProps = {}) { const handleTTSProviderConfigChange = ( providerId: TTSProviderId, - config: Partial<{ apiKey: string; baseUrl: string; enabled: boolean }>, + config: Partial<{ apiKey: string; baseUrl: string; enabled: boolean; modelId: string }>, ) => { setTTSProviderConfig(providerId, config); onSave?.(); @@ -316,6 +317,11 @@ export function AudioSettings({ onSave }: AudioSettingsProps = {}) { requestBody.ttsBaseUrl = baseUrlValue; } + const modelIdValue = ttsProvidersConfig[ttsProviderId]?.modelId; + if (modelIdValue && modelIdValue.trim()) { + requestBody.ttsModelId = modelIdValue; + } + const response = await fetch('/api/generate/tts', { method: 'POST', headers: { 'Content-Type': 'application/json' }, @@ -591,6 +597,23 @@ export function AudioSettings({ onSave }: AudioSettingsProps = {}) { /> + + {/* Model ID input - only show for providers that use model ID */} + {DEFAULT_TTS_MODELS[ttsProviderId] && ( +
+ + + handleTTSProviderConfigChange(ttsProviderId, { + modelId: e.target.value, + }) + } + className="text-sm" + /> +
+ )} {(() => { const effectiveBaseUrl = ttsProvidersConfig[ttsProviderId]?.baseUrl || ttsProvider.defaultBaseUrl || ''; diff --git a/components/settings/tts-settings.tsx b/components/settings/tts-settings.tsx index 45a03f51..3160d93e 100644 --- a/components/settings/tts-settings.tsx +++ b/components/settings/tts-settings.tsx @@ -6,7 +6,7 @@ import { Input } from '@/components/ui/input'; import { Button } from '@/components/ui/button'; import { useI18n } from '@/lib/hooks/use-i18n'; import { useSettingsStore } from '@/lib/store/settings'; -import { TTS_PROVIDERS, DEFAULT_TTS_VOICES } from '@/lib/audio/constants'; +import { TTS_PROVIDERS, DEFAULT_TTS_VOICES, DEFAULT_TTS_MODELS } from '@/lib/audio/constants'; import type { TTSProviderId } from '@/lib/audio/types'; import { Volume2, Loader2, CheckCircle2, XCircle, Eye, EyeOff } from 'lucide-react'; import { cn } from '@/lib/utils'; @@ -190,6 +190,24 @@ export function TTSSettings({ selectedProviderId }: TTSSettingsProps) { /> + + {/* Model ID input - only show for providers that use model ID */} + {DEFAULT_TTS_MODELS[selectedProviderId] && ( +
+ + + setTTSProviderConfig(selectedProviderId, { + modelId: e.target.value, + }) + } + className="text-sm" + /> +
+ )} + {/* Request URL Preview */} {(() => { const effectiveBaseUrl = diff --git a/lib/audio/constants.ts b/lib/audio/constants.ts index 55a5cbb3..6203a3b3 100644 --- a/lib/audio/constants.ts +++ b/lib/audio/constants.ts @@ -836,6 +836,19 @@ export const DEFAULT_TTS_VOICES: Record = { 'browser-native-tts': 'default', }; +/** + * Default model ID for each TTS provider. + * Used as fallback when user has not configured a custom model ID. + * Empty string means the provider does not use a model ID parameter. + */ +export const DEFAULT_TTS_MODELS: Record = { + 'openai-tts': 'gpt-4o-mini-tts', + 'azure-tts': '', + 'glm-tts': 'glm-tts', + 'qwen-tts': 'qwen3-tts-flash', + 'browser-native-tts': '', +}; + /** * Get voices for a specific TTS provider */ diff --git a/lib/audio/tts-providers.ts b/lib/audio/tts-providers.ts index bf1f2c12..302db660 100644 --- a/lib/audio/tts-providers.ts +++ b/lib/audio/tts-providers.ts @@ -90,7 +90,7 @@ */ import type { TTSModelConfig } from './types'; -import { TTS_PROVIDERS } from './constants'; +import { TTS_PROVIDERS, DEFAULT_TTS_MODELS } from './constants'; /** * Result of TTS generation @@ -149,7 +149,7 @@ async function generateOpenAITTS( ): Promise { const baseUrl = config.baseUrl || TTS_PROVIDERS['openai-tts'].defaultBaseUrl; - // Use gpt-4o-mini-tts for best quality and intelligent realtime applications + // Use configurable model, fallback to default const response = await fetch(`${baseUrl}/audio/speech`, { method: 'POST', headers: { @@ -157,7 +157,7 @@ async function generateOpenAITTS( 'Content-Type': 'application/json; charset=utf-8', }, body: JSON.stringify({ - model: 'gpt-4o-mini-tts', + model: config.modelId || DEFAULT_TTS_MODELS['openai-tts'], input: text, voice: config.voice, speed: config.speed || 1.0, @@ -229,7 +229,7 @@ async function generateGLMTTS(config: TTSModelConfig, text: string): Promise { baseUrl: providerConfig?.baseUrl, voice: ttsVoice, speed: ttsSpeed, + modelId: providerConfig?.modelId, }; } diff --git a/lib/audio/types.ts b/lib/audio/types.ts index 43c37087..88cf53ce 100644 --- a/lib/audio/types.ts +++ b/lib/audio/types.ts @@ -129,6 +129,7 @@ export interface TTSModelConfig { voice: string; speed?: number; format?: string; + modelId?: string; } // ============================================================================ @@ -143,10 +144,8 @@ export interface TTSModelConfig { */ export type ASRProviderId = 'openai-whisper' | 'browser-native' | 'qwen-asr'; // Add new ASR providers below (uncomment and modify): -// | 'elevenlabs-asr' -// | 'assemblyai-asr' -// | 'deepgram-asr' -// | 'azure-asr' +// | 'assemblyai' +// | 'deepgram' /** * ASR Provider Configuration diff --git a/lib/export/latex-to-omml.ts b/lib/export/latex-to-omml.ts index 0aa6f926..f21d31e8 100644 --- a/lib/export/latex-to-omml.ts +++ b/lib/export/latex-to-omml.ts @@ -71,7 +71,9 @@ export function latexToOmml(latex: string, fontSize?: number): string | null { try { const mathml = temml.renderToString(latex); const cleaned = stripUnsupportedMathML(mathml); - const omml = mml2omml(cleaned); + const ommlOutput = mml2omml(cleaned); + // Handle case where mml2omml might return an object with the OMML string or directly a string + const omml = typeof ommlOutput === 'string' ? ommlOutput : String(ommlOutput); const szHundredths = fontSize ? Math.round(fontSize * 100) : undefined; return postProcessOmml(omml, szHundredths); } catch { diff --git a/lib/hooks/use-scene-generator.ts b/lib/hooks/use-scene-generator.ts index 1c7d540f..67019d77 100644 --- a/lib/hooks/use-scene-generator.ts +++ b/lib/hooks/use-scene-generator.ts @@ -225,6 +225,7 @@ export async function generateAndStoreTTS( ttsSpeed: settings.ttsSpeed, ttsApiKey: ttsProviderConfig?.apiKey || undefined, ttsBaseUrl: ttsProviderConfig?.baseUrl || undefined, + ttsModelId: ttsProviderConfig?.modelId || undefined, }), signal, }); diff --git a/lib/i18n/settings.ts b/lib/i18n/settings.ts index 1d2579c9..86fc2330 100644 --- a/lib/i18n/settings.ts +++ b/lib/i18n/settings.ts @@ -209,6 +209,7 @@ export const settingsZhCN = { ttsSpeed: '语速', ttsBaseUrl: 'Base URL', ttsApiKey: 'API 密钥', + ttsModelId: 'Model ID(模型标识)', asrProvider: 'ASR 提供商', asrLanguage: '识别语言', asrBaseUrl: 'Base URL', @@ -793,6 +794,7 @@ export const settingsEnUS = { ttsSpeed: 'Speed', ttsBaseUrl: 'Base URL', ttsApiKey: 'API Key', + ttsModelId: 'Model ID', asrProvider: 'ASR Provider', asrLanguage: 'Recognition Language', asrBaseUrl: 'Base URL', diff --git a/lib/store/settings.ts b/lib/store/settings.ts index 2ffc49ed..7d229640 100644 --- a/lib/store/settings.ts +++ b/lib/store/settings.ts @@ -48,6 +48,7 @@ export interface SettingsState { apiKey: string; baseUrl: string; enabled: boolean; + modelId?: string; isServerConfigured?: boolean; serverBaseUrl?: string; } @@ -175,7 +176,7 @@ export interface SettingsState { setASRLanguage: (language: string) => void; setTTSProviderConfig: ( providerId: TTSProviderId, - config: Partial<{ apiKey: string; baseUrl: string; enabled: boolean }>, + config: Partial<{ apiKey: string; baseUrl: string; enabled: boolean; modelId: string }>, ) => void; setASRProviderConfig: ( providerId: ASRProviderId, From 50b8a1ffa58f80c29d9c22d272d490356384d364 Mon Sep 17 00:00:00 2001 From: Yizuki_Ame Date: Tue, 17 Mar 2026 00:47:22 +0800 Subject: [PATCH 2/3] chore: remove redundant annotation from Model ID label --- lib/i18n/settings.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/i18n/settings.ts b/lib/i18n/settings.ts index 86fc2330..1d10ed19 100644 --- a/lib/i18n/settings.ts +++ b/lib/i18n/settings.ts @@ -209,7 +209,7 @@ export const settingsZhCN = { ttsSpeed: '语速', ttsBaseUrl: 'Base URL', ttsApiKey: 'API 密钥', - ttsModelId: 'Model ID(模型标识)', + ttsModelId: 'Model ID', asrProvider: 'ASR 提供商', asrLanguage: '识别语言', asrBaseUrl: 'Base URL', From 8e2f2c351a44df94d7ed36fbf1ba4ff83902e90c Mon Sep 17 00:00:00 2001 From: YizukiAme Date: Tue, 17 Mar 2026 08:58:50 +0800 Subject: [PATCH 3/3] style: fix Prettier formatting --- app/api/generate/tts/route.ts | 21 +++++++++--------- components/settings/tts-settings.tsx | 32 ++++++++++++++-------------- 2 files changed, 27 insertions(+), 26 deletions(-) diff --git a/app/api/generate/tts/route.ts b/app/api/generate/tts/route.ts index 73fe6c55..b1de8c2d 100644 --- a/app/api/generate/tts/route.ts +++ b/app/api/generate/tts/route.ts @@ -21,16 +21,17 @@ export const maxDuration = 30; export async function POST(req: NextRequest) { try { const body = await req.json(); - const { text, audioId, ttsProviderId, ttsVoice, ttsSpeed, ttsApiKey, ttsBaseUrl, ttsModelId } = body as { - text: string; - audioId: string; - ttsProviderId: TTSProviderId; - ttsVoice: string; - ttsSpeed?: number; - ttsApiKey?: string; - ttsBaseUrl?: string; - ttsModelId?: string; - }; + const { text, audioId, ttsProviderId, ttsVoice, ttsSpeed, ttsApiKey, ttsBaseUrl, ttsModelId } = + body as { + text: string; + audioId: string; + ttsProviderId: TTSProviderId; + ttsVoice: string; + ttsSpeed?: number; + ttsApiKey?: string; + ttsBaseUrl?: string; + ttsModelId?: string; + }; // Validate required fields if (!text || !audioId || !ttsProviderId || !ttsVoice) { diff --git a/components/settings/tts-settings.tsx b/components/settings/tts-settings.tsx index 3160d93e..3a44429c 100644 --- a/components/settings/tts-settings.tsx +++ b/components/settings/tts-settings.tsx @@ -191,22 +191,22 @@ export function TTSSettings({ selectedProviderId }: TTSSettingsProps) { - {/* Model ID input - only show for providers that use model ID */} - {DEFAULT_TTS_MODELS[selectedProviderId] && ( -
- - - setTTSProviderConfig(selectedProviderId, { - modelId: e.target.value, - }) - } - className="text-sm" - /> -
- )} + {/* Model ID input - only show for providers that use model ID */} + {DEFAULT_TTS_MODELS[selectedProviderId] && ( +
+ + + setTTSProviderConfig(selectedProviderId, { + modelId: e.target.value, + }) + } + className="text-sm" + /> +
+ )} {/* Request URL Preview */} {(() => {