From a552347245556533844c6ee372d458f599663703 Mon Sep 17 00:00:00 2001
From: Yizuki_Ame <yinzimike@gmail.com>
Date: Mon, 16 Mar 2026 23:05:55 +0800
Subject: [PATCH 1/3] feat: add TTS Model ID configuration UI

Add a Model ID input field to the TTS provider settings dialog,
allowing users to customize the model used for text-to-speech generation.

Changes:
- Add Model ID input to tts-settings.tsx and audio-settings.tsx with
  conditional rendering (only shown for providers that have a default
  model ID: OpenAI, GLM, Qwen)
- Import DEFAULT_TTS_MODELS constant for placeholder and conditional logic
- Add ttsModelId i18n keys for Chinese and English locales
- Add DEFAULT_TTS_MODELS map (default model ID per provider) to audio constants
- Add modelId to the TTSModelConfig type and to the settings store's
  TTS provider config
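- Wire modelId through the scene-generator hook, the TTS API route, and
  the OpenAI/GLM/Qwen provider implementations
- Coerce the mml2omml result to a string in latex-to-omml.ts
- Trim the commented-out ASR provider examples in lib/audio/types.ts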

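The field shows the provider's default model as placeholder text and persists
user-specified model IDs to the settings store. When the field is left empty,
requests fall back to the provider's default from DEFAULT_TTS_MODELS.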
---
 app/api/generate/tts/route.ts          |  4 +++-
 components/settings/audio-settings.tsx | 25 ++++++++++++++++++++++++-
 components/settings/tts-settings.tsx   | 20 +++++++++++++++++++-
 lib/audio/constants.ts                 | 13 +++++++++++++
 lib/audio/tts-providers.ts             | 11 ++++++-----
 lib/audio/types.ts                     |  7 +++----
 lib/export/latex-to-omml.ts            |  4 +++-
 lib/hooks/use-scene-generator.ts       |  1 +
 lib/i18n/settings.ts                   |  2 ++
 lib/store/settings.ts                  |  3 ++-
 10 files changed, 76 insertions(+), 14 deletions(-)

diff --git a/app/api/generate/tts/route.ts b/app/api/generate/tts/route.ts
index 4ae820c7..73fe6c55 100644
--- a/app/api/generate/tts/route.ts
+++ b/app/api/generate/tts/route.ts
@@ -21,7 +21,7 @@ export const maxDuration = 30;
 export async function POST(req: NextRequest) {
   try {
     const body = await req.json();
-    const { text, audioId, ttsProviderId, ttsVoice, ttsSpeed, ttsApiKey, ttsBaseUrl } = body as {
+    const { text, audioId, ttsProviderId, ttsVoice, ttsSpeed, ttsApiKey, ttsBaseUrl, ttsModelId } = body as {
       text: string;
       audioId: string;
       ttsProviderId: TTSProviderId;
@@ -29,6 +29,7 @@ export async function POST(req: NextRequest) {
       ttsSpeed?: number;
       ttsApiKey?: string;
       ttsBaseUrl?: string;
+      ttsModelId?: string;
     };
 
     // Validate required fields
@@ -56,6 +57,7 @@ export async function POST(req: NextRequest) {
       speed: ttsSpeed ?? 1.0,
       apiKey,
       baseUrl,
+      modelId: ttsModelId || undefined,
     };
 
     log.info(
diff --git a/components/settings/audio-settings.tsx b/components/settings/audio-settings.tsx
index 9a65ef80..d95de195 100644
--- a/components/settings/audio-settings.tsx
+++ b/components/settings/audio-settings.tsx
@@ -20,6 +20,7 @@ import {
   getTTSVoices,
   ASR_PROVIDERS,
   getASRSupportedLanguages,
+  DEFAULT_TTS_MODELS,
 } from '@/lib/audio/constants';
 import type { TTSProviderId, ASRProviderId } from '@/lib/audio/types';
 import { Volume2, Mic, MicOff, Loader2, CheckCircle2, XCircle, Eye, EyeOff } from 'lucide-react';
@@ -112,7 +113,7 @@ export function AudioSettings({ onSave }: AudioSettingsProps = {}) {
 
   const handleTTSProviderConfigChange = (
     providerId: TTSProviderId,
-    config: Partial<{ apiKey: string; baseUrl: string; enabled: boolean }>,
+    config: Partial<{ apiKey: string; baseUrl: string; enabled: boolean; modelId: string }>,
   ) => {
     setTTSProviderConfig(providerId, config);
     onSave?.();
@@ -316,6 +317,11 @@ export function AudioSettings({ onSave }: AudioSettingsProps = {}) {
         requestBody.ttsBaseUrl = baseUrlValue;
       }
 
+      const modelIdValue = ttsProvidersConfig[ttsProviderId]?.modelId;
+      if (modelIdValue && modelIdValue.trim()) {
+        requestBody.ttsModelId = modelIdValue;
+      }
+
       const response = await fetch('/api/generate/tts', {
         method: 'POST',
         headers: { 'Content-Type': 'application/json' },
@@ -591,6 +597,23 @@ export function AudioSettings({ onSave }: AudioSettingsProps = {}) {
                   />
                 </div>
               </div>
+
+              {/* Model ID input - only show for providers that use model ID */}
+              {DEFAULT_TTS_MODELS[ttsProviderId] && (
+                <div className="space-y-2">
+                  <Label className="text-sm">{t('settings.ttsModelId')}</Label>
+                  <Input
+                    placeholder={DEFAULT_TTS_MODELS[ttsProviderId]}
+                    value={ttsProvidersConfig[ttsProviderId]?.modelId || ''}
+                    onChange={(e) =>
+                      handleTTSProviderConfigChange(ttsProviderId, {
+                        modelId: e.target.value,
+                      })
+                    }
+                    className="text-sm"
+                  />
+                </div>
+              )}
               {(() => {
                 const effectiveBaseUrl =
                   ttsProvidersConfig[ttsProviderId]?.baseUrl || ttsProvider.defaultBaseUrl || '';
diff --git a/components/settings/tts-settings.tsx b/components/settings/tts-settings.tsx
index 45a03f51..3160d93e 100644
--- a/components/settings/tts-settings.tsx
+++ b/components/settings/tts-settings.tsx
@@ -6,7 +6,7 @@ import { Input } from '@/components/ui/input';
 import { Button } from '@/components/ui/button';
 import { useI18n } from '@/lib/hooks/use-i18n';
 import { useSettingsStore } from '@/lib/store/settings';
-import { TTS_PROVIDERS, DEFAULT_TTS_VOICES } from '@/lib/audio/constants';
+import { TTS_PROVIDERS, DEFAULT_TTS_VOICES, DEFAULT_TTS_MODELS } from '@/lib/audio/constants';
 import type { TTSProviderId } from '@/lib/audio/types';
 import { Volume2, Loader2, CheckCircle2, XCircle, Eye, EyeOff } from 'lucide-react';
 import { cn } from '@/lib/utils';
@@ -190,6 +190,24 @@ export function TTSSettings({ selectedProviderId }: TTSSettingsProps) {
               />
             </div>
           </div>
+
+              {/* Model ID input - only show for providers that use model ID */}
+              {DEFAULT_TTS_MODELS[selectedProviderId] && (
+                <div className="space-y-2">
+                  <Label className="text-sm">{t('settings.ttsModelId')}</Label>
+                  <Input
+                    placeholder={DEFAULT_TTS_MODELS[selectedProviderId]}
+                    value={ttsProvidersConfig[selectedProviderId]?.modelId || ''}
+                    onChange={(e) =>
+                      setTTSProviderConfig(selectedProviderId, {
+                        modelId: e.target.value,
+                      })
+                    }
+                    className="text-sm"
+                  />
+                </div>
+              )}
+
           {/* Request URL Preview */}
           {(() => {
             const effectiveBaseUrl =
diff --git a/lib/audio/constants.ts b/lib/audio/constants.ts
index 55a5cbb3..6203a3b3 100644
--- a/lib/audio/constants.ts
+++ b/lib/audio/constants.ts
@@ -836,6 +836,19 @@ export const DEFAULT_TTS_VOICES: Record<TTSProviderId, string> = {
   'browser-native-tts': 'default',
 };
 
+/**
+ * Default model ID for each TTS provider.
+ * Used as fallback when user has not configured a custom model ID.
+ * Empty string means the provider does not use a model ID parameter.
+ */
+export const DEFAULT_TTS_MODELS: Record<TTSProviderId, string> = {
+  'openai-tts': 'gpt-4o-mini-tts',
+  'azure-tts': '',
+  'glm-tts': 'glm-tts',
+  'qwen-tts': 'qwen3-tts-flash',
+  'browser-native-tts': '',
+};
+
 /**
  * Get voices for a specific TTS provider
  */
diff --git a/lib/audio/tts-providers.ts b/lib/audio/tts-providers.ts
index bf1f2c12..302db660 100644
--- a/lib/audio/tts-providers.ts
+++ b/lib/audio/tts-providers.ts
@@ -90,7 +90,7 @@
  */
 
 import type { TTSModelConfig } from './types';
-import { TTS_PROVIDERS } from './constants';
+import { TTS_PROVIDERS, DEFAULT_TTS_MODELS } from './constants';
 
 /**
  * Result of TTS generation
@@ -149,7 +149,7 @@ async function generateOpenAITTS(
 ): Promise<TTSGenerationResult> {
   const baseUrl = config.baseUrl || TTS_PROVIDERS['openai-tts'].defaultBaseUrl;
 
-  // Use gpt-4o-mini-tts for best quality and intelligent realtime applications
+  // Use configurable model, fallback to default
   const response = await fetch(`${baseUrl}/audio/speech`, {
     method: 'POST',
     headers: {
@@ -157,7 +157,7 @@ async function generateOpenAITTS(
       'Content-Type': 'application/json; charset=utf-8',
     },
     body: JSON.stringify({
-      model: 'gpt-4o-mini-tts',
+      model: config.modelId || DEFAULT_TTS_MODELS['openai-tts'],
       input: text,
       voice: config.voice,
       speed: config.speed || 1.0,
@@ -229,7 +229,7 @@ async function generateGLMTTS(config: TTSModelConfig, text: string): Promise<TTS
       'Content-Type': 'application/json; charset=utf-8',
     },
     body: JSON.stringify({
-      model: 'glm-tts',
+      model: config.modelId || DEFAULT_TTS_MODELS['glm-tts'],
       input: text,
       voice: config.voice,
       speed: config.speed || 1.0,
@@ -276,7 +276,7 @@ async function generateQwenTTS(config: TTSModelConfig, text: string): Promise<TT
       'Content-Type': 'application/json; charset=utf-8',
     },
     body: JSON.stringify({
-      model: 'qwen3-tts-flash',
+      model: config.modelId || DEFAULT_TTS_MODELS['qwen-tts'],
       input: {
         text,
         voice: config.voice,
@@ -337,6 +337,7 @@ export async function getCurrentTTSConfig(): Promise<TTSModelConfig> {
     baseUrl: providerConfig?.baseUrl,
     voice: ttsVoice,
     speed: ttsSpeed,
+    modelId: providerConfig?.modelId,
   };
 }
 
diff --git a/lib/audio/types.ts b/lib/audio/types.ts
index 43c37087..88cf53ce 100644
--- a/lib/audio/types.ts
+++ b/lib/audio/types.ts
@@ -129,6 +129,7 @@ export interface TTSModelConfig {
   voice: string;
   speed?: number;
   format?: string;
+  modelId?: string;
 }
 
 // ============================================================================
@@ -143,10 +144,8 @@ export interface TTSModelConfig {
  */
 export type ASRProviderId = 'openai-whisper' | 'browser-native' | 'qwen-asr';
 // Add new ASR providers below (uncomment and modify):
-// | 'elevenlabs-asr'
-// | 'assemblyai-asr'
-// | 'deepgram-asr'
-// | 'azure-asr'
+// | 'assemblyai'
+// | 'deepgram'
 
 /**
  * ASR Provider Configuration
diff --git a/lib/export/latex-to-omml.ts b/lib/export/latex-to-omml.ts
index 0aa6f926..f21d31e8 100644
--- a/lib/export/latex-to-omml.ts
+++ b/lib/export/latex-to-omml.ts
@@ -71,7 +71,9 @@ export function latexToOmml(latex: string, fontSize?: number): string | null {
   try {
     const mathml = temml.renderToString(latex);
     const cleaned = stripUnsupportedMathML(mathml);
-    const omml = mml2omml(cleaned);
+    const ommlOutput = mml2omml(cleaned);
+    // mml2omml may not return a plain string: pass strings through, coerce anything else with String()
+    const omml = typeof ommlOutput === 'string' ? ommlOutput : String(ommlOutput);
     const szHundredths = fontSize ? Math.round(fontSize * 100) : undefined;
     return postProcessOmml(omml, szHundredths);
   } catch {
diff --git a/lib/hooks/use-scene-generator.ts b/lib/hooks/use-scene-generator.ts
index 1c7d540f..67019d77 100644
--- a/lib/hooks/use-scene-generator.ts
+++ b/lib/hooks/use-scene-generator.ts
@@ -225,6 +225,7 @@ export async function generateAndStoreTTS(
       ttsSpeed: settings.ttsSpeed,
       ttsApiKey: ttsProviderConfig?.apiKey || undefined,
       ttsBaseUrl: ttsProviderConfig?.baseUrl || undefined,
+      ttsModelId: ttsProviderConfig?.modelId || undefined,
     }),
     signal,
   });
diff --git a/lib/i18n/settings.ts b/lib/i18n/settings.ts
index 1d2579c9..86fc2330 100644
--- a/lib/i18n/settings.ts
+++ b/lib/i18n/settings.ts
@@ -209,6 +209,7 @@ export const settingsZhCN = {
     ttsSpeed: '语速',
     ttsBaseUrl: 'Base URL',
     ttsApiKey: 'API 密钥',
+    ttsModelId: 'Model ID（模型标识）',
     asrProvider: 'ASR 提供商',
     asrLanguage: '识别语言',
     asrBaseUrl: 'Base URL',
@@ -793,6 +794,7 @@ export const settingsEnUS = {
     ttsSpeed: 'Speed',
     ttsBaseUrl: 'Base URL',
     ttsApiKey: 'API Key',
+    ttsModelId: 'Model ID',
     asrProvider: 'ASR Provider',
     asrLanguage: 'Recognition Language',
     asrBaseUrl: 'Base URL',
diff --git a/lib/store/settings.ts b/lib/store/settings.ts
index 2ffc49ed..7d229640 100644
--- a/lib/store/settings.ts
+++ b/lib/store/settings.ts
@@ -48,6 +48,7 @@ export interface SettingsState {
       apiKey: string;
       baseUrl: string;
       enabled: boolean;
+      modelId?: string;
       isServerConfigured?: boolean;
       serverBaseUrl?: string;
     }
@@ -175,7 +176,7 @@ export interface SettingsState {
   setASRLanguage: (language: string) => void;
   setTTSProviderConfig: (
     providerId: TTSProviderId,
-    config: Partial<{ apiKey: string; baseUrl: string; enabled: boolean }>,
+    config: Partial<{ apiKey: string; baseUrl: string; enabled: boolean; modelId: string }>,
   ) => void;
   setASRProviderConfig: (
     providerId: ASRProviderId,

From 50b8a1ffa58f80c29d9c22d272d490356384d364 Mon Sep 17 00:00:00 2001
From: Yizuki_Ame <yinzimike@gmail.com>
Date: Tue, 17 Mar 2026 00:47:22 +0800
Subject: [PATCH 2/3] chore: remove redundant annotation from Model ID label

---
 lib/i18n/settings.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/i18n/settings.ts b/lib/i18n/settings.ts
index 86fc2330..1d10ed19 100644
--- a/lib/i18n/settings.ts
+++ b/lib/i18n/settings.ts
@@ -209,7 +209,7 @@ export const settingsZhCN = {
     ttsSpeed: '语速',
     ttsBaseUrl: 'Base URL',
     ttsApiKey: 'API 密钥',
-    ttsModelId: 'Model ID（模型标识）',
+    ttsModelId: 'Model ID',
     asrProvider: 'ASR 提供商',
     asrLanguage: '识别语言',
     asrBaseUrl: 'Base URL',

From 8e2f2c351a44df94d7ed36fbf1ba4ff83902e90c Mon Sep 17 00:00:00 2001
From: YizukiAme <yizuki@users.noreply.github.com>
Date: Tue, 17 Mar 2026 08:58:50 +0800
Subject: [PATCH 3/3] style: fix Prettier formatting

---
 app/api/generate/tts/route.ts        | 21 +++++++++---------
 components/settings/tts-settings.tsx | 32 ++++++++++++++--------------
 2 files changed, 27 insertions(+), 26 deletions(-)

diff --git a/app/api/generate/tts/route.ts b/app/api/generate/tts/route.ts
index 73fe6c55..b1de8c2d 100644
--- a/app/api/generate/tts/route.ts
+++ b/app/api/generate/tts/route.ts
@@ -21,16 +21,17 @@ export const maxDuration = 30;
 export async function POST(req: NextRequest) {
   try {
     const body = await req.json();
-    const { text, audioId, ttsProviderId, ttsVoice, ttsSpeed, ttsApiKey, ttsBaseUrl, ttsModelId } = body as {
-      text: string;
-      audioId: string;
-      ttsProviderId: TTSProviderId;
-      ttsVoice: string;
-      ttsSpeed?: number;
-      ttsApiKey?: string;
-      ttsBaseUrl?: string;
-      ttsModelId?: string;
-    };
+    const { text, audioId, ttsProviderId, ttsVoice, ttsSpeed, ttsApiKey, ttsBaseUrl, ttsModelId } =
+      body as {
+        text: string;
+        audioId: string;
+        ttsProviderId: TTSProviderId;
+        ttsVoice: string;
+        ttsSpeed?: number;
+        ttsApiKey?: string;
+        ttsBaseUrl?: string;
+        ttsModelId?: string;
+      };
 
     // Validate required fields
     if (!text || !audioId || !ttsProviderId || !ttsVoice) {
diff --git a/components/settings/tts-settings.tsx b/components/settings/tts-settings.tsx
index 3160d93e..3a44429c 100644
--- a/components/settings/tts-settings.tsx
+++ b/components/settings/tts-settings.tsx
@@ -191,22 +191,22 @@ export function TTSSettings({ selectedProviderId }: TTSSettingsProps) {
             </div>
           </div>
 
-              {/* Model ID input - only show for providers that use model ID */}
-              {DEFAULT_TTS_MODELS[selectedProviderId] && (
-                <div className="space-y-2">
-                  <Label className="text-sm">{t('settings.ttsModelId')}</Label>
-                  <Input
-                    placeholder={DEFAULT_TTS_MODELS[selectedProviderId]}
-                    value={ttsProvidersConfig[selectedProviderId]?.modelId || ''}
-                    onChange={(e) =>
-                      setTTSProviderConfig(selectedProviderId, {
-                        modelId: e.target.value,
-                      })
-                    }
-                    className="text-sm"
-                  />
-                </div>
-              )}
+          {/* Model ID input - only show for providers that use model ID */}
+          {DEFAULT_TTS_MODELS[selectedProviderId] && (
+            <div className="space-y-2">
+              <Label className="text-sm">{t('settings.ttsModelId')}</Label>
+              <Input
+                placeholder={DEFAULT_TTS_MODELS[selectedProviderId]}
+                value={ttsProvidersConfig[selectedProviderId]?.modelId || ''}
+                onChange={(e) =>
+                  setTTSProviderConfig(selectedProviderId, {
+                    modelId: e.target.value,
+                  })
+                }
+                className="text-sm"
+              />
+            </div>
+          )}
 
           {/* Request URL Preview */}
           {(() => {