From 322358f47c36320f3243849ef3c66e53a28e143a Mon Sep 17 00:00:00 2001 From: yangshen Date: Thu, 30 Apr 2026 08:48:09 +0800 Subject: [PATCH 1/9] feat: add lemonade to provider --- .env.example | 13 +++ app/api/generate/image/route.ts | 9 +- app/api/transcription/route.ts | 6 +- app/api/verify-image-provider/route.ts | 5 +- components/settings/asr-settings.tsx | 8 +- components/settings/audio-settings.tsx | 6 +- components/settings/image-settings.tsx | 3 +- components/settings/index.tsx | 4 + components/settings/model-selector.tsx | 2 +- components/settings/tts-settings.tsx | 4 +- lib/ai/providers.ts | 28 +++++- lib/audio/asr-providers.ts | 81 ++++++++++++++++++ lib/audio/constants.ts | 31 +++++++ lib/audio/tts-providers.ts | 42 +++++++++ lib/audio/types.ts | 7 +- lib/audio/voice-resolver.ts | 5 +- lib/audio/wav-utils.ts | 84 ++++++++++++++++++ lib/hooks/use-audio-recorder.ts | 6 +- lib/i18n/locales/ar-SA.json | 4 + lib/i18n/locales/en-US.json | 4 + lib/i18n/locales/ja-JP.json | 4 + lib/i18n/locales/ru-RU.json | 4 + lib/i18n/locales/zh-CN.json | 4 + lib/media/adapters/lemonade-image-adapter.ts | 90 ++++++++++++++++++++ lib/media/image-providers.ts | 18 ++++ lib/media/types.ts | 3 +- lib/server/classroom-media-generation.ts | 14 ++- lib/server/provider-config.ts | 16 +++- lib/store/settings.ts | 12 ++- lib/types/provider.ts | 1 + public/logos/lemonade.svg | 53 ++++++++++++ 31 files changed, 538 insertions(+), 33 deletions(-) create mode 100644 lib/audio/wav-utils.ts create mode 100644 lib/media/adapters/lemonade-image-adapter.ts create mode 100644 public/logos/lemonade.svg diff --git a/.env.example b/.env.example index 968671ec0..882ea1dda 100644 --- a/.env.example +++ b/.env.example @@ -79,6 +79,10 @@ XIAOMI_MODELS= # OLLAMA_BASE_URL=http://localhost:11434/v1 # OLLAMA_MODELS=llama3.3,llama3.2,qwen2.5,mistral,gemma3 +# Lemonade local server (OpenAI-compatible, no API key required) +# LEMONADE_BASE_URL=http://localhost:13305/v1 +# LEMONADE_MODELS=Qwen3-0.6B-GGUF,Llama-3.2-1B-Instruct-Hybrid,Qwen2.5-VL-7B-Instruct + # --- TTS (Text-to-Speech) ---------------------------------------------------- TTS_OPENAI_API_KEY= @@ -99,6 +103,9 @@ TTS_MINIMAX_BASE_URL=https://api.minimaxi.com TTS_ELEVENLABS_API_KEY= TTS_ELEVENLABS_BASE_URL= +# Lemonade TTS (local, no API key required) +# TTS_LEMONADE_BASE_URL=http://localhost:13305/v1 + # --- ASR (Automatic Speech Recognition) -------------------------------------- ASR_OPENAI_API_KEY= @@ -107,6 +114,9 @@ ASR_OPENAI_BASE_URL= ASR_QWEN_API_KEY= ASR_QWEN_BASE_URL= +# Lemonade ASR (local, WAV input only, no API key required) +# ASR_LEMONADE_BASE_URL=http://localhost:13305/v1 + # --- PDF Processing ----------------------------------------------------------- PDF_UNPDF_API_KEY= @@ -136,6 +146,9 @@ IMAGE_MINIMAX_BASE_URL=https://api.minimaxi.com IMAGE_GROK_API_KEY= IMAGE_GROK_BASE_URL= +# Lemonade image generation (local, no API key required) +# IMAGE_LEMONADE_BASE_URL=http://localhost:13305/v1 + # --- Video Generation --------------------------------------------------------- VIDEO_SEEDANCE_API_KEY= diff --git a/app/api/generate/image/route.ts b/app/api/generate/image/route.ts index efcbbb5c0..8629406ac 100644 --- a/app/api/generate/image/route.ts +++ b/app/api/generate/image/route.ts @@ -16,7 +16,11 @@ */ import { NextRequest } from 'next/server'; -import { generateImage, aspectRatioToDimensions } from '@/lib/media/image-providers'; +import { + generateImage, + aspectRatioToDimensions, + IMAGE_PROVIDERS, +} from '@/lib/media/image-providers'; import { resolveImageApiKey, resolveImageBaseUrl } from '@/lib/server/provider-config'; import type { ImageProviderId, ImageGenerationOptions } from '@/lib/media/types'; import { createLogger } from '@/lib/logger'; @@ -50,7 +54,8 @@ export async function POST(request: NextRequest) { const apiKey = clientBaseUrl ? clientApiKey || '' : resolveImageApiKey(providerId, clientApiKey); - if (!apiKey) { + const provider = IMAGE_PROVIDERS[providerId]; + if (provider?.requiresApiKey && !apiKey) { return apiError( 'MISSING_API_KEY', 401, diff --git a/app/api/transcription/route.ts b/app/api/transcription/route.ts index 708384b63..c1837fae7 100644 --- a/app/api/transcription/route.ts +++ b/app/api/transcription/route.ts @@ -50,12 +50,8 @@ export async function POST(req: NextRequest) { : resolveASRBaseUrl(effectiveProviderId, baseUrl || undefined), }; - // Convert audio file to buffer - const arrayBuffer = await audioFile.arrayBuffer(); - const buffer = Buffer.from(arrayBuffer); - // Transcribe using the provider system - const result = await transcribeAudio(config, buffer); + const result = await transcribeAudio(config, audioFile); return apiSuccess({ text: result.text }); } catch (error) { diff --git a/app/api/verify-image-provider/route.ts b/app/api/verify-image-provider/route.ts index bea247415..4ad581b4b 100644 --- a/app/api/verify-image-provider/route.ts +++ b/app/api/verify-image-provider/route.ts @@ -15,7 +15,7 @@ */ import { NextRequest } from 'next/server'; -import { testImageConnectivity } from '@/lib/media/image-providers'; +import { IMAGE_PROVIDERS, testImageConnectivity } from '@/lib/media/image-providers'; import { resolveImageApiKey, resolveImageBaseUrl } from '@/lib/server/provider-config'; import type { ImageProviderId } from '@/lib/media/types'; import { apiError, apiSuccess } from '@/lib/server/api-response'; @@ -43,7 +43,8 @@ export async function POST(request: NextRequest) { : resolveImageApiKey(providerId, clientApiKey); const baseUrl = clientBaseUrl ? clientBaseUrl : resolveImageBaseUrl(providerId, clientBaseUrl); - if (!apiKey) { + const provider = IMAGE_PROVIDERS[providerId]; + if (provider?.requiresApiKey && !apiKey) { return apiError('MISSING_API_KEY', 400, 'No API key configured'); } diff --git a/components/settings/asr-settings.tsx b/components/settings/asr-settings.tsx index 67f67bfbe..2acae27f0 100644 --- a/components/settings/asr-settings.tsx +++ b/components/settings/asr-settings.tsx @@ -30,6 +30,7 @@ import { Mic, MicOff, CheckCircle2, XCircle, Eye, EyeOff, Plus, Loader2 } from ' import { cn } from '@/lib/utils'; import { toast } from 'sonner'; import { createLogger } from '@/lib/logger'; +import { normalizeASRUploadAudio } from '@/lib/audio/wav-utils'; const log = createLogger('ASRSettings'); @@ -52,6 +53,7 @@ export function ASRSettings({ selectedProviderId }: ASRSettingsProps) { const requiresApiKey = isCustom ? !!providerConfig?.requiresApiKey : !!asrProvider?.requiresApiKey; + const isKeylessLocalProvider = !isCustom && !requiresApiKey && !!asrProvider?.defaultBaseUrl; const [showApiKey, setShowApiKey] = useState(false); const [showDeleteConfirm, setShowDeleteConfirm] = useState(false); @@ -129,8 +131,9 @@ export function ASRSettings({ selectedProviderId }: ASRSettingsProps) { stream.getTracks().forEach((track) => track.stop()); setIsProcessing(true); const audioBlob = new Blob(audioChunks, { type: 'audio/webm' }); + const uploadAudio = await normalizeASRUploadAudio(selectedProviderId, audioBlob); const formData = new FormData(); - formData.append('audio', audioBlob, 'recording.webm'); + formData.append('audio', uploadAudio.blob, uploadAudio.fileName); formData.append('providerId', selectedProviderId); formData.append( 'modelId', @@ -207,7 +210,7 @@ export function ASRSettings({ selectedProviderId }: ASRSettingsProps) { )} {/* API Key & Base URL */} - {(requiresApiKey || isServerConfigured || isCustom) && ( + {(requiresApiKey || isServerConfigured || isCustom || isKeylessLocalProvider) && ( <>
@@ -276,6 +279,7 @@ export function ASRSettings({ selectedProviderId }: ASRSettingsProps) { } else { switch (selectedProviderId) { case 'openai-whisper': + case 'lemonade-asr': endpointPath = '/audio/transcriptions'; break; case 'qwen-asr': diff --git a/components/settings/audio-settings.tsx b/components/settings/audio-settings.tsx index d0e351ea8..9971b6e7d 100644 --- a/components/settings/audio-settings.tsx +++ b/components/settings/audio-settings.tsx @@ -28,6 +28,7 @@ import azureVoicesData from '@/lib/audio/azure.json'; import { createLogger } from '@/lib/logger'; import { getVoxCPMVoiceOptions, useVoxCPMVoiceProfiles } from '@/lib/audio/voxcpm-voices'; import { normalizeVoxCPMBackend, voxCPMBackendSupportsReferenceAudio } from '@/lib/audio/voxcpm'; +import { normalizeASRUploadAudio } from '@/lib/audio/wav-utils'; const log = createLogger('AudioSettings'); @@ -44,6 +45,7 @@ function getTTSProviderName(providerId: TTSProviderId, t: (key: string) => strin 'doubao-tts': t('settings.providerDoubaoTTS'), 'elevenlabs-tts': t('settings.providerElevenLabsTTS'), 'minimax-tts': t('settings.providerMiniMaxTTS'), + 'lemonade-tts': t('settings.providerLemonadeTTS'), 'browser-native-tts': t('settings.providerBrowserNativeTTS'), }; return names[providerId]; @@ -54,6 +56,7 @@ function getASRProviderName(providerId: ASRProviderId, t: (key: string) => strin 'openai-whisper': t('settings.providerOpenAIWhisper'), 'browser-native': t('settings.providerBrowserNative'), 'qwen-asr': t('settings.providerQwenASR'), + 'lemonade-asr': t('settings.providerLemonadeASR'), }; return names[providerId]; } @@ -330,8 +333,9 @@ export function AudioSettings({ onSave }: AudioSettingsProps = {}) { stream.getTracks().forEach((track) => track.stop()); const audioBlob = new Blob(audioChunks, { type: 'audio/webm' }); + const uploadAudio = await normalizeASRUploadAudio(asrProviderId, audioBlob); const formData = new FormData(); - formData.append('audio', audioBlob, 'recording.webm'); + formData.append('audio', uploadAudio.blob, uploadAudio.fileName); formData.append('providerId', asrProviderId); formData.append('language', asrLanguage); diff --git a/components/settings/image-settings.tsx b/components/settings/image-settings.tsx index 0931cb2b1..319344959 100644 --- a/components/settings/image-settings.tsx +++ b/components/settings/image-settings.tsx @@ -60,6 +60,7 @@ export function ImageSettings({ selectedProviderId }: ImageSettingsProps) { [currentConfig?.customModels], ); const isServerConfigured = !!currentConfig?.isServerConfigured; + const requiresApiKey = currentProvider?.requiresApiKey ?? true; const handleApiKeyChange = (apiKey: string) => { setImageProviderConfig(selectedProviderId, { apiKey }); @@ -179,7 +180,7 @@ export function ImageSettings({ selectedProviderId }: ImageSettingsProps) { variant="outline" size="sm" onClick={handleTest} - disabled={testLoading || (!currentConfig?.apiKey && !isServerConfigured)} + disabled={testLoading || (requiresApiKey && !currentConfig?.apiKey && !isServerConfigured)} className="gap-1.5" > {testLoading ? ( diff --git a/components/settings/index.tsx b/components/settings/index.tsx index 94d65961e..e3ea81fbe 100644 --- a/components/settings/index.tsx +++ b/components/settings/index.tsx @@ -145,6 +145,7 @@ function getTTSProviderName(providerId: TTSProviderId, t: (key: string) => strin 'doubao-tts': t('settings.providerDoubaoTTS'), 'elevenlabs-tts': t('settings.providerElevenLabsTTS'), 'minimax-tts': t('settings.providerMiniMaxTTS'), + 'lemonade-tts': t('settings.providerLemonadeTTS'), 'browser-native-tts': t('settings.providerBrowserNativeTTS'), }; return names[providerId] || providerId; @@ -159,6 +160,7 @@ function getASRProviderName(providerId: ASRProviderId, t: (key: string) => strin 'openai-whisper': t('settings.providerOpenAIWhisper'), 'browser-native': t('settings.providerBrowserNative'), 'qwen-asr': t('settings.providerQwenASR'), + 'lemonade-asr': t('settings.providerLemonadeASR'), }; return names[providerId] || providerId; } @@ -171,6 +173,7 @@ const IMAGE_PROVIDER_NAMES: Record = { 'nano-banana': 'providerNanoBanana', 'minimax-image': 'providerMiniMaxImage', 'grok-image': 'providerGrokImage', + lemonade: 'providerLemonadeImage', }; const IMAGE_PROVIDER_ICONS: Record = { @@ -180,6 +183,7 @@ const IMAGE_PROVIDER_ICONS: Record = { 'nano-banana': '/logos/gemini.svg', 'minimax-image': '/logos/minimax.svg', 'grok-image': '/logos/grok.svg', + lemonade: '/logos/lemonade.svg', }; const VIDEO_PROVIDER_NAMES: Record = { diff --git a/components/settings/model-selector.tsx b/components/settings/model-selector.tsx index 57df7f58b..ef0ba443a 100644 --- a/components/settings/model-selector.tsx +++ b/components/settings/model-selector.tsx @@ -75,7 +75,7 @@ export function ModelSelector({ ([, config]) => (config.requiresApiKey ? config.apiKey || config.isServerConfigured - : config.isServerConfigured || config.baseUrl) && + : config.isServerConfigured || config.baseUrl || config.defaultBaseUrl) && config.models.length >= 1 && (config.baseUrl || config.defaultBaseUrl || config.serverBaseUrl), ) diff --git a/components/settings/tts-settings.tsx b/components/settings/tts-settings.tsx index 010358aa0..ac28e2e51 100644 --- a/components/settings/tts-settings.tsx +++ b/components/settings/tts-settings.tsx @@ -93,6 +93,7 @@ export function TTSSettings({ selectedProviderId }: TTSSettingsProps) { const requiresApiKey = isCustom ? !!providerConfig?.requiresApiKey : !!ttsProvider?.requiresApiKey; + const isKeylessLocalProvider = !isCustom && !requiresApiKey && !!ttsProvider?.defaultBaseUrl; // When testing a non-active provider, use that provider's default voice // instead of the active provider's voice (which may be incompatible). @@ -192,6 +193,7 @@ export function TTSSettings({ selectedProviderId }: TTSSettingsProps) { switch (selectedProviderId) { case 'openai-tts': case 'glm-tts': + case 'lemonade-tts': return '/audio/speech'; case 'azure-tts': return '/cognitiveservices/v1'; @@ -225,7 +227,7 @@ export function TTSSettings({ selectedProviderId }: TTSSettingsProps) { )} {/* API Key & Base URL */} - {(requiresApiKey || isServerConfigured || isCustom || isVoxCPM) && + {(requiresApiKey || isServerConfigured || isCustom || isVoxCPM || isKeylessLocalProvider) && (isVoxCPM ? (
diff --git a/lib/ai/providers.ts b/lib/ai/providers.ts index 011089b30..f2b4aa1f8 100644 --- a/lib/ai/providers.ts +++ b/lib/ai/providers.ts @@ -6,7 +6,7 @@ * - Anthropic Claude (native) * - Google Gemini (native) * - MiniMax (Anthropic-compatible, recommended by official) - * - OpenAI-compatible providers (DeepSeek, Qwen, Kimi, GLM, SiliconFlow, Doubao, Tencent, Xiaomi, etc.) + * - OpenAI-compatible providers (DeepSeek, Qwen, Kimi, GLM, SiliconFlow, Doubao, Tencent, Xiaomi, Lemonade, etc.) * * Sources: * - https://platform.openai.com/docs/models @@ -980,6 +980,32 @@ export const PROVIDERS: Record = { }, ], }, + + lemonade: { + id: 'lemonade', + name: 'Lemonade', + type: 'openai', + defaultBaseUrl: 'http://localhost:13305/v1', + requiresApiKey: false, + icon: '/logos/lemonade.svg', + models: [ + { + id: 'Qwen3-0.6B-GGUF', + name: 'Qwen3 0.6B GGUF', + capabilities: { streaming: true, tools: true, vision: false }, + }, + { + id: 'Llama-3.2-1B-Instruct-Hybrid', + name: 'Llama 3.2 1B Instruct Hybrid', + capabilities: { streaming: true, tools: true, vision: false }, + }, + { + id: 'Qwen2.5-VL-7B-Instruct', + name: 'Qwen2.5 VL 7B Instruct', + capabilities: { streaming: true, tools: true, vision: true }, + }, + ], + }, }; applyModelMetadata(PROVIDERS); diff --git a/lib/audio/asr-providers.ts b/lib/audio/asr-providers.ts index 0fec13dc0..dd5a8deb9 100644 --- a/lib/audio/asr-providers.ts +++ b/lib/audio/asr-providers.ts @@ -182,6 +182,9 @@ export async function transcribeAudio( case 'qwen-asr': return await transcribeQwenASR(config, audioBuffer); + case 'lemonade-asr': + return await transcribeLemonadeASR(config, audioBuffer); + default: if (isCustomASRProvider(config.providerId)) { return await transcribeOpenAIWhisper(config, audioBuffer); @@ -190,6 +193,84 @@ export async function transcribeAudio( } } +/** + * Lemonade ASR implementation (OpenAI-compatible multipart transcription). + * + * Lemonade currently supports WAV input and JSON response format. + */ +async function transcribeLemonadeASR( + config: ASRModelConfig, + audioBuffer: Buffer | Blob, +): Promise { + const baseUrl = (config.baseUrl || ASR_PROVIDERS['lemonade-asr'].defaultBaseUrl || '').replace( + /\/$/, + '', + ); + + const audioBlob = await toAudioBlob(audioBuffer); + if (!isWavAudio(audioBlob)) { + throw new Error( + 'Lemonade ASR currently supports WAV input only. Recordings should be converted to WAV before upload.', + ); + } + + const formData = new FormData(); + formData.set('file', audioBlob, 'audio.wav'); + formData.set('model', config.modelId || ASR_PROVIDERS['lemonade-asr'].defaultModelId); + formData.set('response_format', 'json'); + if (config.language && config.language !== 'auto') { + formData.set('language', config.language); + } + + const response = await fetch(`${baseUrl}/audio/transcriptions`, { + method: 'POST', + headers: getOptionalBearerAuthHeaders(config.apiKey), + body: formData, + }); + + if (!response.ok) { + const errorText = await response.text().catch(() => response.statusText); + if (errorText.includes('audio is empty') || errorText.includes('too short')) { + return { text: '' }; + } + throw new Error(`Lemonade ASR API error: ${errorText || response.statusText}`); + } + + const data = await response.json(); + return { text: typeof data.text === 'string' ? data.text : '' }; +} + +async function toAudioBlob(audioBuffer: Buffer | Blob): Promise { + if (audioBuffer instanceof Blob) { + return audioBuffer; + } + if (audioBuffer instanceof Buffer) { + const arrayBuffer = audioBuffer.buffer.slice( + audioBuffer.byteOffset, + audioBuffer.byteOffset + audioBuffer.byteLength, + ) as ArrayBuffer; + return new Blob([arrayBuffer], { type: detectWavBuffer(audioBuffer) ? 'audio/wav' : '' }); + } + throw new Error('Invalid audio buffer type'); +} + +function isWavAudio(blob: Blob): boolean { + return blob.type.includes('audio/wav') || blob.type.includes('audio/x-wav'); +} + +function detectWavBuffer(buffer: Buffer): boolean { + return ( + buffer.byteLength >= 12 && + buffer.toString('ascii', 0, 4) === 'RIFF' && + buffer.toString('ascii', 8, 12) === 'WAVE' + ); +} + +function getOptionalBearerAuthHeaders(apiKey?: string): Record { + const key = apiKey?.trim(); + return key ? { Authorization: `Bearer ${key}` } : {}; +} + /** * OpenAI Whisper implementation (using Vercel AI SDK) */ diff --git a/lib/audio/constants.ts b/lib/audio/constants.ts index 1f5167b01..9515478f8 100644 --- a/lib/audio/constants.ts +++ b/lib/audio/constants.ts @@ -943,6 +943,23 @@ export const TTS_PROVIDERS: Record = { supportedFormats: ['browser'], // Browser native audio speedRange: { min: 0.1, max: 10.0, default: 1.0 }, }, + + 'lemonade-tts': { + id: 'lemonade-tts', + name: 'Lemonade TTS', + requiresApiKey: false, + defaultBaseUrl: 'http://localhost:13305/v1', + icon: '/logos/lemonade.svg', + models: [{ id: 'kokoro', name: 'Kokoro' }], + defaultModelId: 'kokoro', + voices: [ + { id: 'default', name: 'Default', language: 'auto', gender: 'neutral' }, + { id: 'af_bella', name: 'Bella', language: 'en', gender: 'female' }, + { id: 'am_adam', name: 'Adam', language: 'en', gender: 'male' }, + ], + supportedFormats: ['wav'], + speedRange: { min: 0.25, max: 4.0, default: 1.0 }, + }, }; /** @@ -1142,6 +1159,18 @@ export const ASR_PROVIDERS: Record = { ], supportedFormats: ['webm'], // MediaRecorder format }, + + 'lemonade-asr': { + id: 'lemonade-asr', + name: 'Lemonade ASR', + requiresApiKey: false, + defaultBaseUrl: 'http://localhost:13305/v1', + icon: '/logos/lemonade.svg', + models: [{ id: 'whisper', name: 'Whisper' }], + defaultModelId: 'whisper', + supportedLanguages: CUSTOM_ASR_DEFAULT_LANGUAGES, + supportedFormats: ['wav'], + }, }; /** @@ -1157,6 +1186,7 @@ export const DEFAULT_TTS_VOICES: Record = { 'doubao-tts': 'zh_female_vv_uranus_bigtts', 'elevenlabs-tts': 'EXAVITQu4vr4xnSDxMaL', 'minimax-tts': 'female-yujie', + 'lemonade-tts': 'default', 'browser-native-tts': 'default', }; @@ -1169,6 +1199,7 @@ export const DEFAULT_TTS_MODELS: Record = { 'doubao-tts': '', 'elevenlabs-tts': 'eleven_multilingual_v2', 'minimax-tts': 'speech-2.8-hd', + 'lemonade-tts': 'kokoro', 'browser-native-tts': '', }; diff --git a/lib/audio/tts-providers.ts b/lib/audio/tts-providers.ts index 3019e940f..5eb245948 100644 --- a/lib/audio/tts-providers.ts +++ b/lib/audio/tts-providers.ts @@ -164,6 +164,9 @@ export async function generateTTS( case 'elevenlabs-tts': return await generateElevenLabsTTS(config, text); + case 'lemonade-tts': + return await generateLemonadeTTS(config, text); + case 'browser-native-tts': throw new Error( 'Browser Native TTS must be handled client-side using Web Speech API. This provider cannot be used on the server.', @@ -215,6 +218,45 @@ async function generateOpenAITTS( }; } +/** + * Lemonade TTS implementation (OpenAI-compatible /v1/audio/speech). + */ +async function generateLemonadeTTS( + config: TTSModelConfig, + text: string, +): Promise { + const baseUrl = (config.baseUrl || TTS_PROVIDERS['lemonade-tts'].defaultBaseUrl || '').replace( + /\/$/, + '', + ); + + const response = await fetch(`${baseUrl}/audio/speech`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json; charset=utf-8', + ...getBackendAuthHeaders(config.apiKey), + }, + body: JSON.stringify({ + model: config.modelId || TTS_PROVIDERS['lemonade-tts'].defaultModelId, + input: text, + voice: config.voice || 'default', + speed: config.speed || 1.0, + response_format: config.format || 'wav', + }), + }); + + if (!response.ok) { + throw new Error(`Lemonade TTS API error: ${await readTTSApiError(response)}`); + } + + const arrayBuffer = await response.arrayBuffer(); + const contentType = response.headers.get('content-type') || ''; + return { + audio: new Uint8Array(arrayBuffer), + format: getAudioResponseFormat(contentType), + }; +} + /** * VoxCPM2 TTS implementation. * diff --git a/lib/audio/types.ts b/lib/audio/types.ts index 5e4cde562..d57960692 100644 --- a/lib/audio/types.ts +++ b/lib/audio/types.ts @@ -87,6 +87,7 @@ export type BuiltInTTSProviderId = | 'doubao-tts' | 'elevenlabs-tts' | 'minimax-tts' + | 'lemonade-tts' | 'browser-native-tts'; export type TTSProviderId = BuiltInTTSProviderId | `custom-tts-${string}`; @@ -151,7 +152,11 @@ export interface TTSModelConfig { * Add new ASR providers here as union members. * Keep in sync with ASR_PROVIDERS registry in constants.ts */ -export type BuiltInASRProviderId = 'openai-whisper' | 'browser-native' | 'qwen-asr'; +export type BuiltInASRProviderId = + | 'openai-whisper' + | 'browser-native' + | 'qwen-asr' + | 'lemonade-asr'; export type ASRProviderId = BuiltInASRProviderId | `custom-asr-${string}`; diff --git a/lib/audio/voice-resolver.ts b/lib/audio/voice-resolver.ts index ca3c42932..f95975d20 100644 --- a/lib/audio/voice-resolver.ts +++ b/lib/audio/voice-resolver.ts @@ -130,6 +130,9 @@ export function getAvailableProvidersWithVoices( const providerConfig = ttsProvidersConfig[providerId]; const hasApiKey = providerConfig?.apiKey && providerConfig.apiKey.trim().length > 0; const isServerConfigured = providerConfig?.isServerConfigured === true; + const isKeylessLocalProvider = + !config.requiresApiKey && + !!(providerConfig?.serverBaseUrl?.trim() || providerConfig?.baseUrl?.trim() || config.defaultBaseUrl); const isLocalVoxCPM = providerId === VOXCPM_TTS_PROVIDER_ID && !!(providerConfig?.serverBaseUrl?.trim() || providerConfig?.baseUrl?.trim()); @@ -141,7 +144,7 @@ export function getAvailableProvidersWithVoices( }) : []; - if (hasApiKey || isServerConfigured || isLocalVoxCPM) { + if (hasApiKey || isServerConfigured || isLocalVoxCPM || isKeylessLocalProvider) { const allVoices = [ ...config.voices.map((v) => ({ id: v.id, diff --git a/lib/audio/wav-utils.ts b/lib/audio/wav-utils.ts new file mode 100644 index 000000000..1a81a37f9 --- /dev/null +++ b/lib/audio/wav-utils.ts @@ -0,0 +1,84 @@ +'use client'; + +function writeAscii(view: DataView, offset: number, value: string): void { + for (let i = 0; i < value.length; i++) { + view.setUint8(offset + i, value.charCodeAt(i)); + } +} + +function audioBufferToMonoWav(audioBuffer: AudioBuffer): ArrayBuffer { + const sampleRate = audioBuffer.sampleRate; + const sampleCount = audioBuffer.length; + const dataSize = sampleCount * 2; + const buffer = new ArrayBuffer(44 + dataSize); + const view = new DataView(buffer); + + writeAscii(view, 0, 'RIFF'); + view.setUint32(4, 36 + dataSize, true); + writeAscii(view, 8, 'WAVE'); + writeAscii(view, 12, 'fmt '); + view.setUint32(16, 16, true); + view.setUint16(20, 1, true); + view.setUint16(22, 1, true); + view.setUint32(24, sampleRate, true); + view.setUint32(28, sampleRate * 2, true); + view.setUint16(32, 2, true); + view.setUint16(34, 16, true); + writeAscii(view, 36, 'data'); + view.setUint32(40, dataSize, true); + + const channels = Array.from({ length: audioBuffer.numberOfChannels }, (_, index) => + audioBuffer.getChannelData(index), + ); + let offset = 44; + for (let i = 0; i < sampleCount; i++) { + let mixed = 0; + for (const channel of channels) mixed += channel[i]; + const sample = Math.max(-1, Math.min(1, mixed / channels.length)); + view.setInt16(offset, sample < 0 ? sample * 0x8000 : sample * 0x7fff, true); + offset += 2; + } + + return buffer; +} + +export function isWavBlob(blob: Blob, fileName?: string): boolean { + return ( + blob.type.includes('audio/wav') || + blob.type.includes('audio/x-wav') || + /\.wav$/i.test(fileName || '') + ); +} + +export async function audioBlobToWav(blob: Blob): Promise { + if (isWavBlob(blob)) return blob; + if (typeof window === 'undefined') { + throw new Error('Audio conversion requires a browser environment'); + } + + const AudioContextConstructor = + window.AudioContext || + (window as typeof window & { webkitAudioContext?: typeof AudioContext }).webkitAudioContext; + if (!AudioContextConstructor) { + throw new Error('This browser does not support audio conversion'); + } + + const audioContext = new AudioContextConstructor(); + try { + const arrayBuffer = await blob.arrayBuffer(); + const audioBuffer = await audioContext.decodeAudioData(arrayBuffer.slice(0)); + return new Blob([audioBufferToMonoWav(audioBuffer)], { type: 'audio/wav' }); + } finally { + await audioContext.close().catch(() => undefined); + } +} + +export async function normalizeASRUploadAudio( + providerId: string, + audioBlob: Blob, +): Promise<{ blob: Blob; fileName: string }> { + if (providerId !== 'lemonade-asr') { + return { blob: audioBlob, fileName: 'recording.webm' }; + } + return { blob: await audioBlobToWav(audioBlob), fileName: 'recording.wav' }; +} diff --git a/lib/hooks/use-audio-recorder.ts b/lib/hooks/use-audio-recorder.ts index 327985bb2..a19286e34 100644 --- a/lib/hooks/use-audio-recorder.ts +++ b/lib/hooks/use-audio-recorder.ts @@ -1,5 +1,6 @@ import { useState, useRef, useCallback } from 'react'; import { ASR_PROVIDERS } from '@/lib/audio/constants'; +import { normalizeASRUploadAudio } from '@/lib/audio/wav-utils'; import { createLogger } from '@/lib/logger'; const log = createLogger('AudioRecorder'); @@ -41,13 +42,14 @@ export function useAudioRecorder(options: UseAudioRecorderOptions = {}) { try { const formData = new FormData(); - formData.append('audio', audioBlob, 'recording.webm'); // Get current ASR configuration from settings store // Note: This requires importing useSettingsStore in browser context if (typeof window !== 'undefined') { const { useSettingsStore } = await import('@/lib/store/settings'); const { asrProviderId, asrLanguage, asrProvidersConfig } = useSettingsStore.getState(); + const uploadAudio = await normalizeASRUploadAudio(asrProviderId, audioBlob); + formData.append('audio', uploadAudio.blob, uploadAudio.fileName); formData.append('providerId', asrProviderId); formData.append( @@ -68,6 +70,8 @@ export function useAudioRecorder(options: UseAudioRecorderOptions = {}) { if (effectiveBaseUrl) { formData.append('baseUrl', effectiveBaseUrl); } + } else { + formData.append('audio', audioBlob, 'recording.webm'); } const response = await fetch('/api/transcription', { diff --git a/lib/i18n/locales/ar-SA.json b/lib/i18n/locales/ar-SA.json index de952a41b..19c9becc2 100644 --- a/lib/i18n/locales/ar-SA.json +++ b/lib/i18n/locales/ar-SA.json @@ -454,6 +454,7 @@ "grok": "Grok", "tencent-hunyuan": "Tencent Hunyuan", "xiaomi": "Xiaomi MiMo", + "lemonade": "Lemonade (محلي)", "ollama": "Ollama (محلي)" }, "providerTypes": { @@ -601,6 +602,7 @@ "providerDoubaoTTS": "Doubao TTS 2.0 (فولكينجين)", "providerElevenLabsTTS": "ElevenLabs TTS", "providerMiniMaxTTS": "MiniMax TTS", + "providerLemonadeTTS": "Lemonade TTS (محلي)", "providerBrowserNativeTTS": "تحويل النص إلى كلام المدمج في المتصفح", "voxcpmBackend": "الخلفية", "voxcpmBaseUrlPending": "أدخل Base URL لإنشاء عنوان الطلب", @@ -647,6 +649,7 @@ "providerOpenAIWhisper": "OpenAI ASR (gpt-4o-mini-transcribe)", "providerBrowserNative": "التعرّف على الكلام المدمج في المتصفح", "providerQwenASR": "Qwen ASR (سحابة علي بابا بايليان)", + "providerLemonadeASR": "Lemonade ASR (محلي)", "providerUnpdf": "unpdf (مُدمج)", "providerMinerU": "MinerU", "providerMinerUCloud": "MinerU (السحابي)", @@ -898,6 +901,7 @@ "providerNanoBanana": "Nano Banana (Gemini)", "providerMiniMaxImage": "MiniMax Image", "providerGrokImage": "Grok Image (xAI)", + "providerLemonadeImage": "Lemonade Image (محلي)", "testImageGeneration": "اختبار توليد الصور", "testImageConnectivity": "اختبار الاتصال", "imageConnectivitySuccess": "تم الاتصال بخدمة الصور بنجاح", diff --git a/lib/i18n/locales/en-US.json b/lib/i18n/locales/en-US.json index 46535f443..50b37fb85 100644 --- a/lib/i18n/locales/en-US.json +++ b/lib/i18n/locales/en-US.json @@ -454,6 +454,7 @@ "grok": "Grok", "tencent-hunyuan": "Tencent Hunyuan", "xiaomi": "Xiaomi MiMo", + "lemonade": "Lemonade (Local)", "ollama": "Ollama (Local)" }, "providerTypes": { @@ -601,6 +602,7 @@ "providerDoubaoTTS": "Doubao TTS 2.0 (Volcengine)", "providerElevenLabsTTS": "ElevenLabs TTS", "providerMiniMaxTTS": "MiniMax TTS", + "providerLemonadeTTS": "Lemonade TTS (Local)", "providerBrowserNativeTTS": "Browser Native TTS", "voxcpmBackend": "Backend", "voxcpmBaseUrlPending": "Enter a Base URL to generate the request URL", @@ -647,6 +649,7 @@ "providerOpenAIWhisper": "OpenAI ASR (gpt-4o-mini-transcribe)", "providerBrowserNative": "Browser Native ASR", "providerQwenASR": "Qwen ASR (Alibaba Cloud Bailian)", + "providerLemonadeASR": "Lemonade ASR (Local)", "providerUnpdf": "unpdf (Built-in)", "providerMinerU": "MinerU", "providerMinerUCloud": "MinerU (Cloud)", @@ -898,6 +901,7 @@ "providerNanoBanana": "Nano Banana (Gemini)", "providerMiniMaxImage": "MiniMax Image", "providerGrokImage": "Grok Image (xAI)", + "providerLemonadeImage": "Lemonade Image (Local)", "testImageGeneration": "Test Image Generation", "testImageConnectivity": "Test Connection", "imageConnectivitySuccess": "Image service connected successfully", diff --git a/lib/i18n/locales/ja-JP.json b/lib/i18n/locales/ja-JP.json index 1f70154d7..6a47a8ccf 100644 --- a/lib/i18n/locales/ja-JP.json +++ b/lib/i18n/locales/ja-JP.json @@ -454,6 +454,7 @@ "grok": "Grok", "tencent-hunyuan": "Tencent Hunyuan", "xiaomi": "Xiaomi MiMo", + "lemonade": "Lemonade(ローカル)", "ollama": "Ollama(ローカルモデル)" }, "providerTypes": { @@ -601,6 +602,7 @@ "providerDoubaoTTS": "Doubao TTS 2.0(火山エンジン)", "providerElevenLabsTTS": "ElevenLabs TTS", "providerMiniMaxTTS": "MiniMax TTS", + "providerLemonadeTTS": "Lemonade TTS(ローカル)", "providerBrowserNativeTTS": "ブラウザネイティブTTS", "voxcpmBackend": "バックエンド", "voxcpmBaseUrlPending": "Base URL を入力するとリクエスト URL が生成されます", @@ -647,6 +649,7 @@ "providerOpenAIWhisper": "OpenAI ASR (gpt-4o-mini-transcribe)", "providerBrowserNative": "ブラウザネイティブASR", "providerQwenASR": "Qwen ASR(Alibaba Cloud百錬)", + "providerLemonadeASR": "Lemonade ASR(ローカル)", "providerUnpdf": "unpdf(組み込み)", "providerMinerU": "MinerU", "providerMinerUCloud": "MinerU(クラウド)", @@ -898,6 +901,7 @@ "providerNanoBanana": "Nano Banana(Gemini)", "providerMiniMaxImage": "MiniMax Image", "providerGrokImage": "Grok Image(xAI)", + "providerLemonadeImage": "Lemonade Image(ローカル)", "testImageGeneration": "画像生成をテスト", "testImageConnectivity": "接続テスト", "imageConnectivitySuccess": "画像サービスへの接続に成功しました", diff --git a/lib/i18n/locales/ru-RU.json b/lib/i18n/locales/ru-RU.json index e8cc05b14..06b5bd943 100644 --- a/lib/i18n/locales/ru-RU.json +++ b/lib/i18n/locales/ru-RU.json @@ -454,6 +454,7 @@ "grok": "Grok", "tencent-hunyuan": "Tencent Hunyuan", "xiaomi": "Xiaomi MiMo", + "lemonade": "Lemonade (Локальный)", "ollama": "Ollama (Локальный)" }, "providerTypes": { @@ -601,6 +602,7 @@ "providerDoubaoTTS": "Doubao TTS 2.0 (Volcengine)", "providerElevenLabsTTS": "ElevenLabs TTS", "providerMiniMaxTTS": "MiniMax TTS", + "providerLemonadeTTS": "Lemonade TTS (Локальный)", "providerBrowserNativeTTS": "Встроенный TTS браузера", "voxcpmBackend": "Бэкенд", "voxcpmBaseUrlPending": "Введите Base URL, чтобы сформировать URL запроса", @@ -647,6 +649,7 @@ "providerOpenAIWhisper": "OpenAI ASR (gpt-4o-mini-transcribe)", "providerBrowserNative": "Встроенный ASR браузера", "providerQwenASR": "Qwen ASR (Alibaba Cloud Bailian)", + "providerLemonadeASR": "Lemonade ASR (Локальный)", "providerUnpdf": "unpdf (встроенный)", "providerMinerU": "MinerU", "providerMinerUCloud": "MinerU (Облако)", @@ -898,6 +901,7 @@ "providerNanoBanana": "Nano Banana (Gemini)", "providerMiniMaxImage": "MiniMax Image", "providerGrokImage": "Grok Image (xAI)", + "providerLemonadeImage": "Lemonade Image (Локальный)", "testImageGeneration": "Тест генерации изображений", "testImageConnectivity": "Тест подключения", "imageConnectivitySuccess": "Подключение к сервису изображений успешно", diff --git a/lib/i18n/locales/zh-CN.json b/lib/i18n/locales/zh-CN.json index da4482fc3..98cb750f7 100644 --- a/lib/i18n/locales/zh-CN.json +++ b/lib/i18n/locales/zh-CN.json @@ -454,6 +454,7 @@ "grok": "Grok", "tencent-hunyuan": "腾讯混元", "xiaomi": "小米 MiMo", + "lemonade": "Lemonade(本地)", "ollama": "Ollama(本地模型)" }, "providerTypes": { @@ -601,6 +602,7 @@ "providerDoubaoTTS": "豆包 TTS 2.0(火山引擎)", "providerElevenLabsTTS": "ElevenLabs TTS", "providerMiniMaxTTS": "MiniMax TTS", + "providerLemonadeTTS": "Lemonade TTS(本地)", "providerBrowserNativeTTS": "浏览器原生 TTS", "voxcpmBackend": "Backend", "voxcpmBaseUrlPending": "填写 Base URL 后生成", @@ -647,6 +649,7 @@ "providerOpenAIWhisper": "OpenAI ASR (gpt-4o-mini-transcribe)", "providerBrowserNative": "浏览器原生 ASR", "providerQwenASR": "Qwen ASR(阿里云百炼)", + "providerLemonadeASR": "Lemonade ASR(本地)", "providerUnpdf": "unpdf(内置)", "providerMinerU": "MinerU", "providerMinerUCloud": "MinerU(云端)", @@ -898,6 +901,7 @@ "providerNanoBanana": "Nano Banana(Gemini)", "providerMiniMaxImage": "MiniMax 图像", "providerGrokImage": "Grok Image(xAI)", + "providerLemonadeImage": "Lemonade 图像(本地)", "testImageGeneration": "测试图像生成", "testImageConnectivity": "测试连接", "imageConnectivitySuccess": "图像服务连接成功", diff --git a/lib/media/adapters/lemonade-image-adapter.ts b/lib/media/adapters/lemonade-image-adapter.ts new file mode 100644 index 000000000..75e809f02 --- /dev/null +++ b/lib/media/adapters/lemonade-image-adapter.ts @@ -0,0 +1,90 @@ +/** + * Lemonade Image Generation Adapter + * + * Lemonade exposes OpenAI-compatible image generation at /v1/images/generations. + */ + +import type { + ImageGenerationConfig, + ImageGenerationOptions, + ImageGenerationResult, +} from '../types'; + +const DEFAULT_MODEL = 'sd-cpp'; +const DEFAULT_BASE_URL = 'http://localhost:13305/v1'; + +function normalizeBaseUrl(baseUrl?: string): string { + return (baseUrl || DEFAULT_BASE_URL).replace(/\/$/, ''); +} + +function authHeaders(apiKey?: string): Record { + const key = apiKey?.trim(); + return key ? { Authorization: `Bearer ${key}` } : {}; +} + +function resolveSize(options: ImageGenerationOptions): string { + return `${options.width || 1024}x${options.height || 1024}`; +} + +export async function testLemonadeImageConnectivity( + config: ImageGenerationConfig, +): Promise<{ success: boolean; message: string }> { + const baseUrl = normalizeBaseUrl(config.baseUrl); + + try { + const response = await fetch(`${baseUrl}/models`, { + headers: authHeaders(config.apiKey), + }); + + if (response.ok) { + return { success: true, message: 'Connected to Lemonade image generation' }; + } + + const text = await response.text().catch(() => response.statusText); + return { success: false, message: `Lemonade API error (${response.status}): ${text}` }; + } catch (err) { + return { success: false, message: `Lemonade connectivity error: ${err}` }; + } +} + +export async function generateWithLemonadeImage( + config: ImageGenerationConfig, + options: ImageGenerationOptions, +): Promise { + const baseUrl = normalizeBaseUrl(config.baseUrl); + const width = options.width || 1024; + const height = options.height || 1024; + + const response = await fetch(`${baseUrl}/images/generations`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + ...authHeaders(config.apiKey), + }, + body: JSON.stringify({ + model: config.model || DEFAULT_MODEL, + prompt: options.prompt, + n: 1, + size: resolveSize(options), + response_format: 'b64_json', + }), + }); + + if (!response.ok) { + const text = await response.text().catch(() => response.statusText); + throw new Error(`Lemonade image generation failed (${response.status}): ${text}`); + } + + const data = await response.json(); + const imageData = data.data?.[0]; + if (!imageData?.url && !imageData?.b64_json) { + throw new Error('Lemonade returned empty image response'); + } + + return { + url: imageData.url, + base64: imageData.b64_json, + width, + height, + }; +} diff --git a/lib/media/image-providers.ts b/lib/media/image-providers.ts index f2102df27..a2b73b1c6 100644 --- a/lib/media/image-providers.ts +++ b/lib/media/image-providers.ts @@ -21,6 +21,10 @@ import { testMiniMaxImageConnectivity, } from './adapters/minimax-image-adapter'; import { generateWithGrokImage, testGrokImageConnectivity } from './adapters/grok-image-adapter'; +import { + generateWithLemonadeImage, + testLemonadeImageConnectivity, +} from './adapters/lemonade-image-adapter'; export const IMAGE_PROVIDERS: Record = { seedream: { @@ -116,6 +120,16 @@ export const IMAGE_PROVIDERS: Record = { ], supportedAspectRatios: ['16:9', '4:3', '1:1', '9:16'], }, + lemonade: { + id: 'lemonade', + name: 'Lemonade', + requiresApiKey: false, + defaultBaseUrl: 'http://localhost:13305/v1', + icon: '/logos/lemonade.svg', + models: [{ id: 'sd-cpp', name: 'Stable Diffusion (sd-cpp)' }], + supportedAspectRatios: ['16:9', '4:3', '1:1', '9:16'], + maxResolution: { width: 1024, height: 1024 }, + }, }; export async function testImageConnectivity( @@ -134,6 +148,8 @@ export async function testImageConnectivity( return testMiniMaxImageConnectivity(config); case 'grok-image': return testGrokImageConnectivity(config); + case 'lemonade': + return testLemonadeImageConnectivity(config); default: return { success: false, @@ -159,6 +175,8 @@ export async function generateImage( return generateWithMiniMaxImage(config, options); case 'grok-image': return generateWithGrokImage(config, options); + case 'lemonade': + return generateWithLemonadeImage(config, options); default: throw new Error(`Unsupported image provider: ${config.providerId}`); } diff --git a/lib/media/types.ts b/lib/media/types.ts index 8a2f8e5f2..47c80ac93 100644 --- a/lib/media/types.ts +++ b/lib/media/types.ts @@ -76,7 +76,8 @@ export type ImageProviderId = | 'qwen-image' | 'nano-banana' | 'minimax-image' - | 'grok-image'; + | 'grok-image' + | 'lemonade'; // Add new image providers below (uncomment and modify): // | 'dall-e' // | 'midjourney' diff --git a/lib/server/classroom-media-generation.ts b/lib/server/classroom-media-generation.ts index bd307d150..7848c9af2 100644 --- a/lib/server/classroom-media-generation.ts +++ b/lib/server/classroom-media-generation.ts @@ -95,11 +95,11 @@ export async function generateMediaForClassroom( try { const providerId = imageProviderIds[0] as ImageProviderId; const apiKey = resolveImageApiKey(providerId); - if (!apiKey) { + const providerConfig = IMAGE_PROVIDERS[providerId]; + if (providerConfig?.requiresApiKey && !apiKey) { log.warn(`No API key for image provider "${providerId}", skipping ${req.elementId}`); continue; } - const providerConfig = IMAGE_PROVIDERS[providerId]; const model = providerConfig?.models?.[0]?.id; const result = await generateImage( @@ -221,16 +221,14 @@ export async function generateTTSForClassroom( const providerId = ttsProviderIds[0] as TTSProviderId; const apiKey = resolveTTSApiKey(providerId); - if (!apiKey) { + const ttsProvider = TTS_PROVIDERS[providerId as keyof typeof TTS_PROVIDERS]; + if (ttsProvider?.requiresApiKey && !apiKey) { log.warn(`No API key for TTS provider "${providerId}", skipping TTS generation`); return; } - const ttsBaseUrl = - resolveTTSBaseUrl(providerId) || - TTS_PROVIDERS[providerId as keyof typeof TTS_PROVIDERS]?.defaultBaseUrl; + const ttsBaseUrl = resolveTTSBaseUrl(providerId) || ttsProvider?.defaultBaseUrl; const voice = DEFAULT_TTS_VOICES[providerId as keyof typeof DEFAULT_TTS_VOICES] || 'default'; - const format = - TTS_PROVIDERS[providerId as keyof typeof TTS_PROVIDERS]?.supportedFormats?.[0] || 'mp3'; + const format = ttsProvider?.supportedFormats?.[0] || 'mp3'; if (providerId === VOXCPM_TTS_PROVIDER_ID && voice === VOXCPM_AUTO_VOICE_ID) { log.warn('VoxCPM Auto Voice requires agent context; skipping server-side TTS generation'); return; diff --git a/lib/server/provider-config.ts b/lib/server/provider-config.ts index 8b5cf6bc5..870319bff 100644 --- a/lib/server/provider-config.ts +++ b/lib/server/provider-config.ts @@ -55,6 +55,7 @@ const LLM_ENV_MAP: Record = { XIAOMI: 'xiaomi', MIMO: 'xiaomi', OLLAMA: 'ollama', + LEMONADE: 'lemonade', }; const TTS_ENV_MAP: Record = { @@ -66,11 +67,13 @@ const TTS_ENV_MAP: Record = { TTS_DOUBAO: 'doubao-tts', TTS_ELEVENLABS: 'elevenlabs-tts', TTS_MINIMAX: 'minimax-tts', + TTS_LEMONADE: 'lemonade-tts', }; const ASR_ENV_MAP: Record = { ASR_OPENAI: 'openai-whisper', ASR_QWEN: 'qwen-asr', + ASR_LEMONADE: 'lemonade-asr', }; const PDF_ENV_MAP: Record = { @@ -86,6 +89,7 @@ const IMAGE_ENV_MAP: Record = { IMAGE_NANO_BANANA: 'nano-banana', IMAGE_MINIMAX: 'minimax-image', IMAGE_GROK: 'grok-image', + IMAGE_LEMONADE: 'lemonade', }; const VIDEO_ENV_MAP: Record = { @@ -210,14 +214,18 @@ const _configs: Map = new Map(); function buildConfig(yamlData: YamlData): ServerConfig { return { providers: loadEnvSection(LLM_ENV_MAP, yamlData.providers, { - keylessProviders: new Set(['ollama']), + keylessProviders: new Set(['ollama', 'lemonade']), }), tts: loadEnvSection(TTS_ENV_MAP, yamlData.tts, { - keylessProviders: new Set(['voxcpm-tts']), + keylessProviders: new Set(['voxcpm-tts', 'lemonade-tts']), + }), + asr: loadEnvSection(ASR_ENV_MAP, yamlData.asr, { + keylessProviders: new Set(['lemonade-asr']), }), - asr: loadEnvSection(ASR_ENV_MAP, yamlData.asr), pdf: loadEnvSection(PDF_ENV_MAP, yamlData.pdf, { requiresBaseUrl: true }), - image: loadEnvSection(IMAGE_ENV_MAP, yamlData.image), + image: loadEnvSection(IMAGE_ENV_MAP, yamlData.image, { + keylessProviders: new Set(['lemonade']), + }), video: loadEnvSection(VIDEO_ENV_MAP, yamlData.video), webSearch: loadEnvSection(WEB_SEARCH_ENV_MAP, yamlData['web-search']), }; diff --git a/lib/store/settings.ts b/lib/store/settings.ts index a879546a9..afc9c370d 100644 --- a/lib/store/settings.ts +++ b/lib/store/settings.ts @@ -356,6 +356,7 @@ const getDefaultAudioConfig = () => ({ 'doubao-tts': { apiKey: '', baseUrl: '', enabled: false }, 'elevenlabs-tts': { apiKey: '', baseUrl: '', enabled: false }, 'minimax-tts': { apiKey: '', baseUrl: '', modelId: 'speech-2.8-hd', enabled: false }, + 'lemonade-tts': { apiKey: '', baseUrl: '', modelId: 'kokoro', enabled: false }, 'browser-native-tts': { apiKey: '', baseUrl: '', enabled: true }, } as Record< TTSProviderId, @@ -365,6 +366,7 @@ const getDefaultAudioConfig = () => ({ 'openai-whisper': { apiKey: '', baseUrl: '', enabled: true }, 'browser-native': { apiKey: '', baseUrl: '', enabled: true }, 'qwen-asr': { apiKey: '', baseUrl: '', enabled: false }, + 'lemonade-asr': { apiKey: '', baseUrl: '', enabled: false }, } as Record, }); @@ -389,6 +391,7 @@ const getDefaultImageConfig = () => ({ 'nano-banana': { apiKey: '', baseUrl: '', enabled: false }, 'minimax-image': { apiKey: '', baseUrl: '', enabled: false }, 'grok-image': { apiKey: '', baseUrl: '', enabled: false }, + lemonade: { apiKey: '', baseUrl: '', enabled: false }, } as Record, }); @@ -851,7 +854,14 @@ export const useSettingsStore = create()( })), // Image Generation actions - setImageProvider: (providerId) => set({ imageProviderId: providerId }), + setImageProvider: (providerId) => + set(() => { + const models = IMAGE_PROVIDERS[providerId]?.models || []; + return { + imageProviderId: providerId, + imageModelId: models[0]?.id || '', + }; + }), setImageModelId: (modelId) => set({ imageModelId: modelId }), setImageProviderConfig: (providerId, config) => diff --git a/lib/types/provider.ts b/lib/types/provider.ts index 9437e2998..77c929f9e 100644 --- a/lib/types/provider.ts +++ b/lib/types/provider.ts @@ -20,6 +20,7 @@ export type BuiltInProviderId = | 'grok' | 'tencent-hunyuan' | 'xiaomi' + | 'lemonade' | 'ollama'; /** diff --git a/public/logos/lemonade.svg b/public/logos/lemonade.svg new file mode 100644 index 000000000..5b51d7a65 --- /dev/null +++ b/public/logos/lemonade.svg @@ -0,0 +1,53 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From f23c0d288d0b031944b541027f8811489611aa1d Mon Sep 17 00:00:00 2001 From: yangshen <1322568757@qq.com> Date: Tue, 5 May 2026 20:59:27 +0800 Subject: [PATCH 2/9] fix(lemonade): correct TTS/ASR model IDs and expand Kokoro voices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - TTS model: kokoro → kokoro-v1 - ASR model: single 'whisper' → 6 Whisper variants (Base/Large-v3/Large-v3-Turbo/Medium/Small/Tiny), default Whisper-Base - Expand lemonade-tts voices to full Kokoro v1 set (54 voices across en-US/en-GB/zh-CN/ja-JP/es/fr-FR/hi/it-IT/pt-BR) Co-Authored-By: Claude Opus 4.7 (1M context) --- lib/audio/constants.ts | 85 ++++++++++++++++++++++++++++++++++++++---- lib/store/settings.ts | 2 +- 2 files changed, 79 insertions(+), 8 deletions(-) diff --git a/lib/audio/constants.ts b/lib/audio/constants.ts index 9515478f8..cef5783c8 100644 --- a/lib/audio/constants.ts +++ b/lib/audio/constants.ts @@ -950,12 +950,76 @@ export const TTS_PROVIDERS: Record = { requiresApiKey: false, defaultBaseUrl: 'http://localhost:13305/v1', icon: '/logos/lemonade.svg', - models: [{ id: 'kokoro', name: 'Kokoro' }], - defaultModelId: 'kokoro', + models: [{ id: 'kokoro-v1', name: 'Kokoro v1' }], + defaultModelId: 'kokoro-v1', voices: [ { id: 'default', name: 'Default', language: 'auto', gender: 'neutral' }, - { id: 'af_bella', name: 'Bella', language: 'en', gender: 'female' }, - { id: 'am_adam', name: 'Adam', language: 'en', gender: 'male' }, + // American English — female + { id: 'af_alloy', name: 'Alloy', language: 'en-US', gender: 'female' }, + { id: 'af_aoede', name: 'Aoede', language: 'en-US', gender: 'female' }, + { id: 'af_bella', name: 'Bella', language: 'en-US', gender: 'female' }, + { id: 'af_heart', name: 'Heart', language: 'en-US', gender: 'female' }, + { id: 'af_jessica', name: 'Jessica', language: 'en-US', gender: 'female' }, + { id: 'af_kore', name: 'Kore', language: 'en-US', gender: 'female' }, + { id: 'af_nicole', name: 'Nicole', language: 'en-US', gender: 'female' }, + { id: 'af_nova', name: 'Nova', language: 'en-US', gender: 'female' }, + { id: 'af_river', name: 'River', language: 'en-US', gender: 'female' }, + { id: 'af_sarah', name: 'Sarah', language: 'en-US', gender: 'female' }, + { id: 'af_sky', name: 'Sky', language: 'en-US', gender: 'female' }, + // American English — male + { id: 'am_adam', name: 'Adam', language: 'en-US', gender: 'male' }, + { id: 'am_echo', name: 'Echo', language: 'en-US', gender: 'male' }, + { id: 'am_eric', name: 'Eric', language: 'en-US', gender: 'male' }, + { id: 'am_fenrir', name: 'Fenrir', language: 'en-US', gender: 'male' }, + { id: 'am_liam', name: 'Liam', language: 'en-US', gender: 'male' }, + { id: 'am_michael', name: 'Michael', language: 'en-US', gender: 'male' }, + { id: 'am_onyx', name: 'Onyx', language: 'en-US', gender: 'male' }, + { id: 'am_puck', name: 'Puck', language: 'en-US', gender: 'male' }, + // British English — female + { id: 'bf_alice', name: 'Alice', language: 'en-GB', gender: 'female' }, + { id: 'bf_emma', name: 'Emma', language: 'en-GB', gender: 'female' }, + { id: 'bf_isabella', name: 'Isabella', language: 'en-GB', gender: 'female' }, + { id: 'bf_lily', name: 'Lily', language: 'en-GB', gender: 'female' }, + // British English — male + { id: 'bm_daniel', name: 'Daniel', language: 'en-GB', gender: 'male' }, + { id: 'bm_fable', name: 'Fable', language: 'en-GB', gender: 'male' }, + { id: 'bm_george', name: 'George', language: 'en-GB', gender: 'male' }, + { id: 'bm_lewis', name: 'Lewis', language: 'en-GB', gender: 'male' }, + // Mandarin Chinese — female + { id: 'zf_xiaobei', name: '晓贝', language: 'zh-CN', gender: 'female' }, + { id: 'zf_xiaoni', name: '晓妮', language: 'zh-CN', gender: 'female' }, + { id: 'zf_xiaoxiao', name: '晓晓', language: 'zh-CN', gender: 'female' }, + { id: 'zf_xiaoyi', name: '晓伊', language: 'zh-CN', gender: 'female' }, + // Mandarin Chinese — male + { id: 'zm_yunjian', name: '云健', language: 'zh-CN', gender: 'male' }, + { id: 'zm_yunxi', name: '云希', language: 'zh-CN', gender: 'male' }, + { id: 'zm_yunxia', name: '云夏', language: 'zh-CN', gender: 'male' }, + { id: 'zm_yunyang', name: '云扬', language: 'zh-CN', gender: 'male' }, + // Japanese — female + { id: 'jf_alpha', name: 'Alpha', language: 'ja-JP', gender: 'female' }, + { id: 'jf_gongitsune', name: 'Gongitsune', language: 'ja-JP', gender: 'female' }, + { id: 'jf_nezumi', name: 'Nezumi', language: 'ja-JP', gender: 'female' }, + { id: 'jf_tebukuro', name: 'Tebukuro', language: 'ja-JP', gender: 'female' }, + // Japanese — male + { id: 'jm_kumo', name: 'Kumo', language: 'ja-JP', gender: 'male' }, + // Spanish + { id: 'ef_dora', name: 'Dora', language: 'es', gender: 'female' }, + { id: 'em_alex', name: 'Alex', language: 'es', gender: 'male' }, + { id: 'em_santa', name: 'Santa', language: 'es', gender: 'male' }, + // French + { id: 'ff_siwis', name: 'Siwis', language: 'fr-FR', gender: 'female' }, + // Hindi + { id: 'hf_alpha', name: 'Alpha', language: 'hi', gender: 'female' }, + { id: 'hf_beta', name: 'Beta', language: 'hi', gender: 'female' }, + { id: 'hm_omega', name: 'Omega', language: 'hi', gender: 'male' }, + { id: 'hm_psi', name: 'Psi', language: 'hi', gender: 'male' }, + // Italian + { id: 'if_sara', name: 'Sara', language: 'it-IT', gender: 'female' }, + { id: 'im_nicola', name: 'Nicola', language: 'it-IT', gender: 'male' }, + // Brazilian Portuguese + { id: 'pf_dora', name: 'Dora', language: 'pt-BR', gender: 'female' }, + { id: 'pm_alex', name: 'Alex', language: 'pt-BR', gender: 'male' }, + { id: 'pm_santa', name: 'Santa', language: 'pt-BR', gender: 'male' }, ], supportedFormats: ['wav'], speedRange: { min: 0.25, max: 4.0, default: 1.0 }, @@ -1166,8 +1230,15 @@ export const ASR_PROVIDERS: Record = { requiresApiKey: false, defaultBaseUrl: 'http://localhost:13305/v1', icon: '/logos/lemonade.svg', - models: [{ id: 'whisper', name: 'Whisper' }], - defaultModelId: 'whisper', + models: [ + { id: 'Whisper-Base', name: 'Whisper Base' }, + { id: 'Whisper-Large-v3', name: 'Whisper Large v3' }, + { id: 'Whisper-Large-v3-Turbo', name: 'Whisper Large v3 Turbo' }, + { id: 'Whisper-Medium', name: 'Whisper Medium' }, + { id: 'Whisper-Small', name: 'Whisper Small' }, + { id: 'Whisper-Tiny', name: 'Whisper Tiny' }, + ], + defaultModelId: 'Whisper-Base', supportedLanguages: CUSTOM_ASR_DEFAULT_LANGUAGES, supportedFormats: ['wav'], }, @@ -1199,7 +1270,7 @@ export const DEFAULT_TTS_MODELS: Record = { 'doubao-tts': '', 'elevenlabs-tts': 'eleven_multilingual_v2', 'minimax-tts': 'speech-2.8-hd', - 'lemonade-tts': 'kokoro', + 'lemonade-tts': 'kokoro-v1', 'browser-native-tts': '', }; diff --git a/lib/store/settings.ts b/lib/store/settings.ts index 32f9b19d8..3580adb86 100644 --- a/lib/store/settings.ts +++ b/lib/store/settings.ts @@ -356,7 +356,7 @@ const getDefaultAudioConfig = () => ({ 'doubao-tts': { apiKey: '', baseUrl: '', enabled: false }, 'elevenlabs-tts': { apiKey: '', baseUrl: '', enabled: false }, 'minimax-tts': { apiKey: '', baseUrl: '', modelId: 'speech-2.8-hd', enabled: false }, - 'lemonade-tts': { apiKey: '', baseUrl: '', modelId: 'kokoro', enabled: false }, + 'lemonade-tts': { apiKey: '', baseUrl: '', modelId: 'kokoro-v1', enabled: false }, 'browser-native-tts': { apiKey: '', baseUrl: '', enabled: true }, } as Record< TTSProviderId, From a944b88bcb81f075fa1d3eedb0b7b25d010623ac Mon Sep 17 00:00:00 2001 From: yangshen <1322568757@qq.com> Date: Tue, 5 May 2026 21:15:16 +0800 Subject: [PATCH 3/9] fix(lemonade): zh-TW i18n parity, real Kokoro default voice, language code consistency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - zh-TW.json: add missing providerLemonadeTTS/ASR/Image keys - TTS default voice: 'default' → 'af_heart' (Kokoro expects real voice ID, not placeholder) - Voice language codes: 'es'/'hi' → 'es-ES'/'hi-IN' for consistency Co-Authored-By: Claude Opus 4.7 (1M context) --- lib/audio/constants.ts | 17 ++++++++--------- lib/audio/tts-providers.ts | 2 +- lib/i18n/locales/zh-TW.json | 3 +++ 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/lib/audio/constants.ts b/lib/audio/constants.ts index cef5783c8..52966e09d 100644 --- a/lib/audio/constants.ts +++ b/lib/audio/constants.ts @@ -953,7 +953,6 @@ export const TTS_PROVIDERS: Record = { models: [{ id: 'kokoro-v1', name: 'Kokoro v1' }], defaultModelId: 'kokoro-v1', voices: [ - { id: 'default', name: 'Default', language: 'auto', gender: 'neutral' }, // American English — female { id: 'af_alloy', name: 'Alloy', language: 'en-US', gender: 'female' }, { id: 'af_aoede', name: 'Aoede', language: 'en-US', gender: 'female' }, @@ -1003,16 +1002,16 @@ export const TTS_PROVIDERS: Record = { // Japanese — male { id: 'jm_kumo', name: 'Kumo', language: 'ja-JP', gender: 'male' }, // Spanish - { id: 'ef_dora', name: 'Dora', language: 'es', gender: 'female' }, - { id: 'em_alex', name: 'Alex', language: 'es', gender: 'male' }, - { id: 'em_santa', name: 'Santa', language: 'es', gender: 'male' }, + { id: 'ef_dora', name: 'Dora', language: 'es-ES', gender: 'female' }, + { id: 'em_alex', name: 'Alex', language: 'es-ES', gender: 'male' }, + { id: 'em_santa', name: 'Santa', language: 'es-ES', gender: 'male' }, // French { id: 'ff_siwis', name: 'Siwis', language: 'fr-FR', gender: 'female' }, // Hindi - { id: 'hf_alpha', name: 'Alpha', language: 'hi', gender: 'female' }, - { id: 'hf_beta', name: 'Beta', language: 'hi', gender: 'female' }, - { id: 'hm_omega', name: 'Omega', language: 'hi', gender: 'male' }, - { id: 'hm_psi', name: 'Psi', language: 'hi', gender: 'male' }, + { id: 'hf_alpha', name: 'Alpha', language: 'hi-IN', gender: 'female' }, + { id: 'hf_beta', name: 'Beta', language: 'hi-IN', gender: 'female' }, + { id: 'hm_omega', name: 'Omega', language: 'hi-IN', gender: 'male' }, + { id: 'hm_psi', name: 'Psi', language: 'hi-IN', gender: 'male' }, // Italian { id: 'if_sara', name: 'Sara', language: 'it-IT', gender: 'female' }, { id: 'im_nicola', name: 'Nicola', language: 'it-IT', gender: 'male' }, @@ -1257,7 +1256,7 @@ export const DEFAULT_TTS_VOICES: Record = { 'doubao-tts': 'zh_female_vv_uranus_bigtts', 'elevenlabs-tts': 'EXAVITQu4vr4xnSDxMaL', 'minimax-tts': 'female-yujie', - 'lemonade-tts': 'default', + 'lemonade-tts': 'af_heart', 'browser-native-tts': 'default', }; diff --git a/lib/audio/tts-providers.ts b/lib/audio/tts-providers.ts index 5eb245948..bacfea68a 100644 --- a/lib/audio/tts-providers.ts +++ b/lib/audio/tts-providers.ts @@ -239,7 +239,7 @@ async function generateLemonadeTTS( body: JSON.stringify({ model: config.modelId || TTS_PROVIDERS['lemonade-tts'].defaultModelId, input: text, - voice: config.voice || 'default', + voice: config.voice || 'af_heart', speed: config.speed || 1.0, response_format: config.format || 'wav', }), diff --git a/lib/i18n/locales/zh-TW.json b/lib/i18n/locales/zh-TW.json index 2b116b313..d7b263e53 100644 --- a/lib/i18n/locales/zh-TW.json +++ b/lib/i18n/locales/zh-TW.json @@ -584,9 +584,11 @@ "providerDoubaoTTS": "豆包 TTS 2.0(火山引擎)", "providerElevenLabsTTS": "ElevenLabs TTS", "providerMiniMaxTTS": "MiniMax TTS", + "providerLemonadeTTS": "Lemonade TTS(本機)", "providerBrowserNativeTTS": "瀏覽器原生 TTS", "providerOpenAIWhisper": "OpenAI ASR (gpt-4o-mini-transcribe)", "providerBrowserNative": "瀏覽器原生 ASR", + "providerLemonadeASR": "Lemonade ASR(本機)", "providerQwenASR": "Qwen ASR(阿里雲百煉)", "providerUnpdf": "unpdf(內建)", "providerMinerU": "MinerU", @@ -815,6 +817,7 @@ "providerQwenImage": "Qwen Image(阿里通義)", "providerNanoBanana": "Nano Banana(Gemini)", "providerMiniMaxImage": "MiniMax 圖像", + "providerLemonadeImage": "Lemonade 圖像(本機)", "providerGrokImage": "Grok Image(xAI)", "testImageGeneration": "測試圖像生成", "testImageConnectivity": "測試連線", From a287367961bbdd93b7a515919b3207a9eef21e65 Mon Sep 17 00:00:00 2001 From: yangshen <1322568757@qq.com> Date: Tue, 5 May 2026 21:23:30 +0800 Subject: [PATCH 4/9] test(lemonade): add unit tests for image/TTS/ASR adapters and wav-utils Covers: - lib/media/adapters/lemonade-image-adapter: request shape, base URL fallback, custom model, optional Bearer auth, error paths, connectivity probe - lib/audio/tts-providers (lemonade-tts case): request body (model/voice/format), af_heart fallback, optional auth, error path - lib/audio/asr-providers (lemonade-asr case): WAV-only validation, model/language form fields, "audio is empty" graceful empty result, error propagation, default model fallback - lib/audio/wav-utils: isWavBlob detection by MIME and filename, normalizeASRUploadAudio pass-through and wav handling Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/audio/lemonade-asr.test.ts | 107 ++++++++++++++ tests/audio/lemonade-tts.test.ts | 108 +++++++++++++++ tests/audio/wav-utils.test.ts | 42 ++++++ tests/media/lemonade-image-adapter.test.ts | 153 +++++++++++++++++++++ 4 files changed, 410 insertions(+) create mode 100644 tests/audio/lemonade-asr.test.ts create mode 100644 tests/audio/lemonade-tts.test.ts create mode 100644 tests/audio/wav-utils.test.ts create mode 100644 tests/media/lemonade-image-adapter.test.ts diff --git a/tests/audio/lemonade-asr.test.ts b/tests/audio/lemonade-asr.test.ts new file mode 100644 index 000000000..d280d3eea --- /dev/null +++ b/tests/audio/lemonade-asr.test.ts @@ -0,0 +1,107 @@ +import { beforeEach, describe, expect, it, vi, type Mock } from 'vitest'; +import { transcribeAudio } from '@/lib/audio/asr-providers'; + +const mockFetch = vi.fn() as Mock; +vi.stubGlobal('fetch', mockFetch); + +function wavBuffer(): Buffer { + const buf = Buffer.alloc(16); + buf.write('RIFF', 0, 'ascii'); + buf.writeUInt32LE(8, 4); + buf.write('WAVE', 8, 'ascii'); + return buf; +} + +describe('Lemonade ASR', () => { + beforeEach(() => { + mockFetch.mockReset(); + }); + + it('posts WAV audio to /audio/transcriptions with the configured model', async () => { + mockFetch.mockResolvedValueOnce({ + ok: true, + json: async () => ({ text: 'hello' }), + }); + + const result = await transcribeAudio( + { + providerId: 'lemonade-asr', + baseUrl: 'http://localhost:13305/v1/', + modelId: 'Whisper-Base', + }, + wavBuffer(), + ); + + expect(mockFetch).toHaveBeenCalledWith( + 'http://localhost:13305/v1/audio/transcriptions', + expect.objectContaining({ method: 'POST' }), + ); + const formData = mockFetch.mock.calls[0][1].body as FormData; + expect(formData.get('model')).toBe('Whisper-Base'); + expect(formData.get('response_format')).toBe('json'); + expect(formData.get('file')).toBeInstanceOf(Blob); + expect(result).toEqual({ text: 'hello' }); + }); + + it('forwards an explicit language but not when set to "auto"', async () => { + mockFetch.mockResolvedValue({ + ok: true, + json: async () => ({ text: '' }), + }); + + await transcribeAudio({ providerId: 'lemonade-asr', language: 'en' }, wavBuffer()); + let formData = mockFetch.mock.calls[0][1].body as FormData; + expect(formData.get('language')).toBe('en'); + + mockFetch.mockClear(); + + await transcribeAudio({ providerId: 'lemonade-asr', language: 'auto' }, wavBuffer()); + formData = mockFetch.mock.calls[0][1].body as FormData; + expect(formData.get('language')).toBeNull(); + }); + + it('rejects non-WAV audio buffers', async () => { + const notWav = Buffer.from('IDXX' + '\0'.repeat(12)); + + await expect(transcribeAudio({ providerId: 'lemonade-asr' }, notWav)).rejects.toThrow( + /WAV input only/, + ); + expect(mockFetch).not.toHaveBeenCalled(); + }); + + it('returns empty text gracefully when upstream reports empty audio', async () => { + mockFetch.mockResolvedValueOnce({ + ok: false, + status: 400, + text: async () => 'audio is empty', + statusText: 'Bad Request', + }); + + const result = await transcribeAudio({ providerId: 'lemonade-asr' }, wavBuffer()); + expect(result).toEqual({ text: '' }); + }); + + it('throws on unrecognized error payloads', async () => { + mockFetch.mockResolvedValueOnce({ + ok: false, + status: 500, + text: async () => 'model crashed', + statusText: 'Internal Server Error', + }); + + await expect(transcribeAudio({ providerId: 'lemonade-asr' }, wavBuffer())).rejects.toThrow( + /Lemonade ASR API error.*model crashed/, + ); + }); + + it('falls back to default model id when not provided', async () => { + mockFetch.mockResolvedValueOnce({ + ok: true, + json: async () => ({ text: 'ok' }), + }); + + await transcribeAudio({ providerId: 'lemonade-asr' }, wavBuffer()); + const formData = mockFetch.mock.calls[0][1].body as FormData; + expect(formData.get('model')).toBe('Whisper-Base'); + }); +}); diff --git a/tests/audio/lemonade-tts.test.ts b/tests/audio/lemonade-tts.test.ts new file mode 100644 index 000000000..9580cbcf4 --- /dev/null +++ b/tests/audio/lemonade-tts.test.ts @@ -0,0 +1,108 @@ +import { beforeEach, describe, expect, it, vi, type Mock } from 'vitest'; +import { generateTTS } from '@/lib/audio/tts-providers'; + +const mockFetch = vi.fn() as Mock; +vi.stubGlobal('fetch', mockFetch); + +function wavBytes(): ArrayBuffer { + const data = new Uint8Array(16); + data[0] = 0x52; // 'R' + data[1] = 0x49; // 'I' + data[2] = 0x46; // 'F' + data[3] = 0x46; // 'F' + data[8] = 0x57; // 'W' + data[9] = 0x41; // 'A' + data[10] = 0x56; // 'V' + data[11] = 0x45; // 'E' + return data.buffer; +} + +describe('Lemonade TTS', () => { + beforeEach(() => { + mockFetch.mockReset(); + }); + + it('posts to /audio/speech with kokoro-v1 + wav and bubble-up audio bytes', async () => { + const buffer = wavBytes(); + mockFetch.mockResolvedValueOnce({ + ok: true, + arrayBuffer: async () => buffer, + headers: { get: () => 'audio/wav' }, + }); + + const result = await generateTTS( + { + providerId: 'lemonade-tts', + baseUrl: 'http://localhost:13305/v1/', + voice: 'af_heart', + }, + 'hello world', + ); + + expect(mockFetch).toHaveBeenCalledWith( + 'http://localhost:13305/v1/audio/speech', + expect.objectContaining({ method: 'POST' }), + ); + const body = JSON.parse(mockFetch.mock.calls[0][1].body); + expect(body).toEqual({ + model: 'kokoro-v1', + input: 'hello world', + voice: 'af_heart', + speed: 1.0, + response_format: 'wav', + }); + expect(result.audio).toBeInstanceOf(Uint8Array); + expect(result.audio.byteLength).toBe(16); + expect(result.format).toBe('wav'); + }); + + it('falls back to af_heart when no voice is provided', async () => { + mockFetch.mockResolvedValueOnce({ + ok: true, + arrayBuffer: async () => wavBytes(), + headers: { get: () => 'audio/wav' }, + }); + + await generateTTS({ providerId: 'lemonade-tts', voice: '' }, 'hi'); + + const body = JSON.parse(mockFetch.mock.calls[0][1].body); + expect(body.voice).toBe('af_heart'); + }); + + it('does not require an API key (keyless provider)', async () => { + mockFetch.mockResolvedValueOnce({ + ok: true, + arrayBuffer: async () => wavBytes(), + headers: { get: () => 'audio/wav' }, + }); + + await generateTTS({ providerId: 'lemonade-tts', voice: 'af_heart' }, 'hi'); + + expect(mockFetch.mock.calls[0][1].headers.Authorization).toBeUndefined(); + }); + + it('attaches Bearer auth when apiKey is provided', async () => { + mockFetch.mockResolvedValueOnce({ + ok: true, + arrayBuffer: async () => wavBytes(), + headers: { get: () => 'audio/wav' }, + }); + + await generateTTS({ providerId: 'lemonade-tts', apiKey: 'sk-lm', voice: 'af_heart' }, 'hi'); + + expect(mockFetch.mock.calls[0][1].headers.Authorization).toBe('Bearer sk-lm'); + }); + + it('throws on non-OK responses', async () => { + mockFetch.mockResolvedValueOnce({ + ok: false, + status: 400, + text: async () => 'bad voice', + statusText: 'Bad Request', + }); + + await expect(generateTTS({ providerId: 'lemonade-tts', voice: 'foo' }, 'hi')).rejects.toThrow( + /Lemonade TTS API error/, + ); + }); +}); diff --git a/tests/audio/wav-utils.test.ts b/tests/audio/wav-utils.test.ts new file mode 100644 index 000000000..cbddc46c9 --- /dev/null +++ b/tests/audio/wav-utils.test.ts @@ -0,0 +1,42 @@ +import { describe, expect, it } from 'vitest'; +import { isWavBlob, normalizeASRUploadAudio } from '@/lib/audio/wav-utils'; + +describe('isWavBlob', () => { + it('detects audio/wav MIME type', () => { + const blob = new Blob([new Uint8Array(4)], { type: 'audio/wav' }); + expect(isWavBlob(blob)).toBe(true); + }); + + it('detects audio/x-wav MIME type', () => { + const blob = new Blob([new Uint8Array(4)], { type: 'audio/x-wav' }); + expect(isWavBlob(blob)).toBe(true); + }); + + it('detects .wav file extension when MIME is missing', () => { + const blob = new Blob([new Uint8Array(4)]); + expect(isWavBlob(blob, 'recording.wav')).toBe(true); + expect(isWavBlob(blob, 'recording.WAV')).toBe(true); + }); + + it('returns false for non-WAV blobs without a wav filename', () => { + const blob = new Blob([new Uint8Array(4)], { type: 'audio/webm' }); + expect(isWavBlob(blob)).toBe(false); + expect(isWavBlob(blob, 'recording.webm')).toBe(false); + }); +}); + +describe('normalizeASRUploadAudio', () => { + it('passes through non-lemonade providers unchanged', async () => { + const input = new Blob([new Uint8Array([1, 2, 3])], { type: 'audio/webm' }); + const result = await normalizeASRUploadAudio('openai-whisper', input); + expect(result.blob).toBe(input); + expect(result.fileName).toBe('recording.webm'); + }); + + it('keeps WAV blobs unchanged for lemonade-asr', async () => { + const input = new Blob([new Uint8Array([1, 2, 3])], { type: 'audio/wav' }); + const result = await normalizeASRUploadAudio('lemonade-asr', input); + expect(result.blob).toBe(input); + expect(result.fileName).toBe('recording.wav'); + }); +}); diff --git a/tests/media/lemonade-image-adapter.test.ts b/tests/media/lemonade-image-adapter.test.ts new file mode 100644 index 000000000..63ff4990d --- /dev/null +++ b/tests/media/lemonade-image-adapter.test.ts @@ -0,0 +1,153 @@ +import { beforeEach, describe, expect, it, vi, type Mock } from 'vitest'; +import { + generateWithLemonadeImage, + testLemonadeImageConnectivity, +} from '@/lib/media/adapters/lemonade-image-adapter'; + +const mockFetch = vi.fn() as Mock; +vi.stubGlobal('fetch', mockFetch); + +describe('lemonade-image-adapter', () => { + beforeEach(() => { + mockFetch.mockReset(); + }); + + it('posts generation requests to /images/generations with b64_json response_format', async () => { + mockFetch.mockResolvedValueOnce({ + ok: true, + json: async () => ({ data: [{ b64_json: 'aW1n' }] }), + }); + + const result = await generateWithLemonadeImage( + { providerId: 'lemonade', apiKey: '', baseUrl: 'http://localhost:13305/v1/' }, + { prompt: 'a fox', width: 768, height: 768 }, + ); + + expect(mockFetch).toHaveBeenCalledWith( + 'http://localhost:13305/v1/images/generations', + expect.objectContaining({ + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + }), + ); + const body = JSON.parse(mockFetch.mock.calls[0][1].body); + expect(body).toEqual({ + model: 'sd-cpp', + prompt: 'a fox', + n: 1, + size: '768x768', + response_format: 'b64_json', + }); + expect(result).toEqual({ + url: undefined, + base64: 'aW1n', + width: 768, + height: 768, + }); + }); + + it('falls back to default base URL and 1024x1024 when not provided', async () => { + mockFetch.mockResolvedValueOnce({ + ok: true, + json: async () => ({ data: [{ b64_json: 'eA==' }] }), + }); + + await generateWithLemonadeImage({ providerId: 'lemonade', apiKey: '' }, { prompt: 'tile' }); + + expect(mockFetch.mock.calls[0][0]).toBe('http://localhost:13305/v1/images/generations'); + const body = JSON.parse(mockFetch.mock.calls[0][1].body); + expect(body.size).toBe('1024x1024'); + }); + + it('forwards custom model id when provided', async () => { + mockFetch.mockResolvedValueOnce({ + ok: true, + json: async () => ({ data: [{ b64_json: 'eA==' }] }), + }); + + await generateWithLemonadeImage( + { providerId: 'lemonade', apiKey: '', model: 'flux-schnell' }, + { prompt: 'p' }, + ); + + const body = JSON.parse(mockFetch.mock.calls[0][1].body); + expect(body.model).toBe('flux-schnell'); + }); + + it('attaches Bearer auth header when apiKey is provided', async () => { + mockFetch.mockResolvedValueOnce({ + ok: true, + json: async () => ({ data: [{ b64_json: 'eA==' }] }), + }); + + await generateWithLemonadeImage({ providerId: 'lemonade', apiKey: 'sk-lm' }, { prompt: 'p' }); + + expect(mockFetch.mock.calls[0][1].headers).toEqual({ + 'Content-Type': 'application/json', + Authorization: 'Bearer sk-lm', + }); + }); + + it('omits auth header when apiKey is empty (keyless)', async () => { + mockFetch.mockResolvedValueOnce({ + ok: true, + json: async () => ({ data: [{ b64_json: 'eA==' }] }), + }); + + await generateWithLemonadeImage({ providerId: 'lemonade', apiKey: '' }, { prompt: 'p' }); + + expect(mockFetch.mock.calls[0][1].headers).toEqual({ + 'Content-Type': 'application/json', + }); + }); + + it('throws a useful error on failed generation responses', async () => { + mockFetch.mockResolvedValueOnce({ + ok: false, + status: 500, + text: async () => 'model unavailable', + statusText: 'Internal Server Error', + }); + + await expect( + generateWithLemonadeImage({ providerId: 'lemonade', apiKey: '' }, { prompt: 'p' }), + ).rejects.toThrow('Lemonade image generation failed (500): model unavailable'); + }); + + it('throws when response payload contains no image data', async () => { + mockFetch.mockResolvedValueOnce({ + ok: true, + json: async () => ({ data: [{}] }), + }); + + await expect( + generateWithLemonadeImage({ providerId: 'lemonade', apiKey: '' }, { prompt: 'p' }), + ).rejects.toThrow('Lemonade returned empty image response'); + }); + + it('reports connectivity success against /models endpoint', async () => { + mockFetch.mockResolvedValueOnce({ ok: true }); + + const result = await testLemonadeImageConnectivity({ providerId: 'lemonade', apiKey: '' }); + + expect(mockFetch).toHaveBeenCalledWith( + 'http://localhost:13305/v1/models', + expect.objectContaining({ headers: {} }), + ); + expect(result.success).toBe(true); + }); + + it('reports connectivity failure with response text', async () => { + mockFetch.mockResolvedValueOnce({ + ok: false, + status: 503, + text: async () => 'unavailable', + statusText: 'Service Unavailable', + }); + + const result = await testLemonadeImageConnectivity({ providerId: 'lemonade', apiKey: '' }); + + expect(result.success).toBe(false); + expect(result.message).toBe('Lemonade API error (503): unavailable'); + }); +}); From 957c4d3da3fbccc5533849fcae24d97bff47267c Mon Sep 17 00:00:00 2001 From: yangshen <1322568757@qq.com> Date: Thu, 7 May 2026 19:11:22 +0800 Subject: [PATCH 5/9] fix(lemonade): remove text path fork and harden fallbacks --- app/api/generate/agent-profiles/route.ts | 33 ++++--- .../generate/scene-outlines-stream/route.ts | 15 +++- app/generation-preview/page.tsx | 1 + components/settings/tts-settings.tsx | 29 +++++++ lib/ai/model-metadata.ts | 20 ++++- lib/ai/providers.ts | 83 +++++++++++++++--- lib/audio/tts-providers.ts | 71 +++++++++++++++- lib/generation/json-repair.ts | 49 ++++++++++- lib/i18n/locales/en-US.json | 2 + lib/i18n/locales/zh-CN.json | 2 + lib/media/image-providers.ts | 5 +- lib/server/resolve-model.ts | 6 ++ lib/store/settings.ts | 15 +++- lib/types/provider.ts | 3 +- tests/ai/openai-provider.test.ts | 85 ++++++++++++++++++- tests/ai/thinking-config.test.ts | 15 ++++ tests/audio/lemonade-tts.test.ts | 59 +++++++++++++ tests/generation/json-repair.test.ts | 57 +++++++++++++ 18 files changed, 514 insertions(+), 36 deletions(-) create mode 100644 tests/generation/json-repair.test.ts diff --git a/app/api/generate/agent-profiles/route.ts b/app/api/generate/agent-profiles/route.ts index 523fae819..b9d4710f6 100644 --- a/app/api/generate/agent-profiles/route.ts +++ b/app/api/generate/agent-profiles/route.ts @@ -23,7 +23,12 @@ interface RequestBody { languageDirective: string; availableAvatars: string[]; avatarDescriptions?: Array<{ path: string; desc: string }>; - availableVoices?: Array<{ providerId: string; voiceId: string; voiceName: string }>; + availableVoices?: Array<{ + providerId: string; + voiceId: string; + voiceName: string; + voiceLanguage?: string; + }>; } function stripCodeFences(text: string): string { @@ -89,12 +94,14 @@ export async function POST(req: NextRequest) { availableVoices.map((v) => ({ id: `${v.providerId}::${v.voiceId}`, name: v.voiceName, + language: v.voiceLanguage || 'unknown', })), ) : ''; const voicePrompt = voiceListStr ? `- Each agent should be assigned a voice that matches their persona from this list: ${voiceListStr} + - Prefer a voice whose language matches the course language directive - Pick a voice that suits the agent's personality and role (e.g. authoritative voice for teacher, lively voice for energetic student) - Try to use different voices for each agent` : ''; @@ -139,19 +146,21 @@ Return a JSON object with this exact structure: log.info(`Generating agent profiles for "${stageInfo.name}" [model=${modelString}]`); - const result = await callLLM( - { - model: languageModel, - system: systemPrompt, - prompt: userPrompt, - }, - 'agent-profiles', - undefined, - thinkingConfig, - ); + const rawResult = ( + await callLLM( + { + model: languageModel, + system: systemPrompt, + prompt: userPrompt, + }, + 'agent-profiles', + undefined, + thinkingConfig, + ) + ).text; // ── Parse LLM response ── - const rawText = stripCodeFences(result.text); + const rawText = stripCodeFences(rawResult); let parsed: { agents: Array<{ name: string; diff --git a/app/api/generate/scene-outlines-stream/route.ts b/app/api/generate/scene-outlines-stream/route.ts index df6d545e9..ce1c3469a 100644 --- a/app/api/generate/scene-outlines-stream/route.ts +++ b/app/api/generate/scene-outlines-stream/route.ts @@ -284,13 +284,16 @@ export async function POST(req: NextRequest) { for (let attempt = 1; attempt <= MAX_STREAM_RETRIES + 1; attempt++) { try { - const result = streamLLM(streamParams, 'scene-outlines-stream', thinkingConfig); - let fullText = ''; parsedOutlines = []; languageDirective = null; + const textStream = streamLLM( + streamParams, + 'scene-outlines-stream', + thinkingConfig, + ).textStream; - for await (const chunk of result.textStream) { + for await (const chunk of textStream) { fullText += chunk; // Try to extract language directive early @@ -332,6 +335,9 @@ export async function POST(req: NextRequest) { lastError = fullText.trim() ? 'LLM response could not be parsed into outlines' : 'LLM returned empty response'; + log.warn( + `Outlines attempt ${attempt} diagnostics: textLen=${fullText.length}, outlines=${parsedOutlines.length}, languageDirective=${languageDirective ? 'yes' : 'no'}, preview=${JSON.stringify(fullText.slice(0, 240))}`, + ); if (attempt <= MAX_STREAM_RETRIES) { log.warn( @@ -347,6 +353,9 @@ export async function POST(req: NextRequest) { } } catch (error) { lastError = error instanceof Error ? error.message : String(error); + log.warn( + `Outlines stream error detail (attempt ${attempt}/${MAX_STREAM_RETRIES + 1}): ${lastError}`, + ); if (attempt <= MAX_STREAM_RETRIES) { log.warn( diff --git a/app/generation-preview/page.tsx b/app/generation-preview/page.tsx index 2f7754af7..bcd2b007e 100644 --- a/app/generation-preview/page.tsx +++ b/app/generation-preview/page.tsx @@ -577,6 +577,7 @@ function GenerationPreviewContent() { providerId: p.providerId, voiceId: v.id, voiceName: v.name, + voiceLanguage: v.language, })), ); }; diff --git a/components/settings/tts-settings.tsx b/components/settings/tts-settings.tsx index ac28e2e51..b65616290 100644 --- a/components/settings/tts-settings.tsx +++ b/components/settings/tts-settings.tsx @@ -4,6 +4,7 @@ import { useState, useEffect, useRef, type ReactNode } from 'react'; import { Label } from '@/components/ui/label'; import { Input } from '@/components/ui/input'; import { Button } from '@/components/ui/button'; +import { Switch } from '@/components/ui/switch'; import { Textarea } from '@/components/ui/textarea'; import { Tabs, TabsContent, TabsList, TabsTrigger } from '@/components/ui/tabs'; import { @@ -89,6 +90,7 @@ export function TTSSettings({ selectedProviderId }: TTSSettingsProps) { const providerConfig = ttsProvidersConfig[selectedProviderId]; const isServerConfigured = !!providerConfig?.isServerConfigured; const isVoxCPM = selectedProviderId === 'voxcpm-tts'; + const isLemonade = selectedProviderId === 'lemonade-tts'; const voxcpmBackend = normalizeVoxCPMBackend(providerConfig?.providerOptions?.backend); const requiresApiKey = isCustom ? !!providerConfig?.requiresApiKey @@ -522,6 +524,33 @@ export function TTSSettings({ selectedProviderId }: TTSSettingsProps) {
)} + {isLemonade && ( +
+
+
+ +

+ {t('settings.lemonadeAutoMatchVoiceLanguageDescription')} +

+
+ + setTTSProviderConfig(selectedProviderId, { + providerOptions: { + ...(providerConfig?.providerOptions || {}), + autoMatchVoiceLanguage: checked, + }, + }) + } + aria-label={t('settings.lemonadeAutoMatchVoiceLanguageLabel')} + /> +
+
+ )} + {selectedProviderId === 'voxcpm-tts' && } {/* Custom Voice List Management */} diff --git a/lib/ai/model-metadata.ts b/lib/ai/model-metadata.ts index 425cf9b1e..3a08cc6d7 100644 --- a/lib/ai/model-metadata.ts +++ b/lib/ai/model-metadata.ts @@ -163,6 +163,12 @@ const hunyuanHy3Effort: ThinkingCapability = { defaultEnabled: false, }; +const lemonadeToggleBudget = toggleBudgetCapability( + 'lemonade', + { min: 0, max: 81920, step: 1024, disableValue: 0 }, + false, +); + const qwenBudgetEnabled = toggleBudgetCapability( 'qwen', { min: 0, max: 81920, step: 1024, disableValue: 0 }, @@ -328,13 +334,25 @@ const THINKING_CAPABILITIES: Record = { [getModelMetadataKey('xiaomi', 'mimo-v2.5-pro')]: toggleCapability('xiaomi'), [getModelMetadataKey('xiaomi', 'mimo-v2.5')]: toggleCapability('xiaomi'), + + [getModelMetadataKey('lemonade', 'Qwen3-4B-GGUF')]: lemonadeToggleBudget, + [getModelMetadataKey('lemonade', 'Qwen3.5-4B-GGUF')]: lemonadeToggleBudget, + [getModelMetadataKey('lemonade', 'gpt-oss-20b')]: lemonadeToggleBudget, + [getModelMetadataKey('lemonade', 'GPT-OSS-20B-GGUF')]: lemonadeToggleBudget, }; export function getCatalogThinkingCapability( providerId: string, modelId: string, ): ThinkingCapability | undefined { - return THINKING_CAPABILITIES[getModelMetadataKey(providerId, modelId)]; + const exact = THINKING_CAPABILITIES[getModelMetadataKey(providerId, modelId)]; + if (exact) return exact; + + if (providerId === 'lemonade') { + return lemonadeToggleBudget; + } + + return undefined; } export function applyModelMetadata(providers: Record): void { diff --git a/lib/ai/providers.ts b/lib/ai/providers.ts index f2b4aa1f8..348cd62c6 100644 --- a/lib/ai/providers.ts +++ b/lib/ai/providers.ts @@ -35,7 +35,7 @@ import type { ThinkingConfig, } from '@/lib/types/provider'; import { applyModelMetadata, getCatalogThinkingCapability } from './model-metadata'; -import { getThinkingMode, pickThinkingBudget } from './thinking-config'; +import { getDefaultThinkingConfig, getThinkingMode, pickThinkingBudget } from './thinking-config'; import { createLogger } from '@/lib/logger'; // NOTE: Do NOT import thinking-context.ts here — it uses node:async_hooks // which is server-only, and this file is also used on the client via @@ -990,19 +990,24 @@ export const PROVIDERS: Record = { icon: '/logos/lemonade.svg', models: [ { - id: 'Qwen3-0.6B-GGUF', - name: 'Qwen3 0.6B GGUF', + id: 'Qwen3.5-4B-GGUF', + name: 'Qwen3.5 4B GGUF', + capabilities: { streaming: true, tools: true, vision: true }, + }, + { + id: 'Qwen3-4B-GGUF', + name: 'Qwen3 4B GGUF', capabilities: { streaming: true, tools: true, vision: false }, }, { - id: 'Llama-3.2-1B-Instruct-Hybrid', - name: 'Llama 3.2 1B Instruct Hybrid', + id: 'gpt-oss-20b', + name: 'GPT-OSS 20B', capabilities: { streaming: true, tools: true, vision: false }, }, { - id: 'Qwen2.5-VL-7B-Instruct', - name: 'Qwen2.5 VL 7B Instruct', - capabilities: { streaming: true, tools: true, vision: true }, + id: 'Gemma-4-26B-A4B-it-GGUF', + name: 'Gemma 4 26B A4B IT GGUF', + capabilities: { streaming: true, tools: true, vision: false }, }, ], }, @@ -1153,6 +1158,19 @@ function getCompatThinkingBodyParams( : undefined; } + case 'lemonade': { + const chatTemplateKwargs: Record = {}; + if (mode === 'enabled') { + chatTemplateKwargs.enable_thinking = true; + } else { + chatTemplateKwargs.enable_thinking = false; + } + if (mode === 'enabled' && budget !== undefined) { + chatTemplateKwargs.thinking_budget = budget; + } + return { chat_template_kwargs: chatTemplateKwargs }; + } + default: return undefined; } @@ -1243,12 +1261,20 @@ export function getModel(config: ModelConfig): ModelWithInfo { const thinkingCtx = (globalThis as Record).__thinkingContext as | { getStore?: () => unknown } | undefined; - const thinking = thinkingCtx?.getStore?.() as ThinkingConfig | undefined; + const thinkingFromContext = thinkingCtx?.getStore?.() as ThinkingConfig | undefined; + const thinking = + thinkingFromContext ?? + (providerId === 'lemonade' + ? getDefaultThinkingConfig(getCatalogThinkingCapability(providerId, config.modelId)) + : undefined); if (thinking && init?.body && typeof init.body === 'string') { const extra = getCompatThinkingBodyParams(providerId, config.modelId, thinking); if (extra) { try { const body = JSON.parse(init.body); + if (providerId === 'lemonade' && 'stream_options' in body) { + delete body.stream_options; + } Object.assign(body, extra); init = { ...init, body: JSON.stringify(body) }; } catch { @@ -1256,7 +1282,44 @@ export function getModel(config: ModelConfig): ModelWithInfo { } } } - return globalThis.fetch(url, init); + const response = await globalThis.fetch(url, init); + + if (providerId !== 'lemonade') { + return response; + } + + const contentType = response.headers.get('content-type') || ''; + let isStreamingRequest = false; + if (init?.body && typeof init.body === 'string') { + try { + const requestBody = JSON.parse(init.body); + isStreamingRequest = requestBody?.stream === true; + } catch { + /* ignore request-body inspection failure */ + } + } + + if (isStreamingRequest) { + return response; + } + + try { + const cloned = response.clone(); + const text = await cloned.text(); + + try { + JSON.parse(text); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + log.warn( + `[Lemonade] Invalid JSON response from OpenAI-compatible path: status=${response.status}, contentType=${contentType || 'n/a'}, bodyLen=${text.length}, first=${JSON.stringify(text.slice(0, 500))}, last=${JSON.stringify(text.slice(Math.max(0, text.length - 500)))}, parseError=${message}`, + ); + } + } catch (error) { + log.warn('[Lemonade] Failed to inspect JSON response body:', error); + } + + return response; }; } diff --git a/lib/audio/tts-providers.ts b/lib/audio/tts-providers.ts index bacfea68a..c7a4c047f 100644 --- a/lib/audio/tts-providers.ts +++ b/lib/audio/tts-providers.ts @@ -110,6 +110,8 @@ export interface TTSGenerationResult { format: string; } +const CJK_LANG_THRESHOLD = 0.3; + /** * Thrown when a TTS provider returns a rate-limit / concurrency-quota error. * Allows downstream consumers to distinguish rate-limit errors from other TTS failures. @@ -229,6 +231,8 @@ async function generateLemonadeTTS( /\/$/, '', ); + const modelId = config.modelId || TTS_PROVIDERS['lemonade-tts'].defaultModelId; + const voice = resolveLemonadeVoice(config, text, modelId); const response = await fetch(`${baseUrl}/audio/speech`, { method: 'POST', @@ -237,9 +241,9 @@ async function generateLemonadeTTS( ...getBackendAuthHeaders(config.apiKey), }, body: JSON.stringify({ - model: config.modelId || TTS_PROVIDERS['lemonade-tts'].defaultModelId, + model: modelId, input: text, - voice: config.voice || 'af_heart', + voice, speed: config.speed || 1.0, response_format: config.format || 'wav', }), @@ -257,6 +261,69 @@ async function generateLemonadeTTS( }; } +function resolveLemonadeVoice(config: TTSModelConfig, text: string, modelId: string): string { + const fallbackVoice = config.voice || 'af_heart'; + if (modelId !== 'kokoro-v1' || !isLemonadeAutoMatchVoiceLanguageEnabled(config)) { + return fallbackVoice; + } + + if (!shouldPreferChineseLemonadeVoice(text)) { + return fallbackVoice; + } + + const voices = TTS_PROVIDERS['lemonade-tts'].voices; + const selectedVoice = voices.find((voice) => voice.id === fallbackVoice); + + if (!selectedVoice) { + return inferLemonadeFallbackVoice(voices, 'zh-CN') || fallbackVoice; + } + + if (selectedVoice.language === 'zh-CN') { + return fallbackVoice; + } + + return inferLemonadeFallbackVoice(voices, 'zh-CN', selectedVoice.gender) || fallbackVoice; +} + +function isLemonadeAutoMatchVoiceLanguageEnabled(config: TTSModelConfig): boolean { + return config.providerOptions?.autoMatchVoiceLanguage !== false; +} + +function shouldPreferChineseLemonadeVoice(text: string): boolean { + const cjkCount = (text.match(/[\u4e00-\u9fff\u3400-\u4dbf]/g) || []).length; + const ratio = text.length > 0 ? cjkCount / text.length : 0; + return ratio > CJK_LANG_THRESHOLD; +} + +function inferLemonadeFallbackVoice( + voices: Array<{ id: string; language: string; gender?: 'male' | 'female' | 'neutral' }>, + language: 'zh-CN' | 'en-US', + preferredGender?: 'male' | 'female' | 'neutral', +): string | undefined { + const matchingVoices = voices.filter((voice) => voice.language === language); + if (matchingVoices.length === 0) { + return undefined; + } + + const curatedDefaults: Record<'zh-CN' | 'en-US', string> = { + 'zh-CN': 'zf_xiaoxiao', + 'en-US': 'af_heart', + }; + const curatedVoice = matchingVoices.find((voice) => voice.id === curatedDefaults[language]); + if (curatedVoice) { + return curatedVoice.id; + } + + if (preferredGender) { + const sameGenderVoice = matchingVoices.find((voice) => voice.gender === preferredGender); + if (sameGenderVoice) { + return sameGenderVoice.id; + } + } + + return matchingVoices[0]?.id; +} + /** * VoxCPM2 TTS implementation. * diff --git a/lib/generation/json-repair.ts b/lib/generation/json-repair.ts index 89f7fa0b4..a754c00fe 100644 --- a/lib/generation/json-repair.ts +++ b/lib/generation/json-repair.ts @@ -6,6 +6,32 @@ import { jsonrepair } from 'jsonrepair'; import { createLogger } from '@/lib/logger'; const log = createLogger('Generation'); +function repairQuotedPropertyFragments(jsonStr: string): string { + return jsonStr.replace( + /([,{]\s*)"([A-Za-z_][A-Za-z0-9_]*)\s*:\s*(true|false|null|[+-]?\d+(?:\.\d+)?)"(?=\s*[,}])/g, + (_match, prefix, key, value) => `${prefix}"${key}": ${value}`, + ); +} + +function logJsonParseError(stage: string, jsonStr: string, error: unknown): void { + const message = error instanceof Error ? error.message : String(error); + const positionMatch = message.match(/position\s+(\d+)/i); + const position = positionMatch ? Number(positionMatch[1]) : undefined; + + if (typeof position === 'number' && Number.isFinite(position)) { + const start = Math.max(0, position - 120); + const end = Math.min(jsonStr.length, position + 120); + log.warn( + `${stage} parse error at position ${position}: ${message}. Context: ${jsonStr + .slice(start, end) + .replace(/\n/g, '\\n')}`, + ); + return; + } + + log.warn(`${stage} parse error: ${message}`); +} + export function parseJsonResponse(response: string): T | null { // Strategy 1: Try to extract JSON from markdown code blocks (may have multiple) const codeBlockMatches = response.matchAll(/```(?:json)?\s*([\s\S]*?)```/g); @@ -90,6 +116,10 @@ export function parseJsonResponse(response: string): T | null { log.error('Failed to parse JSON from response'); log.error('Raw response (first 500 chars):', response.substring(0, 500)); + log.error( + 'Raw response (last 500 chars):', + response.substring(Math.max(0, response.length - 500)), + ); return null; } @@ -101,7 +131,8 @@ export function tryParseJson(jsonStr: string): T | null { // Attempt 1: Try parsing as-is try { return JSON.parse(jsonStr) as T; - } catch { + } catch (error) { + logJsonParseError('Attempt 1', jsonStr, error); // Continue to fix attempts } @@ -109,6 +140,13 @@ export function tryParseJson(jsonStr: string): T | null { try { let fixed = jsonStr; + // Fix 0: Recover malformed property fragments that were accidentally + // emitted as standalone strings inside an object, such as: + // `"height: 76"` -> `"height": 76` + // `"fixedRatio: false"` -> `"fixedRatio": false` + // The object-context prefix/suffix guards keep valid JSON strings intact. + fixed = repairQuotedPropertyFragments(fixed); + // Fix 1: Handle LaTeX-style escapes that break JSON (e.g., \frac, \left, \right, \times, etc.) // These are common in math content and need to be double-escaped // Match backslash followed by letters (LaTeX commands) inside strings, @@ -152,7 +190,8 @@ export function tryParseJson(jsonStr: string): T | null { } return JSON.parse(fixed) as T; - } catch { + } catch (error) { + logJsonParseError('Attempt 2', jsonStr, error); // Continue to next attempt } @@ -160,7 +199,8 @@ export function tryParseJson(jsonStr: string): T | null { try { const repaired = jsonrepair(jsonStr); return JSON.parse(repaired) as T; - } catch { + } catch (error) { + logJsonParseError('Attempt 3', jsonStr, error); // Continue to next attempt } @@ -183,7 +223,8 @@ export function tryParseJson(jsonStr: string): T | null { }); return JSON.parse(fixed) as T; - } catch { + } catch (error) { + logJsonParseError('Attempt 4', jsonStr, error); return null; } } diff --git a/lib/i18n/locales/en-US.json b/lib/i18n/locales/en-US.json index 19b7282f2..b1a045b78 100644 --- a/lib/i18n/locales/en-US.json +++ b/lib/i18n/locales/en-US.json @@ -682,6 +682,8 @@ "voiceBaseUrlRequired": "Base URL required", "ttsTestTextPlaceholder": "Enter text to convert", "ttsTestTextDefault": "Hello, this is a test speech.", + "lemonadeAutoMatchVoiceLanguageLabel": "Auto-match voice language", + "lemonadeAutoMatchVoiceLanguageDescription": "For kokoro-v1, automatically switch to a same-language voice when the selected voice language does not match the text.", "startRecording": "Start Recording", "stopRecording": "Stop Recording", "recording": "Recording...", diff --git a/lib/i18n/locales/zh-CN.json b/lib/i18n/locales/zh-CN.json index 84471f846..8ca3fce97 100644 --- a/lib/i18n/locales/zh-CN.json +++ b/lib/i18n/locales/zh-CN.json @@ -682,6 +682,8 @@ "voiceBaseUrlRequired": "需要 Base URL", "ttsTestTextPlaceholder": "输入要转换的文本", "ttsTestTextDefault": "你好,这是一段测试语音。", + "lemonadeAutoMatchVoiceLanguageLabel": "自动匹配音色语言", + "lemonadeAutoMatchVoiceLanguageDescription": "仅对 kokoro-v1 生效。当所选音色语言与文本不一致时,自动切换到同语言音色。", "startRecording": "开始录音", "stopRecording": "停止录音", "recording": "录音中...", diff --git a/lib/media/image-providers.ts b/lib/media/image-providers.ts index a2b73b1c6..6a8ea817f 100644 --- a/lib/media/image-providers.ts +++ b/lib/media/image-providers.ts @@ -126,7 +126,10 @@ export const IMAGE_PROVIDERS: Record = { requiresApiKey: false, defaultBaseUrl: 'http://localhost:13305/v1', icon: '/logos/lemonade.svg', - models: [{ id: 'sd-cpp', name: 'Stable Diffusion (sd-cpp)' }], + models: [ + { id: 'Qwen-Image-GGUF', name: 'Qwen Image GGUF' }, + { id: 'sd-cpp', name: 'Stable Diffusion (sd-cpp)' }, + ], supportedAspectRatios: ['16:9', '4:3', '1:1', '9:16'], maxResolution: { width: 1024, height: 1024 }, }, diff --git a/lib/server/resolve-model.ts b/lib/server/resolve-model.ts index 8887ae43b..8553176be 100644 --- a/lib/server/resolve-model.ts +++ b/lib/server/resolve-model.ts @@ -16,8 +16,12 @@ export interface ResolvedModel extends ModelWithInfo { modelString: string; /** Resolved provider ID (e.g. "openai", "ollama") */ providerId: string; + /** Resolved model ID (e.g. "gpt-4o-mini") */ + modelId: string; /** Effective API key after server-side fallback resolution */ apiKey: string; + /** Effective base URL after server/client resolution */ + baseUrl?: string; /** Optional per-request thinking configuration from the client. */ thinkingConfig?: ThinkingConfig; } @@ -67,7 +71,9 @@ export async function resolveModel(params: { modelInfo, modelString, providerId, + modelId, apiKey, + baseUrl, thinkingConfig: params.thinkingConfig, }; } diff --git a/lib/store/settings.ts b/lib/store/settings.ts index 3580adb86..89f71463e 100644 --- a/lib/store/settings.ts +++ b/lib/store/settings.ts @@ -356,7 +356,13 @@ const getDefaultAudioConfig = () => ({ 'doubao-tts': { apiKey: '', baseUrl: '', enabled: false }, 'elevenlabs-tts': { apiKey: '', baseUrl: '', enabled: false }, 'minimax-tts': { apiKey: '', baseUrl: '', modelId: 'speech-2.8-hd', enabled: false }, - 'lemonade-tts': { apiKey: '', baseUrl: '', modelId: 'kokoro-v1', enabled: false }, + 'lemonade-tts': { + apiKey: '', + baseUrl: '', + modelId: 'kokoro-v1', + enabled: false, + providerOptions: { autoMatchVoiceLanguage: true }, + }, 'browser-native-tts': { apiKey: '', baseUrl: '', enabled: true }, } as Record< TTSProviderId, @@ -497,6 +503,13 @@ function ensureBuiltInAudioProviders(state: Partial): void { ...(voxcpmConfig.providerOptions || {}), }; } + const lemonadeConfig = state.ttsProvidersConfig['lemonade-tts']; + if (lemonadeConfig) { + lemonadeConfig.providerOptions = { + autoMatchVoiceLanguage: true, + ...(lemonadeConfig.providerOptions || {}), + }; + } } if (state.asrProvidersConfig) { diff --git a/lib/types/provider.ts b/lib/types/provider.ts index 77c929f9e..bbbe58b6b 100644 --- a/lib/types/provider.ts +++ b/lib/types/provider.ts @@ -60,7 +60,8 @@ export type ThinkingRequestAdapter = | 'doubao' | 'openrouter' | 'hunyuan' - | 'xiaomi'; + | 'xiaomi' + | 'lemonade'; /** * Describes a model's thinking/reasoning API control capability. diff --git a/tests/ai/openai-provider.test.ts b/tests/ai/openai-provider.test.ts index 949d8c074..def4781f6 100644 --- a/tests/ai/openai-provider.test.ts +++ b/tests/ai/openai-provider.test.ts @@ -16,7 +16,7 @@ import type { ProviderId } from '@/lib/types/provider'; async function captureInjectedRequestBody( providerId: ProviderId, modelId: string, - thinkingConfig: Record, + thinkingConfig?: Record, ) { const originalFetch = globalThis.fetch; const globalRecord = globalThis as Record; @@ -145,6 +145,12 @@ describe('OpenAI provider defaults', () => { { mode: 'enabled', effort: 'high' }, { chat_template_kwargs: { reasoning_effort: 'high' } }, ], + [ + 'lemonade', + 'Qwen3.5-4B-GGUF', + { mode: 'enabled', budgetTokens: 4096 }, + { chat_template_kwargs: { enable_thinking: true, thinking_budget: 4096 } }, + ], ] as const)( 'injects %s thinking params into the OpenAI-compatible request body', async (providerId, modelId, thinkingConfig, expected) => { @@ -152,4 +158,81 @@ describe('OpenAI provider defaults', () => { expect(body).toMatchObject(expected); }, ); + + it('disables Lemonade thinking by default for recognized local reasoning models', async () => { + const body = await captureInjectedRequestBody('lemonade', 'Qwen3.5-4B-GGUF'); + + expect(body).toMatchObject({ + chat_template_kwargs: { enable_thinking: false }, + }); + }); + + it('recognizes manually added Lemonade reasoning model IDs', async () => { + const body = await captureInjectedRequestBody('lemonade', 'custom-gpt-oss-20b-q4'); + + expect(body).toMatchObject({ + chat_template_kwargs: { enable_thinking: false }, + }); + }); + + it('disables Lemonade thinking by default for non-catalog local models too', async () => { + const body = await captureInjectedRequestBody('lemonade', 'Gemma-4-26B-A4B-it-GGUF'); + + expect(body).toMatchObject({ + chat_template_kwargs: { enable_thinking: false }, + }); + }); + + it('strips unsupported Lemonade stream_options while preserving thinking overrides', async () => { + const originalFetch = globalThis.fetch; + const globalRecord = globalThis as Record; + const originalThinkingContext = globalRecord.__thinkingContext; + const fetchMock = vi.fn(async (_url: RequestInfo | URL, _init?: RequestInit) => { + return new Response(JSON.stringify({ ok: true }), { + status: 200, + headers: { 'content-type': 'application/json' }, + }); + }); + + try { + globalThis.fetch = fetchMock as typeof fetch; + globalRecord.__thinkingContext = { + getStore: () => ({ mode: 'disabled' }), + }; + + getModel({ + providerId: 'lemonade', + modelId: 'Gemma-4-26B-A4B-it-GGUF', + apiKey: '', + }); + + const lastCall = openAiMock.createOpenAI.mock.calls.at(-1); + const options = lastCall?.[0] as { fetch?: typeof fetch } | undefined; + + await options?.fetch?.('https://example.test/v1/chat/completions', { + method: 'POST', + body: JSON.stringify({ + model: 'Gemma-4-26B-A4B-it-GGUF', + messages: [{ role: 'user', content: 'hi' }], + stream: true, + stream_options: { include_usage: true }, + }), + }); + + const init = fetchMock.mock.calls[0]?.[1] as RequestInit; + const body = JSON.parse(init.body as string); + + expect(body.stream_options).toBeUndefined(); + expect(body).toMatchObject({ + chat_template_kwargs: { enable_thinking: false }, + }); + } finally { + globalThis.fetch = originalFetch; + if (originalThinkingContext === undefined) { + delete globalRecord.__thinkingContext; + } else { + globalRecord.__thinkingContext = originalThinkingContext; + } + } + }); }); diff --git a/tests/ai/thinking-config.test.ts b/tests/ai/thinking-config.test.ts index 283d41a07..9dd980485 100644 --- a/tests/ai/thinking-config.test.ts +++ b/tests/ai/thinking-config.test.ts @@ -152,6 +152,21 @@ describe('thinking config normalization', () => { expect(thinking?.effortValues).toEqual(['none', 'low', 'high']); }); + it('normalizes Lemonade reasoning models as disabled-by-default token budgets', () => { + const thinking = getThinking('lemonade', 'Qwen3.5-4B-GGUF'); + + expect(supportsConfigurableThinking(thinking)).toBe(true); + expect(thinking?.requestAdapter).toBe('lemonade'); + expect(getDefaultThinkingConfig(thinking)).toEqual({ + mode: 'disabled', + budgetTokens: undefined, + }); + expect(normalizeThinkingConfig(thinking, { mode: 'enabled', budgetTokens: 4096 })).toEqual({ + mode: 'enabled', + budgetTokens: 4096, + }); + }); + it('normalizes Doubao Seed 2.0 thinking as reasoning effort levels', () => { const thinking = getThinking('doubao', 'doubao-seed-2-0-pro-260215'); diff --git a/tests/audio/lemonade-tts.test.ts b/tests/audio/lemonade-tts.test.ts index 9580cbcf4..e68f35677 100644 --- a/tests/audio/lemonade-tts.test.ts +++ b/tests/audio/lemonade-tts.test.ts @@ -69,6 +69,65 @@ describe('Lemonade TTS', () => { expect(body.voice).toBe('af_heart'); }); + it('switches kokoro voice to zh-CN when Chinese text is paired with an English voice', async () => { + mockFetch.mockResolvedValueOnce({ + ok: true, + arrayBuffer: async () => wavBytes(), + headers: { get: () => 'audio/wav' }, + }); + + await generateTTS({ providerId: 'lemonade-tts', voice: 'af_heart' }, '给我讲讲 Python'); + + const body = JSON.parse(mockFetch.mock.calls[0][1].body); + expect(body.voice).toBe('zf_xiaoxiao'); + }); + + it('keeps the selected voice when its language already matches the text', async () => { + mockFetch.mockResolvedValueOnce({ + ok: true, + arrayBuffer: async () => wavBytes(), + headers: { get: () => 'audio/wav' }, + }); + + await generateTTS({ providerId: 'lemonade-tts', voice: 'zf_xiaoxiao' }, '给我讲讲 Python'); + + const body = JSON.parse(mockFetch.mock.calls[0][1].body); + expect(body.voice).toBe('zf_xiaoxiao'); + }); + + it('can disable automatic language matching via providerOptions', async () => { + mockFetch.mockResolvedValueOnce({ + ok: true, + arrayBuffer: async () => wavBytes(), + headers: { get: () => 'audio/wav' }, + }); + + await generateTTS( + { + providerId: 'lemonade-tts', + voice: 'af_heart', + providerOptions: { autoMatchVoiceLanguage: false }, + }, + '给我讲讲 Python', + ); + + const body = JSON.parse(mockFetch.mock.calls[0][1].body); + expect(body.voice).toBe('af_heart'); + }); + + it('does not auto-switch non-Chinese Kokoro voices', async () => { + mockFetch.mockResolvedValueOnce({ + ok: true, + arrayBuffer: async () => wavBytes(), + headers: { get: () => 'audio/wav' }, + }); + + await generateTTS({ providerId: 'lemonade-tts', voice: 'jf_alpha' }, 'こんにちは、Python を学ぼう'); + + const body = JSON.parse(mockFetch.mock.calls[0][1].body); + expect(body.voice).toBe('jf_alpha'); + }); + it('does not require an API key (keyless provider)', async () => { mockFetch.mockResolvedValueOnce({ ok: true, diff --git a/tests/generation/json-repair.test.ts b/tests/generation/json-repair.test.ts new file mode 100644 index 000000000..a401982ab --- /dev/null +++ b/tests/generation/json-repair.test.ts @@ -0,0 +1,57 @@ +import { describe, expect, it } from 'vitest'; + +import { parseJsonResponse } from '@/lib/generation/json-repair'; + +describe('json-repair targeted fixes', () => { + it('repairs quoted key-value fragments such as "height: 76"', () => { + const raw = `{ + "background": { + "type": "solid", + "color": "#ffffff" + }, + "elements": [ + { + "id": "code_text", + "type": "text", + "left": 80, + "top": 420, + "width": 840, + "height: 76", + "content": "

age = 25

", + "defaultFontName": "", + "defaultColor": "#333333" + } + ] +}`; + + const parsed = parseJsonResponse<{ + elements: Array<{ height: number; content: string }>; + }>(raw); + + expect(parsed).not.toBeNull(); + expect(parsed?.elements[0]?.height).toBe(76); + expect(parsed?.elements[0]?.content).toContain('age = 25'); + }); + + it('repairs boolean property fragments without touching valid string values', () => { + const raw = `{ + "elements": [ + { + "id": "shape_1", + "fixedRatio: false", + "height: 58", + "content": "

literal text: height: 58

" + } + ] +}`; + + const parsed = parseJsonResponse<{ + elements: Array<{ fixedRatio: boolean; height: number; content: string }>; + }>(raw); + + expect(parsed).not.toBeNull(); + expect(parsed?.elements[0]?.fixedRatio).toBe(false); + expect(parsed?.elements[0]?.height).toBe(58); + expect(parsed?.elements[0]?.content).toBe('

literal text: height: 58

'); + }); +}); From ac5af8a6068e5c1943f2840fca1bed516ff04ed2 Mon Sep 17 00:00:00 2001 From: yangshen <1322568757@qq.com> Date: Thu, 7 May 2026 19:15:23 +0800 Subject: [PATCH 6/9] style: format lemonade tts files --- components/settings/tts-settings.tsx | 4 +--- tests/audio/lemonade-tts.test.ts | 5 ++++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/components/settings/tts-settings.tsx b/components/settings/tts-settings.tsx index b65616290..06bf44de7 100644 --- a/components/settings/tts-settings.tsx +++ b/components/settings/tts-settings.tsx @@ -528,9 +528,7 @@ export function TTSSettings({ selectedProviderId }: TTSSettingsProps) {
- +

{t('settings.lemonadeAutoMatchVoiceLanguageDescription')}

diff --git a/tests/audio/lemonade-tts.test.ts b/tests/audio/lemonade-tts.test.ts index e68f35677..b42236738 100644 --- a/tests/audio/lemonade-tts.test.ts +++ b/tests/audio/lemonade-tts.test.ts @@ -122,7 +122,10 @@ describe('Lemonade TTS', () => { headers: { get: () => 'audio/wav' }, }); - await generateTTS({ providerId: 'lemonade-tts', voice: 'jf_alpha' }, 'こんにちは、Python を学ぼう'); + await generateTTS( + { providerId: 'lemonade-tts', voice: 'jf_alpha' }, + 'こんにちは、Python を学ぼう', + ); const body = JSON.parse(mockFetch.mock.calls[0][1].body); expect(body.voice).toBe('jf_alpha'); From f267b26869585c4b94e77c4e728c075c2e4b35e4 Mon Sep 17 00:00:00 2001 From: yangshen <1322568757@qq.com> Date: Thu, 7 May 2026 21:37:05 +0800 Subject: [PATCH 7/9] i18n: add lemonade voice language labels --- lib/i18n/locales/ar-SA.json | 2 ++ lib/i18n/locales/ja-JP.json | 2 ++ lib/i18n/locales/ru-RU.json | 2 ++ lib/i18n/locales/zh-TW.json | 2 ++ 4 files changed, 8 insertions(+) diff --git a/lib/i18n/locales/ar-SA.json b/lib/i18n/locales/ar-SA.json index 92c3aacc5..d1dff77d7 100644 --- a/lib/i18n/locales/ar-SA.json +++ b/lib/i18n/locales/ar-SA.json @@ -605,6 +605,8 @@ "providerElevenLabsTTS": "ElevenLabs TTS", "providerMiniMaxTTS": "MiniMax TTS", "providerLemonadeTTS": "Lemonade TTS (محلي)", + "lemonadeAutoMatchVoiceLanguageLabel": "مطابقة لغة الصوت تلقائيًا", + "lemonadeAutoMatchVoiceLanguageDescription": "ينطبق على kokoro-v1 فقط. عند عدم تطابق لغة الصوت المحدد مع النص، يتم التبديل تلقائيًا إلى صوت باللغة نفسها.", "providerBrowserNativeTTS": "تحويل النص إلى كلام المدمج في المتصفح", "voxcpmBackend": "الخلفية", "voxcpmBaseUrlPending": "أدخل Base URL لإنشاء عنوان الطلب", diff --git a/lib/i18n/locales/ja-JP.json b/lib/i18n/locales/ja-JP.json index bd26eac12..e0af2aae6 100644 --- a/lib/i18n/locales/ja-JP.json +++ b/lib/i18n/locales/ja-JP.json @@ -605,6 +605,8 @@ "providerElevenLabsTTS": "ElevenLabs TTS", "providerMiniMaxTTS": "MiniMax TTS", "providerLemonadeTTS": "Lemonade TTS(ローカル)", + "lemonadeAutoMatchVoiceLanguageLabel": "音声言語を自動で合わせる", + "lemonadeAutoMatchVoiceLanguageDescription": "kokoro-v1 のみ有効です。選択した音声の言語が本文と一致しない場合、同じ言語の音声に自動で切り替えます。", "providerBrowserNativeTTS": "ブラウザネイティブTTS", "voxcpmBackend": "バックエンド", "voxcpmBaseUrlPending": "Base URL を入力するとリクエスト URL が生成されます", diff --git a/lib/i18n/locales/ru-RU.json b/lib/i18n/locales/ru-RU.json index fb5eca058..f7d621e37 100644 --- a/lib/i18n/locales/ru-RU.json +++ b/lib/i18n/locales/ru-RU.json @@ -605,6 +605,8 @@ "providerElevenLabsTTS": "ElevenLabs TTS", "providerMiniMaxTTS": "MiniMax TTS", "providerLemonadeTTS": "Lemonade TTS (Локальный)", + "lemonadeAutoMatchVoiceLanguageLabel": "Автоподбор языка голоса", + "lemonadeAutoMatchVoiceLanguageDescription": "Работает только для kokoro-v1. Если язык выбранного голоса не совпадает с текстом, система автоматически переключится на голос того же языка.", "providerBrowserNativeTTS": "Встроенный TTS браузера", "voxcpmBackend": "Бэкенд", "voxcpmBaseUrlPending": "Введите Base URL, чтобы сформировать URL запроса", diff --git a/lib/i18n/locales/zh-TW.json b/lib/i18n/locales/zh-TW.json index d7b263e53..e9ae5487c 100644 --- a/lib/i18n/locales/zh-TW.json +++ b/lib/i18n/locales/zh-TW.json @@ -585,6 +585,8 @@ "providerElevenLabsTTS": "ElevenLabs TTS", "providerMiniMaxTTS": "MiniMax TTS", "providerLemonadeTTS": "Lemonade TTS(本機)", + "lemonadeAutoMatchVoiceLanguageLabel": "自動匹配音色語言", + "lemonadeAutoMatchVoiceLanguageDescription": "僅對 kokoro-v1 生效。當所選音色語言與文字不一致時,會自動切換為相同語言的音色。", "providerBrowserNativeTTS": "瀏覽器原生 TTS", "providerOpenAIWhisper": "OpenAI ASR (gpt-4o-mini-transcribe)", "providerBrowserNative": "瀏覽器原生 ASR", From 52a06b4542152d65bdc677b989fc3d046a77a00f Mon Sep 17 00:00:00 2001 From: yangshen <1322568757@qq.com> Date: Thu, 7 May 2026 23:16:57 +0800 Subject: [PATCH 8/9] fix(lemonade): align image default model and harden wav detection --- lib/audio/asr-providers.ts | 23 +++++++++++++++++--- lib/media/adapters/lemonade-image-adapter.ts | 2 +- tests/audio/lemonade-asr.test.ts | 13 +++++++++++ tests/media/lemonade-image-adapter.test.ts | 2 +- 4 files changed, 35 insertions(+), 5 deletions(-) diff --git a/lib/audio/asr-providers.ts b/lib/audio/asr-providers.ts index dd5a8deb9..7de04e653 100644 --- a/lib/audio/asr-providers.ts +++ b/lib/audio/asr-providers.ts @@ -208,7 +208,7 @@ async function transcribeLemonadeASR( ); const audioBlob = await toAudioBlob(audioBuffer); - if (!isWavAudio(audioBlob)) { + if (!(await isWavAudio(audioBlob))) { throw new Error( 'Lemonade ASR currently supports WAV input only. Recordings should be converted to WAV before upload.', ); @@ -254,8 +254,17 @@ async function toAudioBlob(audioBuffer: Buffer | Blob): Promise { throw new Error('Invalid audio buffer type'); } -function isWavAudio(blob: Blob): boolean { - return blob.type.includes('audio/wav') || blob.type.includes('audio/x-wav'); +async function isWavAudio(blob: Blob): Promise { + if (blob.type.includes('audio/wav') || blob.type.includes('audio/x-wav')) { + return true; + } + + if (blob instanceof File && /\.wav$/i.test(blob.name)) { + return true; + } + + const header = await blob.slice(0, 12).arrayBuffer(); + return detectWavBytes(new Uint8Array(header)); } function detectWavBuffer(buffer: Buffer): boolean { @@ -266,6 +275,14 @@ function detectWavBuffer(buffer: Buffer): boolean { ); } +function detectWavBytes(bytes: Uint8Array): boolean { + return ( + bytes.byteLength >= 12 && + String.fromCharCode(...bytes.slice(0, 4)) === 'RIFF' && + String.fromCharCode(...bytes.slice(8, 12)) === 'WAVE' + ); +} + function getOptionalBearerAuthHeaders(apiKey?: string): Record { const key = apiKey?.trim(); return key ? { Authorization: `Bearer ${key}` } : {}; diff --git a/lib/media/adapters/lemonade-image-adapter.ts b/lib/media/adapters/lemonade-image-adapter.ts index 75e809f02..82d9ed83b 100644 --- a/lib/media/adapters/lemonade-image-adapter.ts +++ b/lib/media/adapters/lemonade-image-adapter.ts @@ -10,7 +10,7 @@ import type { ImageGenerationResult, } from '../types'; -const DEFAULT_MODEL = 'sd-cpp'; +const DEFAULT_MODEL = 'Qwen-Image-GGUF'; const DEFAULT_BASE_URL = 'http://localhost:13305/v1'; function normalizeBaseUrl(baseUrl?: string): string { diff --git a/tests/audio/lemonade-asr.test.ts b/tests/audio/lemonade-asr.test.ts index d280d3eea..674ea19fd 100644 --- a/tests/audio/lemonade-asr.test.ts +++ b/tests/audio/lemonade-asr.test.ts @@ -69,6 +69,19 @@ describe('Lemonade ASR', () => { expect(mockFetch).not.toHaveBeenCalled(); }); + it('accepts WAV files even when the MIME type is missing', async () => { + mockFetch.mockResolvedValueOnce({ + ok: true, + json: async () => ({ text: 'hello' }), + }); + + const audioFile = new File([wavBuffer()], 'recording.wav'); + const result = await transcribeAudio({ providerId: 'lemonade-asr' }, audioFile); + + expect(result).toEqual({ text: 'hello' }); + expect(mockFetch).toHaveBeenCalledTimes(1); + }); + it('returns empty text gracefully when upstream reports empty audio', async () => { mockFetch.mockResolvedValueOnce({ ok: false, diff --git a/tests/media/lemonade-image-adapter.test.ts b/tests/media/lemonade-image-adapter.test.ts index 63ff4990d..43d291c14 100644 --- a/tests/media/lemonade-image-adapter.test.ts +++ b/tests/media/lemonade-image-adapter.test.ts @@ -32,7 +32,7 @@ describe('lemonade-image-adapter', () => { ); const body = JSON.parse(mockFetch.mock.calls[0][1].body); expect(body).toEqual({ - model: 'sd-cpp', + model: 'Qwen-Image-GGUF', prompt: 'a fox', n: 1, size: '768x768', From 42be46877051ed4e431b04f0c2a6f1c7dc423496 Mon Sep 17 00:00:00 2001 From: yangshen <1322568757@qq.com> Date: Fri, 8 May 2026 00:08:01 +0800 Subject: [PATCH 9/9] test(lemonade): fix File BlobPart typing --- tests/audio/lemonade-asr.test.ts | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/audio/lemonade-asr.test.ts b/tests/audio/lemonade-asr.test.ts index 674ea19fd..f53617272 100644 --- a/tests/audio/lemonade-asr.test.ts +++ b/tests/audio/lemonade-asr.test.ts @@ -12,6 +12,13 @@ function wavBuffer(): Buffer { return buf; } +function wavArrayBuffer(): ArrayBuffer { + const buffer = wavBuffer(); + const arrayBuffer = new ArrayBuffer(buffer.byteLength); + new Uint8Array(arrayBuffer).set(buffer); + return arrayBuffer; +} + describe('Lemonade ASR', () => { beforeEach(() => { mockFetch.mockReset(); @@ -75,7 +82,7 @@ describe('Lemonade ASR', () => { json: async () => ({ text: 'hello' }), }); - const audioFile = new File([wavBuffer()], 'recording.wav'); + const audioFile = new File([wavArrayBuffer()], 'recording.wav'); const result = await transcribeAudio({ providerId: 'lemonade-asr' }, audioFile); expect(result).toEqual({ text: 'hello' });