Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,10 @@ XIAOMI_MODELS=
# OLLAMA_BASE_URL=http://localhost:11434/v1
# OLLAMA_MODELS=llama3.3,llama3.2,qwen2.5,mistral,gemma3

# Lemonade local server (OpenAI-compatible, no API key required)
# LEMONADE_BASE_URL=http://localhost:13305/v1
# LEMONADE_MODELS=Qwen3-0.6B-GGUF,Llama-3.2-1B-Instruct-Hybrid,Qwen2.5-VL-7B-Instruct

# --- TTS (Text-to-Speech) ----------------------------------------------------

TTS_OPENAI_API_KEY=
Expand All @@ -99,6 +103,9 @@ TTS_MINIMAX_BASE_URL=https://api.minimaxi.com
TTS_ELEVENLABS_API_KEY=
TTS_ELEVENLABS_BASE_URL=

# Lemonade TTS (local, no API key required)
# TTS_LEMONADE_BASE_URL=http://localhost:13305/v1

# --- ASR (Automatic Speech Recognition) --------------------------------------

ASR_OPENAI_API_KEY=
Expand All @@ -107,6 +114,9 @@ ASR_OPENAI_BASE_URL=
ASR_QWEN_API_KEY=
ASR_QWEN_BASE_URL=

# Lemonade ASR (local, WAV input only, no API key required)
# ASR_LEMONADE_BASE_URL=http://localhost:13305/v1

# --- PDF Processing -----------------------------------------------------------

PDF_UNPDF_API_KEY=
Expand Down Expand Up @@ -136,6 +146,9 @@ IMAGE_MINIMAX_BASE_URL=https://api.minimaxi.com
IMAGE_GROK_API_KEY=
IMAGE_GROK_BASE_URL=

# Lemonade image generation (local, no API key required)
# IMAGE_LEMONADE_BASE_URL=http://localhost:13305/v1

# --- Video Generation ---------------------------------------------------------

VIDEO_SEEDANCE_API_KEY=
Expand Down
9 changes: 7 additions & 2 deletions app/api/generate/image/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,11 @@
*/

import { NextRequest } from 'next/server';
import { generateImage, aspectRatioToDimensions } from '@/lib/media/image-providers';
import {
generateImage,
aspectRatioToDimensions,
IMAGE_PROVIDERS,
} from '@/lib/media/image-providers';
import { resolveImageApiKey, resolveImageBaseUrl } from '@/lib/server/provider-config';
import type { ImageProviderId, ImageGenerationOptions } from '@/lib/media/types';
import { createLogger } from '@/lib/logger';
Expand Down Expand Up @@ -50,7 +54,8 @@ export async function POST(request: NextRequest) {
const apiKey = clientBaseUrl
? clientApiKey || ''
: resolveImageApiKey(providerId, clientApiKey);
if (!apiKey) {
const provider = IMAGE_PROVIDERS[providerId];
if (provider?.requiresApiKey && !apiKey) {
return apiError(
'MISSING_API_KEY',
401,
Expand Down
6 changes: 1 addition & 5 deletions app/api/transcription/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,8 @@ export async function POST(req: NextRequest) {
: resolveASRBaseUrl(effectiveProviderId, baseUrl || undefined),
};

// Convert audio file to buffer
const arrayBuffer = await audioFile.arrayBuffer();
const buffer = Buffer.from(arrayBuffer);

// Transcribe using the provider system
const result = await transcribeAudio(config, buffer);
const result = await transcribeAudio(config, audioFile);

return apiSuccess({ text: result.text });
} catch (error) {
Expand Down
5 changes: 3 additions & 2 deletions app/api/verify-image-provider/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
*/

import { NextRequest } from 'next/server';
import { testImageConnectivity } from '@/lib/media/image-providers';
import { IMAGE_PROVIDERS, testImageConnectivity } from '@/lib/media/image-providers';
import { resolveImageApiKey, resolveImageBaseUrl } from '@/lib/server/provider-config';
import type { ImageProviderId } from '@/lib/media/types';
import { apiError, apiSuccess } from '@/lib/server/api-response';
Expand Down Expand Up @@ -43,7 +43,8 @@ export async function POST(request: NextRequest) {
: resolveImageApiKey(providerId, clientApiKey);
const baseUrl = clientBaseUrl ? clientBaseUrl : resolveImageBaseUrl(providerId, clientBaseUrl);

if (!apiKey) {
const provider = IMAGE_PROVIDERS[providerId];
if (provider?.requiresApiKey && !apiKey) {
return apiError('MISSING_API_KEY', 400, 'No API key configured');
}

Expand Down
8 changes: 6 additions & 2 deletions components/settings/asr-settings.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import { Mic, MicOff, CheckCircle2, XCircle, Eye, EyeOff, Plus, Loader2 } from '
import { cn } from '@/lib/utils';
import { toast } from 'sonner';
import { createLogger } from '@/lib/logger';
import { normalizeASRUploadAudio } from '@/lib/audio/wav-utils';

const log = createLogger('ASRSettings');

Expand All @@ -52,6 +53,7 @@ export function ASRSettings({ selectedProviderId }: ASRSettingsProps) {
const requiresApiKey = isCustom
? !!providerConfig?.requiresApiKey
: !!asrProvider?.requiresApiKey;
const isKeylessLocalProvider = !isCustom && !requiresApiKey && !!asrProvider?.defaultBaseUrl;

const [showApiKey, setShowApiKey] = useState(false);
const [showDeleteConfirm, setShowDeleteConfirm] = useState(false);
Expand Down Expand Up @@ -129,8 +131,9 @@ export function ASRSettings({ selectedProviderId }: ASRSettingsProps) {
stream.getTracks().forEach((track) => track.stop());
setIsProcessing(true);
const audioBlob = new Blob(audioChunks, { type: 'audio/webm' });
const uploadAudio = await normalizeASRUploadAudio(selectedProviderId, audioBlob);
const formData = new FormData();
formData.append('audio', audioBlob, 'recording.webm');
formData.append('audio', uploadAudio.blob, uploadAudio.fileName);
formData.append('providerId', selectedProviderId);
formData.append(
'modelId',
Expand Down Expand Up @@ -207,7 +210,7 @@ export function ASRSettings({ selectedProviderId }: ASRSettingsProps) {
)}

{/* API Key & Base URL */}
{(requiresApiKey || isServerConfigured || isCustom) && (
{(requiresApiKey || isServerConfigured || isCustom || isKeylessLocalProvider) && (
<>
<div className="grid grid-cols-2 gap-4">
<div className="space-y-2">
Expand Down Expand Up @@ -276,6 +279,7 @@ export function ASRSettings({ selectedProviderId }: ASRSettingsProps) {
} else {
switch (selectedProviderId) {
case 'openai-whisper':
case 'lemonade-asr':
endpointPath = '/audio/transcriptions';
break;
case 'qwen-asr':
Expand Down
6 changes: 5 additions & 1 deletion components/settings/audio-settings.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import azureVoicesData from '@/lib/audio/azure.json';
import { createLogger } from '@/lib/logger';
import { getVoxCPMVoiceOptions, useVoxCPMVoiceProfiles } from '@/lib/audio/voxcpm-voices';
import { normalizeVoxCPMBackend, voxCPMBackendSupportsReferenceAudio } from '@/lib/audio/voxcpm';
import { normalizeASRUploadAudio } from '@/lib/audio/wav-utils';

const log = createLogger('AudioSettings');

Expand All @@ -44,6 +45,7 @@ function getTTSProviderName(providerId: TTSProviderId, t: (key: string) => strin
'doubao-tts': t('settings.providerDoubaoTTS'),
'elevenlabs-tts': t('settings.providerElevenLabsTTS'),
'minimax-tts': t('settings.providerMiniMaxTTS'),
'lemonade-tts': t('settings.providerLemonadeTTS'),
'browser-native-tts': t('settings.providerBrowserNativeTTS'),
};
return names[providerId];
Expand All @@ -54,6 +56,7 @@ function getASRProviderName(providerId: ASRProviderId, t: (key: string) => strin
'openai-whisper': t('settings.providerOpenAIWhisper'),
'browser-native': t('settings.providerBrowserNative'),
'qwen-asr': t('settings.providerQwenASR'),
'lemonade-asr': t('settings.providerLemonadeASR'),
};
return names[providerId];
}
Expand Down Expand Up @@ -330,8 +333,9 @@ export function AudioSettings({ onSave }: AudioSettingsProps = {}) {
stream.getTracks().forEach((track) => track.stop());

const audioBlob = new Blob(audioChunks, { type: 'audio/webm' });
const uploadAudio = await normalizeASRUploadAudio(asrProviderId, audioBlob);
const formData = new FormData();
formData.append('audio', audioBlob, 'recording.webm');
formData.append('audio', uploadAudio.blob, uploadAudio.fileName);
formData.append('providerId', asrProviderId);
formData.append('language', asrLanguage);

Expand Down
5 changes: 4 additions & 1 deletion components/settings/image-settings.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ export function ImageSettings({ selectedProviderId }: ImageSettingsProps) {
[currentConfig?.customModels],
);
const isServerConfigured = !!currentConfig?.isServerConfigured;
const requiresApiKey = currentProvider?.requiresApiKey ?? true;

const handleApiKeyChange = (apiKey: string) => {
setImageProviderConfig(selectedProviderId, { apiKey });
Expand Down Expand Up @@ -179,7 +180,9 @@ export function ImageSettings({ selectedProviderId }: ImageSettingsProps) {
variant="outline"
size="sm"
onClick={handleTest}
disabled={testLoading || (!currentConfig?.apiKey && !isServerConfigured)}
disabled={
testLoading || (requiresApiKey && !currentConfig?.apiKey && !isServerConfigured)
}
className="gap-1.5"
>
{testLoading ? (
Expand Down
4 changes: 4 additions & 0 deletions components/settings/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ function getTTSProviderName(providerId: TTSProviderId, t: (key: string) => strin
'doubao-tts': t('settings.providerDoubaoTTS'),
'elevenlabs-tts': t('settings.providerElevenLabsTTS'),
'minimax-tts': t('settings.providerMiniMaxTTS'),
'lemonade-tts': t('settings.providerLemonadeTTS'),
'browser-native-tts': t('settings.providerBrowserNativeTTS'),
};
return names[providerId] || providerId;
Expand All @@ -159,6 +160,7 @@ function getASRProviderName(providerId: ASRProviderId, t: (key: string) => strin
'openai-whisper': t('settings.providerOpenAIWhisper'),
'browser-native': t('settings.providerBrowserNative'),
'qwen-asr': t('settings.providerQwenASR'),
'lemonade-asr': t('settings.providerLemonadeASR'),
};
return names[providerId] || providerId;
}
Expand All @@ -171,6 +173,7 @@ const IMAGE_PROVIDER_NAMES: Record<ImageProviderId, string> = {
'nano-banana': 'providerNanoBanana',
'minimax-image': 'providerMiniMaxImage',
'grok-image': 'providerGrokImage',
lemonade: 'providerLemonadeImage',
};

const IMAGE_PROVIDER_ICONS: Record<ImageProviderId, string> = {
Expand All @@ -180,6 +183,7 @@ const IMAGE_PROVIDER_ICONS: Record<ImageProviderId, string> = {
'nano-banana': '/logos/gemini.svg',
'minimax-image': '/logos/minimax.svg',
'grok-image': '/logos/grok.svg',
lemonade: '/logos/lemonade.svg',
};

const VIDEO_PROVIDER_NAMES: Record<VideoProviderId, string> = {
Expand Down
2 changes: 1 addition & 1 deletion components/settings/model-selector.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ export function ModelSelector({
([, config]) =>
(config.requiresApiKey
? config.apiKey || config.isServerConfigured
: config.isServerConfigured || config.baseUrl) &&
: config.isServerConfigured || config.baseUrl || config.defaultBaseUrl) &&
config.models.length >= 1 &&
(config.baseUrl || config.defaultBaseUrl || config.serverBaseUrl),
)
Expand Down
4 changes: 3 additions & 1 deletion components/settings/tts-settings.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ export function TTSSettings({ selectedProviderId }: TTSSettingsProps) {
const requiresApiKey = isCustom
? !!providerConfig?.requiresApiKey
: !!ttsProvider?.requiresApiKey;
const isKeylessLocalProvider = !isCustom && !requiresApiKey && !!ttsProvider?.defaultBaseUrl;

// When testing a non-active provider, use that provider's default voice
// instead of the active provider's voice (which may be incompatible).
Expand Down Expand Up @@ -192,6 +193,7 @@ export function TTSSettings({ selectedProviderId }: TTSSettingsProps) {
switch (selectedProviderId) {
case 'openai-tts':
case 'glm-tts':
case 'lemonade-tts':
return '/audio/speech';
case 'azure-tts':
return '/cognitiveservices/v1';
Expand Down Expand Up @@ -225,7 +227,7 @@ export function TTSSettings({ selectedProviderId }: TTSSettingsProps) {
)}

{/* API Key & Base URL */}
{(requiresApiKey || isServerConfigured || isCustom || isVoxCPM) &&
{(requiresApiKey || isServerConfigured || isCustom || isVoxCPM || isKeylessLocalProvider) &&
(isVoxCPM ? (
<div className="rounded-lg border border-border/60 bg-background px-3 py-2.5">
<div className="flex flex-col gap-2 md:flex-row md:items-end">
Expand Down
28 changes: 27 additions & 1 deletion lib/ai/providers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
* - Anthropic Claude (native)
* - Google Gemini (native)
* - MiniMax (Anthropic-compatible, recommended by official)
* - OpenAI-compatible providers (DeepSeek, Qwen, Kimi, GLM, SiliconFlow, Doubao, Tencent, Xiaomi, etc.)
* - OpenAI-compatible providers (DeepSeek, Qwen, Kimi, GLM, SiliconFlow, Doubao, Tencent, Xiaomi, Lemonade, etc.)
*
* Sources:
* - https://platform.openai.com/docs/models
Expand Down Expand Up @@ -980,6 +980,32 @@ export const PROVIDERS: Record<ProviderId, ProviderConfig> = {
},
],
},

lemonade: {
id: 'lemonade',
name: 'Lemonade',
type: 'openai',
defaultBaseUrl: 'http://localhost:13305/v1',
requiresApiKey: false,
icon: '/logos/lemonade.svg',
models: [
{
id: 'Qwen3-0.6B-GGUF',
name: 'Qwen3 0.6B GGUF',
capabilities: { streaming: true, tools: true, vision: false },
},
{
id: 'Llama-3.2-1B-Instruct-Hybrid',
name: 'Llama 3.2 1B Instruct Hybrid',
capabilities: { streaming: true, tools: true, vision: false },
},
{
id: 'Qwen2.5-VL-7B-Instruct',
name: 'Qwen2.5 VL 7B Instruct',
capabilities: { streaming: true, tools: true, vision: true },
},
],
},
};

applyModelMetadata(PROVIDERS);
Expand Down
81 changes: 81 additions & 0 deletions lib/audio/asr-providers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,9 @@ export async function transcribeAudio(
case 'qwen-asr':
return await transcribeQwenASR(config, audioBuffer);

case 'lemonade-asr':
return await transcribeLemonadeASR(config, audioBuffer);

default:
if (isCustomASRProvider(config.providerId)) {
return await transcribeOpenAIWhisper(config, audioBuffer);
Expand All @@ -190,6 +193,84 @@ export async function transcribeAudio(
}
}

/**
 * Lemonade ASR implementation (OpenAI-compatible multipart transcription).
 *
 * Lemonade currently supports WAV input and JSON response format.
 *
 * @param config - Provider settings; `baseUrl`, `modelId`, `language` and
 *   `apiKey` are the fields read here. Missing `baseUrl`/`modelId` fall back
 *   to the `lemonade-asr` entry of `ASR_PROVIDERS`.
 * @param audioBuffer - Audio to transcribe, as a Node `Buffer` or a `Blob`.
 * @returns The transcribed text. An empty string is returned when the server
 *   reports empty / too-short audio, or when the JSON payload carries no
 *   string `text` field.
 * @throws Error when the audio is not recognised as WAV, or when the server
 *   answers with a non-OK status for any other reason.
 */
async function transcribeLemonadeASR(
  config: ASRModelConfig,
  audioBuffer: Buffer | Blob,
): Promise<ASRTranscriptionResult> {
  // Strip every trailing slash so the endpoint path below is joined with
  // exactly one separator, even for inputs such as "http://host/v1//".
  const baseUrl = (config.baseUrl || ASR_PROVIDERS['lemonade-asr'].defaultBaseUrl || '').replace(
    /\/+$/,
    '',
  );

  const audioBlob = await toAudioBlob(audioBuffer);
  if (!isWavAudio(audioBlob)) {
    throw new Error(
      'Lemonade ASR currently supports WAV input only. Recordings should be converted to WAV before upload.',
    );
  }

  const formData = new FormData();
  formData.set('file', audioBlob, 'audio.wav');
  formData.set('model', config.modelId || ASR_PROVIDERS['lemonade-asr'].defaultModelId);
  formData.set('response_format', 'json');
  // 'auto' means "let the server decide", so the field is simply omitted.
  if (config.language && config.language !== 'auto') {
    formData.set('language', config.language);
  }

  const response = await fetch(`${baseUrl}/audio/transcriptions`, {
    method: 'POST',
    // The Authorization header is only sent when a non-blank key is configured.
    headers: getOptionalBearerAuthHeaders(config.apiKey),
    body: formData,
  });

  if (!response.ok) {
    const errorText = await response.text().catch(() => response.statusText);
    // Empty or too-short recordings are treated as "nothing was said", not as a failure.
    if (errorText.includes('audio is empty') || errorText.includes('too short')) {
      return { text: '' };
    }
    throw new Error(`Lemonade ASR API error: ${errorText || response.statusText}`);
  }

  // Treat the payload as untrusted: a `null` or non-object JSON body must not
  // crash on property access, it just yields an empty transcription.
  const data: unknown = await response.json();
  const text =
    typeof data === 'object' && data !== null ? (data as Record<string, unknown>).text : undefined;
  return { text: typeof text === 'string' ? text : '' };
}

/**
 * Normalises the accepted audio inputs to a `Blob`.
 *
 * A `Blob` is handed back untouched. A `Buffer` is copied into a fresh
 * `Blob` whose type is `audio/wav` when `detectWavBuffer` recognises the
 * bytes, and an empty string otherwise.
 *
 * @throws Error when the value is neither a `Blob` nor a `Buffer`.
 */
async function toAudioBlob(audioBuffer: Buffer | Blob): Promise<Blob> {
  if (audioBuffer instanceof Blob) {
    return audioBuffer;
  }
  if (!(audioBuffer instanceof Buffer)) {
    throw new Error('Invalid audio buffer type');
  }

  // Copy only this Buffer's window of the backing store; a Buffer may be a
  // view onto a larger ArrayBuffer.
  const { buffer: backingStore, byteOffset, byteLength } = audioBuffer;
  const bytes = backingStore.slice(byteOffset, byteOffset + byteLength) as ArrayBuffer;
  const mimeType = detectWavBuffer(audioBuffer) ? 'audio/wav' : '';
  return new Blob([bytes], { type: mimeType });
}

/**
 * Reports whether a blob's declared MIME type marks it as WAV audio.
 *
 * Only `blob.type` is inspected (substring match on `audio/wav` or
 * `audio/x-wav`); the blob's bytes are not read.
 */
function isWavAudio(blob: Blob): boolean {
  const declaredType = blob.type;
  const wavMarkers = ['audio/wav', 'audio/x-wav'];
  return wavMarkers.some((marker) => declaredType.includes(marker));
}

/**
 * Checks the leading bytes of a buffer for the WAV container signature:
 * ASCII `RIFF` at offsets 0-3 and ASCII `WAVE` at offsets 8-11.
 *
 * Buffers shorter than 12 bytes are never considered WAV.
 */
function detectWavBuffer(buffer: Buffer): boolean {
  if (buffer.byteLength < 12) {
    return false;
  }
  const riffTag = buffer.toString('ascii', 0, 4);
  const waveTag = buffer.toString('ascii', 8, 12);
  return riffTag === 'RIFF' && waveTag === 'WAVE';
}

/**
 * Builds request headers for an endpoint where authentication is optional.
 *
 * @param apiKey - Optional key; surrounding whitespace is ignored.
 * @returns `{ Authorization: 'Bearer <key>' }` when a non-blank key is given,
 *   otherwise an empty object.
 */
function getOptionalBearerAuthHeaders(apiKey?: string): Record<string, string> {
  const headers: Record<string, string> = {};
  const token = apiKey?.trim();
  if (token) {
    headers.Authorization = `Bearer ${token}`;
  }
  return headers;
}

/**
* OpenAI Whisper implementation (using Vercel AI SDK)
*/
Expand Down
Loading
Loading