diff --git a/.env.example b/.env.example
index b854b653..f7405a90 100644
--- a/.env.example
+++ b/.env.example
@@ -77,6 +77,9 @@
 ASR_OPENAI_BASE_URL=
 ASR_QWEN_API_KEY=
 ASR_QWEN_BASE_URL=
+ASR_AZURE_API_KEY=
+ASR_AZURE_BASE_URL=https://{region}.api.cognitive.microsoft.com
+
 # --- PDF Processing -----------------------------------------------------------
 PDF_UNPDF_API_KEY=
 
diff --git a/components/settings/audio-settings.tsx b/components/settings/audio-settings.tsx
index dd69eed2..85232d7e 100644
--- a/components/settings/audio-settings.tsx
+++ b/components/settings/audio-settings.tsx
@@ -48,6 +48,7 @@ function getASRProviderName(providerId: ASRProviderId, t: (key: string) => strin
     'openai-whisper': t('settings.providerOpenAIWhisper'),
     'browser-native': t('settings.providerBrowserNative'),
     'qwen-asr': t('settings.providerQwenASR'),
+    'azure-asr': t('settings.providerAzureASR'),
   };
   return names[providerId];
 }
diff --git a/components/settings/index.tsx b/components/settings/index.tsx
index 828cb693..f4c5bac0 100644
--- a/components/settings/index.tsx
+++ b/components/settings/index.tsx
@@ -132,6 +132,7 @@ function getASRProviderName(providerId: ASRProviderId, t: (key: string) => strin
     'openai-whisper': t('settings.providerOpenAIWhisper'),
     'browser-native': t('settings.providerBrowserNative'),
     'qwen-asr': t('settings.providerQwenASR'),
+    'azure-asr': t('settings.providerAzureASR'),
   };
   return names[providerId];
 }
diff --git a/lib/audio/asr-providers.ts b/lib/audio/asr-providers.ts
index 081febb1..9e1afff4 100644
--- a/lib/audio/asr-providers.ts
+++ b/lib/audio/asr-providers.ts
@@ -184,6 +184,9 @@ export async function transcribeAudio(
     case 'qwen-asr':
       return await transcribeQwenASR(config, audioBuffer);
 
+    case 'azure-asr':
+      return await transcribeAzureASR(config, audioBuffer);
+
     default:
       throw new Error(`Unsupported ASR provider: ${config.providerId}`);
   }
@@ -326,6 +329,90 @@
   return { text: transcribedText };
 }
 
+/**
+ * Azure STT implementation (Fast Transcription REST API)
+ * https://learn.microsoft.com/azure/ai-services/speech-service/fast-transcription-create
+ */
+async function transcribeAzureASR(
+  config: ASRModelConfig,
+  audioBuffer: Buffer | Blob,
+): Promise<{ text: string }> {
+  const rawBaseUrl = config.baseUrl || ASR_PROVIDERS['azure-asr'].defaultBaseUrl!;
+
+  if (!rawBaseUrl || rawBaseUrl.includes('{region}')) {
+    throw new Error('Azure STT base URL must include a real region');
+  }
+
+  let endpoint = rawBaseUrl.replace(/\/+$/, '');
+  if (/\.stt\.speech\.microsoft\.com$/i.test(endpoint)) {
+    endpoint = endpoint.replace(/\.stt\.speech\.microsoft\.com$/i, '.api.cognitive.microsoft.com');
+  }
+  if (!/\/speechtotext\/transcriptions:transcribe/i.test(endpoint)) {
+    endpoint = `${endpoint}/speechtotext/transcriptions:transcribe`;
+  }
+  const url = new URL(endpoint);
+  if (!url.searchParams.get('api-version')) {
+    url.searchParams.set('api-version', '2025-10-15');
+  }
+
+  let audioBlob: Blob;
+  if (audioBuffer instanceof Blob) {
+    audioBlob = audioBuffer;
+  } else {
+    audioBlob = new Blob([audioBuffer as unknown as BlobPart], { type: 'audio/webm' });
+  }
+
+  const formData = new FormData();
+  formData.append('audio', audioBlob, 'recording.webm');
+
+  const localeMap: Record<string, string> = {
+    en: 'en-US',
+    zh: 'zh-CN',
+    ja: 'ja-JP',
+    ko: 'ko-KR',
+    de: 'de-DE',
+    fr: 'fr-FR',
+    es: 'es-ES',
+    it: 'it-IT',
+    pt: 'pt-BR',
+    ru: 'ru-RU',
+    ar: 'ar-SA',
+    hi: 'hi-IN',
+  };
+
+  if (config.language && config.language !== 'auto') {
+    const locale = localeMap[config.language] || config.language;
+    formData.append('definition', JSON.stringify({ locales: [locale] }));
+  }
+
+  const response = await fetch(url.toString(), {
+    method: 'POST',
+    headers: { 'Ocp-Apim-Subscription-Key': config.apiKey! },
+    body: formData,
+  });
+
+  if (!response.ok) {
+    const errorText = await response.text().catch(() => response.statusText);
+    throw new Error(`Azure STT error (${response.status}): ${errorText}`);
+  }
+
+  const data = (await response.json()) as {
+    combinedPhrases?: Array<{ text?: string }>;
+    phrases?: Array<{ text?: string }>;
+  };
+
+  const combinedText = data.combinedPhrases
+    ?.map((p) => p.text || '')
+    .filter(Boolean)
+    .join(' ');
+  const phraseText = data.phrases
+    ?.map((p) => p.text || '')
+    .filter(Boolean)
+    .join(' ');
+
+  return { text: combinedText || phraseText || '' };
+}
+
 /**
  * Get current ASR configuration from settings store
  * Note: This function should only be called in browser context
diff --git a/lib/audio/constants.ts b/lib/audio/constants.ts
index 8e5b9976..57f023ac 100644
--- a/lib/audio/constants.ts
+++ b/lib/audio/constants.ts
@@ -806,6 +806,30 @@ export const ASR_PROVIDERS: Record = {
     supportedFormats: ['mp3', 'wav', 'webm', 'm4a', 'flac'],
   },
 
+  'azure-asr': {
+    id: 'azure-asr',
+    name: 'Azure STT',
+    requiresApiKey: true,
+    defaultBaseUrl: 'https://{region}.api.cognitive.microsoft.com',
+    icon: '/logos/azure.svg',
+    supportedLanguages: [
+      'auto',
+      'en',
+      'zh',
+      'ja',
+      'ko',
+      'de',
+      'fr',
+      'es',
+      'it',
+      'pt',
+      'ru',
+      'ar',
+      'hi',
+    ],
+    supportedFormats: ['wav', 'ogg', 'webm', 'mp3', 'flac', 'm4a'],
+  },
+
   'browser-native': {
     id: 'browser-native',
     name: '浏览器原生 ASR (Web Speech API)',
diff --git a/lib/audio/types.ts b/lib/audio/types.ts
index 588efcdb..9355e1a4 100644
--- a/lib/audio/types.ts
+++ b/lib/audio/types.ts
@@ -15,6 +15,7 @@
  * - OpenAI Whisper (https://platform.openai.com/docs/guides/speech-to-text)
  * - Browser Native (Web Speech API, client-side only)
  * - Qwen ASR (DashScope API)
+ * - Azure STT (https://learn.microsoft.com/azure/ai-services/speech-service/fast-transcription-create)
  *
  * Future Provider Support (extensible):
  * - ElevenLabs TTS/ASR (https://elevenlabs.io/docs)
@@ -141,7 +142,7 @@ export interface TTSModelConfig {
  * Add new ASR providers here as union members.
  * Keep in sync with ASR_PROVIDERS registry in constants.ts
  */
-export type ASRProviderId = 'openai-whisper' | 'browser-native' | 'qwen-asr';
+export type ASRProviderId = 'openai-whisper' | 'browser-native' | 'qwen-asr' | 'azure-asr';
 // Add new ASR providers below (uncomment and modify):
 // | 'elevenlabs-asr'
 // | 'assemblyai-asr'
diff --git a/lib/i18n/settings.ts b/lib/i18n/settings.ts
index 3ba0be4f..aeec187d 100644
--- a/lib/i18n/settings.ts
+++ b/lib/i18n/settings.ts
@@ -227,6 +227,7 @@
   providerOpenAIWhisper: 'OpenAI ASR (gpt-4o-mini-transcribe)',
   providerBrowserNative: '浏览器原生 ASR',
   providerQwenASR: 'Qwen ASR(阿里云百炼)',
+  providerAzureASR: 'Azure STT',
   providerUnpdf: 'unpdf(内置)',
   providerMinerU: 'MinerU',
   browserNativeTTSNote: '浏览器原生 TTS 无需配置,完全免费,使用系统内置语音',
@@ -817,6 +818,7 @@
   providerOpenAIWhisper: 'OpenAI ASR (gpt-4o-mini-transcribe)',
   providerBrowserNative: 'Browser Native ASR',
   providerQwenASR: 'Qwen ASR (Alibaba Cloud Bailian)',
+  providerAzureASR: 'Azure STT',
   providerUnpdf: 'unpdf (Built-in)',
   providerMinerU: 'MinerU',
   browserNativeTTSNote:
diff --git a/lib/server/provider-config.ts b/lib/server/provider-config.ts
index 0b876df0..a74edeb9 100644
--- a/lib/server/provider-config.ts
+++ b/lib/server/provider-config.ts
@@ -62,6 +62,7 @@ const TTS_ENV_MAP: Record = {
 const ASR_ENV_MAP: Record = {
   ASR_OPENAI: 'openai-whisper',
   ASR_QWEN: 'qwen-asr',
+  ASR_AZURE: 'azure-asr',
 };
 
 const PDF_ENV_MAP: Record = {
diff --git a/lib/store/settings.ts b/lib/store/settings.ts
index cc322f6a..8e182eff 100644
--- a/lib/store/settings.ts
+++ b/lib/store/settings.ts
@@ -274,6 +274,7 @@ const getDefaultAudioConfig = () => ({
     'openai-whisper': { apiKey: '', baseUrl: '', enabled: true },
     'browser-native': { apiKey: '', baseUrl: '', enabled: true },
     'qwen-asr': { apiKey: '', baseUrl: '', enabled: false },
+    'azure-asr': { apiKey: '', baseUrl: '', enabled: false },
   } as Record,
 });
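
Note (not part of the patch): a minimal usage sketch of the new path through the shared dispatcher. It assumes only the `ASRModelConfig` fields the patch actually reads (`providerId`, `apiKey`, `baseUrl`, `language`); the import paths, region, and file name are illustrative, and the real interface may carry additional fields.

```ts
import { readFileSync } from 'node:fs';
// Hypothetical import paths; adjust to the project's alias setup.
import { transcribeAudio } from '@/lib/audio/asr-providers';
import type { ASRModelConfig } from '@/lib/audio/types';

// baseUrl must name a real region: transcribeAzureASR rejects the
// '{region}' placeholder before making any network call.
const config = {
  providerId: 'azure-asr',
  apiKey: process.env.ASR_AZURE_API_KEY!,
  baseUrl: 'https://eastus.api.cognitive.microsoft.com',
  language: 'en', // mapped to 'en-US' via localeMap before upload
} as ASRModelConfig;

// A Buffer is wrapped into an 'audio/webm' Blob internally; a Blob passes through.
const audio = readFileSync('recording.webm');
const { text } = await transcribeAudio(config, audio);
console.log(text);
```

Either host style works for `baseUrl`: a `*.stt.speech.microsoft.com` host is rewritten to `*.api.cognitive.microsoft.com`, the `/speechtotext/transcriptions:transcribe` path is appended when absent, and `api-version=2025-10-15` is added when no version is supplied.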
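On the response side, the parser prefers `combinedPhrases` and falls back to joining `phrases`. A self-contained sketch of that reduction against an abridged payload (sample values are invented):

```ts
// Abridged shape of a Fast Transcription response as typed in the patch.
const sample: {
  combinedPhrases?: Array<{ text?: string }>;
  phrases?: Array<{ text?: string }>;
} = {
  combinedPhrases: [{ text: 'hello world' }],
  phrases: [{ text: 'hello' }, { text: 'world' }],
};

// Mirrors transcribeAzureASR: combinedPhrases wins, phrases are the
// fallback, and an empty string is the last resort.
const text =
  sample.combinedPhrases?.map((p) => p.text || '').filter(Boolean).join(' ') ||
  sample.phrases?.map((p) => p.text || '').filter(Boolean).join(' ') ||
  '';
console.log(text); // "hello world"
```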