Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,9 @@ ASR_OPENAI_BASE_URL=
ASR_QWEN_API_KEY=
ASR_QWEN_BASE_URL=

ASR_AZURE_API_KEY=
ASR_AZURE_BASE_URL=https://{region}.api.cognitive.microsoft.com

# --- PDF Processing -----------------------------------------------------------

PDF_UNPDF_API_KEY=
Expand Down
1 change: 1 addition & 0 deletions components/settings/audio-settings.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ function getASRProviderName(providerId: ASRProviderId, t: (key: string) => strin
'openai-whisper': t('settings.providerOpenAIWhisper'),
'browser-native': t('settings.providerBrowserNative'),
'qwen-asr': t('settings.providerQwenASR'),
'azure-asr': t('settings.providerAzureASR'),
};
return names[providerId];
}
Expand Down
1 change: 1 addition & 0 deletions components/settings/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ function getASRProviderName(providerId: ASRProviderId, t: (key: string) => strin
'openai-whisper': t('settings.providerOpenAIWhisper'),
'browser-native': t('settings.providerBrowserNative'),
'qwen-asr': t('settings.providerQwenASR'),
'azure-asr': t('settings.providerAzureASR'),
};
return names[providerId];
}
Expand Down
87 changes: 87 additions & 0 deletions lib/audio/asr-providers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,9 @@ export async function transcribeAudio(
case 'qwen-asr':
return await transcribeQwenASR(config, audioBuffer);

case 'azure-asr':
return await transcribeAzureASR(config, audioBuffer);

default:
throw new Error(`Unsupported ASR provider: ${config.providerId}`);
}
Expand Down Expand Up @@ -326,6 +329,90 @@ async function transcribeQwenASR(
return { text: transcribedText };
}

/**
 * Build the Fast Transcription request URL from a user-supplied base URL.
 *
 * The base URL is parsed first and its hostname, path and query are adjusted
 * separately, so a base URL that already carries a path or a query string
 * (for example a pinned `api-version`) is preserved instead of being corrupted
 * by string concatenation.
 *
 * @param baseUrl - Base URL or full endpoint URL, already checked for placeholders.
 * @returns The endpoint URL, always carrying an `api-version` query parameter.
 * @throws Error when `baseUrl` cannot be parsed as an absolute URL.
 */
function buildAzureTranscriptionUrl(baseUrl: string): URL {
  let url: URL;
  try {
    url = new URL(baseUrl);
  } catch {
    throw new Error(`Azure STT base URL is not a valid URL: ${baseUrl}`);
  }

  // Speech "stt" hosts are mapped onto the Cognitive Services host that this
  // function targets for the transcriptions:transcribe route.
  url.hostname = url.hostname.replace(
    /\.stt\.speech\.microsoft\.com$/i,
    '.api.cognitive.microsoft.com',
  );

  // Append the REST route only when the caller did not already include it.
  const path = url.pathname.replace(/\/+$/, '');
  url.pathname = /\/speechtotext\/transcriptions:transcribe/i.test(path)
    ? path
    : `${path}/speechtotext/transcriptions:transcribe`;

  // Respect an explicitly supplied api-version; otherwise use the default.
  if (!url.searchParams.get('api-version')) {
    url.searchParams.set('api-version', '2025-10-15');
  }

  return url;
}

/**
 * Azure STT implementation (Fast Transcription REST API)
 * https://learn.microsoft.com/azure/ai-services/speech-service/fast-transcription-create
 *
 * Uploads the audio as multipart form data and returns the recognized text.
 *
 * @param config - ASR configuration; `baseUrl`, `apiKey` and `language` are read.
 * @param audioBuffer - Audio payload. A Buffer is wrapped in an `audio/webm` Blob;
 *   a Blob is sent as-is.
 * @returns The transcription text: joined `combinedPhrases`, else joined `phrases`,
 *   else an empty string.
 * @throws Error when the base URL is missing, still contains the `{region}`
 *   placeholder, or is not a valid URL; when no API key is configured; or when
 *   the service responds with a non-2xx status.
 */
async function transcribeAzureASR(
  config: ASRModelConfig,
  audioBuffer: Buffer | Blob,
): Promise<ASRTranscriptionResult> {
  const rawBaseUrl = (config.baseUrl || ASR_PROVIDERS['azure-asr'].defaultBaseUrl || '').trim();

  if (!rawBaseUrl || rawBaseUrl.includes('{region}')) {
    throw new Error('Azure STT base URL must include a real region');
  }

  // Fail fast instead of sending the literal header value "undefined".
  if (!config.apiKey) {
    throw new Error('Azure STT API key is required');
  }

  const url = buildAzureTranscriptionUrl(rawBaseUrl);

  const audioBlob =
    audioBuffer instanceof Blob
      ? audioBuffer
      : new Blob([audioBuffer as unknown as BlobPart], { type: 'audio/webm' });

  const formData = new FormData();
  formData.append('audio', audioBlob, 'recording.webm');

  // Short language codes used by the settings UI mapped to the locale sent to Azure.
  const localeMap: Record<string, string> = {
    en: 'en-US',
    zh: 'zh-CN',
    ja: 'ja-JP',
    ko: 'ko-KR',
    de: 'de-DE',
    fr: 'fr-FR',
    es: 'es-ES',
    it: 'it-IT',
    pt: 'pt-BR',
    ru: 'ru-RU',
    ar: 'ar-SA',
    hi: 'hi-IN',
  };

  // With 'auto' (or no language) no definition part is sent at all.
  if (config.language && config.language !== 'auto') {
    // Unmapped values are passed through unchanged as the locale.
    const locale = localeMap[config.language] || config.language;
    formData.append('definition', JSON.stringify({ locales: [locale] }));
  }

  const response = await fetch(url.toString(), {
    method: 'POST',
    headers: { 'Ocp-Apim-Subscription-Key': config.apiKey },
    body: formData,
  });

  if (!response.ok) {
    // Fall back to the status text if the error body cannot be read.
    const errorText = await response.text().catch(() => response.statusText);
    throw new Error(`Azure STT error (${response.status}): ${errorText}`);
  }

  const data = (await response.json()) as {
    combinedPhrases?: Array<{ text?: string }>;
    phrases?: Array<{ text?: string }>;
  };

  const joinTexts = (items?: Array<{ text?: string }>): string | undefined =>
    items
      ?.map((p) => p.text || '')
      .filter(Boolean)
      .join(' ');

  // Prefer the combined phrases; fall back to the individual phrases.
  return { text: joinTexts(data.combinedPhrases) || joinTexts(data.phrases) || '' };
}

/**
* Get current ASR configuration from settings store
* Note: This function should only be called in browser context
Expand Down
24 changes: 24 additions & 0 deletions lib/audio/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -806,6 +806,30 @@ export const ASR_PROVIDERS: Record<ASRProviderId, ASRProviderConfig> = {
supportedFormats: ['mp3', 'wav', 'webm', 'm4a', 'flac'],
},

'azure-asr': {
id: 'azure-asr',
name: 'Azure STT',
requiresApiKey: true,
defaultBaseUrl: 'https://{region}.api.cognitive.microsoft.com',
icon: '/logos/azure.svg',
supportedLanguages: [
'auto',
'en',
'zh',
'ja',
'ko',
'de',
'fr',
'es',
'it',
'pt',
'ru',
'ar',
'hi',
],
supportedFormats: ['wav', 'ogg', 'webm', 'mp3', 'flac', 'm4a'],
},

'browser-native': {
id: 'browser-native',
name: '浏览器原生 ASR (Web Speech API)',
Expand Down
3 changes: 2 additions & 1 deletion lib/audio/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
* - OpenAI Whisper (https://platform.openai.com/docs/guides/speech-to-text)
* - Browser Native (Web Speech API, client-side only)
* - Qwen ASR (DashScope API)
* - Azure STT (https://learn.microsoft.com/azure/ai-services/speech-service/fast-transcription-create)
*
* Future Provider Support (extensible):
* - ElevenLabs TTS/ASR (https://elevenlabs.io/docs)
Expand Down Expand Up @@ -141,7 +142,7 @@ export interface TTSModelConfig {
* Add new ASR providers here as union members.
* Keep in sync with ASR_PROVIDERS registry in constants.ts
*/
export type ASRProviderId = 'openai-whisper' | 'browser-native' | 'qwen-asr';
export type ASRProviderId = 'openai-whisper' | 'browser-native' | 'qwen-asr' | 'azure-asr';
// Add new ASR providers below (uncomment and modify):
// | 'elevenlabs-asr'
// | 'assemblyai-asr'
Expand Down
2 changes: 2 additions & 0 deletions lib/i18n/settings.ts
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@ export const settingsZhCN = {
providerOpenAIWhisper: 'OpenAI ASR (gpt-4o-mini-transcribe)',
providerBrowserNative: '浏览器原生 ASR',
providerQwenASR: 'Qwen ASR(阿里云百炼)',
providerAzureASR: 'Azure STT',
providerUnpdf: 'unpdf(内置)',
providerMinerU: 'MinerU',
browserNativeTTSNote: '浏览器原生 TTS 无需配置,完全免费,使用系统内置语音',
Expand Down Expand Up @@ -817,6 +818,7 @@ export const settingsEnUS = {
providerOpenAIWhisper: 'OpenAI ASR (gpt-4o-mini-transcribe)',
providerBrowserNative: 'Browser Native ASR',
providerQwenASR: 'Qwen ASR (Alibaba Cloud Bailian)',
providerAzureASR: 'Azure STT',
providerUnpdf: 'unpdf (Built-in)',
providerMinerU: 'MinerU',
browserNativeTTSNote:
Expand Down
1 change: 1 addition & 0 deletions lib/server/provider-config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ const TTS_ENV_MAP: Record<string, string> = {
const ASR_ENV_MAP: Record<string, string> = {
ASR_OPENAI: 'openai-whisper',
ASR_QWEN: 'qwen-asr',
ASR_AZURE: 'azure-asr',
};

const PDF_ENV_MAP: Record<string, string> = {
Expand Down
1 change: 1 addition & 0 deletions lib/store/settings.ts
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,7 @@ const getDefaultAudioConfig = () => ({
'openai-whisper': { apiKey: '', baseUrl: '', enabled: true },
'browser-native': { apiKey: '', baseUrl: '', enabled: true },
'qwen-asr': { apiKey: '', baseUrl: '', enabled: false },
'azure-asr': { apiKey: '', baseUrl: '', enabled: false },
} as Record<ASRProviderId, { apiKey: string; baseUrl: string; enabled: boolean }>,
});

Expand Down
Loading