diff --git a/app/api/generate/agent-profiles/route.ts b/app/api/generate/agent-profiles/route.ts index b081cbdac..fbf4d4e46 100644 --- a/app/api/generate/agent-profiles/route.ts +++ b/app/api/generate/agent-profiles/route.ts @@ -36,6 +36,8 @@ interface RequestBody { sceneOutlines?: { title: string; description?: string }[]; language: string; availableAvatars: string[]; + avatarDescriptions?: Array<{ path: string; desc: string }>; + availableVoices?: Array<{ providerId: string; voiceId: string; voiceName: string }>; } function stripCodeFences(text: string): string { @@ -50,7 +52,14 @@ function stripCodeFences(text: string): string { export async function POST(req: NextRequest) { try { const body = (await req.json()) as RequestBody; - const { stageInfo, sceneOutlines, language, availableAvatars } = body; + const { + stageInfo, + sceneOutlines, + language, + availableAvatars, + avatarDescriptions, + availableVoices, + } = body; // ── Validate required fields ── if (!stageInfo?.name) { @@ -79,6 +88,27 @@ export async function POST(req: NextRequest) { const systemPrompt = `You are an expert instructional designer. Generate agent profiles for a multi-agent classroom simulation. Decide the appropriate number of agents (typically 3-5) based on the course content and complexity. Return ONLY valid JSON, no markdown or explanation.`; + // Build voice list for prompt (if available) + const voiceListStr = + availableVoices && availableVoices.length > 0 + ? JSON.stringify( + availableVoices.map((v) => ({ + id: `${v.providerId}::${v.voiceId}`, + name: v.voiceName, + })), + ) + : ''; + + const voicePrompt = voiceListStr + ? `- Each agent should be assigned a voice that matches their persona from this list: ${voiceListStr} + - Pick a voice that suits the agent's personality and role (e.g. authoritative voice for teacher, lively voice for energetic student) + - Try to use different voices for each agent` + : ''; + + const voiceJsonField = voiceListStr + ? ',\n "voice": "string (voice id from available list, e.g. \'qwen-tts::Cherry\')"' + : ''; + const userPrompt = `Generate agent profiles for the following course: Course name: ${stageInfo.name} @@ -90,10 +120,13 @@ Requirements: - Priority values: teacher=10 (highest), assistant=7, student=4-6 - Each agent needs: name, role, persona (2-3 sentences describing personality and teaching/learning style) - Names and personas must be in language: ${language} -- Each agent must be assigned one avatar from this list: ${JSON.stringify(availableAvatars)} +- Each agent must be assigned one avatar from this list: ${JSON.stringify(avatarDescriptions && avatarDescriptions.length > 0 ? avatarDescriptions.map((a) => ({ path: a.path, description: a.desc })) : availableAvatars)} + - Pick an avatar that visually matches the agent's personality and role - Try to use different avatars for each agent + - Use the "path" value as the avatar field in the output - Each agent must be assigned one color from this list: ${JSON.stringify(COLOR_PALETTE)} - Each agent must have a different color +${voicePrompt} Return a JSON object with this exact structure: { @@ -104,7 +137,7 @@ Return a JSON object with this exact structure: "persona": "string (2-3 sentences)", "avatar": "string (from available list)", "color": "string (hex color from palette)", - "priority": number (10 for teacher, 7 for assistant, 4-6 for student) + "priority": number (10 for teacher, 7 for assistant, 4-6 for student)${voiceJsonField} } ] }`; @@ -130,6 +163,7 @@ Return a JSON object with this exact structure: avatar: string; color: string; priority: number; + voice?: string; }>; }; @@ -161,16 +195,28 @@ Return a JSON object with this exact structure: } // ── Build output with IDs ── - const agents = parsed.agents.map((agent, index) => ({ - id: `gen-${nanoid(8)}`, - name: agent.name, - role: agent.role, - persona: agent.persona, - avatar: agent.avatar || availableAvatars[index % availableAvatars.length], - color: agent.color || COLOR_PALETTE[index % COLOR_PALETTE.length], - priority: - agent.priority ?? (agent.role === 'teacher' ? 10 : agent.role === 'assistant' ? 7 : 5), - })); + const agents = parsed.agents.map((agent, index) => { + // Parse voice "providerId::voiceId" format + let voiceConfig: { providerId: string; voiceId: string } | undefined; + if (agent.voice && agent.voice.includes('::')) { + const [providerId, voiceId] = agent.voice.split('::'); + if (providerId && voiceId) { + voiceConfig = { providerId, voiceId }; + } + } + + return { + id: `gen-${nanoid(8)}`, + name: agent.name, + role: agent.role, + persona: agent.persona, + avatar: agent.avatar || availableAvatars[index % availableAvatars.length], + color: agent.color || COLOR_PALETTE[index % COLOR_PALETTE.length], + priority: + agent.priority ?? (agent.role === 'teacher' ? 10 : agent.role === 'assistant' ? 7 : 5), + ...(voiceConfig ? { voiceConfig } : {}), + }; + }); log.info(`Successfully generated ${agents.length} agent profiles for "${stageInfo.name}"`); diff --git a/app/generation-preview/page.tsx b/app/generation-preview/page.tsx index 213a51409..9272d0461 100644 --- a/app/generation-preview/page.tsx +++ b/app/generation-preview/page.tsx @@ -11,6 +11,7 @@ import { cn } from '@/lib/utils'; import { useStageStore } from '@/lib/store/stage'; import { useSettingsStore } from '@/lib/store/settings'; import { useAgentRegistry } from '@/lib/orchestration/registry/store'; +import { getAvailableProvidersWithVoices } from '@/lib/audio/voice-resolver'; import { useI18n } from '@/lib/hooks/use-i18n'; import { loadImageMapping, @@ -379,20 +380,67 @@ function GenerationPreviewContent() { try { const allAvatars = [ - '/avatars/assist.png', - '/avatars/assist-2.png', - '/avatars/clown.png', - '/avatars/clown-2.png', - '/avatars/curious.png', - '/avatars/curious-2.png', - '/avatars/note-taker.png', - '/avatars/note-taker-2.png', - '/avatars/teacher.png', - '/avatars/teacher-2.png', - '/avatars/thinker.png', - '/avatars/thinker-2.png', + { + path: '/avatars/teacher.png', + desc: 'Male teacher with glasses, holding a book, green background', + }, + { + path: '/avatars/teacher-2.png', + desc: 'Female teacher with long dark hair, blue traditional outfit, gentle expression', + }, + { + path: '/avatars/assist.png', + desc: 'Young female assistant with glasses, pink background, friendly smile', + }, + { + path: '/avatars/assist-2.png', + desc: 'Young female in orange top and purple overalls, cheerful and approachable', + }, + { + path: '/avatars/clown.png', + desc: 'Energetic girl with glasses pointing up, green shirt, lively and fun', + }, + { + path: '/avatars/clown-2.png', + desc: 'Playful girl with curly hair doing rock gesture, blue shirt, humorous vibe', + }, + { + path: '/avatars/curious.png', + desc: 'Surprised boy with glasses, hand on cheek, curious expression', + }, + { + path: '/avatars/curious-2.png', + desc: 'Boy with backpack holding a book and question mark bubble, inquisitive', + }, + { + path: '/avatars/note-taker.png', + desc: 'Studious boy with glasses, blue shirt, calm and organized', + }, + { + path: '/avatars/note-taker-2.png', + desc: 'Active boy with yellow backpack waving, blue outfit, enthusiastic learner', + }, + { + path: '/avatars/thinker.png', + desc: 'Thoughtful girl with hand on chin, purple background, contemplative', + }, + { + path: '/avatars/thinker-2.png', + desc: 'Girl reading a book intently, long dark hair, intellectual and focused', + }, ]; + const getAvailableVoicesForGeneration = () => { + const providers = getAvailableProvidersWithVoices(settings.ttsProvidersConfig); + return providers.flatMap((p) => + p.voices.map((v) => ({ + providerId: p.providerId, + voiceId: v.id, + voiceName: v.name, + })), + ); + }; + // No outlines yet — agent generation uses only stage name + description const agentResp = await fetch('/api/generate/agent-profiles', { method: 'POST', @@ -400,7 +448,9 @@ function GenerationPreviewContent() { body: JSON.stringify({ stageInfo: { name: stage.name, description: stage.description }, language: currentSession.requirements.language || 'zh-CN', - availableAvatars: allAvatars, + availableAvatars: allAvatars.map((a) => a.path), + avatarDescriptions: allAvatars.map((a) => ({ path: a.path, desc: a.desc })), + availableVoices: getAvailableVoicesForGeneration(), }), signal, }); diff --git a/components/agent/agent-bar.tsx b/components/agent/agent-bar.tsx index 289ee03cb..27585ecab 100644 --- a/components/agent/agent-bar.tsx +++ b/components/agent/agent-bar.tsx @@ -1,15 +1,450 @@ 'use client'; -import { useState, useEffect, useRef } from 'react'; +import { useState, useEffect, useRef, useCallback } from 'react'; import { motion, AnimatePresence } from 'motion/react'; import { Checkbox } from '@/components/ui/checkbox'; -import { Input } from '@/components/ui/input'; +import { Popover, PopoverContent, PopoverTrigger } from '@/components/ui/popover'; import { cn } from '@/lib/utils'; import { useI18n } from '@/lib/hooks/use-i18n'; import { useSettingsStore } from '@/lib/store/settings'; import { useAgentRegistry } from '@/lib/orchestration/registry/store'; -import { Sparkles, ChevronDown, ChevronUp, Shuffle } from 'lucide-react'; +import { resolveAgentVoice, getAvailableProvidersWithVoices } from '@/lib/audio/voice-resolver'; +import { playBrowserTTSPreview } from '@/lib/audio/browser-tts-preview'; +import { + Sparkles, + ChevronDown, + ChevronUp, + Shuffle, + Volume2, + VolumeX, + Loader2, + MessageSquare, + Minus, + Plus, +} from 'lucide-react'; import { Tooltip, TooltipContent, TooltipTrigger } from '@/components/ui/tooltip'; +import type { AgentConfig } from '@/lib/orchestration/registry/types'; +import type { TTSProviderId } from '@/lib/audio/types'; +import type { ProviderWithVoices } from '@/lib/audio/voice-resolver'; + +function AgentVoicePill({ + agent, + agentIndex, + availableProviders, + disabled, +}: { + agent: AgentConfig; + agentIndex: number; + availableProviders: ProviderWithVoices[]; + disabled?: boolean; +}) { + const updateAgent = useAgentRegistry((s) => s.updateAgent); + const ttsProvidersConfig = useSettingsStore((s) => s.ttsProvidersConfig); + const resolved = resolveAgentVoice(agent, agentIndex, availableProviders); + const [popoverOpen, setPopoverOpen] = useState(false); + const [previewingId, setPreviewingId] = useState(null); + const previewCancelRef = useRef<(() => void) | null>(null); + const previewAudioRef = useRef(null); + const previewAbortRef = useRef(null); + + const displayName = (() => { + for (const p of availableProviders) { + if (p.providerId === resolved.providerId) { + const v = p.voices.find((voice) => voice.id === resolved.voiceId); + if (v) return v.name; + } + } + return resolved.voiceId; + })(); + + const stopPreview = useCallback(() => { + previewCancelRef.current?.(); + previewCancelRef.current = null; + previewAbortRef.current?.abort(); + previewAbortRef.current = null; + if (previewAudioRef.current) { + previewAudioRef.current.pause(); + previewAudioRef.current.src = ''; + previewAudioRef.current = null; + } + setPreviewingId(null); + }, []); + + const handlePreview = useCallback( + async (providerId: TTSProviderId, voiceId: string) => { + const key = `${providerId}::${voiceId}`; + if (previewingId === key) { + stopPreview(); + return; + } + stopPreview(); + setPreviewingId(key); + + const courseLanguage = + (typeof localStorage !== 'undefined' && localStorage.getItem('generationLanguage')) || + 'zh-CN'; + const previewText = courseLanguage === 'en-US' ? 'Welcome to AI Classroom' : '欢迎来到AI课堂'; + + if (providerId === 'browser-native-tts') { + const { promise, cancel } = playBrowserTTSPreview({ text: previewText, voice: voiceId }); + previewCancelRef.current = cancel; + try { + await promise; + } catch { + // ignore abort + } + setPreviewingId(null); + return; + } + + // Server TTS + try { + const controller = new AbortController(); + previewAbortRef.current = controller; + const providerConfig = ttsProvidersConfig[providerId]; + const res = await fetch('/api/generate/tts', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + text: previewText, + audioId: 'voice-preview', + ttsProviderId: providerId, + ttsVoice: voiceId, + ttsSpeed: 1, + ttsApiKey: providerConfig?.apiKey, + ttsBaseUrl: providerConfig?.serverBaseUrl || providerConfig?.baseUrl, + }), + signal: controller.signal, + }); + if (!res.ok) throw new Error('TTS error'); + const data = await res.json(); + if (!data.base64) throw new Error('No audio'); + + const audio = new Audio(`data:audio/${data.format || 'mp3'};base64,${data.base64}`); + previewAudioRef.current = audio; + audio.addEventListener('ended', () => setPreviewingId(null)); + audio.addEventListener('error', () => setPreviewingId(null)); + await audio.play(); + } catch { + setPreviewingId(null); + } + }, + [previewingId, stopPreview, ttsProvidersConfig], + ); + + // Cleanup on unmount + useEffect(() => () => stopPreview(), [stopPreview]); + + if (disabled) { + return ( +
e.stopPropagation()} + onPointerDown={(e) => e.stopPropagation()} + className="flex items-center gap-1 h-5 w-[88px] rounded-full bg-muted/40 px-2 text-[10px] text-muted-foreground/30 shrink-0 cursor-not-allowed" + > + + {displayName} +
+ ); + } + + return ( + { + setPopoverOpen(open); + if (!open) stopPreview(); + }} + > + + + + e.stopPropagation()} + onPointerDown={(e) => e.stopPropagation()} + > + {availableProviders.map((provider) => ( +
+
+ {provider.providerName} +
+ {provider.voices.map((voice) => { + const isActive = + resolved.providerId === provider.providerId && resolved.voiceId === voice.id; + const previewKey = `${provider.providerId}::${voice.id}`; + const isPreviewing = previewingId === previewKey; + return ( +
+ + +
+ ); + })} +
+ ))} +
+
+ ); +} + +/** + * Teacher voice pill — reads/writes global ttsProviderId + ttsVoice (single source of truth). + * This ensures lecture and discussion use the same voice for the teacher. + */ +function TeacherVoicePill({ + availableProviders, + disabled, +}: { + availableProviders: ProviderWithVoices[]; + disabled?: boolean; +}) { + const ttsProviderId = useSettingsStore((s) => s.ttsProviderId); + const ttsVoice = useSettingsStore((s) => s.ttsVoice); + const setTTSProvider = useSettingsStore((s) => s.setTTSProvider); + const setTTSVoice = useSettingsStore((s) => s.setTTSVoice); + const ttsProvidersConfig = useSettingsStore((s) => s.ttsProvidersConfig); + const [popoverOpen, setPopoverOpen] = useState(false); + const [previewingId, setPreviewingId] = useState(null); + const previewCancelRef = useRef<(() => void) | null>(null); + const previewAudioRef = useRef(null); + const previewAbortRef = useRef(null); + + const displayName = (() => { + for (const p of availableProviders) { + if (p.providerId === ttsProviderId) { + const v = p.voices.find((voice) => voice.id === ttsVoice); + if (v) return v.name; + } + } + return ttsVoice || 'default'; + })(); + + const stopPreview = useCallback(() => { + previewCancelRef.current?.(); + previewCancelRef.current = null; + previewAbortRef.current?.abort(); + previewAbortRef.current = null; + if (previewAudioRef.current) { + previewAudioRef.current.pause(); + previewAudioRef.current.src = ''; + previewAudioRef.current = null; + } + setPreviewingId(null); + }, []); + + const handlePreview = useCallback( + async (providerId: TTSProviderId, voiceId: string) => { + const key = `${providerId}::${voiceId}`; + if (previewingId === key) { + stopPreview(); + return; + } + stopPreview(); + setPreviewingId(key); + + const courseLanguage = + (typeof localStorage !== 'undefined' && localStorage.getItem('generationLanguage')) || + 'zh-CN'; + const previewText = courseLanguage === 'en-US' ? 'Welcome to AI Classroom' : '欢迎来到AI课堂'; + + if (providerId === 'browser-native-tts') { + const { promise, cancel } = playBrowserTTSPreview({ text: previewText, voice: voiceId }); + previewCancelRef.current = cancel; + try { + await promise; + } catch { + // ignore abort + } + setPreviewingId(null); + return; + } + + try { + const controller = new AbortController(); + previewAbortRef.current = controller; + const providerConfig = ttsProvidersConfig[providerId]; + const res = await fetch('/api/generate/tts', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + text: previewText, + audioId: 'voice-preview', + ttsProviderId: providerId, + ttsVoice: voiceId, + ttsSpeed: 1, + ttsApiKey: providerConfig?.apiKey, + ttsBaseUrl: providerConfig?.serverBaseUrl || providerConfig?.baseUrl, + }), + signal: controller.signal, + }); + if (!res.ok) throw new Error('TTS error'); + const data = await res.json(); + if (!data.base64) throw new Error('No audio'); + const audio = new Audio(`data:audio/${data.format || 'mp3'};base64,${data.base64}`); + previewAudioRef.current = audio; + audio.addEventListener('ended', () => setPreviewingId(null)); + audio.addEventListener('error', () => setPreviewingId(null)); + await audio.play(); + } catch { + setPreviewingId(null); + } + }, + [previewingId, stopPreview, ttsProvidersConfig], + ); + + useEffect(() => () => stopPreview(), [stopPreview]); + + if (disabled) { + return ( +
e.stopPropagation()} + onPointerDown={(e) => e.stopPropagation()} + className="flex items-center gap-1 h-5 w-[88px] rounded-full bg-muted/40 px-2 text-[10px] text-muted-foreground/30 shrink-0 cursor-not-allowed" + > + + {displayName} +
+ ); + } + + return ( + { + setPopoverOpen(open); + if (!open) stopPreview(); + }} + > + + + + e.stopPropagation()} + onPointerDown={(e) => e.stopPropagation()} + > + {availableProviders.map((provider) => ( +
+
+ {provider.providerName} +
+ {provider.voices.map((voice) => { + const isActive = ttsProviderId === provider.providerId && ttsVoice === voice.id; + const previewKey = `${provider.providerId}::${voice.id}`; + const isPreviewing = previewingId === previewKey; + return ( +
+ + +
+ ); + })} +
+ ))} +
+
+ ); +} export function AgentBar() { const { t } = useI18n(); @@ -20,24 +455,51 @@ export function AgentBar() { const setMaxTurns = useSettingsStore((s) => s.setMaxTurns); const agentMode = useSettingsStore((s) => s.agentMode); const setAgentMode = useSettingsStore((s) => s.setAgentMode); + const ttsProvidersConfig = useSettingsStore((s) => s.ttsProvidersConfig); + const ttsEnabled = useSettingsStore((s) => s.ttsEnabled); const [open, setOpen] = useState(false); + const [browserVoices, setBrowserVoices] = useState([]); const containerRef = useRef(null); + // Load browser native TTS voices + useEffect(() => { + if (typeof window === 'undefined' || !window.speechSynthesis) return; + const loadVoices = () => setBrowserVoices(speechSynthesis.getVoices()); + loadVoices(); + speechSynthesis.addEventListener('voiceschanged', loadVoices); + return () => speechSynthesis.removeEventListener('voiceschanged', loadVoices); + }, []); + const allAgents = listAgents(); - // In preset mode, only show default (non-generated) agents const agents = allAgents.filter((a) => !a.isGenerated); const teacherAgent = agents.find((a) => a.role === 'teacher'); const selectedAgents = agents.filter((a) => selectedAgentIds.includes(a.id)); const nonTeacherSelected = selectedAgents.filter((a) => a.role !== 'teacher'); - // Click-outside to collapse + const serverProviders = getAvailableProvidersWithVoices(ttsProvidersConfig); + const availableProviders: ProviderWithVoices[] = [ + ...serverProviders, + ...(browserVoices.length > 0 + ? [ + { + providerId: 'browser-native-tts' as TTSProviderId, + providerName: 'Browser Native', + voices: browserVoices.map((v) => ({ id: v.voiceURI, name: v.name })), + }, + ] + : []), + ]; + const showVoice = availableProviders.length > 0; + useEffect(() => { if (!open) return; const handler = (e: MouseEvent) => { - if (containerRef.current && !containerRef.current.contains(e.target as Node)) { - setOpen(false); - } + const target = e.target as Node; + if (containerRef.current && containerRef.current.contains(target)) return; + // Don't close if clicking inside a Radix portal (Popover, Select, etc.) + if ((target as Element).closest?.('[data-radix-popper-content-wrapper]')) return; + setOpen(false); }; document.addEventListener('mousedown', handler); return () => document.removeEventListener('mousedown', handler); @@ -46,7 +508,6 @@ export function AgentBar() { const handleModeChange = (mode: 'preset' | 'auto') => { setAgentMode(mode); if (mode === 'preset') { - // Ensure a teacher is always selected in preset mode const hasTeacherSelected = selectedAgentIds.some((id) => { const a = agents.find((agent) => agent.id === id); return a?.role === 'teacher'; @@ -59,7 +520,7 @@ export function AgentBar() { const toggleAgent = (agentId: string) => { const agent = agents.find((a) => a.id === agentId); - if (agent?.role === 'teacher') return; // teacher is always selected + if (agent?.role === 'teacher') return; if (selectedAgentIds.includes(agentId)) { setSelectedAgentIds(selectedAgentIds.filter((id) => id !== agentId)); } else { @@ -79,10 +540,8 @@ export function AgentBar() { return translated !== key ? translated : agent.role; }; - /* ── Shared avatar row — always visible on the right side ── */ const avatarRow = (
- {/* Teacher avatar — always shown */} {teacherAgent && (
- {/* In auto mode: show assistant avatar + shuffle indicator */}
{agents.find((a) => a.role === 'assistant') && (
@@ -111,7 +569,6 @@ export function AgentBar() { ) : ( <> - {/* In preset mode: show selected non-teacher agents */} {nonTeacherSelected.length > 0 && (
{nonTeacherSelected.slice(0, 4).map((agent) => ( @@ -137,12 +594,59 @@ export function AgentBar() { )} )} + {showVoice && + (ttsEnabled ? ( + + ) : ( + + ))}
); + const renderAgentRow = (agent: AgentConfig, agentIndex: number, isTeacher: boolean) => { + const isSelected = isTeacher || selectedAgentIds.includes(agent.id); + return ( +
toggleAgent(agent.id)} + className={cn( + 'w-full flex items-center gap-2 px-2.5 py-1.5 rounded-lg transition-colors', + isTeacher ? 'bg-primary/5' : 'cursor-pointer', + !isTeacher && isSelected && 'bg-primary/5', + !isTeacher && !isSelected && 'hover:bg-muted/50', + )} + > + +
+ {getAgentName(agent)} +
+ + {getAgentName(agent)} + + + {getAgentRole(agent)} + + {showVoice && ( + + )} +
+ ); + }; + return ( -
- {/* ── Header row — always in document flow ── */} +
{agentMode === 'preset' ? ( - /* Agent list — teacher is always selected, no need to show */ -
+
{agents .filter((a) => a.role !== 'teacher') - .map((agent) => { - const isSelected = selectedAgentIds.includes(agent.id); - return ( -
toggleAgent(agent.id)} - className={cn( - 'w-full flex items-center gap-3 px-3 py-2 text-left transition-colors cursor-pointer rounded-lg', - isSelected ? 'bg-primary/5' : 'hover:bg-muted/50', - )} - > - -
- {getAgentName(agent)} -
-
-
- {getAgentName(agent)} - - {getAgentRole(agent)} - -
- {(() => { - const descKey = `settings.agentDescriptions.${agent.id}`; - const desc = t(descKey); - return desc !== descKey ? ( -

- {desc} -

- ) : null; - })()} -
-
- ); - })} + .map((agent, idx) => renderAgentRow(agent, idx + 1, false))}
) : ( - /* Auto-generate mode */ -
- {/* Shuffle icon with ambient animation */} +
- {/* Ping ripple */} -
- {/* Soft glow ring */} -
- {/* Icon */} - +
+
+ +
+
+
+

+ {t('settings.agentModeAutoDesc')} +

+

+ {t('agentBar.voiceAutoAssign')} +

-

- {t('settings.agentModeAutoDesc')} -

)} - {/* Max turns — always visible */} -
- + {/* Max turns — compact stepper */} +
+ + {t('settings.maxTurns')} - setMaxTurns(e.target.value)} - className="w-16 h-7 text-xs" - /> +
+ + { + const raw = e.target.value.replace(/\D/g, ''); + if (!raw) { + setMaxTurns(''); + return; + } + const v = Math.min(20, Math.max(1, parseInt(raw))); + setMaxTurns(String(v)); + }} + onBlur={() => { + if (!maxTurns || parseInt(maxTurns) < 1) setMaxTurns('1'); + }} + onClick={(e) => e.stopPropagation()} + className="w-5 h-5 text-[11px] font-medium tabular-nums text-center bg-transparent outline-none border-none" + /> + +
diff --git a/components/chat/chat-area.tsx b/components/chat/chat-area.tsx index 03ebd8ecc..eae2efb81 100644 --- a/components/chat/chat-area.tsx +++ b/components/chat/chat-area.tsx @@ -27,6 +27,14 @@ interface ChatAreaProps { onThinking?: (state: { stage: string; agentId?: string } | null) => void; onCueUser?: (fromAgentId?: string, prompt?: string) => void; onStopSession?: () => void; + onSegmentSealed?: ( + messageId: string, + partId: string, + fullText: string, + agentId: string | null, + ) => void; + /** When provided and returns true, StreamBuffer holds on the current text item after reveal. */ + shouldHoldAfterReveal?: () => boolean; currentSceneId?: string | null; } @@ -69,6 +77,8 @@ export const ChatArea = forwardRef( onThinking, onCueUser, onStopSession, + onSegmentSealed, + shouldHoldAfterReveal, currentSceneId, }, ref, @@ -102,6 +112,8 @@ export const ChatArea = forwardRef( onCueUser, onActiveBubble, onStopSession, + onSegmentSealed, + shouldHoldAfterReveal, }); const [activeTab, setActiveTab] = useState<'lecture' | 'chat'>('lecture'); diff --git a/components/chat/use-chat-sessions.ts b/components/chat/use-chat-sessions.ts index be39df01d..c78d269bb 100644 --- a/components/chat/use-chat-sessions.ts +++ b/components/chat/use-chat-sessions.ts @@ -36,6 +36,14 @@ interface UseChatSessionsOptions { onActiveBubble?: (messageId: string | null) => void; /** Called when a QA/Discussion session completes naturally (director end). */ onStopSession?: () => void; + onSegmentSealed?: ( + messageId: string, + partId: string, + fullText: string, + agentId: string | null, + ) => void; + /** When provided and returns true, StreamBuffer holds on the current text item after reveal. */ + shouldHoldAfterReveal?: () => boolean; } export function useChatSessions(options: UseChatSessionsOptions = {}) { @@ -45,6 +53,8 @@ export function useChatSessions(options: UseChatSessionsOptions = {}) { const onCueUserRef = useRef(options.onCueUser); const onActiveBubbleRef = useRef(options.onActiveBubble); const onStopSessionRef = useRef(options.onStopSession); + const onSegmentSealedRef = useRef(options.onSegmentSealed); + const shouldHoldAfterRevealRef = useRef(options.shouldHoldAfterReveal); useEffect(() => { onLiveSpeechRef.current = options.onLiveSpeech; onSpeechProgressRef.current = options.onSpeechProgress; @@ -52,6 +62,8 @@ export function useChatSessions(options: UseChatSessionsOptions = {}) { onCueUserRef.current = options.onCueUser; onActiveBubbleRef.current = options.onActiveBubble; onStopSessionRef.current = options.onStopSession; + onSegmentSealedRef.current = options.onSegmentSealed; + shouldHoldAfterRevealRef.current = options.shouldHoldAfterReveal; }, [ options.onLiveSpeech, options.onSpeechProgress, @@ -59,6 +71,8 @@ export function useChatSessions(options: UseChatSessionsOptions = {}) { options.onCueUser, options.onActiveBubble, options.onStopSession, + options.onSegmentSealed, + options.shouldHoldAfterReveal, ]); const { t } = useI18n(); @@ -321,6 +335,19 @@ export function useChatSessions(options: UseChatSessionsOptions = {}) { onError(message: string) { log.error('[Buffer] Stream error:', message); }, + + onSegmentSealed( + messageId: string, + partId: string, + fullText: string, + agentId: string | null, + ) { + onSegmentSealedRef.current?.(messageId, partId, fullText, agentId); + }, + + shouldHoldAfterReveal() { + return shouldHoldAfterRevealRef.current?.() ?? false; + }, }, pacingOptions, ); diff --git a/components/generation/media-popover.tsx b/components/generation/media-popover.tsx index 496ec9fe0..309a12acc 100644 --- a/components/generation/media-popover.tsx +++ b/components/generation/media-popover.tsx @@ -416,56 +416,9 @@ export function MediaPopover({ onSettingsOpen }: MediaPopoverProps) { enabled={ttsEnabled} onToggle={setTTSEnabled} > - {/* Provider + Voice grouped select + preview */} -
-
- { - if (gid !== ttsProviderId) { - setTTSProvider(gid as TTSProviderId); - } - setTTSVoice(iid); - }} - /> -
- -
- {ttsSpeedRange && ( -
- - {t('media.speed')} - - setTTSSpeed(value[0])} - min={ttsSpeedRange.min} - max={ttsSpeedRange.max} - step={0.1} - className="flex-1" - /> - - {ttsSpeed.toFixed(1)}x - -
- )} +

+ {t('settings.ttsVoiceConfigHint')} +

)} diff --git a/components/roundtable/audio-indicator.tsx b/components/roundtable/audio-indicator.tsx new file mode 100644 index 000000000..4b630a1d0 --- /dev/null +++ b/components/roundtable/audio-indicator.tsx @@ -0,0 +1,43 @@ +'use client'; + +import { motion } from 'motion/react'; + +export type AudioIndicatorState = 'idle' | 'generating' | 'playing'; + +interface AudioIndicatorProps { + state: AudioIndicatorState; + agentColor?: string; +} + +const BAR_COUNT = 4; + +export function AudioIndicator({ state, agentColor = '#10b981' }: AudioIndicatorProps) { + if (state === 'idle') return null; + + const color = state === 'generating' ? 'rgba(251, 191, 36, 0.7)' : agentColor; + const cycleDuration = state === 'generating' ? 0.8 : 0.5; + + return ( + + {Array.from({ length: BAR_COUNT }).map((_, i) => ( + + ))} + + ); +} diff --git a/components/roundtable/index.tsx b/components/roundtable/index.tsx index 9ab09939b..31a3cd581 100644 --- a/components/roundtable/index.tsx +++ b/components/roundtable/index.tsx @@ -16,6 +16,8 @@ import { Loader2, } from 'lucide-react'; import { cn } from '@/lib/utils'; +import { AudioIndicator } from './audio-indicator'; +import type { AudioIndicatorState } from './audio-indicator'; import { CanvasToolbar } from '@/components/canvas/canvas-toolbar'; import { useAudioRecorder } from '@/lib/hooks/use-audio-recorder'; import { useI18n } from '@/lib/hooks/use-i18n'; @@ -47,6 +49,8 @@ interface RoundtableProps { readonly isStreaming?: boolean; readonly sessionType?: 'qa' | 'discussion'; readonly speakingAgentId?: string | null; + readonly audioIndicatorState?: AudioIndicatorState; + readonly audioAgentId?: string | null; readonly speechProgress?: number | null; // StreamBuffer reveal progress (0–1) for auto-scroll readonly showEndFlash?: boolean; readonly endFlashSessionType?: 'qa' | 'discussion'; @@ -110,6 +114,8 @@ export function Roundtable({ isStreaming, sessionType, speakingAgentId, + audioIndicatorState, + audioAgentId, speechProgress: _speechProgress, showEndFlash, endFlashSessionType = 'discussion', @@ -456,7 +462,10 @@ export function Roundtable({ ttsEnabled={ttsEnabled} ttsMuted={ttsMuted} ttsVolume={ttsVolume} - onToggleMute={() => ttsEnabled && setTTSMuted(!ttsMuted)} + onToggleMute={() => { + if (!ttsEnabled) return; + setTTSMuted(!ttsMuted); + }} onVolumeChange={(v) => setTTSVolume(v)} autoPlayLecture={autoPlayLecture} onToggleAutoPlay={() => setAutoPlayLecture(!autoPlayLecture)} @@ -1020,6 +1029,29 @@ export function Roundtable({ })()}
+ {/* Agent name + audio indicator header */} + {bubbleRole !== 'user' && bubbleName && ( +
+ + {bubbleName} + + +
+ )} {isBubbleLoading ? (
state.ttsProviderId); const ttsVoice = useSettingsStore((state) => state.ttsVoice); - const ttsSpeed = useSettingsStore((state) => state.ttsSpeed); const ttsProvidersConfig = useSettingsStore((state) => state.ttsProvidersConfig); const setTTSProvider = useSettingsStore((state) => state.setTTSProvider); const setTTSVoice = useSettingsStore((state) => state.setTTSVoice); - const setTTSSpeed = useSettingsStore((state) => state.setTTSSpeed); const setTTSProviderConfig = useSettingsStore((state) => state.setTTSProviderConfig); // ASR state @@ -106,16 +98,6 @@ export function AudioSettings({ onSave }: AudioSettingsProps = {}) { onSave?.(); }; - const handleTTSVoiceChange = (voice: string) => { - setTTSVoice(voice); - onSave?.(); - }; - - const handleTTSSpeedChange = (speed: number) => { - setTTSSpeed(speed); - onSave?.(); - }; - const handleTTSProviderConfigChange = ( providerId: TTSProviderId, config: Partial<{ apiKey: string; baseUrl: string; enabled: boolean }>, @@ -150,12 +132,6 @@ export function AudioSettings({ onSave }: AudioSettingsProps = {}) { const [selectedLocale, setSelectedLocale] = useState('all'); // Test state - const [testingTTS, setTestingTTS] = useState(false); - const [testText, setTestText] = useState(t('settings.ttsTestTextDefault')); - const [ttsTestStatus, setTTSTestStatus] = useState<'idle' | 'testing' | 'success' | 'error'>( - 'idle', - ); - const [ttsTestMessage, setTTSTestMessage] = useState(''); const [isRecording, setIsRecording] = useState(false); const [asrResult, setASRResult] = useState(''); const [asrTestStatus, setASRTestStatus] = useState<'idle' | 'testing' | 'success' | 'error'>( @@ -170,13 +146,6 @@ export function AudioSettings({ onSave }: AudioSettingsProps = {}) { const asrProvider = ASR_PROVIDERS[asrProviderId] ?? ASR_PROVIDERS['openai-whisper']; - // Update test text when language changes (derived state pattern) - const [prevT, setPrevT] = useState(() => t); - if (t !== prevT) { - setPrevT(t); - setTestText(t('settings.ttsTestTextDefault')); - } - // Reset locale filter when provider changes (derived state pattern) const [prevTTSProviderId, setPrevTTSProviderId] = useState(ttsProviderId); if (ttsProviderId !== prevTTSProviderId) { @@ -186,7 +155,7 @@ export function AudioSettings({ onSave }: AudioSettingsProps = {}) { } } - const stopTTSPreview = useCallback((resetState = true) => { + const stopTTSPreview = useCallback(() => { ttsTestRequestIdRef.current += 1; browserPreviewCancelRef.current?.(); browserPreviewCancelRef.current = null; @@ -198,9 +167,6 @@ export function AudioSettings({ onSave }: AudioSettingsProps = {}) { URL.revokeObjectURL(audioUrlRef.current); audioUrlRef.current = null; } - if (resetState) { - setTestingTTS(false); - } }, []); // Update voice selection when locale filter changes @@ -222,9 +188,7 @@ export function AudioSettings({ onSave }: AudioSettingsProps = {}) { }, [selectedLocale, ttsProviderId, azureVoices, setTTSVoice]); useEffect(() => { - stopTTSPreview(false); - setTTSTestStatus('idle'); - setTTSTestMessage(''); + stopTTSPreview(); }, [ttsProviderId, stopTTSPreview]); // Initialize and reset TTS voice when provider changes @@ -275,7 +239,7 @@ export function AudioSettings({ onSave }: AudioSettingsProps = {}) { useEffect(() => { return () => { - stopTTSPreview(false); + stopTTSPreview(); }; }, [stopTTSPreview]); @@ -288,123 +252,6 @@ export function AudioSettings({ onSave }: AudioSettingsProps = {}) { setASRResult(''); } - // Test TTS - const handleTestTTS = async () => { - if (!testText.trim()) { - return; - } - - const requestId = ttsTestRequestIdRef.current + 1; - ttsTestRequestIdRef.current = requestId; - - setTestingTTS(true); - setTTSTestStatus('testing'); - setTTSTestMessage(''); - - try { - if (ttsProviderId === 'browser-native-tts') { - if (!('speechSynthesis' in window)) { - setTTSTestStatus('error'); - setTTSTestMessage(t('settings.browserTTSNotSupported')); - return; - } - - const voices = await ensureVoicesLoaded(); - if (ttsTestRequestIdRef.current !== requestId) { - return; - } - if (voices.length === 0) { - setTTSTestStatus('error'); - setTTSTestMessage(t('settings.browserTTSNoVoices')); - return; - } - - const controller = playBrowserTTSPreview({ - text: testText, - voice: ttsVoice, - rate: ttsSpeed, - voices, - }); - browserPreviewCancelRef.current = controller.cancel; - await controller.promise; - - if (ttsTestRequestIdRef.current !== requestId) { - return; - } - setTTSTestStatus('success'); - setTTSTestMessage(t('settings.ttsTestSuccess')); - return; - } - - const requestBody: Record = { - text: testText, - audioId: 'tts-test', - ttsProviderId, - ttsVoice: ttsVoice, - ttsSpeed: ttsSpeed, - }; - - const apiKeyValue = ttsProvidersConfig[ttsProviderId]?.apiKey; - if (apiKeyValue && apiKeyValue.trim()) { - requestBody.ttsApiKey = apiKeyValue; - } - - const baseUrlValue = ttsProvidersConfig[ttsProviderId]?.baseUrl; - if (baseUrlValue && baseUrlValue.trim()) { - requestBody.ttsBaseUrl = baseUrlValue; - } - - const response = await fetch('/api/generate/tts', { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify(requestBody), - }); - - const data = await response - .json() - .catch(() => ({ success: false, error: response.statusText })); - if (ttsTestRequestIdRef.current !== requestId) { - return; - } - if (response.ok && data.success) { - const binaryStr = atob(data.base64); - const bytes = new Uint8Array(binaryStr.length); - for (let i = 0; i < binaryStr.length; i++) bytes[i] = binaryStr.charCodeAt(i); - const audioBlob = new Blob([bytes], { type: `audio/${data.format}` }); - if (audioUrlRef.current) { - URL.revokeObjectURL(audioUrlRef.current); - } - const audioUrl = URL.createObjectURL(audioBlob); - audioUrlRef.current = audioUrl; - if (audioRef.current) { - audioRef.current.src = audioUrl; - await audioRef.current.play(); - } - setTTSTestStatus('success'); - setTTSTestMessage(t('settings.ttsTestSuccess')); - } else { - setTTSTestStatus('error'); - setTTSTestMessage(data.error || t('settings.ttsTestFailed')); - } - } catch (error) { - if (ttsTestRequestIdRef.current !== requestId || isBrowserTTSAbortError(error)) { - return; - } - log.error('TTS test failed:', error); - setTTSTestStatus('error'); - setTTSTestMessage( - error instanceof Error && error.message - ? `${t('settings.ttsTestFailed')}: ${error.message}` - : t('settings.ttsTestFailed'), - ); - } finally { - if (ttsTestRequestIdRef.current === requestId) { - browserPreviewCancelRef.current = null; - setTestingTTS(false); - } - } - }; - // Test ASR const handleToggleASRRecording = async () => { if (isRecording) { @@ -571,6 +418,8 @@ export function AudioSettings({ onSave }: AudioSettingsProps = {}) { ttsEnabled ? 'opacity-100' : 'opacity-40 max-h-0 pointer-events-none', )} > +

{t('settings.ttsVoiceConfigHint')}

+
- handleTTSProviderConfigChange(ttsProviderId, { - apiKey: e.target.value, - }) - } - className="font-mono text-sm pr-10" - /> - -
-
- -
- +
+
+ +
handleTTSProviderConfigChange(ttsProviderId, { - baseUrl: e.target.value, + apiKey: e.target.value, }) } - className="text-sm" + className="font-mono text-sm pr-10" /> +
- {(() => { - const effectiveBaseUrl = - ttsProvidersConfig[ttsProviderId]?.baseUrl || ttsProvider.defaultBaseUrl || ''; - if (!effectiveBaseUrl) return null; - - // Get endpoint path based on provider - let endpointPath = ''; - switch (ttsProviderId) { - case 'openai-tts': - case 'glm-tts': - endpointPath = '/audio/speech'; - break; - case 'azure-tts': - endpointPath = '/cognitiveservices/v1'; - break; - case 'qwen-tts': - endpointPath = '/services/aigc/multimodal-generation/generation'; - break; - default: - endpointPath = ''; - } - - if (!endpointPath) return null; - const fullUrl = effectiveBaseUrl + endpointPath; - return ( -

- {t('settings.requestUrl')}: {fullUrl} -

- ); - })()} - - )} - - {/* Voice Selection Row */} -
- {/* Language Filter for Azure TTS */} - {ttsProviderId === 'azure-tts' && ( -
- - -
- )} - -
- - -
- {ttsProvider.speedRange && (
- -
- handleTTSSpeedChange(value[0])} - min={ttsProvider.speedRange.min} - max={ttsProvider.speedRange.max} - step={0.1} - className="flex-1" - /> - - {ttsSpeed.toFixed(1)}x - -
-
- )} -
- - {/* Test TTS Section */} -
- -
- setTestText(e.target.value)} - className="flex-1" - /> - -
-
- - {ttsTestMessage && ( -
-
- {ttsTestStatus === 'success' && ( - - )} - {ttsTestStatus === 'error' && } -

{ttsTestMessage}

+ + + handleTTSProviderConfigChange(ttsProviderId, { + baseUrl: e.target.value, + }) + } + className="text-sm" + />
)} - -
diff --git a/components/stage.tsx b/components/stage.tsx index dc5379f34..0684afc43 100644 --- a/components/stage.tsx +++ b/components/stage.tsx @@ -14,6 +14,8 @@ import { PlaybackEngine, computePlaybackView } from '@/lib/playback'; import type { EngineMode, TriggerEvent, Effect } from '@/lib/playback'; import { ActionEngine } from '@/lib/action/engine'; import { createAudioPlayer } from '@/lib/utils/audio-player'; +import { useDiscussionTTS } from '@/lib/hooks/use-discussion-tts'; +import type { AudioIndicatorState } from '@/components/roundtable/audio-indicator'; import type { Action, DiscussionAction, SpeechAction } from '@/lib/types/action'; // Playback state persistence removed — refresh always starts from the beginning import { ChatArea, type ChatAreaRef } from '@/components/chat/chat-area'; @@ -100,6 +102,8 @@ export function Stage({ // Selected agents from settings store (Zustand) const selectedAgentIds = useSettingsStore((s) => s.selectedAgentIds); + const ttsMuted = useSettingsStore((s) => s.ttsMuted); + const ttsEnabled = useSettingsStore((s) => s.ttsEnabled); // Generate participants from selected agents const participants = useMemo( @@ -107,6 +111,27 @@ export function Stage({ [selectedAgentIds, t], ); + // Resolved AgentConfig array for hooks that need full agent objects + // Subscribe to the agents record so voiceConfig changes trigger re-resolution + const agentsRecord = useAgentRegistry((s) => s.agents); + const selectedAgents = useMemo( + () => selectedAgentIds.map((id) => agentsRecord[id]).filter((a): a is AgentConfig => a != null), + [agentsRecord, selectedAgentIds], + ); + + // Discussion TTS: audio indicator state + const [audioIndicatorState, setAudioIndicatorState] = useState('idle'); + const [audioAgentId, setAudioAgentId] = useState(null); + + const discussionTTS = useDiscussionTTS({ + enabled: ttsEnabled && !ttsMuted, + agents: selectedAgents, + onAudioStateChange: (agentId, state) => { + setAudioAgentId(agentId); + setAudioIndicatorState(state); + }, + }); + // Pick a student agent for discussion trigger (prioritize student > non-teacher > fallback) const pickStudentAgent = useCallback((): string => { const registry = useAgentRegistry.getState(); @@ -221,8 +246,11 @@ export function Stage({ setTimeout(() => setShowEndFlash(false), 1800); } + // Stop any in-flight discussion TTS audio + discussionTTS.cleanup(); + resetLiveState(); - }, [chatSessionType, resetLiveState]); + }, [chatSessionType, resetLiveState, discussionTTS]); // Shared stop-discussion handler (used by both Roundtable and Canvas toolbar) const handleStopDiscussion = useCallback(async () => { @@ -246,6 +274,9 @@ export function Stage({ discussionAbortRef.current = null; } + // Stop any in-flight discussion TTS audio on scene switch + discussionTTS.cleanup(); + // Reset all roundtable/live state so scenes are fully isolated resetSceneState(); @@ -335,6 +366,8 @@ export function Stage({ discussionAbortRef.current = null; } setDiscussionTrigger(null); + // Stop any in-flight discussion TTS audio + discussionTTS.cleanup(); // Clear roundtable state (idempotent — may already be cleared by doSessionCleanup) resetLiveState(); // Only show flash for engine-initiated ends (not manual stop — that's handled by doSessionCleanup) @@ -441,7 +474,6 @@ export function Stage({ }, []); // Sync mute state from settings store to audioPlayer - const ttsMuted = useSettingsStore((s) => s.ttsMuted); useEffect(() => { audioPlayerRef.current.setMuted(ttsMuted); }, [ttsMuted]); @@ -748,6 +780,8 @@ export function Stage({ discussionRequest={discussionRequest} engineMode={engineMode} isStreaming={chatIsStreaming} + audioIndicatorState={audioIndicatorState} + audioAgentId={audioAgentId} sessionType={ chatSessionType === 'qa' ? 'qa' @@ -890,6 +924,8 @@ export function Stage({ setIsCueUser(true); }} onStopSession={doSessionCleanup} + onSegmentSealed={discussionTTS.handleSegmentSealed} + shouldHoldAfterReveal={discussionTTS.shouldHold} /> {/* Scene switch confirmation dialog */} diff --git a/lib/audio/voice-resolver.ts b/lib/audio/voice-resolver.ts new file mode 100644 index 000000000..2018add3f --- /dev/null +++ b/lib/audio/voice-resolver.ts @@ -0,0 +1,103 @@ +import type { TTSProviderId } from '@/lib/audio/types'; +import type { AgentConfig } from '@/lib/orchestration/registry/types'; +import { TTS_PROVIDERS } from '@/lib/audio/constants'; + +export interface ResolvedVoice { + providerId: TTSProviderId; + voiceId: string; +} + +/** + * Resolve the TTS provider + voice for an agent. + * 1. If agent has voiceConfig and the voice is still valid, use it + * 2. Otherwise, use the first available provider + deterministic voice by index + */ +export function resolveAgentVoice( + agent: AgentConfig, + agentIndex: number, + availableProviders: ProviderWithVoices[], +): ResolvedVoice { + // Agent-specific config + if (agent.voiceConfig) { + // Browser-native voices are dynamic (not in static registry), so skip validation + if (agent.voiceConfig.providerId === 'browser-native-tts') { + return agent.voiceConfig; + } + const list = getServerVoiceList(agent.voiceConfig.providerId); + if (list.includes(agent.voiceConfig.voiceId)) { + return agent.voiceConfig; + } + } + + // Fallback: first available provider, deterministic voice + if (availableProviders.length > 0) { + const first = availableProviders[0]; + return { + providerId: first.providerId, + voiceId: first.voices[agentIndex % first.voices.length].id, + }; + } + + return { providerId: 'browser-native-tts', voiceId: 'default' }; +} + +/** + * Get the list of voice IDs for a TTS provider. + * For browser-native-tts, returns empty (browser voices are dynamic). + */ +export function getServerVoiceList(providerId: TTSProviderId): string[] { + if (providerId === 'browser-native-tts') return []; + const provider = TTS_PROVIDERS[providerId]; + if (!provider) return []; + return provider.voices.map((v) => v.id); +} + +export interface ProviderWithVoices { + providerId: TTSProviderId; + providerName: string; + voices: Array<{ id: string; name: string }>; +} + +/** + * Get all available providers and their voices for the voice picker UI. + * A provider is available if it has an API key or is server-configured. + * Browser-native-tts is excluded (no static voice list). + */ +export function getAvailableProvidersWithVoices( + ttsProvidersConfig: Record< + string, + { apiKey?: string; enabled?: boolean; isServerConfigured?: boolean } + >, +): ProviderWithVoices[] { + const result: ProviderWithVoices[] = []; + + for (const [id, config] of Object.entries(TTS_PROVIDERS)) { + const providerId = id as TTSProviderId; + if (providerId === 'browser-native-tts') continue; + if (config.voices.length === 0) continue; + + const providerConfig = ttsProvidersConfig[providerId]; + const hasApiKey = providerConfig?.apiKey && providerConfig.apiKey.trim().length > 0; + const isServerConfigured = providerConfig?.isServerConfigured === true; + + if (hasApiKey || isServerConfigured) { + result.push({ + providerId, + providerName: config.name, + voices: config.voices.map((v) => ({ id: v.id, name: v.name })), + }); + } + } + + return result; +} + +/** + * Find a voice display name across all providers. + */ +export function findVoiceDisplayName(providerId: TTSProviderId, voiceId: string): string { + const provider = TTS_PROVIDERS[providerId]; + if (!provider) return voiceId; + const voice = provider.voices.find((v) => v.id === voiceId); + return voice?.name ?? voiceId; +} diff --git a/lib/buffer/stream-buffer.ts b/lib/buffer/stream-buffer.ts index 6ba94ea6f..fb8e21699 100644 --- a/lib/buffer/stream-buffer.ts +++ b/lib/buffer/stream-buffer.ts @@ -124,6 +124,18 @@ export interface StreamBufferCallbacks { directorState?: DirectorState; }): void; onError(message: string): void; + onSegmentSealed?: ( + messageId: string, + partId: string, + fullText: string, + agentId: string | null, + ) => void; + /** + * When provided, called after a text item is fully revealed and sealed. + * If it returns true, the tick loop will NOT advance to the next item — + * the bubble stays on the current text (e.g. waiting for TTS playback to finish). + */ + shouldHoldAfterReveal?: () => boolean; } // ─── Options ───────────────────────────────────────────────────────── @@ -165,6 +177,8 @@ export class StreamBuffer { // Dwell / delay counters (in ticks) private _dwellTicksRemaining = 0; + /** True when a text item's post-delay has elapsed and we're waiting for TTS to finish. */ + private _holdingForTTS = false; // Config private readonly tickMs: number; @@ -403,6 +417,9 @@ export class StreamBuffer { const item = this.items[i]; if (item.kind === 'text' && !item.sealed) { item.sealed = true; + // Ordering invariant: sealLastText() is called BEFORE pushAgentEnd/pushAgentStart, + // so this.currentAgentId still refers to the agent whose text is being sealed. + this.cb.onSegmentSealed?.(item.messageId, item.partId, item.text, this.currentAgentId); break; } // Stop searching once we hit a non-text item @@ -416,6 +433,21 @@ export class StreamBuffer { // Honour dwell / action-delay countdown before advancing if (this._dwellTicksRemaining > 0) { this._dwellTicksRemaining--; + if (this._dwellTicksRemaining === 0 && this._holdingForTTS) { + // Post-text delay just finished — fall through to the TTS hold check below + } else { + return; + } + } + + // TTS hold: after post-text delay, keep the bubble on screen while audio plays + if (this._holdingForTTS) { + if (this.cb.shouldHoldAfterReveal?.()) { + return; // TTS still playing — stay on current item + } + this._holdingForTTS = false; + // TTS done — continue to process next item + this.advanceNonText(); return; } @@ -450,9 +482,19 @@ export class StreamBuffer { // before the next action or agent turn fires. if (this.postTextDelayTicks > 0) { this._dwellTicksRemaining = this.postTextDelayTicks; + // If TTS hold callback exists, mark that we need to check it after delay + if (this.cb.shouldHoldAfterReveal) { + this._holdingForTTS = true; + } return; // next tick will count down, then advanceNonText } + // No post-text delay — check TTS hold immediately + if (this.cb.shouldHoldAfterReveal?.()) { + this._holdingForTTS = true; + return; // TTS still playing — hold here + } + // Process any immediately-advanceable items in the same tick // (e.g. action badges right after text) this.advanceNonText(); diff --git a/lib/hooks/use-discussion-tts.ts b/lib/hooks/use-discussion-tts.ts new file mode 100644 index 000000000..2075cf053 --- /dev/null +++ b/lib/hooks/use-discussion-tts.ts @@ -0,0 +1,224 @@ +'use client'; + +import { useCallback, useEffect, useRef } from 'react'; +import { useSettingsStore } from '@/lib/store/settings'; +import { useBrowserTTS } from '@/lib/hooks/use-browser-tts'; +import { resolveAgentVoice, getAvailableProvidersWithVoices } from '@/lib/audio/voice-resolver'; +import type { AgentConfig } from '@/lib/orchestration/registry/types'; +import type { TTSProviderId } from '@/lib/audio/types'; +import type { AudioIndicatorState } from '@/components/roundtable/audio-indicator'; + +interface DiscussionTTSOptions { + enabled: boolean; + agents: AgentConfig[]; + onAudioStateChange?: (agentId: string | null, state: AudioIndicatorState) => void; +} + +interface QueueItem { + messageId: string; + partId: string; + text: string; + agentId: string | null; + providerId: TTSProviderId; + voiceId: string; +} + +export function useDiscussionTTS({ enabled, agents, onAudioStateChange }: DiscussionTTSOptions) { + const ttsProvidersConfig = useSettingsStore((s) => s.ttsProvidersConfig); + const ttsSpeed = useSettingsStore((s) => s.ttsSpeed); + const ttsMuted = useSettingsStore((s) => s.ttsMuted); + const ttsVolume = useSettingsStore((s) => s.ttsVolume); + const playbackSpeed = useSettingsStore((s) => s.playbackSpeed); + // Global lecture voice — used as fallback for teacher agent + const globalTtsProviderId = useSettingsStore((s) => s.ttsProviderId); + const globalTtsVoice = useSettingsStore((s) => s.ttsVoice); + + const queueRef = useRef([]); + const isPlayingRef = useRef(false); + const abortControllerRef = useRef(null); + const audioRef = useRef(null); + const onAudioStateChangeRef = useRef(onAudioStateChange); + onAudioStateChangeRef.current = onAudioStateChange; + const processQueueRef = useRef<() => void>(() => {}); + + const { speak: browserSpeak, cancel: browserCancel } = useBrowserTTS({ + rate: ttsSpeed, + onEnd: () => { + isPlayingRef.current = false; + onAudioStateChangeRef.current?.(null, 'idle'); + processQueueRef.current(); + }, + }); + const browserCancelRef = useRef(browserCancel); + browserCancelRef.current = browserCancel; + const browserSpeakRef = useRef(browserSpeak); + browserSpeakRef.current = browserSpeak; + + // Build agent index map for deterministic voice resolution + const agentIndexMap = useRef>(new Map()); + useEffect(() => { + const map = new Map(); + agents.forEach((agent, i) => map.set(agent.id, i)); + agentIndexMap.current = map; + }, [agents]); + + const resolveVoiceForAgent = useCallback( + (agentId: string | null): { providerId: TTSProviderId; voiceId: string } => { + const providers = getAvailableProvidersWithVoices(ttsProvidersConfig); + if (!agentId) { + if (providers.length > 0) { + return { + providerId: providers[0].providerId, + voiceId: providers[0].voices[0]?.id ?? 'default', + }; + } + return { providerId: 'browser-native-tts', voiceId: 'default' }; + } + const agent = agents.find((a) => a.id === agentId); + if (!agent) { + if (providers.length > 0) { + return { + providerId: providers[0].providerId, + voiceId: providers[0].voices[0]?.id ?? 'default', + }; + } + return { providerId: 'browser-native-tts', voiceId: 'default' }; + } + // Teacher: always use global lecture voice (single source of truth with settings) + if (agent.role === 'teacher') { + return { providerId: globalTtsProviderId, voiceId: globalTtsVoice }; + } + const index = agentIndexMap.current.get(agentId) ?? 0; + return resolveAgentVoice(agent, index, providers); + }, + [agents, ttsProvidersConfig, globalTtsProviderId, globalTtsVoice], + ); + + const processQueue = useCallback(async () => { + if (isPlayingRef.current || queueRef.current.length === 0) return; + if (!enabled || ttsMuted) { + queueRef.current = []; + return; + } + + isPlayingRef.current = true; + const item = queueRef.current.shift()!; + + // Browser TTS + if (item.providerId === 'browser-native-tts') { + onAudioStateChangeRef.current?.(item.agentId, 'playing'); + browserSpeakRef.current(item.text, item.voiceId); + return; + } + + // Server TTS — use the item's provider, not the global one + onAudioStateChangeRef.current?.(item.agentId, 'generating'); + const controller = new AbortController(); + abortControllerRef.current = controller; + + try { + const providerConfig = ttsProvidersConfig[item.providerId]; + const res = await fetch('/api/generate/tts', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + text: item.text, + audioId: item.partId, + ttsProviderId: item.providerId, + ttsVoice: item.voiceId, + ttsSpeed: ttsSpeed, + ttsApiKey: providerConfig?.apiKey, + ttsBaseUrl: providerConfig?.serverBaseUrl || providerConfig?.baseUrl, + }), + signal: controller.signal, + }); + + if (!res.ok) throw new Error(`TTS API error: ${res.status}`); + + const data = await res.json(); + if (!data.base64) throw new Error('No audio in response'); + + onAudioStateChangeRef.current?.(item.agentId, 'playing'); + const audioUrl = `data:audio/${data.format || 'mp3'};base64,${data.base64}`; + const audio = new Audio(audioUrl); + audio.playbackRate = playbackSpeed; + audio.volume = ttsMuted ? 0 : ttsVolume; + audioRef.current = audio; + audio.addEventListener('ended', () => { + isPlayingRef.current = false; + onAudioStateChangeRef.current?.(item.agentId, 'idle'); + queueMicrotask(() => processQueueRef.current()); + }); + audio.addEventListener('error', () => { + isPlayingRef.current = false; + onAudioStateChangeRef.current?.(item.agentId, 'idle'); + queueMicrotask(() => processQueueRef.current()); + }); + await audio.play(); + } catch (err) { + if ((err as Error).name !== 'AbortError') { + console.error('[DiscussionTTS] TTS generation failed:', err); + } + isPlayingRef.current = false; + onAudioStateChangeRef.current?.(item.agentId, 'idle'); + queueMicrotask(() => processQueueRef.current()); + } + }, [enabled, ttsMuted, ttsVolume, ttsProvidersConfig, ttsSpeed, playbackSpeed]); + + processQueueRef.current = processQueue; + + const handleSegmentSealed = useCallback( + (messageId: string, partId: string, fullText: string, agentId: string | null) => { + if (!enabled || ttsMuted || !fullText.trim()) return; + + const { providerId, voiceId } = resolveVoiceForAgent(agentId); + queueRef.current.push({ messageId, partId, text: fullText, agentId, providerId, voiceId }); + + if (!isPlayingRef.current) { + processQueueRef.current(); + } else if (providerId !== 'browser-native-tts') { + onAudioStateChangeRef.current?.(agentId, 'generating'); + } + }, + [enabled, ttsMuted, resolveVoiceForAgent], + ); + + const cleanup = useCallback(() => { + abortControllerRef.current?.abort(); + abortControllerRef.current = null; + if (audioRef.current) { + audioRef.current.pause(); + audioRef.current.src = ''; + audioRef.current = null; + } + browserCancelRef.current(); + queueRef.current = []; + isPlayingRef.current = false; + onAudioStateChangeRef.current?.(null, 'idle'); + }, []); + + // Sync playbackSpeed to currently playing audio in real-time + useEffect(() => { + if (audioRef.current) { + audioRef.current.playbackRate = playbackSpeed; + } + }, [playbackSpeed]); + + // Sync volume and mute to currently playing audio in real-time + useEffect(() => { + if (audioRef.current) { + audioRef.current.volume = ttsMuted ? 0 : ttsVolume; + } + }, [ttsVolume, ttsMuted]); + + useEffect(() => cleanup, [cleanup]); + + /** Returns true when TTS audio is still playing or queued — used by StreamBuffer hold logic. */ + const shouldHold = useCallback(() => isPlayingRef.current || queueRef.current.length > 0, []); + + return { + handleSegmentSealed, + cleanup, + shouldHold, + }; +} diff --git a/lib/i18n/chat.ts b/lib/i18n/chat.ts index 4a5421399..1bb535d3e 100644 --- a/lib/i18n/chat.ts +++ b/lib/i18n/chat.ts @@ -55,6 +55,9 @@ export const chatZhCN = { readyToLearn: '准备好一起学习了吗?', expandedTitle: '课堂角色配置', configTooltip: '点击配置课堂角色', + voiceLabel: '音色', + voiceLoading: '加载中...', + voiceAutoAssign: '音色将自动分配', }, proactiveCard: { discussion: '讨论', @@ -126,6 +129,9 @@ export const chatEnUS = { readyToLearn: 'Ready to learn together?', expandedTitle: 'Classroom Role Config', configTooltip: 'Click to configure classroom roles', + voiceLabel: 'Voice', + voiceLoading: 'Loading...', + voiceAutoAssign: 'Voices will be auto-assigned', }, proactiveCard: { discussion: 'Discussion', diff --git a/lib/i18n/settings.ts b/lib/i18n/settings.ts index 3dba9f669..3ba0be4f3 100644 --- a/lib/i18n/settings.ts +++ b/lib/i18n/settings.ts @@ -200,6 +200,7 @@ export const settingsZhCN = { asrDescription: 'ASR (Automatic Speech Recognition) - 将语音转换为文字', enableTTS: '启用语音合成', ttsEnabledDescription: '开启后,课程生成时将自动合成语音', + ttsVoiceConfigHint: '每个 Agent 的音色可在首页「课堂角色配置」中设置', enableASR: '启用语音识别', asrEnabledDescription: '开启后,学生可使用麦克风进行语音输入', ttsProvider: 'TTS 提供商', @@ -788,6 +789,8 @@ export const settingsEnUS = { asrDescription: 'ASR (Automatic Speech Recognition) - Convert speech to text', enableTTS: 'Enable Text-to-Speech', ttsEnabledDescription: 'When enabled, speech audio will be generated during course creation', + ttsVoiceConfigHint: + 'Per-agent voice can be configured in "Classroom Role Config" on the homepage', enableASR: 'Enable Speech Recognition', asrEnabledDescription: 'When enabled, students can use microphone for voice input', ttsProvider: 'TTS Provider', diff --git a/lib/orchestration/registry/store.ts b/lib/orchestration/registry/store.ts index b5e7b8600..893a5fa63 100644 --- a/lib/orchestration/registry/store.ts +++ b/lib/orchestration/registry/store.ts @@ -7,6 +7,7 @@ import { create } from 'zustand'; import { persist } from 'zustand/middleware'; import type { AgentConfig } from './types'; import { getActionsForRole } from './types'; +import type { TTSProviderId } from '@/lib/audio/types'; import { USER_AVATAR } from '@/lib/types/roundtable'; import type { Participant, ParticipantRole } from '@/lib/types/roundtable'; import { useUserProfileStore } from '@/lib/store/user-profile'; @@ -231,7 +232,7 @@ export const useAgentRegistry = create()( }), { name: 'agent-registry-storage', - version: 10, // Bumped: exclude generated agents from persisted cache + version: 11, // Bumped: add voiceOverrides field to AgentConfig migrate: (persistedState: unknown) => persistedState, // Merge persisted state with default agents // Default agents always use code-defined values (not cached) @@ -377,6 +378,7 @@ export async function saveGeneratedAgents( avatar: string; color: string; priority: number; + voiceConfig?: { providerId: string; voiceId: string }; }>, ): Promise { const { db } = await import('@/lib/utils/database'); @@ -396,14 +398,23 @@ export async function saveGeneratedAgents( // Add to registry for (const record of records) { + const { voiceConfig, ...rest } = record; registry.addAgent({ - ...record, + ...rest, allowedActions: getActionsForRole(record.role), isDefault: false, isGenerated: true, boundStageId: stageId, createdAt: new Date(record.createdAt), updatedAt: new Date(record.createdAt), + ...(voiceConfig + ? { + voiceConfig: { + providerId: voiceConfig.providerId as TTSProviderId, + voiceId: voiceConfig.voiceId, + }, + } + : {}), }); } diff --git a/lib/orchestration/registry/types.ts b/lib/orchestration/registry/types.ts index ba978b02c..6631e9b46 100644 --- a/lib/orchestration/registry/types.ts +++ b/lib/orchestration/registry/types.ts @@ -3,6 +3,8 @@ * Defines the structure for configurable AI agents in the multi-agent system */ +import type { TTSProviderId } from '@/lib/audio/types'; + export interface AgentConfig { id: string; // Unique agent ID name: string; // Display name (Chinese) @@ -12,6 +14,7 @@ export interface AgentConfig { color: string; // UI theme color (hex) allowedActions: string[]; // Action types this agent can use priority: number; // Priority for director selection (1-10) + voiceConfig?: { providerId: TTSProviderId; voiceId: string }; // Per-agent TTS voice selection // Metadata createdAt: Date; @@ -32,6 +35,7 @@ export interface AgentTemplate { color: string; allowedActions: string[]; priority: number; + voiceConfig?: { providerId: TTSProviderId; voiceId: string }; // Per-agent TTS voice selection // LLM-generated agent fields isGenerated?: boolean; // true for LLM-generated agents