Skip to content

Commit cc04706

Browse files
sjwoo1999claude
and committed
feat: implement Web Speech API fallback for STT
- Create useFallbackSTT hook with browser Speech Recognition API
- Auto-switch to fallback when WebSocket STT times out
- Support for Korean (ko-KR) speech recognition
- Continuous recognition with auto-restart
- Network error auto-retry mechanism
- Fallback cascade: WebSocket → Browser API → Disabled
- Cleanup on unmount

Features:
- Browser compatibility check
- Graceful error handling
- User-friendly error messages
- Automatic mode switching

Resolves: P0-2 (Fallback STT)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent eb9a76c commit cc04706

2 files changed

Lines changed: 318 additions & 3 deletions

File tree

src/App.tsx

Lines changed: 55 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import { useWebSocket } from './hooks/useWebSocket';
2121
import { useKeyboardShortcuts } from './hooks/useKeyboardShortcuts';
2222
import { useTheme } from './contexts/ThemeContext';
2323
import { useOverallConnectionStatus } from './hooks/useOverallConnectionStatus';
24+
import { useFallbackSTT } from './hooks/useFallbackSTT';
2425
import type { EmotionType, VADMetrics } from './types';
2526
import type { KeyboardShortcut } from './hooks/useKeyboardShortcuts';
2627
import { VADMonitorSkeleton } from './components/Skeleton/Skeleton';
@@ -157,6 +158,40 @@ function App() {
157158
const sttTimeoutRef = useRef<number | null>(null);
158159
const lastSpeechTimeRef = useRef<number>(0);
159160
const STT_TIMEOUT_MS = 5000; // 5 seconds
161+
const [sttMode, setSTTMode] = useState<'websocket' | 'fallback' | 'disabled'>('websocket');
162+
163+
// Fallback STT (Web Speech API)
164+
const fallbackSTT = useFallbackSTT({
165+
onResult: (text) => {
166+
Logger.info('✅ Fallback STT result', { text });
167+
setSttText(text);
168+
169+
// Dispatch user message
170+
if (text.trim()) {
171+
window.dispatchEvent(new CustomEvent('ai:userMessage', {
172+
detail: {
173+
message: text,
174+
timestamp: Date.now()
175+
}
176+
}));
177+
Logger.debug('🗣️ User message dispatched from fallback STT', { text });
178+
179+
// Trigger AI response
180+
sendToSession({
181+
type: 'request_ai_response',
182+
data: {
183+
message: text,
184+
emotion: currentEmotion,
185+
timestamp: Date.now()
186+
}
187+
});
188+
}
189+
},
190+
onError: (error) => {
191+
Logger.error('❌ Fallback STT error', { error });
192+
setOverlayError(`폴백 음성 인식 오류: ${error}`);
193+
},
194+
});
160195

161196
// WebSocket 연결
162197
const { isConnected: wsConnected, connectionStatus, connect: connectWS, disconnect: disconnectWS, suppressReconnect: suppressWSReconnect, landmarksWs, sendToSession } = useWebSocket({
@@ -295,7 +330,19 @@ function App() {
295330
lastSpeechTime: new Date(lastSpeechTimeRef.current).toISOString()
296331
});
297332

298-
setOverlayError('음성 인식 시간 초과. 다시 말씀해주세요.');
333+
// Try fallback STT if supported
334+
if (sttMode === 'websocket' && fallbackSTT.isSupported()) {
335+
Logger.info('🔄 Switching to fallback STT (Web Speech API)');
336+
setSTTMode('fallback');
337+
fallbackSTT.start();
338+
setOverlayError('WebSocket STT 시간 초과. 브라우저 음성 인식으로 전환합니다.');
339+
} else if (sttMode === 'fallback') {
340+
Logger.warn('⚠️ Fallback STT also timed out');
341+
setOverlayError('음성 인식을 사용할 수 없습니다. 네트워크를 확인해주세요.');
342+
setSTTMode('disabled');
343+
} else {
344+
setOverlayError('음성 인식 시간 초과. 다시 말씀해주세요.');
345+
}
299346

300347
// Clear timeout ref
301348
sttTimeoutRef.current = null;
@@ -910,16 +957,21 @@ function App() {
910957
}
911958
}, [consent, openDialog]);
912959

913-
// Cleanup STT timeout on unmount
960+
// Cleanup STT timeout and fallback on unmount
914961
useEffect(() => {
915962
return () => {
916963
if (sttTimeoutRef.current) {
917964
clearTimeout(sttTimeoutRef.current);
918965
sttTimeoutRef.current = null;
919966
Logger.debug('🧹 STT timeout cleaned up on unmount');
920967
}
968+
969+
if (fallbackSTT.isActive) {
970+
fallbackSTT.stop();
971+
Logger.debug('🧹 Fallback STT stopped on unmount');
972+
}
921973
};
922-
}, []);
974+
}, [fallbackSTT]);
923975

924976

925977
return (

src/hooks/useFallbackSTT.ts

Lines changed: 263 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,263 @@
1+
import { useRef, useCallback, useState } from 'react';
2+
import { Logger } from '../config/env';
3+
4+
interface FallbackSTTOptions {
5+
lang?: string;
6+
continuous?: boolean;
7+
interimResults?: boolean;
8+
onResult: (text: string) => void;
9+
onError?: (error: string) => void;
10+
}
11+
12+
interface FallbackSTTReturn {
13+
isSupported: () => boolean;
14+
start: () => void;
15+
stop: () => void;
16+
isActive: boolean;
17+
}
18+
19+
// Extend Window interface for Web Speech API
20+
declare global {
21+
interface Window {
22+
SpeechRecognition?: new () => SpeechRecognition;
23+
webkitSpeechRecognition?: new () => SpeechRecognition;
24+
}
25+
}
26+
27+
interface SpeechRecognition extends EventTarget {
28+
continuous: boolean;
29+
interimResults: boolean;
30+
lang: string;
31+
start: () => void;
32+
stop: () => void;
33+
abort: () => void;
34+
onresult: ((event: SpeechRecognitionEvent) => void) | null;
35+
onerror: ((event: SpeechRecognitionErrorEvent) => void) | null;
36+
onend: (() => void) | null;
37+
}
38+
39+
interface SpeechRecognitionEvent extends Event {
40+
resultIndex: number;
41+
results: SpeechRecognitionResultList;
42+
}
43+
44+
interface SpeechRecognitionResultList {
45+
length: number;
46+
item: (index: number) => SpeechRecognitionResult;
47+
[index: number]: SpeechRecognitionResult;
48+
}
49+
50+
interface SpeechRecognitionResult {
51+
isFinal: boolean;
52+
[index: number]: SpeechRecognitionAlternative;
53+
length: number;
54+
}
55+
56+
interface SpeechRecognitionAlternative {
57+
transcript: string;
58+
confidence: number;
59+
}
60+
61+
interface SpeechRecognitionErrorEvent extends Event {
62+
error: string;
63+
message: string;
64+
}
65+
66+
/**
67+
* Web Speech API fallback for STT
68+
* Provides browser-native speech recognition as backup for WebSocket STT
69+
*
70+
* @example
71+
* const fallbackSTT = useFallbackSTT({
72+
* onResult: (text) => console.log('Recognized:', text),
73+
* onError: (err) => console.error('STT Error:', err)
74+
* });
75+
*
76+
* if (fallbackSTT.isSupported()) {
77+
* fallbackSTT.start();
78+
* }
79+
*/
80+
export const useFallbackSTT = (options: FallbackSTTOptions): FallbackSTTReturn => {
81+
const {
82+
lang = 'ko-KR',
83+
continuous = true,
84+
interimResults = false,
85+
onResult,
86+
onError,
87+
} = options;
88+
89+
const recognitionRef = useRef<SpeechRecognition | null>(null);
90+
const [isActive, setIsActive] = useState(false);
91+
92+
/**
93+
* Check if Web Speech API is supported in current browser
94+
*/
95+
const isSupported = useCallback((): boolean => {
96+
const hasAPI =
97+
typeof window !== 'undefined' &&
98+
(window.SpeechRecognition !== undefined ||
99+
window.webkitSpeechRecognition !== undefined);
100+
101+
Logger.debug('🎤 Web Speech API support check', {
102+
supported: hasAPI,
103+
SpeechRecognition: !!window.SpeechRecognition,
104+
webkitSpeechRecognition: !!window.webkitSpeechRecognition,
105+
});
106+
107+
return hasAPI;
108+
}, []);
109+
110+
/**
111+
* Start speech recognition
112+
*/
113+
const start = useCallback(() => {
114+
if (!isSupported()) {
115+
const error = 'Web Speech API not supported in this browser';
116+
Logger.error('❌ Cannot start fallback STT', { reason: error });
117+
onError?.(error);
118+
return;
119+
}
120+
121+
// Stop existing recognition if any
122+
if (recognitionRef.current) {
123+
try {
124+
recognitionRef.current.stop();
125+
} catch (err) {
126+
Logger.warn('Failed to stop existing recognition', { error: err });
127+
}
128+
}
129+
130+
try {
131+
// Create recognition instance
132+
const SpeechRecognitionAPI =
133+
window.SpeechRecognition || window.webkitSpeechRecognition;
134+
135+
if (!SpeechRecognitionAPI) {
136+
throw new Error('SpeechRecognition constructor not available');
137+
}
138+
139+
const recognition = new SpeechRecognitionAPI();
140+
recognition.continuous = continuous;
141+
recognition.interimResults = interimResults;
142+
recognition.lang = lang;
143+
144+
// Handle results
145+
recognition.onresult = (event: SpeechRecognitionEvent) => {
146+
const result = event.results[event.resultIndex];
147+
148+
if (result && result.isFinal && result[0]) {
149+
const transcript = result[0].transcript;
150+
const confidence = result[0].confidence;
151+
152+
Logger.info('✅ Fallback STT result', {
153+
text: transcript,
154+
confidence: confidence.toFixed(2),
155+
resultIndex: event.resultIndex,
156+
});
157+
158+
onResult(transcript);
159+
} else if (interimResults && result && result[0]) {
160+
const transcript = result[0].transcript;
161+
Logger.debug('🔄 Interim STT result', { text: transcript });
162+
}
163+
};
164+
165+
// Handle errors
166+
recognition.onerror = (event: SpeechRecognitionErrorEvent) => {
167+
Logger.error('❌ Fallback STT error', {
168+
error: event.error,
169+
message: event.message,
170+
});
171+
172+
const errorMessage = getErrorMessage(event.error);
173+
onError?.(errorMessage);
174+
175+
// Auto-restart on network error (transient)
176+
if (event.error === 'network') {
177+
Logger.info('🔄 Attempting to restart after network error');
178+
setTimeout(() => {
179+
if (recognitionRef.current === recognition) {
180+
start();
181+
}
182+
}, 2000);
183+
} else if (event.error !== 'aborted') {
184+
setIsActive(false);
185+
}
186+
};
187+
188+
// Handle end
189+
recognition.onend = () => {
190+
Logger.debug('🛑 Fallback STT ended');
191+
192+
// Auto-restart if continuous mode and not manually stopped
193+
if (continuous && recognitionRef.current === recognition) {
194+
Logger.debug('🔄 Restarting continuous recognition');
195+
setTimeout(() => {
196+
if (recognitionRef.current === recognition) {
197+
start();
198+
}
199+
}, 100);
200+
} else {
201+
setIsActive(false);
202+
}
203+
};
204+
205+
// Start recognition
206+
recognition.start();
207+
recognitionRef.current = recognition;
208+
setIsActive(true);
209+
210+
Logger.info('✅ Fallback STT started', {
211+
lang,
212+
continuous,
213+
interimResults,
214+
});
215+
} catch (err) {
216+
const errorMessage =
217+
err instanceof Error ? err.message : 'Failed to start speech recognition';
218+
Logger.error('❌ Failed to initialize fallback STT', { error: err });
219+
onError?.(errorMessage);
220+
setIsActive(false);
221+
}
222+
}, [isSupported, lang, continuous, interimResults, onResult, onError]);
223+
224+
/**
225+
* Stop speech recognition
226+
*/
227+
const stop = useCallback(() => {
228+
if (recognitionRef.current) {
229+
try {
230+
recognitionRef.current.stop();
231+
recognitionRef.current = null;
232+
setIsActive(false);
233+
Logger.info('✅ Fallback STT stopped');
234+
} catch (err) {
235+
Logger.error('❌ Failed to stop fallback STT', { error: err });
236+
}
237+
}
238+
}, []);
239+
240+
return {
241+
isSupported,
242+
start,
243+
stop,
244+
isActive,
245+
};
246+
};
247+
248+
/**
249+
* Get user-friendly error message from speech recognition error code
250+
*/
251+
function getErrorMessage(errorCode: string): string {
252+
const errorMessages: Record<string, string> = {
253+
'no-speech': '음성이 감지되지 않았습니다',
254+
'audio-capture': '마이크에 접근할 수 없습니다',
255+
'not-allowed': '마이크 권한이 거부되었습니다',
256+
'network': '네트워크 오류가 발생했습니다',
257+
'aborted': '음성 인식이 중단되었습니다',
258+
'bad-grammar': '음성 인식 설정 오류',
259+
'language-not-supported': '지원하지 않는 언어입니다',
260+
};
261+
262+
return errorMessages[errorCode] || `음성 인식 오류: ${errorCode}`;
263+
}

0 commit comments

Comments
 (0)