diff --git a/App.tsx b/App.tsx index ac8cee153..d4a43a109 100644 --- a/App.tsx +++ b/App.tsx @@ -14,6 +14,14 @@ import { useTheme } from './src/theme'; import { hardwareService, modelManager, authService, ragService, remoteServerManager } from './src/services'; import logger from './src/utils/logger'; import { useAppStore, useAuthStore, useRemoteServerStore } from './src/stores'; +import { useTTSStore } from './src/stores/ttsStore'; +import { initExecutorch } from 'react-native-executorch'; +import { BareResourceFetcher } from 'react-native-executorch-bare-resource-fetcher'; +import { EngineBridge } from './src/components/EngineBridge'; + +// Initialise executorch resource fetcher once at module load time. +// This must run before any useTextToSpeech hook is mounted. +initExecutorch({ resourceFetcher: BareResourceFetcher }); import { LockScreen } from './src/screens'; import { useAppState } from './src/hooks/useAppState'; @@ -191,6 +199,10 @@ function App() { // Initialize RAG database tables ragService.ensureReady().catch((err) => logger.error('Failed to initialize RAG service on startup', err)); + // Initialize TTS engine from persisted settings and sync download state + const ttsState = useTTSStore.getState(); + ttsState.setEngine(ttsState.settings.engineId).catch(() => {}); + // Show the UI immediately setIsInitializing(false); @@ -235,6 +247,7 @@ function App() { + 'ready' as const), + on: jest.fn(() => jest.fn()), + off: jest.fn(), + once: jest.fn(() => jest.fn()), + isSupported: jest.fn(() => true), + initialize: jest.fn().mockResolvedValue(undefined), + release: jest.fn().mockResolvedValue(undefined), + destroy: jest.fn().mockResolvedValue(undefined), + getRequiredAssets: jest.fn(() => [ + { id: 'backbone', label: 'Voice Model', url: 'https://example.com/bb.gguf', sizeBytes: 454 * 1024 * 1024, filename: 'bb.gguf' }, + { id: 'vocoder', label: 'Decoder', url: 'https://example.com/voc.gguf', sizeBytes: 73 * 1024 * 1024, filename: 'voc.gguf' }, + ]), + 
checkAssetStatus: jest.fn().mockResolvedValue([ + { asset: { id: 'backbone', label: 'Voice Model', url: '', sizeBytes: 454 * 1024 * 1024, filename: 'bb.gguf' }, status: 'downloaded', progress: 1 }, + { asset: { id: 'vocoder', label: 'Decoder', url: '', sizeBytes: 73 * 1024 * 1024, filename: 'voc.gguf' }, status: 'downloaded', progress: 1 }, + ]), + downloadAssets: jest.fn().mockResolvedValue(undefined), + deleteAssets: jest.fn().mockResolvedValue(undefined), + getOverallDownloadProgress: jest.fn(() => 1), + isFullyDownloaded: jest.fn(() => true), + getBridgeComponent: jest.fn(() => null), + getVoices: jest.fn(() => [{ id: '0', label: 'Default', metadata: {} }]), + getActiveVoice: jest.fn(() => ({ id: '0', label: 'Default', metadata: {} })), + setVoice: jest.fn().mockResolvedValue(undefined), + speak: jest.fn().mockResolvedValue(undefined), + generateAndSave: jest.fn().mockResolvedValue({ + filePath: '/cache/c1/m1.pcm', + durationSeconds: 1.5, + waveformData: new Array(200).fill(0.2), + }), + playFromFile: jest.fn().mockResolvedValue(undefined), + stop: jest.fn(), + pause: jest.fn(), + resume: jest.fn(), +}; + +jest.mock('../../../src/engine', () => ({ + ttsRegistry: { + register: jest.fn(), + has: jest.fn(() => true), + getEngine: jest.fn(() => mockEngine), + setActiveEngine: jest.fn().mockResolvedValue(mockEngine), + getActiveEngine: jest.fn(() => mockEngine), + getActiveEngineId: jest.fn(() => 'mock-tts'), + getRegisteredIds: jest.fn(() => ['mock-tts']), + }, + OuteTTSEngine: class {}, +})); + +jest.mock('../../../src/utils/logger', () => ({ + __esModule: true, + default: { log: jest.fn(), error: jest.fn(), warn: jest.fn() }, +})); + +import { useTTSStore } from '../../../src/stores/ttsStore'; + +const getState = () => useTTSStore.getState(); + +const resetStore = () => { + useTTSStore.setState({ + phase: 'ready', + currentMessageId: null, + currentAmplitude: 0, + playbackElapsed: 0, + playSessionId: 0, + error: null, + isReady: true, + isDownloading: false, + 
isLoading: false, + isSpeaking: false, + isPaused: false, + isGeneratingAudio: false, + assets: [], + overallDownloadProgress: 1, + voices: [{ id: '0', label: 'Default', metadata: {} }], + activeVoiceId: '0', + audioCacheSizeMB: 0, + settings: { + interfaceMode: 'chat', + enabled: true, + autoPlay: false, + speed: 1.0, + engineId: 'mock-tts', + voiceByEngine: {}, + }, + }); +}; + +describe('TTS integration', () => { + beforeEach(() => { + resetStore(); + jest.clearAllMocks(); + }); + + // ── Chat Mode full flow ─────────────────────────────────────────────── + + describe('Chat Mode: speak → stop', () => { + it('completes the full Chat Mode flow', async () => { + // Speak + const speakPromise = getState().speak('hello', 'msg1'); + expect(getState().currentMessageId).toBe('msg1'); + + await speakPromise; + expect(mockEngine.speak).toHaveBeenCalledWith('hello', expect.objectContaining({ + speed: 1.0, + messageId: 'msg1', + })); + expect(getState().currentMessageId).toBeNull(); + + // Stop mid-speech + mockEngine.speak.mockImplementation( + () => new Promise((resolve) => setTimeout(resolve, 1000)), + ); + getState().speak('second', 'msg2'); + getState().stop(); + expect(mockEngine.stop).toHaveBeenCalled(); + }); + }); + + // ── Audio Mode full flow ────────────────────────────────────────────── + + describe('Audio Mode: generateAndSave → playMessage → stop', () => { + beforeEach(() => { + useTTSStore.setState({ + settings: { ...getState().settings, interfaceMode: 'audio' }, + }); + }); + + it('completes the full Audio Mode flow', async () => { + // GenerateAndSave + const result = await getState().generateAndSave('hello audio', 'conv1', 'msg1'); + + expect(result.path).toBe('/cache/c1/m1.pcm'); + expect(result.waveformData).toHaveLength(200); + expect(result.durationSeconds).toBe(1.5); + + // PlayMessage + const playPromise = getState().playMessage('msg1', '/cache/c1/m1.pcm'); + expect(getState().currentMessageId).toBe('msg1'); + + await playPromise; + + // 
StopPlayback + getState().stopPlayback(); + expect(mockEngine.stop).toHaveBeenCalled(); + }); + }); + + // ── Mode switching ──────────────────────────────────────────────────── + + describe('mode switching', () => { + it('switching interfaceMode to audio takes effect', () => { + expect(getState().settings.interfaceMode).toBe('chat'); + getState().updateSettings({ interfaceMode: 'audio' }); + expect(getState().settings.interfaceMode).toBe('audio'); + }); + + it('switching back to chat mode works', () => { + getState().updateSettings({ interfaceMode: 'audio' }); + getState().updateSettings({ interfaceMode: 'chat' }); + expect(getState().settings.interfaceMode).toBe('chat'); + }); + }); + + // ── Engine-agnostic speak ───────────────────────────────────────────── + + describe('auto-play', () => { + it('speak delegates to engine when autoPlay and engine ready', async () => { + useTTSStore.setState({ + settings: { ...getState().settings, autoPlay: true }, + }); + + await getState().speak('AI response', 'last-msg'); + + expect(mockEngine.speak).toHaveBeenCalledWith('AI response', expect.objectContaining({ + messageId: 'last-msg', + })); + }); + }); +}); diff --git a/__tests__/rntl/components/ChatInput.test.tsx b/__tests__/rntl/components/ChatInput.test.tsx index 617430abf..303297d39 100644 --- a/__tests__/rntl/components/ChatInput.test.tsx +++ b/__tests__/rntl/components/ChatInput.test.tsx @@ -51,10 +51,20 @@ jest.mock('../../../src/services/documentService', () => ({ // Mock the stores const mockUseWhisperStore = jest.fn(); const mockUseAppStore = jest.fn(); +const mockUseTTSStore = jest.fn(() => ({ + settings: { interfaceMode: 'chat', enabled: false, speed: 1.0 }, + isBackboneDownloaded: false, + isVocoderDownloaded: false, + isModelLoaded: false, + loadModels: jest.fn(), + unloadModels: jest.fn(), + updateSettings: jest.fn(), +})); jest.mock('../../../src/stores', () => ({ useWhisperStore: () => mockUseWhisperStore(), useAppStore: () => mockUseAppStore(), + 
useTTSStore: () => mockUseTTSStore(), })); // Mock the whisper hook diff --git a/__tests__/rntl/components/GenerationSettingsModal.test.tsx b/__tests__/rntl/components/GenerationSettingsModal.test.tsx index a9ef46471..ed7272b1e 100644 --- a/__tests__/rntl/components/GenerationSettingsModal.test.tsx +++ b/__tests__/rntl/components/GenerationSettingsModal.test.tsx @@ -859,13 +859,13 @@ describe('GenerationSettingsModal', () => { }); it('calls handleSliderComplete on text generation slider (no-op)', () => { - const { getByText, getAllByTestId } = render( + const { getByText, queryAllByTestId } = render( , ); fireEvent.press(getByText('TEXT GENERATION')); - const sliders = getAllByTestId('slider'); + const sliders = queryAllByTestId('slider'); // onSlidingComplete is a no-op but should not throw if (sliders.length > 0 && sliders[0].props.onSlidingComplete) { expect(() => sliders[0].props.onSlidingComplete(0.5)).not.toThrow(); @@ -873,13 +873,13 @@ describe('GenerationSettingsModal', () => { }); it('calls handleSliderChange on text slider value change', () => { - const { getByText, getAllByTestId } = render( + const { getByText, queryAllByTestId } = render( , ); fireEvent.press(getByText('TEXT GENERATION')); - const sliders = getAllByTestId('slider'); + const sliders = queryAllByTestId('slider'); if (sliders.length > 0 && sliders[0].props.onValueChange) { sliders[0].props.onValueChange(0.5); expect(mockUpdateSettings).toHaveBeenCalled(); @@ -1070,17 +1070,16 @@ describe('GenerationSettingsModal', () => { expect(mockUpdateSettings).toHaveBeenCalledWith({ enableGpu: true, cacheType: 'f16' }); }); - it('calls updateSettings with gpuLayers value from GPU layers slider', () => { + it('calls updateSettings with gpuLayers value from GPU layers stepper', () => { mockStoreValues.settings = { ...defaultSettings, enableGpu: true, gpuLayers: 6, flashAttn: false }; const { getByText, getByTestId } = render(); fireEvent.press(getByText('TEXT GENERATION')); 
fireEvent.press(getByTestId('modal-text-advanced-toggle')); mockUpdateSettings.mockClear(); - const slider = getByTestId('gpu-layers-slider'); - slider.props.onSlidingComplete(12); + fireEvent.press(getByTestId('gpu-layers-stepper-increment')); - expect(mockUpdateSettings).toHaveBeenCalledWith({ gpuLayers: 12 }); + expect(mockUpdateSettings).toHaveBeenCalledWith({ gpuLayers: 7 }); }); }); }); diff --git a/__tests__/rntl/components/VoiceRecordButton.test.tsx b/__tests__/rntl/components/VoiceRecordButton.test.tsx index b92c45a30..84899278b 100644 --- a/__tests__/rntl/components/VoiceRecordButton.test.tsx +++ b/__tests__/rntl/components/VoiceRecordButton.test.tsx @@ -87,16 +87,17 @@ describe('VoiceRecordButton', () => { }); it('shows recording indicator when isRecording is true', () => { - const { getByText } = render( + const { toJSON } = render( ); - // When recording, "Slide to cancel" text appears in the cancel hint - expect(getByText('Slide to cancel')).toBeTruthy(); + // In audio mode (default, !asSendButton), recording shows a stop icon (square) + const treeStr = JSON.stringify(toJSON()); + expect(treeStr).toContain('square'); }); it('shows transcribing state when isTranscribing is true', () => { - const { getByText } = render( + const { toJSON } = render( { /> ); - // Transcribing state shows "Transcribing..." 
text - expect(getByText('Transcribing...')).toBeTruthy(); + // Transcribing state renders a spinning indicator (no text in audio mode) + expect(toJSON()).toBeTruthy(); }); - it('shows partial result text when provided', () => { + it('shows partial result text when provided in chat mode (asSendButton)', () => { const { getByText } = render( @@ -166,7 +168,7 @@ describe('VoiceRecordButton', () => { expect(toJSON()).toBeTruthy(); }); - it('taps unavailable button and triggers alert with error message', () => { + it('taps unavailable button and triggers download prompt alert', () => { const { UNSAFE_getAllByType } = render( { fireEvent.press(touchables[0]); expect(mockShowAlert).toHaveBeenCalledWith( - 'Voice Input Unavailable', - expect.stringContaining('Microphone permission denied'), + 'Download Voice Model', + expect.stringContaining('Download Whisper Small'), expect.any(Array) ); }); - it('taps unavailable button with default error when no error prop', () => { + it('taps unavailable button shows download prompt with size', () => { const { UNSAFE_getAllByType } = render( { fireEvent.press(touchables[0]); expect(mockShowAlert).toHaveBeenCalledWith( - 'Voice Input Unavailable', - expect.stringContaining('No transcription model downloaded'), + 'Download Voice Model', + expect.stringContaining('466 MB'), expect.any(Array) ); }); - it('alert message includes instructions for downloading model', () => { + it('alert message includes Download and Cancel buttons', () => { const { UNSAFE_getAllByType } = render( { fireEvent.press(touchables[0]); expect(mockShowAlert).toHaveBeenCalledWith( - 'Voice Input Unavailable', - expect.stringContaining('Download a Whisper model'), - expect.any(Array) + 'Download Voice Model', + expect.any(String), + expect.arrayContaining([ + expect.objectContaining({ text: 'Cancel' }), + expect.objectContaining({ text: 'Download' }), + ]) ); }); }); @@ -400,11 +405,13 @@ describe('VoiceRecordButton', () => { }); it('does not show cancel hint when not 
recording', () => { - const { queryByText } = render( + const { toJSON } = render( ); - expect(queryByText('Slide to cancel')).toBeNull(); + // Audio mode (default) uses tap-to-toggle, no slide-to-cancel + const treeStr = JSON.stringify(toJSON()); + expect(treeStr).not.toContain('Slide to cancel'); }); it('does not show partial result when partialResult is empty', () => { @@ -418,12 +425,12 @@ describe('VoiceRecordButton', () => { // partialResult is empty, so the partial result container should not render const treeStr = JSON.stringify(toJSON()); - // The cancel hint should still show - expect(treeStr).toContain('Slide to cancel'); + // Audio mode uses tap-to-toggle with a stop icon + expect(treeStr).toContain('square'); }); it('shows recording UI elements but not transcribing when recording', () => { - const { getByText, queryByText } = render( + const { toJSON, queryByText } = render( { // When isRecording is true AND isTranscribing is true, // the component shows recording UI (not transcribing state) - expect(getByText('Slide to cancel')).toBeTruthy(); + const treeStr = JSON.stringify(toJSON()); + expect(treeStr).toContain('square'); expect(queryByText('Transcribing...')).toBeNull(); }); @@ -446,7 +454,7 @@ describe('VoiceRecordButton', () => { }); it('prioritizes model loading state over recording', () => { - const { getByText, queryByText } = render( + const { getByText, toJSON } = render( { ); expect(getByText('Loading...')).toBeTruthy(); - expect(queryByText('Slide to cancel')).toBeNull(); + // Recording UI should not render when loading + const treeStr = JSON.stringify(toJSON()); + expect(treeStr).not.toContain('square'); }); it('prioritizes model loading state over transcribing', () => { - const { getByText, queryByText } = render( + const { getByText, toJSON } = render( { ); expect(getByText('Loading...')).toBeTruthy(); - expect(queryByText('Transcribing...')).toBeNull(); + // Transcribing state should not render when loading + 
expect(toJSON()).toBeTruthy(); }); }); }); diff --git a/__tests__/rntl/screens/DownloadManagerScreen.test.tsx b/__tests__/rntl/screens/DownloadManagerScreen.test.tsx index 2a976dfd2..255ab5e6a 100644 --- a/__tests__/rntl/screens/DownloadManagerScreen.test.tsx +++ b/__tests__/rntl/screens/DownloadManagerScreen.test.tsx @@ -212,20 +212,23 @@ describe('DownloadManagerScreen', () => { }); it('shows empty state when no downloads', () => { - const { getByText } = render(); - expect(getByText('No active downloads')).toBeTruthy(); + const { getByText, queryByText } = render(); + // Active Downloads section is hidden when there are no active items + expect(queryByText('Active Downloads')).toBeNull(); expect(getByText('No models downloaded yet')).toBeTruthy(); }); it('shows section headers for active and completed', () => { - const { getByText } = render(); - expect(getByText('Active Downloads')).toBeTruthy(); + const { getByText, queryByText } = render(); + // Active Downloads section is hidden when empty + expect(queryByText('Active Downloads')).toBeNull(); + // Downloaded Models section is always shown expect(getByText('Downloaded Models')).toBeTruthy(); }); it('shows empty subtext when no models downloaded', () => { const { getByText } = render(); - expect(getByText('Go to the Models tab to browse and download models')).toBeTruthy(); + expect(getByText('No models downloaded yet')).toBeTruthy(); }); it('renders completed text model with details', () => { @@ -305,11 +308,12 @@ describe('DownloadManagerScreen', () => { expect(getByText(/Total storage used/)).toBeTruthy(); }); - it('shows count badges for active and completed sections', () => { + it('shows count badge for completed section', () => { setupSingleModelState(); const { getByText } = render(); - expect(getByText('0')).toBeTruthy(); + // Active section is hidden when empty (no "0" badge) + // Completed section shows count of 1 expect(getByText('1')).toBeTruthy(); }); @@ -344,7 +348,8 @@ 
describe('DownloadManagerScreen', () => { const { UNSAFE_getAllByType } = render(); const touchables = UNSAFE_getAllByType(TouchableOpacity); - const cancelButtons = touchables.filter((_: any, i: number) => i > 0); + // Skip back button (1) + filter chips (6) = 7 touchables before content + const cancelButtons = touchables.filter((_: any, i: number) => i > 6); if (cancelButtons.length > 0) { fireEvent.press(cancelButtons[0]); } @@ -820,8 +825,8 @@ describe('DownloadManagerScreen', () => { const { UNSAFE_getAllByType, getByTestId } = render(); const touchables = UNSAFE_getAllByType(TouchableOpacity); - // Press the cancel button (second touchable after back button) - const cancelButtons = touchables.filter((_: any, i: number) => i > 0); + // Skip back button (1) + filter chips (6) = 7 touchables before content + const cancelButtons = touchables.filter((_: any, i: number) => i > 6); fireEvent.press(cancelButtons[0]); // Press "Yes" to confirm @@ -852,7 +857,8 @@ describe('DownloadManagerScreen', () => { const { UNSAFE_getAllByType, getByTestId } = render(); const touchables = UNSAFE_getAllByType(TouchableOpacity); - const cancelButtons = touchables.filter((_: any, i: number) => i > 0); + // Skip back button (1) + filter chips (6) = 7 touchables before content + const cancelButtons = touchables.filter((_: any, i: number) => i > 6); fireEvent.press(cancelButtons[0]); await act(async () => { @@ -880,7 +886,8 @@ describe('DownloadManagerScreen', () => { const { UNSAFE_getAllByType, getByTestId } = render(); const touchables = UNSAFE_getAllByType(TouchableOpacity); - const cancelButtons = touchables.filter((_: any, i: number) => i > 0); + // Skip back button (1) + filter chips (6) = 7 touchables before content + const cancelButtons = touchables.filter((_: any, i: number) => i > 6); fireEvent.press(cancelButtons[0]); await act(async () => { @@ -1029,7 +1036,8 @@ describe('DownloadManagerScreen', () => { // Find the cancel button for the RNFS download (which has no 
downloadId) const touchables = result.UNSAFE_getAllByType(TouchableOpacity); - const cancelButtons = touchables.filter((_: any, i: number) => i > 0); + // Skip back button (1) + filter chips (6) = 7 touchables before content + const cancelButtons = touchables.filter((_: any, i: number) => i > 6); if (cancelButtons.length > 0) { fireEvent.press(cancelButtons[0]); @@ -1367,8 +1375,8 @@ describe('DownloadManagerScreen', () => { // Find and press cancel button on the active download const touchables = result.UNSAFE_getAllByType(TouchableOpacity); - // Find cancel buttons (skip back button) - const cancelButtons = touchables.filter((_: any, i: number) => i > 0); + // Skip back button (1) + filter chips (6) = 7 touchables before content + const cancelButtons = touchables.filter((_: any, i: number) => i > 6); if (cancelButtons.length > 0) { fireEvent.press(cancelButtons[0]); diff --git a/__tests__/rntl/screens/ModelSettingsScreen.test.tsx b/__tests__/rntl/screens/ModelSettingsScreen.test.tsx index 026ba7b17..455b376bf 100644 --- a/__tests__/rntl/screens/ModelSettingsScreen.test.tsx +++ b/__tests__/rntl/screens/ModelSettingsScreen.test.tsx @@ -644,14 +644,13 @@ describe('ModelSettingsScreen', () => { expect(useAppStore.getState().settings.enableGpu).toBe(true); }); - it('updates gpuLayers when GPU Layers slider completes', () => { + it('updates gpuLayers when GPU Layers stepper is incremented', () => { useAppStore.getState().updateSettings({ enableGpu: true, flashAttn: false, gpuLayers: 6 }); const { getByTestId } = renderWithSections('text'); - const slider = getByTestId('gpu-layers-slider'); - fireEvent(slider, 'slidingComplete', 12); + fireEvent.press(getByTestId('gpu-layers-stepper-increment')); - expect(useAppStore.getState().settings.gpuLayers).toBe(12); + expect(useAppStore.getState().settings.gpuLayers).toBe(7); }); }); }); diff --git a/__tests__/rntl/screens/VoiceSettingsScreen.test.tsx b/__tests__/rntl/screens/VoiceSettingsScreen.test.tsx index 
a055a2ad1..7d459bde7 100644 --- a/__tests__/rntl/screens/VoiceSettingsScreen.test.tsx +++ b/__tests__/rntl/screens/VoiceSettingsScreen.test.tsx @@ -3,15 +3,15 @@ * * Tests for the voice settings screen including: * - Title display - * - Description text about Whisper - * - Download options when no model + * - Privacy note text + * - English and Multilingual model sections * - Back button navigation - * - Downloaded model state (name, status badge, remove button) + * - Active model state (name, badge, remove button) * - Download progress display * - Model download trigger * - Remove model confirmation alert * - Error display and clear - * - Privacy card display + * - Search bar * * Priority: P1 (High) */ @@ -82,6 +82,7 @@ jest.mock('../../../src/components/Button', () => ({ })); const mockDownloadModel = jest.fn(); +const mockDownloadFromUrl = jest.fn(); const mockDeleteModel = jest.fn(); const mockClearError = jest.fn(); @@ -90,6 +91,7 @@ let mockWhisperStoreValues: any = { isDownloading: false, downloadProgress: 0, downloadModel: mockDownloadModel, + downloadFromUrl: mockDownloadFromUrl, deleteModel: mockDeleteModel, error: null, clearError: mockClearError, @@ -101,13 +103,24 @@ jest.mock('../../../src/stores', () => ({ jest.mock('../../../src/services', () => ({ WHISPER_MODELS: [ - { id: 'tiny', name: 'Whisper Tiny', size: '75', description: 'Fastest, lower accuracy' }, - { id: 'base', name: 'Whisper Base', size: '141', description: 'Good accuracy' }, - { id: 'small', name: 'Whisper Small', size: '461', description: 'Better accuracy' }, - { id: 'medium', name: 'Whisper Medium', size: '1500', description: 'Best accuracy' }, + { id: 'tiny.en', name: 'Tiny', size: 75, lang: 'en', description: 'Fastest, English only' }, + { id: 'base.en', name: 'Base', size: 142, lang: 'en', description: 'Better accuracy, English only' }, + { id: 'small.en', name: 'Small', size: 466, lang: 'en', description: 'High accuracy, English only' }, + { id: 'medium.en', name: 'Medium', size: 
1500, lang: 'en', description: 'Near human-level, English only' }, + { id: 'tiny', name: 'Tiny', size: 75, lang: 'multi', description: 'Fastest, 99 languages' }, + { id: 'base', name: 'Base', size: 142, lang: 'multi', description: 'Better accuracy, 99 languages' }, + { id: 'small', name: 'Small', size: 466, lang: 'multi', description: 'High accuracy, 99 languages' }, + { id: 'medium', name: 'Medium', size: 1500, lang: 'multi', description: 'Near human-level, 99 languages' }, ], })); +jest.mock('../../../src/services/huggingface', () => ({ + huggingFaceService: { + searchWhisperRepos: jest.fn().mockResolvedValue([]), + getWhisperFiles: jest.fn().mockResolvedValue([]), + }, +})); + import { VoiceSettingsScreen } from '../../../src/screens/VoiceSettingsScreen'; const mockGoBack = jest.fn(); @@ -134,6 +147,7 @@ describe('VoiceSettingsScreen', () => { isDownloading: false, downloadProgress: 0, downloadModel: mockDownloadModel, + downloadFromUrl: mockDownloadFromUrl, deleteModel: mockDeleteModel, error: null, clearError: mockClearError, @@ -149,19 +163,16 @@ describe('VoiceSettingsScreen', () => { expect(getByText('Voice Transcription')).toBeTruthy(); }); - it('shows description text about Whisper', () => { + it('shows privacy note about on-device transcription', () => { const { getByText } = render(); expect( - getByText(/Download a Whisper model to enable on-device voice input/), + getByText(/All transcription runs on-device/), ).toBeTruthy(); }); - it('shows privacy card', () => { - const { getByText } = render(); - expect(getByText('Privacy First')).toBeTruthy(); - expect( - getByText(/Voice transcription happens entirely on your device/), - ).toBeTruthy(); + it('shows search bar', () => { + const { getByPlaceholderText } = render(); + expect(getByPlaceholderText('Search models or HuggingFace...')).toBeTruthy(); }); it('back button calls goBack', () => { @@ -178,48 +189,46 @@ describe('VoiceSettingsScreen', () => { // No Model Downloaded - Download Options // 
============================================================================ describe('download options (no model)', () => { - it('shows download options when no model is downloaded', () => { + it('shows English model section', () => { const { getByText } = render(); - expect(getByText('Whisper Tiny')).toBeTruthy(); - expect(getByText('Whisper Base')).toBeTruthy(); - expect(getByText('Whisper Small')).toBeTruthy(); + expect(getByText('ENGLISH ONLY')).toBeTruthy(); }); - it('shows only first 3 models (slice(0, 3))', () => { - const { queryByText } = render(); - // 4th model (medium) should NOT be shown due to .slice(0, 3) - expect(queryByText('Whisper Medium')).toBeNull(); + it('shows Multilingual model section', () => { + const { getByText } = render(); + expect(getByText(/MULTILINGUAL/)).toBeTruthy(); }); - it('shows "Select a model to download" label', () => { - const { getByText } = render(); - expect(getByText('Select a model to download:')).toBeTruthy(); + it('shows model names in English section', () => { + const { getAllByText } = render(); + // "Tiny" appears in both English and Multilingual sections + expect(getAllByText('Tiny').length).toBeGreaterThanOrEqual(1); }); - it('shows model size for each option', () => { - const { getByText } = render(); - expect(getByText('75 MB')).toBeTruthy(); - expect(getByText('141 MB')).toBeTruthy(); - expect(getByText('461 MB')).toBeTruthy(); + it('shows model size for options', () => { + const { getAllByText } = render(); + // Sizes appear in both English and Multilingual sections + expect(getAllByText('75 MB').length).toBeGreaterThanOrEqual(1); + expect(getAllByText('142 MB').length).toBeGreaterThanOrEqual(1); + expect(getAllByText('466 MB').length).toBeGreaterThanOrEqual(1); }); - it('shows model description for each option', () => { + it('shows model description for options', () => { const { getByText } = render(); - expect(getByText('Fastest, lower accuracy')).toBeTruthy(); - expect(getByText('Good 
accuracy')).toBeTruthy(); - expect(getByText('Better accuracy')).toBeTruthy(); + expect(getByText('Fastest, English only')).toBeTruthy(); + expect(getByText('Better accuracy, English only')).toBeTruthy(); }); it('calls downloadModel when a model option is pressed', () => { - const { getByText } = render(); - fireEvent.press(getByText('Whisper Base')); - expect(mockDownloadModel).toHaveBeenCalledWith('base'); + const { getByTestId } = render(); + fireEvent.press(getByTestId('model-download-base.en')); + expect(mockDownloadModel).toHaveBeenCalledWith('base.en'); }); it('calls downloadModel with correct id for tiny model', () => { - const { getByText } = render(); - fireEvent.press(getByText('Whisper Tiny')); - expect(mockDownloadModel).toHaveBeenCalledWith('tiny'); + const { getByTestId } = render(); + fireEvent.press(getByTestId('model-download-tiny.en')); + expect(mockDownloadModel).toHaveBeenCalledWith('tiny.en'); }); }); @@ -230,28 +239,28 @@ describe('VoiceSettingsScreen', () => { beforeEach(() => { mockWhisperStoreValues = { ...mockWhisperStoreValues, - downloadedModelId: 'base', + downloadedModelId: 'base.en', }; }); - it('shows downloaded model name', () => { + it('shows active model section label', () => { const { getByText } = render(); - expect(getByText('Whisper Base')).toBeTruthy(); + expect(getByText('ACTIVE MODEL')).toBeTruthy(); }); - it('shows "Downloaded" status badge', () => { + it('shows downloaded model name with language', () => { const { getByText } = render(); - expect(getByText('Downloaded')).toBeTruthy(); + expect(getByText(/Base — English/)).toBeTruthy(); }); - it('shows "Remove Model" button', () => { + it('shows "Active" status badge', () => { const { getByText } = render(); - expect(getByText('Remove Model')).toBeTruthy(); + expect(getByText('Active')).toBeTruthy(); }); - it('does not show download options when model is downloaded', () => { - const { queryByText } = render(); - expect(queryByText('Select a model to 
download:')).toBeNull(); + it('shows "Remove" button', () => { + const { getByText } = render(); + expect(getByText('Remove')).toBeTruthy(); }); it('shows model id as fallback when model not found in WHISPER_MODELS', () => { @@ -263,11 +272,11 @@ describe('VoiceSettingsScreen', () => { expect(getByText('unknown-model')).toBeTruthy(); }); - it('pressing Remove Model shows confirmation alert', () => { + it('pressing Remove shows confirmation alert', () => { const { getByText } = render(); - fireEvent.press(getByText('Remove Model')); + fireEvent.press(getByText('Remove')); expect(mockShowAlert).toHaveBeenCalledWith( - 'Remove Whisper Model', + 'Remove Voice Model', 'This will disable voice input until you download a model again.', expect.arrayContaining([ expect.objectContaining({ text: 'Cancel', style: 'cancel' }), @@ -294,11 +303,6 @@ describe('VoiceSettingsScreen', () => { expect(getByText('Downloading... 45%')).toBeTruthy(); }); - it('does not show download options during download', () => { - const { queryByText } = render(); - expect(queryByText('Select a model to download:')).toBeNull(); - }); - it('shows 0% at start of download', () => { mockWhisperStoreValues = { ...mockWhisperStoreValues, @@ -334,13 +338,13 @@ describe('VoiceSettingsScreen', () => { // Error State // ============================================================================ describe('error state', () => { - it('shows error message when whisperError is set', () => { + it('shows error message with tap to dismiss when whisperError is set', () => { mockWhisperStoreValues = { ...mockWhisperStoreValues, error: 'Download failed: network error', }; const { getByText } = render(); - expect(getByText('Download failed: network error')).toBeTruthy(); + expect(getByText('Download failed: network error (tap to dismiss)')).toBeTruthy(); }); it('calls clearError when error is tapped', () => { @@ -349,7 +353,7 @@ describe('VoiceSettingsScreen', () => { error: 'Download failed', }; const { getByText } = 
render(); - fireEvent.press(getByText('Download failed')); + fireEvent.press(getByText('Download failed (tap to dismiss)')); expect(mockClearError).toHaveBeenCalled(); }); diff --git a/__tests__/unit/hooks/useKeyboardAwarePopover.test.ts b/__tests__/unit/hooks/useKeyboardAwarePopover.test.ts index 0e37e3e3b..727880ba5 100644 --- a/__tests__/unit/hooks/useKeyboardAwarePopover.test.ts +++ b/__tests__/unit/hooks/useKeyboardAwarePopover.test.ts @@ -126,12 +126,12 @@ describe('useKeyboardAwarePopover', () => { expect(mockKeyboardDismiss).not.toHaveBeenCalled(); }); - it('measures trigger position with custom offsetX', () => { + it('measures trigger position from button coords', () => { const mockMeasureInWindow = jest.fn((callback) => { callback(10, 100, 50, 30); }); - const { result } = renderHook(() => useKeyboardAwarePopover(20)); + const { result } = renderHook(() => useKeyboardAwarePopover()); // Set up mock ref (result.current.triggerRef as any).current = { @@ -143,9 +143,9 @@ describe('useKeyboardAwarePopover', () => { }); expect(mockMeasureInWindow).toHaveBeenCalled(); - // anchor.y = screenH - y = 800 - 100 = 700 - // anchor.x = offsetX = 20 - expect(result.current.anchor).toEqual({ y: 700, x: 20 }); + // anchor.y = screenH - btnY = 800 - 100 = 700 + // anchor.x = screenW - (btnX + btnW) = 400 - (10 + 50) = 340 + expect(result.current.anchor).toEqual({ y: 700, x: 340 }); }); it('handles missing measureInWindow gracefully', () => { @@ -175,7 +175,8 @@ describe('useKeyboardAwarePopover', () => { }); // y = screenH - (undefined ?? 
0) = 800 - 0 = 800 - expect(result.current.anchor).toEqual({ y: 800, x: 12 }); // SPACING.md = 12 + // x = screenW - (btnX + btnW) = 400 - (10 + 50) = 340 + expect(result.current.anchor).toEqual({ y: 800, x: 340 }); }); }); @@ -361,8 +362,8 @@ describe('useKeyboardAwarePopover', () => { }); }); - describe('offsetX parameter', () => { - it('uses default SPACING.md when offsetX not provided', () => { + describe('button position measurement', () => { + it('computes anchorX as right-edge distance from screen right', () => { const mockMeasureInWindow = jest.fn((callback) => { callback(10, 100, 50, 30); }); @@ -377,16 +378,16 @@ describe('useKeyboardAwarePopover', () => { result.current.show(); }); - // SPACING.md = 12 - expect(result.current.anchor.x).toBe(12); + // screenW=400, btnX=10, btnW=50 → x = 400 - (10+50) = 340 + expect(result.current.anchor.x).toBe(340); }); - it('uses custom offsetX when provided', () => { + it('computes anchorY as distance from button top to screen bottom', () => { const mockMeasureInWindow = jest.fn((callback) => { callback(10, 100, 50, 30); }); - const { result } = renderHook(() => useKeyboardAwarePopover(50)); + const { result } = renderHook(() => useKeyboardAwarePopover()); (result.current.triggerRef as any).current = { measureInWindow: mockMeasureInWindow, @@ -396,7 +397,8 @@ describe('useKeyboardAwarePopover', () => { result.current.show(); }); - expect(result.current.anchor.x).toBe(50); + // screenH=800, btnY=100 → y = 800 - 100 = 700 + expect(result.current.anchor.y).toBe(700); }); }); }); \ No newline at end of file diff --git a/__tests__/unit/services/ttsService.test.ts b/__tests__/unit/services/ttsService.test.ts new file mode 100644 index 000000000..4e46d45b9 --- /dev/null +++ b/__tests__/unit/services/ttsService.test.ts @@ -0,0 +1,302 @@ +/** + * TTS Service Unit Tests + * + * Tests for backbone/vocoder download, model lifecycle, audio generation, + * file persistence, and playback control. 
+ * Priority: P1 - Core TTS functionality. + */ + +jest.mock('llama.rn', () => ({ + initLlama: jest.fn(), +})); + +jest.mock('react-native-fs', () => ({ + DocumentDirectoryPath: '/mock/docs', + exists: jest.fn(), + mkdir: jest.fn(), + unlink: jest.fn(), + downloadFile: jest.fn(), + writeFile: jest.fn(), + readFile: jest.fn(), + stat: jest.fn(), + readDir: jest.fn(), +})); + +jest.mock('react-native-audio-api', () => ({ + AudioContext: jest.fn().mockImplementation(() => ({ + createBuffer: jest.fn().mockReturnValue({ copyToChannel: jest.fn() }), + createBufferSource: jest.fn().mockReturnValue({ + connect: jest.fn(), + start: jest.fn(), + stop: jest.fn(), + playbackRate: { value: 1.0 }, + onended: null, + buffer: null, + }), + destination: {}, + close: jest.fn(), + })), +})); + +jest.mock('../../../src/utils/logger', () => ({ + __esModule: true, + default: { log: jest.fn(), error: jest.fn(), warn: jest.fn() }, +})); + +import RNFS from 'react-native-fs'; +import { initLlama } from 'llama.rn'; +import { ttsService } from '../../../src/services/ttsService'; +import { TTS_BACKBONE_MODEL } from '../../../src/constants/ttsModels'; + +const mockRNFS = RNFS as jest.Mocked<typeof RNFS>; +const mockInitLlama = initLlama as jest.Mock; + +const makeMockContext = (vocoderEnabled = true) => ({ + initVocoder: jest.fn().mockResolvedValue(undefined), + isVocoderEnabled: jest.fn().mockResolvedValue(vocoderEnabled), + releaseVocoder: jest.fn().mockResolvedValue(undefined), + release: jest.fn().mockResolvedValue(undefined), + getFormattedAudioCompletion: jest.fn().mockResolvedValue({ prompt: 'p', grammar: 'g' }), + getAudioCompletionGuideTokens: jest.fn().mockResolvedValue([1, 2, 3]), + completion: jest.fn().mockResolvedValue({ audio_tokens: [10, 20, 30] }), + decodeAudioTokens: jest.fn().mockResolvedValue(new Array(2400).fill(0.1)), +}); + +describe('ttsService', () => { + beforeEach(() => { + jest.clearAllMocks(); + // Reset internal state between tests + (ttsService as any).context = null; + 
(ttsService as any).isVocoderReady = false; + (ttsService as any).isSpeakingFlag = false; + (ttsService as any).contextLoadPromise = Promise.resolve(); + }); + + // ─── Paths ──────────────────────────────────────────────────────────────── + + describe('paths', () => { + it('backbone path uses tts-models directory', () => { + expect(ttsService.getBackbonePath()).toBe( + `/mock/docs/tts-models/${TTS_BACKBONE_MODEL.backboneFile}`, + ); + }); + + it('vocoder path uses tts-models directory', () => { + expect(ttsService.getVocoderPath()).toBe( + `/mock/docs/tts-models/${TTS_BACKBONE_MODEL.vocoderFile}`, + ); + }); + + it('audio file path scoped to conversationId and messageId', () => { + expect(ttsService.getAudioFilePath('conv1', 'msg1')).toBe( + '/mock/docs/audio-cache/conv1/msg1.pcm', + ); + }); + }); + + // ─── Download ──────────────────────────────────────────────────────────── + + describe('downloadBackbone', () => { + it('returns existing path without downloading if already present', async () => { + mockRNFS.exists.mockResolvedValueOnce(true) // ensureDir + .mockResolvedValueOnce(true); // file exists + const path = await ttsService.downloadBackbone(); + expect(mockRNFS.downloadFile).not.toHaveBeenCalled(); + expect(path).toBe(ttsService.getBackbonePath()); + }); + + it('downloads and returns path on success', async () => { + mockRNFS.exists.mockResolvedValueOnce(false) // dir missing + .mockResolvedValueOnce(false); // file missing + mockRNFS.mkdir.mockResolvedValueOnce(undefined); + mockRNFS.downloadFile.mockReturnValue({ jobId: 1, promise: Promise.resolve({ statusCode: 200, jobId: 1, bytesWritten: 0 }) }); + + const onProgress = jest.fn(); + const path = await ttsService.downloadBackbone(onProgress); + + expect(mockRNFS.downloadFile).toHaveBeenCalledWith( + expect.objectContaining({ fromUrl: TTS_BACKBONE_MODEL.backboneUrl }), + ); + expect(path).toBe(ttsService.getBackbonePath()); + }); + + it('throws and removes partial file on non-200 response', async () => 
{ + mockRNFS.exists.mockResolvedValue(false); + mockRNFS.mkdir.mockResolvedValueOnce(undefined); + mockRNFS.downloadFile.mockReturnValue({ jobId: 1, promise: Promise.resolve({ statusCode: 404, jobId: 1, bytesWritten: 0 }) }); + mockRNFS.unlink.mockResolvedValue(undefined); + + await expect(ttsService.downloadBackbone()).rejects.toThrow('HTTP 404'); + expect(mockRNFS.unlink).toHaveBeenCalled(); + }); + }); + + describe('downloadVocoder', () => { + it('downloads vocoder to correct path', async () => { + mockRNFS.exists.mockResolvedValue(false); + mockRNFS.mkdir.mockResolvedValueOnce(undefined); + mockRNFS.downloadFile.mockReturnValue({ jobId: 1, promise: Promise.resolve({ statusCode: 200, jobId: 1, bytesWritten: 0 }) }); + + const path = await ttsService.downloadVocoder(); + expect(mockRNFS.downloadFile).toHaveBeenCalledWith( + expect.objectContaining({ fromUrl: TTS_BACKBONE_MODEL.vocoderUrl }), + ); + expect(path).toBe(ttsService.getVocoderPath()); + }); + }); + + // ─── Model Lifecycle ───────────────────────────────────────────────────── + + describe('loadModels', () => { + it('calls initLlama with backbone path then initVocoder', async () => { + const ctx = makeMockContext(); + mockInitLlama.mockResolvedValue(ctx); + + await ttsService.loadModels(); + + expect(mockInitLlama).toHaveBeenCalledWith( + expect.objectContaining({ model: ttsService.getBackbonePath() }), + ); + expect(ctx.initVocoder).toHaveBeenCalledWith( + expect.objectContaining({ path: ttsService.getVocoderPath() }), + ); + }); + + it('throws if isVocoderEnabled returns false', async () => { + const ctx = makeMockContext(false); + mockInitLlama.mockResolvedValue(ctx); + + await expect(ttsService.loadModels()).rejects.toThrow('Vocoder failed to initialize'); + }); + + it('is idempotent — does not double-init if already loaded', async () => { + const ctx = makeMockContext(); + mockInitLlama.mockResolvedValue(ctx); + + await ttsService.loadModels(); + await ttsService.loadModels(); + + 
expect(mockInitLlama).toHaveBeenCalledTimes(1); + }); + }); + + describe('unloadModels', () => { + it('calls releaseVocoder and release', async () => { + const ctx = makeMockContext(); + mockInitLlama.mockResolvedValue(ctx); + await ttsService.loadModels(); + + await ttsService.unloadModels(); + + expect(ctx.releaseVocoder).toHaveBeenCalled(); + expect(ctx.release).toHaveBeenCalled(); + expect(ttsService.isLoaded()).toBe(false); + }); + }); + + // ─── Generation ────────────────────────────────────────────────────────── + + describe('generate', () => { + it('calls completion pipeline in correct order and returns GeneratedAudio', async () => { + const ctx = makeMockContext(); + mockInitLlama.mockResolvedValue(ctx); + await ttsService.loadModels(); + + const audio = await ttsService.generate('hello world'); + + expect(ctx.getFormattedAudioCompletion).toHaveBeenCalled(); + expect(ctx.getAudioCompletionGuideTokens).toHaveBeenCalledWith('hello world'); + expect(ctx.completion).toHaveBeenCalled(); + expect(ctx.decodeAudioTokens).toHaveBeenCalled(); + + expect(audio.samples).toBeInstanceOf(Float32Array); + expect(audio.waveformData).toHaveLength(200); + expect(audio.durationSeconds).toBeGreaterThan(0); + expect(audio.sampleRate).toBe(TTS_BACKBONE_MODEL.sampleRate); + }); + + it('throws if models not loaded', async () => { + await expect(ttsService.generate('test')).rejects.toThrow('TTS models not loaded'); + }); + }); + + describe('saveToFile', () => { + it('writes base64-encoded PCM to correct path', async () => { + mockRNFS.exists.mockResolvedValue(false); + mockRNFS.mkdir.mockResolvedValueOnce(undefined); + mockRNFS.writeFile.mockResolvedValueOnce(undefined); + + const audio = { + samples: new Float32Array([0.1, 0.2, 0.3]), + durationSeconds: 0.01, + sampleRate: 24000, + waveformData: new Array(200).fill(0.1), + }; + + const path = await ttsService.saveToFile(audio, 'conv1', 'msg1'); + + expect(path).toBe('/mock/docs/audio-cache/conv1/msg1.pcm'); + 
expect(mockRNFS.writeFile).toHaveBeenCalledWith( + '/mock/docs/audio-cache/conv1/msg1.pcm', + expect.any(String), + 'base64', + ); + }); + }); + + // ─── Stop ──────────────────────────────────────────────────────────────── + + describe('stop', () => { + it('sets isSpeakingFlag to false', () => { + (ttsService as any).isSpeakingFlag = true; + ttsService.stop(); + expect(ttsService.isSpeaking()).toBe(false); + }); + + it('calls stop on currentSource', () => { + const mockSource = { stop: jest.fn() }; + (ttsService as any).currentSource = mockSource; + ttsService.stop(); + expect(mockSource.stop).toHaveBeenCalled(); + }); + }); + + // ─── Cache ──────────────────────────────────────────────────────────────── + + describe('getAudioCacheSizeMB', () => { + it('returns 0 if cache directory does not exist', async () => { + mockRNFS.exists.mockResolvedValueOnce(false); + const size = await ttsService.getAudioCacheSizeMB(); + expect(size).toBe(0); + }); + + it('returns size in MB by summing individual file sizes', async () => { + mockRNFS.exists.mockResolvedValueOnce(true); + // readDir(cacheRoot) → one conversation directory + (mockRNFS as any).readDir + .mockResolvedValueOnce([{ isDirectory: () => true, path: '/mock/docs/audio-cache/conv1' }]) + // readDir(conv1) → two .pcm files, each 2.5 MB + .mockResolvedValueOnce([ + { isDirectory: () => false, size: 2.5 * 1024 * 1024 }, + { isDirectory: () => false, size: 2.5 * 1024 * 1024 }, + ]); + const size = await ttsService.getAudioCacheSizeMB(); + expect(size).toBeCloseTo(5); + }); + }); + + describe('clearAudioCache', () => { + it('unlinks the cache root if it exists', async () => { + mockRNFS.exists.mockResolvedValueOnce(true); + mockRNFS.unlink.mockResolvedValueOnce(undefined); + await ttsService.clearAudioCache(); + expect(mockRNFS.unlink).toHaveBeenCalledWith('/mock/docs/audio-cache'); + }); + + it('does nothing if cache does not exist', async () => { + mockRNFS.exists.mockResolvedValueOnce(false); + await 
ttsService.clearAudioCache(); + expect(mockRNFS.unlink).not.toHaveBeenCalled(); + }); + }); +}); diff --git a/__tests__/unit/stores/ttsStore.test.ts b/__tests__/unit/stores/ttsStore.test.ts new file mode 100644 index 000000000..e3ae0164c --- /dev/null +++ b/__tests__/unit/stores/ttsStore.test.ts @@ -0,0 +1,225 @@ +/** + * TTS Store Unit Tests + * + * Tests for the engine-agnostic TTS store. + * The store delegates to the active TTSEngine via the registry. + */ + +// Mock the engine module — we control the registry and engine instances +const mockEngine = { + id: 'mock-tts', + displayName: 'Mock TTS', + capabilities: { + streaming: false, + voiceCloning: false, + pauseResume: true, + generateAndSave: true, + peakRamMB: 100, + }, + getPhase: jest.fn(() => 'ready' as const), + on: jest.fn(() => jest.fn()), // returns unsub + off: jest.fn(), + once: jest.fn(() => jest.fn()), + isSupported: jest.fn(() => true), + initialize: jest.fn().mockResolvedValue(undefined), + release: jest.fn().mockResolvedValue(undefined), + destroy: jest.fn().mockResolvedValue(undefined), + getRequiredAssets: jest.fn(() => []), + checkAssetStatus: jest.fn().mockResolvedValue([]), + downloadAssets: jest.fn().mockResolvedValue(undefined), + deleteAssets: jest.fn().mockResolvedValue(undefined), + getOverallDownloadProgress: jest.fn(() => 1), + isFullyDownloaded: jest.fn(() => true), + getBridgeComponent: jest.fn(() => null), + getVoices: jest.fn(() => [{ id: 'default', label: 'Default', metadata: {} }]), + getActiveVoice: jest.fn(() => ({ id: 'default', label: 'Default', metadata: {} })), + setVoice: jest.fn().mockResolvedValue(undefined), + speak: jest.fn().mockResolvedValue(undefined), + generateAndSave: jest.fn().mockResolvedValue({ + filePath: '/cache/c1/m1.pcm', + durationSeconds: 2.5, + waveformData: new Array(200).fill(0.1), + }), + playFromFile: jest.fn().mockResolvedValue(undefined), + stop: jest.fn(), + pause: jest.fn(), + resume: jest.fn(), +}; + +jest.mock('../../../src/engine', () => 
({ + ttsRegistry: { + register: jest.fn(), + has: jest.fn(() => true), + getEngine: jest.fn(() => mockEngine), + setActiveEngine: jest.fn().mockResolvedValue(mockEngine), + getActiveEngine: jest.fn(() => mockEngine), + getActiveEngineId: jest.fn(() => 'mock-tts'), + getRegisteredIds: jest.fn(() => ['mock-tts']), + }, + OuteTTSEngine: class {}, +})); + +jest.mock('../../../src/utils/logger', () => ({ + __esModule: true, + default: { log: jest.fn(), error: jest.fn(), warn: jest.fn() }, +})); + +import { useTTSStore } from '../../../src/stores/ttsStore'; + +const getState = () => useTTSStore.getState(); + +const resetState = () => { + useTTSStore.setState({ + phase: 'ready', + currentMessageId: null, + currentAmplitude: 0, + playbackElapsed: 0, + playSessionId: 0, + error: null, + isReady: true, + isDownloading: false, + isLoading: false, + isSpeaking: false, + isPaused: false, + isGeneratingAudio: false, + assets: [], + overallDownloadProgress: 1, + voices: [{ id: 'default', label: 'Default', metadata: {} }], + activeVoiceId: 'default', + audioCacheSizeMB: 0, + settings: { + interfaceMode: 'chat', + enabled: true, + autoPlay: false, + speed: 1.0, + engineId: 'mock-tts', + voiceByEngine: {}, + }, + }); +}; + +describe('ttsStore', () => { + beforeEach(() => { + resetState(); + jest.clearAllMocks(); + }); + + // ── Speak ────────────────────────────────────────────────────────────── + + describe('speak', () => { + it('delegates to engine.speak with correct options', async () => { + await getState().speak('hello', 'msg1'); + + expect(mockEngine.speak).toHaveBeenCalledWith('hello', expect.objectContaining({ + speed: 1.0, + messageId: 'msg1', + })); + }); + + it('toggles off when same message is already speaking', async () => { + useTTSStore.setState({ isSpeaking: true, currentMessageId: 'msg1' }); + + await getState().speak('hello', 'msg1'); + + expect(mockEngine.stop).toHaveBeenCalled(); + expect(mockEngine.speak).not.toHaveBeenCalled(); + }); + + it('does nothing when 
TTS is disabled', async () => { + useTTSStore.setState({ settings: { ...getState().settings, enabled: false } }); + + await getState().speak('hello', 'msg1'); + + expect(mockEngine.speak).not.toHaveBeenCalled(); + }); + + it('clears currentMessageId after completion', async () => { + await getState().speak('hello', 'msg1'); + + expect(getState().currentMessageId).toBeNull(); + }); + }); + + // ── Stop / Pause / Resume ───────────────────────────────────────────── + + describe('stop', () => { + it('delegates to engine.stop and clears state', () => { + useTTSStore.setState({ currentMessageId: 'msg1' }); + getState().stop(); + + expect(mockEngine.stop).toHaveBeenCalled(); + expect(getState().currentMessageId).toBeNull(); + }); + }); + + describe('pause/resume', () => { + it('delegates to engine', () => { + getState().pause(); + expect(mockEngine.pause).toHaveBeenCalled(); + + getState().resume(); + expect(mockEngine.resume).toHaveBeenCalled(); + }); + }); + + // ── Generate and Save ───────────────────────────────────────────────── + + describe('generateAndSave', () => { + it('delegates to engine and returns result', async () => { + const result = await getState().generateAndSave('hello', 'conv1', 'msg1'); + + expect(mockEngine.generateAndSave).toHaveBeenCalledWith('hello', 'conv1', 'msg1', expect.any(Object)); + expect(result.path).toBe('/cache/c1/m1.pcm'); + expect(result.waveformData).toHaveLength(200); + expect(result.durationSeconds).toBe(2.5); + }); + }); + + // ── Play Message ────────────────────────────────────────────────────── + + describe('playMessage', () => { + it('delegates to engine.playFromFile', async () => { + await getState().playMessage('msg1', '/cache/conv1/msg1.pcm'); + + expect(mockEngine.playFromFile).toHaveBeenCalledWith('/cache/conv1/msg1.pcm', expect.objectContaining({ + speed: 1.0, + startOffset: 0, + messageId: 'msg1', + })); + }); + + it('stops if same message is already playing', async () => { + useTTSStore.setState({ isSpeaking: true, 
currentMessageId: 'msg1' }); + + await getState().playMessage('msg1', '/cache/conv1/msg1.pcm'); + + expect(mockEngine.stop).toHaveBeenCalled(); + expect(mockEngine.playFromFile).not.toHaveBeenCalled(); + }); + }); + + // ── Settings ────────────────────────────────────────────────────────── + + describe('updateSettings', () => { + it('merges partial settings', () => { + getState().updateSettings({ speed: 1.5, autoPlay: true }); + const { settings } = getState(); + expect(settings.speed).toBe(1.5); + expect(settings.autoPlay).toBe(true); + expect(settings.enabled).toBe(true); + }); + + it('can switch interfaceMode', () => { + getState().updateSettings({ interfaceMode: 'audio' }); + expect(getState().settings.interfaceMode).toBe('audio'); + }); + }); + + describe('clearError', () => { + it('clears the error field', () => { + useTTSStore.setState({ error: 'something went wrong' }); + getState().clearError(); + expect(getState().error).toBeNull(); + }); + }); +}); diff --git a/__tests__/unit/utils/messageContent.test.ts b/__tests__/unit/utils/messageContent.test.ts index b35b0181e..5f79afefb 100644 --- a/__tests__/unit/utils/messageContent.test.ts +++ b/__tests__/unit/utils/messageContent.test.ts @@ -118,8 +118,8 @@ describe('stripControlTokens', () => { expect(stripControlTokens('<|im_start|>assistant\n<|im_end|>')).toBe(''); }); - it('preserves whitespace in content', () => { - expect(stripControlTokens(' Hello World ')).toBe(' Hello World '); + it('trims leading/trailing whitespace in content', () => { + expect(stripControlTokens(' Hello World ')).toBe('Hello World'); }); it('preserves HTML-like tags that are not control tokens', () => { diff --git a/android/build.gradle b/android/build.gradle index dad99b022..984e5bed6 100644 --- a/android/build.gradle +++ b/android/build.gradle @@ -19,3 +19,4 @@ buildscript { } apply plugin: "com.facebook.react.rootproject" + diff --git a/docs/PERSONAS_IMPLEMENTATION_PLAN.md b/docs/PERSONAS_IMPLEMENTATION_PLAN.md index 
93ccdd5db..dd1225fcb 100644 --- a/docs/PERSONAS_IMPLEMENTATION_PLAN.md +++ b/docs/PERSONAS_IMPLEMENTATION_PLAN.md @@ -31,7 +31,8 @@ export type Capability = | 'voice' // STT + TTS | 'vision' // image understanding | 'image-gen' // image generation - | 'rag'; // knowledge base search + | 'rag' // knowledge base search (user-uploaded documents) + | 'memory-rag'; // cross-conversation RAG — past messages indexed and retrieved export type SkillTriggerEvent = | 'message_received' // new message in connected app @@ -109,8 +110,9 @@ export interface Persona { capabilities: Capability[]; // What this persona knows - knowledgeBaseIds: string[]; // attached RAG knowledge bases (use projectId as KB id) - memoryFacts: PersonaMemoryFact[]; // persistent learned facts + knowledgeBaseIds: string[]; // attached RAG knowledge bases (user-uploaded documents) + conversationMemoryEnabled: boolean; // true = all past conversations for this persona are embedded + searchable + memoryFacts: PersonaMemoryFact[]; // persistent learned facts (LLM-extracted, concise) // What this persona does automatically skills: Skill[]; @@ -227,8 +229,9 @@ export const DEFAULT_PERSONAS: Omit[] = [ systemPrompt: 'You are Jarvis, a capable and concise personal assistant. You help with anything — questions, tasks, planning, thinking. You are direct, warm, and never verbose unless asked.', icon: 'cpu', accentColor: '#6366F1', - capabilities: ['text', 'voice', 'vision'], + capabilities: ['text', 'voice', 'vision', 'memory-rag'], knowledgeBaseIds: [], + conversationMemoryEnabled: true, // Jarvis indexes all past conversations — gives it cross-chat intelligence memoryFacts: [], skills: [], integrationIds: [], @@ -418,6 +421,113 @@ export function buildMemoryContext(facts: PersonaMemoryFact[]): string { } ``` +### conversationRagService.ts (new — cross-conversation memory) + +This is what makes Jarvis actually intelligent across sessions. 
Rather than relying only on extracted `memoryFacts` (brief summaries) or the current context window, Jarvis embeds every conversation message into a per-persona vector store. When a new message arrives, relevant past exchanges are retrieved and injected as context — so Jarvis remembers "we discussed your onboarding last Tuesday" without you having to repeat it. + +**How it's different from document KB:** + +| | Document KB (`knowledgeBaseIds`) | Conversation RAG (`conversationMemoryEnabled`) | +|---|---|---| +| Source | User-uploaded PDFs, notes | Past conversation messages | +| Indexed when | User uploads a file | After each assistant response | +| Retrieved by | User explicitly asking about docs | Automatically on every message | +| Scoped to | Attached knowledge bases | All conversations for this persona | + +```typescript +// src/services/conversationRagService.ts + +/** + * Indexes completed conversation messages into the persona's vector store. + * Called after each assistant turn completes (streaming done). + * + * Each chunk stored = ~4–6 messages grouped by semantic coherence, not + * arbitrary token windows. This preserves conversational context. + */ +export async function indexConversationTurn( + personaId: string, + conversationId: string, + messages: Message[], // recent messages to embed (typically last 4–6) +): Promise<void> { + const chunks = chunkMessagesForEmbedding(messages); + for (const chunk of chunks) { + const embedding = await embeddingService.embed(chunk.text); + await vectorStore.upsert({ + id: `${conversationId}:${chunk.startIndex}`, + embedding, + metadata: { + personaId, + conversationId, + timestamp: chunk.timestamp, + preview: chunk.text.slice(0, 120), + }, + }); + } +} + +/** + * Retrieves the most relevant past conversation context for the current message. + * Returns plain text ready to inject into the system prompt. 
+ */ +export async function retrieveRelevantHistory( + personaId: string, + currentMessage: string, + topK = 3, +): Promise<string> { + const queryEmbedding = await embeddingService.embed(currentMessage); + const results = await vectorStore.search({ + embedding: queryEmbedding, + filter: { personaId }, + topK, + minScore: 0.72, // only inject if meaningfully relevant + }); + + if (results.length === 0) return ''; + + const snippets = results.map(r => + `[${formatRelativeDate(r.metadata.timestamp)}]\n${r.metadata.preview}` + ); + return `\n\nRelevant context from past conversations:\n${snippets.join('\n\n---\n\n')}`; +} + +/** + * Groups messages into semantically coherent chunks for embedding. + * Avoids splitting a user question from its assistant answer. + */ +function chunkMessagesForEmbedding(messages: Message[]): EmbeddingChunk[] { + // Pair each user message with its following assistant response + // Output: chunks of ~300–400 tokens each +} +``` + +**System prompt injection** (in `llm.ts` or wherever the prompt is assembled): + +```typescript +// When conversationMemoryEnabled is true for the active persona: +if (persona.conversationMemoryEnabled) { + const history = await conversationRagService.retrieveRelevantHistory( + persona.id, + latestUserMessage, + ); + systemPrompt += history; +} +``` + +**Indexing trigger** (after streaming completes, in chatStore or the streaming callback): + +```typescript +// After assistant response is done streaming: +if (persona.conversationMemoryEnabled) { + conversationRagService.indexConversationTurn( + persona.id, + conversationId, + recentMessages.slice(-6), + ).catch(() => {}); // fire-and-forget, non-blocking +} +``` + +**Storage:** Uses the existing `ragService` vector store, namespaced by `personaId`. No new storage layer needed — just a new indexing source. + --- ## Screens @@ -926,6 +1036,11 @@ export interface Message { 18. Memory injection into system prompt 19. `PersonaMemoryScreen` 20. 
Memory bar in chat (new fact notification) +21. `conversationRagService.ts` — cross-conversation RAG for `memory-rag` capability + - Index each conversation turn after streaming completes (fire-and-forget) + - Retrieve relevant history and inject into system prompt before each LLM call + - Jarvis has `conversationMemoryEnabled: true` by default; other personas opt in via PersonaEditScreen + - Reuses existing `ragService` vector store, namespaced by `personaId` ### Phase 5 — Integrations in Chat (tool calls) 21. Wire integration tool registry entries diff --git a/docs/TTS_ENGINE_INTERFACE.md b/docs/TTS_ENGINE_INTERFACE.md new file mode 100644 index 000000000..a8ddd5f1f --- /dev/null +++ b/docs/TTS_ENGINE_INTERFACE.md @@ -0,0 +1,154 @@ +# TTS Engine Interface + +## Overview + +The TTS subsystem uses a pluggable engine interface that decouples the app from any specific TTS implementation. Engines are registered at startup, the user picks one in settings, and the store delegates all operations through the active engine. + +The interface is designed as the first concrete implementation of a broader **On-Device Engine** pattern that will generalize to STT, Vision, and LLM modalities. 
+ +## Architecture + +``` +src/engine/ + types.ts # OnDeviceEngine base + TTSEngine interface + OnDeviceEngineEmitter.ts # Zero-dep typed event emitter + EngineRegistry.ts # Generic registry (TTS, STT, Vision, LLM) + index.ts # Barrel + singleton ttsRegistry + + tts/engines/ + kokoro/ # Kokoro TTS via react-native-executorch + KokoroEngine.ts # TTSEngine implementation + KokoroTTSBridge.tsx # React component bridge (wraps useTextToSpeech hook) + voices.ts # 8 voice definitions + outetts/ # OuteTTS 0.3 via llama.rn + OuteTTSEngine.ts # TTSEngine implementation + models.ts # GGUF asset definitions + qwen3/ # Qwen3-TTS 0.6B (stub) + Qwen3TTSEngine.ts # Asset management ready, inference TODO + models.ts # Talker + predictor + codec asset definitions +``` + +## How It Works + +### Engine Lifecycle + +``` +register → getEngine → setActiveEngine → initialize → speak/stop/pause → release +``` + +1. **Registration** — engines register factories at import time in `engine/index.ts` +2. **Activation** — `ttsRegistry.setActiveEngine('kokoro')` creates the instance and releases the previous engine +3. **Initialization** — imperative engines (OuteTTS) load models via `initialize()`. Hook-based engines (Kokoro) initialize when the bridge component mounts. +4. **Usage** — `engine.speak(text, options)` is the universal entry point +5. **Teardown** — `engine.release()` frees models; `engine.destroy()` also deletes downloaded files + +### Event System + +Every engine emits typed events. 
The store subscribes once and syncs state: + +- `phaseChange` — idle/downloading/loading/ready/processing/paused/error +- `audioChunk` — streaming PCM data (Kokoro) +- `audioComplete` — full audio buffer (OuteTTS) +- `downloadProgress` — per-asset download progress +- `amplitudeChange` — RMS amplitude for waveform visualization +- `voiceChanged` — active voice updated +- `error` — recoverable/non-recoverable errors + +### Store Delegation + +The Zustand store (`ttsStore.ts`) is a thin proxy: + +```typescript +speak: async (text, messageId) => { + const engine = ttsRegistry.getActiveEngine(); + if (!engine || !get().settings.enabled) return; + await engine.speak(text, { speed: get().settings.speed, messageId }); +} +``` + +No engine-specific branching. The store exposes derived booleans (`isReady`, `isSpeaking`, `isPaused`) computed from the engine's phase for backward compatibility with UI components. + +### React Bridge Pattern + +Some engines (Kokoro) depend on React hooks. These engines return a React component from `getBridgeComponent()`. The `` component (mounted in `App.tsx`) renders it: + +``` +App.tsx → → engine.getBridgeComponent() → +``` + +The bridge mounts the hook, then pushes an imperative handle into the engine instance. Fully imperative engines (OuteTTS, Qwen3) return `null` — no bridge needed. + +## Registered Engines + +| Engine | ID | Size | Streaming | Voice Cloning | Status | +|--------|-----|------|-----------|---------------|--------| +| Kokoro TTS | `kokoro` | 82 MB | Yes | No | Production | +| OuteTTS 0.3 | `outetts` | 530 MB | No | Yes | Production | +| Qwen3-TTS 0.6B | `qwen3-tts` | ~650 MB | No | Yes | Stub (not registered) | + +## Adding a New Engine + +1. Create `src/engine/tts/engines//` with: + - `models.ts` — `ModelAsset[]` definitions (URLs, sizes, filenames) + - `Engine.ts` — class extending `OnDeviceEngineEmitter` implementing `TTSEngine` + - `index.ts` — barrel exports + +2. 
Implement the interface: + - `getRequiredAssets()` — what to download + - `initialize()` — load models into memory + - `speak()` — text in, audio out + - `getVoices()` / `setVoice()` — voice management + - `stop()` / `pause()` / `resume()` — playback control + - `getBridgeComponent()` — return `null` for imperative engines + +3. Register in `src/engine/index.ts`: + ```typescript + import { MyEngine } from './tts/engines/myengine'; + ttsRegistry.register('myengine', () => new MyEngine()); + ``` + +4. It appears in the engine picker on the TTS Settings screen automatically. + +## Multimodal Future + +The `OnDeviceEngine` base interface generalizes beyond TTS: + +``` +OnDeviceEngine # lifecycle, assets, events, capabilities + ├── TTSEngine # text → audio (Kokoro, OuteTTS, Qwen3) + ├── STTEngine (future) # audio → text (whisper.rn) + ├── VisionEngine (future) # image → structured (CoreML) + └── LLMEngine (future) # text → text (llama.rn) +``` + +Each modality shares: lifecycle management, model asset download/delete, typed event system, capability declaration, platform checks, and the React bridge pattern. + +The `EngineRegistry` is generic — `new EngineRegistry()` works identically. + +The orchestration layer above would wire engines together: +- **Listen** (STT) → **Think** (LLM) → **Speak** (TTS) +- **See** (Vision) feeds context to **Think** + +## Qwen3-TTS Integration Path + +The stub is ready at `src/engine/tts/engines/qwen3/`. Asset management, download, and lifecycle are implemented. The remaining work is the inference pipeline in `speak()`: + +1. Load talker GGUF + predictor GGUF via `llama.rn` (two contexts) +2. Load codec decoder ONNX via `onnxruntime-react-native` +3. Talker generates first-codebook tokens at 12Hz +4. Predictor fills codebooks 2-16 +5. Codec decodes token grid to PCM Float32 at 24kHz + +Reference: [LunaVox](https://github.com/wkwong/lunavox) has a working desktop implementation of this pipeline. 
+ +## Settings Migration + +The store handles migration from the pre-engine-interface format automatically via `onRehydrateStorage`. Old fields (`voiceId`, `kokoroVoiceId`) are migrated to `voiceByEngine` map on first load. + +## Key Files + +- `src/engine/types.ts` — all interfaces +- `src/engine/index.ts` — registry + engine registration +- `src/stores/ttsStore.ts` — store (delegates to active engine) +- `src/components/EngineBridge.tsx` — renders bridge for hook-based engines +- `src/screens/TTSSettingsScreen/index.tsx` — engine picker UI diff --git a/docs/TTS_IMPLEMENTATION_PLAN.md b/docs/TTS_IMPLEMENTATION_PLAN.md index 19b6942c5..41f548f40 100644 --- a/docs/TTS_IMPLEMENTATION_PLAN.md +++ b/docs/TTS_IMPLEMENTATION_PLAN.md @@ -2,1075 +2,275 @@ ## Product Vision -Two first-class interface modes, switchable from Settings: +Two first-class interface modes, switchable from Chat Settings or TTS Settings: | Mode | Primary output | TTS role | Text | |---|---|---|---| | **Chat Mode** | Text bubbles | Add-on — play button per message | Default visible | -| **Audio Mode** | Waveform bubbles | Core — auto-generated at completion | Hidden by default, expandable | +| **Audio Mode** | Waveform bubbles (both sides) | Core — auto-generated at completion | Hidden by default, expandable | -**Audio Mode is the target product experience.** Messages feel like voice note exchanges — not a chat app that also speaks. The user has full per-message audio controls: scrub to position, adjust playback speed, change voice/tone. Text is always available as a "Show transcript" expand. +**Audio Mode is the target product experience.** Both the user's voice recordings AND the AI's responses appear as waveform audio bubbles — a full voice-note conversation. No text is shown by default; transcript is always accessible via "Show transcript" expand. -Chat Mode is the fallback for devices that can't run TTS models, or users who prefer it. 
+- User voice recordings: right-aligned audio bubbles (recorded WAV, played back locally) +- AI responses: left-aligned audio bubbles (OuteTTS-generated, with 40-bar waveform visualization) + +Chat Mode is the fallback for devices that can't run TTS models, or users who prefer text. --- ## Decision Log -### Engine -**OuteTTS 0.3 (500M) + WavTokenizer** via `llama.rn`. +### Engine (updated) + +**Two-tier TTS architecture:** + +| Tier | Engine | Use case | Speed | Size | +|---|---|---|---|---| +| **Tier 1 — Speak (Chat Mode)** | Kokoro via `react-native-executorch` | On-demand speak button, long-press Speak action | ~1s (streaming) | ~100MB | +| **Tier 2 — Generate+Save (Audio Mode)** | OuteTTS 0.3 + WavTokenizer via `llama.rn` | Auto-generate waveform bubble after streaming | ~30–120s | ~527MB | + +**Why two tiers:** +- Kokoro via ExecuTorch is fast enough for interactive use (streaming starts < 1s) but outputs raw PCM chunks — no way to write to disk for waveform scrubbing without custom buffering +- OuteTTS via llama.rn generates the full audio up front, returns `Float32Array` + waveform data + duration in one call — ideal for the saved-file + waveform visualisation pattern Audio Mode requires +- OuteTTS is NOT suitable for the speak button (too slow, ~30–120s per sentence) +- Kokoro is NOT currently available as a GGUF via llama.cpp (feature request opened Jan 2025, closed stale Oct 2025, never merged) + +**Previous decision (superseded):** +OuteTTS only via llama.rn for both modes. Superseded because ~1 minute to speak a single sentence is not acceptable for interactive use. + +### Platform constraint -- OuteTTS 1.0 (Qwen3 0.6B) is blocked: the DAC vocoder has no GGUF, and llama.cpp PR#12794 is an open draft. The backbone exists on HuggingFace but the decoder is not implemented upstream. -- OuteTTS 0.3 with WavTokenizer is the **only fully working path** through llama.rn today (confirmed via TTSScreen.tsx in mybigday/llama.rn example app). 
-- Upgrade to OuteTTS 1.0 will be a model swap with no architecture change once PR#12794 and llama.rn PR#300 land. +`react-native-executorch` requires **Android 13 (API 33)** minimum and **iOS 17** minimum. + +Current app `minSdkVersion` is **24 (Android 7)**. + +**Resolution:** Kokoro speak is available only on Android 13+ / iOS 17+. On older devices, the speak button falls back to OuteTTS (slow but functional). This is detected at runtime — no code path is dead, just slower on older OS. + +`minSdkVersion` stays at 24. No breaking change for existing users. ### Playback -**react-native-audio-api** (Software Mansion). Implements the Web Audio API spec for React Native. `decodeAudioTokens()` returns `number[]` (Float32 PCM at 24kHz mono) which feeds directly into an `AudioBuffer`. +**react-native-audio-api** (Software Mansion, already installed). Implements the Web Audio API spec for React Native. Both Kokoro (streaming `Float32Array` chunks) and OuteTTS (full `Float32Array`) pipe through the same `AudioContext → AudioBufferSourceNode` path at 24kHz mono. ### Audio Persistence (Audio Mode only) -In Audio Mode, generated PCM is written to disk as a WAV file per message so scrubbing works without re-generating. Files live at: +In Audio Mode, generated PCM is written to disk as a raw PCM file per message so scrubbing works without re-generating. Files live at: ``` -${RNFS.DocumentDirectoryPath}/audio-cache/{conversationId}/{messageId}.wav +${RNFS.DocumentDirectoryPath}/audio-cache/{conversationId}/{messageId}.pcm ``` Cache eviction strategy: - Keep the last 50 messages worth of audio per conversation - User can wipe audio cache from Settings ("Clear audio cache — X MB") -- Estimated size: ~1–4 MB per message (24kHz mono, varies by length) +- Estimated size: ~1–4 MB per message (24kHz mono Float32, varies by length) -In Chat Mode, audio is generated on demand, played, then discarded (no disk write). 
+In Chat Mode, audio is generated (via Kokoro) on demand, played, then discarded (no disk write). ### Voice Selection -OuteTTS 0.3 supports multiple speaker profiles. Expose as a voice picker in TTSSettingsScreen. Store selected voice ID in `ttsStore` settings (persisted). Default: speaker 0 (natural female). +- **Kokoro voices (Chat Mode speak):** 8 built-in voices (US/GB English, male/female). Stored as `kokoroVoiceId` in `ttsStore` settings. Default: `af_heart`. +- **OuteTTS voices (Audio Mode waveform):** Single profile (`speaker 0`) — OuteTTS 0.3 multi-speaker not confirmed working via llama.rn. Will expand when OuteTTS 1.0 lands. ### Device Gate -Require **flagship tier (8GB+ RAM)**. The memory stack: -``` -LLM (3B Q4) ~2.0 GB -Whisper base ~150 MB -OuteTTS backbone ~454 MB -WavTokenizer ~ 73 MB -OS + app ~2.0 GB -───────────────────────── -Total: ~4.7 GB → fits 8GB devices, tight on 6GB -``` -Show a warning (not a hard block) for 6–8GB devices. Hard block below 6GB. If device is blocked, Audio Mode is unavailable — app defaults to Chat Mode and hides the Audio Mode option. - ---- - -## Model Files - -| Role | HuggingFace Repo | File | Size | -|---|---|---|---| -| TTS Backbone | `OuteAI/OuteTTS-0.3-500M-GGUF` | `OuteTTS-0.3-500M-Q4_K_M.gguf` | 454 MB | -| Vocoder | `ggml-org/WavTokenizer` | `WavTokenizer-Large-75-Q5_1.gguf` | 73 MB | +Show a warning (not a hard block) for 6–8GB devices. Hard block below 6GB for Audio Mode (OuteTTS only). Kokoro speak has no RAM gate. 
-Direct download URLs (HuggingFace resolve): +Memory stack (worst case — both models loaded simultaneously): ``` -https://huggingface.co/OuteAI/OuteTTS-0.3-500M-GGUF/resolve/main/OuteTTS-0.3-500M-Q4_K_M.gguf -https://huggingface.co/ggml-org/WavTokenizer/resolve/main/WavTokenizer-Large-75-Q5_1.gguf +LLM (3B Q4) ~2.0 GB +Whisper base ~150 MB +OuteTTS backbone ~454 MB +WavTokenizer ~ 73 MB +Kokoro (XNNPACK .pte) ~100 MB ← new +OS + app ~2.0 GB +────────────────────────────── +Total: ~4.8 GB → fits 8GB devices ``` -Storage directories: -``` -${RNFS.DocumentDirectoryPath}/tts-models/ ← model weights -${RNFS.DocumentDirectoryPath}/audio-cache/ ← per-message WAV files (Audio Mode only) -``` +Kokoro and OuteTTS are never loaded simultaneously — Kokoro handles Chat Mode speak (OuteTTS not loaded), OuteTTS handles Audio Mode generation (Kokoro not involved). --- -## New Package - -```bash -npm install react-native-audio-api -``` - -iOS: run `pod install` after. -Android: auto-linked. - ---- - -## Interface Mode Setting - -### Where it lives -`ttsStore` settings object gains: +## Model Files -```typescript -export type InterfaceMode = 'chat' | 'audio'; - -export interface TTSSettings { - interfaceMode: InterfaceMode; // default: 'chat' until TTS models downloaded, then user can switch - enabled: boolean; - autoPlay: boolean; // Chat Mode only — auto-speak after completion - speed: number; // 0.5–2.0, default 1.0 - voiceId: string; // OuteTTS speaker profile, default '0' -} -``` +### Tier 1 — Kokoro (react-native-executorch) -### Mode switching rules -- If TTS models not downloaded → `interfaceMode` locked to `'chat'` -- If device RAM < 6GB → `interfaceMode` locked to `'chat'`, Audio Mode option hidden -- Switching mode takes effect immediately for new messages; existing messages render in whatever mode they were generated in (Chat Mode messages have no audio file, Audio Mode messages have one) -- A banner appears at the top of the chat on first switch: "Audio mode on — 
responses will play as voice notes." +Downloaded automatically by `react-native-executorch` to its internal cache (`react-native-executorch/` in document directory). No manual download management needed. ---- +| File | Source | Size (approx) | +|---|---|---| +| `duration_predictor.pte` | HuggingFace: `software-mansion/react-native-executorch-kokoro` | ~10 MB | +| `synthesizer.pte` | same | ~80 MB | +| Voice `.bin` files (per voice) | same repo | ~3–5 MB each | +| Phonemizer data (tagger + lexicon) | same repo | ~5 MB | -## Audio Mode: Message Bubble +Total cold download: ~100–120 MB. Subsequent launches use cached files. -### Layout (replaces text bubble for assistant messages) +### Tier 2 — OuteTTS (llama.rn, audio mode only) -``` -┌─────────────────────────────────────────────┐ -│ [avatar] ●━━━━━━━━━━━━━━━━━━━ 0:42 1x │ -│ [waveform visualization] │ -│ [Show transcript ▾] │ -└─────────────────────────────────────────────┘ -``` - -- **Waveform bar** — static amplitude visualization drawn from PCM data at generation time (no real-time animation needed, just a static shape like WhatsApp) -- **Scrubber** — draggable progress indicator -- **Timestamp** — elapsed / total duration -- **Speed chip** — tappable, cycles 0.5x → 1x → 1.5x → 2x -- **Show transcript** — expands inline to full text, collapses again - -User messages (voice input via Whisper) show the same bubble layout but with the transcript as primary since we have no TTS for user messages. +| Role | HuggingFace Repo | File | Size | +|---|---|---|---| +| TTS Backbone | `OuteAI/OuteTTS-0.3-500M-GGUF` | `OuteTTS-0.3-500M-Q4_K_M.gguf` | 454 MB | +| Vocoder | `ggml-org/WavTokenizer` | `WavTokenizer-Large-75-Q5_1.gguf` | 73 MB | -### Per-message controls (long press → action sheet) -- Change voice (re-generates audio with new speaker profile, overwrites cached file) -- Regenerate audio -- Copy text -- Delete message +Stored at: `${RNFS.DocumentDirectoryPath}/tts-models/` --- -## Files to Create - -### 1. 
`src/constants/ttsModels.ts` +## New Packages -```typescript -export const TTS_BACKBONE_MODEL = { - id: 'outetts-0.3-500m-q4', - name: 'OuteTTS 0.3', - backboneFile: 'OuteTTS-0.3-500M-Q4_K_M.gguf', - backboneUrl: 'https://huggingface.co/OuteAI/OuteTTS-0.3-500M-GGUF/resolve/main/OuteTTS-0.3-500M-Q4_K_M.gguf', - backboneSizeMB: 454, - vocoderFile: 'WavTokenizer-Large-75-Q5_1.gguf', - vocoderUrl: 'https://huggingface.co/ggml-org/WavTokenizer/resolve/main/WavTokenizer-Large-75-Q5_1.gguf', - vocoderSizeMB: 73, - sampleRate: 24000, - description: 'Natural-sounding on-device speech. Requires ~530 MB storage.', -}; - -export const TTS_SPEAKER_PROFILES = [ - { id: '0', label: 'Default' }, - // Add more as OuteTTS 0.3 speaker profiles are confirmed -]; - -export const TTS_MIN_RAM_GB = 6; // warn below 8, hard block below 6 -export const TTS_BLOCK_RAM_GB = 6; // hard block -export const TTS_WARN_RAM_GB = 8; // show warning card -export const AUDIO_CACHE_MAX_MESSAGES = 50; // per conversation +```bash +npm install react-native-executorch +npm install react-native-executorch-bare-resource-fetcher +npm install @dr.pogodin/react-native-fs @kesha-antonov/react-native-background-downloader ``` ---- - -### 2. `src/services/ttsService.ts` - -Mirror `whisperService.ts` pattern exactly. 
- -```typescript -import { initLlama, LlamaContext } from 'llama.rn'; -import RNFS from 'react-native-fs'; -import { AudioContext } from 'react-native-audio-api'; -import logger from '../utils/logger'; -import { TTS_BACKBONE_MODEL } from '../constants/ttsModels'; - -export interface TTSOptions { - speed?: number; // 0.5–2.0, default 1.0 - voiceId?: string; // speaker profile id, default '0' -} - -export interface GeneratedAudio { - samples: Float32Array; - durationSeconds: number; - sampleRate: number; - /** Amplitude envelope (downsampled to ~200 points) for waveform visualization */ - waveformData: number[]; -} - -class TTSService { - private context: LlamaContext | null = null; - private isVocoderReady: boolean = false; - private isSpeakingFlag: boolean = false; - private audioCtx: AudioContext | null = null; - private currentSource: AudioBufferSourceNode | null = null; - private contextLoadPromise: Promise = Promise.resolve(); - - // ─── Directories & Paths ──────────────────────────────────────────────── - - getModelsDir(): string { - return `${RNFS.DocumentDirectoryPath}/tts-models`; - } - - getAudioCacheDir(conversationId: string): string { - return `${RNFS.DocumentDirectoryPath}/audio-cache/${conversationId}`; - } - - getAudioFilePath(conversationId: string, messageId: string): string { - return `${this.getAudioCacheDir(conversationId)}/${messageId}.wav`; - } - - async ensureModelsDirExists(): Promise { - const dir = this.getModelsDir(); - if (!await RNFS.exists(dir)) await RNFS.mkdir(dir); - } - - async ensureAudioCacheDirExists(conversationId: string): Promise { - const dir = this.getAudioCacheDir(conversationId); - if (!await RNFS.exists(dir)) await RNFS.mkdir(dir); - } - - getBackbonePath(): string { - return `${this.getModelsDir()}/${TTS_BACKBONE_MODEL.backboneFile}`; - } - - getVocoderPath(): string { - return `${this.getModelsDir()}/${TTS_BACKBONE_MODEL.vocoderFile}`; - } - - async isBackboneDownloaded(): Promise { - return 
RNFS.exists(this.getBackbonePath()); - } - - async isVocoderDownloaded(): Promise { - return RNFS.exists(this.getVocoderPath()); - } - - async areBothModelsDownloaded(): Promise { - return (await this.isBackboneDownloaded()) && (await this.isVocoderDownloaded()); - } - - async isAudioCached(conversationId: string, messageId: string): Promise { - return RNFS.exists(this.getAudioFilePath(conversationId, messageId)); - } - - async getAudioCacheSizeMB(): Promise { - const cacheRoot = `${RNFS.DocumentDirectoryPath}/audio-cache`; - if (!await RNFS.exists(cacheRoot)) return 0; - const stat = await RNFS.stat(cacheRoot); - return stat.size / (1024 * 1024); - } - - async clearAudioCache(): Promise { - const cacheRoot = `${RNFS.DocumentDirectoryPath}/audio-cache`; - if (await RNFS.exists(cacheRoot)) await RNFS.unlink(cacheRoot); - } - - // ─── Download ──────────────────────────────────────────────────────────── - - async downloadBackbone(onProgress?: (p: number) => void): Promise { - await this.ensureModelsDirExists(); - const dest = this.getBackbonePath(); - if (await RNFS.exists(dest)) return dest; - const dl = RNFS.downloadFile({ - fromUrl: TTS_BACKBONE_MODEL.backboneUrl, - toFile: dest, - progressDivider: 1, - progress: (res) => onProgress?.(res.bytesWritten / res.contentLength), - }); - const result = await dl.promise; - if (result.statusCode !== 200) { - await RNFS.unlink(dest).catch(() => {}); - throw new Error(`Backbone download failed: HTTP ${result.statusCode}`); - } - return dest; - } - - async downloadVocoder(onProgress?: (p: number) => void): Promise { - await this.ensureModelsDirExists(); - const dest = this.getVocoderPath(); - if (await RNFS.exists(dest)) return dest; - const dl = RNFS.downloadFile({ - fromUrl: TTS_BACKBONE_MODEL.vocoderUrl, - toFile: dest, - progressDivider: 1, - progress: (res) => onProgress?.(res.bytesWritten / res.contentLength), - }); - const result = await dl.promise; - if (result.statusCode !== 200) { - await RNFS.unlink(dest).catch(() 
=> {}); - throw new Error(`Vocoder download failed: HTTP ${result.statusCode}`); - } - return dest; - } - - async deleteModels(): Promise { - await this.unloadModels(); - const bp = this.getBackbonePath(); - const vp = this.getVocoderPath(); - if (await RNFS.exists(bp)) await RNFS.unlink(bp); - if (await RNFS.exists(vp)) await RNFS.unlink(vp); - } - - // ─── Model Lifecycle ───────────────────────────────────────────────────── - - async loadModels(): Promise { - if (this.context && this.isVocoderReady) return; - - this.contextLoadPromise = this.contextLoadPromise.then(async () => { - if (this.context && this.isVocoderReady) return; - - logger.log('[TTS] Loading backbone...'); - this.context = await initLlama({ - model: this.getBackbonePath(), - n_ctx: 8192, - n_threads: 4, - }); - - logger.log('[TTS] Loading vocoder...'); - await this.context.initVocoder({ - path: this.getVocoderPath(), - n_batch: 4096, - }); +iOS: `pod install` after. - this.isVocoderReady = await this.context.isVocoderEnabled(); - if (!this.isVocoderReady) { - throw new Error('Vocoder failed to initialize — check model files.'); - } - - logger.log('[TTS] Ready.'); - }); - - return this.contextLoadPromise; - } - - async unloadModels(): Promise { - this.stop(); - if (this.context) { - await this.context.releaseVocoder().catch(() => {}); - await this.context.release().catch(() => {}); - this.context = null; - } - this.isVocoderReady = false; - this.audioCtx?.close().catch(() => {}); - this.audioCtx = null; - } - - isLoaded(): boolean { - return this.context !== null && this.isVocoderReady; - } - - // ─── Audio Generation ──────────────────────────────────────────────────── - - /** - * Generate PCM audio for `text`. Does NOT play it. - * Returns samples + metadata needed for waveform rendering and playback. 
- */ - async generate(text: string, options: TTSOptions = {}): Promise { - if (!this.context || !this.isVocoderReady) { - throw new Error('TTS models not loaded.'); - } - - const speakerId = options.voiceId ?? '0'; - const { prompt, grammar } = await this.context.getFormattedAudioCompletion( - speakerId === '0' ? null : speakerId, - text, - ); - const guideTokens = await this.context.getAudioCompletionGuideTokens(text); - - const result = await this.context.completion({ - prompt, - grammar, - guide_tokens: guideTokens, - n_predict: 4096, - temperature: 0.7, - top_p: 0.9, - stop: ['<|im_end|>'], - }); - - const pcmArray = await this.context.decodeAudioTokens(result.audio_tokens); - const samples = new Float32Array(pcmArray); - const sampleRate = TTS_BACKBONE_MODEL.sampleRate; - const durationSeconds = samples.length / sampleRate; - const waveformData = this.downsampleForWaveform(samples, 200); - - return { samples, durationSeconds, sampleRate, waveformData }; - } - - /** - * Write PCM samples to a WAV file on disk. - * Used in Audio Mode to persist audio per message. - */ - async saveToFile(audio: GeneratedAudio, conversationId: string, messageId: string): Promise { - await this.ensureAudioCacheDirExists(conversationId); - const path = this.getAudioFilePath(conversationId, messageId); - const wavBuffer = this.encodeWAV(audio.samples, audio.sampleRate); - await RNFS.writeFile(path, wavBuffer, 'base64'); - return path; - } - - /** - * Generate + save in one step (Audio Mode convenience). 
- */ - async generateAndSave( - text: string, - conversationId: string, - messageId: string, - options: TTSOptions = {}, - ): Promise<{ path: string; audio: GeneratedAudio }> { - const audio = await this.generate(text, options); - const path = await this.saveToFile(audio, conversationId, messageId); - return { path, audio }; - } - - // ─── Playback ──────────────────────────────────────────────────────────── - - async playFromSamples(samples: Float32Array, speed: number = 1.0, startOffset: number = 0): Promise { - const sampleRate = TTS_BACKBONE_MODEL.sampleRate; - - this.audioCtx?.close().catch(() => {}); - this.audioCtx = new AudioContext({ sampleRate }); - - const buffer = this.audioCtx.createBuffer(1, samples.length, sampleRate); - buffer.copyToChannel(samples, 0); - - const source = this.audioCtx.createBufferSource(); - source.buffer = buffer; - source.playbackRate.value = speed; - source.connect(this.audioCtx.destination); - - this.currentSource = source; - this.isSpeakingFlag = true; - - return new Promise((resolve) => { - source.onended = () => { - this.currentSource = null; - this.isSpeakingFlag = false; - resolve(); - }; - source.start(0, startOffset); - }); - } - - async playFromFile(filePath: string, speed: number = 1.0, startOffset: number = 0): Promise { - const base64 = await RNFS.readFile(filePath, 'base64'); - const samples = this.decodeWAV(base64); - return this.playFromSamples(samples, speed, startOffset); - } - - /** - * Chat Mode convenience: generate + play + discard (no disk write). - */ - async speak(text: string, options: TTSOptions = {}): Promise { - if (this.isSpeakingFlag) this.stop(); - const audio = await this.generate(text, options); - if (!this.isSpeakingFlag) { // may have been stopped during generation - await this.playFromSamples(audio.samples, options.speed ?? 
1.0); - } - } - - stop(): void { - this.isSpeakingFlag = false; - try { - this.currentSource?.stop(); - } catch { - // already stopped - } - this.currentSource = null; - } - - isSpeaking(): boolean { - return this.isSpeakingFlag; - } - - // ─── Utilities ─────────────────────────────────────────────────────────── - - private downsampleForWaveform(samples: Float32Array, points: number): number[] { - const blockSize = Math.floor(samples.length / points); - const result: number[] = []; - for (let i = 0; i < points; i++) { - let sum = 0; - for (let j = 0; j < blockSize; j++) { - sum += Math.abs(samples[i * blockSize + j]); - } - result.push(sum / blockSize); - } - return result; - } - - private encodeWAV(samples: Float32Array, sampleRate: number): string { - // Standard 16-bit PCM WAV encoding → base64 - // Implementation: write RIFF header + PCM data - const buffer = new ArrayBuffer(44 + samples.length * 2); - const view = new DataView(buffer); - const writeString = (offset: number, s: string) => { - for (let i = 0; i < s.length; i++) view.setUint8(offset + i, s.charCodeAt(i)); - }; - writeString(0, 'RIFF'); - view.setUint32(4, 36 + samples.length * 2, true); - writeString(8, 'WAVE'); - writeString(12, 'fmt '); - view.setUint32(16, 16, true); - view.setUint16(20, 1, true); - view.setUint16(22, 1, true); - view.setUint32(24, sampleRate, true); - view.setUint32(28, sampleRate * 2, true); - view.setUint16(32, 2, true); - view.setUint16(34, 16, true); - writeString(36, 'data'); - view.setUint32(40, samples.length * 2, true); - for (let i = 0; i < samples.length; i++) { - view.setInt16(44 + i * 2, Math.max(-32768, Math.min(32767, samples[i] * 32768)), true); - } - return Buffer.from(buffer).toString('base64'); - } - - private decodeWAV(base64: string): Float32Array { - const buffer = Buffer.from(base64, 'base64'); - const view = new DataView(buffer.buffer); - const sampleCount = (buffer.length - 44) / 2; - const samples = new Float32Array(sampleCount); - for (let i = 0; i 
< sampleCount; i++) { - samples[i] = view.getInt16(44 + i * 2, true) / 32768; - } - return samples; - } -} - -export const ttsService = new TTSService(); -``` +**Note:** `react-native-executorch-bare-resource-fetcher` requires its own RNFS fork (`@dr.pogodin/react-native-fs`) alongside the existing `react-native-fs`. Both can coexist. --- -### 3. `src/stores/ttsStore.ts` +## Architecture -Mirror `whisperStore.ts` pattern, using Zustand with `persist`. +### Initialization (`App.tsx`) ```typescript -import { create } from 'zustand'; -import { persist, createJSONStorage } from 'zustand/middleware'; -import AsyncStorage from '@react-native-async-storage/async-storage'; -import { ttsService } from '../services/ttsService'; -import logger from '../utils/logger'; - -export type InterfaceMode = 'chat' | 'audio'; - -export interface TTSSettings { - interfaceMode: InterfaceMode; - enabled: boolean; - autoPlay: boolean; // Chat Mode only - speed: number; // 0.5–2.0 - voiceId: string; // OuteTTS speaker profile -} +import { initExecutorch } from 'react-native-executorch'; +import { BareResourceFetcher } from 'react-native-executorch-bare-resource-fetcher'; -export interface TTSState { - // Download state - isBackboneDownloaded: boolean; - isVocoderDownloaded: boolean; - isDownloadingBackbone: boolean; - isDownloadingVocoder: boolean; - backboneDownloadProgress: number; - vocoderDownloadProgress: number; - - // Model lifecycle - isModelLoading: boolean; - isModelLoaded: boolean; - - // Playback - isSpeaking: boolean; - currentMessageId: string | null; - playbackPosition: number; // seconds, for scrubber - - // Cache - audioCacheSizeMB: number; - - // Settings (persisted) - settings: TTSSettings; - - error: string | null; - - // Actions - checkDownloadStatus: () => Promise; - downloadModels: () => Promise; - deleteModels: () => Promise; - loadModels: () => Promise; - unloadModels: () => Promise; - - // Chat Mode - speak: (text: string, messageId: string) => Promise; - stop: () 
=> void; - - // Audio Mode - generateAndSave: (text: string, conversationId: string, messageId: string) => Promise<{ path: string; waveformData: number[]; durationSeconds: number }>; - playMessage: (messageId: string, filePath: string, startOffset?: number) => Promise; - stopPlayback: () => void; - - // Cache management - refreshCacheSize: () => Promise; - clearAudioCache: () => Promise; - - updateSettings: (patch: Partial) => void; - clearError: () => void; -} - -export const useTTSStore = create()( - persist( - (set, get) => ({ - isBackboneDownloaded: false, - isVocoderDownloaded: false, - isDownloadingBackbone: false, - isDownloadingVocoder: false, - backboneDownloadProgress: 0, - vocoderDownloadProgress: 0, - isModelLoading: false, - isModelLoaded: false, - isSpeaking: false, - currentMessageId: null, - playbackPosition: 0, - audioCacheSizeMB: 0, - settings: { - interfaceMode: 'chat', - enabled: true, - autoPlay: false, - speed: 1.0, - voiceId: '0', - }, - error: null, - - checkDownloadStatus: async () => { - const [backbone, vocoder] = await Promise.all([ - ttsService.isBackboneDownloaded(), - ttsService.isVocoderDownloaded(), - ]); - set({ isBackboneDownloaded: backbone, isVocoderDownloaded: vocoder }); - }, - - downloadModels: async () => { - set({ error: null }); - try { - set({ isDownloadingBackbone: true, backboneDownloadProgress: 0 }); - await ttsService.downloadBackbone((p) => set({ backboneDownloadProgress: p })); - set({ isDownloadingBackbone: false, isBackboneDownloaded: true }); - - set({ isDownloadingVocoder: true, vocoderDownloadProgress: 0 }); - await ttsService.downloadVocoder((p) => set({ vocoderDownloadProgress: p })); - set({ isDownloadingVocoder: false, isVocoderDownloaded: true }); - } catch (err) { - const msg = err instanceof Error ? 
err.message : 'Download failed'; - logger.error('[TTS Store] Download error:', msg); - set({ isDownloadingBackbone: false, isDownloadingVocoder: false, error: msg }); - } - }, - - deleteModels: async () => { - await ttsService.deleteModels(); - set({ isBackboneDownloaded: false, isVocoderDownloaded: false, isModelLoaded: false }); - }, - - loadModels: async () => { - if (get().isModelLoaded || get().isModelLoading) return; - set({ isModelLoading: true, error: null }); - try { - await ttsService.loadModels(); - set({ isModelLoaded: true }); - } catch (err) { - const msg = err instanceof Error ? err.message : 'Failed to load TTS models'; - logger.error('[TTS Store] Load error:', msg); - set({ error: msg }); - } finally { - set({ isModelLoading: false }); - } - }, - - unloadModels: async () => { - await ttsService.unloadModels(); - set({ isModelLoaded: false, isSpeaking: false, currentMessageId: null }); - }, - - // ── Chat Mode ────────────────────────────────────────────────────────── - - speak: async (text: string, messageId: string) => { - const { isModelLoaded, settings } = get(); - if (!settings.enabled) return; - if (!isModelLoaded) return; - - if (get().currentMessageId === messageId && get().isSpeaking) { - get().stop(); - return; - } - - ttsService.stop(); - set({ isSpeaking: true, currentMessageId: messageId, error: null }); - - try { - await ttsService.speak(text, { speed: settings.speed, voiceId: settings.voiceId }); - } catch (err) { - const msg = err instanceof Error ? 
err.message : 'Speech failed'; - logger.error('[TTS Store] Speak error:', msg); - set({ error: msg }); - } finally { - set({ isSpeaking: false, currentMessageId: null }); - } - }, - - stop: () => { - ttsService.stop(); - set({ isSpeaking: false, currentMessageId: null }); - }, - - // ── Audio Mode ───────────────────────────────────────────────────────── - - generateAndSave: async (text: string, conversationId: string, messageId: string) => { - const { settings } = get(); - const { path, audio } = await ttsService.generateAndSave( - text, - conversationId, - messageId, - { voiceId: settings.voiceId }, - ); - await get().refreshCacheSize(); - return { path, waveformData: audio.waveformData, durationSeconds: audio.durationSeconds }; - }, - - playMessage: async (messageId: string, filePath: string, startOffset: number = 0) => { - const { settings } = get(); - - if (get().currentMessageId === messageId && get().isSpeaking) { - get().stopPlayback(); - return; - } - - ttsService.stop(); - set({ isSpeaking: true, currentMessageId: messageId, playbackPosition: startOffset }); - - try { - await ttsService.playFromFile(filePath, settings.speed, startOffset); - } catch (err) { - const msg = err instanceof Error ? 
err.message : 'Playback failed'; - logger.error('[TTS Store] Playback error:', msg); - set({ error: msg }); - } finally { - set({ isSpeaking: false, currentMessageId: null, playbackPosition: 0 }); - } - }, - - stopPlayback: () => { - ttsService.stop(); - set({ isSpeaking: false, currentMessageId: null, playbackPosition: 0 }); - }, - - // ── Cache ────────────────────────────────────────────────────────────── - - refreshCacheSize: async () => { - const mb = await ttsService.getAudioCacheSizeMB(); - set({ audioCacheSizeMB: mb }); - }, - - clearAudioCache: async () => { - await ttsService.clearAudioCache(); - set({ audioCacheSizeMB: 0 }); - }, - - updateSettings: (patch) => { - set((state) => ({ settings: { ...state.settings, ...patch } })); - }, - - clearError: () => set({ error: null }), - }), - { - name: 'tts-store', - storage: createJSONStorage(() => AsyncStorage), - partialize: (state) => ({ settings: state.settings }), - } - ) -); +// Called once at startup, before any model hook is used +initExecutorch({ resourceFetcher: BareResourceFetcher }); ``` ---- +### KokoroTTSManager component -### 4. `src/hooks/useTTS.ts` +`react-native-executorch`'s `useTextToSpeech` is a React hook — it must live in a component. A `KokoroTTSManager` component mounts near the root, holds the hook instance, and exposes its methods via a module-level ref (`kokoroRef`). 
-```typescript -import { useEffect, useCallback } from 'react'; -import { useTTSStore } from '../stores/ttsStore'; -import { hardwareService } from '../services/hardware'; -import { TTS_BLOCK_RAM_GB, TTS_WARN_RAM_GB } from '../constants/ttsModels'; - -export function useTTS() { - const store = useTTSStore(); - - useEffect(() => { - store.checkDownloadStatus(); - }, []); - - const canRunOnDevice = useCallback(async (): Promise<{ allowed: boolean; warning: boolean }> => { - const ramGB = await hardwareService.getTotalMemoryGB(); - return { - allowed: ramGB >= TTS_BLOCK_RAM_GB, - warning: ramGB < TTS_WARN_RAM_GB, - }; - }, []); - - const speakMessage = useCallback( - (text: string, messageId: string) => { - if (!store.isModelLoaded && store.isBackboneDownloaded && store.isVocoderDownloaded) { - store.loadModels().then(() => store.speak(text, messageId)); - return; - } - store.speak(text, messageId); - }, - [store] - ); - - return { - ...store, - speakMessage, - canRunOnDevice, - areBothDownloaded: store.isBackboneDownloaded && store.isVocoderDownloaded, - isDownloading: store.isDownloadingBackbone || store.isDownloadingVocoder, - overallDownloadProgress: - store.backboneDownloadProgress * 0.86 + store.vocoderDownloadProgress * 0.14, - isAudioMode: store.settings.interfaceMode === 'audio', - isChatMode: store.settings.interfaceMode === 'chat', - }; -} ``` - ---- - -### 5. `src/components/AudioMessageBubble/index.tsx` *(Audio Mode only)* - -Replaces `ChatMessage` assistant bubble when `interfaceMode === 'audio'`. 
- -```typescript -interface AudioMessageBubbleProps { - messageId: string; - conversationId: string; - audioPath: string; // path to WAV on disk - waveformData: number[]; // 200-point amplitude array - durationSeconds: number; - isGenerating?: boolean; // true while TTS is still running -} +App +└── KokoroTTSManager ← mounts useTextToSpeech, wires to kokoroRef + └── AppNavigator + └── ChatScreen + └── TTSButton ← calls kokoroRef.stream(text, callbacks) ``` -**Layout:** -- Static waveform bar (200 rect bars, amplitude-scaled, filled up to scrubber position) -- Draggable scrubber thumb -- `MM:SS` elapsed / total -- Speed chip (cycles 0.5x → 1x → 1.5x → 2x, persists to store) -- "Show transcript" collapse/expand -- Long press → action sheet (Change voice, Regenerate, Copy text, Delete) - ---- - -### 6. `src/components/TTSButton/index.tsx` *(Chat Mode only)* - -Play/stop button that appears on each assistant message bubble. Unchanged from original plan — only rendered when `interfaceMode === 'chat'`. +### Speak flow (Chat Mode — Kokoro, fast) -```typescript -// Don't render in Audio Mode or if TTS disabled/not downloaded -if (settings.interfaceMode === 'audio' || !settings.enabled || !areBothDownloaded) return null; ``` - ---- - -### 7. `src/screens/TTSSettingsScreen/index.tsx` - -Accessible from SettingsScreen → "Text to Speech" row. - -**Sections:** -1. **Header** — back button + "Text to Speech" title -2. **Interface Mode card** — segmented control: `Chat` / `Audio` - - If device RAM < `TTS_BLOCK_RAM_GB`: Audio option is greyed out with "Requires 6GB+ RAM" - - If RAM is between block and warn thresholds: yellow warning under the control -3. **Master toggle card** — enable/disable TTS (Chat Mode only — in Audio Mode, TTS is always on) -4. **Model download card** — download status for both files with separate progress bars; "Download (527 MB)" / "Remove" buttons -5. **Voice card** (shown when downloaded) — voice picker from `TTS_SPEAKER_PROFILES` -6. 
**Playback card** (shown when downloaded) — Speed slider (0.5–2.0x), Auto-play toggle (Chat Mode only) -7. **Audio cache card** (Audio Mode only) — "Audio cache: X MB" + "Clear cache" button -8. **Device compatibility card** — RAM check with status -9. **Privacy card** — "All speech generated on your device. Nothing is sent to any server." - ---- - -### 8. `src/stores/index.ts` - -Add: -```typescript -export { useTTSStore } from './ttsStore'; +TTSButton tap + → kokoroRef.stream({ text, onNext: playChunk, onBegin, onEnd }) + → AudioContext buffers played as Float32Array chunks arrive + → Streaming: audio starts < 1s after tap ``` -### 9. `src/services/index.ts` +### Voice input flow (Audio Mode — user side) -Add: -```typescript -export { ttsService } from './ttsService'; ``` - -### 10. `src/navigation/types.ts` - -Add `TTSSettings: undefined` to `RootStackParamList`. - -### 11. `src/navigation/AppNavigator.tsx` - -```tsx - +User taps mic button + → audioRecorderService.startRecording() — records WAV to disk + → User releases mic + → audioRecorderService.stopRecording() → { path, durationSeconds } + → whisperService.transcribeFile(path) — file-based STT + → onAutoSend(transcript, { uri: path, format: 'wav', durationSeconds }) + → ChatInput builds MediaAttachment { type: 'audio', uri, durationSeconds } + → onSend(transcript, [audioAttachment]) — content = transcript, attachment = WAV + → MessageRenderer: user message with audio attachment → right-aligned AudioMessageBubble + → LLM receives transcript as text input (standard text generation) ``` -### 12. `src/screens/index.ts` - -Export `TTSSettingsScreen` and `AudioMessageBubble`. +For models that natively support audio input (e.g. Qwen2-Audio): WAV is passed directly as `input_audio` to the model — Whisper is bypassed entirely. -### 13. 
`src/screens/SettingsScreen.tsx` +### Generate+Save flow (Audio Mode — AI side) -Add nav row pointing to `TTSSettings` (after the Voice row): -```tsx - navigation.navigate('TTSSettings')}> - - Text to Speech - - ``` - -### 14. `src/components/ChatMessage/index.tsx` - -Mode-branch the assistant message render path: - -```tsx -import { AudioMessageBubble } from '../AudioMessageBubble'; -import { TTSButton } from '../TTSButton'; - -// In assistant message render: -const { settings } = useTTSStore(); - -if (settings.interfaceMode === 'audio' && message.audioPath) { - return ( - - ); -} - -// Chat Mode: existing text bubble + TTSButton +Streaming LLM response ends + → triggerAudioModeGeneration(conversationId, messageId, content) + (reads fresh message from useChatStore.getState() — not stale closure) + → ttsService.generateAndSave(text, ctx, options) + → OuteTTS runs inference → Float32Array + waveformData + duration + → Write PCM to disk → update message { audioPath, waveformData, audioDurationSeconds } + → MessageRenderer shows left-aligned AudioMessageBubble ``` -This requires adding `audioPath`, `waveformData`, `audioDurationSeconds`, and `isGeneratingAudio` fields to the message model. +--- -### 15. Message model update (`src/types/` or wherever `Message` is defined) +## ttsStore additions ```typescript -export interface Message { - // ... existing fields ... 
- audioPath?: string; // Audio Mode: path to WAV on disk - waveformData?: number[]; // Audio Mode: 200-point amplitude envelope - audioDurationSeconds?: number; // Audio Mode: total duration - isGeneratingAudio?: boolean; // true while TTS is running for this message -} +// Kokoro state +kokoroReady: boolean; // useTextToSpeech.isReady +kokoroDownloadProgress: number; // 0–1, during initial model download +kokoroVoiceId: KokoroVoiceId; // persisted setting + +// Actions +setKokoroReady: (ready: boolean, progress: number) => void; +kokoroSpeak: (text: string, messageId: string) => void; // delegates to kokoroRef +kokoroStop: () => void; ``` -### 16. Chat completion flow - -**Chat Mode (autoPlay):** unchanged from original plan — call `speak()` after streaming completes when `autoPlay: true`. - -**Audio Mode:** after streaming completes, immediately trigger `generateAndSave()` and update the message record with the returned `audioPath`, `waveformData`, `durationSeconds`. Set `isGeneratingAudio: true` on the message while generation runs so the bubble shows a loading state. 
- +The existing `speak()` action becomes: ```typescript -// After streaming completes, if Audio Mode: -if (settings.interfaceMode === 'audio') { - updateMessage(lastMessage.id, { isGeneratingAudio: true }); - const { path, waveformData, durationSeconds } = await ttsStore.generateAndSave( - stripControlTokens(lastMessage.content), - conversationId, - lastMessage.id, - ); - updateMessage(lastMessage.id, { - audioPath: path, - waveformData, - audioDurationSeconds: durationSeconds, - isGeneratingAudio: false, - }); +speak: (text, messageId) => { + if (kokoroReady) { + kokoroSpeak(text, messageId); // fast path + } else { + // OuteTTS fallback (slow, Android <13 or first launch before Kokoro loads) + outeTTSSpeak(text, messageId); + } } ``` --- -## Tests to Write - -### `__tests__/unit/services/ttsService.test.ts` -- `generate` calls `getFormattedAudioCompletion`, `getAudioCompletionGuideTokens`, `completion`, `decodeAudioTokens` in order -- `generate` returns correct `durationSeconds` and 200-point `waveformData` -- `saveToFile` writes a valid WAV file to the correct path -- `generateAndSave` calls both and returns path + audio -- `playFromFile` reads WAV, decodes, and calls `playFromSamples` -- `stop` sets `isSpeakingFlag` to false and calls `currentSource.stop()` -- `encodeWAV` / `decodeWAV` round-trip preserves samples (within 16-bit quantization error) -- `getAudioCacheSizeMB` returns correct value -- `clearAudioCache` removes the cache directory - -### `__tests__/unit/stores/ttsStore.test.ts` -- `generateAndSave` sets correct waveformData and calls `refreshCacheSize` -- `playMessage` sets `isSpeaking: true`, then `false` after completion -- `playMessage` on same messageId while playing → calls `stopPlayback` -- `updateSettings` merges partial settings correctly -- Settings persisted: `interfaceMode`, `speed`, `voiceId`, `enabled` survive re-hydration - -### `__tests__/integration/tts.test.ts` -- **Chat Mode full flow:** download → load → speak → stop -- **Audio 
Mode full flow:** download → load → generateAndSave → playMessage → stop -- **Auto-play:** Chat Mode with `autoPlay: true`, streaming completes → `speak` called -- **Audio Mode post-completion:** streaming completes → `generateAndSave` called → message updated with `audioPath` -- **Mode switch:** switching `interfaceMode` from `'chat'` to `'audio'` takes effect for next message +## Kokoro Voice IDs ---- - -## Implementation Order - -1. `src/constants/ttsModels.ts` -2. `src/services/ttsService.ts` (with WAV encode/decode + `generate`/`generateAndSave`/`playFromFile`) -3. `src/stores/ttsStore.ts` (with Audio Mode actions) -4. `src/hooks/useTTS.ts` -5. `src/stores/index.ts` — add export -6. `src/services/index.ts` — add export -7. `src/navigation/types.ts` — add route -8. Message model — add `audioPath`, `waveformData`, `audioDurationSeconds`, `isGeneratingAudio` -9. `src/components/AudioMessageBubble/index.tsx` -10. `src/components/TTSButton/index.tsx` (Chat Mode only, unchanged) -11. `src/screens/TTSSettingsScreen/index.tsx` (with Interface Mode section) -12. `src/screens/index.ts` — add exports -13. `src/navigation/AppNavigator.tsx` — add screen -14. `src/screens/SettingsScreen.tsx` — add nav row -15. `src/components/ChatMessage/index.tsx` — mode-branch render -16. Wire Audio Mode generation into chat completion flow -17. Write all tests -18. 
`npm install react-native-audio-api` + `pod install` +| ID | Label | Accent | Gender | +|---|---|---|---| +| `af_heart` | Heart | US English | Female | +| `af_river` | River | US English | Female | +| `af_sarah` | Sarah | US English | Female | +| `am_adam` | Adam | US English | Male | +| `am_michael` | Michael | US English | Male | +| `am_santa` | Santa | US English | Male | +| `bf_emma` | Emma | British English | Female | +| `bm_daniel` | Daniel | British English | Male | --- -## Memory Safety +## Files to Create / Modify -Before calling `loadModels()`, check available memory: +### New files +- `src/components/KokoroTTSManager.tsx` — mounts the hook, exposes via ref +- `src/constants/kokoroModels.ts` — voice/model constants mirroring executorch exports -```typescript -const available = await hardwareService.getAvailableMemoryGB(); -if (available < 1.0) { - throw new Error('Not enough free memory. Try closing image generation first.'); -} -``` +### Modified files +- `App.tsx` — add `initExecutorch()` call + mount `` +- `src/stores/ttsStore.ts` — add Kokoro state + `kokoroVoiceId` setting +- `src/services/ttsService.ts` — no change to OuteTTS path +- `src/components/TTSButton/index.tsx` — use Kokoro speak when available +- `src/screens/TTSSettingsScreen/index.tsx` — add voice picker (8 Kokoro voices) -This check belongs in `useTTSStore.loadModels()` before calling `ttsService.loadModels()`. +### android/build.gradle +- Bump `minSdkVersion` for executorch: **leave at 24**, guard Kokoro at runtime via `Platform.Version >= 33` --- -## Future: Upgrade to OuteTTS 1.0 - -When llama.cpp PR#12794 (DAC decoder) merges and llama.rn PR#300 (codec.cpp integration) ships: - -1. Add `TTS_BACKBONE_MODEL_V2` to `ttsModels.ts` (backbone + DAC vocoder GGUF) -2. `ttsService.ts` API is unchanged — model-agnostic -3. 
Store gets a `modelVersion` setting; 0.3 and 1.0 can coexist on disk +## Status + +| Task | Status | +|---|---| +| OuteTTS speak (Chat Mode) | ✅ Implemented (slow, functional) | +| OuteTTS generate+save (Audio Mode — AI side) | ✅ Implemented | +| Stale-closure bug fix (reads fresh store state) | ✅ Fixed | +| TTSButton + Speak long-press action | ✅ Implemented | +| Generation vs playback state (spinner) | ✅ Implemented | +| 300-char text truncation | ✅ Implemented | +| checkDownloadStatus on app start | ✅ Implemented | +| User voice recording → audio bubble (Audio Mode) | ✅ Implemented | +| Auto-send on voice stop in Audio Mode | ✅ Implemented | +| User audio bubble right-aligned | ✅ Implemented | +| TTS section in Chat Settings modal | ✅ Implemented | +| Chat Settings modal: TTS Settings deep link | ✅ Implemented | +| Multimodal audio input (bypass Whisper for audio-capable models) | ✅ Implemented | +| Kokoro via react-native-executorch | 🔲 Not started | +| KokoroTTSManager component | 🔲 Not started | +| Voice picker in TTSSettingsScreen | 🔲 Not started | +| Kokoro → OuteTTS fallback for Android <13 | 🔲 Not started | diff --git a/ios/Podfile.lock b/ios/Podfile.lock index a076829d0..3f58a70ef 100644 --- a/ios/Podfile.lock +++ b/ios/Podfile.lock @@ -2797,6 +2797,121 @@ PODS: - React-perflogger (= 0.83.1) - React-utils (= 0.83.1) - SocketRocket + - RNAudioAPI (0.11.7): + - boost + - DoubleConversion + - fast_float + - fmt + - glog + - hermes-engine + - RCT-Folly + - RCT-Folly/Fabric + - RCTRequired + - RCTTypeSafety + - React-Core + - React-debug + - React-Fabric + - React-featureflags + - React-graphics + - React-ImageManager + - React-jsi + - React-NativeModulesApple + - React-RCTFabric + - React-renderercss + - React-rendererdebug + - React-utils + - ReactCodegen + - ReactCommon/turbomodule/bridging + - ReactCommon/turbomodule/core + - RNAudioAPI/audioapi (= 0.11.7) + - SocketRocket + - Yoga + - RNAudioAPI/audioapi (0.11.7): + - boost + - DoubleConversion + - 
fast_float + - fmt + - glog + - hermes-engine + - RCT-Folly + - RCT-Folly/Fabric + - RCTRequired + - RCTTypeSafety + - React-Core + - React-debug + - React-Fabric + - React-featureflags + - React-graphics + - React-ImageManager + - React-jsi + - React-NativeModulesApple + - React-RCTFabric + - React-renderercss + - React-rendererdebug + - React-utils + - ReactCodegen + - ReactCommon/turbomodule/bridging + - ReactCommon/turbomodule/core + - RNAudioAPI/audioapi/audioapi_dsp (= 0.11.7) + - RNAudioAPI/audioapi/ios (= 0.11.7) + - SocketRocket + - Yoga + - RNAudioAPI/audioapi/audioapi_dsp (0.11.7): + - boost + - DoubleConversion + - fast_float + - fmt + - glog + - hermes-engine + - RCT-Folly + - RCT-Folly/Fabric + - RCTRequired + - RCTTypeSafety + - React-Core + - React-debug + - React-Fabric + - React-featureflags + - React-graphics + - React-ImageManager + - React-jsi + - React-NativeModulesApple + - React-RCTFabric + - React-renderercss + - React-rendererdebug + - React-utils + - ReactCodegen + - ReactCommon/turbomodule/bridging + - ReactCommon/turbomodule/core + - SocketRocket + - Yoga + - RNAudioAPI/audioapi/ios (0.11.7): + - boost + - DoubleConversion + - fast_float + - fmt + - glog + - hermes-engine + - RCT-Folly + - RCT-Folly/Fabric + - RCTRequired + - RCTTypeSafety + - React-Core + - React-debug + - React-Fabric + - React-featureflags + - React-graphics + - React-ImageManager + - React-jsi + - React-NativeModulesApple + - React-RCTFabric + - React-renderercss + - React-rendererdebug + - React-utils + - ReactCodegen + - ReactCommon/turbomodule/bridging + - ReactCommon/turbomodule/core + - SocketRocket + - Yoga - RNCAsyncStorage (2.2.0): - boost - DoubleConversion @@ -3368,6 +3483,7 @@ DEPENDENCIES: - ReactAppDependencyProvider (from `build/generated/ios/ReactAppDependencyProvider`) - ReactCodegen (from `build/generated/ios/ReactCodegen`) - ReactCommon/turbomodule/core (from `../node_modules/react-native/ReactCommon`) + - RNAudioAPI (from 
`../node_modules/react-native-audio-api`) - "RNCAsyncStorage (from `../node_modules/@react-native-async-storage/async-storage`)" - RNDeviceInfo (from `../node_modules/react-native-device-info`) - RNFS (from `../node_modules/react-native-fs`) @@ -3566,6 +3682,8 @@ EXTERNAL SOURCES: :path: build/generated/ios/ReactCodegen ReactCommon: :path: "../node_modules/react-native/ReactCommon" + RNAudioAPI: + :path: "../node_modules/react-native-audio-api" RNCAsyncStorage: :path: "../node_modules/@react-native-async-storage/async-storage" RNDeviceInfo: @@ -3684,6 +3802,7 @@ SPEC CHECKSUMS: ReactAppDependencyProvider: 0eb286cc274abb059ee601b862ebddac2e681d01 ReactCodegen: 3d48510bcef445f6403c0004047d4d9cbb915435 ReactCommon: ac934cb340aee91282ecd6f273a26d24d4c55cae + RNAudioAPI: 106257d5f3713bb667d6d74ebb3105c9cf5d60db RNCAsyncStorage: 29f0230e1a25f36c20b05f65e2eb8958d6526e82 RNDeviceInfo: 36d7f232bfe7c9b5c494cb7793230424ed32c388 RNFS: 89de7d7f4c0f6bafa05343c578f61118c8282ed8 diff --git a/jest.setup.ts b/jest.setup.ts index 15d0f8cb2..af694a3d2 100644 --- a/jest.setup.ts +++ b/jest.setup.ts @@ -149,6 +149,61 @@ jest.mock('whisper.rn', () => ({ }, }), { virtual: true }); +// react-native-audio-api mock +jest.mock('react-native-audio-api', () => ({ + AudioContext: jest.fn().mockImplementation(() => ({ + createBuffer: jest.fn().mockReturnValue({ copyToChannel: jest.fn() }), + createBufferSource: jest.fn().mockReturnValue({ + connect: jest.fn(), + start: jest.fn(), + stop: jest.fn(), + playbackRate: { value: 1.0 }, + onEnded: null, + buffer: null, + }), + destination: {}, + close: jest.fn(), + })), + AudioRecorder: jest.fn().mockImplementation(() => ({ + enableFileOutput: jest.fn().mockReturnValue({ status: 'success', path: '/mock/audio/input.wav' }), + start: jest.fn().mockReturnValue({ status: 'success', path: '/mock/audio/input.wav' }), + stop: jest.fn().mockReturnValue({ status: 'success', path: '/mock/audio/input.wav', size: 1024, duration: 1.0 }), + pause: jest.fn(), + 
resume: jest.fn(), + isRecording: jest.fn().mockReturnValue(false), + isPaused: jest.fn().mockReturnValue(false), + })), + FileFormat: { Wav: 0, Caf: 1, M4A: 2, Flac: 3 }, + FileDirectory: { Document: 0, Cache: 1 }, +}), { virtual: true }); + +// @react-native-community/slider mock +jest.mock('@react-native-community/slider', () => { + const { View } = require('react-native'); + return { __esModule: true, default: View }; +}); + +// react-native-executorch mock +const mockVoiceConfig = { id: 'mock_voice' }; +jest.mock('react-native-executorch', () => ({ + useTextToSpeech: jest.fn(() => ({ + isReady: true, + downloadProgress: 1, + error: null, + stream: jest.fn(() => Promise.resolve()), + streamStop: jest.fn(), + })), + KOKORO_MEDIUM: 'kokoro-medium', + KOKORO_VOICE_AF_HEART: mockVoiceConfig, + KOKORO_VOICE_AF_RIVER: mockVoiceConfig, + KOKORO_VOICE_AF_SARAH: mockVoiceConfig, + KOKORO_VOICE_AM_ADAM: mockVoiceConfig, + KOKORO_VOICE_AM_MICHAEL: mockVoiceConfig, + KOKORO_VOICE_AM_SANTA: mockVoiceConfig, + KOKORO_VOICE_BF_EMMA: mockVoiceConfig, + KOKORO_VOICE_BM_DANIEL: mockVoiceConfig, +})); + // react-native-fs mock jest.mock('react-native-fs', () => ({ DocumentDirectoryPath: '/mock/documents', diff --git a/package-lock.json b/package-lock.json index 9353548fc..1d6a7f40d 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9,7 +9,9 @@ "version": "0.0.86", "hasInstallScript": true, "dependencies": { + "@dr.pogodin/react-native-fs": "^2.38.1", "@gorhom/bottom-sheet": "^5.2.8", + "@kesha-antonov/react-native-background-downloader": "^4.5.4", "@op-engineering/op-sqlite": "^15.2.5", "@react-native-async-storage/async-storage": "^2.2.0", "@react-native-community/blur": "^4.4.1", @@ -31,7 +33,10 @@ "patch-package": "^8.0.1", "react": "19.2.0", "react-native": "0.83.1", + "react-native-audio-api": "^0.11.7", "react-native-device-info": "^15.0.1", + "react-native-executorch": "^0.8.1", + "react-native-executorch-bare-resource-fetcher": "^0.8.0", "react-native-fs": 
"^2.20.0", "react-native-gesture-handler": "^2.30.0", "react-native-haptic-feedback": "^2.3.3", @@ -2113,6 +2118,51 @@ "devOptional": true, "license": "MIT" }, + "node_modules/@dr.pogodin/react-native-fs": { + "version": "2.38.1", + "resolved": "https://registry.npmjs.org/@dr.pogodin/react-native-fs/-/react-native-fs-2.38.1.tgz", + "integrity": "sha512-H5uxbEy61as7m5p4dNhv4a/huO8g9r4weu0FM/UjlgRd1PSYqpZaJBi2nhDGums/N+MrK8IZFOHVV5ukHWX8UQ==", + "license": "MIT", + "workspaces": [ + "example" + ], + "dependencies": { + "buffer": "^6.0.3", + "http-status-codes": "^2.3.0" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/birdofpreyru" + }, + "peerDependencies": { + "react": "*", + "react-native": "*" + } + }, + "node_modules/@dr.pogodin/react-native-fs/node_modules/buffer": { + "version": "6.0.3", + "resolved": "https://registry.npmjs.org/buffer/-/buffer-6.0.3.tgz", + "integrity": "sha512-FTiCpNxtwiZZHEZbcbTIcZjERVICn9yq/pDFkTl95/AxzD1naBctN7YO68riM/gLSDY7sdrMby8hofADYuuqOA==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT", + "dependencies": { + "base64-js": "^1.3.1", + "ieee754": "^1.2.1" + } + }, "node_modules/@egjs/hammerjs": { "version": "2.0.17", "resolved": "https://registry.npmjs.org/@egjs/hammerjs/-/hammerjs-2.0.17.tgz", @@ -2559,6 +2609,15 @@ "@hapi/hoek": "^9.0.0" } }, + "node_modules/@huggingface/jinja": { + "version": "0.5.6", + "resolved": "https://registry.npmjs.org/@huggingface/jinja/-/jinja-0.5.6.tgz", + "integrity": "sha512-MyMWyLnjqo+KRJYSH7oWNbsOn5onuIvfXYPcc0WOGxU0eHUV7oAYUoQTl2BMdu7ml+ea/bu11UM+EshbeHwtIA==", + "license": "MIT", + "engines": { + "node": ">=18" + } + }, "node_modules/@humanwhocodes/config-array": { "version": "0.13.0", "resolved": 
"https://registry.npmjs.org/@humanwhocodes/config-array/-/config-array-0.13.0.tgz", @@ -3110,6 +3169,15 @@ "@jridgewell/sourcemap-codec": "^1.4.14" } }, + "node_modules/@kesha-antonov/react-native-background-downloader": { + "version": "4.5.4", + "resolved": "https://registry.npmjs.org/@kesha-antonov/react-native-background-downloader/-/react-native-background-downloader-4.5.4.tgz", + "integrity": "sha512-WH9n7Sy8MebWiVZqZYpvP4q2sJeOIiNLrbHB64ue/YYsXnWtdJ3iMQowv/QEmU2Cw9biI1d2k8LFHKV9oACLsw==", + "license": "Apache-2.0", + "peerDependencies": { + "react-native": ">=0.57.0" + } + }, "node_modules/@motionone/animation": { "version": "10.18.0", "resolved": "https://registry.npmjs.org/@motionone/animation/-/animation-10.18.0.tgz", @@ -8090,6 +8158,12 @@ "node": ">= 0.8" } }, + "node_modules/http-status-codes": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/http-status-codes/-/http-status-codes-2.3.0.tgz", + "integrity": "sha512-RJ8XvFvpPM/Dmc5SV+dC4y5PCeOhT3x1Hq0NU3rjGeg5a/CqlhZ7uudknPwZFz4aeAXDcbAyaeP7GAo9lvngtA==", + "license": "MIT" + }, "node_modules/https-proxy-agent": { "version": "7.0.6", "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.6.tgz", @@ -8146,7 +8220,6 @@ "version": "1.2.1", "resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz", "integrity": "sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==", - "devOptional": true, "funding": [ { "type": "github", @@ -9609,6 +9682,24 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/jsonrepair": { + "version": "3.13.3", + "resolved": "https://registry.npmjs.org/jsonrepair/-/jsonrepair-3.13.3.tgz", + "integrity": "sha512-BTznj0owIt2CBAH/LTo7+1I5pMvl1e1033LRl/HUowlZmJOIhzC0zbX5bxMngLkfT4WnzPP26QnW5wMr2g9tsQ==", + "license": "ISC", + "bin": { + "jsonrepair": "bin/cli.js" + } + }, + "node_modules/jsonschema": { + "version": "1.5.0", + "resolved": 
"https://registry.npmjs.org/jsonschema/-/jsonschema-1.5.0.tgz", + "integrity": "sha512-K+A9hhqbn0f3pJX17Q/7H6yQfD/5OXgdrR5UE12gMXCiN9D5Xq2o5mddV2QEcX/bjla99ASsAAQUyMCCRWAEhw==", + "license": "MIT", + "engines": { + "node": "*" + } + }, "node_modules/jsx-ast-utils": { "version": "3.3.5", "resolved": "https://registry.npmjs.org/jsx-ast-utils/-/jsx-ast-utils-3.3.5.tgz", @@ -11862,6 +11953,15 @@ "node": ">=8.0" } }, + "node_modules/pngjs": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/pngjs/-/pngjs-7.0.0.tgz", + "integrity": "sha512-LKWqWJRhstyYo9pGvgor/ivk2w94eSjE3RGVuzLGlr3NmD8bf7RcYGze1mNdEHRP6TRP6rMuDHk5t44hnTRyow==", + "license": "MIT", + "engines": { + "node": ">=14.19.0" + } + }, "node_modules/popmotion": { "version": "11.0.3", "resolved": "https://registry.npmjs.org/popmotion/-/popmotion-11.0.3.tgz", @@ -12220,6 +12320,34 @@ } } }, + "node_modules/react-native-audio-api": { + "version": "0.11.7", + "resolved": "https://registry.npmjs.org/react-native-audio-api/-/react-native-audio-api-0.11.7.tgz", + "integrity": "sha512-2oIoP77Tn2nlouRVfEC3bAsuSyKU6xhGNkSnVXTLLQQZslEDoYX2cN9pVRZoWOqhFrLT8q4IZI9HaFgYL13L1A==", + "license": "MIT", + "dependencies": { + "semver": "^7.7.3" + }, + "bin": { + "setup-rn-audio-api-web": "scripts/setup-rn-audio-api-web.js" + }, + "peerDependencies": { + "react": "*", + "react-native": "*" + } + }, + "node_modules/react-native-audio-api/node_modules/semver": { + "version": "7.7.4", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.4.tgz", + "integrity": "sha512-vFKC2IEtQnVhpT78h1Yp8wzwrf8CM+MzKMHGJZfBtzhZNycRFnXsHk6E5TxIkkMsgNS7mdX3AGB7x2QM2di4lA==", + "license": "ISC", + "bin": { + "semver": "bin/semver.js" + }, + "engines": { + "node": ">=10" + } + }, "node_modules/react-native-device-info": { "version": "15.0.1", "resolved": "https://registry.npmjs.org/react-native-device-info/-/react-native-device-info-15.0.1.tgz", @@ -12229,6 +12357,38 @@ "react-native": "*" } }, + 
"node_modules/react-native-executorch": { + "version": "0.8.1", + "resolved": "https://registry.npmjs.org/react-native-executorch/-/react-native-executorch-0.8.1.tgz", + "integrity": "sha512-DEVWs+Ki7p1C8mEgsHiabZizO/kDM0zELlJ+JFCfNCb2RrraMUXBTZIARWHPUbxpG17nqFswIZmwjUoNK5V36g==", + "license": "MIT", + "workspaces": [ + "example" + ], + "dependencies": { + "@huggingface/jinja": "^0.5.0", + "jsonrepair": "^3.12.0", + "jsonschema": "^1.5.0", + "pngjs": "^7.0.0", + "zod": "^4.3.6" + }, + "peerDependencies": { + "react": "*", + "react-native": "*" + } + }, + "node_modules/react-native-executorch-bare-resource-fetcher": { + "version": "0.8.0", + "resolved": "https://registry.npmjs.org/react-native-executorch-bare-resource-fetcher/-/react-native-executorch-bare-resource-fetcher-0.8.0.tgz", + "integrity": "sha512-PzSzK31qnKmwW06+JCbpQML24u3XiqYcWKQG0Y1cwPmkOqz0VppI0ZOeCZh03/03SMyuvwwEgteJtgO0uSP8sg==", + "license": "MIT", + "peerDependencies": { + "@dr.pogodin/react-native-fs": "^2.0.0", + "@kesha-antonov/react-native-background-downloader": "^4.0.0", + "react-native": "*", + "react-native-executorch": "*" + } + }, "node_modules/react-native-fit-image": { "version": "1.5.5", "resolved": "https://registry.npmjs.org/react-native-fit-image/-/react-native-fit-image-1.5.5.tgz", @@ -14716,7 +14876,6 @@ "version": "4.3.6", "resolved": "https://registry.npmjs.org/zod/-/zod-4.3.6.tgz", "integrity": "sha512-rftlrkhHZOcjDwkGlnUtZZkvaPHCsDATp4pGpuOOMDaTdDDXF91wuVDJoWoPsKX/3YPQ5fHuF3STjcYyKr+Qhg==", - "dev": true, "license": "MIT", "funding": { "url": "https://github.com/sponsors/colinhacks" diff --git a/package.json b/package.json index 0650d784a..54ceb6a25 100644 --- a/package.json +++ b/package.json @@ -20,7 +20,9 @@ "postinstall": "patch-package" }, "dependencies": { + "@dr.pogodin/react-native-fs": "^2.38.1", "@gorhom/bottom-sheet": "^5.2.8", + "@kesha-antonov/react-native-background-downloader": "^4.5.4", "@op-engineering/op-sqlite": "^15.2.5", 
"@react-native-async-storage/async-storage": "^2.2.0", "@react-native-community/blur": "^4.4.1", @@ -42,7 +44,10 @@ "patch-package": "^8.0.1", "react": "19.2.0", "react-native": "0.83.1", + "react-native-audio-api": "^0.11.7", "react-native-device-info": "^15.0.1", + "react-native-executorch": "^0.8.1", + "react-native-executorch-bare-resource-fetcher": "^0.8.0", "react-native-fs": "^2.20.0", "react-native-gesture-handler": "^2.30.0", "react-native-haptic-feedback": "^2.3.3", diff --git a/src/components/AudioMessageBubble/PlaybackControls.tsx b/src/components/AudioMessageBubble/PlaybackControls.tsx new file mode 100644 index 000000000..8477b595e --- /dev/null +++ b/src/components/AudioMessageBubble/PlaybackControls.tsx @@ -0,0 +1,264 @@ +import React, { useState, useCallback, useEffect, useRef } from 'react'; +import { + View, + Text, + TouchableOpacity, + ActivityIndicator, +} from 'react-native'; +import { ScrollView } from 'react-native-gesture-handler'; +import Slider from '@react-native-community/slider'; +import { stripMarkdownForSpeech } from '../../utils/messageContent'; +import { MarkdownText } from '../MarkdownText'; +import Icon from 'react-native-vector-icons/Feather'; +import { useTTSStore } from '../../stores/ttsStore'; +import type { ThemeColors } from '../../theme'; + +const SPEED_STEPS: number[] = [0.5, 0.8, 0.9, 1.0, 1.1, 1.2, 1.5, 2.0]; + +function formatDuration(seconds: number): string { + const m = Math.floor(seconds / 60); + const s = Math.floor(seconds % 60); + return `${m}:${s.toString().padStart(2, '0')}`; +} + +interface PlaybackState { + isThisPlaying: boolean; + isThisPaused: boolean; + isThisAudible: boolean; + isThisLoading: boolean; +} + +/** Derives playback state for a given messageId from TTS store selectors */ +export function usePlaybackState(messageId: string): PlaybackState { + const isSpeaking = useTTSStore((s) => s.isSpeaking); + const isPaused = useTTSStore((s) => s.isPaused); + const isAudioPlaying = useTTSStore((s) => 
s.isSpeaking); + const currentMessageId = useTTSStore((s) => s.currentMessageId); + + const isThisPlaying = isSpeaking && currentMessageId === messageId && !isPaused; + const isThisPaused = isSpeaking && currentMessageId === messageId && isPaused; + const isThisAudible = isAudioPlaying && currentMessageId === messageId && !isPaused; + const isThisLoading = isThisPlaying && !isThisAudible; + + return { isThisPlaying, isThisPaused, isThisAudible, isThisLoading }; +} + +/** Hook for wall-clock elapsed timer */ +export function useElapsedTimer( + playback: { isThisAudible: boolean; isThisPaused: boolean }, + seekOffsetRef: React.MutableRefObject, +) { + const { isThisAudible, isThisPaused } = playback; + // playSessionId is a monotonic counter that increments on every new play — + // guarantees the effect re-runs even if boolean deps appear unchanged. + const playSessionId = useTTSStore((s) => s.playSessionId); + const [localElapsed, setLocalElapsed] = useState(0); + const startTimeRef = useRef(0); + const pausedAtRef = useRef(0); + + useEffect(() => { + console.log('[Timer] effect: isThisAudible=', isThisAudible, 'isThisPaused=', isThisPaused, 'playSessionId=', playSessionId); + if (!isThisAudible && !isThisPaused) { + if (seekOffsetRef.current === 0) { + setLocalElapsed(0); + pausedAtRef.current = 0; + } + console.log('[Timer] not audible, not paused — resetting'); + return; + } + if (isThisPaused) { + pausedAtRef.current = localElapsed; + console.log('[Timer] paused at', localElapsed); + return; + } + const offset = seekOffsetRef.current || pausedAtRef.current; + seekOffsetRef.current = 0; + startTimeRef.current = Date.now() - offset * 1000; + console.log('[Timer] STARTING interval, offset=', offset); + const id = setInterval(() => { + setLocalElapsed((Date.now() - startTimeRef.current) / 1000); + }, 50); + return () => { console.log('[Timer] CLEARING interval'); clearInterval(id); }; + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [isThisAudible, 
isThisPaused, playSessionId]); + + return { localElapsed, setLocalElapsed }; +} + +/** Play/pause button with loading states */ +export const PlayButton: React.FC<{ + isLoading: boolean; + isThisLoading: boolean; + isThisPlaying: boolean; + onPlayPause: () => void; + colors: ThemeColors; + styles: any; +}> = ({ isLoading, isThisLoading, isThisPlaying, onPlayPause, colors, styles }) => { + if (isLoading) { + return ( + + + + ); + } + if (isThisLoading) { + return ( + + + + ); + } + return ( + + + + ); +}; + +/** Speed cycle chip */ +export const SpeedChip: React.FC<{ + styles: any; +}> = ({ styles }) => { + const speed = useTTSStore((s) => s.settings.speed); + const updateSettings = useTTSStore((s) => s.updateSettings); + + const handleSpeedCycle = useCallback(() => { + let idx = SPEED_STEPS.indexOf(speed); + if (idx < 0) { + idx = SPEED_STEPS.findIndex((s) => s > speed) - 1; + if (idx < 0) idx = 0; + } + const next = (idx + 1) % SPEED_STEPS.length; + updateSettings({ speed: SPEED_STEPS[next] }); + }, [speed, updateSettings]); + + return ( + + {speed}x + + ); +}; + +/** Duration display */ +export const DurationText: React.FC<{ + isLoading: boolean; + totalDuration: number; + styles: any; +}> = ({ isLoading, totalDuration, styles }) => ( + + {isLoading ? 
'—' : formatDuration(totalDuration)} + +); + +/** Seekable progress bar using native Slider component */ +export const SeekBar: React.FC<{ + displayProgress: number; + colors: ThemeColors; + styles: any; + onSeek: (fraction: number) => void; +}> = ({ displayProgress, colors, styles, onSeek }) => { + const [isSeeking, setIsSeeking] = useState(false); + const [seekValue, setSeekValue] = useState(0); + + return ( + { setIsSeeking(true); setSeekValue(val); }} + onValueChange={(val) => { if (isSeeking) setSeekValue(val); }} + onSlidingComplete={(val) => { setIsSeeking(false); onSeek(val); }} + /> + ); +}; + +/** Transcript toggle and content */ +export const TranscriptToggle: React.FC<{ + transcript?: string; + colors: ThemeColors; + styles: any; + isOpen: boolean; + onToggle: (v: boolean) => void; +}> = ({ transcript, colors, styles, isOpen, onToggle }) => { + if (!transcript) return null; + + return ( + onToggle(!isOpen)} + style={styles.transcriptToggle} + > + + {isOpen ? 'Hide transcript' : 'Show transcript'} + + + + ); +}; + +export const TranscriptContent: React.FC<{ + transcript: string; + styles: any; +}> = ({ transcript, styles }) => ( + + + {transcript} + + +); + +/** Hook for seek logic */ +interface SeekHandlerParams { + transcript: string | undefined; + audioPath: string; + messageId: string; + totalDurationRef: React.MutableRefObject; + seekOffsetRef: React.MutableRefObject; + setLocalElapsed: (v: number) => void; + setIsSeeking: (v: boolean) => void; +} + +export function useSeekHandler({ + transcript, audioPath, messageId, + totalDurationRef, seekOffsetRef, setLocalElapsed, setIsSeeking, +}: SeekHandlerParams) { + const stop = useTTSStore((s) => s.stop); + const speak = useTTSStore((s) => s.speak); + + return useCallback((fraction: number) => { + if (!transcript || audioPath) return; + const text = stripMarkdownForSpeech(transcript); + const charOffset = Math.floor(fraction * text.length); + const seekPoint = text.lastIndexOf('. 
', charOffset) + 2 || charOffset; + const remaining = text.slice(seekPoint).trim(); + console.log(`[AudioBubble] seeking to ${Math.round(fraction * 100)}%`, 'charOffset:', charOffset, 'remaining:', remaining.length, 'chars'); + if (!remaining) return; + const seekSeconds = Math.floor(fraction * totalDurationRef.current); + seekOffsetRef.current = seekSeconds; + setLocalElapsed(seekSeconds); + setIsSeeking(true); + stop(); + setTimeout(() => { + speak(remaining, messageId).finally(() => setIsSeeking(false)); + }, 200); + }, [transcript, audioPath, stop, speak, messageId, totalDurationRef, seekOffsetRef, setLocalElapsed, setIsSeeking]); +} diff --git a/src/components/AudioMessageBubble/index.tsx b/src/components/AudioMessageBubble/index.tsx new file mode 100644 index 000000000..c18cfa6c2 --- /dev/null +++ b/src/components/AudioMessageBubble/index.tsx @@ -0,0 +1,390 @@ +import React, { useState, useCallback, useEffect, useRef, useMemo } from 'react'; +import { + View, + Text, + TouchableOpacity, + StyleSheet, + Animated, +} from 'react-native'; +import { stripMarkdownForSpeech } from '../../utils/messageContent'; +import { useTheme, useThemedStyles } from '../../theme'; +import { useTTSStore } from '../../stores/ttsStore'; +import { triggerHaptic } from '../../utils/haptics'; +import { TYPOGRAPHY, SPACING } from '../../constants'; +import type { ThemeColors, ThemeShadows } from '../../theme'; +import { ActionMenuSheet } from '../ChatMessage/components/ActionMenuSheet'; +import { createStyles as createChatStyles } from '../ChatMessage/styles'; +import { + usePlaybackState, + useElapsedTimer, + useSeekHandler, + PlayButton, + SpeedChip, + DurationText, + SeekBar, + TranscriptToggle, + TranscriptContent, +} from './PlaybackControls'; + +const WAVEFORM_BARS = 48; + +interface AudioMessageBubbleProps { + messageId: string; + audioPath: string; + waveformData: number[]; + durationSeconds: number; + transcript?: string; + isUser?: boolean; + isLoading?: boolean; + 
_reasoningContent?: string; + onCopy?: (content: string) => void; + onRetry?: () => void; + onEdit?: (newContent: string) => void; +} + +function subsample(data: number[], count: number): number[] { + if (data.length === 0) { + return Array.from({ length: count }, (_, i) => 0.25 + 0.25 * Math.sin((i / count) * Math.PI * 4)); + } + const step = data.length / count; + const result: number[] = []; + for (let i = 0; i < count; i++) { + result.push(data[Math.floor(i * step)] ?? 0.1); + } + return result; +} + +function normalize(data: number[]): number[] { + const max = Math.max(...data, 0.001); + return data.map((v) => v / max); +} + +/** WhatsApp-style waveform — bars tint as the playhead passes over them. + * Played bars are full color, unplayed bars are muted. */ +const WaveformBars: React.FC<{ + data: number[]; + colors: ThemeColors; + /** 0–1 playback progress — bars behind the playhead are tinted */ + progress?: number; +}> = ({ data, colors, progress = 0 }) => { + const bars = useMemo(() => normalize(subsample(data, WAVEFORM_BARS)), [data]); + + return ( + + {bars.map((shape, i) => { + const played = progress > 0 && (i / bars.length) < progress; + return ( + + ); + })} + + ); +}; + +const barStyles = StyleSheet.create({ + container: { + flex: 1, + flexDirection: 'row', + alignItems: 'center', + gap: 1.5, + height: 40, + overflow: 'hidden', + }, + bar: { + flex: 1, + borderRadius: 2, + }, +}); + +/** Three pulsing dots shown while the LLM is generating */ +const ThinkingDots: React.FC<{ colors: ThemeColors }> = ({ colors }) => { + const dots = useRef([new Animated.Value(0.3), new Animated.Value(0.3), new Animated.Value(0.3)]).current; + + useEffect(() => { + const anims = dots.map((v, i) => + Animated.loop( + Animated.sequence([ + Animated.delay(i * 150), + Animated.timing(v, { toValue: 1, duration: 300, useNativeDriver: false }), + Animated.timing(v, { toValue: 0.3, duration: 300, useNativeDriver: false }), + ]), + ), + ); + anims.forEach((a) => a.start()); + 
return () => anims.forEach((a) => a.stop()); + }, [dots]); + + return ( + + {dots.map((v, i) => ( + + ))} + + ); +}; + +const dotStyles = StyleSheet.create({ + container: { + flex: 1, + flexDirection: 'row', + alignItems: 'center', + gap: 6, + paddingHorizontal: 4, + height: 32, + }, + dot: { + width: 7, + height: 7, + borderRadius: 4, + }, +}); + +export const AudioMessageBubble: React.FC = ({ + messageId, + audioPath, + waveformData, + durationSeconds, + transcript, + isUser = false, + isLoading = false, + _reasoningContent, + onCopy, + onRetry, + onEdit, +}) => { + const { colors } = useTheme(); + const styles = useThemedStyles(createStyles); + const chatStyles = useThemedStyles(createChatStyles); + const [showActionMenu, setShowActionMenu] = useState(false); + const speed = useTTSStore((s) => s.settings.speed); + const playMessage = useTTSStore((s) => s.playMessage); + const speak = useTTSStore((s) => s.speak); + + const { isThisPlaying, isThisPaused, isThisAudible, isThisLoading } = usePlaybackState(messageId); + const currentMessageId = useTTSStore((s) => s.currentMessageId); + + useEffect(() => { + console.log('[AudioBubble] state: messageId=', messageId, 'currentMessageId=', currentMessageId, 'isThisAudible=', isThisAudible, 'isThisPlaying=', isThisPlaying); + }, [messageId, currentMessageId, isThisAudible, isThisPlaying]); + const [showTranscript, setShowTranscript] = useState(false); + const [isSeeking, setIsSeeking] = useState(false); + const seekOffsetRef = useRef(0); + const { localElapsed, setLocalElapsed } = useElapsedTimer({ isThisAudible, isThisPaused }, seekOffsetRef); + + const handlePlayPause = useCallback(() => { + const { pause, resume } = useTTSStore.getState(); + if (isThisPaused) { resume(); return; } + if (isThisPlaying) { pause(); return; } + if (audioPath) { + playMessage(messageId, audioPath); + } else { + const text = stripMarkdownForSpeech(transcript ?? 
''); + speak(text, messageId); + } + }, [isThisPlaying, isThisPaused, playMessage, speak, messageId, audioPath, transcript]); + + const totalDurationRef = useRef(0); + const totalDuration = useMemo(() => { + if (!audioPath && transcript) { + const wordCount = transcript.trim().split(/\s+/).filter(Boolean).length; + return Math.max(1, wordCount / (2.5 * speed)); + } + return durationSeconds; + }, [audioPath, transcript, speed, durationSeconds]); + totalDurationRef.current = totalDuration; + + const handleSeek = useSeekHandler({ + transcript, audioPath, messageId, + totalDurationRef, seekOffsetRef, setLocalElapsed, setIsSeeking, + }); + + const isThisActive = ((isThisPlaying || isThisPaused) && currentMessageId === messageId) || isSeeking; + const progress = isThisActive ? Math.min(1, localElapsed / Math.max(1, totalDuration)) : 0; + + // Waveform + seekbar overlay — seekbar sits on top of the waveform, centered vertically + const waveformWithSeek = ( + + {isLoading && !isUser + ? + : } + {!isLoading && ( + + + + )} + + ); + + const handleLongPress = useCallback(() => { + if (isLoading) return; + triggerHaptic('impactMedium'); + setShowActionMenu(true); + }, [isLoading]); + + const showActions = !!(onCopy || onRetry || onEdit); + + return ( + + + + + {waveformWithSeek} + + + + + + + + {showActions && !isLoading && ( + { triggerHaptic('impactLight'); setShowActionMenu(true); }}> + ••• + + )} + + + + + {showTranscript && transcript ? ( + + ) : null} + + setShowActionMenu(false)} + isUser={isUser} + canEdit={isUser && !!onEdit} + canRetry={!!onRetry} + canGenerateImage={false} + canSpeak={false} + styles={chatStyles} + onCopy={() => { onCopy?.(transcript ?? 
''); setShowActionMenu(false); }} + onEdit={() => setShowActionMenu(false)} + onRetry={() => { onRetry?.(); setShowActionMenu(false); }} + onGenerateImage={() => setShowActionMenu(false)} + onSpeak={() => setShowActionMenu(false)} + /> + + ); +}; + +const createStyles = (colors: ThemeColors, _shadows: ThemeShadows) => ({ + bubble: { + backgroundColor: colors.surface, + borderRadius: 12, + borderWidth: 1, + borderColor: colors.border, + padding: SPACING.md, + width: '88%' as const, + alignSelf: 'flex-start' as const, + gap: SPACING.sm, + overflow: 'hidden' as const, + }, + bubbleUser: { + alignSelf: 'flex-end' as const, + backgroundColor: `${colors.primary}18`, + borderColor: `${colors.primary}40`, + }, + playRow: { + flexDirection: 'row' as const, + alignItems: 'center' as const, + gap: SPACING.xs, + }, + metaRow: { + flexDirection: 'row' as const, + alignItems: 'center' as const, + justifyContent: 'space-between' as const, + }, + metaRight: { + flexDirection: 'row' as const, + alignItems: 'center' as const, + gap: SPACING.sm, + }, + playButton: { + width: 28, + height: 28, + borderRadius: 14, + backgroundColor: `${colors.primary}20`, + alignItems: 'center' as const, + justifyContent: 'center' as const, + }, + playButtonDisabled: { + opacity: 0.35, + }, + duration: { + ...TYPOGRAPHY.meta, + color: colors.textMuted, + minWidth: 32, + textAlign: 'right' as const, + }, + speedChip: { + backgroundColor: colors.surfaceLight, + borderRadius: 10, + paddingHorizontal: SPACING.sm, + paddingVertical: SPACING.xs, + borderWidth: 1, + borderColor: colors.border, + }, + speedText: { + ...TYPOGRAPHY.metaSmall, + color: colors.textSecondary, + }, + waveformSeekContainer: { + flex: 1, + position: 'relative' as const, + marginLeft: SPACING.sm, + }, + seekOverlay: { + position: 'absolute' as const, + top: 0, + left: -16, + right: -16, + bottom: 0, + justifyContent: 'center' as const, + }, + seekSlider: { + height: 40, + }, + transcriptToggle: { + flexDirection: 'row' as const, + 
alignItems: 'center' as const, + gap: SPACING.xs, + }, + transcriptToggleText: { + ...TYPOGRAPHY.meta, + color: colors.textMuted, + }, + transcriptContent: { + paddingTop: SPACING.xs, + }, + transcriptScroll: { + maxHeight: 120, + }, + transcriptText: { + ...TYPOGRAPHY.bodySmall, + lineHeight: 20, + }, + actionHint: { + padding: 4, + }, + actionHintText: { + ...TYPOGRAPHY.bodySmall, + color: colors.textMuted, + letterSpacing: 1, + }, +}); diff --git a/src/components/ChatInput/Attachments.tsx b/src/components/ChatInput/Attachments.tsx index bdf90cdfe..b96e3b53b 100644 --- a/src/components/ChatInput/Attachments.tsx +++ b/src/components/ChatInput/Attachments.tsx @@ -101,9 +101,21 @@ export function useAttachments(setAlertState: (state: AlertState) => void) { } }; + const addAudioAttachment = (uri: string, audioFormat: 'wav' | 'mp3', audioDurationSeconds?: number) => { + const attachment: MediaAttachment = { + id: nextAttachmentId(), + type: 'audio', + uri, + audioFormat, + audioDurationSeconds, + fileName: uri.split('/').pop(), + }; + setAttachments(prev => [...prev, attachment]); + }; + const clearAttachments = () => setAttachments([]); - return { attachments, removeAttachment, clearAttachments, handlePickImage, handlePickDocument }; + return { attachments, removeAttachment, clearAttachments, handlePickImage, handlePickDocument, addAudioAttachment }; } // ─── AttachmentPreview component ───────────────────────────────────────────── @@ -135,6 +147,11 @@ export const AttachmentPreview: React.FC = ({ attachment source={{ uri: attachment.uri }} style={styles.attachmentImage} /> + ) : attachment.type === 'audio' ? 
( + + + Voice + ) : ( diff --git a/src/components/ChatInput/AudioModeLayout.tsx b/src/components/ChatInput/AudioModeLayout.tsx new file mode 100644 index 000000000..1cd957cb6 --- /dev/null +++ b/src/components/ChatInput/AudioModeLayout.tsx @@ -0,0 +1,239 @@ +import React from 'react'; +import { View, TouchableOpacity, Text, ActivityIndicator } from 'react-native'; +import Icon from 'react-native-vector-icons/Feather'; +import { useTheme } from '../../theme'; +import { ImageModeState, MediaAttachment } from '../../types'; +import { VoiceRecordButton } from '../VoiceRecordButton'; +import { triggerHaptic } from '../../utils/haptics'; +import { CustomAlert, hideAlert, AlertState } from '../CustomAlert'; +import { QueueRow } from './Toolbar'; +import { AttachmentPreview } from './Attachments'; +import { AttachPickerPopover, VoicePickerPopover, QuickSettingsPopover } from './Popovers'; +import { useTTSStore } from '../../stores/ttsStore'; +import type { TTSVoice } from '../../engine'; + +interface AudioModeLayoutProps { + styles: any; + disabled?: boolean; + isGenerating?: boolean; + imageMode: ImageModeState; + imageModelLoaded: boolean; + supportsThinking: boolean; + supportsToolCalling: boolean; + enabledToolCount: number; + thinkingEnabled: boolean; + currentVoice: TTSVoice; + // Attachments + attachments: MediaAttachment[]; + onRemoveAttachment: (id: string) => void; + // Queue + queueCount: number; + queuedTexts: string[]; + onClearQueue?: () => void; + // Voice recording + isRecording: boolean; + voiceAvailable: boolean; + isModelLoading: boolean; + isTranscribing: boolean; + partialResult: string; + error: string | null; + onStartRecording: () => void; + onStopRecording: () => void; + onCancelRecording: () => void; + // Handlers + onStop?: () => void; + onImageModeToggle: () => void; + onThinkingToggle: () => void; + onToolsPress?: () => void; + onVisionPress: () => void; + onPickDocument: () => void; + // Popovers + attachPicker: any; + voicePicker: any; + 
quickSettings: any; + supportsVision: boolean; + // Alert + alertState: AlertState; + setAlertState: (s: AlertState) => void; +} + +export const AudioModeLayout: React.FC = ({ + styles, + disabled, + isGenerating, + imageMode, + imageModelLoaded, + supportsThinking, + supportsToolCalling, + enabledToolCount, + thinkingEnabled, + currentVoice, + attachments, + onRemoveAttachment, + queueCount, + queuedTexts, + onClearQueue, + isRecording, + voiceAvailable, + isModelLoading, + isTranscribing, + partialResult, + error, + onStartRecording, + onStopRecording, + onCancelRecording, + onStop, + onImageModeToggle, + onThinkingToggle, + onToolsPress, + onVisionPress, + onPickDocument, + attachPicker, + voicePicker, + quickSettings, + supportsVision, + alertState, + setAlertState, +}) => { + const { colors } = useTheme(); + const isChangingVoice = false; // Voice change state is handled by the engine internally + + const handleStop = () => { + if (onStop && isGenerating) { + triggerHaptic('impactLight'); + onStop(); + } + }; + + const audioStopButton = isGenerating && onStop ? ( + + + + ) : null; + + return ( + + + + + attachPicker.show()} + disabled={disabled} + hitSlop={{ top: 4, bottom: 4, left: 8, right: 8 }} + > + + + { + triggerHaptic('impactLight'); + useTTSStore.getState().updateSettings({ interfaceMode: 'chat' }); + }} + hitSlop={{ top: 4, bottom: 4, left: 8, right: 8 }} + > + + + + + + {supportsThinking && ( + + + + )} + { triggerHaptic('impactLight'); onToolsPress?.(); }} + disabled={disabled || !supportsToolCalling} + hitSlop={{ top: 4, bottom: 4, left: 8, right: 8 }} + > + 0 ? colors.primary : !supportsToolCalling ? colors.textMuted : colors.textSecondary} /> + + voicePicker.show()} + hitSlop={{ top: 4, bottom: 4, left: 8, right: 8 }} + > + {isChangingVoice + ? + : } + {currentVoice.label} + + + {isGenerating && onStop ? 
( + audioStopButton + ) : ( + + )} + + + + + + setAlertState(hideAlert())} + /> + + ); +}; diff --git a/src/components/ChatInput/Popovers.tsx b/src/components/ChatInput/Popovers.tsx index 52a61b694..53fc42f77 100644 --- a/src/components/ChatInput/Popovers.tsx +++ b/src/components/ChatInput/Popovers.tsx @@ -1,11 +1,15 @@ import React from 'react'; import { View, TouchableOpacity, Text, StyleSheet, Modal, TouchableWithoutFeedback } from 'react-native'; import Icon from 'react-native-vector-icons/Feather'; +import { useNavigation } from '@react-navigation/native'; import { useTheme } from '../../theme'; import { ImageModeState } from '../../types'; -import { useAppStore } from '../../stores'; +import { useAppStore, useTTSStore } from '../../stores'; import { triggerHaptic } from '../../utils/haptics'; -import { FONTS } from '../../constants'; +import { FONTS, TYPOGRAPHY } from '../../constants'; +import type { TTSVoice } from '../../engine'; +import type { NativeStackNavigationProp } from '@react-navigation/native-stack'; +import type { RootStackParamList } from '../../navigation/types'; // ─── Shared Styles ────────────────────────────────────────────────────────── @@ -100,11 +104,28 @@ export const QuickSettingsPopover: React.FC = ({ }) => { const { colors } = useTheme(); const { settings, updateSettings } = useAppStore(); + const { settings: ttsSettings, isReady: ttsReady, updateSettings: updateTTSSettings, initializeEngine } = useTTSStore(); + const navigation = useNavigation>(); if (!visible) return null; const imgBadge = getImageModeBadge(imageMode, colors); const tools = getToolsStyle(supportsToolCalling, enabledToolCount, colors); + const ttsMode = ttsSettings.interfaceMode; + const ttsBadge = !ttsReady + ? { label: 'N/A', bg: colors.textMuted } + : ttsMode === 'audio' + ? 
{ label: 'Audio', bg: colors.primary } + : { label: 'Chat', bg: `${colors.textMuted}80` }; + + const handleTTSToggle = () => { + triggerHaptic('impactLight'); + if (!ttsReady) { onClose(); navigation.navigate('TTSSettings'); return; } + onClose(); + const next = ttsMode === 'audio' ? 'chat' : 'audio'; + updateTTSSettings({ interfaceMode: next }); + if (next === 'audio') initializeEngine(); + }; return ( @@ -150,6 +171,18 @@ export const QuickSettingsPopover: React.FC = ({ )} + + + Voice + + {ttsBadge.label} + + + = ({ ); }; + +// ─── Voice Picker Popover ────────────────────────────────────────────────── + +interface VoicePickerPopoverProps { + visible: boolean; + onClose: () => void; + anchorY: number; + anchorX: number; +} + +export const VoicePickerPopover: React.FC = ({ + visible, onClose, anchorY, anchorX, +}) => { + const { colors } = useTheme(); + const { voices, activeVoiceId, isSpeaking, stop, setVoice } = useTTSStore(); + + if (!visible) return null; + + const handleSelect = (voice: TTSVoice) => { + triggerHaptic('impactLight'); + if (isSpeaking) { stop(); } + setVoice(voice.id); + onClose(); + }; + + return ( + + + + + + {voices.map((voice) => { + const isActive = voice.id === activeVoiceId; + return ( + handleSelect(voice)} + > + + + + {voice.label} + + + {voice.metadata.persona || ''} + + + {isActive && } + + ); + })} + + + + + + ); +}; + +const voicePickerStyles = StyleSheet.create({ + popover: { + minWidth: 200, + }, + labelCol: { + flex: 1, + }, + accent: { + ...TYPOGRAPHY.meta, + marginTop: 1, + }, +}); diff --git a/src/components/ChatInput/Voice.ts b/src/components/ChatInput/Voice.ts index 1cc66a19e..616b6bcaf 100644 --- a/src/components/ChatInput/Voice.ts +++ b/src/components/ChatInput/Voice.ts @@ -1,35 +1,195 @@ -import { useEffect, useRef } from 'react'; +import { useEffect, useRef, useState } from 'react'; import { useWhisperTranscription } from '../../hooks/useWhisperTranscription'; -import { useWhisperStore } from '../../stores'; +import { 
useWhisperStore, useChatStore } from '../../stores'; +import { useTTSStore } from '../../stores/ttsStore'; +import { llmService } from '../../services/llm'; +import { audioRecorderService } from '../../services/audioRecorderService'; +import { whisperService } from '../../services/whisperService'; +import logger from '../../utils/logger'; interface UseVoiceInputParams { conversationId?: string | null; onTranscript: (text: string) => void; + onAudioAttachment?: (uri: string, format: 'wav' | 'mp3', durationSeconds?: number) => void; + /** Called in Audio Mode to auto-send. Includes audio info so caller can build attachment atomically. */ + onAutoSend?: (text: string, audio: { uri: string; format: 'wav' | 'mp3'; durationSeconds: number }) => void; } -export function useVoiceInput({ conversationId, onTranscript }: UseVoiceInputParams) { +export function useVoiceInput({ conversationId, onTranscript, onAudioAttachment, onAutoSend }: UseVoiceInputParams) { const recordingConversationIdRef = useRef(null); const onTranscriptRef = useRef(onTranscript); onTranscriptRef.current = onTranscript; + const onAudioAttachmentRef = useRef(onAudioAttachment); + onAudioAttachmentRef.current = onAudioAttachment; + const onAutoSendRef = useRef(onAutoSend); + onAutoSendRef.current = onAutoSend; const { downloadedModelId } = useWhisperStore(); + const [isDirectRecording, setIsDirectRecording] = useState(false); + const [isAudioModeRecording, setIsAudioModeRecording] = useState(false); + const [isTranscribingFile, setIsTranscribingFile] = useState(false); + const [directError, setDirectError] = useState(null); const { - isRecording, + isRecording: isWhisperRecording, isModelLoading, - isTranscribing, + isTranscribing: isWhisperTranscribing, partialResult, finalResult, - error, - startRecording: startRecordingBase, - stopRecording, + error: whisperError, + startRecording: startWhisperRecording, + stopRecording: stopWhisperRecording, clearResult, } = useWhisperTranscription(); - const 
voiceAvailable = !!downloadedModelId; + const supportsDirectAudio = (): boolean => { + const support = llmService.getMultimodalSupport(); + return Boolean(support?.audio) && audioRecorderService.supportsDirectAudioInput(); + }; + + const isInAudioInterfaceMode = (): boolean => + useTTSStore.getState().settings.interfaceMode === 'audio'; + + // Use file-based transcription path when: Audio Mode + Whisper available + not direct audio model + const shouldUseFilePath = (): boolean => + isInAudioInterfaceMode() && !!downloadedModelId && !supportsDirectAudio(); + + const isTranscribing = isWhisperTranscribing || isTranscribingFile; + const isRecording = isDirectRecording || isAudioModeRecording || isWhisperRecording; + const error = directError ?? whisperError; + + // voiceAvailable: direct audio OR whisper downloaded + const voiceAvailable = supportsDirectAudio() || !!downloadedModelId; const startRecording = async () => { recordingConversationIdRef.current = conversationId || null; - await startRecordingBase(); + setDirectError(null); + // Stop any TTS playback before recording — mic and speaker shouldn't overlap + const tts = useTTSStore.getState(); + if (tts.isSpeaking) { tts.stop(); } + + if (supportsDirectAudio()) { + try { + setIsDirectRecording(true); + await audioRecorderService.startRecording(); + } catch (err) { + setIsDirectRecording(false); + const msg = err instanceof Error ? err.message : 'Recording failed'; + logger.error('[Voice] Direct audio recording error:', err); + setDirectError(msg); + } + return; + } + + if (shouldUseFilePath()) { + try { + setIsAudioModeRecording(true); + await audioRecorderService.startRecording(); + } catch (err) { + setIsAudioModeRecording(false); + const msg = err instanceof Error ? 
err.message : 'Recording failed'; + logger.error('[Voice] Audio mode recording error:', err); + setDirectError(msg); + } + return; + } + + await startWhisperRecording(); + }; + + const stopRecording = async () => { + if (isDirectRecording) { + try { + const { path, durationSeconds } = await audioRecorderService.stopRecording(); + setIsDirectRecording(false); + if (!recordingConversationIdRef.current || recordingConversationIdRef.current === conversationId) { + const format = audioRecorderService.getFormat(); + // In Audio Mode, auto-send directly — no transcription needed for multimodal models + if (onAutoSendRef.current && isInAudioInterfaceMode()) { + onAutoSendRef.current('', { uri: path, format, durationSeconds }); + + // Parallel transcription: send audio to model immediately, transcribe in background + // so the voice bubble gets a transcript for display/playback review + if (downloadedModelId) { + const convId = conversationId; + whisperService.transcribeFile(path).then(text => { + if (!text?.trim() || !convId) return; + const conv = useChatStore.getState().conversations.find(c => c.id === convId); + const msg = conv?.messages.find(m => + m.role === 'user' && m.attachments?.some(a => a.uri === path), + ); + if (msg) { + useChatStore.getState().updateMessageContent(convId, msg.id, text.trim()); + } + }).catch(err => logger.error('[Voice] Background transcription error:', err)); + } + } else { + onAudioAttachmentRef.current?.(path, format, durationSeconds); + } + } + recordingConversationIdRef.current = null; + } catch (err) { + setIsDirectRecording(false); + logger.error('[Voice] Failed to stop direct recording:', err); + } + return; + } + + if (isAudioModeRecording) { + try { + const { path, durationSeconds } = await audioRecorderService.stopRecording(); + setIsAudioModeRecording(false); + if (recordingConversationIdRef.current && recordingConversationIdRef.current !== conversationId) { + recordingConversationIdRef.current = null; + return; + } + 
setIsTranscribingFile(true); + let text = ''; + try { + text = await whisperService.transcribeFile(path); + } catch (transcribeErr) { + logger.error('[Voice] File transcription error:', transcribeErr); + } + setIsTranscribingFile(false); + recordingConversationIdRef.current = null; + if (text.trim()) { + if (onAutoSendRef.current) { + onAutoSendRef.current(text.trim(), { uri: path, format: 'wav', durationSeconds }); + } else { + onAudioAttachmentRef.current?.(path, 'wav', durationSeconds); + onTranscriptRef.current(text.trim()); + } + } else { + // Transcription returned nothing — clip too short or too quiet + setDirectError("Couldn't hear that — try again"); + setTimeout(() => setDirectError(null), 3000); + } + } catch (err) { + setIsAudioModeRecording(false); + setIsTranscribingFile(false); + logger.error('[Voice] Failed to stop audio mode recording:', err); + } + return; + } + + await stopWhisperRecording(); + }; + + const cancelRecording = () => { + if (isDirectRecording) { + audioRecorderService.cancelRecording(); + setIsDirectRecording(false); + recordingConversationIdRef.current = null; + return; + } + if (isAudioModeRecording) { + audioRecorderService.cancelRecording(); + setIsAudioModeRecording(false); + recordingConversationIdRef.current = null; + return; + } + stopWhisperRecording(); + clearResult(); + recordingConversationIdRef.current = null; }; useEffect(() => { @@ -49,5 +209,20 @@ export function useVoiceInput({ conversationId, onTranscript }: UseVoiceInputPar } }, [finalResult, clearResult, conversationId]); - return { isRecording, isModelLoading, isTranscribing, partialResult, error, voiceAvailable, startRecording, stopRecording, clearResult }; + return { + isRecording, + isModelLoading, + isTranscribing, + partialResult, + error, + voiceAvailable, + startRecording, + stopRecording, + cancelRecording, + clearResult, + /** True when model accepts audio directly (no Whisper needed) */ + isDirectAudioMode: supportsDirectAudio(), + /** True when 
recording in Audio Mode for file-based transcription */ + isAudioModeRecording, + }; } diff --git a/src/components/ChatInput/index.tsx b/src/components/ChatInput/index.tsx index 1ebbb496e..7368cfb9f 100644 --- a/src/components/ChatInput/index.tsx +++ b/src/components/ChatInput/index.tsx @@ -1,4 +1,4 @@ -import React, { useState, useRef, useEffect } from 'react'; +import React, { useState, useRef, useEffect, useMemo } from 'react'; import { View, TextInput, TouchableOpacity, Animated, StyleSheet } from 'react-native'; import Icon from 'react-native-vector-icons/Feather'; import { useTheme, useThemedStyles } from '../../theme'; @@ -13,6 +13,10 @@ import { AttachmentPreview, useAttachments } from './Attachments'; import { useVoiceInput } from './Voice'; import { QuickSettingsPopover, AttachPickerPopover } from './Popovers'; import { useKeyboardAwarePopover } from './useKeyboardAwarePopover'; +import { useTTSStore } from '../../stores/ttsStore'; +import { useAppStore } from '../../stores'; +import type { TTSVoice } from '../../engine'; +import { AudioModeLayout } from './AudioModeLayout'; interface ChatInputProps { onSend: (message: string, attachments?: MediaAttachment[], imageMode?: ImageModeState) => void; @@ -33,7 +37,6 @@ interface ChatInputProps { supportsToolCalling?: boolean; supportsThinking?: boolean; onRepairVision?: () => void; - /** When set, mounts a single AttachStep for that index. Only one at a time to avoid waypoint dots. 
*/ activeSpotlight?: number | null; } @@ -69,7 +72,9 @@ export const ChatInput: React.FC = ({ const [alertState, setAlertState] = useState(initialAlertState); const quickSettings = useKeyboardAwarePopover(); const attachPicker = useKeyboardAwarePopover(); + const voicePicker = useKeyboardAwarePopover(); const inputRef = useRef(null); + const attachmentsRef = useRef([]); const hasText = message.length > 0; const iconsAnim = useRef(new Animated.Value(0)).current; @@ -81,9 +86,18 @@ export const ChatInput: React.FC = ({ }).start(); }, [hasText, iconsAnim]); - const { attachments, removeAttachment, clearAttachments, handlePickImage, handlePickDocument } = useAttachments(setAlertState); + const { attachments, removeAttachment, clearAttachments, handlePickImage, handlePickDocument, addAudioAttachment } = useAttachments(setAlertState); + attachmentsRef.current = attachments; + const ttsInterfaceMode = useTTSStore((s) => s.settings.interfaceMode); + const activeVoiceId = useTTSStore((s) => s.activeVoiceId); + const voices = useTTSStore((s) => s.voices); + const isAudioMode = ttsInterfaceMode === 'audio'; + const currentVoice: TTSVoice = useMemo( + () => voices.find((v) => v.id === activeVoiceId) ?? voices[0] ?? { id: 'default', label: 'Default', metadata: {} }, + [activeVoiceId, voices], + ); - const { isRecording, isModelLoading, isTranscribing, partialResult, error, voiceAvailable, startRecording, stopRecording, clearResult } = useVoiceInput({ + const { isRecording, isModelLoading, isTranscribing, partialResult, error, voiceAvailable, startRecording, stopRecording, cancelRecording } = useVoiceInput({ conversationId, onTranscript: (text) => { setMessage(prev => { @@ -91,8 +105,33 @@ export const ChatInput: React.FC = ({ return prefix + text; }); }, + onAudioAttachment: (uri, format, durationSeconds) => { + addAudioAttachment(uri, format, durationSeconds); + }, + onAutoSend: isAudioMode ? 
(text, audio) => { + const audioAttachment: MediaAttachment = { + id: `audio-${Date.now()}`, + type: 'audio', + uri: audio.uri, + audioFormat: audio.format, + audioDurationSeconds: audio.durationSeconds, + fileName: audio.uri.split('/').pop(), + }; + triggerHaptic('impactMedium'); + const all = [...attachmentsRef.current, audioAttachment]; + onSend(text, all, imageMode); + clearAttachments(); + } : undefined, }); + const { settings: appSettings, updateSettings: updateAppSettings } = useAppStore(); + const thinkingEnabled = appSettings.thinkingEnabled; + + const handleThinkingToggle = () => { + triggerHaptic('impactLight'); + updateAppSettings({ thinkingEnabled: !thinkingEnabled }); + }; + const canSend = (message.trim().length > 0 || attachments.length > 0) && !disabled; const handleSend = () => { @@ -137,9 +176,49 @@ export const ChatInput: React.FC = ({ } }; - const handleQuickSettingsPress = () => quickSettings.show(); - - const handleAttachPress = () => attachPicker.show(); + // ─── Audio mode: simplified mic-only layout ───────────────────────────────── + if (isAudioMode) { + return ( + + ); + } const actionButton = canSend ? 
( = ({ disabled={disabled} onStartRecording={startRecording} onStopRecording={stopRecording} - onCancelRecording={() => { stopRecording(); clearResult(); }} + onCancelRecording={cancelRecording} asSendButton /> ); - const content = ( + return ( = ({ onClearQueue={onClearQueue} /> - {/* Pill: text input + right icons */} = ({ blurOnSubmit={false} returnKeyType="default" /> - {/* Icons collapse when user starts typing, reappear when input is empty */} = ({ overflow: 'hidden' as const, }]} > - {/* Attach button — opens picker for image or document */} attachPicker.show()} disabled={disabled} hitSlop={{ top: 4, bottom: 4, left: 4, right: 4 }} > - + - - {/* Quick settings button */} + {supportsThinking && ( + + + + )} quickSettings.show()} disabled={disabled} hitSlop={{ top: 4, bottom: 4, left: 4, right: 4 }} > - - {/* Circular action button — conditionally wrapped with AttachStep */} {activeSpotlight === 12 ? ( {actionButton} ) : actionButton} @@ -253,7 +332,6 @@ export const ChatInput: React.FC = ({ onPhoto={handleVisionPress} onDocument={handlePickDocument} /> - = ({ enabledToolCount={enabledToolCount} onToolsPress={onToolsPress} /> - = ({ /> ); - - return content; }; const spotlightStyles = StyleSheet.create({ centered: { alignSelf: 'center' }, }); - diff --git a/src/components/ChatInput/styles.ts b/src/components/ChatInput/styles.ts index a9f8df69c..7aab9a884 100644 --- a/src/components/ChatInput/styles.ts +++ b/src/components/ChatInput/styles.ts @@ -1,5 +1,5 @@ import type { ThemeColors, ThemeShadows } from '../../theme'; -import { FONTS } from '../../constants'; +import { FONTS, TYPOGRAPHY, SPACING } from '../../constants'; import { Platform } from 'react-native'; export const PILL_ICON_SIZE = 32; @@ -208,4 +208,25 @@ export const createStyles = (colors: ThemeColors, _shadows: ThemeShadows) => ({ fontWeight: '500' as const, color: colors.primary, }, + // Audio mode layout + audioModeRow: { + flexDirection: 'row' as const, + alignItems: 'center' as const, + 
justifyContent: 'center' as const, + gap: SPACING.md, + paddingVertical: SPACING.xs, + }, + // Voice cycle button — shows icon + voice name + audioVoiceButton: { + flexDirection: 'row' as const, + alignItems: 'center' as const, + gap: 4, + paddingHorizontal: SPACING.sm, + height: 32, + borderRadius: 16, + }, + audioVoiceLabel: { + ...TYPOGRAPHY.meta, + color: colors.textSecondary, + }, }); diff --git a/src/components/ChatInput/useKeyboardAwarePopover.ts b/src/components/ChatInput/useKeyboardAwarePopover.ts index 13cdfaa4c..dc4f0b7b8 100644 --- a/src/components/ChatInput/useKeyboardAwarePopover.ts +++ b/src/components/ChatInput/useKeyboardAwarePopover.ts @@ -1,13 +1,15 @@ import { useRef, useEffect, useState, useCallback } from 'react'; import { Keyboard, Dimensions, Platform, StatusBar, TouchableOpacity } from 'react-native'; -import { SPACING } from '../../constants'; /** * Hook that manages keyboard-aware popover positioning. * When the keyboard is visible, dismisses it and waits for `keyboardDidHide` * before measuring position to ensure correct coordinates. + * + * anchorY → distance from screen bottom to trigger top (popover sits above trigger) + * anchorX → distance from screen right to trigger right edge (popover right-aligns with trigger) */ -export function useKeyboardAwarePopover(offsetX: number = SPACING.md) { +export function useKeyboardAwarePopover() { const [anchor, setAnchor] = useState({ y: 0, x: 0 }); const [visible, setVisible] = useState(false); const triggerRef = useRef>(null); @@ -27,13 +29,15 @@ export function useKeyboardAwarePopover(offsetX: number = SPACING.md) { const show = useCallback(() => { const measureAndShow = () => { - triggerRef.current?.measureInWindow?.((...args: number[]) => { - const screenH = Dimensions.get('window').height; - // On Android, measureInWindow Y includes the status bar but - // Dimensions.get('window').height may not — subtract the offset - // so the popover sits snugly above the trigger button. 
+ triggerRef.current?.measureInWindow?.((btnX: number, btnY: number, btnW: number) => { + const { height: screenH, width: screenW } = Dimensions.get('window'); + // On Android, measureInWindow Y includes the status bar height. const statusBarOffset = Platform.OS === 'android' ? (StatusBar.currentHeight ?? 0) : 0; - setAnchor({ y: screenH - (args[1] ?? 0) - statusBarOffset, x: offsetX }); + // bottom: how far the popover bottom sits above the screen bottom (= above the trigger) + const y = screenH - (btnY ?? 0) - statusBarOffset; + // right: align popover's right edge with the trigger button's right edge + const x = screenW - ((btnX ?? 0) + (btnW ?? 0)); + setAnchor({ y, x }); }); setVisible(true); }; @@ -54,7 +58,7 @@ export function useKeyboardAwarePopover(offsetX: number = SPACING.md) { } else { measureAndShow(); } - }, [offsetX]); + }, []); const hide = useCallback(() => setVisible(false), []); diff --git a/src/components/ChatMessage/components/ActionMenuSheet.tsx b/src/components/ChatMessage/components/ActionMenuSheet.tsx index 1f380fe2d..802bc5db1 100644 --- a/src/components/ChatMessage/components/ActionMenuSheet.tsx +++ b/src/components/ChatMessage/components/ActionMenuSheet.tsx @@ -12,11 +12,13 @@ interface ActionMenuSheetProps { canEdit: boolean; canRetry: boolean; canGenerateImage: boolean; + canSpeak: boolean; styles: any; onCopy: () => void; onEdit: () => void; onRetry: () => void; onGenerateImage: () => void; + onSpeak: () => void; } export function ActionMenuSheet({ @@ -26,11 +28,13 @@ export function ActionMenuSheet({ canEdit, canRetry, canGenerateImage, + canSpeak, styles, onCopy, onEdit, onRetry, onGenerateImage, + onSpeak, }: ActionMenuSheetProps) { const { colors } = useTheme(); @@ -89,6 +93,18 @@ export function ActionMenuSheet({ Generate Image )} + + {!isUser && canSpeak && ( + + + Speak + + )} ); diff --git a/src/components/ChatMessage/components/MessageAttachments.tsx b/src/components/ChatMessage/components/MessageAttachments.tsx index 
adead2c98..b798a2fcd 100644 --- a/src/components/ChatMessage/components/MessageAttachments.tsx +++ b/src/components/ChatMessage/components/MessageAttachments.tsx @@ -78,7 +78,22 @@ export function MessageAttachments({ return ( {attachments.map((attachment, index) => - attachment.type === 'document' ? ( + attachment.type === 'audio' ? ( + + + + Voice message + + + ) : attachment.type === 'document' ? ( ); } + // No content but may have thinking — render ThinkingBlock alone (audio mode above-bubble use case) + if (parsedContent.thinking) { + return ( + + ); + } return null; } diff --git a/src/components/ChatMessage/index.tsx b/src/components/ChatMessage/index.tsx index d80310b7d..f8c6e83f5 100644 --- a/src/components/ChatMessage/index.tsx +++ b/src/components/ChatMessage/index.tsx @@ -1,6 +1,7 @@ import React, { useState } from 'react'; import { View, Text, TouchableOpacity, Clipboard } from 'react-native'; import { useTheme, useThemedStyles } from '../../theme'; +import { useTTSStore } from '../../stores/ttsStore'; import Icon from 'react-native-vector-icons/Feather'; import { stripControlTokens } from '../../utils/messageContent'; import { CustomAlert, showAlert, hideAlert, AlertState, initialAlertState } from '../CustomAlert'; @@ -133,14 +134,16 @@ type MetaRowProps = { isStreaming?: boolean; showActions: boolean; onMenuOpen: () => void; + metaExtra?: React.ReactNode; }; -const MessageMetaRow: React.FC = ({ message, styles, isStreaming, showActions, onMenuOpen }) => ( +const MessageMetaRow: React.FC = ({ message, styles, isStreaming, showActions, onMenuOpen, metaExtra }) => ( {formatTime(message.timestamp)} {message.generationTimeMs != null && message.role === 'assistant' && ( {formatDuration(message.generationTimeMs)} )} + {metaExtra} {showActions && !isStreaming && ( ••• @@ -157,7 +160,9 @@ const ToolCallWithThinking: React.FC<{ return ( {!!tc?.thinking && ( - + + + )} {hasText && ( @@ -179,11 +184,17 @@ export const ChatMessage: React.FC = ({ onGenerateImage, 
showActions = true, canGenerateImage = false, + canSpeak: canSpeakProp = false, + onSpeak: onSpeakProp, showGenerationDetails = false, animateEntry = false, + metaExtra, }) => { const { colors } = useTheme(); const styles = useThemedStyles(createStyles); + const ttsCanSpeak = useTTSStore( + s => s.settings.enabled && s.isReady, + ); const [showActionMenu, setShowActionMenu] = useState(false); const [isEditing, setIsEditing] = useState(false); const [editedContent, setEditedContent] = useState(message.content); @@ -242,6 +253,17 @@ export const ChatMessage: React.FC = ({ setShowActionMenu(false); }; + const canSpeak = !isUser && !isStreaming && (canSpeakProp || ttsCanSpeak); + + const handleSpeak = () => { + setShowActionMenu(false); + if (onSpeakProp) { + onSpeakProp(); + return; + } + useTTSStore.getState().speak(displayContent, message.id); + }; + if (message.isSystemInfo) { return setAlertState(hideAlert())} />; @@ -291,6 +313,7 @@ export const ChatMessage: React.FC = ({ isStreaming={isStreaming} showActions={showActions} onMenuOpen={() => setShowActionMenu(true)} + metaExtra={metaExtra} /> {showGenerationDetails && !isUser && message.generationMeta && ( @@ -310,11 +333,13 @@ export const ChatMessage: React.FC = ({ canEdit={!!onEdit} canRetry={!!onRetry} canGenerateImage={canGenerateImage && !!onGenerateImage} + canSpeak={canSpeak} styles={styles} onCopy={handleCopy} onEdit={handleEdit} onRetry={handleRetry} onGenerateImage={handleGenerateImage} + onSpeak={handleSpeak} /> ({ overflow: 'hidden' as const, width: '100%' as const, }, + /** Constrains the ThinkingBlock when rendered outside a message bubble (e.g. 
ToolCallWithThinking) */ + thinkingBlockWrapper: { + width: '88%' as const, + alignSelf: 'flex-start' as const, + }, thinkingHeader: { flexDirection: 'row' as const, alignItems: 'flex-start' as const, diff --git a/src/components/ChatMessage/types.ts b/src/components/ChatMessage/types.ts index f93ef8ec2..becd367aa 100644 --- a/src/components/ChatMessage/types.ts +++ b/src/components/ChatMessage/types.ts @@ -10,8 +10,12 @@ export interface ChatMessageProps { onGenerateImage?: (prompt: string) => void; showActions?: boolean; canGenerateImage?: boolean; + canSpeak?: boolean; + onSpeak?: () => void; showGenerationDetails?: boolean; animateEntry?: boolean; + /** Extra element rendered at the end of the meta row (e.g. TTSButton) */ + metaExtra?: React.ReactNode; } export interface ParsedContent { diff --git a/src/components/EngineBridge.tsx b/src/components/EngineBridge.tsx new file mode 100644 index 000000000..a877b0113 --- /dev/null +++ b/src/components/EngineBridge.tsx @@ -0,0 +1,37 @@ +/** + * EngineBridge + * + * Renders the React bridge component for the currently active TTS engine + * (if it needs one). Mount once at the app root. + * + * Engines that are fully imperative (OuteTTS, Qwen3-TTS) return null + * from getBridgeComponent() and this renders nothing. + * + * Hook-based engines (Kokoro) return a component that mounts their + * React hooks and registers imperative handles with the engine instance. + * + * Platform gating: if the engine declares platformRequirements and the + * device doesn't meet them, the bridge is not rendered (prevents crashes + * from mounting native hooks on unsupported OS versions). 
+ */ +import React, { useMemo } from 'react'; +import { useTTSStore } from '../stores/ttsStore'; +import { ttsRegistry } from '../engine'; + +export const EngineBridge: React.FC = () => { + const engineId = useTTSStore(s => s.settings.engineId); + + const BridgeComponent = useMemo(() => { + if (!ttsRegistry.has(engineId)) return null; + try { + const engine = ttsRegistry.getEngine(engineId); + if (!engine.isSupported()) return null; + return engine.getBridgeComponent(); + } catch { + return null; + } + }, [engineId]); + + if (!BridgeComponent) return null; + return ; +}; diff --git a/src/components/GenerationSettingsModal/ImageQualitySliders.tsx b/src/components/GenerationSettingsModal/ImageQualitySliders.tsx index f1e0544dc..2feac93a0 100644 --- a/src/components/GenerationSettingsModal/ImageQualitySliders.tsx +++ b/src/components/GenerationSettingsModal/ImageQualitySliders.tsx @@ -1,6 +1,6 @@ import React from 'react'; import { View, Text, Switch, Platform, TouchableOpacity } from 'react-native'; -import Slider from '@react-native-community/slider'; +import { NumericStepper } from '../NumericStepper'; import { useTheme, useThemedStyles } from '../../theme'; import { useAppStore } from '../../stores'; import { useClearGpuCache } from '../../hooks/useImageGenerationSettings'; @@ -24,70 +24,38 @@ const ClearGPUCacheButton: React.FC = () => { ); }; -/** Basic sliders: Image Steps + Image Size */ +/** Basic controls: Image Steps + Image Size */ export const ImageQualityBasicSliders: React.FC = () => { - const { colors } = useTheme(); const styles = useThemedStyles(createStyles); const { settings, updateSettings } = useAppStore(); return ( <> - - Image Steps - {settings.imageSteps || 8} - - - 4-8 steps for speed, 20-50 for quality - - Image Steps + 4-8 steps for speed, 20-50 for quality + updateSettings({ imageSteps: value })} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surfaceLight} - thumbTintColor={colors.primary} + min={4} max={50} 
step={1} + onChange={(value) => updateSettings({ imageSteps: value })} /> - - 4 - 50 - - - Image Size - - {settings.imageWidth ?? 256}x{settings.imageHeight ?? 256} - - - - Output resolution (smaller = faster, larger = more detail) - - Image Size + Output resolution (smaller = faster, larger = more detail) + updateSettings({ imageWidth: value, imageHeight: value })} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surfaceLight} - thumbTintColor={colors.primary} + min={128} max={512} step={64} + formatValue={(v) => `${v}x${v}`} + onChange={(value) => updateSettings({ imageWidth: value, imageHeight: value })} /> - - 128 - 512 - ); }; -/** Advanced sliders: Guidance Scale, Image Threads, GPU Acceleration */ +/** Advanced controls: Guidance Scale, Image Threads, GPU Acceleration */ export const ImageQualityAdvancedSliders: React.FC = () => { const { colors } = useTheme(); const styles = useThemedStyles(createStyles); @@ -96,53 +64,23 @@ export const ImageQualityAdvancedSliders: React.FC = () => { return ( <> - - Guidance Scale - {(settings.imageGuidanceScale || 7.5).toFixed(1)} - - - Higher = follows prompt more strictly (5-15 range) - - Guidance Scale + Higher = follows prompt more strictly (5-15 range) + updateSettings({ imageGuidanceScale: value })} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surfaceLight} - thumbTintColor={colors.primary} + min={1} max={20} step={0.5} decimals={1} + onChange={(value) => updateSettings({ imageGuidanceScale: value })} /> - - 1 - 20 - - - Image Threads - {settings.imageThreads ?? 4} - - - CPU threads used for image generation. Takes effect next time the image model loads. - - Image Threads + CPU threads used for image generation. Takes effect next time the image model loads. 
+ updateSettings({ imageThreads: value })} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surfaceLight} - thumbTintColor={colors.primary} + min={1} max={8} step={1} + onChange={(value) => updateSettings({ imageThreads: value })} /> - - 1 - 8 - {Platform.OS === 'android' && ( @@ -157,7 +95,7 @@ export const ImageQualityAdvancedSliders: React.FC = () => { /> - Use GPU for faster image generation. First run may be slower while optimizing for your device. For best performance, use NPU models on supported Snapdragon devices. + Use GPU for faster image generation. First run may be slower while optimizing for your device. {(settings.imageUseOpenCL ?? true) && } diff --git a/src/components/GenerationSettingsModal/TTSSection.tsx b/src/components/GenerationSettingsModal/TTSSection.tsx new file mode 100644 index 000000000..cf4f384cf --- /dev/null +++ b/src/components/GenerationSettingsModal/TTSSection.tsx @@ -0,0 +1,227 @@ +import React from 'react'; +import { View, Text, Switch, TouchableOpacity, ActivityIndicator } from 'react-native'; +import Icon from 'react-native-vector-icons/Feather'; +import { NumericStepper } from '../NumericStepper'; +import { useTheme, useThemedStyles } from '../../theme'; +import type { ThemeColors, ThemeShadows } from '../../theme'; +import { SPACING } from '../../constants'; +import { useTTSStore } from '../../stores/ttsStore'; +import { createStyles as createModalStyles } from './styles'; + +const createLocalStyles = (colors: ThemeColors, _shadows: ThemeShadows) => ({ + modeChipDisabled: { opacity: 0.4 as const }, + linkButton: { + alignSelf: 'flex-start' as const, + paddingHorizontal: SPACING.md, + paddingVertical: SPACING.sm, + borderRadius: 8, + borderWidth: 1, + borderColor: colors.border, + marginTop: SPACING.sm, + }, + linkButtonRow: { flexDirection: 'row' as const, alignItems: 'center' as const, gap: SPACING.xs }, + flex1: { flex: 1 }, + toggleRow: { + flexDirection: 'row' as const, + alignItems: 'center' as 
const, + justifyContent: 'space-between' as const, + marginBottom: SPACING.lg, + }, + toggleInfo: { flex: 1 }, + noBottomMargin: { marginBottom: 0 }, + divider: { height: 1, backgroundColor: colors.border, marginBottom: SPACING.lg }, + voiceRow: { + flexDirection: 'row' as const, + alignItems: 'center' as const, + justifyContent: 'space-between' as const, + paddingVertical: SPACING.sm, + }, + voiceRowBorder: { borderTopWidth: 1, borderTopColor: colors.border }, + voiceInfo: { flex: 1 }, + voiceName: { fontSize: 13, color: colors.text }, + voiceMeta: { fontSize: 11, color: colors.textMuted, marginTop: 2 }, + voiceSectionHeader: { + flexDirection: 'row' as const, + alignItems: 'center' as const, + justifyContent: 'space-between' as const, + marginBottom: SPACING.sm, + }, + voiceSectionLabel: { fontSize: 11, color: colors.textMuted, textTransform: 'uppercase' as const, letterSpacing: 0.3 }, + downloadRow: { flexDirection: 'row' as const, alignItems: 'center' as const, gap: SPACING.sm, marginBottom: SPACING.md }, + downloadText: { fontSize: 12, color: colors.textSecondary, flex: 1 }, +}); + +// ── Mode Picker ────────────────────────────────────────────────────────────── + +const ModePicker: React.FC<{ audioAvailable: boolean }> = ({ audioAvailable }) => { + const modal = useThemedStyles(createModalStyles); + const local = useThemedStyles(createLocalStyles); + const { settings, updateSettings, initializeEngine } = useTTSStore(); + const mode = settings.interfaceMode; + + const handleModeChange = (next: 'chat' | 'audio') => { + if (next === 'audio' && !audioAvailable) return; + updateSettings({ interfaceMode: next }); + if (next === 'audio') initializeEngine(); + }; + + return ( + + + Interface Mode + + {mode === 'audio' + ? 
'Audio Mode — responses rendered as voice notes' + : 'Chat Mode — play button added to text messages'} + + + + {(['chat', 'audio'] as const).map((m) => { + const active = mode === m; + const disabled = m === 'audio' && !audioAvailable; + return ( + handleModeChange(m)} + disabled={disabled} + > + + {m === 'chat' ? 'Chat' : 'Audio'} + + + ); + })} + + + ); +}; + +// ── Voice Picker ───────────────────────────────────────────────────────────── + +const VoicePicker: React.FC = () => { + const { colors } = useTheme(); + const local = useThemedStyles(createLocalStyles); + const { voices, activeVoiceId, isReady, isDownloading, overallDownloadProgress, setVoice } = useTTSStore(); + + return ( + + + Voice + {isDownloading && ( + {Math.round(overallDownloadProgress * 100)}% + )} + {!isReady && !isDownloading && ( + + )} + {isReady && ( + + )} + + + {voices.map((voice, i) => { + const active = voice.id === activeVoiceId; + return ( + 0 && local.voiceRowBorder]} + onPress={() => setVoice(voice.id)} + > + + {voice.label} + + {voice.metadata.accent ? `${voice.metadata.accent} · ` : ''} + {voice.metadata.gender || ''} + + + {active && } + + ); + })} + + + + ); +}; + +// ── Main TTS Section ───────────────────────────────────────────────────────── + +interface TTSSectionProps { + onNavigateToTTSSettings?: () => void; +} + +export const TTSSection: React.FC = ({ onNavigateToTTSSettings }) => { + const { colors } = useTheme(); + const modal = useThemedStyles(createModalStyles); + const local = useThemedStyles(createLocalStyles); + const { settings, updateSettings, isReady } = useTTSStore(); + + const trackColor = { false: colors.surfaceLight, true: `${colors.primary}80` }; + const isChatMode = settings.interfaceMode === 'chat'; + + if (!isReady) { + return ( + + + No voice models downloaded. Go to TTS Settings to download them. 
+ + {onNavigateToTTSSettings && ( + + + + TTS Settings + + + )} + + ); + } + + return ( + + + + {isChatMode && ( + + + Enable TTS + Show play buttons on assistant messages + + updateSettings({ enabled: v })} + trackColor={trackColor} + thumbColor={settings.enabled ? colors.primary : colors.textMuted} + /> + + )} + + + + + Speed + `${v.toFixed(1)}x`} + onChange={(v) => updateSettings({ speed: v })} + /> + + + {isChatMode && ( + + + Auto-play + Speak AI responses automatically + + updateSettings({ autoPlay: v })} + trackColor={trackColor} + thumbColor={settings.autoPlay ? colors.primary : colors.textMuted} + /> + + )} + + ); +}; diff --git a/src/components/GenerationSettingsModal/TextGenerationAdvanced.tsx b/src/components/GenerationSettingsModal/TextGenerationAdvanced.tsx index 0b017e571..3d44a9998 100644 --- a/src/components/GenerationSettingsModal/TextGenerationAdvanced.tsx +++ b/src/components/GenerationSettingsModal/TextGenerationAdvanced.tsx @@ -1,7 +1,7 @@ import React from 'react'; import { View, Text, TouchableOpacity } from 'react-native'; -import Slider from '@react-native-community/slider'; -import { useTheme, useThemedStyles } from '../../theme'; +import { NumericStepper } from '../NumericStepper'; +import { useThemedStyles } from '../../theme'; import { useAppStore } from '../../stores'; import { CacheType } from '../../types'; import { @@ -15,7 +15,6 @@ import { createStyles } from './styles'; // ─── GPU Acceleration ───────────────────────────────────────────────────────── export const GpuAccelerationToggle: React.FC = () => { - const { colors } = useTheme(); const styles = useThemedStyles(createStyles); const { settings, updateSettings } = useAppStore(); const { gpuLayersEffective, handleGpuToggle } = useTextGenerationAdvanced(); @@ -51,24 +50,15 @@ export const GpuAccelerationToggle: React.FC = () => { {settings.enableGpu && ( - - GPU Layers - {gpuLayersEffective} - + GPU Layers Layers offloaded to GPU. 
Higher = faster but may crash on low-VRAM devices. Requires model reload. - updateSettings({ gpuLayers: value })} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surfaceLight} - thumbTintColor={colors.primary} + min={1} max={GPU_LAYERS_MAX} step={1} + onChange={(value) => updateSettings({ gpuLayers: value })} /> )} @@ -199,56 +189,34 @@ export const ModelLoadingStrategyToggle: React.FC = () => { // ─── CPU Threads & Batch Size ──────────────────────────────────────────────── export const CpuThreadsSlider: React.FC = () => { - const { colors } = useTheme(); const styles = useThemedStyles(createStyles); const { settings, updateSettings } = useAppStore(); - const value = settings.nThreads ?? 6; return ( - - CPU Threads - {value} - + CPU Threads Parallel threads for inference - updateSettings({ nThreads: v })} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surfaceLight} - thumbTintColor={colors.primary} + updateSettings({ nThreads: v })} /> ); }; export const BatchSizeSlider: React.FC = () => { - const { colors } = useTheme(); const styles = useThemedStyles(createStyles); const { settings, updateSettings } = useAppStore(); - const value = settings.nBatch ?? 
512; return ( - - Batch Size - {value} - + Batch Size Tokens processed per batch - updateSettings({ nBatch: v })} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surfaceLight} - thumbTintColor={colors.primary} + updateSettings({ nBatch: v })} /> ); diff --git a/src/components/GenerationSettingsModal/TextGenerationSection.tsx b/src/components/GenerationSettingsModal/TextGenerationSection.tsx index 18ed0c031..9ef8070da 100644 --- a/src/components/GenerationSettingsModal/TextGenerationSection.tsx +++ b/src/components/GenerationSettingsModal/TextGenerationSection.tsx @@ -1,6 +1,6 @@ import React, { useState } from 'react'; import { View, Text, TouchableOpacity, Platform } from 'react-native'; -import Slider from '@react-native-community/slider'; +import { NumericStepper } from '../NumericStepper'; import { AdvancedToggle } from '../AdvancedToggle'; import { useTheme, useThemedStyles } from '../../theme'; import { useAppStore } from '../../stores'; @@ -103,35 +103,23 @@ const SettingSlider: React.FC = ({ config }) => { const rawValue = (settings as Record)[config.key]; const value = (rawValue ?? DEFAULT_SETTINGS[config.key]) as number; const warningText = config.warning?.(value) ?? null; + const decimals = config.step < 1 ? 
2 : 0; return ( - - {config.label} - {config.format(value)} - + {config.label} {config.description && ( {config.description} )} {warningText && ( {warningText} )} - updateSettings({ [config.key]: v })} - onSlidingComplete={() => {}} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surfaceLight} - thumbTintColor={colors.primary} + min={config.min} max={config.max} step={config.step} decimals={decimals} + formatValue={config.format} + onChange={(v) => updateSettings({ [config.key]: v })} /> - - {config.format(config.min)} - {config.format(config.max)} - ); }; diff --git a/src/components/GenerationSettingsModal/index.tsx b/src/components/GenerationSettingsModal/index.tsx index b23a3b74a..fa54ea964 100644 --- a/src/components/GenerationSettingsModal/index.tsx +++ b/src/components/GenerationSettingsModal/index.tsx @@ -9,6 +9,7 @@ import { createStyles } from './styles'; import { ConversationActionsSection } from './ConversationActionsSection'; import { ImageGenerationSection } from './ImageGenerationSection'; import { TextGenerationSection } from './TextGenerationSection'; +import { TTSSection } from './TTSSection'; const DEFAULT_SETTINGS = { temperature: 0.7, @@ -26,6 +27,7 @@ interface GenerationSettingsModalProps { onOpenProject?: () => void; onOpenGallery?: () => void; onDeleteConversation?: () => void; + onOpenTTSSettings?: () => void; conversationImageCount?: number; activeProjectName?: string | null; isRemote?: boolean; @@ -37,6 +39,7 @@ export const GenerationSettingsModal: React.FC = ( onOpenProject, onOpenGallery, onDeleteConversation, + onOpenTTSSettings, conversationImageCount = 0, activeProjectName, isRemote, @@ -48,6 +51,7 @@ export const GenerationSettingsModal: React.FC = ( const [performanceStats, setPerformanceStats] = useState(llmService.getPerformanceStats()); const [imageSettingsOpen, setImageSettingsOpen] = useState(false); const [textSettingsOpen, setTextSettingsOpen] = useState(false); + const [ttsSettingsOpen, 
setTtsSettingsOpen] = useState(false); useEffect(() => { if (visible) { @@ -144,6 +148,23 @@ export const GenerationSettingsModal: React.FC = ( )} + {/* TTS SETTINGS */} + setTtsSettingsOpen(!ttsSettingsOpen)} + activeOpacity={0.7} + > + TEXT TO SPEECH + + + {ttsSettingsOpen && ( + + )} + Reset to Defaults diff --git a/src/components/MarkdownText.tsx b/src/components/MarkdownText.tsx index 78d6c9ae1..233a606a3 100644 --- a/src/components/MarkdownText.tsx +++ b/src/components/MarkdownText.tsx @@ -1,5 +1,5 @@ import React, { useCallback, useMemo } from 'react'; -import { Linking, Pressable, Text, StyleSheet } from 'react-native'; +import { Linking, Text } from 'react-native'; import Markdown from '@ronradtke/react-native-markdown-display'; import { useTheme } from '../theme'; import type { ThemeColors } from '../theme'; @@ -14,21 +14,17 @@ export function preprocessMarkdown(text: string): string { return text.replaceAll(/(\d)\*(?=\d)/g, String.raw`$1\*`); } -const linkWrapperStyles = StyleSheet.create({ - pressable: { flexShrink: 1, paddingBottom: 6 }, -}); - -/** Custom link rule that constrains the Pressable wrapper width */ +/** Custom link rule — renders as inline Text so it wraps correctly inside list items */ function createLinkRule(onPress: (url: string) => void) { - return (node: any, renderChildren: any, _parent: any) => ( - ( + onPress(node.attributes?.href ?? 
'')} > - {renderChildren} - + {children} + ); } diff --git a/src/components/NumericStepper.tsx b/src/components/NumericStepper.tsx new file mode 100644 index 000000000..342cc6694 --- /dev/null +++ b/src/components/NumericStepper.tsx @@ -0,0 +1,105 @@ +import React from 'react'; +import { View, Text, TouchableOpacity, StyleSheet } from 'react-native'; +import Icon from 'react-native-vector-icons/Feather'; +import { useTheme } from '../theme'; +import { TYPOGRAPHY, SPACING } from '../constants'; + +interface NumericStepperProps { + value: number; + min: number; + max: number; + step: number; + decimals?: number; + onChange: (value: number) => void; + formatValue?: (value: number) => string; + testID?: string; +} + +export const NumericStepper: React.FC = ({ + value, + min, + max, + step, + decimals = 0, + onChange, + formatValue, + testID, +}) => { + const { colors } = useTheme(); + + const round = (v: number) => Math.round(v / step) * step; + + const decrement = () => { + const next = round(value - step); + if (next >= min) onChange(parseFloat(next.toFixed(decimals))); + }; + + const increment = () => { + const next = round(value + step); + if (next <= max) onChange(parseFloat(next.toFixed(decimals))); + }; + + const display = formatValue ? 
formatValue(value) : value.toFixed(decimals); + const canDecrement = value > min; + const canIncrement = value < max; + + return ( + + + + + + + {display} + + + + + + + ); +}; + +const styles = StyleSheet.create({ + row: { + flexDirection: 'row', + alignItems: 'center', + justifyContent: 'center', + gap: SPACING.sm, + marginTop: SPACING.sm, + }, + button: { + width: 32, + height: 32, + borderRadius: 8, + borderWidth: 1, + alignItems: 'center', + justifyContent: 'center', + }, + buttonDisabled: { + opacity: 0.35, + }, + value: { + ...TYPOGRAPHY.body, + fontWeight: '400', + minWidth: 72, + textAlign: 'center', + paddingHorizontal: SPACING.sm, + paddingVertical: SPACING.xs, + borderRadius: 8, + borderWidth: 1, + overflow: 'hidden', + }, +}); diff --git a/src/components/TTSButton/index.tsx b/src/components/TTSButton/index.tsx new file mode 100644 index 000000000..38335e0b7 --- /dev/null +++ b/src/components/TTSButton/index.tsx @@ -0,0 +1,96 @@ +import React, { useEffect } from 'react'; +import { TouchableOpacity, ActivityIndicator, StyleSheet } from 'react-native'; +import Animated, { + useSharedValue, + useAnimatedStyle, + withRepeat, + withSequence, + withTiming, +} from 'react-native-reanimated'; +import Icon from 'react-native-vector-icons/Feather'; +import { useTheme } from '../../theme'; +import { useTTSStore } from '../../stores/ttsStore'; +import { SPACING } from '../../constants'; + +interface TTSButtonProps { + text: string; + messageId: string; +} + +export const TTSButton: React.FC = ({ text, messageId }) => { + const { colors } = useTheme(); + const { + speak, + stop, + isSpeaking, + isGeneratingAudio, + isLoading, + isReady, + currentMessageId, + settings, + } = useTTSStore(); + + const isThisMessage = currentMessageId === messageId; + const isThisMessageGenerating = isGeneratingAudio && isThisMessage; + const isThisMessageSpeaking = isSpeaking && !isGeneratingAudio && isThisMessage; + + const opacity = useSharedValue(1); + useEffect(() => { + if 
(isThisMessageSpeaking) { + opacity.value = withRepeat( + withSequence( + withTiming(0.4, { duration: 600 }), + withTiming(1, { duration: 600 }), + ), + -1, + false, + ); + } else { + opacity.value = withTiming(1, { duration: 200 }); + } + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [isThisMessageSpeaking]); + + const animatedStyle = useAnimatedStyle(() => ({ opacity: opacity.value })); + + // Don't render if TTS disabled or engine not ready + if (!settings.enabled || !isReady) { + return null; + } + + // Show spinner while loading or generating audio tokens + if ((isLoading && isThisMessage) || isThisMessageGenerating) { + return ; + } + + const handlePress = () => { + if (isThisMessageSpeaking || isThisMessageGenerating) { + stop(); + return; + } + speak(text, messageId); + }; + + return ( + + + + + + ); +}; + +const styles = StyleSheet.create({ + button: { + padding: SPACING.xs, + }, +}); diff --git a/src/components/VoiceRecordButton/index.tsx b/src/components/VoiceRecordButton/index.tsx index bd1cca737..6844c05f6 100644 --- a/src/components/VoiceRecordButton/index.tsx +++ b/src/components/VoiceRecordButton/index.tsx @@ -9,6 +9,7 @@ import { PanResponderGestureState, Vibration, } from 'react-native'; +import Icon from 'react-native-vector-icons/Feather'; import ReanimatedAnimated, { useSharedValue, useAnimatedStyle, @@ -16,15 +17,16 @@ import ReanimatedAnimated, { withTiming, Easing, } from 'react-native-reanimated'; -import { useNavigation } from '@react-navigation/native'; -import { NativeStackNavigationProp } from '@react-navigation/native-stack'; import { useThemedStyles } from '../../theme'; import { CustomAlert, showAlert, hideAlert, AlertState, initialAlertState } from '../CustomAlert'; import { createStyles } from './styles'; import { LoadingState, TranscribingState, UnavailableButton, ButtonIcon } from './states'; -import { RootStackParamList } from '../../navigation/types'; +import { useWhisperStore } from '../../stores'; import logger 
from '../../utils/logger'; +const DOWNLOAD_MODEL_ID = 'small.en'; +const DOWNLOAD_MODEL_SIZE_MB = 466; + interface VoiceRecordButtonProps { isRecording: boolean; isAvailable: boolean; @@ -95,7 +97,7 @@ export const VoiceRecordButton: React.FC = ({ isModelLoading, isTranscribing, partialResult, - error, + error: _error, disabled, onStartRecording, onStopRecording, @@ -103,7 +105,7 @@ export const VoiceRecordButton: React.FC = ({ asSendButton = false, }) => { const styles = useThemedStyles(createStyles); - const navigation = useNavigation>(); + const { downloadModel, isDownloading, downloadProgress } = useWhisperStore(); const pulseAnim = useRef(new Animated.Value(1)).current; const loadingAnim = useRef(new Animated.Value(0)).current; @@ -125,6 +127,7 @@ export const VoiceRecordButton: React.FC = ({ rippleOpacity.value = 0; } + // eslint-disable-next-line react-hooks/exhaustive-deps }, [isRecording]); const rippleStyle = useAnimatedStyle(() => ({ @@ -161,15 +164,20 @@ export const VoiceRecordButton: React.FC = ({ const panResponder = useRef(buildPanResponder({ isDraggingToCancel, cancelOffsetX, callbacksRef })).current; const handleUnavailableTap = () => { - const errorDetail = error || 'No transcription model downloaded'; + if (isDownloading) { return; } setAlertState(showAlert( - 'Voice Input Unavailable', - `${errorDetail}\n\nDownload a Whisper model to enable on-device voice input.`, + 'Download Voice Model', + `Download Whisper Small to enable voice input? 
(${DOWNLOAD_MODEL_SIZE_MB} MB)`, [ - { text: 'Cancel' }, + { text: 'Cancel', style: 'cancel' }, { - text: 'Go to Voice Settings', - onPress: () => navigation.navigate('VoiceSettings'), + text: 'Download', + onPress: () => { + setAlertState(hideAlert()); + downloadModel(DOWNLOAD_MODEL_ID).catch((err) => { + logger.error('[VoiceRecordButton] Download failed:', err); + }); + }, }, ], )); @@ -206,8 +214,8 @@ export const VoiceRecordButton: React.FC = ({ if (!isAvailable) { return ( - - + + {alert} @@ -221,6 +229,42 @@ export const VoiceRecordButton: React.FC = ({ disabled && styles.buttonDisabled, ]; + // ── Audio mode: tap-to-toggle (tap to start, tap to stop & send) ─────────── + if (!asSendButton) { + const handleToggle = () => { + if (disabled) return; + Vibration.vibrate(50); + if (isRecording) { + onStopRecording(); + } else { + onStartRecording(); + } + }; + + return ( + + {isRecording && } + + + + {isRecording + ? + : } + + + + {alert} + + ); + } + + // ── Chat mode: hold-to-record with slide-to-cancel ───────────────────────── return ( {isRecording && ( diff --git a/src/components/VoiceRecordButton/states.tsx b/src/components/VoiceRecordButton/states.tsx index d0ba1ab22..889a820c7 100644 --- a/src/components/VoiceRecordButton/states.tsx +++ b/src/components/VoiceRecordButton/states.tsx @@ -43,7 +43,6 @@ export const TranscribingState: React.FC = ({ asSendButt {asSendButton ? 
: } - {!asSendButton && Transcribing...} ); }; @@ -52,16 +51,30 @@ export const TranscribingState: React.FC = ({ asSendButt interface UnavailableButtonProps { asSendButton: boolean; + /** 0–1 while downloading, undefined when idle */ + downloadProgress?: number; } -export const UnavailableButton: React.FC = ({ asSendButton }) => { +export const UnavailableButton: React.FC = ({ asSendButton, downloadProgress }) => { const { colors } = useTheme(); const styles = useThemedStyles(createStyles); + const isDownloading = downloadProgress !== undefined; + + if (asSendButton) { + return ( + + + + ); + } return ( - - {asSendButton ? ( - + + {isDownloading ? ( + <> + + {Math.round(downloadProgress * 100)}% + ) : ( <> diff --git a/src/constants/kokoroModels.ts b/src/constants/kokoroModels.ts new file mode 100644 index 000000000..333555f2c --- /dev/null +++ b/src/constants/kokoroModels.ts @@ -0,0 +1,24 @@ +/** + * @deprecated — Use imports from 'src/engine' instead. + * This file re-exports for backward compatibility with any remaining consumers. 
+ */ +export { + KOKORO_VOICES, + DEFAULT_KOKORO_VOICE_ID, + getKokoroVoiceConfig, +} from '../engine/tts/engines/kokoro/voices'; +export type { KokoroVoiceId } from '../engine/tts/engines/kokoro/voices'; +export { KOKORO_MEDIUM } from 'react-native-executorch'; + +import { Platform } from 'react-native'; + +/** @deprecated — Use engine.isSupported() instead */ +export function isExecutorchSupported(): boolean { + if (Platform.OS === 'android') { + return (Platform.Version as number) >= 26; + } + if (Platform.OS === 'ios') { + return parseInt(Platform.Version as string, 10) >= 17; + } + return false; +} diff --git a/src/constants/ttsModels.ts b/src/constants/ttsModels.ts new file mode 100644 index 000000000..f93dfe856 --- /dev/null +++ b/src/constants/ttsModels.ts @@ -0,0 +1,25 @@ +export const TTS_BACKBONE_MODEL = { + id: 'outetts-0.3-500m-q4', + name: 'OuteTTS 0.3', + backboneFile: 'OuteTTS-0.3-500M-Q4_K_M.gguf', + backboneUrl: + 'https://huggingface.co/OuteAI/OuteTTS-0.3-500M-GGUF/resolve/main/OuteTTS-0.3-500M-Q4_K_M.gguf', + backboneSizeMB: 454, + vocoderFile: 'WavTokenizer-Large-75-Q5_1.gguf', + vocoderUrl: + 'https://huggingface.co/ggml-org/WavTokenizer/resolve/main/WavTokenizer-Large-75-Q5_1.gguf', + vocoderSizeMB: 73, + sampleRate: 24000, + description: 'Natural-sounding on-device speech. Requires ~530 MB storage.', +}; + +export const TTS_SPEAKER_PROFILES = [ + { id: '0', label: 'Default' }, +]; + +/** Warn user if device RAM is below this threshold */ +export const TTS_WARN_RAM_GB = 8; +/** Hard-block TTS on devices below this threshold */ +export const TTS_BLOCK_RAM_GB = 6; +/** Max cached audio messages per conversation before eviction */ +export const AUDIO_CACHE_MAX_MESSAGES = 50; diff --git a/src/engine/EngineRegistry.ts b/src/engine/EngineRegistry.ts new file mode 100644 index 000000000..78d5711f3 --- /dev/null +++ b/src/engine/EngineRegistry.ts @@ -0,0 +1,116 @@ +/** + * Generic engine registry. 
+ * + * Works for any modality — TTS, STT, Vision, LLM. + * Engines register a factory; the registry lazily instantiates and + * manages the active engine lifecycle. + * + * Usage: + * const ttsRegistry = new EngineRegistry(); + * ttsRegistry.register('kokoro', () => new KokoroEngine()); + * await ttsRegistry.setActiveEngine('kokoro'); + */ +import type { OnDeviceEngine, BaseEngineEvents } from './types'; + +export type EngineFactory = () => T; + +interface Stoppable { stop(): void; } +function hasStop(obj: unknown): obj is Stoppable { + return typeof obj === 'object' && obj !== null && 'stop' in obj && typeof (obj as Stoppable).stop === 'function'; +} + +export class EngineRegistry< + T extends OnDeviceEngine, +> { + private _factories = new Map>(); + private _instances = new Map(); + private _activeId: string | null = null; + + /** Register an engine factory. Call once per engine at module load time. */ + register(id: string, factory: EngineFactory): void { + this._factories.set(id, factory); + } + + /** Unregister an engine. Releases instance if it exists. */ + async unregister(id: string): Promise { + const instance = this._instances.get(id); + if (instance) { + if (hasStop(instance)) instance.stop(); + await instance.release(); + this._instances.delete(id); + } + this._factories.delete(id); + if (this._activeId === id) { + this._activeId = null; + } + } + + /** All registered engine IDs */ + getRegisteredIds(): string[] { + return Array.from(this._factories.keys()); + } + + /** Check if an engine ID is registered */ + has(id: string): boolean { + return this._factories.has(id); + } + + /** Get or lazily create a singleton engine instance */ + getEngine(id: string): T { + let engine = this._instances.get(id); + if (!engine) { + const factory = this._factories.get(id); + if (!factory) { + throw new Error(`Engine '${id}' is not registered.`); + } + engine = factory(); + this._instances.set(id, engine); + } + return engine; + } + + /** + * Set the active engine. 
Stops and releases the previous one. + * Returns the newly active engine instance. + */ + async setActiveEngine(id: string): Promise { + if (this._activeId && this._activeId !== id) { + const prev = this._instances.get(this._activeId); + if (prev) { + try { + if (hasStop(prev)) prev.stop(); + await prev.release(); + } catch { + // Best-effort cleanup + } + } + } + this._activeId = id; + return this.getEngine(id); + } + + /** Currently active engine (null if none set) */ + getActiveEngine(): T | null { + if (!this._activeId) return null; + return this._instances.get(this._activeId) ?? null; + } + + /** Currently active engine ID (null if none set) */ + getActiveEngineId(): string | null { + return this._activeId; + } + + /** Release all engine instances */ + async releaseAll(): Promise { + for (const [, engine] of this._instances) { + try { + if (hasStop(engine)) engine.stop(); + await engine.release(); + } catch { + // Best-effort + } + } + this._instances.clear(); + this._activeId = null; + } +} diff --git a/src/engine/OnDeviceEngineEmitter.ts b/src/engine/OnDeviceEngineEmitter.ts new file mode 100644 index 000000000..b61bd6a27 --- /dev/null +++ b/src/engine/OnDeviceEngineEmitter.ts @@ -0,0 +1,71 @@ +/** + * Minimal typed event emitter for on-device engines. + * + * Engines extend this to get on/off/once/emit for free. + * Zero dependencies — no Node EventEmitter, no third-party lib. 
+ */ + +type Listener = (...args: any[]) => void; + +export class OnDeviceEngineEmitter< + TEvents extends Record = Record, +> { + private _listeners = new Map>(); + + on(event: K, listener: TEvents[K]): () => void { + const key = event as string; + if (!this._listeners.has(key)) { + this._listeners.set(key, new Set()); + } + this._listeners.get(key)!.add(listener as Listener); + return () => this.off(event, listener); + } + + off(event: K, listener: TEvents[K]): void { + this._listeners.get(event as string)?.delete(listener as Listener); + } + + once(event: K, listener: TEvents[K]): () => void { + const wrapper = ((...args: any[]) => { + this.off(event, wrapper as TEvents[K]); + (listener as Listener)(...args); + }) as TEvents[K]; + return this.on(event, wrapper); + } + + protected emit( + event: K, + ...args: Parameters + ): void { + const listeners = this._listeners.get(event as string); + if (!listeners) return; + for (const fn of listeners) { + try { + fn(...args); + } catch { + // Swallow event handler errors to prevent cascading failures + } + } + } + + /** Remove all listeners, optionally for a specific event */ + protected removeAllListeners(event?: keyof TEvents): void { + if (event) { + this._listeners.delete(event as string); + } else { + this._listeners.clear(); + } + } + + /** Current listener count, optionally for a specific event */ + protected listenerCount(event?: keyof TEvents): number { + if (event) { + return this._listeners.get(event as string)?.size ?? 0; + } + let count = 0; + for (const set of this._listeners.values()) { + count += set.size; + } + return count; + } +} diff --git a/src/engine/index.ts b/src/engine/index.ts new file mode 100644 index 000000000..e20172d27 --- /dev/null +++ b/src/engine/index.ts @@ -0,0 +1,52 @@ +/** + * On-Device Engine SDK + * + * Public API surface. Everything exported here is part of the SDK contract. 
+ */ + +// ── Types ───────────────────────────────────────────────────────────────── +export type { + // Base + EnginePhase, + ModelAsset, + ModelAssetStatus, + ModelAssetState, + EngineCapabilities, + BaseEngineEvents, + OnDeviceEngine, + // TTS + TTSVoice, + TTSEngineCapabilities, + TTSSpeakOptions, + TTSGenerateResult, + TTSEngineEvents, + TTSEngine, +} from './types'; + +// ── Classes ─────────────────────────────────────────────────────────────── +export { OnDeviceEngineEmitter } from './OnDeviceEngineEmitter'; +export { EngineRegistry } from './EngineRegistry'; +export type { EngineFactory } from './EngineRegistry'; + +// ── TTS Engines ────────────────────────────────────────────────────────── +export { KokoroEngine } from './tts/engines/kokoro'; +export { OuteTTSEngine } from './tts/engines/outetts'; +export { Qwen3TTSEngine } from './tts/engines/qwen3'; + +// Re-export Kokoro voice types for settings UI +export { KOKORO_VOICES, DEFAULT_KOKORO_VOICE_ID } from './tts/engines/kokoro'; +export type { KokoroVoiceId } from './tts/engines/kokoro'; + +// ── TTS Registry (singleton) ────────────────────────────────────────────── +import { EngineRegistry } from './EngineRegistry'; +import type { TTSEngine } from './types'; +import { KokoroEngine } from './tts/engines/kokoro'; +import { OuteTTSEngine } from './tts/engines/outetts'; +export const ttsRegistry = new EngineRegistry(); + +// Register built-in TTS engines +ttsRegistry.register('kokoro', () => new KokoroEngine()); +ttsRegistry.register('outetts', () => new OuteTTSEngine()); +// Qwen3-TTS stub — uncomment when inference pipeline is implemented: +// import { Qwen3TTSEngine } from './tts/engines/qwen3'; +// ttsRegistry.register('qwen3-tts', () => new Qwen3TTSEngine()); diff --git a/src/engine/tts/engines/kokoro/KokoroEngine.ts b/src/engine/tts/engines/kokoro/KokoroEngine.ts new file mode 100644 index 000000000..fa345454d --- /dev/null +++ b/src/engine/tts/engines/kokoro/KokoroEngine.ts @@ -0,0 +1,300 @@ +/** 
+ * KokoroEngine — TTSEngine implementation for Kokoro TTS via ExecuTorch. + * + * Wraps react-native-executorch's useTextToSpeech hook through a bridge + * component pattern. The bridge registers an imperative handle; the engine + * exposes the standard TTSEngine API. + */ +import { Platform } from 'react-native'; +import { OnDeviceEngineEmitter } from '../../../OnDeviceEngineEmitter'; +import type { + EnginePhase, + TTSEngine, + TTSEngineCapabilities, + TTSEngineEvents, + TTSSpeakOptions, + TTSGenerateResult, + TTSVoice, + ModelAsset, + ModelAssetState, +} from '../../../types'; +import { + KOKORO_VOICES, + DEFAULT_KOKORO_VOICE_ID, + getKokoroTTSVoices, +} from './voices'; +import type { KokoroVoiceId } from './voices'; +import { createKokoroTTSBridge } from './KokoroTTSBridge'; +import logger from '../../../../utils/logger'; + +/** Bridge interface: the React component pushes these into the engine */ +export interface KokoroBridgeHandle { + speak: (text: string, speed: number) => Promise; + stop: (instant?: boolean) => void; + pause: () => void; + resume: () => void; + setKeepAlive: (keepAlive: boolean) => void; +} + +export class KokoroEngine + extends OnDeviceEngineEmitter + implements TTSEngine +{ + readonly id = 'kokoro'; + readonly displayName = 'Kokoro TTS'; + readonly capabilities: TTSEngineCapabilities = { + streaming: true, + voiceCloning: false, + pauseResume: true, + generateAndSave: false, + platformRequirements: { + android: { minSdkVersion: 26 }, + ios: { minVersion: 17 }, + }, + peakRamMB: 82, + }; + + private _phase: EnginePhase = 'idle'; + private _bridge: KokoroBridgeHandle | null = null; + private _activeVoiceId: KokoroVoiceId = DEFAULT_KOKORO_VOICE_ID; + private _downloadProgress = 0; + private _currentMessageId: string | null = null; + private _playSessionId = 0; + private _BridgeComponent: React.ComponentType; + + constructor() { + super(); + this._BridgeComponent = createKokoroTTSBridge(this); + } + + // ── State 
─────────────────────────────────────────────────────────────── + + getPhase(): EnginePhase { + return this._phase; + } + + private _setPhase(phase: EnginePhase): void { + if (phase === this._phase) return; + const prev = this._phase; + this._phase = phase; + this.emit('phaseChange', phase, prev); + } + + // ── Bridge callbacks (called by KokoroTTSBridge) ──────────────────────── + + /** @internal Called by bridge when hook becomes ready or is torn down */ + _setBridge(handle: KokoroBridgeHandle | null, voiceId: KokoroVoiceId): void { + this._bridge = handle; + if (handle) { + this._activeVoiceId = voiceId; + this._setPhase('ready'); + logger.log('[KokoroEngine] Bridge registered, voice:', voiceId); + } else { + this._setPhase(this._downloadProgress > 0 && this._downloadProgress < 1 ? 'downloading' : 'idle'); + } + } + + /** @internal Called by bridge to sync download progress */ + _setDownloadProgress(progress: number): void { + this._downloadProgress = progress; + if (progress > 0 && progress < 1 && this._phase === 'idle') { + this._setPhase('downloading'); + } + this.emit('downloadProgress', { + assetId: 'kokoro-medium', + progress, + bytesWritten: 0, + totalBytes: 0, + }); + } + + /** @internal Called by bridge on each audio chunk */ + _onAudioChunk(data: { + samples: Float32Array; + sampleRate: number; + chunkIndex: number; + isFinal: boolean; + }): void { + this.emit('audioChunk', data); + } + + /** @internal Called by bridge on runtime error */ + _onBridgeError(message: string): void { + this._bridge = null; + this._setPhase('error'); + this.emit('error', { code: 'KOKORO_RUNTIME', message, recoverable: false }); + } + + // ── Lifecycle ─────────────────────────────────────────────────────────── + + isSupported(): boolean { + if (Platform.OS === 'android') { + return (Platform.Version as number) >= 26; + } + if (Platform.OS === 'ios') { + return parseInt(Platform.Version as string, 10) >= 17; + } + return false; + } + + async initialize(): Promise { + // 
No-op: Kokoro initializes when the bridge component mounts. + // The bridge calls _setBridge() which transitions to 'ready'. + } + + async release(): Promise { + this._bridge?.stop(true); + this._bridge = null; + this._currentMessageId = null; + this._setPhase('idle'); + } + + async destroy(): Promise { + await this.release(); + // Kokoro models are managed by executorch's internal cache + } + + // ── Assets ────────────────────────────────────────────────────────────── + + getRequiredAssets(): ModelAsset[] { + return [ + { + id: 'kokoro-medium', + label: 'Kokoro Medium', + url: '', // Managed internally by react-native-executorch + sizeBytes: 82 * 1024 * 1024, + filename: 'kokoro-medium', + }, + ]; + } + + async checkAssetStatus(): Promise { + const isReady = this._phase === 'ready'; + return [ + { + asset: this.getRequiredAssets()[0], + status: isReady ? 'downloaded' : this._downloadProgress > 0 ? 'downloading' : 'not-downloaded', + progress: isReady ? 1 : this._downloadProgress, + }, + ]; + } + + async downloadAssets(): Promise { + // Handled by react-native-executorch when the hook mounts + } + + async deleteAssets(): Promise { + await this.release(); + // Would need executorch API to clear its internal cache + } + + getOverallDownloadProgress(): number { + return this._phase === 'ready' ? 1 : this._downloadProgress; + } + + isFullyDownloaded(): boolean { + return this._phase === 'ready' || this._downloadProgress >= 1; + } + + // ── Voices ────────────────────────────────────────────────────────────── + + getVoices(): TTSVoice[] { + return getKokoroTTSVoices(); + } + + getActiveVoice(): TTSVoice | null { + return this.getVoices().find(v => v.id === this._activeVoiceId) ?? 
null; + } + + async setVoice(voiceId: string): Promise { + const valid = KOKORO_VOICES.find(v => v.id === voiceId); + if (!valid) { + throw new Error(`Unknown Kokoro voice: ${voiceId}`); + } + this._activeVoiceId = voiceId as KokoroVoiceId; + // Emit voiceChanged — the bridge component listens and does key-based remount + this.emit('voiceChanged', voiceId); + } + + // ── Speech ────────────────────────────────────────────────────────────── + + async speak(text: string, options?: TTSSpeakOptions): Promise { + if (!this._bridge) { + throw new Error('Kokoro bridge not mounted. Is the device supported?'); + } + + const speed = options?.speed ?? 1.0; + const messageId = options?.messageId ?? null; + + this._currentMessageId = messageId; + const sessionId = ++this._playSessionId; + this._setPhase('processing'); + + this._bridge.setKeepAlive(false); + + // Retry loop — executorch may still be busy from a previous stream + const MAX_RETRIES = 10; + for (let attempt = 0; attempt < MAX_RETRIES; attempt++) { + try { + logger.log('[KokoroEngine] speak attempt', attempt + 1); + await this._bridge.speak(text, speed); + break; + } catch (err: unknown) { + const errCode = (err as { code?: number })?.code; + if (errCode === 104 && attempt < MAX_RETRIES - 1) { + logger.log('[KokoroEngine] executorch busy, retrying in 200ms'); + await new Promise((r) => setTimeout(r, 200)); + continue; + } + this.emit('error', { + code: 'KOKORO_SPEAK', + message: err instanceof Error ? err.message : 'Speech failed', + recoverable: true, + }); + throw err; + } + } + + // Only clear state if this speak call still owns playback + if (this._playSessionId === sessionId) { + this._currentMessageId = null; + this._setPhase('ready'); + } + } + + async generateAndSave(): Promise { + throw new Error('Kokoro does not support generateAndSave. 
Use an engine with generateAndSave capability.'); + } + + async playFromFile(): Promise { + throw new Error('Kokoro does not support file playback.'); + } + + stop(): void { + this._bridge?.stop(true); + this._currentMessageId = null; + if (this._phase === 'processing' || this._phase === 'paused') { + this._setPhase(this._bridge ? 'ready' : 'idle'); + } + } + + pause(): void { + this._bridge?.pause(); + if (this._phase === 'processing') { + this._setPhase('paused'); + } + } + + resume(): void { + this._bridge?.resume(); + if (this._phase === 'paused') { + this._setPhase('processing'); + } + } + + // ── React Bridge ──────────────────────────────────────────────────────── + + getBridgeComponent(): React.ComponentType | null { + return this._BridgeComponent; + } +} diff --git a/src/engine/tts/engines/kokoro/KokoroTTSBridge.tsx b/src/engine/tts/engines/kokoro/KokoroTTSBridge.tsx new file mode 100644 index 000000000..0f29f6a55 --- /dev/null +++ b/src/engine/tts/engines/kokoro/KokoroTTSBridge.tsx @@ -0,0 +1,185 @@ +/** + * KokoroTTSBridge + * + * React component that mounts the react-native-executorch useTextToSpeech + * hook and registers imperative methods with the KokoroEngine instance. + * + * This replaces the old KokoroTTSManager. The key difference: instead of + * exposing module-level refs, it pushes its handle into the engine instance + * via engine._setBridge(). The engine owns the public API. + * + * Mount exactly once, near the root (via ), only on + * supported platforms. 
+ */ +import React, { useEffect, useRef } from 'react'; +import { useTextToSpeech } from 'react-native-executorch'; +import { AudioContext } from 'react-native-audio-api'; +import { KOKORO_MEDIUM } from 'react-native-executorch'; +import { getKokoroVoiceConfig } from './voices'; +import type { KokoroVoiceId } from './voices'; +import type { KokoroEngine, KokoroBridgeHandle } from './KokoroEngine'; +import logger from '../../../../utils/logger'; + +// ─── Inner component — holds the hook for a single voice ──────────────────── + +const KokoroTTSInner: React.FC<{ + voiceId: KokoroVoiceId; + engine: KokoroEngine; +}> = ({ voiceId, engine }) => { + const audioCtxRef = useRef(null); + const pendingResolvers = useRef void>>(new Set()); + const skipSuspendOnEnd = useRef(false); + + const tts = useTextToSpeech({ + model: KOKORO_MEDIUM, + voice: getKokoroVoiceConfig(voiceId), + }); + + // Sync readiness + download progress into the engine + useEffect(() => { + logger.log('[KokoroBridge] isReady=', tts.isReady, 'downloadProgress=', tts.downloadProgress); + engine._setDownloadProgress(tts.downloadProgress); + if (tts.isReady) { + // Register the bridge handle so the engine can call speak/stop/etc. 
+ const handle: KokoroBridgeHandle = { + speak: async (text: string, speed: number) => { + if (!audioCtxRef.current || audioCtxRef.current.state === 'closed') { + audioCtxRef.current = new AudioContext({ sampleRate: 24000 }); + } else if (audioCtxRef.current.state === 'suspended') { + await audioCtxRef.current.resume().catch(() => {}); + } + const ctx = audioCtxRef.current; + let chunkIndex = 0; + + try { + await tts.stream({ + text, + speed, + onNext: (chunk: Float32Array) => + new Promise((resolve) => { + pendingResolvers.current.add(resolve); + const done = () => { + pendingResolvers.current.delete(resolve); + resolve(); + }; + + // Emit audioChunk event so listeners can react + engine._onAudioChunk({ samples: chunk, sampleRate: 24000, chunkIndex, isFinal: false }); + chunkIndex++; + + const buffer = ctx.createBuffer(1, chunk.length, 24000); + buffer.copyToChannel(chunk, 0); + const source = ctx.createBufferSource(); + source.buffer = buffer; + source.playbackRate.value = speed; + source.connect(ctx.destination); + source.onEnded = done; + source.start(); + }), + onEnd: async () => { + // Emit final chunk marker + engine._onAudioChunk({ samples: new Float32Array(0), sampleRate: 24000, chunkIndex, isFinal: true }); + if (!skipSuspendOnEnd.current) { + await ctx.suspend().catch(() => {}); + } + }, + }); + } catch (err) { + logger.error('[KokoroBridge] stream error:', err); + throw err; + } + }, + + stop: (instant = true) => { + pendingResolvers.current.forEach((r) => r()); + pendingResolvers.current.clear(); + tts.streamStop(instant); + audioCtxRef.current?.close().catch(() => {}); + audioCtxRef.current = null; + }, + + pause: () => { + audioCtxRef.current?.suspend().catch(() => {}); + }, + + resume: () => { + audioCtxRef.current?.resume().catch(() => {}); + }, + + setKeepAlive: (keepAlive: boolean) => { + skipSuspendOnEnd.current = keepAlive; + }, + }; + + engine._setBridge(handle, voiceId); + } + }, [tts.isReady, tts.downloadProgress, voiceId, engine, tts]); + + 
useEffect(() => { + if (tts.error) { + logger.warn('[KokoroBridge] Runtime error:', tts.error); + engine._onBridgeError(String(tts.error)); + } + }, [tts.error, engine]); + + // Clean up on unmount + useEffect(() => { + return () => { + logger.log('[KokoroBridge] Inner unmounting'); + engine._setBridge(null, voiceId); + }; + // eslint-disable-next-line react-hooks/exhaustive-deps + }, []); + + return null; +}; + +// ─── Outer component — manages voice switching via key-based remount ──────── + +export function createKokoroTTSBridge(engine: KokoroEngine): React.FC { + return function KokoroTTSBridgeOuter() { + const [activeVoiceId, setActiveVoiceId] = React.useState( + (engine.getActiveVoice()?.id as KokoroVoiceId) ?? 'af_heart', + ); + const cooldownRef = useRef | null>(null); + const lastStreamEndRef = useRef(0); + + // Listen for voice changes from the engine + useEffect(() => { + const unsub = engine.on('voiceChanged', (voiceId) => { + const newVoice = voiceId as KokoroVoiceId; + if (newVoice === activeVoiceId) return; + + // Cooldown before remount to let executorch clean up + const elapsed = Date.now() - lastStreamEndRef.current; + const waitMs = Math.max(100, 2000 - elapsed); + + logger.log('[KokoroBridge] Voice change cooldown:', waitMs, 'ms'); + engine._setDownloadProgress(0); // Show loader during switch + + if (cooldownRef.current) clearTimeout(cooldownRef.current); + cooldownRef.current = setTimeout(() => { + setActiveVoiceId(newVoice); + cooldownRef.current = null; + }, waitMs); + }); + + return () => { + unsub(); + if (cooldownRef.current) clearTimeout(cooldownRef.current); + }; + }, [activeVoiceId]); + + // Track stream end time for cooldown calculation + useEffect(() => { + const unsub = engine.on('phaseChange', (phase, prev) => { + if (prev === 'processing' && (phase === 'ready' || phase === 'idle')) { + lastStreamEndRef.current = Date.now(); + } + }); + return unsub; + }, []); + + return ; + }; +} diff --git a/src/engine/tts/engines/kokoro/index.ts 
b/src/engine/tts/engines/kokoro/index.ts new file mode 100644 index 000000000..9ae77834d --- /dev/null +++ b/src/engine/tts/engines/kokoro/index.ts @@ -0,0 +1,4 @@ +export { KokoroEngine } from './KokoroEngine'; +export type { KokoroBridgeHandle } from './KokoroEngine'; +export { KOKORO_VOICES, DEFAULT_KOKORO_VOICE_ID, getKokoroVoiceConfig, getKokoroTTSVoices } from './voices'; +export type { KokoroVoiceId, KokoroVoiceEntry } from './voices'; diff --git a/src/engine/tts/engines/kokoro/voices.ts b/src/engine/tts/engines/kokoro/voices.ts new file mode 100644 index 000000000..67395658e --- /dev/null +++ b/src/engine/tts/engines/kokoro/voices.ts @@ -0,0 +1,69 @@ +/** + * Kokoro voice definitions. + * + * Moved from constants/kokoroModels.ts into the engine boundary. + * The VoiceConfig imports come from react-native-executorch; the + * TTSVoice wrappers are engine-agnostic. + */ +import { + KOKORO_VOICE_AF_HEART, + KOKORO_VOICE_AF_RIVER, + KOKORO_VOICE_AF_SARAH, + KOKORO_VOICE_AM_ADAM, + KOKORO_VOICE_AM_MICHAEL, + KOKORO_VOICE_AM_SANTA, + KOKORO_VOICE_BF_EMMA, + KOKORO_VOICE_BM_DANIEL, +} from 'react-native-executorch'; +import type { VoiceConfig } from 'react-native-executorch'; +import type { TTSVoice } from '../../../types'; + +export type KokoroVoiceId = + | 'af_heart' + | 'af_river' + | 'af_sarah' + | 'am_adam' + | 'am_michael' + | 'am_santa' + | 'bf_emma' + | 'bm_daniel'; + +export interface KokoroVoiceEntry { + id: KokoroVoiceId; + label: string; + persona: string; + accent: string; + gender: 'Female' | 'Male'; + defaultSpeed: number; + config: VoiceConfig; +} + +export const KOKORO_VOICES: KokoroVoiceEntry[] = [ + { id: 'af_heart', label: 'Warm', persona: 'Friendly and approachable', accent: 'US', gender: 'Female', defaultSpeed: 1.0, config: KOKORO_VOICE_AF_HEART }, + { id: 'af_river', label: 'Calm', persona: 'Relaxed and soothing', accent: 'US', gender: 'Female', defaultSpeed: 0.9, config: KOKORO_VOICE_AF_RIVER }, + { id: 'af_sarah', label: 'Clear', persona: 
'Crisp and professional', accent: 'US', gender: 'Female', defaultSpeed: 1.0, config: KOKORO_VOICE_AF_SARAH }, + { id: 'am_adam', label: 'Steady', persona: 'Composed and reliable', accent: 'US', gender: 'Male', defaultSpeed: 1.0, config: KOKORO_VOICE_AM_ADAM }, + { id: 'am_michael', label: 'Bold', persona: 'Confident and direct', accent: 'US', gender: 'Male', defaultSpeed: 1.1, config: KOKORO_VOICE_AM_MICHAEL }, + { id: 'am_santa', label: 'Cheerful', persona: 'Upbeat and energetic', accent: 'US', gender: 'Male', defaultSpeed: 1.2, config: KOKORO_VOICE_AM_SANTA }, + { id: 'bf_emma', label: 'Gentle', persona: 'Soft and thoughtful', accent: 'British', gender: 'Female', defaultSpeed: 0.9, config: KOKORO_VOICE_BF_EMMA }, + { id: 'bm_daniel', label: 'Refined', persona: 'Polished and articulate', accent: 'British', gender: 'Male', defaultSpeed: 1.0, config: KOKORO_VOICE_BM_DANIEL }, +]; + +export const DEFAULT_KOKORO_VOICE_ID: KokoroVoiceId = 'af_heart'; + +export function getKokoroVoiceConfig(id: KokoroVoiceId): VoiceConfig { + return KOKORO_VOICES.find(v => v.id === id)?.config ?? KOKORO_VOICE_AF_HEART; +} + +/** Convert internal voice entries to engine-agnostic TTSVoice objects */ +export function getKokoroTTSVoices(): TTSVoice[] { + return KOKORO_VOICES.map(v => ({ + id: v.id, + label: v.label, + metadata: { + accent: v.accent, + gender: v.gender, + persona: v.persona, + }, + })); +} diff --git a/src/engine/tts/engines/outetts/OuteTTSEngine.ts b/src/engine/tts/engines/outetts/OuteTTSEngine.ts new file mode 100644 index 000000000..c494deb36 --- /dev/null +++ b/src/engine/tts/engines/outetts/OuteTTSEngine.ts @@ -0,0 +1,557 @@ +/* eslint-disable max-lines */ +/** + * OuteTTSEngine — TTSEngine implementation for OuteTTS via llama.rn. + * + * Absorbs the logic from services/ttsService.ts into the engine interface. + * Fully imperative — no React bridge needed. 
+ */ +import { initLlama } from 'llama.rn'; +import type { LlamaContext } from 'llama.rn'; +import RNFS from 'react-native-fs'; +import { AudioContext, AudioBufferSourceNode } from 'react-native-audio-api'; +import { OnDeviceEngineEmitter } from '../../../OnDeviceEngineEmitter'; +import type { + EnginePhase, + TTSEngine, + TTSEngineCapabilities, + TTSEngineEvents, + TTSSpeakOptions, + TTSGenerateResult, + TTSVoice, + ModelAsset, + ModelAssetState, +} from '../../../types'; +import { OUTETTS_ASSETS, OUTETTS_BACKBONE, OUTETTS_VOCODER, OUTETTS_SAMPLE_RATE } from './models'; +import logger from '../../../../utils/logger'; + +export class OuteTTSEngine + extends OnDeviceEngineEmitter + implements TTSEngine +{ + readonly id = 'outetts'; + readonly displayName = 'OuteTTS 0.3'; + readonly capabilities: TTSEngineCapabilities = { + streaming: false, + voiceCloning: true, + pauseResume: true, + generateAndSave: true, + peakRamMB: 530, + }; + + private _phase: EnginePhase = 'idle'; + private _context: LlamaContext | null = null; + private _isVocoderReady = false; + private _contextLoadPromise: Promise = Promise.resolve(); + private _audioCtx: AudioContext | null = null; + private _currentSource: AudioBufferSourceNode | null = null; + private _isSpeakingFlag = false; + private _currentMessageId: string | null = null; + private _playSessionId = 0; + private _assetStates: ModelAssetState[] = []; + + constructor() { + super(); + this._assetStates = OUTETTS_ASSETS.map(asset => ({ + asset, + status: 'not-downloaded' as const, + progress: 0, + })); + } + + // ── State ─────────────────────────────────────────────────────────────── + + getPhase(): EnginePhase { + return this._phase; + } + + private _setPhase(phase: EnginePhase): void { + if (phase === this._phase) return; + const prev = this._phase; + this._phase = phase; + this.emit('phaseChange', phase, prev); + } + + // ── Paths ─────────────────────────────────────────────────────────────── + + private _getModelsDir(): string { + 
return `${RNFS.DocumentDirectoryPath}/tts-models`; + } + + private _getAssetPath(asset: ModelAsset): string { + return `${this._getModelsDir()}/${asset.filename}`; + } + + private _getAudioCacheDir(conversationId: string): string { + return `${RNFS.DocumentDirectoryPath}/audio-cache/${conversationId}`; + } + + private _getAudioFilePath(conversationId: string, messageId: string): string { + return `${this._getAudioCacheDir(conversationId)}/${messageId}.pcm`; + } + + private async _ensureDir(dir: string): Promise { + if (!(await RNFS.exists(dir))) { + await RNFS.mkdir(dir); + } + } + + // ── Lifecycle ─────────────────────────────────────────────────────────── + + isSupported(): boolean { + return true; // OuteTTS runs on all platforms via llama.rn + } + + async initialize(): Promise { + if (this._context && this._isVocoderReady) return; + if (this._phase === 'loading') return this._contextLoadPromise; + + this._setPhase('loading'); + + this._contextLoadPromise = this._contextLoadPromise.then(async () => { + if (this._context && this._isVocoderReady) return; + + logger.log('[OuteTTSEngine] Loading backbone...'); + this._context = await initLlama({ + model: this._getAssetPath(OUTETTS_BACKBONE), + n_ctx: 8192, + n_threads: 4, + }); + + logger.log('[OuteTTSEngine] Loading vocoder...'); + await this._context.initVocoder({ + path: this._getAssetPath(OUTETTS_VOCODER), + n_batch: 4096, + }); + this._isVocoderReady = await this._context.isVocoderEnabled(); + + if (!this._isVocoderReady) { + throw new Error('Vocoder failed to initialize.'); + } + logger.log('[OuteTTSEngine] Ready.'); + }); + + try { + await this._contextLoadPromise; + this._setPhase('ready'); + } catch (err) { + const msg = err instanceof Error ? 
err.message : 'Failed to load OuteTTS'; + this._setPhase('error'); + this.emit('error', { code: 'OUTETTS_LOAD', message: msg, recoverable: true }); + throw err; + } + } + + async release(): Promise { + this.stop(); + if (this._context) { + await this._context.releaseVocoder().catch(() => {}); + await this._context.release().catch(() => {}); + this._context = null; + } + this._isVocoderReady = false; + this._audioCtx?.close().catch(() => {}); + this._audioCtx = null; + this._setPhase('idle'); + } + + async destroy(): Promise { + await this.release(); + await this.deleteAssets(); + } + + // ── Assets ────────────────────────────────────────────────────────────── + + getRequiredAssets(): ModelAsset[] { + return OUTETTS_ASSETS; + } + + async checkAssetStatus(): Promise { + const states: ModelAssetState[] = []; + for (const asset of OUTETTS_ASSETS) { + const path = this._getAssetPath(asset); + const exists = await RNFS.exists(path); + states.push({ + asset, + status: exists ? 'downloaded' : 'not-downloaded', + progress: exists ? 1 : 0, + localPath: exists ? path : undefined, + }); + } + this._assetStates = states; + return states; + } + + async downloadAssets(assetIds?: string[]): Promise { + await this._ensureDir(this._getModelsDir()); + const toDownload = assetIds + ? 
OUTETTS_ASSETS.filter(a => assetIds.includes(a.id)) + : OUTETTS_ASSETS; + + this._setPhase('downloading'); + + for (const asset of toDownload) { + const dest = this._getAssetPath(asset); + if (await RNFS.exists(dest)) { + this._updateAssetState(asset.id, { status: 'downloaded', progress: 1, localPath: dest }); + continue; + } + + this._updateAssetState(asset.id, { status: 'downloading', progress: 0 }); + + const dl = RNFS.downloadFile({ + fromUrl: asset.url, + toFile: dest, + progressDivider: 1, + progress: (res) => { + const p = res.bytesWritten / res.contentLength; + this._updateAssetState(asset.id, { status: 'downloading', progress: p }); + this.emit('downloadProgress', { + assetId: asset.id, + progress: p, + bytesWritten: res.bytesWritten, + totalBytes: res.contentLength, + }); + }, + }); + + const result = await dl.promise; + if (result.statusCode !== 200) { + await RNFS.unlink(dest).catch(() => {}); + this._updateAssetState(asset.id, { status: 'error', progress: 0, error: `HTTP ${result.statusCode}` }); + throw new Error(`Download failed for ${asset.label}: HTTP ${result.statusCode}`); + } + this._updateAssetState(asset.id, { status: 'downloaded', progress: 1, localPath: dest }); + } + + // Stay in downloading until all done, then move to idle (not ready — need initialize()) + if (this.isFullyDownloaded()) { + this._setPhase('idle'); + } + } + + async deleteAssets(assetIds?: string[]): Promise { + await this.release(); + const toDelete = assetIds + ? 
OUTETTS_ASSETS.filter(a => assetIds.includes(a.id)) + : OUTETTS_ASSETS; + + for (const asset of toDelete) { + const path = this._getAssetPath(asset); + if (await RNFS.exists(path)) { + await RNFS.unlink(path); + } + this._updateAssetState(asset.id, { status: 'not-downloaded', progress: 0 }); + } + } + + getOverallDownloadProgress(): number { + const totalSize = OUTETTS_ASSETS.reduce((sum, a) => sum + a.sizeBytes, 0); + let weightedProgress = 0; + for (const state of this._assetStates) { + weightedProgress += state.progress * (state.asset.sizeBytes / totalSize); + } + return weightedProgress; + } + + isFullyDownloaded(): boolean { + return this._assetStates.every(s => s.status === 'downloaded'); + } + + private _updateAssetState( + assetId: string, + patch: Pick & { localPath?: string; error?: string }, + ): void { + const idx = this._assetStates.findIndex(s => s.asset.id === assetId); + if (idx >= 0) { + this._assetStates[idx] = { ...this._assetStates[idx], ...patch }; + } + } + + // ── Voices ────────────────────────────────────────────────────────────── + + getVoices(): TTSVoice[] { + return [ + { + id: '0', + label: 'Default', + metadata: { gender: 'Neutral' }, + }, + ]; + } + + getActiveVoice(): TTSVoice | null { + return this.getVoices()[0]; + } + + async setVoice(voiceId: string): Promise { + // OuteTTS only has one built-in voice; voice cloning uses referenceAudioPath + this.emit('voiceChanged', voiceId); + } + + // ── Audio Generation ──────────────────────────────────────────────────── + + private async _generate(text: string): Promise<{ + samples: Float32Array; + durationSeconds: number; + sampleRate: number; + waveformData: number[]; + }> { + if (!this._context || !this._isVocoderReady) { + throw new Error('OuteTTS models not loaded.'); + } + + const { prompt, grammar } = await this._context.getFormattedAudioCompletion( + null, // default speaker + text, + ); + const guideTokens = (await this._context.getAudioCompletionGuideTokens(text)) ?? 
[]; + const result = await this._context.completion({ + prompt, + grammar, + guide_tokens: guideTokens, + n_predict: 4096, + temperature: 0.7, + top_p: 0.9, + stop: ['<|im_end|>'], + }); + + const pcmArray = await this._context.decodeAudioTokens(result.audio_tokens ?? []); + const samples = new Float32Array(pcmArray); + const sampleRate = OUTETTS_SAMPLE_RATE; + + return { + samples, + durationSeconds: samples.length / sampleRate, + sampleRate, + waveformData: this._buildWaveformData(samples, 200), + }; + } + + // ── Speech ────────────────────────────────────────────────────────────── + + async speak(text: string, options?: TTSSpeakOptions): Promise { + if (!this._context || !this._isVocoderReady) { + throw new Error('OuteTTS models not loaded. Call initialize() first.'); + } + + const speed = options?.speed ?? 1.0; + const messageId = options?.messageId ?? null; + + this.stop(); + this._currentMessageId = messageId; + const sessionId = ++this._playSessionId; + this._isSpeakingFlag = true; + this._setPhase('processing'); + + try { + // Truncate to keep generation time reasonable (~300 chars ~ 20-30s on device) + const truncated = text.length > 300 ? `${text.slice(0, 297)}...` : text; + const audio = await this._generate(truncated); + + // Abort if stop() was called or another speak() started during generation + if (!this._isSpeakingFlag || this._playSessionId !== sessionId) return; + + this.emit('audioComplete', audio); + await this._playFromSamples(audio.samples, speed); + } catch (err) { + const msg = err instanceof Error ? 
err.message : 'Speech failed'; + this.emit('error', { code: 'OUTETTS_SPEAK', message: msg, recoverable: true }); + throw err; + } finally { + if (this._playSessionId === sessionId) { + this._currentMessageId = null; + this._isSpeakingFlag = false; + this._setPhase('ready'); + } + } + } + + // eslint-disable-next-line max-params + async generateAndSave( + text: string, + conversationId: string, + messageId: string, + _options?: TTSSpeakOptions, + ): Promise { + if (!this._context || !this._isVocoderReady) { + throw new Error('OuteTTS models not loaded. Call initialize() first.'); + } + + const audio = await this._generate(text); + this.emit('audioComplete', audio); + + // Save to file + await this._ensureDir(this._getAudioCacheDir(conversationId)); + const filePath = this._getAudioFilePath(conversationId, messageId); + const base64 = this._float32ToBase64(audio.samples); + await RNFS.writeFile(filePath, base64, 'base64'); + + return { + filePath, + durationSeconds: audio.durationSeconds, + waveformData: audio.waveformData, + }; + } + + async playFromFile( + filePath: string, + options?: { speed?: number; startOffset?: number; messageId?: string }, + ): Promise { + const speed = options?.speed ?? 1.0; + const startOffset = options?.startOffset ?? 0; + const messageId = options?.messageId ?? null; + + this.stop(); + this._currentMessageId = messageId; + const sessionId = ++this._playSessionId; + this._isSpeakingFlag = true; + this._setPhase('processing'); + + try { + this._audioCtx?.close().catch(() => {}); + this._audioCtx = new AudioContext(); + const src = filePath.startsWith('file://') ? 
filePath : `file://${filePath}`; + const buffer = await this._audioCtx.decodeAudioData(src as unknown as ArrayBuffer); + + // Abort if stop() was called during decode + if (this._playSessionId !== sessionId) return; + + const source = this._audioCtx.createBufferSource(); + source.buffer = buffer; + source.playbackRate.value = speed; + source.connect(this._audioCtx.destination); + this._currentSource = source; + + await new Promise((resolve) => { + source.onEnded = () => { + this._currentSource = null; + resolve(); + }; + source.start(0, startOffset); + }); + } catch (err) { + const msg = err instanceof Error ? err.message : 'Playback failed'; + this.emit('error', { code: 'OUTETTS_PLAYBACK', message: msg, recoverable: true }); + throw err; + } finally { + if (this._playSessionId === sessionId) { + this._currentMessageId = null; + this._isSpeakingFlag = false; + this._setPhase('ready'); + } + } + } + + stop(): void { + this._isSpeakingFlag = false; + try { this._currentSource?.stop(); } catch { /* already stopped */ } + this._currentSource = null; + this._currentMessageId = null; + if (this._phase === 'processing' || this._phase === 'paused') { + this._setPhase(this._context ? 
'ready' : 'idle'); + } + } + + pause(): void { + this._audioCtx?.suspend().catch(() => {}); + if (this._phase === 'processing') { + this._setPhase('paused'); + } + } + + resume(): void { + this._audioCtx?.resume().catch(() => {}); + if (this._phase === 'paused') { + this._setPhase('processing'); + } + } + + // ── React Bridge ──────────────────────────────────────────────────────── + + getBridgeComponent(): React.ComponentType | null { + return null; // Fully imperative + } + + // ── Audio Cache (app-level convenience) ───────────────────────────────── + + async getAudioCacheSizeMB(): Promise { + const cacheRoot = `${RNFS.DocumentDirectoryPath}/audio-cache`; + if (!(await RNFS.exists(cacheRoot))) return 0; + let totalBytes = 0; + const convDirs = await RNFS.readDir(cacheRoot); + for (const convDir of convDirs) { + if (convDir.isDirectory()) { + const files = await RNFS.readDir(convDir.path); + for (const file of files) { totalBytes += Number(file.size); } + } + } + return totalBytes / (1024 * 1024); + } + + async clearAudioCache(): Promise { + const cacheRoot = `${RNFS.DocumentDirectoryPath}/audio-cache`; + if (await RNFS.exists(cacheRoot)) { + await RNFS.unlink(cacheRoot); + } + } + + async isAudioCached(conversationId: string, messageId: string): Promise { + return RNFS.exists(this._getAudioFilePath(conversationId, messageId)); + } + + // ── Utilities ─────────────────────────────────────────────────────────── + + private async _playFromSamples(samples: Float32Array, speed: number): Promise { + this._audioCtx?.close().catch(() => {}); + this._audioCtx = new AudioContext({ sampleRate: OUTETTS_SAMPLE_RATE }); + const buffer = this._audioCtx.createBuffer(1, samples.length, OUTETTS_SAMPLE_RATE); + buffer.copyToChannel(samples, 0); + const source = this._audioCtx.createBufferSource(); + source.buffer = buffer; + source.playbackRate.value = speed; + source.connect(this._audioCtx.destination); + this._currentSource = source; + + await new Promise((resolve, reject) => { 
+ // Guard against hanging promise if onEnded never fires + const timeout = setTimeout(() => { + this._currentSource = null; + resolve(); + }, (samples.length / OUTETTS_SAMPLE_RATE / speed) * 1000 + 5000); // estimated duration + 5s buffer + + source.onEnded = () => { + clearTimeout(timeout); + this._currentSource = null; + resolve(); + }; + try { + source.start(); + } catch (err) { + clearTimeout(timeout); + reject(err); + } + }); + } + + private _buildWaveformData(samples: Float32Array, points: number): number[] { + const blockSize = Math.floor(samples.length / points); + const result: number[] = []; + for (let i = 0; i < points; i++) { + let sum = 0; + for (let j = 0; j < blockSize; j++) { + sum += Math.abs(samples[i * blockSize + j] ?? 0); + } + result.push(blockSize > 0 ? sum / blockSize : 0); + } + return result; + } + + private _float32ToBase64(samples: Float32Array): string { + const uint8 = new Uint8Array(samples.buffer); + let binary = ''; + for (let i = 0; i < uint8.length; i++) { + binary += String.fromCharCode(uint8[i]); + } + return btoa(binary); + } +} diff --git a/src/engine/tts/engines/outetts/index.ts b/src/engine/tts/engines/outetts/index.ts new file mode 100644 index 000000000..2347e6784 --- /dev/null +++ b/src/engine/tts/engines/outetts/index.ts @@ -0,0 +1,2 @@ +export { OuteTTSEngine } from './OuteTTSEngine'; +export { OUTETTS_ASSETS, OUTETTS_BACKBONE, OUTETTS_VOCODER, OUTETTS_SAMPLE_RATE } from './models'; diff --git a/src/engine/tts/engines/outetts/models.ts b/src/engine/tts/engines/outetts/models.ts new file mode 100644 index 000000000..ee712bb98 --- /dev/null +++ b/src/engine/tts/engines/outetts/models.ts @@ -0,0 +1,26 @@ +/** + * OuteTTS model definitions. + * + * Moved from constants/ttsModels.ts into the engine boundary. 
+ */ +import type { ModelAsset } from '../../../types'; + +export const OUTETTS_BACKBONE: ModelAsset = { + id: 'backbone', + label: 'Voice Model', + url: 'https://huggingface.co/OuteAI/OuteTTS-0.3-500M-GGUF/resolve/main/OuteTTS-0.3-500M-Q4_K_M.gguf', + sizeBytes: 454 * 1024 * 1024, + filename: 'OuteTTS-0.3-500M-Q4_K_M.gguf', +}; + +export const OUTETTS_VOCODER: ModelAsset = { + id: 'vocoder', + label: 'Audio Decoder', + url: 'https://huggingface.co/ggml-org/WavTokenizer/resolve/main/WavTokenizer-Large-75-Q5_1.gguf', + sizeBytes: 73 * 1024 * 1024, + filename: 'WavTokenizer-Large-75-Q5_1.gguf', +}; + +export const OUTETTS_ASSETS: ModelAsset[] = [OUTETTS_BACKBONE, OUTETTS_VOCODER]; + +export const OUTETTS_SAMPLE_RATE = 24000; diff --git a/src/engine/tts/engines/qwen3/Qwen3TTSEngine.ts b/src/engine/tts/engines/qwen3/Qwen3TTSEngine.ts new file mode 100644 index 000000000..e1d9c1606 --- /dev/null +++ b/src/engine/tts/engines/qwen3/Qwen3TTSEngine.ts @@ -0,0 +1,357 @@ +/** + * Qwen3TTSEngine — TTSEngine stub for Qwen3-TTS. + * + * Multi-model pipeline: + * 1. Talker (0.6B LLM, GGUF) — generates speech token sequences from text + * 2. Predictor (GGUF) — fills parallel codebook tracks (16 codebooks) + * 3. Codec decoder (ONNX) — converts token grid to PCM audio waveform + * + * The talker and predictor run via llama.rn (GGUF). + * The codec decoder runs via ONNX Runtime (onnxruntime-react-native). + * + * 12Hz frame rate = dramatically fewer tokens per second of audio than + * OuteTTS (75Hz) or most other TTS models. This makes on-device inference + * much more feasible. + * + * STATUS: Stub — asset management and lifecycle are wired up; the actual + * inference pipeline is TODO pending integration testing. 
+ */ +import RNFS from 'react-native-fs'; +import { OnDeviceEngineEmitter } from '../../../OnDeviceEngineEmitter'; +import type { + EnginePhase, + TTSEngine, + TTSEngineCapabilities, + TTSEngineEvents, + TTSSpeakOptions, + TTSGenerateResult, + TTSVoice, + ModelAsset, + ModelAssetState, +} from '../../../types'; +import { QWEN3_TTS_ASSETS } from './models'; +import logger from '../../../../utils/logger'; + +export class Qwen3TTSEngine + extends OnDeviceEngineEmitter + implements TTSEngine +{ + readonly id = 'qwen3-tts'; + readonly displayName = 'Qwen3 TTS (0.6B)'; + readonly capabilities: TTSEngineCapabilities = { + streaming: false, // Generate-then-play (streaming planned for v2) + voiceCloning: true, + pauseResume: true, + generateAndSave: true, + platformRequirements: { + android: { minSdkVersion: 26 }, + ios: { minVersion: 15 }, + }, + peakRamMB: 600, + }; + + private _phase: EnginePhase = 'idle'; + private _assetStates: ModelAssetState[] = []; + + // TODO: llama.rn contexts for talker + predictor + // private _talkerContext: LlamaContext | null = null; + // private _predictorContext: LlamaContext | null = null; + // TODO: ONNX Runtime session for codec decoder + // private _codecSession: InferenceSession | null = null; + + constructor() { + super(); + this._assetStates = QWEN3_TTS_ASSETS.map(asset => ({ + asset, + status: 'not-downloaded' as const, + progress: 0, + })); + } + + // ── State ─────────────────────────────────────────────────────────────── + + getPhase(): EnginePhase { + return this._phase; + } + + private _setPhase(phase: EnginePhase): void { + if (phase === this._phase) return; + const prev = this._phase; + this._phase = phase; + this.emit('phaseChange', phase, prev); + } + + // ── Paths ─────────────────────────────────────────────────────────────── + + private _getModelsDir(): string { + return `${RNFS.DocumentDirectoryPath}/tts-models/qwen3`; + } + + private _getAssetPath(asset: ModelAsset): string { + return 
`${this._getModelsDir()}/${asset.filename}`; + } + + private async _ensureDir(dir: string): Promise { + if (!(await RNFS.exists(dir))) { + await RNFS.mkdir(dir); + } + } + + // ── Lifecycle ─────────────────────────────────────────────────────────── + + isSupported(): boolean { + // TODO: Runtime platform version check + return true; + } + + async initialize(): Promise { + if (!this.isFullyDownloaded()) { + throw new Error('Qwen3-TTS models not downloaded.'); + } + + this._setPhase('loading'); + + try { + // TODO: Load all three models + // + // const talkerPath = this._getAssetPath(QWEN3_TTS_TALKER); + // const predictorPath = this._getAssetPath(QWEN3_TTS_PREDICTOR); + // const codecPath = this._getAssetPath(QWEN3_TTS_CODEC); + // + // this._talkerContext = await initLlama({ + // model: talkerPath, + // n_ctx: 4096, + // n_threads: 4, + // }); + // + // this._predictorContext = await initLlama({ + // model: predictorPath, + // n_ctx: 2048, + // n_threads: 4, + // }); + // + // this._codecSession = await InferenceSession.create(codecPath); + + logger.log('[Qwen3TTSEngine] Models loaded (stub).'); + this._setPhase('ready'); + } catch (err) { + const msg = err instanceof Error ? 
err.message : 'Failed to load Qwen3-TTS'; + this._setPhase('error'); + this.emit('error', { code: 'QWEN3_LOAD', message: msg, recoverable: true }); + throw err; + } + } + + async release(): Promise { + // TODO: Release llama.rn contexts and ONNX session + // this._talkerContext?.release(); + // this._predictorContext?.release(); + // this._codecSession?.release(); + this._setPhase('idle'); + } + + async destroy(): Promise { + await this.release(); + await this.deleteAssets(); + } + + // ── Assets ────────────────────────────────────────────────────────────── + + getRequiredAssets(): ModelAsset[] { + return QWEN3_TTS_ASSETS; + } + + async checkAssetStatus(): Promise { + await this._ensureDir(this._getModelsDir()); + const states: ModelAssetState[] = []; + for (const asset of QWEN3_TTS_ASSETS) { + const path = this._getAssetPath(asset); + const exists = await RNFS.exists(path); + states.push({ + asset, + status: exists ? 'downloaded' : 'not-downloaded', + progress: exists ? 1 : 0, + localPath: exists ? path : undefined, + }); + } + this._assetStates = states; + return states; + } + + async downloadAssets(assetIds?: string[]): Promise { + await this._ensureDir(this._getModelsDir()); + const toDownload = assetIds + ? 
QWEN3_TTS_ASSETS.filter(a => assetIds.includes(a.id)) + : QWEN3_TTS_ASSETS; + + this._setPhase('downloading'); + + for (const asset of toDownload) { + const dest = this._getAssetPath(asset); + if (await RNFS.exists(dest)) { + this._updateAssetState(asset.id, { status: 'downloaded', progress: 1, localPath: dest }); + continue; + } + + this._updateAssetState(asset.id, { status: 'downloading', progress: 0 }); + + const dl = RNFS.downloadFile({ + fromUrl: asset.url, + toFile: dest, + progressDivider: 1, + progress: (res) => { + const p = res.bytesWritten / res.contentLength; + this._updateAssetState(asset.id, { status: 'downloading', progress: p }); + this.emit('downloadProgress', { + assetId: asset.id, + progress: p, + bytesWritten: res.bytesWritten, + totalBytes: res.contentLength, + }); + }, + }); + + const result = await dl.promise; + if (result.statusCode !== 200) { + await RNFS.unlink(dest).catch(() => {}); + this._updateAssetState(asset.id, { status: 'error', progress: 0, error: `HTTP ${result.statusCode}` }); + throw new Error(`Download failed for ${asset.label}: HTTP ${result.statusCode}`); + } + this._updateAssetState(asset.id, { status: 'downloaded', progress: 1, localPath: dest }); + } + + if (this.isFullyDownloaded()) { + this._setPhase('idle'); + } + } + + async deleteAssets(assetIds?: string[]): Promise { + await this.release(); + const toDelete = assetIds + ? 
QWEN3_TTS_ASSETS.filter(a => assetIds.includes(a.id)) + : QWEN3_TTS_ASSETS; + + for (const asset of toDelete) { + const path = this._getAssetPath(asset); + if (await RNFS.exists(path)) { + await RNFS.unlink(path); + } + this._updateAssetState(asset.id, { status: 'not-downloaded', progress: 0 }); + } + } + + getOverallDownloadProgress(): number { + const totalSize = QWEN3_TTS_ASSETS.reduce((sum, a) => sum + a.sizeBytes, 0); + let weightedProgress = 0; + for (const state of this._assetStates) { + weightedProgress += state.progress * (state.asset.sizeBytes / totalSize); + } + return weightedProgress; + } + + isFullyDownloaded(): boolean { + return this._assetStates.every(s => s.status === 'downloaded'); + } + + private _updateAssetState( + assetId: string, + patch: Pick & { localPath?: string; error?: string }, + ): void { + const idx = this._assetStates.findIndex(s => s.asset.id === assetId); + if (idx >= 0) { + this._assetStates[idx] = { ...this._assetStates[idx], ...patch }; + } + } + + // ── Voices ────────────────────────────────────────────────────────────── + + getVoices(): TTSVoice[] { + // TODO: Qwen3-TTS CustomVoice variant has 9 built-in voices. + // For now expose a default. Voice cloning via referenceAudioPath. + return [ + { id: 'default', label: 'Default', metadata: { language: 'multilingual' } }, + ]; + } + + getActiveVoice(): TTSVoice | null { + return this.getVoices()[0]; + } + + async setVoice(voiceId: string): Promise { + this.emit('voiceChanged', voiceId); + } + + // ── Speech ────────────────────────────────────────────────────────────── + + async speak(_text: string, _options?: TTSSpeakOptions): Promise { + // TODO: Implement the three-stage pipeline: + // + // 1. Talker inference (llama.rn): + // - Format prompt with text + voice tokens + // - Run autoregressive generation to produce first-codebook tokens + // - 12Hz frame rate = ~12 tokens per second of audio + // + // 2. 
Predictor inference (llama.rn): + // - Take first-codebook tokens from talker + // - Predict remaining 15 codebook tracks in parallel + // - Output: 16-codebook token grid + // + // 3. Codec decoding (ONNX Runtime): + // - Take 16-codebook token grid + // - Decode to PCM Float32 audio at 24kHz + // - Emit audioComplete event + // + // 4. Play the resulting audio via AudioContext + + throw new Error( + 'Qwen3-TTS inference pipeline not yet implemented. ' + + 'Asset management and lifecycle are ready — the inference integration is TODO.', + ); + } + + // eslint-disable-next-line max-params + async generateAndSave( + _text: string, + _conversationId: string, + _messageId: string, + _options?: TTSSpeakOptions, + ): Promise { + // TODO: Same pipeline as speak(), but save to file instead of playing + throw new Error('Qwen3-TTS generateAndSave not yet implemented.'); + } + + async playFromFile( + _filePath: string, + _options?: { speed?: number; startOffset?: number; messageId?: string }, + ): Promise { + // TODO: Standard AudioContext file playback (same as OuteTTS) + throw new Error('Qwen3-TTS playFromFile not yet implemented.'); + } + + stop(): void { + // TODO: Abort any in-flight inference + stop audio playback + if (this._phase === 'processing' || this._phase === 'paused') { + this._setPhase('ready'); + } + } + + pause(): void { + // TODO: Suspend AudioContext + if (this._phase === 'processing') { + this._setPhase('paused'); + } + } + + resume(): void { + // TODO: Resume AudioContext + if (this._phase === 'paused') { + this._setPhase('processing'); + } + } + + // ── React Bridge ──────────────────────────────────────────────────────── + + getBridgeComponent(): React.ComponentType | null { + return null; // Fully imperative via llama.rn + ONNX Runtime + } +} diff --git a/src/engine/tts/engines/qwen3/index.ts b/src/engine/tts/engines/qwen3/index.ts new file mode 100644 index 000000000..8a4122dd8 --- /dev/null +++ b/src/engine/tts/engines/qwen3/index.ts @@ -0,0 +1,2 @@ 
+export { Qwen3TTSEngine } from './Qwen3TTSEngine'; +export { QWEN3_TTS_ASSETS, QWEN3_TTS_TALKER, QWEN3_TTS_PREDICTOR, QWEN3_TTS_CODEC, QWEN3_TTS_SAMPLE_RATE } from './models'; diff --git a/src/engine/tts/engines/qwen3/models.ts b/src/engine/tts/engines/qwen3/models.ts new file mode 100644 index 000000000..bd66d8218 --- /dev/null +++ b/src/engine/tts/engines/qwen3/models.ts @@ -0,0 +1,41 @@ +/** + * Qwen3-TTS model asset definitions. + * + * Three-model pipeline: Talker (LLM) + Predictor + Codec decoder. + * GGUF conversions via LunaVox project. + * + * TODO: Verify exact URLs and file sizes once we commit to a quant level. + */ +import type { ModelAsset } from '../../../types'; + +export const QWEN3_TTS_TALKER: ModelAsset = { + id: 'talker', + label: 'Talker Model (0.6B)', + url: 'https://huggingface.co/wkwong/Lunavox-Qwen3-TTS-GGUF/resolve/main/base_small/qwen3_tts_talker.q5_k.gguf', + sizeBytes: 450 * 1024 * 1024, // ~450MB Q5_K estimate + filename: 'qwen3-tts-talker-q5k.gguf', +}; + +export const QWEN3_TTS_PREDICTOR: ModelAsset = { + id: 'predictor', + label: 'Predictor Model', + url: 'https://huggingface.co/wkwong/Lunavox-Qwen3-TTS-GGUF/resolve/main/base_small/qwen3_tts_predictor.q8_0.gguf', + sizeBytes: 150 * 1024 * 1024, // ~150MB Q8 estimate + filename: 'qwen3-tts-predictor-q8.gguf', +}; + +export const QWEN3_TTS_CODEC: ModelAsset = { + id: 'codec', + label: 'Audio Codec', + url: 'https://huggingface.co/wkwong/Lunavox-Qwen3-TTS-GGUF/resolve/main/base_small/qwen3_tts_decoder.fp16.onnx', + sizeBytes: 50 * 1024 * 1024, // ~50MB estimate + filename: 'qwen3-tts-decoder-fp16.onnx', +}; + +export const QWEN3_TTS_ASSETS: ModelAsset[] = [ + QWEN3_TTS_TALKER, + QWEN3_TTS_PREDICTOR, + QWEN3_TTS_CODEC, +]; + +export const QWEN3_TTS_SAMPLE_RATE = 24000; diff --git a/src/engine/types.ts b/src/engine/types.ts new file mode 100644 index 000000000..5ee592536 --- /dev/null +++ b/src/engine/types.ts @@ -0,0 +1,344 @@ +/** + * On-Device Engine Types + * + * Base interfaces for 
multimodal on-device AI engines. + * TTS is the first concrete implementation; STT, Vision, and LLM + * engines will inherit the same base pattern. + * + * Designed for mobile — optimized for llama.rn, llama.cpp, ONNX Runtime, + * and ExecuTorch runtimes. + */ +import type React from 'react'; + +// ─── Engine Phase ─────────────────────────────────────────────────────────── + +/** Unified lifecycle phase for any on-device engine */ +export type EnginePhase = + | 'idle' // Not loaded, not doing anything + | 'downloading' // One or more assets downloading + | 'loading' // Models being loaded into memory + | 'ready' // Models loaded, ready to process + | 'processing' // Actively running inference or playback + | 'paused' // Processing suspended (resumable) + | 'error'; // Something went wrong + +// ─── Model Assets ─────────────────────────────────────────────────────────── + +export type ModelAssetStatus = 'not-downloaded' | 'downloading' | 'downloaded' | 'error'; + +/** Describes a single downloadable model file (GGUF, ONNX, .pte, .bin, etc.) 
*/ +export interface ModelAsset { + /** Engine-scoped unique ID (e.g., 'backbone', 'vocoder', 'talker') */ + id: string; + /** Human-readable label for UI */ + label: string; + /** Remote URL to download from (e.g., HuggingFace) */ + url: string; + /** Expected file size in bytes */ + sizeBytes: number; + /** Local filename (engine decides the directory) */ + filename: string; +} + +/** Runtime state of a single model asset */ +export interface ModelAssetState { + asset: ModelAsset; + status: ModelAssetStatus; + /** Download progress 0–1 */ + progress: number; + /** Absolute local file path once downloaded */ + localPath?: string; + /** Error message if status === 'error' */ + error?: string; +} + +// ─── Engine Capabilities ──────────────────────────────────────────────────── + +export interface EngineCapabilities { + /** Supports streaming output (chunks emitted during processing) */ + streaming: boolean; + /** Minimum OS requirements — engine enforces at runtime */ + platformRequirements?: { + android?: { minSdkVersion: number }; + ios?: { minVersion: number }; + }; + /** Approximate peak RAM usage in MB during inference */ + peakRamMB: number; +} + +// ─── Base Event Map ───────────────────────────────────────────────────────── + +/** Events shared by all engine modalities */ +export interface BaseEngineEvents { + [key: string]: (...args: any[]) => void; + /** Fired on every lifecycle phase transition */ + phaseChange: (phase: EnginePhase, previousPhase: EnginePhase) => void; + /** Fired on download progress for any asset */ + downloadProgress: (data: { + assetId: string; + progress: number; + bytesWritten: number; + totalBytes: number; + }) => void; + /** Fired on any error */ + error: (data: { + code: string; + message: string; + recoverable: boolean; + }) => void; +} + +// ─── Base Engine Interface ────────────────────────────────────────────────── + +/** + * Base interface for all on-device AI engines. 
+ * + * Every modality (TTS, STT, Vision, LLM) extends this with modality-specific + * methods and events. The shared surface covers lifecycle, asset management, + * and the typed event system. + * + * @typeParam TEvents — union of base + modality-specific events + */ +export interface OnDeviceEngine< + TEvents extends BaseEngineEvents = BaseEngineEvents, +> { + /** Unique engine identifier (e.g., 'kokoro', 'outetts', 'qwen3-tts') */ + readonly id: string; + /** Human-readable display name */ + readonly displayName: string; + /** Static capabilities — does not change at runtime */ + readonly capabilities: EngineCapabilities; + + // ── State ─────────────────────────────────────────────────────────────── + + /** Current lifecycle phase */ + getPhase(): EnginePhase; + + // ── Events ────────────────────────────────────────────────────────────── + + /** Subscribe to an event. Returns an unsubscribe function. */ + on( + event: K, + listener: TEvents[K], + ): () => void; + + /** Unsubscribe a specific listener */ + off( + event: K, + listener: TEvents[K], + ): void; + + /** Subscribe to an event once — auto-unsubscribes after first fire */ + once( + event: K, + listener: TEvents[K], + ): () => void; + + // ── Lifecycle ─────────────────────────────────────────────────────────── + + /** Runtime platform compatibility check */ + isSupported(): boolean; + + /** + * Load models into memory. For hook-based engines this may be a no-op + * (initialization happens via the React bridge component). + * + * Phase transition: idle → loading → ready + */ + initialize(): Promise; + + /** + * Release models and resources. Engine returns to 'idle' but retains + * downloaded assets on disk. + * + * Phase transition: any → idle + */ + release(): Promise; + + /** + * Full teardown — release models AND delete downloaded assets. 
+ * + * Phase transition: any → idle (assets cleared) + */ + destroy(): Promise; + + // ── Asset Management ──────────────────────────────────────────────────── + + /** List of model files this engine requires */ + getRequiredAssets(): ModelAsset[]; + + /** Check which assets exist on disk. Updates internal state + emits events. */ + checkAssetStatus(): Promise; + + /** + * Download required assets. Emits `downloadProgress` per asset. + * @param assetIds — optional subset; omit to download all missing + */ + downloadAssets(assetIds?: string[]): Promise; + + /** + * Delete downloaded assets from disk. Releases models first if loaded. + * @param assetIds — optional subset; omit to delete all + */ + deleteAssets(assetIds?: string[]): Promise; + + /** Aggregate download progress across all assets (0–1), weighted by size */ + getOverallDownloadProgress(): number; + + /** True if every required asset exists on disk */ + isFullyDownloaded(): boolean; + + // ── React Bridge ──────────────────────────────────────────────────────── + + /** + * If the engine requires a React component mounted in the tree (e.g., + * wrapping a React hook), return it here. The app renders it near the + * root via . Return null for fully imperative engines. + */ + getBridgeComponent(): React.ComponentType | null; +} + +// ─── TTS-Specific Types ───────────────────────────────────────────────────── + +export interface TTSVoice { + /** Engine-scoped unique ID (e.g., 'af_heart', 'default', 'zh-female-1') */ + id: string; + /** Human-readable label */ + label: string; + /** Freeform metadata — accent, gender, persona, language, etc. 
*/ + metadata: Record; + /** True if this voice supports cloning from reference audio */ + isCloneable?: boolean; +} + +export interface TTSEngineCapabilities extends EngineCapabilities { + /** Supports zero-shot voice cloning from reference audio */ + voiceCloning: boolean; + /** Supports pause/resume during playback */ + pauseResume: boolean; + /** Supports generate-and-save-to-file (Audio Mode) */ + generateAndSave: boolean; +} + +export interface TTSSpeakOptions { + /** Playback speed multiplier (0.5–2.0) */ + speed?: number; + /** Voice ID override (uses active voice if omitted) */ + voiceId?: string; + /** Message ID for ownership tracking */ + messageId?: string; + /** Path to reference audio for voice cloning engines */ + referenceAudioPath?: string; + /** Abort signal for cancellation */ + signal?: AbortSignal; +} + +export interface TTSGenerateResult { + /** Absolute path to saved audio file */ + filePath: string; + /** Audio duration in seconds */ + durationSeconds: number; + /** Downsampled amplitude envelope (~200 points) for waveform UI */ + waveformData: number[]; +} + +/** TTS-specific events (extends base events) */ +export interface TTSEngineEvents extends BaseEngineEvents { + /** Streaming audio chunk (for engines that support streaming) */ + audioChunk: (data: { + samples: Float32Array; + sampleRate: number; + chunkIndex: number; + /** True if this is the last chunk in the current utterance */ + isFinal: boolean; + }) => void; + + /** Full audio generation complete (for non-streaming engines) */ + audioComplete: (data: { + samples: Float32Array; + sampleRate: number; + durationSeconds: number; + waveformData: number[]; + }) => void; + + /** RMS amplitude update for waveform visualization */ + amplitudeChange: (amplitude: number) => void; + + /** Playback elapsed time tick */ + playbackTick: (elapsedSeconds: number) => void; + + /** Active voice changed */ + voiceChanged: (voiceId: string) => void; +} + +// ─── TTS Engine Interface 
─────────────────────────────────────────────────── + +/** + * The TTS engine interface. Every TTS implementation (Kokoro, OuteTTS, + * Qwen3-TTS, etc.) implements this. The store delegates to the active + * engine without knowing which one it is. + */ +export interface TTSEngine extends OnDeviceEngine { + readonly capabilities: TTSEngineCapabilities; + + // ── Voices ────────────────────────────────────────────────────────────── + + /** All voices this engine supports */ + getVoices(): TTSVoice[]; + + /** Currently active voice (null if none set) */ + getActiveVoice(): TTSVoice | null; + + /** + * Set the active voice. Some engines require a reload/remount to change + * voices — this method handles that transparently. Emits `voiceChanged` + * when the voice is actually active. + */ + setVoice(voiceId: string): Promise; + + // ── Speech ────────────────────────────────────────────────────────────── + + /** + * Speak text aloud (Chat Mode primary method). + * + * Streaming engines emit `audioChunk` during playback. + * Non-streaming engines emit `audioComplete` after generation, then play. + * + * Resolves when playback finishes or is stopped. + * Phase transition: ready → processing → ready + */ + speak(text: string, options?: TTSSpeakOptions): Promise; + + /** + * Generate audio and save to file (Audio Mode primary method). + * Check `capabilities.generateAndSave` before calling. + */ + generateAndSave( + text: string, + conversationId: string, + messageId: string, + options?: TTSSpeakOptions, + ): Promise; + + /** + * Play a previously saved audio file. + * Used by Audio Mode to replay cached messages. 
+ */ + playFromFile( + filePath: string, + options?: { + speed?: number; + startOffset?: number; + messageId?: string; + }, + ): Promise; + + /** Stop all speech/playback immediately */ + stop(): void; + + /** Pause current playback (requires capabilities.pauseResume) */ + pause(): void; + + /** Resume paused playback */ + resume(): void; +} diff --git a/src/hooks/useTTS.ts b/src/hooks/useTTS.ts new file mode 100644 index 000000000..e0cec108d --- /dev/null +++ b/src/hooks/useTTS.ts @@ -0,0 +1,39 @@ +import { useEffect, useCallback } from 'react'; +import { useTTSStore } from '../stores/ttsStore'; +import { hardwareService } from '../services/hardware'; +import { TTS_WARN_RAM_GB, TTS_BLOCK_RAM_GB } from '../constants/ttsModels'; + +export function useTTS() { + const store = useTTSStore(); + + useEffect(() => { + store.checkDownloadStatus(); + // eslint-disable-next-line react-hooks/exhaustive-deps + }, []); + + const canRunOnDevice = useCallback((): { allowed: boolean; warning: boolean } => { + const ramGB = hardwareService.getTotalMemoryGB(); + return { + allowed: ramGB >= TTS_BLOCK_RAM_GB, + warning: ramGB < TTS_WARN_RAM_GB, + }; + }, []); + + const speakMessage = useCallback( + (text: string, messageId: string) => { + store.speak(text, messageId); + }, + // eslint-disable-next-line react-hooks/exhaustive-deps + [store.isReady], + ); + + return { + ...store, + speakMessage, + canRunOnDevice, + isDownloading: store.isDownloading, + overallDownloadProgress: store.overallDownloadProgress, + isAudioMode: store.settings.interfaceMode === 'audio', + isChatMode: store.settings.interfaceMode === 'chat', + }; +} diff --git a/src/navigation/AppNavigator.tsx b/src/navigation/AppNavigator.tsx index 1d15b73a0..517357a23 100644 --- a/src/navigation/AppNavigator.tsx +++ b/src/navigation/AppNavigator.tsx @@ -32,6 +32,7 @@ import { DownloadManagerScreen, ModelSettingsScreen, VoiceSettingsScreen, + TTSSettingsScreen, DeviceInfoScreen, StorageSettingsScreen, SecuritySettingsScreen, 
@@ -229,6 +230,7 @@ export const AppNavigator: React.FC = () => { + diff --git a/src/navigation/types.ts b/src/navigation/types.ts index 21b876daa..b58d03c1b 100644 --- a/src/navigation/types.ts +++ b/src/navigation/types.ts @@ -16,6 +16,7 @@ export type RootStackParamList = { ModelSettings: undefined; RemoteServers: undefined; VoiceSettings: undefined; + TTSSettings: undefined; DeviceInfo: undefined; StorageSettings: undefined; SecuritySettings: undefined; diff --git a/src/screens/ChatScreen/ChatMessageArea.tsx b/src/screens/ChatScreen/ChatMessageArea.tsx index f7611cc00..374c80bc2 100644 --- a/src/screens/ChatScreen/ChatMessageArea.tsx +++ b/src/screens/ChatScreen/ChatMessageArea.tsx @@ -1,5 +1,6 @@ import React, { useState, useMemo } from 'react'; -import { View, FlatList, Text, Keyboard, ActivityIndicator, Platform } from 'react-native'; +import { View, FlatList, Text, Keyboard, ActivityIndicator, Platform, StyleSheet } from 'react-native'; +import { useTTSStore } from '../../stores/ttsStore'; import Icon from 'react-native-vector-icons/Feather'; import Animated, { FadeIn } from 'react-native-reanimated'; import { AttachStep } from 'react-native-spotlight-tour'; @@ -28,6 +29,10 @@ export type ChatMessageAreaProps = { export const ChatMessageArea: React.FC = ({ flatListRef, isNearBottomRef, chat, styles, colors, handleScroll, renderItem, chatSpotlight, }) => { + // Hide FlatList until initial layout + scroll is complete to prevent visible scroll jump + const [isListReady, setIsListReady] = useState(false); + const hasScrolledRef = React.useRef(false); + const interfaceMode = useTTSStore((s) => s.settings.interfaceMode); const tabNav = useNavigation>(); const [inputHeight, setInputHeight] = useState(84); const activeModelRepoId = chat.activeModelId?.split('/').slice(0, 2).join('/'); @@ -52,12 +57,26 @@ export const ChatMessageArea: React.FC = ({ ) : ( item.id} + extraData={interfaceMode} contentContainerStyle={styles.messageList} onScroll={handleScroll} - 
onContentSizeChange={(_w, _h) => { if (isNearBottomRef.current) flatListRef.current?.scrollToEnd({ animated: false }); }} + onContentSizeChange={(_w, h) => { + if (!hasScrolledRef.current && h > 0) { + // Initial layout: force scroll to bottom regardless of isNearBottom + flatListRef.current?.scrollToEnd({ animated: false }); + hasScrolledRef.current = true; + // Reveal after a frame so the scroll position settles + requestAnimationFrame(() => { + requestAnimationFrame(() => setIsListReady(true)); + }); + } else if (isNearBottomRef.current) { + flatListRef.current?.scrollToEnd({ animated: false }); + } + }} onLayout={() => { }} scrollEventThrottle={16} keyboardDismissMode="on-drag" @@ -140,3 +159,7 @@ export const ChatMessageArea: React.FC = ({ ); }; + +const hiddenStyle = StyleSheet.create({ + hidden: { opacity: 0 }, +}); diff --git a/src/screens/ChatScreen/ChatModalSection.tsx b/src/screens/ChatScreen/ChatModalSection.tsx index 301b3bdc0..76f90703a 100644 --- a/src/screens/ChatScreen/ChatModalSection.tsx +++ b/src/screens/ChatScreen/ChatModalSection.tsx @@ -83,6 +83,7 @@ export const ChatModalSection: React.FC = ({ onOpenProject={() => setShowProjectSelector(true)} onOpenGallery={imageCount > 0 ? () => navigation.navigate('Gallery', { conversationId: activeConversationId }) : undefined} onDeleteConversation={activeConversation ? 
handleDeleteConversation : undefined} + onOpenTTSSettings={() => { setShowSettingsPanel(false); navigation.navigate('TTSSettings'); }} conversationImageCount={imageCount} activeProjectName={activeProject?.name || null} isRemote={isRemote} diff --git a/src/screens/ChatScreen/MessageRenderer.tsx b/src/screens/ChatScreen/MessageRenderer.tsx index 5cf4a0cc1..e5511441c 100644 --- a/src/screens/ChatScreen/MessageRenderer.tsx +++ b/src/screens/ChatScreen/MessageRenderer.tsx @@ -1,7 +1,18 @@ -import React from 'react'; +import React, { useState } from 'react'; +import { View, StyleSheet } from 'react-native'; import { ChatMessage } from '../../components'; +import { AudioMessageBubble } from '../../components/AudioMessageBubble'; +import { TTSButton } from '../../components/TTSButton'; +import { AnimatedEntry } from '../../components/AnimatedEntry'; +import { useTTSStore } from '../../stores/ttsStore'; +import { stripControlTokens } from '../../utils/messageContent'; import { Message } from '../../types'; +import '../../types/tts'; import { ChatMessageItem } from './useChatScreen'; +import { parseThinkingContent, buildMessageData } from '../../components/ChatMessage/utils'; +import { ThinkingBlock } from '../../components/ChatMessage/components/ThinkingBlock'; +import { createStyles as createChatStyles } from '../../components/ChatMessage/styles'; +import { useThemedStyles } from '../../theme'; type MessageRendererProps = { item: Message | ChatMessageItem; @@ -19,31 +30,215 @@ type MessageRendererProps = { onImagePress: (uri: string) => void; }; -export const MessageRenderer: React.FC = ({ - item, - index, - displayMessagesLength, - animateLastN, - imageModelLoaded, - isStreaming, - isGeneratingImage, - showGenerationDetails, - onCopy, - onRetry, - onEdit, - onGenerateImage, - onImagePress, -}) => ( - 0 && index >= displayMessagesLength - animateLastN} - /> -); +/** Renders the thinking/reasoning block for audio mode without the ChatMessage bubble wrapper */ +const 
AudioModeThinkingBlock: React.FC<{ msg: Message }> = ({ msg }) => { + const chatStyles = useThemedStyles(createChatStyles); + const [showThinking, setShowThinking] = useState(false); + const { parsedContent } = buildMessageData(msg); + if (!parsedContent.thinking) return null; + return ( + + setShowThinking((v) => !v)} + styles={chatStyles} + /> + + ); +}; + +interface AudioBubbleProps { + messageId: string; + audioPath: string; + waveformData: number[]; + durationSeconds: number; + transcript: string; + _reasoningContent?: string; +} + +function buildAudioBubbleProps(msg: Message): AudioBubbleProps { + const transcript = stripControlTokens(msg.content); + console.log('[AudioBubble] buildProps: msgId=', msg.id, 'contentLen=', msg.content.length, 'transcriptLen=', transcript.length); + return { + messageId: msg.id, + audioPath: msg.audioPath ?? '', + waveformData: msg.waveformData ?? [], + durationSeconds: msg.audioDurationSeconds ?? 0, + transcript, + _reasoningContent: msg.reasoningContent, + }; +} + +/** Wraps content with AnimatedEntry if needed */ +function wrapAnimated(content: React.ReactElement, shouldAnimate: boolean): React.ReactElement { + return shouldAnimate ? 
{content} : content; +} + +/** Renders a user voice message as an audio bubble */ +function renderUserAudioBubble( + opts: { msg: Message; audioAtt: any; shouldAnimate: boolean }, + props: MessageRendererProps, +): React.ReactElement { + const { msg, audioAtt, shouldAnimate } = opts; + const bubble = ( + + props.onRetry(msg)} + /> + + ); + return wrapAnimated(bubble, shouldAnimate); +} + +/** Renders a streaming/thinking assistant message in audio mode as a ChatMessage */ +function renderAudioStreamingMessage( + msg: Message, + isStreamingThis: boolean, + props: MessageRendererProps, +): React.ReactElement { + return ( + + ); +} + +/** Renders a completed assistant audio bubble, with optional tool call UI */ +function renderAudioAssistantBubble( + msg: Message, + shouldAnimate: boolean, + props: MessageRendererProps, +): React.ReactElement { + const hasThinking = !!msg.reasoningContent || !!parseThinkingContent(msg.content).thinking; + const hasToolCalls = !!msg.toolCalls?.length; + + // For messages with tool calls, render as a regular ChatMessage (has proper tool call UI) + // followed by the audio bubble for the spoken text + if (hasToolCalls) { + const element = ( + + + + ); + return wrapAnimated(element, shouldAnimate); + } + + const bubble = ( + + {hasThinking && } + props.onRetry(msg)} + /> + + ); + return wrapAnimated(bubble, shouldAnimate); +} + +export const MessageRenderer: React.FC = (props) => { + const { + item, + index, + displayMessagesLength, + animateLastN, + imageModelLoaded, + isStreaming, + isGeneratingImage, + showGenerationDetails, + onCopy, + onRetry, + onEdit, + onGenerateImage, + onImagePress, + } = props; + + const ttsMode = useTTSStore((s) => s.settings.interfaceMode); + const msg = item as Message; + const animateEntry = animateLastN > 0 && index >= displayMessagesLength - animateLastN; + const isStreamingThis = item.id === 'streaming'; + + // User voice message: always show as audio bubble + if (msg.role === 'user') { + const audioAtt 
= msg.attachments?.find((a) => a.type === 'audio'); + if (audioAtt) { + return renderUserAudioBubble({ msg, audioAtt, shouldAnimate: animateEntry }, props); + } + } + + const isAudioAssistant = msg.role === 'assistant' && !msg.isSystemInfo; + + // Thinking placeholder + audio streaming + const isThinkingItem = !!(msg as any).isThinking; + if (isAudioAssistant && ttsMode === 'audio' && (isStreamingThis || isThinkingItem)) { + return renderAudioStreamingMessage(msg, isStreamingThis, props); + } + + // Audio Mode: show assistant messages as audio bubbles after streaming ends + if (isAudioAssistant && ttsMode === 'audio' && !isStreamingThis) { + return renderAudioAssistantBubble(msg, animateEntry, props); + } + + // Chat Mode: TTSButton lives in the meta row + const isPlainAssistant = msg.role === 'assistant' && !msg.isSystemInfo && !msg.toolCalls?.length; + const ttsMeta = isPlainAssistant && !isStreamingThis + ? + : undefined; + + return ( + + ); +}; + +const audioStyles = StyleSheet.create({ + userContainer: { + paddingRight: 16, + marginVertical: 8, + alignItems: 'flex-end', + }, + assistantContainer: { + paddingHorizontal: 16, + marginVertical: 8, + alignItems: 'flex-start', + }, +}); diff --git a/src/screens/ChatScreen/index.tsx b/src/screens/ChatScreen/index.tsx index 2be6468e8..bdf0c1380 100644 --- a/src/screens/ChatScreen/index.tsx +++ b/src/screens/ChatScreen/index.tsx @@ -1,5 +1,6 @@ import React, { useCallback, useEffect, useRef, useState } from 'react'; import { FlatList, KeyboardAvoidingView, InteractionManager } from 'react-native'; +import { useTTSStore } from '../../stores/ttsStore'; import { SafeAreaView } from 'react-native-safe-area-context'; import { useFocusEffect } from '@react-navigation/native'; import { useSpotlightTour } from 'react-native-spotlight-tour'; @@ -101,6 +102,22 @@ export const ChatScreen: React.FC = () => { setTimeout(() => { flatListRef.current?.scrollToEnd({ animated: true }); }, 100); } }, 
[chat.activeConversation?.messages.length]); + + // Reset scroll when switching between chat/audio interface modes + const interfaceMode = useTTSStore((s) => s.settings.interfaceMode); + const prevModeRef = React.useRef(interfaceMode); + React.useEffect(() => { + if (prevModeRef.current !== interfaceMode) { + prevModeRef.current = interfaceMode; + isNearBottomRef.current = true; + chat.setShowScrollToBottom(false); + // FlatList re-renders via extraData; onContentSizeChange fires and scrolls. + // Backup: scroll after items have had time to re-measure. + setTimeout(() => { flatListRef.current?.scrollToEnd({ animated: false }); }, 300); + } + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [interfaceMode]); + const alertEl = ( = Dispatch>; const FALLBACK_RECENT_MESSAGE_COUNT = 2; + +/** + * Appended to the system prompt when TTS audio mode is active. + * Guides the model to respond conversationally for voice output. + */ +const AUDIO_MODE_PROMPT_HINT = ` + +[VOICE MODE ACTIVE — your response will be spoken aloud via text-to-speech] +Respond as if you are speaking to the user in a natural conversation: +- Be concise and conversational — talk like a person, not a document +- Never use markdown formatting (no headers, bullets, bold, code blocks, tables) +- Never use special characters, symbols, or emoji that sound awkward when read aloud +- Use short sentences and natural spoken transitions ("So,", "Basically,", "Here's the thing —") +- If summarizing research or long content, give the key takeaways in a few spoken paragraphs, not an essay +- Numbers: say "about two thousand" not "~2,000" +- Keep responses under 2-3 paragraphs unless the user explicitly asks for detail +- Use expressive punctuation for natural prosody: exclamation marks for emphasis!, question marks for curiosity?, ellipses for pauses..., and vary sentence length for rhythm`; export type GenerationDeps = { activeModelId: string | null; activeModel: DownloadedModel | null | undefined; @@ 
-248,7 +265,13 @@ export async function startGenerationFn(deps: GenerationDeps, call: StartGenerat } const conversation = useChatStore.getState().conversations.find(c => c.id === targetConversationId); const { enabledTools, rawPrompt } = resolveToolsAndPrompt(deps, conversation); - const basePrompt = await injectRagContext(conversation?.projectId, messageText, rawPrompt); + let basePrompt = await injectRagContext(conversation?.projectId, messageText, rawPrompt); + + // In audio mode, append instructions for conversational voice-friendly responses + if (useTTSStore.getState().settings.interfaceMode === 'audio') { + basePrompt += AUDIO_MODE_PROMPT_HINT; + } + const isRemote = !!useRemoteServerStore.getState().activeRemoteTextModelId; const activeTools = enabledTools; const systemPrompt = applyGemma4ThinkToken( diff --git a/src/screens/ChatScreen/useChatMessageHandlers.ts b/src/screens/ChatScreen/useChatMessageHandlers.ts index c9ff7f1c4..f20d82379 100644 --- a/src/screens/ChatScreen/useChatMessageHandlers.ts +++ b/src/screens/ChatScreen/useChatMessageHandlers.ts @@ -1,6 +1,7 @@ import { Dispatch, SetStateAction } from 'react'; import { showAlert, AlertState } from '../../components'; import { Message } from '../../types'; +import { useTTSStore } from '../../stores/ttsStore'; import { regenerateResponseFn, executeDeleteConversationFn, handleImageGenerationFn, } from './useChatGenerationActions'; @@ -20,6 +21,8 @@ export async function handleRetryMessageFn( message: Message, genDeps: GenerationDeps, p: RetryParams, ): Promise { if (!p.activeConversationId || !p.hasActiveModel) return; + // Stop any in-flight TTS before deleting messages + useTTSStore.getState().stop(); const msgs = p.activeConversation?.messages || []; if (message.role === 'user') { const idx = msgs.findIndex((m: Message) => m.id === message.id); diff --git a/src/screens/ChatScreen/useChatScreen.ts b/src/screens/ChatScreen/useChatScreen.ts index e543b7e5c..cd426c75d 100644 --- 
a/src/screens/ChatScreen/useChatScreen.ts +++ b/src/screens/ChatScreen/useChatScreen.ts @@ -1,7 +1,9 @@ import { useCallback, useEffect, useMemo, useRef, useState } from 'react'; +import { AppState } from 'react-native'; import { useNavigation, useRoute, RouteProp } from '@react-navigation/native'; import { AlertState, initialAlertState } from '../../components'; -import { useAppStore, useChatStore, useProjectStore, useRemoteServerStore } from '../../stores'; +import { useAppStore, useChatStore, useProjectStore, useRemoteServerStore, useTTSStore } from '../../stores'; +import '../../types/tts'; import logger from '../../utils/logger'; import { llmService, generationService, imageGenerationService, activeModelService, @@ -15,10 +17,16 @@ import { startGenerationFn, handleSendFn, handleStopFn, handleSelectProjectFn } import { handleRetryMessageFn, handleEditMessageFn, handleDeleteConversationFn, handleGenerateImageFromMsgFn } from './useChatMessageHandlers'; import { getDisplayMessages, getPlaceholderText, ChatMessageItem, StreamingState } from './types'; import { saveImageToGallery } from './useSaveImage'; +import { stripControlTokens, stripMarkdownForSpeech } from '../../utils/messageContent'; export type { AlertState, ChatMessageItem, StreamingState }; export { getDisplayMessages, getPlaceholderText }; +function _triggerAudioModeGeneration(conversationId: string, messageId: string, content: string) { + useChatStore.getState().updateMessageAudio(conversationId, messageId, { isAudioModeMessage: true }); + useTTSStore.getState().speak(stripMarkdownForSpeech(stripControlTokens(content)), messageId); +} + type ChatScreenRouteProp = RouteProp; type ActiveModelInfo = { @@ -53,6 +61,26 @@ export const useChatScreen = () => { const [isCompacting, setIsCompacting] = useState(false); const lastMessageCountRef = useRef(0); const generatingForConversationRef = useRef(null); + + // Stop TTS when navigating away, app backgrounded, or screen locked + useEffect(() => { + const 
unsubBlur = navigation.addListener('blur', () => { + useTTSStore.getState().stop(); + }); + // beforeRemove fires on back button — more reliable than blur for native-stack + const unsubRemove = navigation.addListener('beforeRemove', () => { + useTTSStore.getState().stop(); + }); + const appStateSub = AppState.addEventListener('change', (nextState) => { + const tts = useTTSStore.getState(); + if (nextState !== 'active') { + if (tts.isSpeaking && !tts.isPaused) { tts.pause(); } + } else { + if (tts.isSpeaking && tts.isPaused) { tts.resume(); } + } + }); + return () => { unsubBlur(); unsubRemove(); appStateSub.remove(); }; + }, [navigation]); const modelLoadStartTimeRef = useRef(null); const startGenerationRef = useRef<(id: string, text: string) => Promise>(null as any); const addMessageRef = useRef(null as any); @@ -193,6 +221,40 @@ export const useChatScreen = () => { lastMessageCountRef.current = curr; }, [displayMessages.length]); useEffect(() => { lastMessageCountRef.current = 0; setAnimateLastN(0); }, [activeConversationId]); + const prevStreamingRef = useRef(false); + + // Stop any in-flight TTS when a new streaming response begins + useEffect(() => { + if (isStreamingForThisConversation && useTTSStore.getState().isSpeaking) { + useTTSStore.getState().stop(); + } + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [isStreamingForThisConversation]); + + // When streaming ends, speak the full response as a single TTS call + useEffect(() => { + const was = prevStreamingRef.current; + prevStreamingRef.current = isStreamingForThisConversation; + if (!was || isStreamingForThisConversation || !activeConversationId) return; + const tts = useTTSStore.getState(); + if (tts.settings.interfaceMode !== 'audio') return; + const conv = useChatStore.getState().conversations.find((c) => c.id === activeConversationId); + const last = (conv?.messages ?? 
[]).at(-1); + if (!last || last.role !== 'assistant' || last.isSystemInfo || last.toolCalls?.length || last.audioPath) return; + // Stamp as audio-mode. Estimate duration from word count (avg 2.5 words/sec) + const wordCount = last.content.split(/\s+/).filter(Boolean).length; + const speed = useTTSStore.getState().settings.speed || 1; + const estDuration = Math.max(1, wordCount / (2.5 * speed)); + useChatStore.getState().updateMessageAudio(activeConversationId, last.id, { + isAudioModeMessage: true, + audioDurationSeconds: estDuration, + }); + if (!tts.isReady) return; + const fullText = stripMarkdownForSpeech(stripControlTokens(last.content)).trim(); + if (fullText) { + useTTSStore.getState().speak(fullText, last.id); + } + }, [isStreamingForThisConversation]); // eslint-disable-line react-hooks/exhaustive-deps const startGeneration = async (targetConversationId: string, messageText: string) => { await startGenerationFn(genDeps, { setDebugInfo, targetConversationId, messageText }); diff --git a/src/screens/DownloadManagerScreen/index.tsx b/src/screens/DownloadManagerScreen/index.tsx index 3829299fa..46c2312ff 100644 --- a/src/screens/DownloadManagerScreen/index.tsx +++ b/src/screens/DownloadManagerScreen/index.tsx @@ -1,5 +1,5 @@ -import React from 'react'; -import { View, Text, FlatList, TouchableOpacity, RefreshControl } from 'react-native'; +import React, { useState, useCallback } from 'react'; +import { View, Text, FlatList, TouchableOpacity, RefreshControl, ScrollView } from 'react-native'; import { SafeAreaView } from 'react-native-safe-area-context'; import Icon from 'react-native-vector-icons/Feather'; import { Card } from '../../components'; @@ -7,13 +7,35 @@ import { CustomAlert, hideAlert } from '../../components/CustomAlert'; import { useTheme, useThemedStyles } from '../../theme'; import { useNavigation } from '@react-navigation/native'; import { createStyles } from './styles'; -import { ActiveDownloadCard, CompletedDownloadCard, formatBytes } from 
'./items'; +import { ActiveDownloadCard, CompletedDownloadCard, formatBytes, type DownloadItem } from './items'; import { useDownloadManager } from './useDownloadManager'; +type FilterType = 'all' | 'text' | 'vision' | 'image' | 'tts' | 'stt'; + +const FILTERS: { id: FilterType; label: string }[] = [ + { id: 'all', label: 'All' }, + { id: 'text', label: 'Text' }, + { id: 'vision', label: 'Vision' }, + { id: 'image', label: 'Image Gen' }, + { id: 'tts', label: 'Text to Speech' }, + { id: 'stt', label: 'Speech to Text' }, +]; + +function matchesFilter(item: DownloadItem, filter: FilterType): boolean { + if (filter === 'all') return true; + if (filter === 'vision') return item.modelType === 'text' && !!item.isVisionModel; + if (filter === 'text') return item.modelType === 'text' && !item.isVisionModel; + if (filter === 'image') return item.modelType === 'image'; + if (filter === 'tts') return item.modelType === 'tts'; + if (filter === 'stt') return item.modelType === 'stt'; + return true; +} + export const DownloadManagerScreen: React.FC = () => { const navigation = useNavigation(); const { colors } = useTheme(); const styles = useThemedStyles(createStyles); + const [activeFilter, setActiveFilter] = useState('all'); const { isRefreshing, activeItems, @@ -27,6 +49,30 @@ export const DownloadManagerScreen: React.FC = () => { totalStorageUsed, } = useDownloadManager(); + const filteredActive = activeItems.filter(item => matchesFilter(item, activeFilter)); + const filteredCompleted = completedItems.filter(item => matchesFilter(item, activeFilter)); + + const renderHeader = useCallback(() => ( + + {FILTERS.map(f => { + const active = activeFilter === f.id; + return ( + setActiveFilter(f.id)} + > + {f.label} + + ); + })} + + ), [activeFilter, colors, styles]); + return ( @@ -39,52 +85,47 @@ export const DownloadManagerScreen: React.FC = () => { ( - {/* Active Downloads */} - - - - Active Downloads - - {activeItems.length} + {/* Active Downloads — only show when there are 
active items */} + {filteredActive.length > 0 && ( + + + + Active Downloads + + {filteredActive.length} + - - {activeItems.length > 0 ? ( - activeItems.map(item => ( + {filteredActive.map(item => ( - )) - ) : ( - - - No active downloads - - )} - + ))} + + )} - {/* Completed Downloads */} + {/* Downloaded Models */} - + Downloaded Models - {completedItems.length} + {filteredCompleted.length} - {completedItems.length > 0 ? ( - completedItems.map(item => ( + {filteredCompleted.length > 0 ? ( + filteredCompleted.map(item => ( )) ) : ( - - No models downloaded yet - - Go to the Models tab to browse and download models + + + {activeFilter === 'all' ? 'No models downloaded yet' : `No ${FILTERS.find(f => f.id === activeFilter)?.label ?? ''} models`} )} diff --git a/src/screens/DownloadManagerScreen/items.tsx b/src/screens/DownloadManagerScreen/items.tsx index f2d20d809..8cc45992a 100644 --- a/src/screens/DownloadManagerScreen/items.tsx +++ b/src/screens/DownloadManagerScreen/items.tsx @@ -12,7 +12,7 @@ import { createStyles } from './styles'; export type DownloadItem = { type: 'active' | 'completed'; - modelType: 'text' | 'image'; + modelType: 'text' | 'image' | 'tts' | 'stt'; downloadId?: number; modelId: string; fileName: string; @@ -222,9 +222,9 @@ export const CompletedDownloadCard: React.FC = ({ it diff --git a/src/screens/DownloadManagerScreen/styles.ts b/src/screens/DownloadManagerScreen/styles.ts index 39120fa09..8f40c2839 100644 --- a/src/screens/DownloadManagerScreen/styles.ts +++ b/src/screens/DownloadManagerScreen/styles.ts @@ -33,17 +33,17 @@ export const createStyles = (colors: ThemeColors, shadows: ThemeShadows) => ({ flex: 1, }, listContent: { - paddingTop: SPACING.lg, + paddingTop: SPACING.md, paddingBottom: SPACING.xxl, }, section: { - marginBottom: SPACING.xl, + marginBottom: SPACING.md, }, sectionHeader: { flexDirection: 'row' as const, alignItems: 'center' as const, paddingHorizontal: SPACING.lg, - marginBottom: SPACING.md, + marginBottom: SPACING.sm, 
gap: SPACING.sm, }, sectionTitle: { @@ -63,7 +63,7 @@ export const createStyles = (colors: ThemeColors, shadows: ThemeShadows) => ({ }, downloadCard: { marginHorizontal: SPACING.lg, - marginBottom: SPACING.md, + marginBottom: SPACING.sm, }, downloadHeader: { flexDirection: 'row' as const, @@ -160,19 +160,47 @@ export const createStyles = (colors: ThemeColors, shadows: ThemeShadows) => ({ emptyCard: { marginHorizontal: SPACING.lg, alignItems: 'center' as const, - paddingVertical: SPACING.xxl, - gap: SPACING.sm, + paddingVertical: SPACING.xl, + gap: SPACING.xs, }, emptyText: { - ...TYPOGRAPHY.body, - color: colors.textSecondary, - marginTop: SPACING.sm, + ...TYPOGRAPHY.bodySmall, + color: colors.textMuted, + marginTop: SPACING.xs, }, emptySubtext: { - ...TYPOGRAPHY.bodySmall, + ...TYPOGRAPHY.meta, color: colors.textMuted, textAlign: 'center' as const, }, + filterBarContent: { + flexDirection: 'row' as const, + alignItems: 'center' as const, + paddingHorizontal: SPACING.lg, + paddingVertical: SPACING.sm, + gap: SPACING.xs, + }, + filterChip: { + flexDirection: 'row' as const, + alignItems: 'center' as const, + paddingHorizontal: SPACING.sm + 2, + paddingVertical: 5, + borderRadius: 12, + borderWidth: 1, + borderColor: colors.border, + backgroundColor: colors.background, + }, + filterChipActive: { + borderColor: colors.primary, + backgroundColor: `${colors.primary}15`, + }, + filterChipText: { + ...TYPOGRAPHY.meta, + color: colors.textSecondary, + }, + filterChipTextActive: { + color: colors.primary, + }, storageSection: { paddingHorizontal: SPACING.lg, }, diff --git a/src/screens/ModelSettingsScreen/ImageGenerationSection.tsx b/src/screens/ModelSettingsScreen/ImageGenerationSection.tsx index ea7c9306d..4d84b1309 100644 --- a/src/screens/ModelSettingsScreen/ImageGenerationSection.tsx +++ b/src/screens/ModelSettingsScreen/ImageGenerationSection.tsx @@ -1,7 +1,7 @@ import React, { useState } from 'react'; import { View, Text, Switch, Platform, TouchableOpacity } from 
'react-native'; -import Slider from '@react-native-community/slider'; import { AdvancedToggle, Card } from '../../components'; +import { NumericStepper } from '../../components/NumericStepper'; import { Button } from '../../components/Button'; import { useTheme, useThemedStyles } from '../../theme'; import { useAppStore } from '../../stores'; @@ -114,49 +114,28 @@ const DetectionMethodRow: React.FC = () => { // ─── Advanced Section ──────────────────────────────────────────────────────── const ImageAdvancedSection: React.FC = () => { - const { colors } = useTheme(); const styles = useThemedStyles(createStyles); const { settings, updateSettings } = useAppStore(); return ( <> - - Guidance Scale - {(settings?.imageGuidanceScale || 7.5).toFixed(1)} - + Guidance Scale Higher = follows prompt more strictly - updateSettings({ imageGuidanceScale: value })} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surface} - thumbTintColor={colors.primary} + min={1} max={20} step={0.5} decimals={1} + onChange={(value) => updateSettings({ imageGuidanceScale: value })} /> - - Image Threads - {settings?.imageThreads ?? 
4} - - - CPU threads used for image generation (applies on next image model load) - - Image Threads + CPU threads used for image generation (applies on next image model load) + updateSettings({ imageThreads: value })} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surface} - thumbTintColor={colors.primary} + min={1} max={8} step={1} + onChange={(value) => updateSettings({ imageThreads: value })} /> @@ -212,40 +191,23 @@ export const ImageGenerationSection: React.FC = () => { - - Image Steps - {settings?.imageSteps || 8} - + Image Steps More steps = better quality but slower (4-8 fast, 20-50 high quality) - updateSettings({ imageSteps: value })} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surface} - thumbTintColor={colors.primary} + min={4} max={50} step={1} + onChange={(value) => updateSettings({ imageSteps: value })} /> - - Image Size - {settings?.imageWidth ?? 256}x{settings?.imageHeight ?? 256} - + Image Size Output resolution (smaller = faster, larger = more detail) - updateSettings({ imageWidth: value, imageHeight: value })} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surface} - thumbTintColor={colors.primary} + min={128} max={512} step={64} + formatValue={(v) => `${v}x${v}`} + onChange={(value) => updateSettings({ imageWidth: value, imageHeight: value })} /> diff --git a/src/screens/ModelSettingsScreen/TextGenerationAdvanced.tsx b/src/screens/ModelSettingsScreen/TextGenerationAdvanced.tsx index 33faa2293..e1387488b 100644 --- a/src/screens/ModelSettingsScreen/TextGenerationAdvanced.tsx +++ b/src/screens/ModelSettingsScreen/TextGenerationAdvanced.tsx @@ -1,7 +1,7 @@ import React from 'react'; import { View, Text, Switch, Platform } from 'react-native'; -import Slider from '@react-native-community/slider'; import { Button } from '../../components/Button'; +import { NumericStepper } from '../../components/NumericStepper'; import { useTheme, useThemedStyles } from '../../theme'; 
import { useAppStore } from '../../stores'; import { CacheType } from '../../types'; @@ -52,24 +52,15 @@ const GpuSection: React.FC = ({ {isGpuEnabled && ( - - GPU Layers - {gpuLayersEffective} - + GPU Layers Layers offloaded to GPU. Higher = faster but may crash on low-VRAM devices. - updateSettings({ gpuLayers: value })} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surface} - thumbTintColor={colors.primary} + min={1} max={GPU_LAYERS_MAX} step={1} + onChange={(value) => updateSettings({ gpuLayers: value })} /> )} @@ -207,78 +198,42 @@ export const TextGenerationAdvanced: React.FC = () => { return ( <> - - Top P - {(settings?.topP || 0.9).toFixed(2)} - + Top P Nucleus sampling threshold - updateSettings({ topP: value })} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surface} - thumbTintColor={colors.primary} + min={0.1} max={1.0} step={0.05} decimals={2} + onChange={(value) => updateSettings({ topP: value })} /> - - Repeat Penalty - {(settings?.repeatPenalty || 1.1).toFixed(2)} - + Repeat Penalty Penalize repeated tokens - updateSettings({ repeatPenalty: value })} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surface} - thumbTintColor={colors.primary} + min={1.0} max={2.0} step={0.05} decimals={2} + onChange={(value) => updateSettings({ repeatPenalty: value })} /> - - CPU Threads - {settings?.nThreads || 6} - + CPU Threads Parallel threads for inference - updateSettings({ nThreads: value })} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surface} - thumbTintColor={colors.primary} + min={1} max={12} step={1} + onChange={(value) => updateSettings({ nThreads: value })} /> - - Batch Size - {settings?.nBatch || 256} - + Batch Size Tokens processed per batch - updateSettings({ nBatch: value })} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surface} - thumbTintColor={colors.primary} + min={32} max={512} step={32} + onChange={(value) => 
updateSettings({ nBatch: value })} /> diff --git a/src/screens/ModelSettingsScreen/TextGenerationSection.tsx b/src/screens/ModelSettingsScreen/TextGenerationSection.tsx index 5b1d9099f..3ae132f43 100644 --- a/src/screens/ModelSettingsScreen/TextGenerationSection.tsx +++ b/src/screens/ModelSettingsScreen/TextGenerationSection.tsx @@ -1,7 +1,7 @@ import React, { useState } from 'react'; import { View, Text, Switch } from 'react-native'; -import Slider from '@react-native-community/slider'; import { AdvancedToggle, Card } from '../../components'; +import { NumericStepper } from '../../components/NumericStepper'; import { useTheme, useThemedStyles } from '../../theme'; import { useAppStore } from '../../stores'; import { createStyles } from './styles'; @@ -26,56 +26,40 @@ export const TextGenerationSection: React.FC = () => { const contextLengthLabel = contextLength >= 1024 ? `${(contextLength / 1024).toFixed(0)}K` : String(contextLength); - const ctxSliderMax = modelMaxContext || FALLBACK_MAX_CONTEXT; + const ctxMax = modelMaxContext || FALLBACK_MAX_CONTEXT; return ( Configure LLM behavior for text responses. 
- {/* ── Basic Settings ── */} - Temperature - {(settings?.temperature || 0.7).toFixed(2)} Higher = more creative, Lower = more focused - updateSettings({ temperature: value })} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surface} - thumbTintColor={colors.primary} + min={0} max={2} step={0.05} decimals={2} + onChange={(value) => updateSettings({ temperature: value })} /> Max Tokens - {maxTokensLabel} Maximum response length - updateSettings({ maxTokens: value })} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surface} - thumbTintColor={colors.primary} + min={64} max={8192} step={64} + formatValue={() => maxTokensLabel} + onChange={(value) => updateSettings({ maxTokens: value })} /> Context Length - {contextLengthLabel} KV cache size — larger uses more RAM (requires reload) {contextLength > HIGH_CONTEXT_THRESHOLD && ( @@ -83,16 +67,11 @@ export const TextGenerationSection: React.FC = () => { High context uses significant RAM and may crash on some devices )} - updateSettings({ contextLength: value })} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surface} - thumbTintColor={colors.primary} + min={512} max={ctxMax} step={1024} + formatValue={() => contextLengthLabel} + onChange={(value) => updateSettings({ contextLength: value })} /> diff --git a/src/screens/ModelSettingsScreen/index.tsx b/src/screens/ModelSettingsScreen/index.tsx index e0aefc790..319c93026 100644 --- a/src/screens/ModelSettingsScreen/index.tsx +++ b/src/screens/ModelSettingsScreen/index.tsx @@ -33,6 +33,7 @@ export const ModelSettingsScreen: React.FC = () => { const task = InteractionManager.runAfterInteractions(() => goTo(pending)); return () => task.cancel(); } + // eslint-disable-next-line react-hooks/exhaustive-deps }, []); const handleReset = () => { diff --git a/src/screens/SettingsScreen.tsx b/src/screens/SettingsScreen.tsx index f1cd721a0..353c9b238 100644 --- a/src/screens/SettingsScreen.tsx +++ 
b/src/screens/SettingsScreen.tsx @@ -151,6 +151,7 @@ export const SettingsScreen: React.FC = () => { { icon: 'wifi', title: 'Remote Servers', desc: 'Connect to Ollama, LM Studio, and more', screen: 'RemoteServers' as const }, // { icon: 'search', title: 'Web Search', desc: 'Configure search API key for reliable results', screen: 'WebSearchSettings' as const }, { icon: 'mic', title: 'Voice Transcription', desc: 'On-device speech to text', screen: 'VoiceSettings' as const }, + { icon: 'volume-2', title: 'Text to Speech', desc: 'On-device voice responses', screen: 'TTSSettings' as const }, { icon: 'lock', title: 'Security', desc: 'Passphrase and app lock', screen: 'SecuritySettings' as const }, { icon: 'smartphone', title: 'Device Information', desc: 'Hardware and compatibility', screen: 'DeviceInfo' as const }, { icon: 'hard-drive', title: 'Storage', desc: 'Models and data usage', screen: 'StorageSettings' as const }, diff --git a/src/screens/TTSSettingsScreen/index.tsx b/src/screens/TTSSettingsScreen/index.tsx new file mode 100644 index 000000000..a00ca7773 --- /dev/null +++ b/src/screens/TTSSettingsScreen/index.tsx @@ -0,0 +1,441 @@ +import React, { useEffect, useState } from 'react'; +import { View, Text, ScrollView, TouchableOpacity, Switch, ActivityIndicator } from 'react-native'; +import { SafeAreaView } from 'react-native-safe-area-context'; +import Icon from 'react-native-vector-icons/Feather'; +import { NumericStepper } from '../../components/NumericStepper'; +import { useNavigation } from '@react-navigation/native'; +import { Card, Button } from '../../components'; +import { CustomAlert, showAlert, hideAlert, AlertState, initialAlertState } from '../../components/CustomAlert'; +import { useTheme, useThemedStyles } from '../../theme'; +import type { ThemeColors, ThemeShadows } from '../../theme'; +import { TYPOGRAPHY, SPACING } from '../../constants'; +import { useTTSStore } from '../../stores/ttsStore'; +import { ttsRegistry } from '../../engine'; +import { 
hardwareService } from '../../services/hardware'; +import { TTS_WARN_RAM_GB, TTS_BLOCK_RAM_GB } from '../../constants/ttsModels'; +import type { InterfaceMode } from '../../stores/ttsStore'; + +// ─── Sub-components ─────────────────────────────────────────────────────────── + +type Styles = ReturnType; + +const AssetRow: React.FC<{ + label: string; + sizeMB: number; + status: string; + progress: number; + styles: Styles; + colors: ThemeColors; + border?: boolean; +}> = ({ label, sizeMB, status, progress, styles, colors, border }) => ( + + + + {label} + {sizeMB} MB + + {status === 'downloaded' && } + {status === 'downloading' && {Math.round(progress * 100)}%} + {status === 'not-downloaded' && } + {status === 'error' && } + + {status === 'downloading' && ( + + + + )} + +); + +const InterfaceModeCard: React.FC<{ + mode: InterfaceMode; + deviceBlocked: boolean; + engineReady: boolean; + onModeChange: (m: InterfaceMode) => void; + styles: Styles; +}> = ({ mode, deviceBlocked, engineReady, onModeChange, styles }) => ( + + Interface Mode + + Audio Mode renders responses as voice notes. Chat Mode adds a play button to text bubbles. + + + {(['chat', 'audio'] as InterfaceMode[]).map((m) => { + const active = mode === m; + const blocked = m === 'audio' && (deviceBlocked || !engineReady); + return ( + onModeChange(m)} + disabled={blocked} + > + + {m === 'chat' ? 'Chat' : 'Audio'} + + + ); + })} + + {!engineReady && ( + Download models below to enable Audio Mode. 
+ )} + +); + +const PlaybackCard: React.FC<{ + settings: ReturnType['settings']; + onUpdate: (patch: Partial['settings']>) => void; + colors: ThemeColors; + styles: Styles; +}> = ({ settings, onUpdate, colors, styles }) => ( + + Playback + Speed + `${v.toFixed(1)}x`} + onChange={(v) => onUpdate({ speed: v })} + /> + {settings.interfaceMode === 'chat' && ( + + + Auto-play + Speak AI responses automatically + + onUpdate({ autoPlay: v })} + trackColor={{ true: colors.primary }} + /> + + )} + +); + +const CompatibilityCard: React.FC<{ + ramGB: number; + deviceBlocked: boolean; + deviceWarning: boolean; + styles: Styles; + colors: ThemeColors; +}> = ({ ramGB, deviceBlocked, deviceWarning, styles, colors }) => { + if (!deviceWarning && !deviceBlocked) { return null; } + return ( + + + + + {deviceBlocked + ? `TTS requires at least ${TTS_BLOCK_RAM_GB} GB RAM. Your device has ${ramGB.toFixed(1)} GB.` + : `Your device (${ramGB.toFixed(1)} GB RAM) may run TTS but performance could be slow. 8 GB recommended.`} + + + + ); +}; + +const EnginePickerCard: React.FC<{ + styles: Styles; + colors: ThemeColors; +}> = ({ styles, colors }) => { + const { settings, setEngine } = useTTSStore(); + const engineIds = ttsRegistry.getRegisteredIds(); + + const handleSelect = async (id: string) => { + if (id === settings.engineId) return; + await setEngine(id); + }; + + return ( + + Engine + + Choose which on-device TTS engine powers speech synthesis. + + {engineIds.map((id, i) => { + const engine = ttsRegistry.getEngine(id); + const active = id === settings.engineId; + const supported = engine.isSupported(); + return ( + 0 && styles.voiceRowBorder]} + onPress={() => handleSelect(id)} + disabled={!supported} + > + + + {engine.displayName} + + + {engine.capabilities.peakRamMB} MB + {engine.capabilities.voiceCloning ? ' · Voice cloning' : ''} + {engine.capabilities.streaming ? ' · Streaming' : ''} + {!supported ? 
' · Not supported on this device' : ''} + + + {active && } + + ); + })} + + ); +}; + +const VoiceCard: React.FC<{ + styles: Styles; + colors: ThemeColors; +}> = ({ styles, colors }) => { + const { voices, activeVoiceId, isReady, isDownloading, overallDownloadProgress, setVoice } = useTTSStore(); + + return ( + + + Voice + {isDownloading && overallDownloadProgress > 0 && ( + {Math.round(overallDownloadProgress * 100)}% + )} + {!isReady && !isDownloading && ( + + )} + {isReady && ( + + )} + + + Fast on-device voice synthesis. Used for the speak button in Chat Mode. + + {voices.map((voice, i) => { + const active = activeVoiceId === voice.id; + return ( + 0 && styles.voiceRowBorder]} + onPress={() => setVoice(voice.id)} + > + + {voice.label} + + {voice.metadata.accent ? `${voice.metadata.accent} · ` : ''} + {voice.metadata.gender || ''} + + + {active && } + + ); + })} + + ); +}; + +// ─── Main screen ────────────────────────────────────────────────────────────── + +export const TTSSettingsScreen: React.FC = () => { + const navigation = useNavigation(); + const { colors } = useTheme(); + const styles = useThemedStyles(createStyles); + const [alertState, setAlertState] = useState(initialAlertState); + const [ramGB, setRamGB] = useState(8); + + const { + assets, isReady, isDownloading, isLoading, + audioCacheSizeMB, settings, error, + downloadModels, deleteModels, + checkDownloadStatus, refreshCacheSize, clearAudioCache, updateSettings, clearError, + initializeEngine, + } = useTTSStore(); + + useEffect(() => { + setRamGB(hardwareService.getTotalMemoryGB()); + checkDownloadStatus(); + refreshCacheSize(); + // eslint-disable-next-line react-hooks/exhaustive-deps + }, []); + + const deviceBlocked = ramGB < TTS_BLOCK_RAM_GB; + const deviceWarning = !deviceBlocked && ramGB < TTS_WARN_RAM_GB; + const totalSizeMB = Math.round(assets.reduce((sum, a) => sum + a.asset.sizeBytes, 0) / (1024 * 1024)); + const allDownloaded = assets.every(a => a.status === 'downloaded'); + + const 
handleDelete = () => { + setAlertState( + showAlert('Remove TTS Models', 'This will delete all model files and disable text-to-speech.', [ + { text: 'Cancel', style: 'cancel' }, + { text: 'Remove', style: 'destructive', onPress: () => { setAlertState(hideAlert()); deleteModels(); } }, + ]), + ); + }; + + const handleClearCache = () => { + setAlertState( + showAlert('Clear Audio Cache', `This will delete ${audioCacheSizeMB.toFixed(1)} MB of cached audio.`, [ + { text: 'Cancel', style: 'cancel' }, + { text: 'Clear', style: 'destructive', onPress: () => { setAlertState(hideAlert()); clearAudioCache(); } }, + ]), + ); + }; + + const handleModeChange = (mode: InterfaceMode) => { + if (mode === 'audio' && deviceBlocked) return; + updateSettings({ interfaceMode: mode }); + if (mode === 'audio') initializeEngine(); + }; + + return ( + + + navigation.goBack()}> + + + Text to Speech + {isLoading && } + + + + + + + + + {settings.interfaceMode === 'chat' && ( + + + + Enable TTS + Show play buttons on assistant messages + + updateSettings({ enabled: v })} trackColor={{ true: colors.primary }} /> + + + )} + + + Models{totalSizeMB > 0 ? ` (${totalSizeMB} MB total)` : ''} + {assets.map((assetState, i) => ( + 0} + /> + ))} + + {allDownloaded + ?