diff --git a/.changeset/add-zai-glm-4-7-cerebras-model.md b/.changeset/add-zai-glm-4-7-cerebras-model.md deleted file mode 100644 index 141553f5f1d..00000000000 --- a/.changeset/add-zai-glm-4-7-cerebras-model.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"kilo-code": patch ---- - -Add `zai-glm-4.7` to Cerebras models diff --git a/.changeset/cli-followup-slash-commands.md b/.changeset/cli-followup-slash-commands.md new file mode 100644 index 00000000000..0805250ec39 --- /dev/null +++ b/.changeset/cli-followup-slash-commands.md @@ -0,0 +1,5 @@ +--- +"@kilocode/cli": patch +--- + +Fix slash commands being intercepted by followup suggestions during `ask_followup_question` prompts. diff --git a/.changeset/cute-flies-dance.md b/.changeset/cute-flies-dance.md deleted file mode 100644 index 753c120f407..00000000000 --- a/.changeset/cute-flies-dance.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"kilo-code": patch ---- - -Improved prompt caching when using Anthropic models on OpenRouter with native tool calling diff --git a/.changeset/fine-cameras-enter.md b/.changeset/fine-cameras-enter.md new file mode 100644 index 00000000000..2061eddb197 --- /dev/null +++ b/.changeset/fine-cameras-enter.md @@ -0,0 +1,5 @@ +--- +"kilo-code": patch +--- + +Allow users to pick an input device for Speech-to-Text input diff --git a/CHANGELOG.md b/CHANGELOG.md index 638ca212ae5..e2f61f2dbba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,17 @@ # kilo-code +## 4.143.2 + +### Patch Changes + +- [#4833](https://github.com/Kilo-Org/kilocode/pull/4833) [`2c7cd08`](https://github.com/Kilo-Org/kilocode/commit/2c7cd084bf4707eedda61fed554cf15fcc8b065b) Thanks [@sebastiand-cerebras](https://github.com/sebastiand-cerebras)! - Add `zai-glm-4.7` to Cerebras models + +- [#4853](https://github.com/Kilo-Org/kilocode/pull/4853) [`435c879`](https://github.com/Kilo-Org/kilocode/commit/435c879a29d55b75f5f6ffe7bf14854630e085cb) Thanks [@chrarnoldus](https://github.com/chrarnoldus)! 
- Improved prompt caching when using Anthropic models on OpenRouter with native tool calling + +- [#4859](https://github.com/Kilo-Org/kilocode/pull/4859) [`35fb2ad`](https://github.com/Kilo-Org/kilocode/commit/35fb2adc65dfb1e71e28f7368f96765062c43579) Thanks [@marius-kilocode](https://github.com/marius-kilocode)! - Fix Architect mode unnecessarily switching to Code mode to edit markdown files + +- [#4829](https://github.com/Kilo-Org/kilocode/pull/4829) [`4e09e36`](https://github.com/Kilo-Org/kilocode/commit/4e09e36bba165a2ab6f5e07f71a420faa49ea3ec) Thanks [@marius-kilocode](https://github.com/marius-kilocode)! - Fix browser action results displaying raw base64 screenshot data as hexadecimal garbage + ## 4.143.1 ### Patch Changes diff --git a/cli/CHANGELOG.md b/cli/CHANGELOG.md index 9e8cbf4ff42..60e44509f7b 100644 --- a/cli/CHANGELOG.md +++ b/cli/CHANGELOG.md @@ -1,5 +1,11 @@ # @kilocode/cli +## 0.19.2 + +### Patch Changes + +- [#4829](https://github.com/Kilo-Org/kilocode/pull/4829) [`4e09e36`](https://github.com/Kilo-Org/kilocode/commit/4e09e36bba165a2ab6f5e07f71a420faa49ea3ec) Thanks [@marius-kilocode](https://github.com/marius-kilocode)! - Fix browser action results displaying raw base64 screenshot data as hexadecimal garbage + ## 0.19.1 ### Patch Changes diff --git a/cli/README.md b/cli/README.md index 2a9a9a7f101..74370cce808 100644 --- a/cli/README.md +++ b/cli/README.md @@ -245,7 +245,7 @@ To build and run the CLI locally off your branch: cd src pnpm bundle pnpm vsix -pnpm vsix:unpackged +pnpm vsix:unpacked cd .. 
``` diff --git a/cli/package.dist.json b/cli/package.dist.json index 6ef1d55840a..78d970a51ac 100644 --- a/cli/package.dist.json +++ b/cli/package.dist.json @@ -1,6 +1,6 @@ { "name": "@kilocode/cli", - "version": "0.19.1", + "version": "0.19.2", "description": "Terminal User Interface for Kilo Code", "type": "module", "main": "index.js", diff --git a/cli/package.json b/cli/package.json index 5c90ddba5c7..4cb3c88a356 100644 --- a/cli/package.json +++ b/cli/package.json @@ -1,6 +1,6 @@ { "name": "@kilocode/cli", - "version": "0.19.1", + "version": "0.19.2", "description": "Terminal User Interface for Kilo Code", "type": "module", "main": "dist/index.js", diff --git a/cli/src/state/atoms/__tests__/keyboard.test.ts b/cli/src/state/atoms/__tests__/keyboard.test.ts index b776426ba4b..333bd44a6a1 100644 --- a/cli/src/state/atoms/__tests__/keyboard.test.ts +++ b/cli/src/state/atoms/__tests__/keyboard.test.ts @@ -7,6 +7,8 @@ import { argumentSuggestionsAtom, selectedIndexAtom, fileMentionSuggestionsAtom, + setFollowupSuggestionsAtom, + followupSuggestionsAtom, } from "../ui.js" import { textBufferStringAtom, textBufferStateAtom } from "../textBuffer.js" import { @@ -1151,4 +1153,104 @@ describe("keypress atoms", () => { expect(store.get(exitPromptVisibleAtom)).toBe(true) }) }) + + describe("followup suggestions vs slash command input", () => { + it("should submit typed /command (not followup suggestion) when input starts with '/'", async () => { + const mockCallback = vi.fn() + store.set(submissionCallbackAtom, { callback: mockCallback }) + + // Followup suggestions are active (ask_followup_question), which normally takes priority over autocomplete. + store.set(setFollowupSuggestionsAtom, [{ answer: "Yes, continue" }, { answer: "No, stop" }]) + + // Type a slash command. 
+ for (const char of ["/", "h", "e", "l", "p"]) { + const key: Key = { + name: char, + sequence: char, + ctrl: false, + meta: false, + shift: false, + paste: false, + } + store.set(keyboardHandlerAtom, key) + } + + // Simulate the "auto-select first item" behavior from autocomplete that can set selectedIndex to 0. + // In the buggy behavior, followup mode is still active and this causes Enter to submit the followup suggestion instead. + store.set(selectedIndexAtom, 0) + + // Press Enter to submit. + const enterKey: Key = { + name: "return", + sequence: "\r", + ctrl: false, + meta: false, + shift: false, + paste: false, + } + await store.set(keyboardHandlerAtom, enterKey) + + // Wait for async operations to complete + await new Promise((resolve) => setTimeout(resolve, 10)) + + expect(mockCallback).toHaveBeenCalledWith("/help") + // Followup should remain active after running a slash command. + expect(store.get(followupSuggestionsAtom)).toHaveLength(2) + // Followup should not auto-select after command execution. 
+ expect(store.get(selectedIndexAtom)).toBe(-1) + }) + + it("should dismiss followup suggestions for /clear and /new commands", async () => { + const mockCallback = vi.fn() + store.set(submissionCallbackAtom, { callback: mockCallback }) + + store.set(setFollowupSuggestionsAtom, [{ answer: "Yes, continue" }, { answer: "No, stop" }]) + + // Type /clear + for (const char of ["/", "c", "l", "e", "a", "r"]) { + const key: Key = { + name: char, + sequence: char, + ctrl: false, + meta: false, + shift: false, + paste: false, + } + store.set(keyboardHandlerAtom, key) + } + + const enterKey: Key = { + name: "return", + sequence: "\r", + ctrl: false, + meta: false, + shift: false, + paste: false, + } + await store.set(keyboardHandlerAtom, enterKey) + await new Promise((resolve) => setTimeout(resolve, 10)) + + expect(mockCallback).toHaveBeenCalledWith("/clear") + expect(store.get(followupSuggestionsAtom)).toHaveLength(0) + + // Re-seed followup and type /new + store.set(setFollowupSuggestionsAtom, [{ answer: "Yes, continue" }, { answer: "No, stop" }]) + for (const char of ["/", "n", "e", "w"]) { + const key: Key = { + name: char, + sequence: char, + ctrl: false, + meta: false, + shift: false, + paste: false, + } + store.set(keyboardHandlerAtom, key) + } + await store.set(keyboardHandlerAtom, enterKey) + await new Promise((resolve) => setTimeout(resolve, 10)) + + expect(mockCallback).toHaveBeenCalledWith("/new") + expect(store.get(followupSuggestionsAtom)).toHaveLength(0) + }) + }) }) diff --git a/cli/src/state/atoms/index.ts b/cli/src/state/atoms/index.ts index 1e8cd0eeb9e..fcea0ff1da6 100644 --- a/cli/src/state/atoms/index.ts +++ b/cli/src/state/atoms/index.ts @@ -214,6 +214,7 @@ export { // Followup suggestions state atoms followupSuggestionsAtom, showFollowupSuggestionsAtom, + followupSuggestionsMenuVisibleAtom, selectedFollowupIndexAtom, // Derived UI atoms diff --git a/cli/src/state/atoms/keyboard.ts b/cli/src/state/atoms/keyboard.ts index 31bd9529d3a..79df534bb6c 100644 
--- a/cli/src/state/atoms/keyboard.ts +++ b/cli/src/state/atoms/keyboard.ts @@ -14,7 +14,7 @@ import { fileMentionContextAtom, selectedIndexAtom, followupSuggestionsAtom, - showFollowupSuggestionsAtom, + followupSuggestionsMenuVisibleAtom, clearFollowupSuggestionsAtom, inputModeAtom, type InputMode, @@ -408,14 +408,30 @@ export const submitInputAtom = atom(null, (get, set, text: string | Buffer) => { // Convert Buffer to string if needed const textStr = typeof text === "string" ? text : text.toString() + const trimmedText = textStr.trim() + const hasFollowupSuggestions = get(followupSuggestionsAtom).length > 0 + const isSlashCommand = trimmedText.startsWith("/") + const slashCommandName = isSlashCommand ? (trimmedText.match(/^\/([^\s]+)/)?.[1]?.toLowerCase() ?? "") : "" + const shouldDismissFollowupOnSlashCommand = new Set(["clear", "c", "cls", "new", "n", "start", "exit", "q", "quit"]) - if (callback && typeof callback === "function" && textStr && textStr.trim()) { + if (callback && typeof callback === "function" && trimmedText) { // Call the submission callback callback(textStr) // Clear input and related state set(clearTextBufferAtom) - set(clearFollowupSuggestionsAtom) + // If the user runs a slash command while a followup question is active, + // keep the followup question/suggestions so they can answer after the command runs. + if (hasFollowupSuggestions && isSlashCommand) { + if (slashCommandName && shouldDismissFollowupOnSlashCommand.has(slashCommandName)) { + set(clearFollowupSuggestionsAtom) + } else { + // Ensure followup stays in "no selection" mode after executing a slash command. 
+ set(selectedIndexAtom, -1) + } + } else { + set(clearFollowupSuggestionsAtom) + } } }) @@ -1067,7 +1083,7 @@ export const keyboardHandlerAtom = atom(null, async (get, set, key: Key) => { // Priority 2: Determine current mode and route to mode-specific handler const isApprovalPending = get(isApprovalPendingAtom) - const isFollowupVisible = get(showFollowupSuggestionsAtom) + const isFollowupVisible = get(followupSuggestionsMenuVisibleAtom) const isAutocompleteVisible = get(showAutocompleteAtom) const fileMentionSuggestions = get(fileMentionSuggestionsAtom) const isInHistoryMode = get(historyModeAtom) diff --git a/cli/src/state/atoms/ui.ts b/cli/src/state/atoms/ui.ts index 3330b7611db..5bcf46c620e 100644 --- a/cli/src/state/atoms/ui.ts +++ b/cli/src/state/atoms/ui.ts @@ -238,6 +238,23 @@ export const followupSuggestionsAtom = atom([]) */ export const showFollowupSuggestionsAtom = atom(false) +/** + * Derived atom that hides followup suggestions when slash-command autocomplete or file-mention autocomplete is active. + * This prevents the followup menu (and its selection index) from intercepting "/" commands. + */ +export const followupSuggestionsMenuVisibleAtom = atom((get) => { + if (!get(showFollowupSuggestionsAtom)) return false + if (get(followupSuggestionsAtom).length === 0) return false + + // If the user starts a "/" command, show command autocomplete instead of followups. + if (get(showAutocompleteAtom)) return false + + // If file-mention autocomplete is active, it should take precedence as well. + if (get(fileMentionSuggestionsAtom).length > 0) return false + + return true +}) + /** * @deprecated Use selectedIndexAtom instead - this is now shared across all selection contexts * This atom is kept for backward compatibility but will be removed in a future version. 
diff --git a/cli/src/state/hooks/useFollowupSuggestions.ts b/cli/src/state/hooks/useFollowupSuggestions.ts index dad6d19521a..15a8e5baffb 100644 --- a/cli/src/state/hooks/useFollowupSuggestions.ts +++ b/cli/src/state/hooks/useFollowupSuggestions.ts @@ -8,7 +8,7 @@ import { useMemo, useCallback } from "react" import type { FollowupSuggestion } from "../atoms/ui.js" import { followupSuggestionsAtom, - showFollowupSuggestionsAtom, + followupSuggestionsMenuVisibleAtom, selectedIndexAtom, setFollowupSuggestionsAtom, clearFollowupSuggestionsAtom, @@ -102,7 +102,7 @@ export interface UseFollowupSuggestionsReturn { export function useFollowupSuggestions(): UseFollowupSuggestionsReturn { // Read atoms const suggestions = useAtomValue(followupSuggestionsAtom) - const isVisible = useAtomValue(showFollowupSuggestionsAtom) + const isVisible = useAtomValue(followupSuggestionsMenuVisibleAtom) const selectedIndex = useAtomValue(selectedIndexAtom) const selectedSuggestion = useAtomValue(getSelectedFollowupAtom) const hasSuggestions = useAtomValue(hasFollowupSuggestionsAtom) diff --git a/cli/src/state/hooks/useHotkeys.ts b/cli/src/state/hooks/useHotkeys.ts index 863c5437327..32d1a4b2d1e 100644 --- a/cli/src/state/hooks/useHotkeys.ts +++ b/cli/src/state/hooks/useHotkeys.ts @@ -4,7 +4,7 @@ import { useAtomValue } from "jotai" import { useMemo } from "react" -import { isStreamingAtom, showFollowupSuggestionsAtom } from "../atoms/ui.js" +import { isStreamingAtom, followupSuggestionsMenuVisibleAtom } from "../atoms/ui.js" import { useApprovalHandler } from "./useApprovalHandler.js" import { hasResumeTaskAtom } from "../atoms/extension.js" import { shellModeActiveAtom } from "../atoms/keyboard.js" @@ -46,7 +46,7 @@ function getModifierKey(): string { */ export function useHotkeys(): UseHotkeysReturn { const isStreaming = useAtomValue(isStreamingAtom) - const isFollowupVisible = useAtomValue(showFollowupSuggestionsAtom) + const isFollowupVisible = 
useAtomValue(followupSuggestionsMenuVisibleAtom) const hasResumeTask = useAtomValue(hasResumeTaskAtom) const isShellModeActive = useAtomValue(shellModeActiveAtom) const { isApprovalPending } = useApprovalHandler() diff --git a/cli/src/ui/components/__tests__/StatusIndicator.test.tsx b/cli/src/ui/components/__tests__/StatusIndicator.test.tsx index 3d1942bc773..967edba32b1 100644 --- a/cli/src/ui/components/__tests__/StatusIndicator.test.tsx +++ b/cli/src/ui/components/__tests__/StatusIndicator.test.tsx @@ -8,7 +8,7 @@ import { describe, it, expect, vi, beforeEach } from "vitest" import { Provider as JotaiProvider } from "jotai" import { createStore } from "jotai" import { StatusIndicator } from "../StatusIndicator.js" -import { showFollowupSuggestionsAtom, isCancellingAtom } from "../../../state/atoms/ui.js" +import { setFollowupSuggestionsAtom, isCancellingAtom } from "../../../state/atoms/ui.js" import { chatMessagesAtom } from "../../../state/atoms/extension.js" import { exitPromptVisibleAtom } from "../../../state/atoms/keyboard.js" import type { ExtensionChatMessage } from "../../../types/messages.js" @@ -63,7 +63,7 @@ describe("StatusIndicator", () => { }) it("should show followup hotkeys when suggestions are visible", () => { - store.set(showFollowupSuggestionsAtom, true) + store.set(setFollowupSuggestionsAtom, [{ answer: "Yes, continue" }, { answer: "No, stop" }]) const { lastFrame } = render( @@ -80,7 +80,7 @@ describe("StatusIndicator", () => { it("should show general command hints when idle", () => { // No messages = not streaming store.set(chatMessagesAtom, []) - store.set(showFollowupSuggestionsAtom, false) + store.set(setFollowupSuggestionsAtom, []) const { lastFrame } = render( diff --git a/cli/src/ui/messages/extension/say/SayBrowserActionResultMessage.tsx b/cli/src/ui/messages/extension/say/SayBrowserActionResultMessage.tsx index c27d3130231..6a8eef3d9dc 100644 --- a/cli/src/ui/messages/extension/say/SayBrowserActionResultMessage.tsx +++ 
b/cli/src/ui/messages/extension/say/SayBrowserActionResultMessage.tsx @@ -1,14 +1,61 @@ import React from "react" import { Box, Text } from "ink" import type { MessageComponentProps } from "../types.js" -import { MarkdownText } from "../../../components/MarkdownText.js" import { useTheme } from "../../../../state/hooks/useTheme.js" /** - * Display browser action results + * Parsed browser action result data + */ +interface BrowserActionResultData { + screenshot?: string + logs?: string + currentUrl?: string + currentMousePosition?: string + viewportWidth?: number + viewportHeight?: number +} + +/** + * Parse browser action result from message text + */ +function parseBrowserActionResult(text: string | undefined): BrowserActionResultData | null { + if (!text) return null + try { + return JSON.parse(text) as BrowserActionResultData + } catch { + return null + } +} + +/** + * Display browser action results in a readable format + * Parses the JSON data and shows meaningful info instead of raw base64 screenshot data */ export const SayBrowserActionResultMessage: React.FC = ({ message }) => { const theme = useTheme() + const result = parseBrowserActionResult(message.text) + + // If we can't parse, show a simple message + if (!result) { + return ( + + + + 🌐 Browser Action Result + + + + Browser action completed + + + ) + } + + const hasScreenshot = !!result.screenshot + const hasLogs = result.logs && result.logs.trim().length > 0 + const hasUrl = !!result.currentUrl + const hasViewport = result.viewportWidth && result.viewportHeight + return ( @@ -17,11 +64,56 @@ export const SayBrowserActionResultMessage: React.FC = ({ - {message.text && ( - - {message.text} - - )} + + {/* Screenshot indicator */} + {hasScreenshot && ( + + 📷 Screenshot captured + + )} + + {/* Current URL */} + {hasUrl && ( + + + URL: {result.currentUrl} + + + )} + + {/* Viewport dimensions */} + {hasViewport && ( + + + Viewport: {result.viewportWidth}x{result.viewportHeight} + + + )} + + {/* Cursor 
position */} + {result.currentMousePosition && ( + + Cursor: {result.currentMousePosition} + + )} + + {/* Console logs */} + {hasLogs && ( + + Console logs: + + {result.logs} + + + )} + + {/* Fallback if no meaningful data */} + {!hasScreenshot && !hasLogs && !hasUrl && !hasViewport && ( + + Browser action completed + + )} + ) } diff --git a/cli/src/ui/messages/extension/say/__tests__/SayBrowserActionResultMessage.test.tsx b/cli/src/ui/messages/extension/say/__tests__/SayBrowserActionResultMessage.test.tsx new file mode 100644 index 00000000000..9a45519a584 --- /dev/null +++ b/cli/src/ui/messages/extension/say/__tests__/SayBrowserActionResultMessage.test.tsx @@ -0,0 +1,198 @@ +import React from "react" +import { render } from "ink-testing-library" +import { describe, it, expect } from "vitest" +import { SayBrowserActionResultMessage } from "../SayBrowserActionResultMessage.js" +import type { ExtensionChatMessage } from "../../../../../types/messages.js" + +describe("SayBrowserActionResultMessage", () => { + const baseMessage: ExtensionChatMessage = { + ts: Date.now(), + type: "say", + say: "browser_action_result", + } + + it("should display header for browser action result", () => { + const { lastFrame } = render( + , + ) + expect(lastFrame()).toContain("Browser Action Result") + }) + + it("should show screenshot indicator instead of base64 data", () => { + const browserResult = { + screenshot: "data:image/webp;base64,UklGRn44AABXRUJQVlA4...", // Simulated base64 data + logs: "", + currentUrl: "https://example.com", + viewportWidth: 1280, + viewportHeight: 800, + } + + const { lastFrame } = render( + , + ) + + const output = lastFrame() + + // Should show screenshot indicator + expect(output).toContain("Screenshot captured") + + // Should NOT contain the base64 data + expect(output).not.toContain("UklGRn44AABXRUJQVlA4") + expect(output).not.toContain("data:image") + + // Should show URL + expect(output).toContain("https://example.com") + + // Should show viewport + 
expect(output).toContain("1280x800") + }) + + it("should display console logs when present", () => { + const browserResult = { + logs: "Console: Hello from the page\nError: Something went wrong", + } + + const { lastFrame } = render( + , + ) + + const output = lastFrame() + expect(output).toContain("Console logs:") + expect(output).toContain("Hello from the page") + expect(output).toContain("Something went wrong") + }) + + it("should display cursor position when present", () => { + const browserResult = { + currentMousePosition: "500,300", + } + + const { lastFrame } = render( + , + ) + + expect(lastFrame()).toContain("Cursor: 500,300") + }) + + it("should handle empty result gracefully", () => { + const { lastFrame } = render( + , + ) + + expect(lastFrame()).toContain("Browser action completed") + }) + + it("should handle invalid JSON gracefully", () => { + const { lastFrame } = render( + , + ) + + expect(lastFrame()).toContain("Browser action completed") + }) + + it("should handle missing text gracefully", () => { + const { lastFrame } = render( + , + ) + + expect(lastFrame()).toContain("Browser action completed") + }) + + it("should not show logs section when logs are empty", () => { + const browserResult = { + screenshot: "data:image/png;base64,abc", + logs: "", + } + + const { lastFrame } = render( + , + ) + + expect(lastFrame()).not.toContain("Console logs:") + }) + + it("should not show logs section when logs are only whitespace", () => { + const browserResult = { + screenshot: "data:image/png;base64,abc", + logs: " \n\t ", + } + + const { lastFrame } = render( + , + ) + + expect(lastFrame()).not.toContain("Console logs:") + }) + + it("should display all available info together", () => { + const browserResult = { + screenshot: "data:image/png;base64,abc123", + logs: "Page loaded", + currentUrl: "https://test.com/page", + currentMousePosition: "100,200", + viewportWidth: 1920, + viewportHeight: 1080, + } + + const { lastFrame } = render( + , + ) + + const 
output = lastFrame() + expect(output).toContain("Screenshot captured") + expect(output).toContain("https://test.com/page") + expect(output).toContain("1920x1080") + expect(output).toContain("100,200") + expect(output).toContain("Page loaded") + }) +}) diff --git a/flake.nix b/flake.nix index d389996052c..746149ad47c 100644 --- a/flake.nix +++ b/flake.nix @@ -5,120 +5,133 @@ nixpkgs.url = "github:nixos/nixpkgs/nixos-25.05"; }; - outputs = { self, nixpkgs, ... }: let - systems = [ "aarch64-darwin" "x86_64-linux" ]; + outputs = + { self, nixpkgs, ... }: + let + systems = [ + "aarch64-darwin" + "x86_64-linux" + ]; - forAllSystems = nixpkgs.lib.genAttrs systems; + forAllSystems = nixpkgs.lib.genAttrs systems; - mkDevShell = system: let - pkgs = import nixpkgs { inherit system; }; - in pkgs.mkShell { - name = "kilo-code"; + mkDevShell = + system: + let + pkgs = import nixpkgs { inherit system; }; + in + pkgs.mkShell { + name = "kilo-code"; - packages = with pkgs; [ - nodejs_20 - corepack_20 - libnotify - jetbrains.idea-community - jetbrains.jdk - jdk21 - gradle - unzip - # Build tools for native modules - pkg-config - python3 - gcc - gnumake - # Libraries - libsecret - # X11 libraries for JetBrains IDEs and native-keymap - xorg.libX11 - xorg.libX11.dev - xorg.libXext - xorg.libXi - xorg.libXrender - xorg.libXtst - xorg.libXrandr - xorg.libXinerama - xorg.libXcursor - xorg.libXdamage - xorg.libXfixes - xorg.libXcomposite - xorg.libxkbfile - xorg.libxkbfile.dev - xorg.libxcb - # Additional GUI libraries - freetype - fontconfig - glib - gtk3 - cairo - pango - gdk-pixbuf - atk - # JCEF dependencies - nspr - nss - cups - dbus - at-spi2-atk - at-spi2-core - libdrm - mesa - expat - alsa-lib - pulseaudio - # Github - act - ]; + packages = with pkgs; [ + nodejs_20 + corepack_20 + libnotify + jetbrains.idea-community + jetbrains.jdk + jdk21 + gradle + unzip + # Build tools for native modules + pkg-config + python3 + gcc + gnumake + # Libraries + libsecret + # X11 libraries for 
JetBrains IDEs and native-keymap + xorg.libX11 + xorg.libX11.dev + xorg.libXext + xorg.libXi + xorg.libXrender + xorg.libXtst + xorg.libXrandr + xorg.libXinerama + xorg.libXcursor + xorg.libXdamage + xorg.libXfixes + xorg.libXcomposite + xorg.libxkbfile + xorg.libxkbfile.dev + xorg.libxcb + # Additional GUI libraries + freetype + fontconfig + glib + gtk3 + cairo + pango + gdk-pixbuf + atk + # JCEF dependencies + nspr + nss + cups + dbus + at-spi2-atk + at-spi2-core + libdrm + mesa + expat + alsa-lib + pulseaudio + # Github + act + # Microphone support + ffmpeg_7-full + ]; - # Set library path for dynamic linking - shellHook = '' - export DEVENV="nix" - export JAVA_HOME="${pkgs.jetbrains.jdk}" - export PATH="$JAVA_HOME/bin:$PATH" - export LD_LIBRARY_PATH="${pkgs.lib.makeLibraryPath [ - pkgs.xorg.libX11 - pkgs.xorg.libXext - pkgs.xorg.libXi - pkgs.xorg.libXrender - pkgs.xorg.libXtst - pkgs.xorg.libXrandr - pkgs.xorg.libXinerama - pkgs.xorg.libXcursor - pkgs.xorg.libXdamage - pkgs.xorg.libXfixes - pkgs.xorg.libXcomposite - pkgs.xorg.libxkbfile - pkgs.xorg.libxcb - pkgs.freetype - pkgs.fontconfig - pkgs.glib - pkgs.gtk3 - pkgs.cairo - pkgs.pango - pkgs.gdk-pixbuf - pkgs.atk - pkgs.libsecret - pkgs.jetbrains.jdk - # JCEF-specific libraries - pkgs.nspr - pkgs.nss - pkgs.cups - pkgs.dbus - pkgs.at-spi2-atk - pkgs.at-spi2-core - pkgs.libdrm - pkgs.mesa - pkgs.expat - pkgs.alsa-lib - pkgs.pulseaudio - ]}:$LD_LIBRARY_PATH" - ''; + # Set library path for dynamic linking + shellHook = '' + export DEVENV="nix" + export JAVA_HOME="${pkgs.jetbrains.jdk}" + export PATH="$JAVA_HOME/bin:$PATH" + export LD_LIBRARY_PATH="${ + pkgs.lib.makeLibraryPath [ + pkgs.xorg.libX11 + pkgs.xorg.libXext + pkgs.xorg.libXi + pkgs.xorg.libXrender + pkgs.xorg.libXtst + pkgs.xorg.libXrandr + pkgs.xorg.libXinerama + pkgs.xorg.libXcursor + pkgs.xorg.libXdamage + pkgs.xorg.libXfixes + pkgs.xorg.libXcomposite + pkgs.xorg.libxkbfile + pkgs.xorg.libxcb + pkgs.freetype + pkgs.fontconfig + pkgs.glib + 
pkgs.gtk3 + pkgs.cairo + pkgs.pango + pkgs.gdk-pixbuf + pkgs.atk + pkgs.libsecret + pkgs.jetbrains.jdk + # JCEF-specific libraries + pkgs.nspr + pkgs.nss + pkgs.cups + pkgs.dbus + pkgs.at-spi2-atk + pkgs.at-spi2-core + pkgs.libdrm + pkgs.mesa + pkgs.expat + pkgs.alsa-lib + pkgs.pulseaudio + ] + }:$LD_LIBRARY_PATH" + ''; + }; + in + { + devShells = forAllSystems (system: { + default = mkDevShell system; + }); }; - in { - devShells = forAllSystems (system: { - default = mkDevShell system; - }); - }; } diff --git a/packages/types/src/global-settings.ts b/packages/types/src/global-settings.ts index 0fe51f944bc..c93cd288bcf 100644 --- a/packages/types/src/global-settings.ts +++ b/packages/types/src/global-settings.ts @@ -166,6 +166,14 @@ export const globalSettingsSchema = z.object({ soundEnabled: z.boolean().optional(), soundVolume: z.number().optional(), systemNotificationsEnabled: z.boolean().optional(), // kilocode_change + selectedMicrophoneDevice: z + .object({ + id: z.string(), + name: z.string(), + platform: z.string(), + }) + .nullable() + .optional(), // kilocode_change: Selected microphone device for STT (matches MicrophoneDevice from sttContract.ts) maxOpenTabsContext: z.number().optional(), maxWorkspaceFiles: z.number().optional(), diff --git a/packages/types/src/mode.ts b/packages/types/src/mode.ts index 8de583e6aae..52ea2bb8619 100644 --- a/packages/types/src/mode.ts +++ b/packages/types/src/mode.ts @@ -148,7 +148,7 @@ export const DEFAULT_MODES: readonly ModeConfig[] = [ description: "Plan and design before implementation", groups: ["read", ["edit", { fileRegex: "\\.md$", description: "Markdown files only" }], "browser", "mcp"], customInstructions: - "1. Do some information gathering (using provided tools) to get more context about the task.\n\n2. You should also ask the user clarifying questions to get a better understanding of the task.\n\n3. 
Once you've gained more context about the user's request, break down the task into clear, actionable steps and create a todo list using the `update_todo_list` tool. Each todo item should be:\n - Specific and actionable\n - Listed in logical execution order\n - Focused on a single, well-defined outcome\n - Clear enough that another mode could execute it independently\n\n **Note:** If the `update_todo_list` tool is not available, write the plan to a markdown file (e.g., `plan.md` or `todo.md`) instead.\n\n4. As you gather more information or discover new requirements, update the todo list to reflect the current understanding of what needs to be accomplished.\n\n5. Ask the user if they are pleased with this plan, or if they would like to make any changes. Think of this as a brainstorming session where you can discuss the task and refine the todo list.\n\n6. Include Mermaid diagrams if they help clarify complex workflows or system architecture. Please avoid using double quotes (\"\") and parentheses () inside square brackets ([]) in Mermaid diagrams, as this can cause parsing errors.\n\n7. Use the switch_mode tool to request that the user switch to another mode to implement the solution.\n\n**IMPORTANT: Focus on creating clear, actionable todo lists rather than lengthy markdown documents. Use the todo list as your primary planning tool to track and organize the work that needs to be done.**\n\n**CRITICAL: Never provide level of effort time estimates (e.g., hours, days, weeks) for tasks. Focus solely on breaking down the work into clear, actionable steps without estimating how long they will take.**\n\nUnless told otherwise, if you want to save a plan file, put it in the /plans directory", + "1. Do some information gathering (using provided tools) to get more context about the task.\n\n2. You should also ask the user clarifying questions to get a better understanding of the task.\n\n3. 
Once you've gained more context about the user's request, break down the task into clear, actionable steps and create a todo list using the `update_todo_list` tool. Each todo item should be:\n - Specific and actionable\n - Listed in logical execution order\n - Focused on a single, well-defined outcome\n - Clear enough that another mode could execute it independently\n\n **Note:** If the `update_todo_list` tool is not available, write the plan to a markdown file (e.g., `plan.md` or `todo.md`) instead.\n\n4. As you gather more information or discover new requirements, update the todo list to reflect the current understanding of what needs to be accomplished.\n\n5. Ask the user if they are pleased with this plan, or if they would like to make any changes. Think of this as a brainstorming session where you can discuss the task and refine the todo list.\n\n6. Include Mermaid diagrams if they help clarify complex workflows or system architecture. Please avoid using double quotes (\"\") and parentheses () inside square brackets ([]) in Mermaid diagrams, as this can cause parsing errors.\n\n7. Use the switch_mode tool to request switching to another mode when you need to edit non-markdown files (like source code files: .ts, .js, .py, .java, etc.) or execute commands. You CAN directly create and edit markdown files (.md) without switching modes.\n\n**IMPORTANT: Focus on creating clear, actionable todo lists rather than lengthy markdown documents. Use the todo list as your primary planning tool to track and organize the work that needs to be done.**\n\n**CRITICAL: Never provide level of effort time estimates (e.g., hours, days, weeks) for tasks. 
Focus solely on breaking down the work into clear, actionable steps without estimating how long they will take.**\n\nUnless told otherwise, if you want to save a plan file, put it in the /plans directory", }, { slug: "code", diff --git a/src/core/prompts/__tests__/__snapshots__/add-custom-instructions/architect-mode-prompt.snap b/src/core/prompts/__tests__/__snapshots__/add-custom-instructions/architect-mode-prompt.snap index df49170e975..093e0d50222 100644 --- a/src/core/prompts/__tests__/__snapshots__/add-custom-instructions/architect-mode-prompt.snap +++ b/src/core/prompts/__tests__/__snapshots__/add-custom-instructions/architect-mode-prompt.snap @@ -497,7 +497,7 @@ Mode-specific Instructions: 6. Include Mermaid diagrams if they help clarify complex workflows or system architecture. Please avoid using double quotes ("") and parentheses () inside square brackets ([]) in Mermaid diagrams, as this can cause parsing errors. -7. Use the switch_mode tool to request that the user switch to another mode to implement the solution. +7. Use the switch_mode tool to request switching to another mode when you need to edit non-markdown files (like source code files: .ts, .js, .py, .java, etc.) or execute commands. You CAN directly create and edit markdown files (.md) without switching modes. **IMPORTANT: Focus on creating clear, actionable todo lists rather than lengthy markdown documents. 
Use the todo list as your primary planning tool to track and organize the work that needs to be done.** diff --git a/src/core/webview/ClineProvider.ts b/src/core/webview/ClineProvider.ts index 6475143e2d0..68792066db1 100644 --- a/src/core/webview/ClineProvider.ts +++ b/src/core/webview/ClineProvider.ts @@ -2178,6 +2178,7 @@ export class ClineProvider featureRoomoteControlEnabled, yoloMode, // kilocode_change yoloGatekeeperApiConfigId, // kilocode_change: AI gatekeeper for YOLO mode + selectedMicrophoneDevice, // kilocode_change: Selected microphone device for STT isBrowserSessionActive, } = await this.getState() @@ -2405,6 +2406,7 @@ export class ClineProvider (s) => s.autoPurgeIncompleteTaskRetentionDays, ), autoPurgeLastRunTimestamp: await this.getState().then((s) => s.autoPurgeLastRunTimestamp), + selectedMicrophoneDevice, // kilocode_change: Selected microphone device for STT // kilocode_change end kiloCodeImageApiKey, openRouterImageGenerationSelectedModel, @@ -2603,6 +2605,7 @@ export class ClineProvider autoPurgeCompletedTaskRetentionDays: stateValues.autoPurgeCompletedTaskRetentionDays ?? 30, autoPurgeIncompleteTaskRetentionDays: stateValues.autoPurgeIncompleteTaskRetentionDays ?? 7, autoPurgeLastRunTimestamp: stateValues.autoPurgeLastRunTimestamp, + selectedMicrophoneDevice: stateValues.selectedMicrophoneDevice, // kilocode_change: Selected microphone device for STT // kilocode_change end experiments: stateValues.experiments ?? experimentDefault, autoApprovalEnabled: stateValues.autoApprovalEnabled ?? 
true, diff --git a/src/core/webview/speechToTextCheck.ts b/src/core/webview/speechToTextCheck.ts index f5ca5d7ce84..2d0a64466df 100644 --- a/src/core/webview/speechToTextCheck.ts +++ b/src/core/webview/speechToTextCheck.ts @@ -1,7 +1,7 @@ // kilocode_change - new file: Speech-to-text availability check (extracted from ClineProvider) import type { ProviderSettingsManager } from "../config/ProviderSettingsManager" import { getOpenAiApiKey } from "../../services/stt/utils/getOpenAiCredentials" -import { FFmpegCaptureService } from "../../services/stt/FFmpegCaptureService" +import { findFFmpeg } from "../../services/stt/FFmpegDeviceEnumerator" /** * Result type for speech-to-text availability check @@ -35,7 +35,7 @@ export async function checkSpeechToTextAvailable( } // Check 2: FFmpeg installed - const ffmpegResult = FFmpegCaptureService.findFFmpeg() + const ffmpegResult = findFFmpeg() if (!ffmpegResult.available) { return { available: false, reason: "ffmpegNotInstalled" } } diff --git a/src/core/webview/sttHandlers.ts b/src/core/webview/sttHandlers.ts index 6f4f4a2ae59..8feeb81b52b 100644 --- a/src/core/webview/sttHandlers.ts +++ b/src/core/webview/sttHandlers.ts @@ -1,11 +1,13 @@ // kilocode_change - new file: STT message handlers (replaces speechMessageHandlers.ts) import type { ClineProvider } from "./ClineProvider" -import type { STTCommand, STTSegment } from "../../shared/sttContract" +import type { STTCommand, STTSegment, MicrophoneDevice } from "../../shared/sttContract" import { STTService } from "../../services/stt" import { STTEventEmitter } from "../../services/stt/types" import { getOpenAiApiKey } from "../../services/stt/utils/getOpenAiCredentials" import { VisibleCodeTracker } from "../../services/ghost/context/VisibleCodeTracker" import { extractCodeGlossary, formatGlossaryAsPrompt } from "../../services/stt/context/codeGlossaryExtractor" +import { listMicrophoneDevices } from "../../services/stt/FFmpegDeviceEnumerator" +import { 
checkSpeechToTextAvailable } from "./speechToTextCheck" /** * Map of ClineProvider -> STTService @@ -63,7 +65,11 @@ function getService(clineProvider: ClineProvider): STTService { const currentTask = clineProvider.getCurrentTask() const codeGlossary = new VisibleCodeGlossary(clineProvider.cwd, currentTask?.rooIgnoreController ?? null) - service = new STTService(emitter, clineProvider.providerSettingsManager, codeGlossary) + const globalSettings = clineProvider.contextProxy.getValues() + const selectedDevice = globalSettings.selectedMicrophoneDevice + const deviceId = selectedDevice?.id + + service = new STTService(emitter, clineProvider.providerSettingsManager, codeGlossary, deviceId) servicesByProviderRef.set(clineProvider, service) } @@ -124,6 +130,49 @@ export async function handleSTTCancel(clineProvider: ClineProvider): Promise { + try { + const devices = await listMicrophoneDevices() + clineProvider.postMessageToWebview({ type: "stt:devices", devices }) + } catch (error) { + console.error("🎙️ [sttHandlers] ❌ Failed to list devices:", error) + clineProvider.postMessageToWebview({ type: "stt:devices", devices: [] }) + } +} + +/** + * Handle stt:selectDevice command + */ +export async function handleSTTSelectDevice( + clineProvider: ClineProvider, + device: MicrophoneDevice | null, +): Promise { + try { + await clineProvider.contextProxy.setValue("selectedMicrophoneDevice", device) + const service = servicesByProviderRef.get(clineProvider) + await service?.setMicrophoneDevice(device) + + clineProvider.postMessageToWebview({ type: "stt:deviceSelected", device }) + await clineProvider.postStateToWebview() + } catch (error) { + console.error("🎙️ [sttHandlers] ❌ Failed to select device:", error) + clineProvider.postMessageToWebview({ type: "stt:deviceSelected", device: null }) + } +} + +/** + * Handle stt:checkAvailability command + */ +export async function handleSTTCheckAvailability(clineProvider: ClineProvider): Promise { + clineProvider.postMessageToWebview({ + 
type: "stt:statusResponse", + speechToTextStatus: await checkSpeechToTextAvailable(clineProvider.providerSettingsManager), + }) +} + /** * Unified handler for all STT commands */ @@ -138,6 +187,15 @@ export async function handleSTTCommand(clineProvider: ClineProvider, command: ST case "stt:cancel": await handleSTTCancel(clineProvider) break + case "stt:listDevices": + await handleSTTListDevices(clineProvider) + break + case "stt:selectDevice": + await handleSTTSelectDevice(clineProvider, command.device) + break + case "stt:checkAvailability": + await handleSTTCheckAvailability(clineProvider) + break } } diff --git a/src/core/webview/webviewMessageHandler.ts b/src/core/webview/webviewMessageHandler.ts index 267f4e0b1bf..c751d402bda 100644 --- a/src/core/webview/webviewMessageHandler.ts +++ b/src/core/webview/webviewMessageHandler.ts @@ -3661,19 +3661,14 @@ export const webviewMessageHandler = async ( // kilocode_change start: STT (Speech-to-Text) handlers case "stt:start": case "stt:stop": - case "stt:cancel": { + case "stt:cancel": + case "stt:listDevices": + case "stt:selectDevice": + case "stt:checkAvailability": { const { handleSTTCommand } = await import("./sttHandlers") await handleSTTCommand(provider, message as any) break } - case "stt:checkAvailability": { - const { checkSpeechToTextAvailable } = await import("./speechToTextCheck") - provider.postMessageToWebview({ - type: "stt:statusResponse", - speechToTextStatus: await checkSpeechToTextAvailable(provider.providerSettingsManager), - }) - break - } // kilocode_change end: STT (Speech-to-Text) handlers case "insertTextToChatArea": provider.postMessageToWebview({ type: "insertTextToChatArea", text: message.text }) diff --git a/src/package.json b/src/package.json index 9c03f733942..a769233626e 100644 --- a/src/package.json +++ b/src/package.json @@ -3,7 +3,7 @@ "displayName": "%extension.displayName%", "description": "%extension.description%", "publisher": "kilocode", - "version": "4.143.1", + "version": 
"4.143.2", "icon": "assets/icons/logo-outline-black.png", "galleryBanner": { "color": "#FFFFFF", diff --git a/src/services/stt/FFmpegCaptureService.ts b/src/services/stt/FFmpegCaptureService.ts index ebd7a7d1304..9ebd54e7571 100644 --- a/src/services/stt/FFmpegCaptureService.ts +++ b/src/services/stt/FFmpegCaptureService.ts @@ -1,24 +1,8 @@ // kilocode_change - new file: FFmpeg-based PCM16 audio capture for OpenAI Realtime API import { EventEmitter } from "events" -import { spawn, ChildProcess, execSync } from "child_process" +import { spawn, ChildProcess } from "child_process" import * as os from "os" - -/** - * Global cache for FFmpeg path (shared across all instances) - * undefined = not yet checked, null = not found, string = found path - */ -let cachedFFmpegPath: string | null | undefined = undefined - -// Platform-specific fallback paths -const fallbackPaths: Record = { - darwin: ["/usr/local/bin/ffmpeg", "/opt/homebrew/bin/ffmpeg"], - linux: ["/usr/bin/ffmpeg", "/usr/local/bin/ffmpeg", "/snap/bin/ffmpeg"], - win32: [ - "C:\\ffmpeg\\bin\\ffmpeg.exe", - "C:\\Program Files\\ffmpeg\\bin\\ffmpeg.exe", - "C:\\Program Files (x86)\\ffmpeg\\bin\\ffmpeg.exe", - ], -} +import { findFFmpeg } from "./FFmpegDeviceEnumerator" /** * Calculate RMS energy of PCM16 audio frame @@ -61,13 +45,15 @@ export class FFmpegCaptureService extends EventEmitter { private platform: string private captureStartTime: number = 0 private audioChunkCount: number = 0 + private deviceId: string | undefined - constructor() { + constructor(deviceId?: string) { super() this.platform = os.platform() + this.deviceId = deviceId // Resolve FFmpeg path once (cached globally) - const result = FFmpegCaptureService.findFFmpeg() + const result = findFFmpeg() if (!result.available) { console.error("❌ [FFmpegCapture] FFmpeg not found during initialization") @@ -87,7 +73,7 @@ export class FFmpegCaptureService extends EventEmitter { } // Get FFmpeg path from global cache - const result = 
FFmpegCaptureService.findFFmpeg() + const result = findFFmpeg() if (!result.available || !result.path) { throw new Error( "FFmpeg not found. Please install FFmpeg to use speech-to-text.\n" + @@ -220,47 +206,6 @@ export class FFmpegCaptureService extends EventEmitter { return this.isCapturing } - /** - * Find FFmpeg executable using platform-specific fallback paths - * Results are cached globally across all instances - */ - static findFFmpeg(forceRecheck = false): { available: boolean; path?: string; error?: string } { - if (cachedFFmpegPath !== undefined && !forceRecheck) { - return { - available: cachedFFmpegPath !== null, - path: cachedFFmpegPath || undefined, - error: cachedFFmpegPath === null ? "FFmpeg not found" : undefined, - } - } - - const platform = os.platform() - try { - execSync("ffmpeg -version", { stdio: "ignore" }) - cachedFFmpegPath = "ffmpeg" - return { available: true, path: "ffmpeg" } - } catch { - console.log(`🎙️ [FFmpeg] ❌ 'ffmpeg' not in PATH, trying fallback paths...`) - } - - const platformPaths = fallbackPaths[platform] || [] - for (const fallbackPath of platformPaths) { - try { - execSync(`"${fallbackPath}" -version`, { stdio: "ignore" }) - cachedFFmpegPath = fallbackPath - return { available: true, path: fallbackPath } - } catch { - continue - } - } - - // Cache the "not found" result to avoid repeated path checks - cachedFFmpegPath = null - return { - available: false, - error: "FFmpeg not found. 
Install from https://ffmpeg.org/download.html", - } - } - private buildFFmpegArgs(): string[] { const baseArgs = this.getPlatformInputArgs() @@ -283,14 +228,17 @@ export class FFmpegCaptureService extends EventEmitter { private getPlatformInputArgs(): string[] { switch (this.platform) { case "darwin": // macOS - return ["-f", "avfoundation", "-i", ":default"] + // AVFoundation: device ID is already stored in format ":deviceId" (e.g., ":3") + // or use ":default" if no device selected + return ["-f", "avfoundation", "-i", this.deviceId || ":default"] case "linux": - // Try pulse first, fallback to alsa - return ["-f", "pulse", "-i", "default"] + // PulseAudio: device ID is stored as just the number (e.g., "0") + return ["-f", "pulse", "-i", this.deviceId || "default"] case "win32": // Windows - return ["-f", "dshow", "-i", "audio=default"] + // DirectShow: device ID is stored as the device name, need to format as "audio=Device Name" + return ["-f", "dshow", "-i", this.deviceId ? `audio=${this.deviceId}` : "audio=default"] default: throw new Error(`Unsupported platform: ${this.platform}`) diff --git a/src/services/stt/FFmpegDeviceEnumerator.ts b/src/services/stt/FFmpegDeviceEnumerator.ts new file mode 100644 index 00000000000..d0a6d6ba61b --- /dev/null +++ b/src/services/stt/FFmpegDeviceEnumerator.ts @@ -0,0 +1,226 @@ +// kilocode_change - new file: FFmpeg device enumeration utilities +import { execSync } from "child_process" +import * as os from "os" +import { MicrophoneDevice } from "../../shared/sttContract" + +/** + * Global cache for FFmpeg path (shared across all instances) + * undefined = not yet checked, null = not found, string = found path + */ +let cachedFFmpegPath: string | null | undefined = undefined + +// Platform-specific fallback paths +const fallbackPaths: Record = { + darwin: ["/usr/local/bin/ffmpeg", "/opt/homebrew/bin/ffmpeg"], + linux: ["/usr/bin/ffmpeg", "/usr/local/bin/ffmpeg", "/snap/bin/ffmpeg"], + win32: [ + "C:\\ffmpeg\\bin\\ffmpeg.exe", + 
"C:\\Program Files\\ffmpeg\\bin\\ffmpeg.exe", + "C:\\Program Files (x86)\\ffmpeg\\bin\\ffmpeg.exe", + ], +} + +/** + * Find FFmpeg executable using platform-specific fallback paths + * Results are cached globally across all instances + */ +export function findFFmpeg(forceRecheck = false): { available: boolean; path?: string; error?: string } { + if (cachedFFmpegPath !== undefined && !forceRecheck) { + return { + available: cachedFFmpegPath !== null, + path: cachedFFmpegPath || undefined, + error: cachedFFmpegPath === null ? "FFmpeg not found" : undefined, + } + } + + const platform = os.platform() + try { + execSync("ffmpeg -version", { stdio: "ignore" }) + cachedFFmpegPath = "ffmpeg" + return { available: true, path: "ffmpeg" } + } catch { + console.log(`🎙️ [FFmpeg] ❌ 'ffmpeg' not in PATH, trying fallback paths...`) + } + + const platformPaths = fallbackPaths[platform] || [] + for (const fallbackPath of platformPaths) { + try { + execSync(`"${fallbackPath}" -version`, { stdio: "ignore" }) + cachedFFmpegPath = fallbackPath + return { available: true, path: fallbackPath } + } catch { + continue + } + } + + // Cache the "not found" result to avoid repeated path checks + cachedFFmpegPath = null + return { + available: false, + error: "FFmpeg not found. Install from https://ffmpeg.org/download.html", + } +} + +/** + * List available microphone devices using FFmpeg + * Platform-independent method that returns normalized device information + */ +export async function listMicrophoneDevices(): Promise { + const platform = os.platform() + const result = findFFmpeg() + + if (!result.available || !result.path) { + throw new Error("FFmpeg not found. 
Please install FFmpeg to list microphone devices.") + } + + try { + switch (platform) { + case "darwin": + return await listAvFoundationDevices(result.path) + case "linux": + return await listPulseDevices(result.path) + case "win32": + return await listDShowDevices(result.path) + default: + throw new Error(`Unsupported platform: ${platform}`) + } + } catch (error) { + console.error("❌ [FFmpegCapture] Error listing devices:", error) + throw error + } +} + +/** + * List devices using macOS AVFoundation + */ +async function listAvFoundationDevices(ffmpegPath: string): Promise { + const devices: MicrophoneDevice[] = [] + try { + // Use -list_devices to enumerate AVFoundation devices + const output = execSync(`"${ffmpegPath}" -f avfoundation -list_devices true -i "" 2>&1 || true`, { + encoding: "utf-8", + maxBuffer: 10 * 1024 * 1024, + }) + + let inAudioSection = false + const lines = output.split("\n") + + for (const line of lines) { + if (line.includes("AVFoundation audio devices:")) { + inAudioSection = true + continue + } + if (line.includes("AVFoundation video devices:")) { + inAudioSection = false + continue + } + + if (inAudioSection) { + // Match pattern: [AVFoundation input device @ ...] 
[0] Device Name + const match = line.match(/\[(\d+)\]\s+(.+)/) + if (match) { + // AVFoundation requires format ":deviceId" (e.g., ":3") + const id = `:${match[1]}` + const name = match[2].trim() + devices.push({ id, name, platform: "darwin" }) + } + } + } + } catch (error) { + console.error("❌ [FFmpegCapture] Error listing AVFoundation devices:", error) + } + + return devices +} + +/** + * List devices using PulseAudio (Linux) + */ +async function listPulseDevices(ffmpegPath: string): Promise { + const devices: MicrophoneDevice[] = [] + try { + // Use -list_devices to enumerate PulseAudio devices + const output = execSync(`"${ffmpegPath}" -f pulse -list_devices true -i "" 2>&1 || true`, { + encoding: "utf-8", + maxBuffer: 10 * 1024 * 1024, + }) + + // Parse PulseAudio output format: + // [pulse @ 0x...] List of audio devices: + // [pulse @ 0x...] 0: Built-in Audio Analog Stereo + // [pulse @ 0x...] 1: USB Audio Device + + let inDeviceList = false + const lines = output.split("\n") + + for (const line of lines) { + if (line.includes("List of audio devices:")) { + inDeviceList = true + continue + } + + if (inDeviceList) { + // Match pattern: [pulse @ ...] 0: Device Name + const match = line.match(/(\d+):\s+(.+)/) + if (match) { + const id = match[1] + const name = match[2].trim() + devices.push({ id, name, platform: "linux" }) + } + } + } + } catch (error) { + console.error("❌ [FFmpegCapture] Error listing PulseAudio devices:", error) + } + + return devices +} + +/** + * List devices using DirectShow (Windows) + */ +async function listDShowDevices(ffmpegPath: string): Promise { + const devices: MicrophoneDevice[] = [] + try { + // Use -list_devices to enumerate DirectShow devices + const output = execSync(`"${ffmpegPath}" -f dshow -list_devices true -i dummy 2>&1 || true`, { + encoding: "utf-8", + maxBuffer: 10 * 1024 * 1024, + }) + + // Parse DirectShow output format: + // [dshow @ 0x...] DirectShow video devices + // [dshow @ 0x...] 
DirectShow audio devices + // [dshow @ 0x...] "Microphone (Realtek Audio)" + // [dshow @ 0x...] "Headset Microphone (USB Audio Device)" + + let inAudioSection = false + const lines = output.split("\n") + + for (const line of lines) { + if (line.includes("DirectShow audio devices")) { + inAudioSection = true + continue + } + if (line.includes("DirectShow video devices")) { + inAudioSection = false + continue + } + + if (inAudioSection) { + // Match pattern: [dshow @ ...] "Device Name" + const match = line.match(/"([^"]+)"/) + if (match) { + const name = match[1] + // For DirectShow, FFmpeg requires format "audio=Device Name" + // Store just the name - we'll format it in getPlatformInputArgs + devices.push({ id: name, name, platform: "win32" }) + } + } + } + } catch (error) { + console.error("❌ [FFmpegCapture] Error listing DirectShow devices:", error) + } + + return devices +} diff --git a/src/services/stt/STTService.ts b/src/services/stt/STTService.ts index e35d211edc6..c05b3f5d2e2 100644 --- a/src/services/stt/STTService.ts +++ b/src/services/stt/STTService.ts @@ -25,6 +25,7 @@ export class STTService { // Services private audioCapture: FFmpegCaptureService private transcriptionClient: OpenAIWhisperClient | null = null + private selectedDeviceId: string | undefined // Segment-based state private textSegments: STTSegment[] = [] // All confirmed/polished segments @@ -49,11 +50,23 @@ export class STTService { emitter: STTEventEmitter, providerSettingsManager: ProviderSettingsManager, codeGlossary: VisibleCodeGlossary | null = null, + deviceId?: string, ) { this.emitter = emitter this.providerSettingsManager = providerSettingsManager this.codeGlossary = codeGlossary - this.audioCapture = new FFmpegCaptureService() + this.selectedDeviceId = deviceId + this.audioCapture = new FFmpegCaptureService(deviceId) + } + + /** + * Set the microphone device to use for audio capture + */ + async setMicrophoneDevice(device: { id: string } | null): Promise { + this.selectedDeviceId 
= device?.id + if (!this.isActive) { + this.audioCapture = new FFmpegCaptureService(this.selectedDeviceId) + } } async start(config: STTProviderConfig, language?: string): Promise { diff --git a/src/services/stt/__tests__/FFmpegCaptureService.spec.ts b/src/services/stt/__tests__/FFmpegCaptureService.spec.ts index d87e2bd336d..2c9f2e14027 100644 --- a/src/services/stt/__tests__/FFmpegCaptureService.spec.ts +++ b/src/services/stt/__tests__/FFmpegCaptureService.spec.ts @@ -34,6 +34,10 @@ vi.mock("child_process", () => ({ execSync: vi.fn(() => Buffer.from("ffmpeg version")), })) +vi.mock("../FFmpegDeviceEnumerator", () => ({ + findFFmpeg: vi.fn(() => ({ available: true, path: "ffmpeg" })), +})) + vi.mock("os", () => ({ platform: vi.fn(() => "darwin"), })) @@ -113,5 +117,49 @@ describe("FFmpegCaptureService", () => { expect(args).toContain("-ar") expect(args).toContain("24000") // 24kHz required by Realtime API }) + + it("should use device ID when provided (macOS format)", async () => { + const deviceId = ":1" + const captureWithDevice = new FFmpegCaptureService(deviceId) + await captureWithDevice.start() + + const spawnCall = vi.mocked(spawn).mock.calls[0] + const args = spawnCall[1] as string[] + const inputIndex = args.indexOf("-i") + expect(inputIndex).toBeGreaterThan(-1) + expect(args[inputIndex + 1]).toBe(":1") + }) + + it("should use device ID when provided (Linux format)", async () => { + const os = await import("os") + vi.mocked(os.platform).mockReturnValue("linux") + + const deviceId = "1" + const captureWithDevice = new FFmpegCaptureService(deviceId) + await captureWithDevice.start() + + const spawnCall = vi.mocked(spawn).mock.calls[0] + const args = spawnCall[1] as string[] + const inputIndex = args.indexOf("-i") + expect(inputIndex).toBeGreaterThan(-1) + // Device ID should be used as-is (just the number) + expect(args[inputIndex + 1]).toBe("1") + }) + + it("should format device ID correctly for Windows", async () => { + const os = await import("os") + 
vi.mocked(os.platform).mockReturnValue("win32") + + const deviceId = "Headset Microphone (USB Audio Device)" + const captureWithDevice = new FFmpegCaptureService(deviceId) + await captureWithDevice.start() + + const spawnCall = vi.mocked(spawn).mock.calls[0] + const args = spawnCall[1] as string[] + const inputIndex = args.indexOf("-i") + expect(inputIndex).toBeGreaterThan(-1) + // Device ID should be formatted as "audio=Device Name" + expect(args[inputIndex + 1]).toBe("audio=Headset Microphone (USB Audio Device)") + }) }) }) diff --git a/src/services/stt/__tests__/FFmpegDeviceEnumerator.spec.ts b/src/services/stt/__tests__/FFmpegDeviceEnumerator.spec.ts new file mode 100644 index 00000000000..a320a8774a4 --- /dev/null +++ b/src/services/stt/__tests__/FFmpegDeviceEnumerator.spec.ts @@ -0,0 +1,90 @@ +// Run: npx vitest run services/stt/__tests__/FFmpegDeviceEnumerator.spec.ts + +import { execSync } from "child_process" +import { listMicrophoneDevices, findFFmpeg } from "../FFmpegDeviceEnumerator" + +vi.mock("child_process", () => ({ + execSync: vi.fn(() => Buffer.from("ffmpeg version")), +})) + +vi.mock("os", () => ({ + platform: vi.fn(() => "darwin"), +})) + +describe("FFmpegDeviceEnumerator", () => { + beforeEach(() => { + vi.clearAllMocks() + }) + + describe("findFFmpeg", () => { + it("should find FFmpeg in PATH", () => { + vi.mocked(execSync).mockReturnValue(Buffer.from("ffmpeg version")) + const result = findFFmpeg(true) // Force recheck to clear cache + expect(result.available).toBe(true) + expect(result.path).toBe("ffmpeg") + }) + + it("should return not available when FFmpeg is not found", () => { + vi.mocked(execSync).mockImplementation(() => { + throw new Error("Command not found") + }) + const result = findFFmpeg(true) // Force recheck + expect(result.available).toBe(false) + expect(result.error).toBeDefined() + }) + }) + + describe("listMicrophoneDevices", () => { + it("should parse and return macOS AVFoundation devices", async () => { + // Mock macOS 
AVFoundation device list output + const macOutput = `[AVFoundation input device @ 0x123] AVFoundation video devices: +[AVFoundation input device @ 0x123] [0] FaceTime HD Camera +[AVFoundation input device @ 0x123] AVFoundation audio devices: +[AVFoundation input device @ 0x123] [0] Built-in Microphone +[AVFoundation input device @ 0x123] [1] External USB Microphone +[AVFoundation input device @ 0x123] [2] Bluetooth Headset` + + vi.mocked(execSync).mockImplementation((command: string, options?: any) => { + if (typeof command === "string" && command.includes("-list_devices")) { + // When encoding is specified, execSync returns a string + return options?.encoding === "utf-8" ? macOutput : Buffer.from(macOutput) + } + // For version check (no encoding), return Buffer + if (options?.stdio === "ignore") { + return Buffer.from("ffmpeg version") + } + // For version check with encoding, return string + return options?.encoding === "utf-8" ? "ffmpeg version" : Buffer.from("ffmpeg version") + }) + + // Clear cache before test + findFFmpeg(true) + + const devices = await listMicrophoneDevices() + + // Verify device IDs are stored with colon prefix (format FFmpeg expects) + expect(devices).toHaveLength(3) + expect(devices[0].id).toBe(":0") + expect(devices[0].name).toBe("Built-in Microphone") + expect(devices[0].platform).toBe("darwin") + expect(devices[1].id).toBe(":1") + expect(devices[1].name).toBe("External USB Microphone") + expect(devices[1].platform).toBe("darwin") + expect(devices[2].id).toBe(":2") + expect(devices[2].name).toBe("Bluetooth Headset") + expect(devices[2].platform).toBe("darwin") + }) + + it("should throw error when FFmpeg is not found", async () => { + // Mock execSync to throw for all FFmpeg checks (both "ffmpeg -version" and fallback paths) + vi.mocked(execSync).mockImplementation(() => { + throw new Error("Command not found") + }) + + // Clear cache and force recheck with the new mock + findFFmpeg(true) + + await 
expect(listMicrophoneDevices()).rejects.toThrow("FFmpeg not found") + }) + }) +}) diff --git a/src/shared/ExtensionMessage.ts b/src/shared/ExtensionMessage.ts index 3e8482f103a..20241d6347f 100644 --- a/src/shared/ExtensionMessage.ts +++ b/src/shared/ExtensionMessage.ts @@ -33,7 +33,7 @@ import { import { ClineRulesToggles } from "./cline-rules" import { KiloCodeWrapperProperties } from "./kilocode/wrapper" import { DeploymentRecord } from "../api/providers/fetchers/sap-ai-core" -import { STTSegment } from "./sttContract" // kilocode_change: STT segment type +import { STTSegment, MicrophoneDevice } from "./sttContract" // kilocode_change: STT segment type and microphone device // kilocode_change end // Command interface for frontend/backend communication @@ -139,6 +139,8 @@ export interface ExtensionMessage { | "stt:volume" // kilocode_change: STT volume level | "stt:stopped" // kilocode_change: STT session stopped | "stt:statusResponse" // kilocode_change: Response to stt:checkAvailability request + | "stt:devices" // kilocode_change: Microphone devices list + | "stt:deviceSelected" // kilocode_change: Device selection confirmation | "setHistoryPreviewCollapsed" | "commandExecutionStatus" | "mcpExecutionStatus" @@ -277,6 +279,8 @@ export interface ExtensionMessage { level?: number // kilocode_change: STT volume level (0-1) reason?: "completed" | "cancelled" | "error" // kilocode_change: STT stop reason speechToTextStatus?: { available: boolean; reason?: "openaiKeyMissing" | "ffmpegNotInstalled" } // kilocode_change: Speech-to-text availability status response + devices?: MicrophoneDevice[] // kilocode_change: Microphone devices list + device?: MicrophoneDevice | null // kilocode_change: Selected microphone device requestId?: string promptText?: string results?: { path: string; type: "file" | "folder"; label?: string }[] @@ -460,6 +464,7 @@ export type ExtensionState = Pick< | "includeCurrentTime" | "includeCurrentCost" | "maxGitStatusFiles" + | 
"selectedMicrophoneDevice" // kilocode_change: Selected microphone device for STT > & { version: string clineMessages: ClineMessage[] diff --git a/src/shared/WebviewMessage.ts b/src/shared/WebviewMessage.ts index 656f1a0fb2d..8d53b561388 100644 --- a/src/shared/WebviewMessage.ts +++ b/src/shared/WebviewMessage.ts @@ -18,6 +18,7 @@ import { } from "@roo-code/types" import { Mode } from "./modes" +import { MicrophoneDevice } from "./sttContract" // kilocode_change: Microphone device type for STT export type ClineAskResponse = | "yesButtonClicked" @@ -146,6 +147,8 @@ export interface WebviewMessage { | "stt:stop" // kilocode_change: Stop STT recording | "stt:cancel" // kilocode_change: Cancel STT recording | "stt:checkAvailability" // kilocode_change: Check STT availability on demand + | "stt:listDevices" // kilocode_change: List microphone devices + | "stt:selectDevice" // kilocode_change: Select microphone device | "includeTaskHistoryInEnhance" // kilocode_change | "snoozeAutocomplete" // kilocode_change | "autoApprovalEnabled" @@ -340,6 +343,7 @@ export interface WebviewMessage { setting?: string slug?: string language?: string // User's language for speech transcription (STT) + device?: MicrophoneDevice | null // kilocode_change: Microphone device for stt:selectDevice modeConfig?: ModeConfig timeout?: number payload?: WebViewMessagePayload diff --git a/src/shared/sttContract.ts b/src/shared/sttContract.ts index 2ef58a6e855..9a9f7dcc8de 100644 --- a/src/shared/sttContract.ts +++ b/src/shared/sttContract.ts @@ -1,6 +1,15 @@ // kilocode_change - new file: STT contract types shared between extension and webview // Speech-to-Text (STT) event protocol +/** + * Microphone device information + */ +export interface MicrophoneDevice { + id: string // FFmpeg device identifier (e.g., "0" or "audio=Microphone") + name: string // Human-readable name (e.g., "Built-in Microphone") + platform: string // Platform where discovered (e.g., "darwin", "linux", "win32") +} + /** * 
Commands: WebView → Extension */ @@ -17,7 +26,26 @@ export interface STTCancelCommand { type: "stt:cancel" } -export type STTCommand = STTStartCommand | STTStopCommand | STTCancelCommand +export interface STTListDevicesCommand { + type: "stt:listDevices" +} + +export interface STTSelectDeviceCommand { + type: "stt:selectDevice" + device: MicrophoneDevice | null // null means use system default +} + +export interface STTCheckAvailabilityCommand { + type: "stt:checkAvailability" +} + +export type STTCommand = + | STTStartCommand + | STTStopCommand + | STTCancelCommand + | STTListDevicesCommand + | STTSelectDeviceCommand + | STTCheckAvailabilityCommand /** * Events: Extension → WebView @@ -56,11 +84,36 @@ export interface STTStoppedEvent { error?: string // Error message (when reason === "error") } -export type STTEvent = STTStartedEvent | STTTranscriptEvent | STTVolumeEvent | STTStoppedEvent +export interface STTDevicesEvent { + type: "stt:devices" + devices: MicrophoneDevice[] +} + +export interface STTDeviceSelectedEvent { + type: "stt:deviceSelected" + device: MicrophoneDevice | null +} + +export interface STTStatusResponseEvent { + type: "stt:statusResponse" + speechToTextStatus: { + available: boolean + reason?: "openaiKeyMissing" | "ffmpegNotInstalled" + } +} + +export type STTEvent = + | STTStartedEvent + | STTTranscriptEvent + | STTVolumeEvent + | STTStoppedEvent + | STTDevicesEvent + | STTDeviceSelectedEvent + | STTStatusResponseEvent /** * Type guard for routing in message handlers */ export function isSTTCommand(msg: { type: string }): msg is STTCommand { - return msg.type === "stt:start" || msg.type === "stt:stop" || msg.type === "stt:cancel" + return msg.type.startsWith("stt:") } diff --git a/webview-ui/src/components/settings/ExperimentalSettings.tsx b/webview-ui/src/components/settings/ExperimentalSettings.tsx index 140b389ff9f..46b8cc68a6e 100644 --- a/webview-ui/src/components/settings/ExperimentalSettings.tsx +++ 
b/webview-ui/src/components/settings/ExperimentalSettings.tsx @@ -17,6 +17,7 @@ import { Section } from "./Section" import { ExperimentalFeature } from "./ExperimentalFeature" import { FastApplySettings } from "./FastApplySettings" // kilocode_change: Use Fast Apply version import { ImageGenerationSettings } from "./ImageGenerationSettings" +import { STTSettings } from "./STTSettings" // kilocode_change: STT microphone settings type ExperimentalSettingsProps = HTMLAttributes & { experiments: Experiments @@ -122,6 +123,22 @@ export const ExperimentalSettings = ({ ) } // kilocode_change end + if (config[0] === "SPEECH_TO_TEXT") { + const enabled = experiments[EXPERIMENT_IDS.SPEECH_TO_TEXT] ?? false + return ( + + + setExperimentEnabled(EXPERIMENT_IDS.SPEECH_TO_TEXT, enabled) + } + /> + {enabled && } + + ) + } if ( config[0] === "IMAGE_GENERATION" && setImageGenerationProvider && diff --git a/webview-ui/src/components/settings/STTSettings.tsx b/webview-ui/src/components/settings/STTSettings.tsx new file mode 100644 index 00000000000..add3e3cf440 --- /dev/null +++ b/webview-ui/src/components/settings/STTSettings.tsx @@ -0,0 +1,86 @@ +// kilocode_change: STT Microphone Settings +import { useEffect } from "react" +import { VSCodeDropdown, VSCodeOption } from "@vscode/webview-ui-toolkit/react" +import { useAppTranslation } from "@/i18n/TranslationContext" +import { useSTT } from "@/hooks/useSTT" +import { useExtensionState } from "@/context/ExtensionStateContext" +import { MicrophoneDevice } from "../../../../src/shared/sttContract" +import { RefreshCw } from "lucide-react" +import { Button } from "../ui/button" + +export const STTSettings = () => { + const { t } = useAppTranslation() + const { devices, isLoadingDevices, loadDevices, selectDevice, selectedDevice } = useSTT() + const extensionState = useExtensionState() + const selectedMicrophoneDevice = extensionState?.selectedMicrophoneDevice + + const savedDevice = selectedMicrophoneDevice + + const handleDeviceChange = 
(value: string) => { + if (value === "default") { + selectDevice(null) + } else { + const device = devices.find((d) => d.id === value) + if (device) { + selectDevice(device) + } + } + } + + const getCurrentDeviceValue = () => { + // Prefer saved device from extension state, fallback to hook state + const currentDevice = savedDevice !== undefined ? savedDevice : selectedDevice + + // If the current device is no longer available (e.g., device unplugged), fallback to default + if (!currentDevice || !devices.some((device: MicrophoneDevice) => device.id === currentDevice.id)) { + return "default" + } + + return currentDevice.id + } + + useEffect(() => { + loadDevices() + }, [loadDevices]) + + return ( +
+
+ +
+ handleDeviceChange(e.target.value)} + className="flex-1" + disabled={isLoadingDevices}> + + {t("kilocode:speechToText.microphone.defaultOption")} + + {devices.map((device: MicrophoneDevice) => ( + + {device.name} + + ))} + + +
+

+ {t("kilocode:speechToText.microphone.description")} +

+ {isLoadingDevices && ( +

+ {t("kilocode:speechToText.microphone.loading")} +

+ )} + {!isLoadingDevices && devices.length === 0 && ( +

+ {t("kilocode:speechToText.microphone.noDevices")} +

+ )} +
+
+ ) +} diff --git a/webview-ui/src/hooks/useSTT.ts b/webview-ui/src/hooks/useSTT.ts index cf488e197be..429ca7e6ae8 100644 --- a/webview-ui/src/hooks/useSTT.ts +++ b/webview-ui/src/hooks/useSTT.ts @@ -1,7 +1,7 @@ // kilocode_change - new file: React hook for STT (Speech-to-Text) functionality import { useState, useEffect, useCallback, useRef } from "react" import { vscode } from "../utils/vscode" -import { STTSegment } from "../../../src/shared/sttContract" +import { STTSegment, MicrophoneDevice } from "../../../src/shared/sttContract" export interface UseSTTOptions { /** Called when recording completes with final text */ @@ -23,6 +23,16 @@ export interface UseSTTReturn { stop: () => void /** Cancel recording and discard */ cancel: () => void + /** Available microphone devices */ + devices: MicrophoneDevice[] + /** Whether devices are currently loading */ + isLoadingDevices: boolean + /** Load available microphone devices */ + loadDevices: () => Promise + /** Select a microphone device (null for system default) */ + selectDevice: (device: MicrophoneDevice | null) => Promise + /** Currently selected device (null means system default) */ + selectedDevice: MicrophoneDevice | null } /** @@ -46,6 +56,9 @@ export function useSTT(options: UseSTTOptions = {}): UseSTTReturn { const [realIsRecording, setRealIsRecording] = useState(false) const [segments, setSegments] = useState([]) const [volume, setVolume] = useState(0) + const [devices, setDevices] = useState([]) + const [isLoadingDevices, setIsLoadingDevices] = useState(false) + const [selectedDevice, setSelectedDevice] = useState(null) // Track session to ignore stale events const sessionIdRef = useRef(null) @@ -78,8 +91,6 @@ export function useSTT(options: UseSTTOptions = {}): UseSTTReturn { case "stt:transcript": // Ignore events from old sessions if (msg.sessionId !== sessionIdRef.current) return - // Just pass through the segments from extension (stateless) - console.log("🎙️ [useSTT WebView] 📨 Received segments:", 
JSON.stringify(msg.segments, null, 2)) setSegments(msg.segments || []) break @@ -110,6 +121,15 @@ export function useSTT(options: UseSTTOptions = {}): UseSTTReturn { setSegments([]) sessionIdRef.current = null break + + case "stt:devices": + setDevices(msg.devices || []) + setIsLoadingDevices(false) + break + + case "stt:deviceSelected": + setSelectedDevice(msg.device) + break } } @@ -132,6 +152,15 @@ export function useSTT(options: UseSTTOptions = {}): UseSTTReturn { vscode.postMessage({ type: "stt:cancel" }) }, []) + const loadDevices = useCallback(async () => { + setIsLoadingDevices(true) + vscode.postMessage({ type: "stt:listDevices" }) + }, []) + + const selectDevice = useCallback(async (device: MicrophoneDevice | null) => { + vscode.postMessage({ type: "stt:selectDevice", device }) + }, []) + return { isRecording: optimisticIsRecording, segments, @@ -139,5 +168,10 @@ export function useSTT(options: UseSTTOptions = {}): UseSTTReturn { start, stop, cancel, + devices, + isLoadingDevices, + loadDevices, + selectDevice, + selectedDevice, } } diff --git a/webview-ui/src/i18n/locales/ar/kilocode.json b/webview-ui/src/i18n/locales/ar/kilocode.json index 42157008def..b5c1c2c1924 100644 --- a/webview-ui/src/i18n/locales/ar/kilocode.json +++ b/webview-ui/src/i18n/locales/ar/kilocode.json @@ -343,6 +343,14 @@ "ffmpegReason": "تحتاج إلى تثبيت FFmpeg. انقر هنا لمساعدتك من Kilo في تثبيت FFmpeg." 
}, "misconfiguredState": "غير متاح بسبب الإعدادات", - "errorState": "غير متاح بسبب خطأ" + "errorState": "غير متاح بسبب خطأ", + "microphone": { + "label": "ميكروفون", + "noDevices": "لم يتم العثور على أجهزة ميكروفون", + "defaultOption": "النظام الافتراضي", + "loading": "جارٍ تحميل أجهزة الميكروفون...", + "description": "حدد جهاز الميكروفون لتحويل الكلام إلى نص", + "refresh": "تحديث" + } } } diff --git a/webview-ui/src/i18n/locales/ca/kilocode.json b/webview-ui/src/i18n/locales/ca/kilocode.json index 3af8943267f..14452745dcb 100644 --- a/webview-ui/src/i18n/locales/ca/kilocode.json +++ b/webview-ui/src/i18n/locales/ca/kilocode.json @@ -343,6 +343,14 @@ "ffmpegMessage": "Ajuda'm a instal·lar FFmpeg" }, "misconfiguredState": "No disponible per la configuració", - "errorState": "No disponible a causa d'un error" + "errorState": "No disponible a causa d'un error", + "microphone": { + "label": "Micròfon", + "description": "Seleccioneu el dispositiu de micròfon per a la conversió de veu a text", + "defaultOption": "Configuració predeterminada del sistema", + "loading": "Carregant dispositius de micròfon...", + "noDevices": "No s'han trobat dispositius de micròfon", + "refresh": "Actualitza" + } } } diff --git a/webview-ui/src/i18n/locales/cs/kilocode.json b/webview-ui/src/i18n/locales/cs/kilocode.json index 37d347978af..c7ed28dffaa 100644 --- a/webview-ui/src/i18n/locales/cs/kilocode.json +++ b/webview-ui/src/i18n/locales/cs/kilocode.json @@ -342,6 +342,14 @@ "ffmpegMessage": "Pomozte mi nainstalovat FFmpeg" }, "misconfiguredState": "Nedostupné kvůli konfiguraci", - "errorState": "Nedostupné kvůli chybě" + "errorState": "Nedostupné kvůli chybě", + "microphone": { + "label": "Mikrofon", + "description": "Vyberte mikrofonní zařízení pro převod řeči na text", + "defaultOption": "Výchozí nastavení systému", + "loading": "Načítání mikrofonních zařízení...", + "refresh": "Obnovit", + "noDevices": "Nebyla nalezena žádná zařízení mikrofonu" + } } } diff --git 
a/webview-ui/src/i18n/locales/de/kilocode.json b/webview-ui/src/i18n/locales/de/kilocode.json index 02cee224b37..dd76e1e06ee 100644 --- a/webview-ui/src/i18n/locales/de/kilocode.json +++ b/webview-ui/src/i18n/locales/de/kilocode.json @@ -342,6 +342,14 @@ "ffmpegMessage": "Hilf mir, FFmpeg zu installieren" }, "misconfiguredState": "Nicht verfügbar aufgrund der Konfiguration", - "errorState": "Nicht verfügbar aufgrund eines Fehlers" + "errorState": "Nicht verfügbar aufgrund eines Fehlers", + "microphone": { + "label": "Mikrofon", + "defaultOption": "Systemstandard", + "loading": "Mikrofongeräte werden geladen...", + "description": "Wählen Sie das Mikrofon-Gerät für die Sprache-zu-Text-Umwandlung aus", + "noDevices": "Keine Mikrofongeräte gefunden", + "refresh": "Aktualisieren" + } } } diff --git a/webview-ui/src/i18n/locales/en/kilocode.json b/webview-ui/src/i18n/locales/en/kilocode.json index dfe8830801c..4bc28f61d00 100644 --- a/webview-ui/src/i18n/locales/en/kilocode.json +++ b/webview-ui/src/i18n/locales/en/kilocode.json @@ -341,6 +341,14 @@ "openAiReason": "You need a valid OpenAI provider with an API key to use voice transcription.", "ffmpegReason": "You need FFmpeg installed. 
Click here to have Kilo help you install FFmpeg.", "ffmpegMessage": "Help me install FFmpeg" + }, + "microphone": { + "label": "Microphone", + "description": "Select the microphone device for speech-to-text", + "defaultOption": "System Default", + "loading": "Loading microphone devices...", + "noDevices": "No microphone devices found", + "refresh": "Refresh" } } } diff --git a/webview-ui/src/i18n/locales/es/kilocode.json b/webview-ui/src/i18n/locales/es/kilocode.json index a16a1f5e8f6..f2b53805c5e 100644 --- a/webview-ui/src/i18n/locales/es/kilocode.json +++ b/webview-ui/src/i18n/locales/es/kilocode.json @@ -342,6 +342,14 @@ "ffmpegMessage": "Ayúdame a instalar FFmpeg" }, "misconfiguredState": "No disponible debido a la configuración", - "errorState": "No disponible debido a un error" + "errorState": "No disponible debido a un error", + "microphone": { + "label": "Micrófono", + "description": "Seleccione el dispositivo de micrófono para convertir voz a texto", + "defaultOption": "Predeterminado del sistema", + "loading": "Cargando dispositivos de micrófono...", + "refresh": "Actualizar", + "noDevices": "No se encontraron dispositivos de micrófono" + } } } diff --git a/webview-ui/src/i18n/locales/fr/kilocode.json b/webview-ui/src/i18n/locales/fr/kilocode.json index 4b98de6ddb4..71eb7d7a3be 100644 --- a/webview-ui/src/i18n/locales/fr/kilocode.json +++ b/webview-ui/src/i18n/locales/fr/kilocode.json @@ -342,6 +342,14 @@ "ffmpegMessage": "Aidez-moi à installer FFmpeg" }, "misconfiguredState": "Indisponible en raison de la configuration", - "errorState": "Indisponible en raison d'une erreur" + "errorState": "Indisponible en raison d'une erreur", + "microphone": { + "label": "Microphone", + "description": "Sélectionnez le périphérique microphone pour la reconnaissance vocale", + "loading": "Chargement des périphériques microphone...", + "noDevices": "Aucun périphérique de microphone trouvé", + "defaultOption": "Paramètre par défaut du système", + "refresh": "Actualiser" + 
} } } diff --git a/webview-ui/src/i18n/locales/hi/kilocode.json b/webview-ui/src/i18n/locales/hi/kilocode.json index 2f6198c2570..16cc598422f 100644 --- a/webview-ui/src/i18n/locales/hi/kilocode.json +++ b/webview-ui/src/i18n/locales/hi/kilocode.json @@ -342,6 +342,14 @@ "ffmpegMessage": "मुझे FFmpeg इंस्टॉल करने में मदद करें" }, "misconfiguredState": "कॉन्फ़िग की वजह से अनुपलब्ध", - "errorState": "त्रुटि के कारण अनुपलब्ध" + "errorState": "त्रुटि के कारण अनुपलब्ध", + "microphone": { + "label": "माइक्रोफ़ोन", + "defaultOption": "सिस्टम डिफ़ॉल्ट", + "loading": "माइक्रोफ़ोन डिवाइस लोड हो रहे हैं...", + "description": "स्पीच-टू-टेक्स्ट के लिए माइक्रोफ़ोन डिवाइस चुनें", + "noDevices": "कोई माइक्रोफ़ोन डिवाइस नहीं मिला", + "refresh": "रीफ़्रेश" + } } } diff --git a/webview-ui/src/i18n/locales/id/kilocode.json b/webview-ui/src/i18n/locales/id/kilocode.json index 6b6202f2074..30cd5dcb90e 100644 --- a/webview-ui/src/i18n/locales/id/kilocode.json +++ b/webview-ui/src/i18n/locales/id/kilocode.json @@ -342,6 +342,14 @@ "ffmpegMessage": "Bantu saya menginstal FFmpeg" }, "misconfiguredState": "Tidak tersedia karena konfigurasi", - "errorState": "Tidak tersedia karena terjadi kesalahan" + "errorState": "Tidak tersedia karena terjadi kesalahan", + "microphone": { + "label": "Mikrofon", + "defaultOption": "Default Sistem", + "description": "Pilih perangkat mikrofon untuk pengenalan suara-ke-teks", + "noDevices": "Tidak ada perangkat mikrofon yang ditemukan", + "loading": "Memuat perangkat mikrofon...", + "refresh": "Segarkan" + } } } diff --git a/webview-ui/src/i18n/locales/it/kilocode.json b/webview-ui/src/i18n/locales/it/kilocode.json index 4dd8c125d20..3bd4bb4d4ad 100644 --- a/webview-ui/src/i18n/locales/it/kilocode.json +++ b/webview-ui/src/i18n/locales/it/kilocode.json @@ -343,6 +343,14 @@ "ffmpegMessage": "Aiutami a installare FFmpeg" }, "misconfiguredState": "Non disponibile a causa della configurazione", - "errorState": "Non disponibile a causa di un errore" + "errorState": 
"Non disponibile a causa di un errore", + "microphone": { + "label": "Microfono", + "noDevices": "Nessun dispositivo microfono trovato", + "loading": "Caricamento dei dispositivi microfono...", + "refresh": "Aggiorna", + "defaultOption": "Impostazione predefinita del sistema", + "description": "Seleziona il dispositivo microfono per il riconoscimento vocale" + } } } diff --git a/webview-ui/src/i18n/locales/ja/kilocode.json b/webview-ui/src/i18n/locales/ja/kilocode.json index 18276252df4..471c812bea8 100644 --- a/webview-ui/src/i18n/locales/ja/kilocode.json +++ b/webview-ui/src/i18n/locales/ja/kilocode.json @@ -343,6 +343,14 @@ "ffmpegMessage": "FFmpegのインストールを手伝ってください" }, "misconfiguredState": "設定により利用不可", - "errorState": "エラーのため利用できません" + "errorState": "エラーのため利用できません", + "microphone": { + "label": "マイクロフォン", + "defaultOption": "システムデフォルト", + "description": "音声からテキストへの変換用のマイクデバイスを選択してください", + "loading": "マイクデバイスを読み込んでいます...", + "noDevices": "マイクデバイスが見つかりません", + "refresh": "更新" + } } } diff --git a/webview-ui/src/i18n/locales/ko/kilocode.json b/webview-ui/src/i18n/locales/ko/kilocode.json index b5ddb6a6179..a3fe8f4aeab 100644 --- a/webview-ui/src/i18n/locales/ko/kilocode.json +++ b/webview-ui/src/i18n/locales/ko/kilocode.json @@ -342,6 +342,14 @@ "ffmpegMessage": "FFmpeg 설치를 도와주세요" }, "misconfiguredState": "설정으로 인해 사용할 수 없음", - "errorState": "오류로 인해 사용할 수 없음" + "errorState": "오류로 인해 사용할 수 없음", + "microphone": { + "label": "마이크", + "description": "음성 인식을 위한 마이크 장치를 선택하세요", + "loading": "마이크 장치 로딩 중...", + "defaultOption": "시스템 기본값", + "noDevices": "마이크 장치가 발견되지 않음", + "refresh": "새로 고침" + } } } diff --git a/webview-ui/src/i18n/locales/nl/kilocode.json b/webview-ui/src/i18n/locales/nl/kilocode.json index 3e88c7025f4..3d9c01172a0 100644 --- a/webview-ui/src/i18n/locales/nl/kilocode.json +++ b/webview-ui/src/i18n/locales/nl/kilocode.json @@ -341,6 +341,14 @@ "ffmpegReason": "Je hebt FFmpeg nodig. Klik hier om Kilo te laten helpen met het installeren van FFmpeg." 
}, "misconfiguredState": "Niet beschikbaar vanwege configuratie", - "errorState": "Niet beschikbaar vanwege een fout" + "errorState": "Niet beschikbaar vanwege een fout", + "microphone": { + "label": "Microfoon", + "defaultOption": "Systeemstandaard", + "description": "Selecteer het microfoonapparaat voor spraak-naar-tekst", + "loading": "Microfoonapparaten laden...", + "noDevices": "Geen microfoonapparaten gevonden", + "refresh": "Vernieuwen" + } } } diff --git a/webview-ui/src/i18n/locales/pl/kilocode.json b/webview-ui/src/i18n/locales/pl/kilocode.json index 6642fce2575..f0e8426e294 100644 --- a/webview-ui/src/i18n/locales/pl/kilocode.json +++ b/webview-ui/src/i18n/locales/pl/kilocode.json @@ -343,6 +343,14 @@ "ffmpegMessage": "Pomóż mi zainstalować FFmpeg" }, "misconfiguredState": "Niedostępne z powodu konfiguracji", - "errorState": "Niedostępne z powodu błędu" + "errorState": "Niedostępne z powodu błędu", + "microphone": { + "label": "Mikrofon", + "description": "Wybierz urządzenie mikrofonu do przetwarzania mowy na tekst", + "defaultOption": "Domyślny systemowy", + "loading": "Ładowanie urządzeń mikrofonowych...", + "refresh": "Odśwież", + "noDevices": "Nie znaleziono urządzeń mikrofonowych" + } } } diff --git a/webview-ui/src/i18n/locales/pt-BR/kilocode.json b/webview-ui/src/i18n/locales/pt-BR/kilocode.json index c0267826640..675e9d5f79a 100644 --- a/webview-ui/src/i18n/locales/pt-BR/kilocode.json +++ b/webview-ui/src/i18n/locales/pt-BR/kilocode.json @@ -342,6 +342,14 @@ "ffmpegMessage": "Ajude-me a instalar o FFmpeg" }, "misconfiguredState": "Indisponível devido à configuração", - "errorState": "Indisponível devido a erro" + "errorState": "Indisponível devido a erro", + "microphone": { + "label": "Microfone", + "loading": "Carregando dispositivos de microfone...", + "defaultOption": "Padrão do Sistema", + "description": "Selecione o dispositivo de microfone para conversão de fala em texto", + "noDevices": "Nenhum dispositivo de microfone encontrado", + 
"refresh": "Atualizar" + } } } diff --git a/webview-ui/src/i18n/locales/ru/kilocode.json b/webview-ui/src/i18n/locales/ru/kilocode.json index 45ebe181e1b..572d4ca77b5 100644 --- a/webview-ui/src/i18n/locales/ru/kilocode.json +++ b/webview-ui/src/i18n/locales/ru/kilocode.json @@ -343,6 +343,14 @@ "ffmpegReason": "Необходимо установить FFmpeg. Нажмите здесь, чтобы Kilo помог вам установить FFmpeg." }, "misconfiguredState": "Недоступно из-за конфигурации", - "errorState": "Недоступно из-за ошибки" + "errorState": "Недоступно из-за ошибки", + "microphone": { + "label": "Микрофон", + "description": "Выберите микрофон для преобразования речи в текст", + "loading": "Загрузка устройств микрофона...", + "refresh": "Обновить", + "noDevices": "Микрофонные устройства не найдены", + "defaultOption": "Системные настройки по умолчанию" + } } } diff --git a/webview-ui/src/i18n/locales/th/kilocode.json b/webview-ui/src/i18n/locales/th/kilocode.json index dd45a391d05..46d77034b35 100644 --- a/webview-ui/src/i18n/locales/th/kilocode.json +++ b/webview-ui/src/i18n/locales/th/kilocode.json @@ -342,6 +342,14 @@ "ffmpegMessage": "ช่วยฉันติดตั้ง FFmpeg" }, "misconfiguredState": "ไม่พร้อมใช้งานเนื่องจากการตั้งค่า", - "errorState": "ไม่พร้อมใช้งานเนื่องจากข้อผิดพลาด" + "errorState": "ไม่พร้อมใช้งานเนื่องจากข้อผิดพลาด", + "microphone": { + "label": "ไมโครโฟน", + "description": "เลือกอุปกรณ์ไมโครโฟนสำหรับการแปลงเสียงเป็นข้อความ", + "defaultOption": "ค่าเริ่มต้นของระบบ", + "loading": "กำลังโหลดอุปกรณ์ไมโครโฟน...", + "noDevices": "ไม่พบอุปกรณ์ไมโครโฟน", + "refresh": "รีเฟรช" + } } } diff --git a/webview-ui/src/i18n/locales/tr/kilocode.json b/webview-ui/src/i18n/locales/tr/kilocode.json index 0ef5444a656..5e7d2e91ef7 100644 --- a/webview-ui/src/i18n/locales/tr/kilocode.json +++ b/webview-ui/src/i18n/locales/tr/kilocode.json @@ -342,6 +342,14 @@ "ffmpegReason": "FFmpeg'in yüklü olması gerekiyor. FFmpeg'i yüklemeniz için Kilo'nun size yardımcı olması için buraya tıklayın." 
}, "misconfiguredState": "Yapılandırma nedeniyle kullanılamıyor", - "errorState": "Hata nedeniyle kullanılamıyor" + "errorState": "Hata nedeniyle kullanılamıyor", + "microphone": { + "label": "Mikrofon", + "defaultOption": "Sistem Varsayılanı", + "description": "Konuşma-yazma için mikrofon cihazını seçin", + "noDevices": "Mikrofon cihazı bulunamadı", + "loading": "Mikrofon cihazları yükleniyor...", + "refresh": "Yenile" + } } } diff --git a/webview-ui/src/i18n/locales/uk/kilocode.json b/webview-ui/src/i18n/locales/uk/kilocode.json index ee33ecf8450..bbdd5e202e5 100644 --- a/webview-ui/src/i18n/locales/uk/kilocode.json +++ b/webview-ui/src/i18n/locales/uk/kilocode.json @@ -342,6 +342,14 @@ "ffmpegMessage": "Допоможіть мені встановити FFmpeg" }, "misconfiguredState": "Недоступно через конфігурацію", - "errorState": "Недоступно через помилку" + "errorState": "Недоступно через помилку", + "microphone": { + "label": "Мікрофон", + "defaultOption": "Системний стандарт", + "loading": "Завантаження мікрофонних пристроїв...", + "description": "Виберіть пристрій мікрофона для перетворення мови в текст", + "refresh": "Оновити", + "noDevices": "Не знайдено пристроїв мікрофона" + } } } diff --git a/webview-ui/src/i18n/locales/vi/kilocode.json b/webview-ui/src/i18n/locales/vi/kilocode.json index 5be61f12782..8697557e30f 100644 --- a/webview-ui/src/i18n/locales/vi/kilocode.json +++ b/webview-ui/src/i18n/locales/vi/kilocode.json @@ -343,6 +343,14 @@ "ffmpegReason": "Bạn cần cài đặt FFmpeg. Nhấp vào đây để Kilo giúp bạn cài đặt FFmpeg." 
}, "misconfiguredState": "Không khả dụng do cấu hình", - "errorState": "Không khả dụng do lỗi" + "errorState": "Không khả dụng do lỗi", + "microphone": { + "label": "Micro", + "defaultOption": "Mặc định hệ thống", + "noDevices": "Không tìm thấy thiết bị micro nào", + "loading": "Đang tải các thiết bị microphone...", + "description": "Chọn thiết bị microphone cho chuyển giọng nói thành văn bản", + "refresh": "Làm mới" + } } } diff --git a/webview-ui/src/i18n/locales/zh-CN/kilocode.json b/webview-ui/src/i18n/locales/zh-CN/kilocode.json index 2c5036317ad..365cf4d1635 100644 --- a/webview-ui/src/i18n/locales/zh-CN/kilocode.json +++ b/webview-ui/src/i18n/locales/zh-CN/kilocode.json @@ -343,6 +343,14 @@ "ffmpegMessage": "帮我安装 FFmpeg" }, "misconfiguredState": "因配置问题而不可用", - "errorState": "因错误而不可用" + "errorState": "因错误而不可用", + "microphone": { + "label": "麦克风", + "defaultOption": "系统默认", + "description": "选择用于语音转文字的麦克风设备", + "loading": "正在加载麦克风设备...", + "noDevices": "未找到麦克风设备", + "refresh": "刷新" + } } } diff --git a/webview-ui/src/i18n/locales/zh-TW/kilocode.json b/webview-ui/src/i18n/locales/zh-TW/kilocode.json index 9003e4485d7..e959fdc7e71 100644 --- a/webview-ui/src/i18n/locales/zh-TW/kilocode.json +++ b/webview-ui/src/i18n/locales/zh-TW/kilocode.json @@ -342,6 +342,14 @@ "ffmpegMessage": "帮我安装 FFmpeg" }, "misconfiguredState": "因配置问题而不可用", - "errorState": "因错误而不可用" + "errorState": "因錯誤而無法使用", + "microphone": { + "label": "麥克風", + "description": "選擇麥克風設備進行語音轉文字", + "loading": "正在載入麥克風設備...", + "refresh": "重新整理", + "defaultOption": "系統預設", + "noDevices": "找不到麥克風設備" + } } }