diff --git a/.changeset/add-yolo-keybinding.md b/.changeset/add-yolo-keybinding.md new file mode 100644 index 00000000000..e0e34ec5879 --- /dev/null +++ b/.changeset/add-yolo-keybinding.md @@ -0,0 +1,5 @@ +--- +"@kilocode/cli": patch +--- + +feat(cli): add Ctrl+Y keybinding to toggle YOLO mode diff --git a/.changeset/humble-points-care.md b/.changeset/humble-points-care.md deleted file mode 100644 index 25647d82323..00000000000 --- a/.changeset/humble-points-care.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"kilo-code": patch ---- - -Fix: bottom controls no longer overlap with create mode button diff --git a/.changeset/pink-gorillas-breathe.md b/.changeset/pink-gorillas-breathe.md deleted file mode 100644 index 44085877c6e..00000000000 --- a/.changeset/pink-gorillas-breathe.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"kilo-code": patch ---- - -fix: resolve AbortSignal memory leak in CLI (MaxListenersExceededWarning) diff --git a/.changeset/pretty-memes-lose.md b/.changeset/pretty-memes-lose.md deleted file mode 100644 index 79cc638e42c..00000000000 --- a/.changeset/pretty-memes-lose.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"kilo-code": patch ---- - -Split autocomplete suggestion in current line and next lines in most cases diff --git a/.changeset/smart-otters-smell.md b/.changeset/smart-otters-smell.md deleted file mode 100644 index b519faf1f6a..00000000000 --- a/.changeset/smart-otters-smell.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -"@kilocode/cli": minor -"kilo-code": minor ---- - -send org id and last mode with session data diff --git a/.changeset/tidy-agent-manager-errors.md b/.changeset/tidy-agent-manager-errors.md deleted file mode 100644 index 221811faed5..00000000000 --- a/.changeset/tidy-agent-manager-errors.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"kilo-code": patch ---- - -Handle different cli authentication errors when using agent manager diff --git a/CHANGELOG.md b/CHANGELOG.md index 5d5fbac0633..73a20816d08 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,27 @@ # kilo-code +## 4.137.0 + +### Minor Changes + +- [#4394](https://github.com/Kilo-Org/kilocode/pull/4394) [`01b968b`](https://github.com/Kilo-Org/kilocode/commit/01b968ba4635a162c787169bffe1809fc1ab973a) Thanks [@hassoncs](https://github.com/hassoncs)! - Add Speech-To-Text experiment for the chat input powered by ffmpeg and the OpenAI Whisper API + +- [#4388](https://github.com/Kilo-Org/kilocode/pull/4388) [`af93318`](https://github.com/Kilo-Org/kilocode/commit/af93318e3648c235721ba58fe9caab9429608241) Thanks [@iscekic](https://github.com/iscekic)! - send org id and last mode with session data + +### Patch Changes + +- [#4412](https://github.com/Kilo-Org/kilocode/pull/4412) [`d56879c`](https://github.com/Kilo-Org/kilocode/commit/d56879c58f65c8da1419c9840816720279bec4e6) Thanks [@quantizoor](https://github.com/quantizoor)! - Added support for xhigh reasoning effort + +- [#4415](https://github.com/Kilo-Org/kilocode/pull/4415) [`5e670d1`](https://github.com/Kilo-Org/kilocode/commit/5e670d14047054a2f92a9057391286402076b5a5) Thanks [@kevinvandijk](https://github.com/kevinvandijk)! - Fix: bottom controls no longer overlap with create mode button + +- [#4416](https://github.com/Kilo-Org/kilocode/pull/4416) [`026da65`](https://github.com/Kilo-Org/kilocode/commit/026da65fdb9f16d23216197412e06ca2ed208639) Thanks [@marius-kilocode](https://github.com/marius-kilocode)! 
- fix: resolve AbortSignal memory leak in CLI (MaxListenersExceededWarning) + +- [#4392](https://github.com/Kilo-Org/kilocode/pull/4392) [`73681e9`](https://github.com/Kilo-Org/kilocode/commit/73681e9002af4c5aa3fec3bc2a86e8008dc926af) Thanks [@markijbema](https://github.com/markijbema)! - Split autocomplete suggestion in current line and next lines in most cases + +- [#4426](https://github.com/Kilo-Org/kilocode/pull/4426) [`fdc0c0a`](https://github.com/Kilo-Org/kilocode/commit/fdc0c0a07d49c4726997121ad540d6c855965e7b) Thanks [@kevinvandijk](https://github.com/kevinvandijk)! - Fix API request errors with MCP functions incompatible with OpenAI strict mode + +- [#4373](https://github.com/Kilo-Org/kilocode/pull/4373) [`a80ec02`](https://github.com/Kilo-Org/kilocode/commit/a80ec02db75c061163100ce91d099f4fd3846a99) Thanks [@marius-kilocode](https://github.com/marius-kilocode)! - Handle different cli authentication errors when using agent manager + ## 4.136.0 ### Minor Changes diff --git a/apps/kilocode-docs/docs/basic-usage/model-selection-guide.md b/apps/kilocode-docs/docs/basic-usage/model-selection-guide.md index 681dd1ef20a..fa98c0b79c4 100644 --- a/apps/kilocode-docs/docs/basic-usage/model-selection-guide.md +++ b/apps/kilocode-docs/docs/basic-usage/model-selection-guide.md @@ -2,83 +2,41 @@ sidebar_label: "Model Selection Guide" --- -# Kilo Code Model Selection Guide +# Model Selection Guide -Last updated: September 3, 2025. +Here's the honest truth about AI model recommendations: by the time I write them down, they're probably already outdated. New models drop every few weeks, existing ones get updated, prices shift, and yesterday's champion becomes today's budget option. -The AI model landscape evolves rapidly, so this guide focuses on what's delivering excellent results with Kilo Code right now. We update this regularly as new models emerge and performance shifts. +Instead of maintaining a static list that's perpetually behind, we built something better — a real-time leaderboard showing which models Kilo Code users are actually having success with right now. -## Kilo Code Top Performers +## Check the Live Models List -| Model | Context Window | SWE-Bench Verified | Human Eval | LiveCodeBench | Input Price\* | Output Price\* | Best For | -| -------------------- | -------------- | ------------------ | ---------- | ------------- | ------------- | -------------- | ------------------------------------------- | -| **GPT-5** | 400K tokens | 74.9% | 96.3% | 68.2% | $1.25 | $10 | Latest capabilities, multi-modal coding | -| **Claude Sonnet 4** | 1M tokens | 72.7% | 94.8% | 65.9% | $3-6 | $15-22.50 | Enterprise code generation, complex systems | -| **Grok Code Fast 1** | 256K tokens | 70.8% | 92.1% | 63.4% | $0.20 | $1.50 | Rapid development, cost-performance balance | -| **Qwen3 Coder** | 256K tokens | 68.4% | 91.7% | 61.8% | $0.20 | $0.80 | Pure coding tasks, rapid prototyping | -| **Gemini 2.5 Pro** | 1M+ tokens | 67.2% | 89.9% | 59.3% | TBD | TBD | Massive codebases, architectural planning | +**[👉 See what's working today at kilo.ai/models](https://kilo.ai/models)** -\*Per million tokens +This isn't a set of benchmarks from some lab. It's real usage data from developers like you, updated continuously. You'll see which models people are choosing for different tasks, what's delivering results, and how the landscape is shifting in real time.
-## Budget-Conscious Options +## General Guidance -| Model | Context Window | SWE-Bench Verified | Human Eval | LiveCodeBench | Input Price\* | Output Price\* | Notes | -| ---------------- | -------------- | ------------------ | ---------- | ------------- | ------------- | -------------- | ------------------------------------ | -| **DeepSeek V3** | 128K tokens | 64.1% | 87.3% | 56.7% | $0.14 | $0.28 | Exceptional value for daily coding | -| **DeepSeek R1** | 128K tokens | 62.8% | 85.9% | 54.2% | $0.55 | $2.19 | Advanced reasoning at budget prices | -| **Qwen3 32B** | 128K tokens | 60.3% | 83.4% | 52.1% | Varies | Varies | Open source flexibility | -| **Z AI GLM 4.5** | 128K tokens | 58.7% | 81.2% | 49.8% | TBD | TBD | MIT license, hybrid reasoning system | +While the specifics change constantly, some principles stay consistent: -\*Per million tokens +**For complex coding tasks**: Premium models (Claude Sonnet/Opus, GPT-5 class, Gemini Pro) typically handle nuanced requirements, large refactors, and architectural decisions better. -## Comprehensive Evaluation Framework +**For everyday coding**: Mid-tier models often provide the best balance of speed, cost, and quality. They're fast enough to keep your flow state intact and capable enough for most tasks. -### Latency Performance +**For budget-conscious work**: Newer efficient models keep surprising us with price-to-performance ratios. DeepSeek, Qwen, and similar models can handle more than you'd expect. -Response times significantly impact development flow and productivity: +**For local/private work**: Ollama and LM Studio let you run models locally. The tradeoff is usually speed and capability for privacy and zero API costs. -- **Ultra-Fast (< 2s)**: Grok Code Fast 1, Qwen3 Coder -- **Fast (2-4s)**: DeepSeek V3, GPT-5 -- **Moderate (4-8s)**: Claude Sonnet 4, DeepSeek R1 -- **Slower (8-15s)**: Gemini 2.5 Pro, Z AI GLM 4.5 +## Context Windows Matter -**Impact on Development**: Ultra-fast models enable real-time coding assistance and immediate feedback loops. Models with 8+ second latency can disrupt flow state but may be acceptable for complex architectural decisions. +One thing that doesn't change: context window size matters for your workflow. -### Throughput Analysis +- **Small projects** (scripts, components): 32-64K tokens works fine +- **Standard applications**: 128K tokens handles most multi-file context +- **Large codebases**: 256K+ tokens helps with cross-system understanding +- **Massive systems**: 1M+ token models exist but effectiveness degrades at the extremes -Token generation rates affect large codebase processing: +Check [our provider docs](/docs/providers/openrouter) for specific context limits on each model. -- **High Throughput (150+ tokens/s)**: GPT-5, Grok Code Fast 1 -- **Medium Throughput (100-150 tokens/s)**: Claude Sonnet 4, Qwen3 Coder -- **Standard Throughput (50-100 tokens/s)**: DeepSeek models, Gemini 2.5 Pro -- **Variable Throughput**: Open source models depend on infrastructure +## Stay Current -**Scaling Factors**: High throughput models excel when generating extensive documentation, refactoring large files, or batch processing multiple components. 
- -### Reliability & Availability - -Enterprise considerations for production environments: - -- **Enterprise Grade (99.9%+ uptime)**: Claude Sonnet 4, GPT-5, Gemini 2.5 Pro -- **Production Ready (99%+ uptime)**: Qwen3 Coder, Grok Code Fast 1 -- **Developing Reliability**: DeepSeek models, Z AI GLM 4.5 -- **Self-Hosted**: Qwen3 32B (reliability depends on your infrastructure) - -**Success Rates**: Enterprise models maintain consistent output quality and handle edge cases more gracefully, while budget options may require additional validation steps. - -### Context Window Strategy - -Optimizing for different project scales: - -| Size | Word Count | Typical Use Case | Recommended Models | Strategy | -| ---------------- | --------------- | ------------------------------------- | -------------------------------------- | ----------------------------------------------- | -| **32K tokens** | ~24,000 words | Individual components, scripts | DeepSeek V3, Qwen3 Coder | Focus on single-file optimization | -| **128K tokens** | ~96,000 words | Standard applications, most projects | All budget models, Grok Code Fast 1 | Multi-file context, moderate complexity | -| **256K tokens** | ~192,000 words | Large applications, multiple services | Qwen3 Coder, Grok Code Fast 1 | Full feature context, service integration | -| **400K+ tokens** | ~300,000+ words | Enterprise systems, full stack apps | GPT-5, Claude Sonnet 4, Gemini 2.5 Pro | Architectural overview, system-wide refactoring | - -**Performance Degradation**: Model effectiveness typically drops significantly beyond 400-500K tokens, regardless of advertised limits. Plan context usage accordingly. - -## Community Choice - -The AI model landscape changes quicky to stay up to date [**👉 check Kilo Code Community Favorites on OpenRouter**](https://openrouter.ai/apps?url=https%3A%2F%2Fkilocode.ai%2F) +The AI model space moves fast. Bookmark [kilo.ai/models](https://kilo.ai/models) and check back when you're evaluating options. What's best today might not be best next month — and that's actually exciting. diff --git a/apps/kilocode-docs/docs/features/experimental/experimental-features.md b/apps/kilocode-docs/docs/features/experimental/experimental-features.md index f55c288c8f6..cd8af979752 100644 --- a/apps/kilocode-docs/docs/features/experimental/experimental-features.md +++ b/apps/kilocode-docs/docs/features/experimental/experimental-features.md @@ -24,7 +24,13 @@ When enabled, native JSON function calling improves reliability via explicit sig It replaces brittle XML-style prompts that risk mixed prose/markup, missing fields, and regex-heavy cleanup, yielding more deterministic tool use and clearer error handling. -[More Details are available](native-function-calling) +[More details are available](native-function-calling) + +## Voice Transcription + +When enabled, voice transcription allows you to dictate messages using speech-to-text in the chat interface. Powered by OpenAI's Whisper API and FFmpeg for audio capture. + +[More details are available](voice-transcription) ## Concurrent file edits diff --git a/apps/kilocode-docs/docs/features/experimental/voice-transcription.md b/apps/kilocode-docs/docs/features/experimental/voice-transcription.md new file mode 100644 index 00000000000..ff24e24c204 --- /dev/null +++ b/apps/kilocode-docs/docs/features/experimental/voice-transcription.md @@ -0,0 +1,81 @@ +# Voice Transcription + +Kilo Code now includes experimental support for voice input in the chat interface. 
This feature allows you to dictate your messages using speech-to-text (STT) technology powered by OpenAI's Whisper API. + +## Prerequisites + +Voice transcription requires two components to be set up: + +### 1. FFmpeg Installation + +FFmpeg is required for audio capture and processing. Install it for your platform: + +**macOS:** + +```bash +brew install ffmpeg +``` + +**Linux (Ubuntu/Debian):** + +```bash +sudo apt update +sudo apt install ffmpeg +``` + +**Windows:** +Download from [ffmpeg.org/download.html](https://ffmpeg.org/download.html) and add to your system PATH. + +### 2. OpenAI API Key + +Voice transcription uses OpenAI's Whisper API for speech recognition. You need an OpenAI API configuration in Kilo Code: + +1. Configure an OpenAI provider profile in Kilo Code settings +2. Add your OpenAI API key to the profile +3. Either **OpenAI** or **OpenAI Native** provider types will work + +## Enabling Voice Transcription + +Voice transcription is an experimental feature that must be enabled: + +1. Open Kilo Code settings +2. Navigate to **Experimental Features** +3. Enable the **Speech to Text** experiment + +## Using Voice Input + +Once configured and enabled, a microphone button will appear in the chat input area: + +1. Click the microphone button to start recording +2. Speak your message clearly +3. Click again to stop recording +4. Your speech will be automatically transcribed into text + +The feature includes real-time audio level visualization and voice activity detection to automatically detect when you're speaking. + +## Technical Details + +- **Audio Processing**: Uses FFmpeg for system audio capture +- **Voice Recognition**: OpenAI Whisper API for transcription + +## Troubleshooting + +**Microphone button not appearing:** + +- Ensure the Speech to Text experiment is enabled +- Verify FFmpeg is installed and in your PATH +- Check that you have an OpenAI provider configured with a valid API key + +**Transcription errors:** + +- Verify your OpenAI API key is valid and has available credits +- Check your internet connection +- Try speaking more clearly or adjusting your microphone settings + +## Limitations + +This feature is currently experimental and may have limitations: + +- Requires active internet connection +- Uses OpenAI API credits based on audio duration +- Transcription accuracy depends on audio quality and speech clarity diff --git a/apps/storybook/stories/ChatView.stories.tsx b/apps/storybook/stories/ChatView.stories.tsx index 241e93dbc27..45a107033dd 100644 --- a/apps/storybook/stories/ChatView.stories.tsx +++ b/apps/storybook/stories/ChatView.stories.tsx @@ -94,6 +94,25 @@ export const Default: Story = { apiModelId: "claude-3-5-sonnet-20241022", apiKey: "mock-key", }, + currentApiConfigName: "Claude 3.5 Sonnet", + listApiConfigMeta: [ + { + id: "config-1", + name: "Claude 3.5 Sonnet", + profileType: "chat", + apiProvider: "anthropic", + apiModelId: "claude-3-5-sonnet-20241022", + }, + { + id: "config-2", + name: "GPT-4", + profileType: "chat", + apiProvider: "openai", + apiModelId: "gpt-4-turbo-preview", + }, + ], + pinnedApiConfigs: {}, + togglePinnedApiConfig: fn(), mcpServers: [], allowedCommands: [], mode: "code", @@ -212,6 +231,25 @@ export const EmptyWithNotificationsAndHistory: Story = { apiModelId: "claude-3-5-sonnet-20241022", apiKey: "mock-key", }, + currentApiConfigName: "Claude 3.5 Sonnet", + listApiConfigMeta: [ + { + id: "config-1", + name: "Claude 3.5 Sonnet", + profileType: "chat", + apiProvider: "anthropic", + apiModelId: 
"claude-3-5-sonnet-20241022", + }, + { + id: "config-2", + name: "GPT-4", + profileType: "chat", + apiProvider: "openai", + apiModelId: "gpt-4-turbo-preview", + }, + ], + pinnedApiConfigs: {}, + togglePinnedApiConfig: fn(), mcpServers: [], allowedCommands: [], mode: "code", diff --git a/apps/storybook/stories/VolumeVisualizer.stories.tsx b/apps/storybook/stories/VolumeVisualizer.stories.tsx new file mode 100644 index 00000000000..819eebea0bd --- /dev/null +++ b/apps/storybook/stories/VolumeVisualizer.stories.tsx @@ -0,0 +1,37 @@ +import type { Meta, StoryObj } from "@storybook/react-vite" +import { VolumeVisualizer } from "@/components/chat/VolumeVisualizer" + +const meta = { + title: "Components/VolumeVisualizer", + component: VolumeVisualizer, + parameters: { + layout: "centered", + }, + argTypes: { + volume: { + control: { type: "range", min: 0, max: 1, step: 0.01 }, + description: "Volume level from 0 to 1", + }, + isActive: { + control: "boolean", + description: "Whether recording is active (affects color)", + }, + }, +} satisfies Meta + +export default meta +type Story = StoryObj + +export const Default: Story = { + args: { + volume: 0.5, + isActive: true, + }, +} + +export const Inactive: Story = { + args: { + volume: 0.3, + isActive: false, + }, +} diff --git a/cli/CHANGELOG.md b/cli/CHANGELOG.md index dc5b2571481..52ab2aca9cf 100644 --- a/cli/CHANGELOG.md +++ b/cli/CHANGELOG.md @@ -1,5 +1,11 @@ # @kilocode/cli +## 0.16.0 + +### Minor Changes + +- [#4388](https://github.com/Kilo-Org/kilocode/pull/4388) [`af93318`](https://github.com/Kilo-Org/kilocode/commit/af93318e3648c235721ba58fe9caab9429608241) Thanks [@iscekic](https://github.com/iscekic)! - send org id and last mode with session data + ## 0.15.0 ### Minor Changes diff --git a/cli/package.dist.json b/cli/package.dist.json index e70fed5b6bc..4dd78b17894 100644 --- a/cli/package.dist.json +++ b/cli/package.dist.json @@ -1,6 +1,6 @@ { "name": "@kilocode/cli", - "version": "0.15.0", + "version": "0.16.0", "description": "Terminal User Interface for Kilo Code", "type": "module", "main": "index.js", diff --git a/cli/package.json b/cli/package.json index 1009c79bf1b..73fb780752a 100644 --- a/cli/package.json +++ b/cli/package.json @@ -1,6 +1,6 @@ { "name": "@kilocode/cli", - "version": "0.15.0", + "version": "0.16.0", "description": "Terminal User Interface for Kilo Code", "type": "module", "main": "dist/index.js", diff --git a/cli/src/__tests__/config-command.test.ts b/cli/src/__tests__/config-command.test.ts index 3acfdded08a..66b9ea1b7cf 100644 --- a/cli/src/__tests__/config-command.test.ts +++ b/cli/src/__tests__/config-command.test.ts @@ -26,6 +26,9 @@ vi.mock("fs/promises", async () => { } }) +// Mock environment variables to avoid ephemeral mode +vi.stubEnv("KILOCODE_EPHEMERAL", "false") + describe("Config Command", () => { let testDir: string let testConfigFile: string diff --git a/cli/src/state/atoms/actions.ts b/cli/src/state/atoms/actions.ts index 06a1a66074e..a7ecb40b3f2 100644 --- a/cli/src/state/atoms/actions.ts +++ b/cli/src/state/atoms/actions.ts @@ -6,7 +6,7 @@ import { atom } from "jotai" import type { WebviewMessage, ProviderSettings, ClineAskResponse } from "../../types/messages.js" import { extensionServiceAtom, isServiceReadyAtom, setServiceErrorAtom } from "./service.js" -import { resetMessageCutoffAtom } from "./ui.js" +import { resetMessageCutoffAtom, yoloModeAtom } from "./ui.js" import { logs } from "../../services/logs.js" /** @@ -299,3 +299,21 @@ export const sendSecondaryButtonClickAtom = atom(null, 
async (get, set) => { await set(sendWebviewMessageAtom, message) }) + +/** + * Action atom to toggle YOLO mode + * Sends the yoloMode message to the extension to enable/disable auto-approval of all operations + */ +export const toggleYoloModeAtom = atom(null, async (get, set) => { + const currentValue = get(yoloModeAtom) + const newValue = !currentValue + + set(yoloModeAtom, newValue) + logs.info(`YOLO mode ${newValue ? "enabled" : "disabled"}`, "actions") + + const message: WebviewMessage = { + type: "yoloMode", + bool: newValue, + } + await set(sendWebviewMessageAtom, message) +}) diff --git a/cli/src/state/atoms/index.ts b/cli/src/state/atoms/index.ts index 58154a4758e..1e8cd0eeb9e 100644 --- a/cli/src/state/atoms/index.ts +++ b/cli/src/state/atoms/index.ts @@ -124,6 +124,9 @@ export { refreshStateAtom, sendPrimaryButtonClickAtom, sendSecondaryButtonClickAtom, + + // YOLO mode action + toggleYoloModeAtom, } from "./actions.js" // ============================================================================ @@ -198,6 +201,7 @@ export { messagesAtom, isStreamingAtom, errorAtom, + yoloModeAtom, isCommittingParallelModeAtom, commitCountdownSecondsAtom, diff --git a/cli/src/state/atoms/keyboard.ts b/cli/src/state/atoms/keyboard.ts index a5a25e55454..3e8b7236467 100644 --- a/cli/src/state/atoms/keyboard.ts +++ b/cli/src/state/atoms/keyboard.ts @@ -42,7 +42,7 @@ import { } from "./textBuffer.js" import { isApprovalPendingAtom, approvalOptionsAtom, approveAtom, rejectAtom, executeSelectedAtom } from "./approval.js" import { hasResumeTaskAtom } from "./extension.js" -import { cancelTaskAtom, resumeTaskAtom } from "./actions.js" +import { cancelTaskAtom, resumeTaskAtom, toggleYoloModeAtom } from "./actions.js" import { historyModeAtom, historyEntriesAtom, @@ -827,6 +827,13 @@ function handleGlobalHotkeys(get: Getter, set: Setter, key: Key): boolean { return true } break + case "y": + // Toggle YOLO mode with Ctrl+Y + if (key.ctrl) { + set(toggleYoloModeAtom) + return true + } + break case "shift-1": { // Toggle shell mode with Shift+1 or Shift+! only if input is empty const isEmpty = get(textBufferIsEmptyAtom) diff --git a/cli/src/state/atoms/ui.ts b/cli/src/state/atoms/ui.ts index 3cbf3ff1288..cbfe50456f8 100644 --- a/cli/src/state/atoms/ui.ts +++ b/cli/src/state/atoms/ui.ts @@ -54,6 +54,12 @@ export const messageCutoffTimestampAtom = atom(0) */ export const errorAtom = atom(null) +/** + * Atom to track YOLO mode state + * When enabled, all operations are auto-approved without confirmation + */ +export const yoloModeAtom = atom(false) + /** * Atom to track when parallel mode is committing changes * Used to disable input and show "Committing your changes..." 
message diff --git a/cli/src/ui/components/StatusBar.tsx b/cli/src/ui/components/StatusBar.tsx index 585caa213cb..197af8e8dc2 100644 --- a/cli/src/ui/components/StatusBar.tsx +++ b/cli/src/ui/components/StatusBar.tsx @@ -12,6 +12,7 @@ import { apiConfigurationAtom, chatMessagesAtom, routerModelsAtom, + yoloModeAtom, } from "../../state/atoms/index.js" import { useGitInfo } from "../../state/hooks/useGitInfo.js" import { useContextUsage } from "../../state/hooks/useContextUsage.js" @@ -102,6 +103,7 @@ export const StatusBar: React.FC = () => { const apiConfig = useAtomValue(apiConfigurationAtom) const messages = useAtomValue(chatMessagesAtom) const routerModels = useAtomValue(routerModelsAtom) + const yoloMode = useAtomValue(yoloModeAtom) // Get git info const gitInfo = useGitInfo(cwd) @@ -181,8 +183,20 @@ export const StatusBar: React.FC = () => { ) : null} - {/* Right side: Mode, Model, and Context */} + {/* Right side: YOLO indicator, Mode, Model, and Context */} + {/* YOLO Mode Indicator */} + {yoloMode && ( + <> + + ⚡ YOLO + + + {" | "} + + + )} + {/* Mode */} {mode ? mode.charAt(0).toUpperCase() + mode.slice(1) : "N/A"} diff --git a/packages/types/src/experiment.ts b/packages/types/src/experiment.ts index dca48371051..c2936b3bfde 100644 --- a/packages/types/src/experiment.ts +++ b/packages/types/src/experiment.ts @@ -6,7 +6,7 @@ import type { Keys, Equals, AssertEqual } from "./type-fu.js" * ExperimentId */ -const kilocodeExperimentIds = ["morphFastApply"] as const +const kilocodeExperimentIds = ["morphFastApply", "speechToText"] as const // kilocode_change export const experimentIds = [ "powerSteering", "multiFileApplyDiff", @@ -26,6 +26,7 @@ export type ExperimentId = z.infer export const experimentsSchema = z.object({ morphFastApply: z.boolean().optional(), // kilocode_change + speechToText: z.boolean().optional(), // kilocode_change powerSteering: z.boolean().optional(), multiFileApplyDiff: z.boolean().optional(), preventFocusDisruption: z.boolean().optional(), diff --git a/packages/types/src/model.ts b/packages/types/src/model.ts index cee8a99e8a1..ecbca1fc281 100644 --- a/packages/types/src/model.ts +++ b/packages/types/src/model.ts @@ -4,7 +4,7 @@ import { z } from "zod" * ReasoningEffort */ -export const reasoningEfforts = ["low", "medium", "high"] as const +export const reasoningEfforts = ["low", "medium", "high", "xhigh"] as const export const reasoningEffortsSchema = z.enum(reasoningEfforts) @@ -22,7 +22,7 @@ export type ReasoningEffortWithMinimal = z.infer { + test("normalizeObjectAdditionalPropertiesFalse adds additionalProperties:false for nested objects with properties", () => { + const schema = { + type: "object", + additionalProperties: false, + properties: { + inputs: { + type: "object", + properties: { + ref: { type: "string" }, + }, + // additionalProperties intentionally missing + }, + }, + } + + const normalized = normalizeObjectAdditionalPropertiesFalse(schema) + expect(normalized.properties.inputs.additionalProperties).toBe(false) + }) +}) + +describe("BaseProvider.convertToolsForOpenAI", () => { + test("adds additionalProperties:false during schema conversion", () => { + const provider = new TestProvider() + const schema = { + type: "object", + properties: { + inputs: { + type: "object", + properties: { + ref: { type: "string" }, + }, + }, + }, + } + + const converted = provider.convertSchema(schema) + expect(converted.properties.inputs.additionalProperties).toBe(false) + }) +}) diff --git a/src/api/providers/base-provider.ts 
b/src/api/providers/base-provider.ts index 84c8cf6fe97..6870b568e88 100644 --- a/src/api/providers/base-provider.ts +++ b/src/api/providers/base-provider.ts @@ -6,6 +6,8 @@ import type { ApiHandler, ApiHandlerCreateMessageMetadata } from "../index" import { ApiStream } from "../transform/stream" import { countTokens } from "../../utils/countTokens" +import { normalizeObjectAdditionalPropertiesFalse } from "./kilocode/openai-strict-schema" // kilocode_change + /** * Base class for API providers that implements common functionality. */ @@ -86,7 +88,7 @@ export abstract class BaseProvider implements ApiHandler { result.properties = newProps } - return result + return normalizeObjectAdditionalPropertiesFalse(result) // kilocode_change: normalize invalid schemas for strict mode } /** diff --git a/src/api/providers/kilocode/openai-strict-schema.ts b/src/api/providers/kilocode/openai-strict-schema.ts new file mode 100644 index 00000000000..416c27d35ad --- /dev/null +++ b/src/api/providers/kilocode/openai-strict-schema.ts @@ -0,0 +1,73 @@ +// kilocode_change - new file +type JsonSchema = any + +const isObjectLike = (value: unknown): value is Record<string, unknown> => + !!value && typeof value === "object" && !Array.isArray(value) + +const isSchemaObjectNode = (schema: JsonSchema): boolean => { + if (!isObjectLike(schema)) return false + return schema.type === "object" || isObjectLike(schema.properties) +} + +/** + * Recursively ensures `additionalProperties` is present (default false) for object schemas that declare `properties`. + * + * OpenAI strict tool schemas require `additionalProperties` to be explicitly provided and `false` + * for objects using `properties`. + */ +export const normalizeObjectAdditionalPropertiesFalse = (schema: JsonSchema): JsonSchema => { + if (!schema || typeof schema !== "object") return schema + + if (Array.isArray(schema)) { + return schema.map((item) => normalizeObjectAdditionalPropertiesFalse(item)) + } + + const result: Record<string, unknown> = { ...(schema as any) } + + // Normalize this node + if (isSchemaObjectNode(result) && isObjectLike(result.properties)) { + // Only add when missing/undefined; do not override dictionary semantics.
+ if (result.additionalProperties === undefined) { + result.additionalProperties = false + } + } + + // Recurse into common schema composition keywords + for (const key of ["anyOf", "oneOf", "allOf"] as const) { + if (Array.isArray(result[key])) { + result[key] = result[key].map((s: any) => normalizeObjectAdditionalPropertiesFalse(s)) + } + } + + // Recurse into items + if (result.items) { + result.items = normalizeObjectAdditionalPropertiesFalse(result.items) + } + + // Recurse into properties + if (isObjectLike(result.properties)) { + const nextProps: Record<string, unknown> = { ...result.properties } + for (const [propKey, propSchema] of Object.entries(nextProps)) { + nextProps[propKey] = normalizeObjectAdditionalPropertiesFalse(propSchema) + } + result.properties = nextProps + } + + // Recurse into additionalProperties *schema* if present (doesn't change semantics) + if (isObjectLike(result.additionalProperties)) { + result.additionalProperties = normalizeObjectAdditionalPropertiesFalse(result.additionalProperties) + } + + // Recurse into definitions containers when present + for (const defsKey of ["$defs", "definitions"] as const) { + if (isObjectLike(result[defsKey])) { + const nextDefs: Record<string, unknown> = { ...result[defsKey] } + for (const [defKey, defSchema] of Object.entries(nextDefs)) { + nextDefs[defKey] = normalizeObjectAdditionalPropertiesFalse(defSchema) + } + result[defsKey] = nextDefs + } + } + + return result +} diff --git a/src/api/providers/openai-native.ts b/src/api/providers/openai-native.ts index a7585bd59e3..c56acfc4858 100644 --- a/src/api/providers/openai-native.ts +++ b/src/api/providers/openai-native.ts @@ -22,6 +22,7 @@ import { getModelParams } from "../transform/model-params" import { BaseProvider } from "./base-provider" import type { SingleCompletionHandler, ApiHandlerCreateMessageMetadata } from "../index" +import { normalizeObjectAdditionalPropertiesFalse } from "./kilocode/openai-strict-schema" // kilocode_change export type OpenAiNativeModel = ReturnType<OpenAiNativeHandler["getModel"]> @@ -292,7 +293,11 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio type: "function", name: tool.function.name, description: tool.function.description, - parameters: ensureAllRequired(tool.function.parameters), + // kilocode_change start: normalize invalid schemas for strict mode + parameters: normalizeObjectAdditionalPropertiesFalse( + ensureAllRequired(tool.function.parameters), + ), + // kilocode_change end strict: true, })), }), diff --git a/src/core/webview/ClineProvider.ts b/src/core/webview/ClineProvider.ts index 64a5ceea233..eeb6ce47caf 100644 --- a/src/core/webview/ClineProvider.ts +++ b/src/core/webview/ClineProvider.ts @@ -2204,6 +2204,13 @@ ${prompt} : undefined // kilocode_change end + // kilocode_change start - checkSpeechToTextAvailable (backend prerequisites only, experiment flag checked in frontend) + console.log("🎙️ [ClineProvider] Checking speech-to-text availability for webview state update...") + const { checkSpeechToTextAvailable } = await import("./speechToTextCheck") + const speechToTextAvailable = await checkSpeechToTextAvailable(this.providerSettingsManager) + console.log(`🎙️ [ClineProvider] Speech-to-text available: ${speechToTextAvailable}`) + // kilocode_change end - checkSpeechToTextAvailable + let cloudOrganizations: CloudOrganizationMembership[] = [] try { @@ -2427,6 +2434,7 @@ ${prompt} featureRoomoteControlEnabled, virtualQuotaActiveModel, // kilocode_change: Include virtual quota active model in state debug:
vscode.workspace.getConfiguration(Package.name).get("debug", false), + speechToTextAvailable, // kilocode_change: Whether speech-to-text is fully configured } } diff --git a/src/core/webview/speechToTextCheck.ts b/src/core/webview/speechToTextCheck.ts new file mode 100644 index 00000000000..1b25eaa2be3 --- /dev/null +++ b/src/core/webview/speechToTextCheck.ts @@ -0,0 +1,76 @@ +// kilocode_change - new file: Speech-to-text availability check (extracted from ClineProvider) +import type { ProviderSettingsManager } from "../config/ProviderSettingsManager" +import { getOpenAiApiKey } from "../../services/stt/utils/getOpenAiCredentials" +import { FFmpegCaptureService } from "../../services/stt/FFmpegCaptureService" + +/** + * Cached availability result with timestamp + */ +let cachedResult: { available: boolean; timestamp: number } | null = null +const CACHE_DURATION_MS = 30000 // 30 seconds + +/** + * Check if speech-to-text prerequisites are available + * + * This checks the backend prerequisites only: + * 1. OpenAI API key is configured + * 2. FFmpeg is installed and available + * + * Note: The experiment flag is checked on the frontend, not here. + * Results are cached for 30 seconds to prevent redundant FFmpeg checks. + * + * @param providerSettingsManager - Provider settings manager for API configuration + * @param forceRecheck - Force a fresh check, ignoring cache (default: false) + * @returns Promise<boolean> - true if prerequisites are met + */ +export async function checkSpeechToTextAvailable( + providerSettingsManager: ProviderSettingsManager, + forceRecheck = false, +): Promise<boolean> { + // Return cached result if valid and not forcing recheck + if (cachedResult !== null && !forceRecheck) { + const age = Date.now() - cachedResult.timestamp + if (age < CACHE_DURATION_MS) { + return cachedResult.available + } + } + + console.log("🎙️ [STT Availability Check] Starting speech-to-text prerequisite check...") + + try { + // Check 1: OpenAI API key + const apiKey = await getOpenAiApiKey(providerSettingsManager) + const hasApiKey = !!apiKey + console.log(`🎙️ [STT Availability Check] OpenAI API key configured: ${hasApiKey}`) + + if (!hasApiKey) { + console.log("🎙️ [STT Availability Check] ❌ FAILED: No OpenAI API key found") + console.log("🎙️ [STT Availability Check] → Add an OpenAI API provider in Settings") + cachedResult = { available: false, timestamp: Date.now() } + return false + } + + // Check 2: FFmpeg installed + console.log("🎙️ [STT Availability Check] Checking FFmpeg installation...") + const ffmpegResult = FFmpegCaptureService.findFFmpeg() + console.log(`🎙️ [STT Availability Check] FFmpeg available: ${ffmpegResult.available}`) + + if (!ffmpegResult.available) { + console.log("🎙️ [STT Availability Check] ❌ FAILED: FFmpeg is not installed or not in PATH") + console.log("🎙️ [STT Availability Check] → Install FFmpeg: https://ffmpeg.org/download.html") + if (ffmpegResult.error) { + console.log(`🎙️ [STT Availability Check] → Error: ${ffmpegResult.error}`) + } + cachedResult = { available: false, timestamp: Date.now() } + return false + } + + console.log("🎙️ [STT Availability Check] ✅ SUCCESS: Speech-to-text prerequisites are met!") + cachedResult = { available: true, timestamp: Date.now() } + return true + } catch (error) { + console.error("🎙️ [STT Availability Check] ❌ FAILED: Unexpected error during check", error) + cachedResult = { available: false, timestamp: Date.now() } + return false + } +} diff --git a/src/core/webview/sttHandlers.ts b/src/core/webview/sttHandlers.ts new file mode 100644 index
00000000000..e44026730d6 --- /dev/null +++ b/src/core/webview/sttHandlers.ts @@ -0,0 +1,152 @@ +// kilocode_change - new file: STT message handlers (replaces speechMessageHandlers.ts) +import type { ClineProvider } from "./ClineProvider" +import type { STTCommand, STTSegment } from "../../shared/sttContract" +import { STTService } from "../../services/stt" +import { STTEventEmitter } from "../../services/stt/types" +import { getOpenAiApiKey } from "../../services/stt/utils/getOpenAiCredentials" +import { VisibleCodeTracker } from "../../services/ghost/context/VisibleCodeTracker" +import { extractCodeGlossary, formatGlossaryAsPrompt } from "../../services/stt/context/codeGlossaryExtractor" + +/** + * Map of ClineProvider -> STTService + * WeakMap ensures cleanup when ClineProvider is garbage collected + */ +const servicesByProviderRef = new WeakMap<ClineProvider, STTService>() + +/** + * Get or create STTService for a provider + */ +function getService(clineProvider: ClineProvider): STTService { + let service = servicesByProviderRef.get(clineProvider) + + if (!service) { + const emitter: STTEventEmitter = { + onStarted: (sessionId: string) => { + clineProvider.postMessageToWebview({ + type: "stt:started", + sessionId, + }) + }, + + onTranscript: (segments: STTSegment[], isFinal: boolean) => { + const sessionId = service?.getSessionId() || "" + clineProvider.postMessageToWebview({ + type: "stt:transcript", + sessionId, + segments, + isFinal, + }) + }, + + onVolume: (level: number) => { + const sessionId = service?.getSessionId() || "" + clineProvider.postMessageToWebview({ + type: "stt:volume", + sessionId, + level, + }) + }, + + onStopped: (reason, text, error) => { + const sessionId = service?.getSessionId() || "" + clineProvider.postMessageToWebview({ + type: "stt:stopped", + sessionId, + reason, + text, + error, + }) + }, + } + + // Create code glossary with snapshotted rooIgnoreController + const currentTask = clineProvider.getCurrentTask() + const codeGlossary = new VisibleCodeGlossary(clineProvider.cwd, currentTask?.rooIgnoreController ?? null) + + service = new STTService(emitter, clineProvider.providerSettingsManager, codeGlossary) + servicesByProviderRef.set(clineProvider, service) + } + + return service +} + +/** + * Handle stt:start command + */ +export async function handleSTTStart(clineProvider: ClineProvider, language?: string): Promise<void> { + const service = getService(clineProvider) + + const apiKey = await getOpenAiApiKey(clineProvider.providerSettingsManager) + if (!apiKey) { + clineProvider.postMessageToWebview({ + type: "stt:stopped", + sessionId: "", + reason: "error", + error: "OpenAI API key not configured.
Please add an OpenAI provider in settings.", + }) + return + } + + try { + // Service generates its own prompt from the code glossary + await service.start({ apiKey }, language) + } catch (error) { + console.error("Failed to start STT service:", error) + } +} + +/** + * Handle stt:stop command + */ +export async function handleSTTStop(clineProvider: ClineProvider): Promise<void> { + const service = getService(clineProvider) + await service.stop() +} + +/** + * Handle stt:cancel command + */ +export async function handleSTTCancel(clineProvider: ClineProvider): Promise<void> { + const service = getService(clineProvider) + service.cancel() +} + +/** + * Unified handler for all STT commands + */ +export async function handleSTTCommand(clineProvider: ClineProvider, command: STTCommand): Promise<void> { + switch (command.type) { + case "stt:start": + await handleSTTStart(clineProvider, command.language) + break + case "stt:stop": + await handleSTTStop(clineProvider) + break + case "stt:cancel": + await handleSTTCancel(clineProvider) + break + } +} + +/** + * VisibleCodeGlossary captures visible code and formats it + * Snapshots the VisibleCodeTracker at construction for reuse during recording session + */ +class VisibleCodeGlossary { + private tracker: VisibleCodeTracker + + constructor(cwd: string, rooIgnoreController: any) { + this.tracker = new VisibleCodeTracker(cwd, rooIgnoreController) + } + + async getGlossary(): Promise<string> { + try { + const visibleCode = await this.tracker.captureVisibleCode() + const glossary = extractCodeGlossary(visibleCode) + return formatGlossaryAsPrompt(glossary) || "" + } catch (error) { + // Non-critical failure - return empty string + return "" + } + } +} diff --git a/src/core/webview/webviewMessageHandler.ts b/src/core/webview/webviewMessageHandler.ts index 76939b175bf..bc3d219b10e 100644 --- a/src/core/webview/webviewMessageHandler.ts +++ b/src/core/webview/webviewMessageHandler.ts @@ -95,7 +95,7 @@ import { MarketplaceManager, MarketplaceItemType } from "../../services/marketpl import { UsageTracker } from "../../utils/usage-tracker" // kilocode_change import { seeNewChanges } from "../checkpoints/kilocode/seeNewChanges" // kilocode_change import { getTaskHistory } from "../../shared/kilocode/getTaskHistory" // kilocode_change -import { fetchAndRefreshOrganizationModesOnStartup, refreshOrganizationModes } from "./kiloWebviewMessgeHandlerHelpers" +import { fetchAndRefreshOrganizationModesOnStartup, refreshOrganizationModes } from "./kiloWebviewMessgeHandlerHelpers" // kilocode_change import { getSapAiCoreDeployments } from "../../api/providers/fetchers/sap-ai-core" // kilocode_change import { AutoPurgeScheduler } from "../../services/auto-purge" // kilocode_change import { setPendingTodoList } from "../tools/UpdateTodoListTool" @@ -3643,6 +3643,14 @@ export const webviewMessageHandler = async ( } break } + // kilocode_change start: STT (Speech-to-Text) handlers + case "stt:start": + case "stt:stop": + case "stt:cancel": { + const { handleSTTCommand } = await import("./sttHandlers") + await handleSTTCommand(provider, message as any) + break + } // kilocode_change end: Type-safe global state handler case "insertTextToChatArea": provider.postMessageToWebview({ type: "insertTextToChatArea", text: message.text }) diff --git a/src/package.json b/src/package.json index c5e3fdf4f05..c686d903346 100644 --- a/src/package.json +++ b/src/package.json @@ -3,7 +3,7 @@ "displayName": "%extension.displayName%", "description": "%extension.description%", "publisher": "kilocode", - "version":
"4.136.0", + "version": "4.137.0", "icon": "assets/icons/logo-outline-black.png", "galleryBanner": { "color": "#FFFFFF", @@ -759,6 +759,7 @@ "vscode-material-icons": "^0.1.1", "web-tree-sitter": "^0.25.6", "workerpool": "^9.2.0", + "ws": "^8.18.0", "xlsx": "^0.18.5", "yaml": "^2.8.0", "zod": "^3.25.61" @@ -787,6 +788,7 @@ "@types/tmp": "^0.2.6", "@types/turndown": "^5.0.5", "@types/vscode": "^1.84.0", + "@types/ws": "^8.5.13", "@vscode/test-electron": "^2.5.2", "@vscode/vsce": "3.3.2", "dotenv": "^16.4.7", diff --git a/src/services/continuedev/core/autocomplete/context/root-path-context/RootPathContextService.test.ts b/src/services/continuedev/core/autocomplete/context/root-path-context/RootPathContextService.test.ts index 8ff3a29ff23..c4beaa46fdb 100644 --- a/src/services/continuedev/core/autocomplete/context/root-path-context/RootPathContextService.test.ts +++ b/src/services/continuedev/core/autocomplete/context/root-path-context/RootPathContextService.test.ts @@ -1,4 +1,5 @@ -import { describe, test } from "vitest" +import { afterAll, beforeAll, describe, test } from "vitest" +import { setUpTestDir, tearDownTestDir } from "../../../test/testDir" import { PYTHON_TEST_CASES, TYPESCRIPT_TEST_CASES } from "./__test-cases__" import { testRootPathContext } from "./testUtils" @@ -41,6 +42,14 @@ const TEST_CASES = [ ] describe("RootPathContextService", () => { + beforeAll(async () => { + setUpTestDir() + }) + + afterAll(async () => { + tearDownTestDir() + }) + describe("should look for correct type definitions", () => { test.each(TEST_CASES)("$language: $nodeType", async ({ fileName, cursorPosition, definitionPositions }) => { await testRootPathContext("files", fileName, cursorPosition, definitionPositions) diff --git a/src/services/stt/FFmpegCaptureService.ts b/src/services/stt/FFmpegCaptureService.ts new file mode 100644 index 00000000000..2e7cde05e97 --- /dev/null +++ b/src/services/stt/FFmpegCaptureService.ts @@ -0,0 +1,345 @@ +// kilocode_change - new file: FFmpeg-based PCM16 audio capture for OpenAI Realtime API +import { EventEmitter } from "events" +import { spawn, ChildProcess, execSync } from "child_process" +import * as os from "os" + +/** + * Global cache for FFmpeg path (shared across all instances) + * undefined = not yet checked, null = not found, string = found path + */ +let cachedFFmpegPath: string | null | undefined = undefined + +// Platform-specific fallback paths +const fallbackPaths: Record = { + darwin: ["/usr/local/bin/ffmpeg", "/opt/homebrew/bin/ffmpeg"], + linux: ["/usr/bin/ffmpeg", "/usr/local/bin/ffmpeg", "/snap/bin/ffmpeg"], + win32: [ + "C:\\ffmpeg\\bin\\ffmpeg.exe", + "C:\\Program Files\\ffmpeg\\bin\\ffmpeg.exe", + "C:\\Program Files (x86)\\ffmpeg\\bin\\ffmpeg.exe", + ], +} + +/** + * Calculate RMS energy of PCM16 audio frame + * Returns normalized energy level (0-1 scale) + * + * @param pcm16 - Int16Array of PCM16 samples + * @returns Energy level from 0 (silence) to 1 (max volume) + */ +function calculateFrameEnergy(pcm16: Int16Array): number { + // Guard against empty buffer to prevent division by zero + if (pcm16.length === 0) { + return 0 + } + + let sum = 0 + for (let i = 0; i < pcm16.length; i++) { + const normalized = pcm16[i] / 32768 // Normalize to -1 to 1 + sum += normalized * normalized // Square for RMS + } + const rms = Math.sqrt(sum / pcm16.length) + return Math.min(rms, 1.0) // Cap at 1.0 +} + +/** + * FFmpegCaptureService - Captures audio in PCM16 format using FFmpeg for streaming to OpenAI Realtime API + * + * Key features: + * - Outputs PCM16 
to stdout (not WebM files) + * - Sample rate: 24kHz (required by Realtime API) + * - Continuous streaming (not segmented chunks) + * - Event-driven Buffer emission for WebSocket transmission + * + * Architecture: + * Microphone → FFmpeg (PCM16) → stdout → Buffer events → WebSocket client + */ +// eslint-disable-next-line @typescript-eslint/no-unsafe-declaration-merging +export class FFmpegCaptureService extends EventEmitter { + private ffmpegProcess: ChildProcess | null = null + private isCapturing: boolean = false + private platform: string + private captureStartTime: number = 0 + private audioChunkCount: number = 0 + + constructor() { + super() + this.platform = os.platform() + + // Resolve FFmpeg path once (cached globally) + const result = FFmpegCaptureService.findFFmpeg() + + if (!result.available) { + console.error("❌ [FFmpegCapture] FFmpeg not found during initialization") + console.error("→ Install: https://ffmpeg.org/download.html") + } else { + console.log(`✅ [FFmpegCapture] FFmpeg resolved to: ${result.path}`) + } + } + + /** + * Start capturing audio in PCM16 format + * Emits 'audioData' events with Buffer chunks ready for WebSocket transmission + */ + async start(): Promise<void> { + if (this.isCapturing) { + throw new Error("Audio capture already in progress") + } + + // Get FFmpeg path from global cache + const result = FFmpegCaptureService.findFFmpeg() + if (!result.available || !result.path) { + throw new Error( + "FFmpeg not found. Please install FFmpeg to use speech-to-text.\n" + + "Installation: https://ffmpeg.org/download.html", + ) + } + + try { + const args = this.buildFFmpegArgs() + + console.log("🔍 [FFmpegCapture] Spawning FFmpeg...") + console.log("🔍 [FFmpegCapture] Path:", result.path) + console.log("🔍 [FFmpegCapture] Args:", JSON.stringify(args)) + + // Use absolute path from cache (not "ffmpeg") + this.ffmpegProcess = spawn(result.path, args, { + stdio: ["ignore", "pipe", "pipe"], + }) + + console.log("✅ [FFmpegCapture] Process spawned, PID:", this.ffmpegProcess.pid) + + this.isCapturing = true + this.captureStartTime = Date.now() + this.audioChunkCount = 0 + + // Stream PCM16 data from stdout + this.ffmpegProcess.stdout?.on("data", (buffer: Buffer) => { + if (this.isCapturing) { + this.audioChunkCount++ + + // Calculate energy from PCM16 buffer + const int16Array = new Int16Array( + buffer.buffer, + buffer.byteOffset, + buffer.byteLength / Int16Array.BYTES_PER_ELEMENT, + ) + const energy = calculateFrameEnergy(int16Array) + // console.log(`🎙️ [FFmpegCapture] Energy: ${energy.toFixed(3)}`) + + // Emit both events + this.emit("audioData", buffer) + this.emit("audioEnergy", energy) + } + }) + + this.ffmpegProcess.stderr?.on("data", (data: Buffer) => { + const message = data.toString() + // Log FFmpeg output for debugging (not emitted as errors unless critical) + if (message.includes("Error") || message.includes("Cannot")) { + console.error("[RealtimeAudioCapture] FFmpeg error:", message) + this.emit("error", new Error(`FFmpeg error: ${message}`)) + } + }) + + this.ffmpegProcess.on("error", (error: Error) => { + console.error("❌ [RealtimeAudioCapture] Process error:", error) + console.error("❌ [RealtimeAudioCapture] Error details:", { + errno: (error as any).errno, + code: (error as any).code, + syscall: (error as any).syscall, + path: (error as any).path, + spawnargs: (error as any).spawnargs, + }) + console.error("❌ [RealtimeAudioCapture] Current PATH:", process.env.PATH) + console.error("❌ [RealtimeAudioCapture] Which platform:", this.platform) + this.emit("error",
error) + this.cleanup() + }) + + this.ffmpegProcess.on("exit", (code: number | null, signal: string | null) => { + if (code !== null && code !== 0 && this.isCapturing) { + const error = new Error(`FFmpeg exited with code ${code}${signal ? ` (signal: ${signal})` : ""}`) + console.error("[RealtimeAudioCapture] Process exit:", error) + this.emit("error", error) + } + this.cleanup() + }) + + this.emit("ready") + } catch (error) { + this.cleanup() + throw error + } + } + + async stop(): Promise<void> { + if (!this.isCapturing) { + return + } + + this.isCapturing = false + + if (this.ffmpegProcess) { + return new Promise<void>((resolve) => { + if (!this.ffmpegProcess) { + resolve() + return + } + + // Set up cleanup timeout + const timeout = setTimeout(() => { + if (this.ffmpegProcess && !this.ffmpegProcess.killed) { + this.ffmpegProcess.kill("SIGKILL") + } + this.cleanup() + resolve() + }, 2000) + + // Listen for process exit + this.ffmpegProcess.once("exit", () => { + clearTimeout(timeout) + this.cleanup() + resolve() + }) + + // Send SIGTERM for graceful shutdown + this.ffmpegProcess.kill("SIGTERM") + }) + } + } + + getCaptureDuration(): number { + if (!this.isCapturing) { + return 0 + } + return Date.now() - this.captureStartTime + } + + isActive(): boolean { + return this.isCapturing + } + + /** + * Find FFmpeg executable using platform-specific fallback paths + * Results are cached globally across all instances + */ + static findFFmpeg(forceRecheck = false): { available: boolean; path?: string; error?: string } { + if (cachedFFmpegPath !== undefined && !forceRecheck) { + return { + available: cachedFFmpegPath !== null, + path: cachedFFmpegPath || undefined, + error: cachedFFmpegPath === null ? "FFmpeg not found" : undefined, + } + } + + const platform = os.platform() + try { + execSync("ffmpeg -version", { stdio: "ignore" }) + console.log(`🎙️ [FFmpeg] ✅ Found 'ffmpeg' in PATH`) + cachedFFmpegPath = "ffmpeg" + return { available: true, path: "ffmpeg" } + } catch { + console.log(`🎙️ [FFmpeg] ❌ 'ffmpeg' not in PATH, trying fallback paths...`) + } + + const platformPaths = fallbackPaths[platform] || [] + for (const fallbackPath of platformPaths) { + try { + execSync(`"${fallbackPath}" -version`, { stdio: "ignore" }) + console.log(`🎙️ [FFmpeg] ✅ Found at: ${fallbackPath}`) + cachedFFmpegPath = fallbackPath + return { available: true, path: fallbackPath } + } catch { + continue + } + } + + // Cache the "not found" result to avoid repeated path checks + cachedFFmpegPath = null + return { + available: false, + error: "FFmpeg not found.
Install from https://ffmpeg.org/download.html", + } + } + + private buildFFmpegArgs(): string[] { + const baseArgs = this.getPlatformInputArgs() + + // PCM16 output configuration (required by OpenAI Realtime API) + const outputArgs = [ + "-acodec", + "pcm_s16le", // PCM16 format + "-ar", + "24000", // 24kHz sample rate (Realtime API requirement) + "-ac", + "1", // Mono + "-f", + "s16le", // Raw PCM16 format + "-", // Output to stdout + ] + + return [...baseArgs, ...outputArgs] + } + + private getPlatformInputArgs(): string[] { + switch (this.platform) { + case "darwin": // macOS + return ["-f", "avfoundation", "-i", ":default"] + + case "linux": + // Try pulse first, fallback to alsa + return ["-f", "pulse", "-i", "default"] + + case "win32": // Windows + return ["-f", "dshow", "-i", "audio=default"] + + default: + throw new Error(`Unsupported platform: ${this.platform}`) + } + } + + private cleanup(): void { + this.isCapturing = false + + if (this.ffmpegProcess) { + // Remove all listeners to prevent memory leaks + this.ffmpegProcess.stdout?.removeAllListeners() + this.ffmpegProcess.stderr?.removeAllListeners() + this.ffmpegProcess.removeAllListeners() + + // Ensure process is terminated + if (!this.ffmpegProcess.killed) { + this.ffmpegProcess.kill("SIGKILL") + } + + this.ffmpegProcess = null + } + + this.emit("stopped") + } +} + +/** + * Event interface for FFmpegCaptureService + */ +export interface FFmpegCaptureServiceEvents { + ready: () => void + audioData: (buffer: Buffer) => void + audioEnergy: (energy: number) => void + error: (error: Error) => void + stopped: () => void +} + +// Type-safe event emitter interface +// eslint-disable-next-line @typescript-eslint/no-unsafe-declaration-merging +export interface FFmpegCaptureService { + on<K extends keyof FFmpegCaptureServiceEvents>(event: K, listener: FFmpegCaptureServiceEvents[K]): this + + off<K extends keyof FFmpegCaptureServiceEvents>(event: K, listener: FFmpegCaptureServiceEvents[K]): this + + emit<K extends keyof FFmpegCaptureServiceEvents>( + event: K, + ...args: Parameters<FFmpegCaptureServiceEvents[K]> + ): boolean +} diff --git a/src/services/stt/OpenAIWhisperClient.ts b/src/services/stt/OpenAIWhisperClient.ts new file mode 100644 index 00000000000..51e1dd78d4b --- /dev/null +++ b/src/services/stt/OpenAIWhisperClient.ts @@ -0,0 +1,530 @@ +import { EventEmitter } from "events" +import WebSocket from "ws" +import { ProviderSettingsManager } from "../../core/config/ProviderSettingsManager" +import { getOpenAiApiKey, getOpenAiBaseUrl } from "./utils/getOpenAiCredentials" + +/** + * Configuration for OpenAI Whisper transcription via Realtime API + */ +export interface OpenAIWhisperConfig { + apiKey?: string + baseURL?: string + language?: string + prompt?: string + maxReconnectAttempts?: number + reconnectDelayMs?: number +} + +/** + * OpenAI Realtime API session event types + */ +interface RealtimeSessionUpdateEvent { + type: "session.update" + session: { + turn_detection: null | { + type: "server_vad" + threshold: number + silence_duration_ms: number + } + input_audio_format: "pcm16" + input_audio_transcription: { + model: string + language?: string + prompt?: string + } + } +} + +interface RealtimeAudioAppendEvent { + type: "input_audio_buffer.append" + audio: string +} + +interface RealtimeAudioTranscriptDeltaEvent { + type: "response.audio_transcript.delta" + delta: string + response_id: string + item_id: string + output_index: number + content_index: number +} + +interface RealtimeTranscriptionDeltaEvent { + type: "conversation.item.input_audio_transcription.delta" + delta: string + item_id: string +} + +interface RealtimeTranscriptionCompletedEvent { + type:
"conversation.item.input_audio_transcription.completed" + transcript: string + item_id: string +} + +interface RealtimeTranscriptionFailedEvent { + type: "conversation.item.input_audio_transcription.failed" + error: { + type: string + code?: string + message: string + } +} + +interface RealtimeErrorEvent { + type: "error" + error: { + type: string + code?: string + message: string + } +} + +type RealtimeServerEvent = + | RealtimeAudioTranscriptDeltaEvent + | RealtimeTranscriptionDeltaEvent + | RealtimeTranscriptionCompletedEvent + | RealtimeTranscriptionFailedEvent + | RealtimeErrorEvent + | { type: string; [key: string]: unknown } + +/** + * WebSocket client for OpenAI Whisper transcription via Realtime API + * Handles connection, session configuration, and real-time audio streaming for transcription-only use + * + * Events: + * - 'transcriptionDelta': Emitted for live partial transcription updates (text: string) + * - 'transcription': Emitted when a transcription segment is completed (text: string) + * - 'speechStopped': Emitted when VAD detects end of speech (no data) + * - 'error': Emitted when an error occurs (error: Error) + * - 'connected': Emitted when WebSocket connection is established + * - 'disconnected': Emitted when WebSocket connection is closed + * - 'reconnecting': Emitted when attempting to reconnect (attempt: number) + */ +export class OpenAIWhisperClient extends EventEmitter { + private ws: WebSocket | null = null + private config: Required + private providerSettingsManager: ProviderSettingsManager + private reconnectAttempts = 0 + private reconnectTimeout: NodeJS.Timeout | null = null + private isConnecting = false + private isClosing = false + private sessionConfigured = false + private pendingAudioChunks: string[] = [] + private audioChunksSent: number = 0 + private currentPrompt: string = "" + + // Default configuration values + // Server VAD is DISABLED - we use local VAD for better control over chunking + private static readonly DEFAULT_CONFIG: Required> = { + language: "en", + maxReconnectAttempts: 3, + reconnectDelayMs: 2000, + } + + constructor(providerSettingsManager: ProviderSettingsManager, config?: OpenAIWhisperConfig) { + super() + this.providerSettingsManager = providerSettingsManager + this.config = { + ...OpenAIWhisperClient.DEFAULT_CONFIG, + apiKey: config?.apiKey || "", + baseURL: config?.baseURL || "wss://api.openai.com/v1/realtime", + prompt: "", + ...config, + } + // Initialize current prompt from config + this.currentPrompt = this.config.prompt || "" + } + + /** + * Connect to OpenAI Realtime API WebSocket + * Automatically configures session for transcription-only mode + */ + async connect(): Promise { + if (this.isConnecting) { + throw new Error("Connection already in progress") + } + + if (this.ws && this.ws.readyState === WebSocket.OPEN) { + return + } + + this.isConnecting = true + this.isClosing = false + + try { + // Get API key if not provided in config + if (!this.config.apiKey) { + const apiKey = await getOpenAiApiKey(this.providerSettingsManager) + if (!apiKey) { + throw new Error( + "OpenAI API key not configured. 
Please add an OpenAI or OpenAI-native provider in your settings.", + ) + } + this.config.apiKey = apiKey + } + + // Get base URL if not provided in config + if (this.config.baseURL === "wss://api.openai.com/v1/realtime") { + const baseUrl = await getOpenAiBaseUrl(this.providerSettingsManager) + if (baseUrl && baseUrl !== "https://api.openai.com/v1") { + // Convert HTTP(S) base URL to WebSocket URL + // Remove trailing /v1 if present to avoid duplication + const cleanBaseUrl = baseUrl.replace(/\/v1\/?$/, "") + const wsBaseUrl = cleanBaseUrl.replace(/^https?:/, "wss:") + this.config.baseURL = `${wsBaseUrl}/v1/realtime` + } + } + + // Construct WebSocket URL with model parameter + // Note: Use the model name without date suffix + const wsUrl = `${this.config.baseURL}?model=gpt-4o-realtime-preview` + + // Create WebSocket connection + this.ws = new WebSocket(wsUrl, { + headers: { + Authorization: `Bearer ${this.config.apiKey}`, + "OpenAI-Beta": "realtime=v1", + }, + }) + + this.setupWebSocketHandlers() + + // Wait for connection to open + await new Promise<void>((resolve, reject) => { + const timeout = setTimeout(() => { + reject(new Error("WebSocket connection timeout")) + }, 10000) + + const onOpen = () => { + clearTimeout(timeout) + this.ws!.off("open", onOpen) + this.ws!.off("error", onError) + resolve() + } + + const onError = (error: Error) => { + clearTimeout(timeout) + this.ws!.off("open", onOpen) + this.ws!.off("error", onError) + reject(new Error(`WebSocket connection failed: ${error.message}`)) + } + + this.ws!.once("open", onOpen) + this.ws!.once("error", onError) + }) + + this.isConnecting = false + this.reconnectAttempts = 0 + this.emit("connected") + } catch (error) { + this.isConnecting = false + this.ws = null + throw error + } + } + + /** + * Setup WebSocket event handlers + */ + private setupWebSocketHandlers(): void { + if (!this.ws) return + + this.ws.on("open", () => { + this.configureSession() + }) + + this.ws.on("message", (data: Buffer | string) => { + this.handleServerMessage(data) + }) + + this.ws.on("error", (error: Error) => { + this.emit("error", new Error(`WebSocket error: ${error.message}`)) + }) + + this.ws.on("close", (code: number, reason: Buffer) => { + this.sessionConfigured = false + this.pendingAudioChunks = [] + this.emit("disconnected", { code, reason: reason.toString() }) + + // Attempt reconnection if not intentionally closed + if (!this.isClosing && this.reconnectAttempts < this.config.maxReconnectAttempts) { + this.scheduleReconnect() + } + }) + } + + /** + * Configure session for transcription-only mode (no TTS) + * + * Server VAD is DISABLED - we use local VAD for better control over chunking. + * This prevents the server from automatically segmenting audio and allows us + * to commit audio at natural word boundaries detected by our local VAD.
+ */ + private configureSession(): void { + if (!this.ws || this.ws.readyState !== WebSocket.OPEN) return + + const sessionUpdate: RealtimeSessionUpdateEvent = { + type: "session.update", + session: { + // DISABLE server_vad - we do our own local VAD + turn_detection: null, + input_audio_format: "pcm16", + input_audio_transcription: { + model: "gpt-4o-mini-transcribe", + language: this.config.language, + // Pass prompt for code glossary - hallucination prevented by not sending silent audio + prompt: this.currentPrompt, + }, + }, + } + + console.log("🎙️ [OpenAIWhisperClient] Session configured with server_vad DISABLED (local VAD enabled)") + this.ws.send(JSON.stringify(sessionUpdate)) + this.sessionConfigured = true + + // Send any pending audio chunks + this.flushPendingAudioChunks() + } + + /** + * Update the transcription prompt dynamically during an active session + */ + updateTranscriptionPrompt(prompt: string): void { + // If unchanged, skip update (optimization) + if (prompt === this.currentPrompt) { + return + } + this.currentPrompt = prompt + + if (!this.ws || this.ws.readyState !== WebSocket.OPEN || !this.sessionConfigured) { + return + } + + // Send session update with ALL required fields (OpenAI requires model) + const promptUpdate = { + type: "session.update", + session: { + input_audio_transcription: { + model: "gpt-4o-mini-transcribe", + language: this.config.language, + prompt: this.currentPrompt, + }, + }, + } + + console.log(`🎙️ [OpenAIWhisperClient] Updating transcription prompt dynamically (${prompt.length} chars)`) + this.ws.send(JSON.stringify(promptUpdate)) + } + + /** + * Handle incoming server messages + */ + private handleServerMessage(data: string | Buffer): void { + try { + const messageText = typeof data === "string" ? data : data.toString() + const message = JSON.parse(messageText) as RealtimeServerEvent + + switch (message.type) { + case "conversation.item.input_audio_transcription.delta": + // Live word-by-word transcription updates + if ("delta" in message && message.delta) { + const deltaText = String(message.delta) + console.log(`🎙️ [OpenAIWhisperClient] Delta text: "${deltaText}"`) + this.emit("transcriptionDelta", deltaText) + } + break + + case "conversation.item.input_audio_transcription.completed": + // Segment completed - this has better combined text + if ("transcript" in message && message.transcript) { + const transcriptText = String(message.transcript) + console.log(`🎙️ [OpenAIWhisperClient] ✅ Completed text: "${transcriptText}"`) + this.emit("transcription", transcriptText) + } + break + + case "conversation.item.input_audio_transcription.failed": { + const errorMsg = + typeof message.error === "object" && message.error !== null && "message" in message.error + ? String(message.error.message) + : "Unknown error" + console.error("🎙️ [OpenAIWhisperClient] Transcription failed:", errorMsg) + this.emit("error", new Error(`Transcription failed: ${errorMsg}`)) + break + } + + // Server VAD events are disabled - we use local VAD instead + // case "input_audio_buffer.speech_started": + // case "input_audio_buffer.speech_stopped": + + case "error": { + const errorMsg = + typeof message.error === "object" && message.error !== null && "message" in message.error + ? String(message.error.message) + : "Unknown error" + + // Ignore "buffer too small" errors - these are expected when we try to commit + // but there isn't enough audio yet. Not a real problem. 
+ if (errorMsg.includes("buffer too small") || errorMsg.includes("buffer only has 0.00ms")) { + break + } + + console.error("🎙️ [OpenAIWhisperClient] API error:", errorMsg) + this.emit("error", new Error(`Realtime API error: ${errorMsg}`)) + break + } + + // Silently ignore other event types (session.created, session.updated, etc.) + default: + break + } + } catch (error) { + console.error("🎙️ [OpenAIWhisperClient] Failed to parse server message:", error) + this.emit("error", new Error(`Failed to parse server message: ${error}`)) + } + } + + /** + * Send manual commit to force transcription of buffered audio + * This allows getting transcription updates even while user is still speaking, + * without waiting for VAD silence detection. + * + * Use case: Call periodically (e.g., every 1-2 seconds) to get interim results + * during long continuous speech. + */ + sendInputBufferCommit(): void { + if (!this.ws || this.ws.readyState !== WebSocket.OPEN || !this.sessionConfigured) { + return + } + try { + const commitEvent = { type: "input_audio_buffer.commit", event_id: `commit_${Date.now()}` } + this.ws.send(JSON.stringify(commitEvent)) + } catch (error) { + console.error("🎙️ [OpenAIWhisperClient] Failed to send commit:", error) + this.emit("error", new Error(`Failed to send commit: ${error}`)) + } + } + + /** + * Send PCM16 audio chunk to the API + * @param pcm16Buffer Raw PCM16 audio data (16-bit, 24kHz, mono) + */ + sendAudioChunk(pcm16Buffer: Buffer): void { + if (!this.ws || this.ws.readyState !== WebSocket.OPEN) { + return + } + + const base64Audio = pcm16Buffer.toString("base64") + if (!this.sessionConfigured) { + // Queue audio chunk if session not configured yet + this.pendingAudioChunks.push(base64Audio) + return + } + + try { + this.ws.send(JSON.stringify({ type: "input_audio_buffer.append", audio: base64Audio })) + this.audioChunksSent++ + } catch (error) { + console.error("🎙️ [OpenAIWhisperClient] Failed to send audio chunk:", error) + this.emit("error", new Error(`Failed to send audio chunk: ${error}`)) + } + } + + /** + * Flush pending audio chunks after session configuration + */ + private flushPendingAudioChunks(): void { + if (!this.ws || this.ws.readyState !== WebSocket.OPEN || !this.sessionConfigured) { + return + } + + for (const base64Audio of this.pendingAudioChunks) { + const audioEvent: RealtimeAudioAppendEvent = { + type: "input_audio_buffer.append", + audio: base64Audio, + } + + try { + this.ws.send(JSON.stringify(audioEvent)) + } catch (error) { + this.emit("error", new Error(`Failed to send pending audio chunk: ${error}`)) + } + } + + this.pendingAudioChunks = [] + } + + /** + * Schedule reconnection attempt + */ + private scheduleReconnect(): void { + if (this.reconnectTimeout) { + return + } + + this.reconnectAttempts++ + this.emit("reconnecting", this.reconnectAttempts) + + this.reconnectTimeout = setTimeout(async () => { + this.reconnectTimeout = null + + try { + await this.connect() + } catch (error) { + this.emit("error", new Error(`Reconnection attempt ${this.reconnectAttempts} failed: ${error}`)) + + // Schedule next attempt if not exceeded max + if (this.reconnectAttempts < this.config.maxReconnectAttempts) { + this.scheduleReconnect() + } + } + }, this.config.reconnectDelayMs) + } + + /** + * Disconnect from the WebSocket + */ + async disconnect(): Promise { + this.isClosing = true + + // Clear reconnection timeout + if (this.reconnectTimeout) { + clearTimeout(this.reconnectTimeout) + this.reconnectTimeout = null + } + + // Close WebSocket + if (this.ws) { + 
this.ws.close(1000, "Client disconnect") + this.ws = null + } + + this.sessionConfigured = false + this.pendingAudioChunks = [] + this.reconnectAttempts = 0 + } + + /** + * Check if client is connected + */ + isConnected(): boolean { + return this.ws !== null && this.ws.readyState === WebSocket.OPEN && this.sessionConfigured + } + + /** + * Get current connection state + */ + getConnectionState(): "connecting" | "connected" | "disconnected" | "reconnecting" { + if (this.isConnecting) return "connecting" + if (this.reconnectAttempts > 0) return "reconnecting" + if (this.ws && this.ws.readyState === WebSocket.OPEN) return "connected" + return "disconnected" + } + + /** + * Get current configuration values + */ + getConfig(): Readonly> { + return { ...this.config } + } +} diff --git a/src/services/stt/STTService.ts b/src/services/stt/STTService.ts new file mode 100644 index 00000000000..5f9c284dc97 --- /dev/null +++ b/src/services/stt/STTService.ts @@ -0,0 +1,420 @@ +// kilocode_change - new file: Consolidated STT service - manages OpenAI Realtime transcription lifecycle +import { STTProviderConfig, STTEventEmitter, VisibleCodeGlossary, VADConfig, DEFAULT_VAD_CONFIG } from "./types" +import { STTSegment } from "../../shared/sttContract" +import { ProviderSettingsManager } from "../../core/config/ProviderSettingsManager" +import { FFmpegCaptureService } from "./FFmpegCaptureService" +import { OpenAIWhisperClient } from "./OpenAIWhisperClient" + +/** + * Consolidated STT service - manages OpenAI Realtime transcription + * One instance per ClineProvider (WebView) + * + * Coordinates FFmpegCaptureService and OpenAIWhisperClient to provide + * low-latency streaming transcription via OpenAI Realtime API. + * + * Flow: + * 1. All audio frames are streamed immediately to OpenAI (they buffer) + * 2. We track voice activity locally via energy detection + * 3. We only commit when we have enough voiced frames AND detect a pause + * 4. 
diff --git a/src/services/stt/STTService.ts b/src/services/stt/STTService.ts new file mode 100644 index 00000000000..5f9c284dc97 --- /dev/null +++ b/src/services/stt/STTService.ts @@ -0,0 +1,420 @@ +// kilocode_change - new file: Consolidated STT service - manages OpenAI Realtime transcription lifecycle +import { STTProviderConfig, STTEventEmitter, VisibleCodeGlossary, VADConfig, DEFAULT_VAD_CONFIG } from "./types" +import { STTSegment } from "../../shared/sttContract" +import { ProviderSettingsManager } from "../../core/config/ProviderSettingsManager" +import { FFmpegCaptureService } from "./FFmpegCaptureService" +import { OpenAIWhisperClient } from "./OpenAIWhisperClient" + +/** + * Consolidated STT service - manages OpenAI Realtime transcription + * One instance per ClineProvider (WebView) + * + * Coordinates FFmpegCaptureService and OpenAIWhisperClient to provide + * low-latency streaming transcription via OpenAI Realtime API. + * + * Flow: + * 1. All audio frames are streamed immediately to OpenAI (they buffer) + * 2. We track voice activity locally via energy detection + * 3. We only commit when we have enough voiced frames AND detect a pause + * 4. OpenAI returns transcription events when we commit + */ +export class STTService { + private readonly emitter: STTEventEmitter + private readonly providerSettingsManager: ProviderSettingsManager + + // Services + private audioCapture: FFmpegCaptureService + private transcriptionClient: OpenAIWhisperClient | null = null + + // Segment-based state + private textSegments: STTSegment[] = [] // All confirmed/polished segments + private currentPreviewText: string = "" // Current streaming preview text + + // Session state + private sessionId: string | null = null + private isActive = false + + // Helps ignore late events from previous runs + private internalSessionId = 0 + + // VAD configuration and state + private vadConfig: VADConfig = DEFAULT_VAD_CONFIG + private totalFrameCount: number = 0 // Frames sent since last commit + private voicedFrameCount: number = 0 // Frames with voice activity since last commit + private lastVoicedAtMs: number = 0 + + private readonly codeGlossary: VisibleCodeGlossary | null + + constructor( + emitter: STTEventEmitter, + providerSettingsManager: ProviderSettingsManager, + codeGlossary: VisibleCodeGlossary | null = null, + ) { + this.emitter = emitter + this.providerSettingsManager = providerSettingsManager + this.codeGlossary = codeGlossary + this.audioCapture = new FFmpegCaptureService() + } + + async start(config: STTProviderConfig, language?: string): Promise<void> { + this.cancel() + + this.sessionId = `stt-${Date.now()}-${Math.random().toString(36).slice(2, 7)}` + if (config.vadConfig) { + this.vadConfig = { ...DEFAULT_VAD_CONFIG, ...config.vadConfig } + } + + // New session + this.internalSessionId++ + this.isActive = true // Set BEFORE audio starts to avoid dropping first frames + this.textSegments = [] + this.currentPreviewText = "" + + // Reset VAD state + this.totalFrameCount = 0 + this.voicedFrameCount = 0 + this.lastVoicedAtMs = 0 + + // The prompt is making it hallucinate more so remove it for now + // const prompt = await this.codeGlossary?.getGlossary() + try { + this.transcriptionClient = new OpenAIWhisperClient(this.providerSettingsManager, { + apiKey: config.apiKey || "", + language: language || config.language || "en", + // prompt, + }) + + this.setupEventHandlers() + + await this.audioCapture.start() + await this.transcriptionClient?.connect() + + this.emitter.onStarted(this.sessionId) + } catch (error) { + const errorMessage = error instanceof Error ? error.message : "Failed to start" + this.emitter.onStopped("error", undefined, errorMessage) + await this.cleanupOnError() + this.sessionId = null + throw error + } + }
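// Emitter lifecycle (illustrative): a typical session produces roughly
//
//     onStarted(sessionId)
//     onVolume(level) / onTranscript(segments, false)   // repeatedly while recording
//     onStopped("completed", finalText)                 // from stop()
//
// cancel() emits onStopped("cancelled"), and error paths emit
// onStopped("error", partialText?, message) instead.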
error.message : "Failed to start" + this.emitter.onStopped("error", undefined, errorMessage) + await this.cleanupOnError() + this.sessionId = null + throw error + } + } + + async stop(): Promise { + if (!this.isActive) { + return this.getFullText() + } + + this.isActive = false // Prevent new audio + late deltas + const currentSession = this.internalSessionId + + try { + await this.stopCapture() + + // Only commit if we have voiced data + if (this.voicedFrameCount > 0 && this.transcriptionClient?.isConnected()) { + this.commit() + } + + // Wait for any pending transcriptions to arrive + await new Promise((resolve) => setTimeout(resolve, 1000)) + + // Convert any remaining preview to confirmed + if (this.currentPreviewText.trim()) { + this.textSegments.push({ text: this.currentPreviewText.trim(), isPreview: false }) + this.currentPreviewText = "" + } + + const finalText = this.getFullText() + await this.transcriptionClient?.disconnect() + + // Only reset if this is still the latest session + if (this.internalSessionId === currentSession) { + this.resetSession() + } + + this.emitter.onStopped("completed", finalText) + return finalText + } catch (error) { + console.error("🎙️ [STTService] Error during stop:", error) + + await this.disconnectClient() + const finalText = this.getFullText() + + if (this.internalSessionId === currentSession) { + this.resetSession() + } + + const errorMessage = error instanceof Error ? error.message : "Failed to stop" + this.emitter.onStopped("error", finalText, errorMessage) + return finalText + } + } + + cancel(): void { + if (!this.transcriptionClient) { + return + } + + try { + this.isActive = false + + this.transcriptionClient.disconnect().catch(() => {}) // Ignore during cancel + this.audioCapture.stop().catch(() => {}) // Ignore during cancel + + this.resetSession() + this.emitter.onStopped("cancelled") + } catch (_error) {} + + this.cleanup() + } + + getSessionId(): string | null { + return this.sessionId + } + + isRecording(): boolean { + return this.isActive + } + + private setupEventHandlers(): void { + this.connectAudioToClient() + this.forwardTranscriptionEvents() + this.handleErrors() + } + + /** + * Connect audio capture to transcription client with local VAD logic + * + * Strategy: + * - Stream ALL audio to OpenAI immediately (they buffer until we commit) + * - Track voice activity locally via energy detection + * - Only commit when: enough voiced frames AND at a pause + * - Never commit empty/silent buffers to avoid hallucination + */ + private connectAudioToClient(): void { + // Stream all audio immediately - OpenAI buffers until we commit + this.audioCapture.on("audioData", (pcm16Buffer: Buffer) => { + if (!this.isActive) return + + // Always stream to OpenAI (they buffer until we commit) + this.transcriptionClient?.sendAudioChunk(pcm16Buffer) + this.totalFrameCount++ + + // Check if we should commit based on voice activity + if (this.shouldCommit()) { + this.commit() + } + }) + + // Track voice activity separately via energy detection + this.audioCapture.on("audioEnergy", (energy: number) => { + if (!this.isActive) return + + if (energy > this.vadConfig.energyThreshold) { + this.lastVoicedAtMs = Date.now() + this.voicedFrameCount++ + } + this.emitter.onVolume(energy) + }) + } + + /** + * Check if conditions met for committing current audio chunk + * Requirements: + * 1. Minimum total duration (prevents tiny fragments) + * 2. Minimum voiced duration (ensures meaningful speech content) + * 3. 
+ + /** + * Check if conditions are met for committing the current audio chunk + * Requirements: + * 1. Minimum total duration (prevents tiny fragments) + * 2. Minimum voiced duration (ensures meaningful speech content) + * 3. Natural pause detected (between-word silence) + */ + private shouldCommit(): boolean { + if (this.voicedFrameCount === 0) { + return false + } + + const now = Date.now() + const silenceSinceMs = now - this.lastVoicedAtMs + const bufferedMs = this.totalFrameCount * this.vadConfig.frameDurationMs + const voicedMs = this.voicedFrameCount * this.vadConfig.frameDurationMs + + // Requirements (absolute time only - no percentage) + const hasMinDuration = bufferedMs >= this.vadConfig.minChunkMs // 1000ms total + const hasMinVoice = voicedMs >= 500 // 500ms of actual speech + const atPause = silenceSinceMs >= this.vadConfig.shortPauseMs // 150ms between-word pause + + // Debug logging + if (bufferedMs >= 2000 && bufferedMs % 1000 < 20) { + console.log( + `🎙️ [STTService] 🔍 Check: buffered=${bufferedMs}ms, voiced=${voicedMs}ms, silence=${silenceSinceMs}ms | min=${hasMinDuration}, voice=${hasMinVoice}, pause=${atPause}`, + ) + } + + // Safety cap: force commit if too long + const atSafetyCap = bufferedMs >= this.vadConfig.maxChunkMs && atPause + + // Commit when all requirements met + if (hasMinDuration && hasMinVoice && atPause) { + console.log( + `🎙️ [STTService] ✓ Commit: pause detected (${silenceSinceMs}ms silence, ${bufferedMs}ms total, ${voicedMs}ms voiced)`, + ) + return true + } + + // Safety cap + if (atSafetyCap && hasMinVoice) { + console.log(`🎙️ [STTService] ✓ Commit: safety cap (${bufferedMs}ms total, ${voicedMs}ms voiced)`) + return true + } + + return false + } + + /** + * Commit buffered audio to OpenAI for transcription + * Only called when we have voiced data + */ + private commit(): void { + if (this.voicedFrameCount === 0) { + return + } + + this.transcriptionClient?.sendInputBufferCommit() + + // Reset counters for next segment + this.totalFrameCount = 0 + this.voicedFrameCount = 0 + } + + private forwardTranscriptionEvents(): void { + if (!this.transcriptionClient) return + + // Delta events: incremental word-by-word streaming (gpt-4o-mini-transcribe) + // Each delta adds new text to build up the current preview + this.transcriptionClient.on("transcriptionDelta", (delta: string) => { + if (!this.isActive) return + + const trimmedDelta = delta.trim() + this.currentPreviewText = (this.currentPreviewText + " " + trimmedDelta).trim() + this.emitCurrentState() + }) + + // Completed event: OpenAI sends polished/corrected text after commit + this.transcriptionClient.on("transcription", (text: string) => { + if (!this.isActive) return + + const trimmed = text.trim() + if (!trimmed) return + + // Convert preview to confirmed segment + this.textSegments.push({ text: trimmed, isPreview: false }) + this.currentPreviewText = "" + + // Emit updated state + this.emitCurrentState() + }) + } + + /** + * Build and emit current transcript state + * Sends complete segments array to WebView + */ + private emitCurrentState(): void { + const allSegments: STTSegment[] = [...this.textSegments] + + // Add current preview if any + if (this.currentPreviewText.trim()) { + allSegments.push({ text: this.currentPreviewText.trim(), isPreview: true }) + } + + this.emitter.onTranscript(allSegments, false) + }
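// For example (illustrative): after two committed segments and a fresh delta,
// the WebView receives
//
//     [
//       { text: "Hello world.", isPreview: false },
//       { text: "Let's refactor", isPreview: false },
//       { text: "the parser", isPreview: true },
//     ]
//
// and the trailing preview entry is replaced by a polished segment once the
// next "completed" event arrives.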
this.transcriptionClient.on("error", (error: Error) => { + console.error("🎙️ [STTService] Transcription API error:", error) + this.handleRecoverableError(error) + }) + } + } + + /** + * Handle recoverable errors by emitting to UI and cleaning up + */ + private async handleRecoverableError(error: Error): Promise { + this.emitter.onStopped("error", undefined, error.message) + + if (this.isActive) { + try { + await this.cleanupOnError() + } catch (cleanupError) { + console.error("Failed to cleanup after error:", cleanupError) + } + } + } + + /** + * Get full text for onComplete callback + * Joins all confirmed segment texts + */ + private getFullText(): string { + return this.textSegments + .map((s) => s.text) + .join("") + .trim() + } + + private async stopCapture(): Promise { + try { + await this.audioCapture.stop() + } catch (error) { + console.error("🎙️ [STTService] Error stopping audio capture:", error) + } + } + + private async disconnectClient(): Promise { + try { + await this.transcriptionClient?.disconnect() + } catch (error) { + console.error("🎙️ [STTService] Error disconnecting client:", error) + } + } + + private resetSession(): void { + this.textSegments = [] + this.currentPreviewText = "" + } + + private async cleanupOnError(): Promise { + this.isActive = false + + // Force kill FFmpeg and disconnect - use Promise.allSettled to ensure both run + const cleanupResults = await Promise.allSettled([ + this.audioCapture.stop(), + this.transcriptionClient?.disconnect() ?? Promise.resolve(), + ]) + + // Log cleanup results for debugging + cleanupResults.forEach((result, index) => { + const name = index === 0 ? "audioCapture" : "transcriptionClient" + if (result.status === "rejected") { + console.error(`🎙️ [STTService] Failed to cleanup ${name}:`, result.reason) + } else { + console.log(`🎙️ [STTService] ${name} cleaned up successfully`) + } + }) + + this.resetSession() + } + + private cleanup(): void { + this.transcriptionClient = null + this.sessionId = null + } +} diff --git a/src/services/stt/__tests__/FFmpegCaptureService.spec.ts b/src/services/stt/__tests__/FFmpegCaptureService.spec.ts new file mode 100644 index 00000000000..d87e2bd336d --- /dev/null +++ b/src/services/stt/__tests__/FFmpegCaptureService.spec.ts @@ -0,0 +1,117 @@ +// Run: npx vitest run services/stt/__tests__/FFmpegCaptureService.spec.ts + +import { EventEmitter } from "events" +import { spawn } from "child_process" +import { FFmpegCaptureService } from "../FFmpegCaptureService" + +// Mock child_process +const createMockProcess = () => { + const stdout = new EventEmitter() + const stderr = new EventEmitter() + + stdout.removeAllListeners = vi.fn(() => stdout) as any + stderr.removeAllListeners = vi.fn(() => stderr) as any + + return { + stdout, + stderr, + kill: vi.fn(), + on: vi.fn(), + once: vi.fn(), + removeAllListeners: vi.fn(), + } +} + +let mockProcess = createMockProcess() +let mockStdout = mockProcess.stdout + +vi.mock("child_process", () => ({ + spawn: vi.fn(() => { + mockProcess = createMockProcess() + mockStdout = mockProcess.stdout + return mockProcess + }), + execSync: vi.fn(() => Buffer.from("ffmpeg version")), +})) + +vi.mock("os", () => ({ + platform: vi.fn(() => "darwin"), +})) + +describe("FFmpegCaptureService", () => { + let capture: FFmpegCaptureService + + beforeEach(() => { + vi.clearAllMocks() + capture = new FFmpegCaptureService() + }) + + afterEach(async () => { + if (capture.isActive()) { + await capture.stop() + } + }) + + describe("Basic functionality", () => { + it("should start and stop 
audio capture", async () => { + expect(capture.isActive()).toBe(false) + + await capture.start() + expect(capture.isActive()).toBe(true) + + mockProcess.once.mock.calls.find((call) => call[0] === "exit")?.[1]() + await capture.stop() + expect(capture.isActive()).toBe(false) + }) + + it("should emit audioData events when receiving PCM16 data", async () => { + const audioDataHandler = vi.fn() + capture.on("audioData", audioDataHandler) + + await capture.start() + + const testBuffer = Buffer.from([1, 2, 3, 4]) + mockStdout.emit("data", testBuffer) + + expect(audioDataHandler).toHaveBeenCalledWith(testBuffer) + }) + + it("should throw error if already capturing", async () => { + await capture.start() + await expect(capture.start()).rejects.toThrow("Audio capture already in progress") + }) + }) + + describe("Error handling", () => { + it("should emit error on FFmpeg process error", async () => { + const errorHandler = vi.fn() + capture.on("error", errorHandler) + + await capture.start() + + const testError = new Error("FFmpeg process error") + mockProcess.on.mock.calls.find((call) => call[0] === "error")?.[1](testError) + + expect(errorHandler).toHaveBeenCalledWith(testError) + }) + }) + + describe("Platform support", () => { + it("should use correct FFmpeg args for macOS", async () => { + await capture.start() + + const spawnCall = vi.mocked(spawn).mock.calls[0] + const args = spawnCall[1] as string[] + + // Verify platform-specific input + expect(args).toContain("-f") + expect(args).toContain("avfoundation") + + // Verify PCM16 output format + expect(args).toContain("-acodec") + expect(args).toContain("pcm_s16le") + expect(args).toContain("-ar") + expect(args).toContain("24000") // 24kHz required by Realtime API + }) + }) +}) diff --git a/src/services/stt/__tests__/vadLogic.spec.ts b/src/services/stt/__tests__/vadLogic.spec.ts new file mode 100644 index 00000000000..5b853eb0d4b --- /dev/null +++ b/src/services/stt/__tests__/vadLogic.spec.ts @@ -0,0 +1,252 @@ +import { describe, it, expect } from "vitest" +import { DEFAULT_VAD_CONFIG } from "../types" + +/** + * Calculate RMS energy of PCM16 audio frame + * (Extracted from FFmpegCaptureService for testing) + */ +function calculateFrameEnergy(pcm16: Int16Array): number { + let sum = 0 + for (let i = 0; i < pcm16.length; i++) { + const normalized = pcm16[i] / 32768 + sum += normalized * normalized + } + const rms = Math.sqrt(sum / pcm16.length) + return Math.min(rms, 1.0) +} + +/** + * Check if frame is voiced based on energy threshold + */ +function isVoicedFrame(energy: number, threshold: number = DEFAULT_VAD_CONFIG.energyThreshold): boolean { + return energy > threshold +} + +/** + * Check if should commit chunk based on VAD logic + * Now requires voicedRatio to be above minVoicedRatio + */ +function shouldCommitChunk( + bufferedAudioMs: number, + silenceSinceMs: number, + voicedRatio: number, + minChunkMs: number = DEFAULT_VAD_CONFIG.minChunkMs, + shortPauseMs: number = DEFAULT_VAD_CONFIG.shortPauseMs, + maxChunkMs: number = DEFAULT_VAD_CONFIG.maxChunkMs, + minVoicedRatio: number = DEFAULT_VAD_CONFIG.minVoicedRatio, +): boolean { + // Never commit without voiced data + if (voicedRatio === 0) { + return false + } + + const hasEnoughAudio = bufferedAudioMs >= minChunkMs + const atShortPause = silenceSinceMs >= shortPauseMs + const hasVoice = voicedRatio >= minVoicedRatio + const atSafetyCap = bufferedAudioMs >= maxChunkMs && atShortPause + + // Commit on natural pause with enough voice content + if (hasEnoughAudio && atShortPause && hasVoice) 
{ + return true + } + + // Safety cap: force commit if too long AND at pause AND has voice + if (atSafetyCap && hasVoice) { + return true + } + + return false +} + +describe("VAD Energy Calculation", () => { + it("should calculate zero energy for silence", () => { + const silence = new Int16Array(480).fill(0) // 20ms at 24kHz + const energy = calculateFrameEnergy(silence) + expect(energy).toBe(0) + }) + + it("should calculate non-zero energy for audio", () => { + const audio = new Int16Array(480) + for (let i = 0; i < audio.length; i++) { + audio[i] = Math.sin(i / 10) * 1000 // Sine wave + } + const energy = calculateFrameEnergy(audio) + expect(energy).toBeGreaterThan(0) + expect(energy).toBeLessThanOrEqual(1) + }) + + it("should cap energy at 1.0", () => { + const loudAudio = new Int16Array(480).fill(32767) // Max PCM16 value + const energy = calculateFrameEnergy(loudAudio) + expect(energy).toBeCloseTo(1.0, 2) // Within 0.01 of 1.0 + }) + + it("should calculate different energies for different amplitudes", () => { + const quietAudio = new Int16Array(480).fill(1000) + const loudAudio = new Int16Array(480).fill(10000) + + const quietEnergy = calculateFrameEnergy(quietAudio) + const loudEnergy = calculateFrameEnergy(loudAudio) + + expect(loudEnergy).toBeGreaterThan(quietEnergy) + }) + + it("should detect voiced frames above threshold", () => { + expect(isVoicedFrame(0.02, 0.015)).toBe(true) // Above threshold + expect(isVoicedFrame(0.01, 0.015)).toBe(false) // Below threshold + expect(isVoicedFrame(0.015, 0.015)).toBe(false) // Equal to threshold + }) +}) + +describe("VAD Chunking Logic", () => { + it("should not commit without any voice data", () => { + const bufferedMs = 1200 // Above 1000ms min + const silenceMs = 400 // Above 150ms short pause + const voicedRatio = 0 // No voice at all + + const shouldCommit = shouldCommitChunk(bufferedMs, silenceMs, voicedRatio) + expect(shouldCommit).toBe(false) + }) + + it("should not commit before minChunkMs even with voice", () => { + const bufferedMs = 500 // Below 1000ms min + const silenceMs = 400 // Above 150ms short pause + const voicedRatio = 0.5 // Good voice ratio + + const shouldCommit = shouldCommitChunk(bufferedMs, silenceMs, voicedRatio) + expect(shouldCommit).toBe(false) + }) + + it("should commit at short pause with enough audio and voice", () => { + const bufferedMs = 1200 // Above 1000ms min + const silenceMs = 400 // Above 150ms short pause + const voicedRatio = 0.5 // Above 30% min + + const shouldCommit = shouldCommitChunk(bufferedMs, silenceMs, voicedRatio) + expect(shouldCommit).toBe(true) + }) + + it("should not commit with enough audio but no pause", () => { + const bufferedMs = 1200 // Above 1000ms min + const silenceMs = 100 // Below 150ms short pause + const voicedRatio = 0.5 // Good voice ratio + + const shouldCommit = shouldCommitChunk(bufferedMs, silenceMs, voicedRatio) + expect(shouldCommit).toBe(false) + }) + + it("should not commit with enough audio and pause but insufficient voice", () => { + const bufferedMs = 1200 // Above 1000ms min + const silenceMs = 400 // Above 150ms short pause + const voicedRatio = 0.2 // Below 30% min + + const shouldCommit = shouldCommitChunk(bufferedMs, silenceMs, voicedRatio) + expect(shouldCommit).toBe(false) + })
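// Worked example with the defaults exercised above: 60 frames * 20ms = 1200ms
// buffered >= minChunkMs (1000ms); 30 voiced frames / 60 total = 0.5 >=
// minVoicedRatio (0.3); 400ms since the last voiced frame >= shortPauseMs
// (150ms) => commit.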
+ + it("should force commit at maxChunkMs only with pause and voice", () => { + const bufferedMs = 10100 // Above 10000ms max + const silenceMs = 0 // No silence at all + const voicedRatio = 0.5 // Good voice ratio + + // No pause = no commit even at safety cap + const shouldCommit = shouldCommitChunk(bufferedMs, silenceMs, voicedRatio) + expect(shouldCommit).toBe(false) + }) + + it("should force commit at maxChunkMs with pause and voice", () => { + const bufferedMs = 10100 // Above 10000ms max + const silenceMs = 400 // Above 150ms short pause + const voicedRatio = 0.5 // Good voice ratio + + const shouldCommit = shouldCommitChunk(bufferedMs, silenceMs, voicedRatio) + expect(shouldCommit).toBe(true) + }) + + it("should not commit with short audio and short silence", () => { + const bufferedMs = 500 // Below 1000ms min + const silenceMs = 100 // Below 150ms short pause + const voicedRatio = 0.5 // Good voice ratio + + const shouldCommit = shouldCommitChunk(bufferedMs, silenceMs, voicedRatio) + expect(shouldCommit).toBe(false) + }) + + it("should commit exactly at threshold values", () => { + // At min audio and min voice, above the 150ms short pause + expect(shouldCommitChunk(1000, 300, 0.3)).toBe(true) + + // At max chunk with pause and voice + expect(shouldCommitChunk(10000, 300, 0.3)).toBe(true) + }) +}) + +describe("VAD Configuration", () => { + it("should have sensible default values", () => { + expect(DEFAULT_VAD_CONFIG.energyThreshold).toBe(0.02) + expect(DEFAULT_VAD_CONFIG.minChunkMs).toBe(1000) + expect(DEFAULT_VAD_CONFIG.shortPauseMs).toBe(150) + expect(DEFAULT_VAD_CONFIG.maxChunkMs).toBe(10000) + expect(DEFAULT_VAD_CONFIG.frameDurationMs).toBe(20) + expect(DEFAULT_VAD_CONFIG.minVoicedRatio).toBe(0.3) + }) + + it("should have maxChunkMs > minChunkMs", () => { + expect(DEFAULT_VAD_CONFIG.maxChunkMs).toBeGreaterThan(DEFAULT_VAD_CONFIG.minChunkMs) + }) + + it("should have reasonable frame duration", () => { + // Frame duration should be between 10-30ms for real-time audio + expect(DEFAULT_VAD_CONFIG.frameDurationMs).toBeGreaterThanOrEqual(10) + expect(DEFAULT_VAD_CONFIG.frameDurationMs).toBeLessThanOrEqual(30) + }) + + it("should have reasonable voice ratio threshold", () => { + // Should be between 10-50% for practical use + expect(DEFAULT_VAD_CONFIG.minVoicedRatio).toBeGreaterThanOrEqual(0.1) + expect(DEFAULT_VAD_CONFIG.minVoicedRatio).toBeLessThanOrEqual(0.5) + }) +}) + +describe("VAD Edge Cases", () => { + it("should handle zero-length audio frames", () => { + const emptyFrame = new Int16Array(0) + const energy = calculateFrameEnergy(emptyFrame) + expect(isNaN(energy) || energy === 0).toBe(true) + }) + + it("should handle negative PCM values", () => { + const negativeAudio = new Int16Array(480).fill(-10000) + const energy = calculateFrameEnergy(negativeAudio) + expect(energy).toBeGreaterThan(0) + expect(energy).toBeLessThanOrEqual(1) + }) + + it("should handle mixed positive/negative values", () => { + const mixedAudio = new Int16Array(480) + for (let i = 0; i < mixedAudio.length; i++) { + mixedAudio[i] = i % 2 === 0 ?
5000 : -5000 + } + const energy = calculateFrameEnergy(mixedAudio) + expect(energy).toBeGreaterThan(0) + expect(energy).toBeLessThanOrEqual(1) + }) + + it("should not commit very long buffer without voice", () => { + const bufferedMs = 10000 // Very long + const silenceMs = 400 // Has pause + const voicedRatio = 0 // No voice + + const shouldCommit = shouldCommitChunk(bufferedMs, silenceMs, voicedRatio) + expect(shouldCommit).toBe(false) // Never commit without voice + }) + + it("should handle very long silence with voice", () => { + const bufferedMs = 1200 // Above 1000ms min + const silenceMs = 5000 // Very long silence + const voicedRatio = 0.4 // Good voice ratio + + const shouldCommit = shouldCommitChunk(bufferedMs, silenceMs, voicedRatio) + expect(shouldCommit).toBe(true) // Should commit on pause with voice + }) +}) diff --git a/src/services/stt/context/__tests__/codeGlossaryExtractor.spec.ts b/src/services/stt/context/__tests__/codeGlossaryExtractor.spec.ts new file mode 100644 index 00000000000..e8233f606b1 --- /dev/null +++ b/src/services/stt/context/__tests__/codeGlossaryExtractor.spec.ts @@ -0,0 +1,189 @@ +// Essential sanity checks for code glossary extraction +// Run: cd $WORKSPACE_ROOT/src && npx vitest run services/stt/context/__tests__/codeGlossaryExtractor.spec.ts + +import { extractCodeGlossary, formatGlossaryAsPrompt } from "../codeGlossaryExtractor" +import type { VisibleCodeContext } from "../../../ghost/types" + +describe("extractCodeGlossary", () => { + it("always includes core Kilocode terms", () => { + const visibleCode: VisibleCodeContext = { + timestamp: Date.now(), + editors: [], + } + + const glossary = extractCodeGlossary(visibleCode) + + // Core terms should always be present + expect(glossary.identifiers).toContain("Kilocode") + expect(glossary.identifiers).toContain("Kilo Code") + expect(glossary.identifiers).toContain("VSCode") + expect(glossary.identifiers).toContain("MCP") + }) + + it("extracts meaningful identifiers (4+ chars) and filters out short common words", () => { + const visibleCode: VisibleCodeContext = { + timestamp: Date.now(), + editors: [ + { + filePath: "/test.ts", + relativePath: "test.ts", + languageId: "typescript", + isActive: true, + visibleRanges: [ + { + startLine: 0, + endLine: 3, + content: "const userName = 42; const db = null; for (let i = 0; i < 10; i++) {}", + }, + ], + cursorPosition: null, + selections: [], + diffInfo: undefined, + }, + ], + } + + const glossary = extractCodeGlossary(visibleCode) + + // Core terms are present + expect(glossary.identifiers).toContain("Kilocode") + + // Meaningful identifiers (4+ chars) are included + expect(glossary.identifiers).toContain("userName") + expect(glossary.identifiers).toContain("const") + expect(glossary.identifiers).toContain("null") + + // Short common words (3 chars or less) are filtered out + expect(glossary.identifiers).not.toContain("db") + expect(glossary.identifiers).not.toContain("let") + expect(glossary.identifiers).not.toContain("for") + expect(glossary.identifiers).not.toContain("i") + }) + + it("extracts meaningful identifiers from realistic code", () => { + const visibleCode: VisibleCodeContext = { + timestamp: Date.now(), + editors: [ + { + filePath: "/app.js", + relativePath: "app.js", + languageId: "javascript", + isActive: true, + visibleRanges: [ + { + startLine: 0, + endLine: 20, + content: ` + const galleryItems = document.querySelectorAll('.gallery-item'); + galleryItems.forEach(item => { + item.addEventListener('mouseenter', function () { + 
this.style.transform = 'scale(1.05) rotate(1deg)'; + }); + }); + + const subtitle = document.querySelector('.hero-subtitle'); + if (subtitle) { + const text = subtitle.textContent; + subtitle.textContent = ''; + } + `, + }, + ], + cursorPosition: null, + selections: [], + diffInfo: undefined, + }, + ], + } + + const glossary = extractCodeGlossary(visibleCode) + + // Should extract meaningful code identifiers + expect(glossary.identifiers).toContain("galleryItems") + expect(glossary.identifiers).toContain("document") + expect(glossary.identifiers).toContain("querySelectorAll") + expect(glossary.identifiers).toContain("forEach") + expect(glossary.identifiers).toContain("addEventListener") + expect(glossary.identifiers).toContain("mouseenter") + expect(glossary.identifiers).toContain("function") + expect(glossary.identifiers).toContain("style") + expect(glossary.identifiers).toContain("transform") + expect(glossary.identifiers).toContain("scale") + expect(glossary.identifiers).toContain("rotate") + expect(glossary.identifiers).toContain("subtitle") + expect(glossary.identifiers).toContain("querySelector") + expect(glossary.identifiers).toContain("textContent") + + // Should NOT extract short common words or keywords + expect(glossary.identifiers).not.toContain("the") + expect(glossary.identifiers).not.toContain("to") + expect(glossary.identifiers).not.toContain("if") + expect(glossary.identifiers).not.toContain("deg") + + // Core terms should still be present + expect(glossary.identifiers).toContain("Kilocode") + expect(glossary.identifiers).toContain("VSCode") + }) + + it("includes core terms even with empty visible code", () => { + const visibleCode: VisibleCodeContext = { + timestamp: Date.now(), + editors: [], + } + + const glossary = extractCodeGlossary(visibleCode) + + // Should have core terms even without visible code + expect(glossary.identifiers.length).toBeGreaterThan(0) + expect(glossary.identifiers).toContain("Kilocode") + }) +}) + +describe("formatGlossaryAsPrompt", () => { + it("formats identifiers as natural language prompt", () => { + const glossary = { + identifiers: ["userName", "firstName", "lastName"], + } + + const prompt = formatGlossaryAsPrompt(glossary) + + // Check format matches current implementation + expect(prompt).toContain( + "Context: The user is a software developer. 
Terms that MAY appear in their speech include:", + ) + expect(prompt).toContain("userName") + expect(prompt).toContain("firstName") + expect(prompt).toContain("lastName") + }) + + it("returns empty string for empty glossary", () => { + const glossary = { identifiers: [] } + + const prompt = formatGlossaryAsPrompt(glossary) + + expect(prompt).toBe("") + }) + + it("prioritizes core terms then limits to 50 total terms", () => { + // Create glossary with core terms + many extracted terms + const coreTerms = ["Kilocode", "Kilo Code", "VSCode", "MCP"] + const extractedTerms = Array(100) + .fill(0) + .map((_, i) => `id${i}`) + + const glossary = { + identifiers: [...coreTerms, ...extractedTerms], + } + + const prompt = formatGlossaryAsPrompt(glossary) + + // Core terms should be present + expect(prompt).toContain("Kilocode") + expect(prompt).toContain("VSCode") + + // Count total comma-separated terms + const allTerms = prompt.split("include: ")[1].split(".") + const termsList = allTerms[0].split(", ") + expect(termsList.length).toBeLessThanOrEqual(50) + }) +}) diff --git a/src/services/stt/context/codeGlossaryExtractor.ts b/src/services/stt/context/codeGlossaryExtractor.ts new file mode 100644 index 00000000000..27473b4fe80 --- /dev/null +++ b/src/services/stt/context/codeGlossaryExtractor.ts @@ -0,0 +1,68 @@ +// kilocode_change - new file: Extract code identifiers from visible code for STT context +import type { VisibleCodeContext } from "../../ghost/types" + +export interface CodeGlossary { + identifiers: string[] // Unique code identifiers from visible code +} + +/** + * Core terms that should always be included in the glossary + * These are Kilocode-specific or commonly spoken technical terms + */ +const CORE_TERMS = ["Kilocode", "Kilo Code", "VSCode", "MCP"] + +/** + * Extract code identifiers from visible editors + * Uses regex-based extraction (fast, language-agnostic) + * Always includes core Kilocode-related terms + */ +export function extractCodeGlossary(visibleCode: VisibleCodeContext): CodeGlossary { + const identifiers = new Set<string>() + + // Always include core terms + for (const term of CORE_TERMS) { + identifiers.add(term) + } + + for (const editor of visibleCode.editors) { + for (const range of editor.visibleRanges) { + // Extract identifiers using regex: camelCase, PascalCase, snake_case + const regex = /\b[a-zA-Z_][a-zA-Z0-9_]*\b/g + const matches = range.content.match(regex) || [] + + for (const match of matches) { + if (isValidIdentifier(match)) { + identifiers.add(match) + } + } + } + } + + return { + identifiers: Array.from(identifiers), + } +} + +/** + * Format glossary as OpenAI Whisper prompt + * Optimized for Whisper's ~224 token limit (~150-200 words) + * Prioritizes core terms, then longer identifiers + */ +export function formatGlossaryAsPrompt(glossary: CodeGlossary, limitCount: number = 50): string { + if (glossary.identifiers.length === 0) { + return "" + } + + // Exclude core terms (re-added below) and sort longer identifiers first; + // filtering into a new array also avoids mutating the caller's input + const sortedExtracted = glossary.identifiers.filter((id) => !CORE_TERMS.includes(id)).sort((a, b) => b.length - a.length) + + // Always include core terms first, then fill remaining space with extracted terms + const remainingSlots = Math.max(0, limitCount - CORE_TERMS.length) + const prioritized = [...CORE_TERMS, ...sortedExtracted.slice(0, remainingSlots)] + + return `Context: The user is a software developer. Terms that MAY appear in their speech include: ${prioritized.join(", ")}.` +} + +function isValidIdentifier(word: string): boolean { + return word.length >= 4 +}
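A quick usage sketch (illustrative): feeding captured editor context through the two functions above, where `visibleCode` is assumed to come from the ghost service's visible-code capture:

	const glossary = extractCodeGlossary(visibleCode)
	const prompt = formatGlossaryAsPrompt(glossary)
	// => "Context: The user is a software developer. Terms that MAY appear
	//     in their speech include: Kilocode, Kilo Code, VSCode, MCP, ..."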
diff --git a/src/services/stt/index.ts b/src/services/stt/index.ts new file mode 100644 index 00000000000..2c9e37239f8 --- /dev/null +++ b/src/services/stt/index.ts @@ -0,0 +1,5 @@ +// kilocode_change - new file: STT service public exports +export { STTService } from "./STTService" +export { OpenAIWhisperClient } from "./OpenAIWhisperClient" +export { FFmpegCaptureService } from "./FFmpegCaptureService" +export * from "./types" diff --git a/src/services/stt/types.ts b/src/services/stt/types.ts new file mode 100644 index 00000000000..0ef4e515e79 --- /dev/null +++ b/src/services/stt/types.ts @@ -0,0 +1,122 @@ +// kilocode_change - new file: STT service type definitions +import { STTSegment } from "../../shared/sttContract" + +/** + * Interface for providing code glossary context to STT service + * Implementations capture visible code and format it as a prompt + */ +export interface VisibleCodeGlossary { + getGlossary(): Promise<string> +} + +/** + * Voice Activity Detection (VAD) configuration + * + * Configuration guide: + * - energyThreshold (0.02 default): Voice detection sensitivity + * - Lower = more sensitive (may pick up background noise) + * - Higher = less sensitive (may miss quiet speech) + * - Typical range: 0.01 - 0.03 + * + * - shortPauseMs (150ms default): Natural pause detection for commits + * - Detects brief pauses between phrases/breaths + * - Too low: commits too frequently, may split words + * - Too high: delays transcription + * - Typical range: 100-500ms + * + * - minChunkMs (1000ms default): Minimum audio before allowing commit + * - Prevents tiny fragments from being committed + * - Typical range: 500-1500ms + * + * - maxChunkMs (10000ms default): Safety cap for continuous speech + * - Forces commit during pauses if buffer grows too large + * - Typical range: 8000-15000ms + * + * - minVoicedRatio (0.3 default): Minimum voice content required + * - Prevents committing silent/empty buffers (causes hallucination) + * - Typical range: 0.2-0.5 + */ +export interface VADConfig { + /** Energy threshold for voiced frames (0-1 scale) */ + energyThreshold: number + + /** Minimum chunk duration in ms before allowing commit */ + minChunkMs: number + + /** Short pause duration in ms (word gap detection) */ + shortPauseMs: number + + /** Maximum chunk duration in ms (safety cap for continuous speech) */ + maxChunkMs: number + + /** Frame duration in ms (depends on FFmpeg buffer size) */ + frameDurationMs: number + + /** Minimum ratio of voiced frames required to commit (0-1 scale) */ + minVoicedRatio: number +} + +/** + * Default VAD configuration + * Conservative settings to avoid committing empty/silent buffers + */ +export const DEFAULT_VAD_CONFIG: VADConfig = { + energyThreshold: 0.02, // Voice detection threshold + minChunkMs: 1000, // Minimum 1 second of audio before allowing commit + shortPauseMs: 150, // 150ms pause for commits (catches natural between-word pauses) + maxChunkMs: 10000, // Safety cap at 10 seconds + frameDurationMs: 20, // Typical for 24kHz audio + minVoicedRatio: 0.3, // Require 30% voice activity before committing +} + +/** + * Configuration passed to STT provider + */ +export interface STTProviderConfig { + apiKey?: string + language?: string + prompt?: string // Code glossary/context for better accuracy + vadConfig?: Partial<VADConfig> // Override VAD configuration +}
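// Example (illustrative): callers may override individual VAD fields; anything
// left unspecified falls back to DEFAULT_VAD_CONFIG when STTService starts.
const exampleProviderConfig: STTProviderConfig = {
	language: "en",
	vadConfig: { shortPauseMs: 120, minChunkMs: 800 }, // snappier interim commits
}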
+ +/** + * Callbacks providers use to emit events + * This is the bridge between provider internals and the WebView event system + */ +export interface STTEventEmitter { + onStarted: (sessionId: string) => void + onTranscript: (segments: STTSegment[], isFinal: boolean) => void + onVolume: (level: number) => void + onStopped: (reason: "completed" | "cancelled" | "error", text?: string, error?: string) => void +} + +/** + * Internal state tracking for providers + */ +export interface STTSessionState { + sessionId: string + isRecording: boolean + language?: string +} + +/** + * Progressive transcription result + * Emitted during recording with real-time transcription updates + */ +export interface ProgressiveResult { + chunkId: number + text: string + isInterim: boolean + confidence: number + totalDuration: number + sequenceNumber: number +} + +/** + * Configuration for transcription service + */ +export interface TranscriptionServiceConfig { + apiKey: string + language?: string + prompt?: string // Optional context prompt for code identifiers +} diff --git a/src/services/stt/utils/getOpenAiCredentials.ts b/src/services/stt/utils/getOpenAiCredentials.ts new file mode 100644 index 00000000000..6f1af42dc07 --- /dev/null +++ b/src/services/stt/utils/getOpenAiCredentials.ts @@ -0,0 +1,63 @@ +// kilocode_change - new file: Shared utility for OpenAI credential retrieval +import type { ProviderSettingsManager } from "../../../core/config/ProviderSettingsManager" + +/** + * Get OpenAI API key from provider settings + * Searches for any provider with type "openai" or "openai-native" + * + * Both provider types can access the OpenAI API, but store keys in different fields: + * - "openai" provider: uses openAiApiKey field + * - "openai-native" provider: uses openAiNativeApiKey field + */ +export async function getOpenAiApiKey(providerSettingsManager: ProviderSettingsManager): Promise<string | null> { + try { + const allProfiles = await providerSettingsManager.listConfig() + + for (const profile of allProfiles) { + if (profile.apiProvider === "openai" || profile.apiProvider === "openai-native") { + const fullProfile = await providerSettingsManager.getProfile({ id: profile.id }) + + if (profile.apiProvider === "openai" && fullProfile.openAiApiKey) { + return fullProfile.openAiApiKey + } + + if (profile.apiProvider === "openai-native" && fullProfile.openAiNativeApiKey) { + return fullProfile.openAiNativeApiKey + } + } + } + + return null + } catch (error) { + console.error("[getOpenAiCredentials] Error getting API key:", error) + return null + } +} + +/** + * Get OpenAI base URL from provider settings + * Returns the configured base URL or defaults to OpenAI's official API + */ +export async function getOpenAiBaseUrl(providerSettingsManager: ProviderSettingsManager): Promise<string> { + try { + const allProfiles = await providerSettingsManager.listConfig() + + for (const profile of allProfiles) { + if (profile.apiProvider === "openai" || profile.apiProvider === "openai-native") { + const fullProfile = await providerSettingsManager.getProfile({ id: profile.id }) + + if (profile.apiProvider === "openai" && fullProfile.openAiBaseUrl) { + return fullProfile.openAiBaseUrl + } + + if (profile.apiProvider === "openai-native" && fullProfile.openAiNativeBaseUrl) { + return fullProfile.openAiNativeBaseUrl + } + } + } + + return "https://api.openai.com/v1" + } catch { + return "https://api.openai.com/v1" + } +}
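// Example (illustrative) of how OpenAIWhisperClient.connect() rewrites a custom
// base URL from these settings into a Realtime WebSocket endpoint:
//
//     "https://my-proxy.example.com/v1"  ->  "wss://my-proxy.example.com/v1/realtime"
//
// The default "https://api.openai.com/v1" is left untouched, so the client
// keeps its built-in "wss://api.openai.com/v1/realtime".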
diff --git a/src/shared/ExtensionMessage.ts b/src/shared/ExtensionMessage.ts index 2288131b47d..0b383a7541e 100644 --- a/src/shared/ExtensionMessage.ts +++ b/src/shared/ExtensionMessage.ts @@ -33,6 +33,7 @@ import { import { ClineRulesToggles } from "./cline-rules" import { KiloCodeWrapperProperties } from "./kilocode/wrapper" import { DeploymentRecord } from "../api/providers/fetchers/sap-ai-core" +import { STTSegment } from "./sttContract" // kilocode_change: STT segment type // kilocode_change end // Command interface for frontend/backend communication @@ -133,6 +134,10 @@ export interface ExtensionMessage { | "openInBrowser" // kilocode_change | "acceptInput" | "focusChatInput" // kilocode_change + | "stt:started" // kilocode_change: STT session started + | "stt:transcript" // kilocode_change: STT transcript update + | "stt:volume" // kilocode_change: STT volume level + | "stt:stopped" // kilocode_change: STT session stopped | "setHistoryPreviewCollapsed" | "commandExecutionStatus" | "mcpExecutionStatus" @@ -264,6 +269,11 @@ export interface ExtensionMessage { slug?: string success?: boolean values?: Record<string, any> + sessionId?: string // kilocode_change: STT session ID + segments?: STTSegment[] // kilocode_change: STT transcript segments (complete state) + isFinal?: boolean // kilocode_change: STT transcript is final + level?: number // kilocode_change: STT volume level (0-1) + reason?: "completed" | "cancelled" | "error" // kilocode_change: STT stop reason requestId?: string promptText?: string results?: { path: string; type: "file" | "folder"; label?: string }[] @@ -528,6 +538,7 @@ export type ExtensionState = Pick< virtualQuotaActiveModel?: { id: string; info: ModelInfo } // kilocode_change: Add virtual quota active model for UI display showTimestamps?: boolean // kilocode_change: Show timestamps in chat messages debug?: boolean + speechToTextAvailable?: boolean // kilocode_change: Whether speech-to-text is fully configured (FFmpeg + OpenAI key) } export interface ClineSayTool { diff --git a/src/shared/WebviewMessage.ts b/src/shared/WebviewMessage.ts index 419adea43cd..4d2122066f5 100644 --- a/src/shared/WebviewMessage.ts +++ b/src/shared/WebviewMessage.ts @@ -114,9 +114,12 @@ export interface WebviewMessage { | "toggleToolEnabledForPrompt" | "toggleMcpServer" | "updateMcpTimeout" + | "fuzzyMatchThreshold" // kilocode_change | "morphApiKey" // kilocode_change: Morph fast apply - global setting | "fastApplyModel" // kilocode_change: Fast Apply model selection | "fastApplyApiProvider" // kilocode_change: Fast Apply model api base url + | "writeDelayMs" // kilocode_change + | "diagnosticsEnabled" // kilocode_change | "enhancePrompt" | "enhancedPrompt" | "draggedImages" @@ -138,6 +141,10 @@ export interface WebviewMessage { | "commitMessageApiConfigId" // kilocode_change | "terminalCommandApiConfigId" // kilocode_change | "ghostServiceSettings" // kilocode_change + | "stt:start" // kilocode_change: Start STT recording + | "stt:stop" // kilocode_change: Stop STT recording + | "stt:cancel" // kilocode_change: Cancel STT recording + | "includeTaskHistoryInEnhance" // kilocode_change | "autoApprovalEnabled" | "yoloMode" // kilocode_change | "updateCustomMode" @@ -220,6 +227,7 @@ export interface WebviewMessage { | "marketplaceInstallResult" | "fetchMarketplaceData" | "switchTab" + | "profileThresholds" // kilocode_change | "editMessage" // kilocode_change | "systemNotificationsEnabled" // kilocode_change | "dismissNotificationId" // kilocode_change @@ -232,7 +240,7 @@ export interface WebviewMessage { | "autoPurgeCompletedTaskRetentionDays" // kilocode_change | "autoPurgeIncompleteTaskRetentionDays" // kilocode_change | "manualPurge" //
kilocode_change - | "shareTaskSuccess" + | "shareTaskSuccess" // kilocode_change | "exportMode" | "exportModeResult" | "importMode" @@ -326,6 +334,7 @@ export interface WebviewMessage { query?: string setting?: string slug?: string + language?: string // User's language for speech transcription (STT) modeConfig?: ModeConfig timeout?: number payload?: WebViewMessagePayload diff --git a/src/shared/__tests__/experiments.spec.ts b/src/shared/__tests__/experiments.spec.ts index 8783a2b1148..7d18d9b6a29 100644 --- a/src/shared/__tests__/experiments.spec.ts +++ b/src/shared/__tests__/experiments.spec.ts @@ -23,10 +23,20 @@ describe("experiments", () => { }) }) + describe("SPEECH_TO_TEXT", () => { + it("is configured correctly", () => { + expect(EXPERIMENT_IDS.SPEECH_TO_TEXT).toBe("speechToText") + expect(experimentConfigsMap.SPEECH_TO_TEXT).toMatchObject({ + enabled: false, + }) + }) + }) + describe("isEnabled", () => { it("returns false when POWER_STEERING experiment is not enabled", () => { const experiments: Record<ExperimentId, boolean> = { morphFastApply: false, // kilocode_change + speechToText: false, // kilocode_change powerSteering: false, multiFileApplyDiff: false, preventFocusDisruption: false, @@ -40,6 +50,7 @@ describe("experiments", () => { it("returns true when experiment POWER_STEERING is enabled", () => { const experiments: Record<ExperimentId, boolean> = { morphFastApply: false, // kilocode_change + speechToText: false, // kilocode_change powerSteering: true, multiFileApplyDiff: false, preventFocusDisruption: false, @@ -53,6 +64,7 @@ describe("experiments", () => { it("returns false when experiment is not present", () => { const experiments: Record<ExperimentId, boolean> = { morphFastApply: false, // kilocode_change + speechToText: false, // kilocode_change powerSteering: false, multiFileApplyDiff: false, preventFocusDisruption: false, diff --git a/src/shared/experiments.ts b/src/shared/experiments.ts index 7fc605f5420..bf4c4e14790 100644 --- a/src/shared/experiments.ts +++ b/src/shared/experiments.ts @@ -2,6 +2,7 @@ import type { AssertEqual, Equals, Keys, Values, ExperimentId, Experiments } fro export const EXPERIMENT_IDS = { MORPH_FAST_APPLY: "morphFastApply", // kilocode_change + SPEECH_TO_TEXT: "speechToText", // kilocode_change MULTI_FILE_APPLY_DIFF: "multiFileApplyDiff", POWER_STEERING: "powerSteering", PREVENT_FOCUS_DISRUPTION: "preventFocusDisruption", @@ -20,6 +21,7 @@ interface ExperimentConfig { export const experimentConfigsMap: Record<keyof typeof EXPERIMENT_IDS, ExperimentConfig> = { MORPH_FAST_APPLY: { enabled: false }, // kilocode_change + SPEECH_TO_TEXT: { enabled: false }, // kilocode_change MULTI_FILE_APPLY_DIFF: { enabled: false }, POWER_STEERING: { enabled: false }, PREVENT_FOCUS_DISRUPTION: { enabled: false }, diff --git a/src/shared/sttContract.ts b/src/shared/sttContract.ts new file mode 100644 index 00000000000..2ef58a6e855 --- /dev/null +++ b/src/shared/sttContract.ts @@ -0,0 +1,66 @@ +// kilocode_change - new file: STT contract types shared between extension and webview +// Speech-to-Text (STT) event protocol + +/** + * Commands: WebView → Extension + */ +export interface STTStartCommand { + type: "stt:start" + language?: string // ISO 639-1 (e.g., "en", "es", "zh") +} + +export interface STTStopCommand { + type: "stt:stop" +} + +export interface STTCancelCommand { + type: "stt:cancel" +} + +export type STTCommand = STTStartCommand | STTStopCommand | STTCancelCommand + +/** + * Events: Extension → WebView + */ +export interface STTStartedEvent { + type: "stt:started" + sessionId: string +} + +/** + * A segment of transcribed text + */ +export interface STTSegment {
+ text: string // The transcribed text + isPreview: boolean // true = streaming/tentative, false = completed/polished +} + +export interface STTTranscriptEvent { + type: "stt:transcript" + sessionId: string + segments: STTSegment[] // Ordered list of all text segments + isFinal: boolean // false = still updating, true = utterance complete +} + +export interface STTVolumeEvent { + type: "stt:volume" + sessionId: string + level: number // 0.0 to 1.0 +} + +export interface STTStoppedEvent { + type: "stt:stopped" + sessionId: string + reason: "completed" | "cancelled" | "error" + text?: string // Final transcript (when reason === "completed") + error?: string // Error message (when reason === "error") +} + +export type STTEvent = STTStartedEvent | STTTranscriptEvent | STTVolumeEvent | STTStoppedEvent + +/** + * Type guard for routing in message handlers + */ +export function isSTTCommand(msg: { type: string }): msg is STTCommand { + return msg.type === "stt:start" || msg.type === "stt:stop" || msg.type === "stt:cancel" +} diff --git a/webview-ui/src/components/chat/ChatTextArea.tsx b/webview-ui/src/components/chat/ChatTextArea.tsx index 001a68ff9d6..1c0beeb8299 100644 --- a/webview-ui/src/components/chat/ChatTextArea.tsx +++ b/webview-ui/src/components/chat/ChatTextArea.tsx @@ -22,28 +22,23 @@ import { import { convertToMentionPath } from "@/utils/path-mentions" import { escapeHtml } from "@/utils/highlight" // kilocode_change - FIM autocomplete import { useChatGhostText } from "./hooks/useChatGhostText" // kilocode_change: FIM autocomplete -import { DropdownOptionType, Button, StandardTooltip } from "@/components/ui" // kilocode_change +import { DropdownOptionType, Button, StandardTooltip } from "@/components/ui" import Thumbnails from "../common/Thumbnails" import { ModeSelector } from "./ModeSelector" import KiloModeSelector from "../kilocode/KiloModeSelector" -import { KiloProfileSelector } from "../kilocode/chat/KiloProfileSelector" // kilocode_change +import { KiloProfileSelector } from "../kilocode/chat/KiloProfileSelector" import { MAX_IMAGES_PER_MESSAGE } from "./ChatView" import ContextMenu from "./ContextMenu" -import { ImageWarningBanner } from "./ImageWarningBanner" // kilocode_change -import { - VolumeX, - Pin, - Check, - // Image, // kilocode_change - WandSparkles, - SendHorizontal, - Paperclip, // kilocode_change - MessageSquareX, -} from "lucide-react" +import { ImageWarningBanner } from "./ImageWarningBanner" +import { VolumeX, Pin, Check, WandSparkles, SendHorizontal, Paperclip, MessageSquareX } from "lucide-react" import { IndexingStatusBadge } from "./IndexingStatusBadge" +import { MicrophoneButton } from "./MicrophoneButton" // kilocode_change: STT microphone button +import { VolumeVisualizer } from "./VolumeVisualizer" // kilocode_change: STT volume level visual +import { VoiceRecordingCursor } from "./VoiceRecordingCursor" // kilocode_change: STT recording cursor import { cn } from "@/lib/utils" import { usePromptHistory } from "./hooks/usePromptHistory" +import { useSTT } from "@/hooks/useSTT" // kilocode_change: STT hook // kilocode_change start: pull slash commands from Cline import SlashCommandMenu from "@/components/chat/SlashCommandMenu" @@ -165,6 +160,9 @@ export const ChatTextArea = forwardRef( taskHistoryVersion, // kilocode_change clineMessages, ghostServiceSettings, // kilocode_change + language, // User's VSCode display language + experiments, // kilocode_change: For speechToText experiment flag + speechToTextAvailable, // kilocode_change: Whether voice 
transcription is configured } = useExtensionState() // kilocode_change start - autocomplete profile type system @@ -216,7 +214,7 @@ export const ChatTextArea = forwardRef( resizeObserver.disconnect() } }, []) - // kilocode_change end + // kilocode_change end: Container width tracking for responsive UI const [searchLoading, setSearchLoading] = useState(false) const [searchRequestId, setSearchRequestId] = useState("") @@ -276,8 +274,8 @@ export const ChatTextArea = forwardRef( if (message.requestId === searchRequestId) { setFileSearchResults(message.results || []) } - // kilocode_change start } else if (message.type === "insertTextToChatArea") { + // kilocode_change if (message.text) { setInputValue(message.text) setTimeout(() => { @@ -287,26 +285,115 @@ export const ChatTextArea = forwardRef( }, 0) } } - // kilocode_change end } window.addEventListener("message", messageHandler) return () => window.removeEventListener("message", messageHandler) - }, [setInputValue, searchRequestId]) - + }, [setInputValue, searchRequestId, inputValue, onSend]) const [isDraggingOver, setIsDraggingOver] = useState(false) - // kilocode_change start: pull slash commands from Cline + // kilocode_change start: Slash commands state const [showSlashCommandsMenu, setShowSlashCommandsMenu] = useState(false) const [selectedSlashCommandsIndex, setSelectedSlashCommandsIndex] = useState(0) const [slashCommandsQuery, setSlashCommandsQuery] = useState("") const slashCommandsMenuContainerRef = useRef(null) - // kilocode_end + // kilocode_change end: Slash commands state const [textAreaBaseHeight, setTextAreaBaseHeight] = useState(undefined) const [showContextMenu, setShowContextMenu] = useState(false) const [cursorPosition, setCursorPosition] = useState(0) const [searchQuery, setSearchQuery] = useState("") const textAreaRef = useRef(null) const [isMouseDownOnMenu, setIsMouseDownOnMenu] = useState(false) + + // kilocode_change: Use STT (Speech-to-Text) hook + // Track input state when recording starts + const recordingStartStateRef = useRef<{ beforeCursor: string; afterCursor: string; position: number } | null>( + null, + ) + const { + isRecording, + segments, + volume: volumeLevel, + start: startSTT, + stop: stopSTT, + } = useSTT({ + onComplete: (text) => { + // Insert transcribed text at cursor position + if (recordingStartStateRef.current) { + const { beforeCursor, afterCursor } = recordingStartStateRef.current + const separator = beforeCursor && !beforeCursor.endsWith(" ") ? 
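+					// add a single space only when the text before the cursor doesn't
+					// already end with one, so the transcript never glues onto a word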
" " : "" + const newValue = beforeCursor + separator + text + afterCursor + setInputValue(newValue) + // Set cursor after inserted text + const newCursorPos = beforeCursor.length + separator.length + text.length + setCursorPosition(newCursorPos) + setIntendedCursorPosition(newCursorPos) + } else { + setInputValue(text) + } + recordingStartStateRef.current = null + }, + onError: (error) => { + console.error("STT error:", error) + recordingStartStateRef.current = null + }, + }) + + // Convert segments to text for display + const liveTranscript = useMemo(() => { + return segments.map((s) => s.text).join(" ") + }, [segments]) + + // Track preview ranges for highlighting + const previewRanges = useMemo(() => { + const ranges: { start: number; end: number }[] = [] + let offset = 0 + + for (let i = 0; i < segments.length; i++) { + const segment = segments[i] + if (segment.isPreview) { + ranges.push({ start: offset, end: offset + segment.text.length }) + } + offset += segment.text.length + // Add space offset except for the last segment + if (i < segments.length - 1) { + offset += 1 // Account for the space added by join(" ") + } + } + + // console.log("🎙️ [ChatTextArea] 🎨 previewRanges:", ranges, "from segments:", segments) + return ranges + }, [segments]) + + // Store cursor position and split input when recording starts + useEffect(() => { + if (isRecording && !recordingStartStateRef.current) { + const pos = textAreaRef.current?.selectionStart ?? inputValue.length + recordingStartStateRef.current = { + beforeCursor: inputValue.slice(0, pos), + afterCursor: inputValue.slice(pos), + position: pos, + } + } + }, [isRecording, inputValue]) + + const displayValue = useMemo(() => { + if (isRecording && liveTranscript && recordingStartStateRef.current) { + const { beforeCursor, afterCursor } = recordingStartStateRef.current + const separator = beforeCursor && !beforeCursor.endsWith(" ") ? " " : "" + return beforeCursor + separator + liveTranscript + afterCursor + } + return inputValue + }, [isRecording, liveTranscript, inputValue]) + + // Show cursor at insertion point during recording + const recordingCursorPosition = + isRecording && recordingStartStateRef.current + ? recordingStartStateRef.current.position + + (recordingStartStateRef.current.beforeCursor && + !recordingStartStateRef.current.beforeCursor.endsWith(" ") + ? 1 + : 0) + : 0 const highlightLayerRef = useRef(null) const shouldAutoScrollToCaretRef = useRef(false) // kilocode_change const [selectedMenuIndex, setSelectedMenuIndex] = useState(-1) @@ -315,8 +402,7 @@ export const ChatTextArea = forwardRef( const [intendedCursorPosition, setIntendedCursorPosition] = useState(null) const contextMenuContainerRef = useRef(null) const [isEnhancingPrompt, setIsEnhancingPrompt] = useState(false) - const [isFocused, setIsFocused] = useState(false) - const [imageWarning, setImageWarning] = useState(null) // kilocode_change + // const [isFocused, setIsFocused] = useState(false) // kilocode_change - not needed // kilocode_change start: FIM autocomplete ghost text const { ghostText, @@ -327,12 +413,13 @@ export const ChatTextArea = forwardRef( enableChatAutocomplete: ghostServiceSettings?.enableChatAutocomplete ?? 
false, }) // kilocode_change end: FIM autocomplete ghost text + const [imageWarning, setImageWarning] = useState(null) // kilocode_change // Use custom hook for prompt history navigation const { handleHistoryNavigation, resetHistoryNavigation, resetOnInputChange } = usePromptHistory({ clineMessages, - taskHistoryVersion, // kilocode_change - cwd, + taskHistoryVersion, + cwd, // kilocode_change inputValue, setInputValue, }) @@ -359,18 +446,27 @@ export const ChatTextArea = forwardRef( } }, [inputValue, setInputValue, t]) - // kilocode_change start: Image warning handlers - const showImageWarning = useCallback((messageKey: string) => { - setImageWarning(messageKey) - }, []) + // kilocode_change start: Image and speech handlers + const showImageWarning = useCallback( + (messageKey: string) => { + setImageWarning(messageKey) + }, + [setImageWarning], + ) const dismissImageWarning = useCallback(() => { setImageWarning(null) - }, []) - // kilocode_change end: Image warning handlers + }, [setImageWarning]) + + const handleMicrophoneClick = useCallback(() => { + if (isRecording) { + stopSTT() + } else { + startSTT(language || "en") // Pass user's language from extension state + } + }, [isRecording, startSTT, stopSTT, language]) - // kilocode_change start: Clear images if unsupported - // Track previous shouldDisableImages state to detect when model image support changes + // kilocode_change start: Auto-clear images when model changes to non-image-supporting const prevShouldDisableImages = useRef(shouldDisableImages) useEffect(() => { if (!prevShouldDisableImages.current && shouldDisableImages && selectedImages.length > 0) { @@ -379,7 +475,7 @@ export const ChatTextArea = forwardRef( } prevShouldDisableImages.current = shouldDisableImages }, [shouldDisableImages, selectedImages.length, setSelectedImages, showImageWarning]) - // kilocode_change end: Clear images if unsupported + // kilocode_change end: Auto-clear images when model changes to non-image-supporting const allModes = useMemo(() => getAllModes(customModes), [customModes]) @@ -425,8 +521,8 @@ export const ChatTextArea = forwardRef( const handleMentionSelect = useCallback( (type: ContextMenuOptionType, value?: string) => { - // kilocode_change start if (type === ContextMenuOptionType.Image) { + // kilocode_change start: Image selection handling // Close the context menu and remove the @character in this case setShowContextMenu(false) setSelectedType(null) @@ -445,8 +541,7 @@ export const ChatTextArea = forwardRef( // Call the image selection function onSelectImages() return - } - // kilocode_change end + } // kilocode_change end: Image selection handling if (type === ContextMenuOptionType.NoResults) { return @@ -516,8 +611,8 @@ export const ChatTextArea = forwardRef( [setInputValue, cursorPosition], ) - // kilocode_change start: pull slash commands from Cline const handleSlashCommandsSelect = useCallback( + // kilocode_change start: Slash command selection (command: SlashCommand) => { setShowSlashCommandsMenu(false) @@ -549,13 +644,13 @@ export const ChatTextArea = forwardRef( } }, [setInputValue, setMode, customModes], - ) - // kilocode_change end + ) // kilocode_change end: Slash command selection const handleKeyDown = useCallback( (event: React.KeyboardEvent) => { // kilocode_change start: pull slash commands from Cline if (showSlashCommandsMenu) { + // kilocode_change start: Slash command menu navigation if (event.key === "Escape") { setShowSlashCommandsMenu(false) return @@ -589,14 +684,13 @@ export const ChatTextArea = forwardRef( 
customModes, localWorkflows, globalWorkflows, - ) // kilocode_change + ) if (commands.length > 0) { handleSlashCommandsSelect(commands[selectedSlashCommandsIndex]) } return } - } - // kilocode_change end + } // kilocode_change end: Slash command menu navigation if (showContextMenu) { if (event.key === "Escape") { setShowContextMenu(false) @@ -671,8 +765,7 @@ export const ChatTextArea = forwardRef( const isComposing = event.nativeEvent?.isComposing ?? false - // kilocode_change start - const shouldSendMessage = + const shouldSendMessage = // kilocode_change start: Send message handling !isComposing && event.key === "Enter" && ((sendMessageOnEnter && !event.shiftKey) || (!sendMessageOnEnter && event.shiftKey)) @@ -695,8 +788,7 @@ export const ChatTextArea = forwardRef( // Handle prompt history navigation using custom hook if (handleHistoryNavigation(event, showContextMenu, isComposing)) { return - } - // kilocode_change end + } // kilocode_change end: Send message handling if (event.key === "Backspace" && !isComposing) { const charBeforeCursor = inputValue[cursorPosition - 1] @@ -743,8 +835,7 @@ export const ChatTextArea = forwardRef( } }, [ - // kilocode_change start - showSlashCommandsMenu, + showSlashCommandsMenu, // kilocode_change start localWorkflows, globalWorkflows, customModes, @@ -802,8 +893,7 @@ export const ChatTextArea = forwardRef( const newCursorPosition = target.selectionStart // Use target for consistency setCursorPosition(newCursorPosition) - // kilocode_change start: pull slash commands from Cline - let showMenu = shouldShowContextMenu(newValue, newCursorPosition) + let showMenu = shouldShowContextMenu(newValue, newCursorPosition) // kilocode_change start: Slash command menu logic const showSlashCommandsMenu = shouldShowSlashCommandsMenu(newValue, newCursorPosition) // we do not allow both menus to be shown at the same time @@ -812,13 +902,12 @@ export const ChatTextArea = forwardRef( showMenu = false } - setShowSlashCommandsMenu(showSlashCommandsMenu) - // kilocode_change end + setShowSlashCommandsMenu(showSlashCommandsMenu) // kilocode_change end: Slash command menu logic setShowContextMenu(showMenu) - // kilocode_change start: pull slash commands from Cline if (showSlashCommandsMenu) { + // kilocode_change start: Slash command query handling const slashIndex = newValue.indexOf("/") const query = newValue.slice(slashIndex + 1, newCursorPosition) setSlashCommandsQuery(query) @@ -826,15 +915,13 @@ export const ChatTextArea = forwardRef( } else { setSlashCommandsQuery("") setSelectedSlashCommandsIndex(0) - } - // kilocode_change end + } // kilocode_change end: Slash command query handling if (showMenu) { - // kilocode_change start - check lastAtIndex before handling slash commands const lastAtIndex = newValue.lastIndexOf("@", newCursorPosition - 1) - // if (newValue.startsWith("/")) { ⚠️ kilocode_change added lastAtIndex check if (newValue.startsWith("/") && lastAtIndex === -1) { + // kilocode_change: Prevent slash command conflict with mentions // Handle slash command. const query = newValue setSearchQuery(query) @@ -900,10 +987,10 @@ export const ChatTextArea = forwardRef( // Only hide the context menu if the user didn't click on it. 
if (!isMouseDownOnMenu) { setShowContextMenu(false) - setShowSlashCommandsMenu(false) // kilocode_change: pull slash commands from Cline - } + setShowSlashCommandsMenu(false) + } // kilocode_change - setIsFocused(false) + // setIsFocused(false) // kilocode_change - not needed }, [isMouseDownOnMenu]) const handlePaste = useCallback( @@ -943,8 +1030,8 @@ export const ChatTextArea = forwardRef( return type === "image" && acceptedTypes.includes(subtype) }) - // kilocode_change start: Image validation with warning messages if (imageItems.length > 0) { + // kilocode_change start: Image paste validation e.preventDefault() if (shouldDisableImages) { @@ -954,8 +1041,7 @@ export const ChatTextArea = forwardRef( if (selectedImages.length >= MAX_IMAGES_PER_MESSAGE) { showImageWarning(`kilocode:imageWarnings.maxImagesReached`) return - } - // kilocode_change end: Image validation with warning messages + } // kilocode_change end: Image paste validation const imagePromises = imageItems.map((item) => { return new Promise((resolve) => { @@ -999,8 +1085,8 @@ export const ChatTextArea = forwardRef( setInputValue, inputValue, t, - selectedImages.length, // kilocode_change - added selectedImages.length - showImageWarning, // kilocode_change - added showImageWarning + selectedImages.length, + showImageWarning, // kilocode_change ], ) @@ -1011,8 +1097,7 @@ export const ChatTextArea = forwardRef( const updateHighlights = useCallback(() => { if (!textAreaRef.current || !highlightLayerRef.current) return - // kilocode_change start: pull slash commands from Cline - let processedText = textAreaRef.current.value + let processedText = textAreaRef.current.value // kilocode_change start: Slash command highlighting processedText = processedText .replace(/\n$/, "\n\n") @@ -1039,6 +1124,25 @@ export const ChatTextArea = forwardRef( processedText.substring(0, slashIndex) + highlighted + processedText.substring(endIndex) } } + + // kilocode_change start - STT preview text highlighting + if (isRecording && previewRanges.length > 0 && recordingStartStateRef.current) { + const { beforeCursor } = recordingStartStateRef.current + const separator = beforeCursor && !beforeCursor.endsWith(" ") ? 
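+			// same separator rule as displayValue above, so baseOffset below lines up
+			// with exactly where the live transcript was spliced into the textarea value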
" " : "" + const baseOffset = beforeCursor.length + separator.length + for (let i = previewRanges.length - 1; i >= 0; i--) { + const range = previewRanges[i] + const start = baseOffset + range.start + const end = baseOffset + range.end + + const before = processedText.substring(0, start) + const previewText = processedText.substring(start, end) + const after = processedText.substring(end) + + processedText = before + `${previewText}` + after + } + } + // kilocode_change end - STT preview text highlighting // kilocode_change start - autocomplete ghost text display if (inputValue && ghostText) { processedText += `${escapeHtml(ghostText)}` @@ -1048,7 +1152,7 @@ export const ChatTextArea = forwardRef( highlightLayerRef.current.innerHTML = processedText highlightLayerRef.current.scrollTop = textAreaRef.current.scrollTop highlightLayerRef.current.scrollLeft = textAreaRef.current.scrollLeft - }, [customModes, ghostText, inputValue]) // kilocode_change - add inputValue + }, [customModes, ghostText, inputValue, isRecording, previewRanges]) // kilocode_change - merged dependencies useLayoutEffect(() => { updateHighlights() @@ -1075,7 +1179,7 @@ export const ChatTextArea = forwardRef( return () => cancelAnimationFrame(rafId) // kilocode_change end - }, [inputValue, updateHighlights]) + }, [inputValue, liveTranscript, updateHighlights]) const updateCursorPosition = useCallback(() => { if (textAreaRef.current) { @@ -1391,11 +1495,9 @@ export const ChatTextArea = forwardRef( "font-vscode-font-family", "text-vscode-editor-font-size", "leading-vscode-editor-line-height", - isFocused - ? "border border-vscode-focusBorder outline outline-vscode-focusBorder" - : isDraggingOver - ? "border-2 border-dashed border-vscode-focusBorder" - : "border border-transparent", + isDraggingOver + ? "border-2 border-dashed border-vscode-focusBorder" + : "border border-transparent", isEditMode ? "pt-1.5 pb-10 px-2" : "py-1.5 px-2", "px-[8px]", "pr-9", @@ -1416,12 +1518,15 @@ export const ChatTextArea = forwardRef( } textAreaRef.current = el }} - value={inputValue} + value={displayValue} onChange={(e) => { - handleInputChange(e) - updateHighlights() + // During recording, ignore changes to prevent cursor jumping + if (!isRecording) { + handleInputChange(e) + updateHighlights() + } }} - onFocus={() => setIsFocused(true)} + // onFocus={() => setIsFocused(true)} // kilocode_change - not needed onKeyDown={(e) => { // Handle ESC to cancel in edit mode if (isEditMode && e.key === "Escape" && !e.nativeEvent?.isComposing) { @@ -1443,11 +1548,18 @@ export const ChatTextArea = forwardRef( onHeightChange?.(height) }} - // kilocode_change: combine placeholderText and placeholderBottomText here + // kilocode_change: use regular placeholder, streaming text goes to actual input placeholder={`${placeholderText}\n${placeholderBottomText}`} minRows={3} maxRows={15} autoFocus={true} + // kilocode_change start - isRecording active + style={{ + border: isRecording + ? "1px solid var(--vscode-editorError-foreground)" + : "1px solid transparent", + }} + // kilocode_change end - isRecording active className={cn( "w-full", "text-vscode-input-foreground", @@ -1456,11 +1568,14 @@ export const ChatTextArea = forwardRef( "leading-vscode-editor-line-height", "cursor-text", isEditMode ? "pt-1.5 pb-10 px-2" : "py-1.5 px-2", - isFocused - ? "border border-vscode-focusBorder outline outline-vscode-focusBorder" - : isDraggingOver - ? 
"border-2 border-dashed border-vscode-focusBorder" - : "border border-transparent", + // kilocode_change start - removing duplicated border + isRecording && "focus:outline-0", + // isFocused + // ? "border border-vscode-focusBorder outline outline-vscode-focusBorder" + // : isDraggingOver + // ? "border-2 border-dashed border-vscode-focusBorder" + // : "border border-transparent", + // kilocode_change end - removing duplicated border isDraggingOver ? "bg-[color-mix(in_srgb,var(--vscode-input-background)_95%,var(--vscode-focusBorder))]" : "bg-vscode-input-background", @@ -1487,6 +1602,13 @@ export const ChatTextArea = forwardRef( aria-hidden="true" /> + {/* kilocode_change: Visual cursor indicator during voice recording */} + + {isTtsPlaying && ( )} - - - + + {/* kilocode_change start: Show microphone button only if experiment enabled */} + {experiments?.speechToText && ( + + )} + {/* kilocode_change end */} + + {inputValue.trim() !== "" && ( + + + + )} {/* kilocode_change end */} diff --git a/webview-ui/src/components/chat/MicrophoneButton.tsx b/webview-ui/src/components/chat/MicrophoneButton.tsx new file mode 100644 index 00000000000..20a274fa32f --- /dev/null +++ b/webview-ui/src/components/chat/MicrophoneButton.tsx @@ -0,0 +1,53 @@ +// kilocode_change - new file: Microphone button component for speech-to-text recording +import React from "react" +import { Mic, Square } from "lucide-react" +import { useTranslation } from "react-i18next" +import { StandardTooltip } from "@/components/ui" +import { cn } from "@/lib/utils" + +interface MicrophoneButtonProps { + isRecording: boolean + onClick: () => void + containerWidth?: number + disabled?: boolean + tooltipContent?: string +} + +export const MicrophoneButton: React.FC = ({ + isRecording, + onClick, + containerWidth, + disabled = false, + tooltipContent, +}) => { + const { t } = useTranslation() + + const defaultTooltip = isRecording + ? 
t("kilocode:speechToText.stopRecording") + : t("kilocode:speechToText.startRecording") + + return ( + + + + ) +} diff --git a/webview-ui/src/components/chat/VoiceRecordingCursor.tsx b/webview-ui/src/components/chat/VoiceRecordingCursor.tsx new file mode 100644 index 00000000000..962b5400af5 --- /dev/null +++ b/webview-ui/src/components/chat/VoiceRecordingCursor.tsx @@ -0,0 +1,91 @@ +// kilocode_change - new file: Visual cursor indicator during voice recording +import React, { useEffect, useState, useRef } from "react" +import { cn } from "@/lib/utils" + +interface VoiceRecordingCursorProps { + textAreaRef: React.RefObject + cursorPosition: number + isVisible: boolean +} + +export const VoiceRecordingCursor: React.FC = ({ + textAreaRef, + cursorPosition, + isVisible, +}) => { + const [position, setPosition] = useState<{ top: number; left: number } | null>(null) + const cursorRef = useRef(null) + + useEffect(() => { + if (!isVisible || !textAreaRef.current) { + setPosition(null) + return + } + + const textarea = textAreaRef.current + const text = textarea.value + + // Create a temporary div to measure text position + const measureDiv = document.createElement("div") + const computedStyle = window.getComputedStyle(textarea) + + // Copy textarea styles to measure div + measureDiv.style.cssText = ` + position: absolute; + visibility: hidden; + white-space: pre-wrap; + word-wrap: break-word; + font-family: ${computedStyle.fontFamily}; + font-size: ${computedStyle.fontSize}; + font-weight: ${computedStyle.fontWeight}; + line-height: ${computedStyle.lineHeight}; + letter-spacing: ${computedStyle.letterSpacing}; + padding: ${computedStyle.padding}; + border: ${computedStyle.border}; + width: ${textarea.clientWidth}px; + ` + + document.body.appendChild(measureDiv) + + // Insert text up to cursor position + const textBeforeCursor = text.substring(0, cursorPosition) + measureDiv.textContent = textBeforeCursor + + // Add a span to measure cursor position + const cursorSpan = document.createElement("span") + cursorSpan.textContent = "|" + measureDiv.appendChild(cursorSpan) + + // Get position relative to the measure div + const spanRect = cursorSpan.getBoundingClientRect() + const textareaRect = textarea.getBoundingClientRect() + + // Calculate position relative to textarea + const top = spanRect.top - textareaRect.top + textarea.scrollTop + const left = spanRect.left - textareaRect.left + textarea.scrollLeft + + document.body.removeChild(measureDiv) + + setPosition({ top, left }) + }, [textAreaRef, cursorPosition, isVisible]) + + if (!isVisible || !position) { + return null + } + + return ( +
+ ) +} diff --git a/webview-ui/src/components/chat/VolumeVisualizer.tsx b/webview-ui/src/components/chat/VolumeVisualizer.tsx new file mode 100644 index 00000000000..bfb1001125f --- /dev/null +++ b/webview-ui/src/components/chat/VolumeVisualizer.tsx @@ -0,0 +1,167 @@ +// kilocode_change - new file: Volume visualizer component for microphone input +import { useEffect, useRef, useState } from "react" +import { cn } from "@/lib/utils" + +export interface VolumeVisualizerProps { + /** Volume level from 0 to 1 */ + volume: number + /** Whether recording is active */ + isActive?: boolean + /** Custom className */ + className?: string +} + +const BAR_COUNT = 5 +const BAR_WIDTH = 2 +const BAR_GAP = 2 +const MAX_HEIGHT = 16 +const MIN_HEIGHT_PERCENT = 10 +const EASING = 0.15 +const ANIMATION_THRESHOLD = 0.001 + +// Energy normalization constants +// Based on real FFmpeg PCM16 energy values which typically range 0.02-0.10 +const ENERGY_MIN = 0.02 // Voice detection threshold +const ENERGY_MAX = 0.12 // Realistic maximum for normal speech (reduced for more sensitivity) +const ENERGY_SCALE = 2.0 // Base amplification (increased from 1.5) +const ENERGY_CURVE = 0.6 // Exponential curve (<1 = boost quiet sounds more) + +interface AnimationState { + targetHeights: number[] + currentHeights: number[] + frameId: number | null +} + +/** + * Normalize raw energy (0.02-0.15) to visual range (0-1) + * Uses exponential scaling to make quiet sounds more visible + * + * Curve explanation: + * - Linear: normalized * scale + * - Exponential (curve<1): pow(normalized, 0.6) - boosts low values more + * - Result: Tiny sounds near threshold get animated, loud sounds still natural + */ +function normalizeEnergy(rawEnergy: number): number { + // Map 0.02-0.12 range to 0-1 + const normalized = Math.max(0, (rawEnergy - ENERGY_MIN) / (ENERGY_MAX - ENERGY_MIN)) + + // Apply exponential curve (0.6 power boosts low values) + // Example: 0.1 linear → 0.25 exponential (2.5x boost for quiet sounds) + const curved = Math.pow(normalized, ENERGY_CURVE) + + // Apply final scaling and clamp + const amplified = Math.min(1, curved * ENERGY_SCALE) + return amplified +} + +function calculateTargetHeights(volume: number): number[] { + // Normalize the raw energy value for better visual response + const normalizedVolume = normalizeEnergy(volume) + + const centerIndex = Math.floor(BAR_COUNT / 2) + return Array.from({ length: BAR_COUNT }, (_, i) => { + const distanceFromCenter = Math.abs(i - centerIndex) + const heightMultiplier = 1 - distanceFromCenter * 0.15 + const randomness = 0.85 + Math.random() * 0.15 + return normalizedVolume * heightMultiplier * randomness + }) +} + +/** + * VolumeVisualizer - Animated vertical bars that respond to audio volume + * + * Features: + * - 5 vertical bars with staggered heights based on volume + * - Smooth spring-like animation with easing + * - Yellow color when active, gray when inactive + * - Responsive to volume changes (0-1 scale) + * - Uses REAL audio energy from FFmpeg PCM16 analysis + */ +export function VolumeVisualizer({ volume, isActive = true, className }: VolumeVisualizerProps) { + const [barHeights, setBarHeights] = useState(new Array(BAR_COUNT).fill(MIN_HEIGHT_PERCENT)) + const volumeRef = useRef(volume) + const animationRef = useRef({ + targetHeights: new Array(BAR_COUNT).fill(0), + currentHeights: new Array(BAR_COUNT).fill(0), + frameId: null, + }) + + // Update volume ref without triggering re-render + useEffect(() => { + volumeRef.current = volume + }, [volume]) + + useEffect(() => { + 
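+		// Read the freshest volume from volumeRef inside the rAF loop so frequent
+		// stt:volume updates animate the bars without tearing down this effect.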
if (!isActive) { + // Reset to minimum when inactive + setBarHeights(new Array(BAR_COUNT).fill(MIN_HEIGHT_PERCENT)) + return + } + + const state = animationRef.current + + const animate = () => { + if (!isActive) { + state.frameId = null + return + } + + // Use REAL volume from ref (updated every frame without restarting animation) + state.targetHeights = calculateTargetHeights(volumeRef.current) + + const newHeights = state.currentHeights.map((current, i) => { + const target = state.targetHeights[i] + const diff = target - current + + if (Math.abs(diff) > ANIMATION_THRESHOLD) { + return current + diff * EASING + } + + return current + }) + + state.currentHeights = newHeights + setBarHeights(newHeights.map((h) => Math.max(MIN_HEIGHT_PERCENT, h * 100))) + state.frameId = requestAnimationFrame(animate) + } + + state.frameId = requestAnimationFrame(animate) + + return () => { + if (state.frameId !== null) { + cancelAnimationFrame(state.frameId) + state.frameId = null + } + } + }, [isActive]) // Removed volume from dependencies - animation runs continuously + + return ( +
+ {barHeights.map((height, i) => ( +
+ ))} +
+	)
+}
diff --git a/webview-ui/src/context/ExtensionStateContext.tsx b/webview-ui/src/context/ExtensionStateContext.tsx
index 39938b4127f..ffbb5bb16fc 100644
--- a/webview-ui/src/context/ExtensionStateContext.tsx
+++ b/webview-ui/src/context/ExtensionStateContext.tsx
@@ -98,6 +98,7 @@ export interface ExtensionStateContextType extends ExtensionState {
	setCustomCondensingPrompt: (value: string) => void
	yoloGatekeeperApiConfigId?: string // kilocode_change: AI gatekeeper for YOLO mode
	setYoloGatekeeperApiConfigId: (value: string) => void // kilocode_change: AI gatekeeper for YOLO mode
+	speechToTextAvailable?: boolean // kilocode_change: Whether voice transcription is fully configured
	marketplaceItems?: any[]
	marketplaceInstalledMetadata?: MarketplaceInstalledMetadata
	profileThresholds: Record<string, number>
diff --git a/webview-ui/src/context/__tests__/ExtensionStateContext.spec.tsx b/webview-ui/src/context/__tests__/ExtensionStateContext.spec.tsx
index 76d8c67df19..28f64fd3a5f 100644
--- a/webview-ui/src/context/__tests__/ExtensionStateContext.spec.tsx
+++ b/webview-ui/src/context/__tests__/ExtensionStateContext.spec.tsx
@@ -281,6 +281,7 @@ describe("mergeExtensionState", () => {
		multiFileApplyDiff: true,
		preventFocusDisruption: false,
		morphFastApply: false, // kilocode_change
+		speechToText: false, // kilocode_change
		newTaskRequireTodos: false,
		imageGeneration: false,
		runSlashCommand: false,
@@ -302,6 +303,7 @@
		multiFileApplyDiff: true,
		preventFocusDisruption: false,
		morphFastApply: false, // kilocode_change
+		speechToText: false, // kilocode_change
		newTaskRequireTodos: false,
		imageGeneration: false,
		runSlashCommand: false,
diff --git a/webview-ui/src/hooks/useSTT.ts b/webview-ui/src/hooks/useSTT.ts
new file mode 100644
index 00000000000..a30288f0ae4
--- /dev/null
+++ b/webview-ui/src/hooks/useSTT.ts
@@ -0,0 +1,133 @@
+// kilocode_change - new file: React hook for STT (Speech-to-Text) functionality
+import { useState, useEffect, useCallback, useRef } from "react"
+import { vscode } from "../utils/vscode"
+import { STTSegment } from "../../../src/shared/sttContract"
+
+export interface UseSTTOptions {
+	/** Called when recording completes with final text */
+	onComplete?: (text: string) => void
+	/** Called on error */
+	onError?: (error: string) => void
+}
+
+export interface UseSTTReturn {
+	/** Whether currently recording */
+	isRecording: boolean
+	/** Transcript segments (complete state from extension) */
+	segments: STTSegment[]
+	/** Current volume level 0-1 */
+	volume: number
+	/** Start recording */
+	start: (language?: string) => void
+	/** Stop recording and finalize */
+	stop: () => void
+	/** Cancel recording and discard */
+	cancel: () => void
+}
+
+/**
+ * Hook for Speech-to-Text functionality
+ *
+ * Usage:
+ * ```tsx
+ * const { isRecording, segments, start, stop } = useSTT({
+ *   onComplete: (text) => {
+ *     setInputValue(prev => prev + " " + text)
+ *   }
+ * })
+ * ```
+ */
+export function useSTT(options: UseSTTOptions = {}): UseSTTReturn {
+	const { onComplete, onError } = options
+
+	const [isRecording, setIsRecording] = useState(false)
+	const [segments, setSegments] = useState<STTSegment[]>([])
+	const [volume, setVolume] = useState(0)
+
+	// Track session to ignore stale events
+	const sessionIdRef = useRef<string | null>(null)
+	// Use ref to avoid stale closure - segments must be current when stt:stopped fires
+	const segmentsRef = useRef<STTSegment[]>([])
+
+	useEffect(() => {
+		segmentsRef.current = segments
+	}, [segments])
+
+	useEffect(() => {
+		const handler = (event:
MessageEvent) => { + const msg = event.data + + // Only handle STT events + if (!msg.type?.startsWith("stt:")) return + + switch (msg.type) { + case "stt:started": + sessionIdRef.current = msg.sessionId + setIsRecording(true) + setSegments([]) + break + + case "stt:transcript": + // Ignore events from old sessions + if (msg.sessionId !== sessionIdRef.current) return + // Just pass through the segments from extension (stateless) + console.log("🎙️ [useSTT WebView] 📨 Received segments:", JSON.stringify(msg.segments, null, 2)) + setSegments(msg.segments || []) + break + + case "stt:volume": + if (msg.sessionId !== sessionIdRef.current) return + setVolume(msg.level) + break + + case "stt:stopped": + if (msg.sessionId !== sessionIdRef.current) return + + setIsRecording(false) + setVolume(0) + + if (msg.reason === "completed") { + // Get final text from most recent segments (via ref to avoid stale closure) + const finalText = segmentsRef.current + .map((s) => s.text) + .join(" ") + .trim() + if (finalText) { + onComplete?.(finalText) + } + } else if (msg.reason === "error" && msg.error) { + onError?.(msg.error) + } + + // Clear segments + setSegments([]) + sessionIdRef.current = null + break + } + } + + window.addEventListener("message", handler) + return () => window.removeEventListener("message", handler) + }, [onComplete, onError]) + + const start = useCallback((language?: string) => { + vscode.postMessage({ type: "stt:start", language }) + }, []) + + const stop = useCallback(() => { + vscode.postMessage({ type: "stt:stop" }) + }, []) + + const cancel = useCallback(() => { + vscode.postMessage({ type: "stt:cancel" }) + }, []) + + return { + isRecording, + segments, + volume, + start, + stop, + cancel, + } +} diff --git a/webview-ui/src/i18n/locales/ar/kilocode.json b/webview-ui/src/i18n/locales/ar/kilocode.json index d7147b49e26..4cf692020e5 100644 --- a/webview-ui/src/i18n/locales/ar/kilocode.json +++ b/webview-ui/src/i18n/locales/ar/kilocode.json @@ -310,5 +310,9 @@ "tryAgain": "حاول مرة أخرى", "cancelled": "تم إلغاء المصادقة", "initiating": "جاري بدء المصادقة..." + }, + "speechToText": { + "stopRecording": "إيقاف الإدخال الصوتي", + "startRecording": "ابدأ الإدخال الصوتي" } } diff --git a/webview-ui/src/i18n/locales/ar/settings.json b/webview-ui/src/i18n/locales/ar/settings.json index 39a01d22268..35a32309087 100644 --- a/webview-ui/src/i18n/locales/ar/settings.json +++ b/webview-ui/src/i18n/locales/ar/settings.json @@ -610,7 +610,8 @@ "minimal": "الحد الأدنى (الأسرع)", "low": "منخفض", "medium": "متوسط", - "high": "عالي" + "high": "عالي", + "xhigh": "عالي جداً" }, "verbosity": { "label": "مستوى التفصيل في المخرجات", @@ -1084,6 +1085,10 @@ "INLINE_ASSIST": { "name": "Autocomplete", "description": "تمكين ميزات الإكمال التلقائي للحصول على اقتراحات كود سريعة وتحسينات مباشرة في محررك. يتضمن المهمة السريعة (Cmd+I) للتغييرات المستهدفة والإكمال التلقائي للتحسينات السياقية." + }, + "SPEECH_TO_TEXT": { + "name": "التحويل من الكلام إلى النص (STT)", + "description": "عند التفعيل، يمكن لـ Kilo Code تحويل صوتك إلى نص باستخدام نموذج Whisper من OpenAI. انقر على زر الميكروفون في المحادثة لبدء التسجيل. يتطلب تثبيت FFmpeg ومزود API OpenAI مُكوَّن." 
} }, "promptCaching": { diff --git a/webview-ui/src/i18n/locales/ca/kilocode.json b/webview-ui/src/i18n/locales/ca/kilocode.json index 42aa86c213f..e741e5bff30 100644 --- a/webview-ui/src/i18n/locales/ca/kilocode.json +++ b/webview-ui/src/i18n/locales/ca/kilocode.json @@ -311,5 +311,9 @@ "tryAgain": "Torna-ho a provar", "cancelled": "Autenticació cancel·lada", "initiating": "Iniciant autenticació..." + }, + "speechToText": { + "stopRecording": "Atura l'entrada de veu", + "startRecording": "Inicia l'entrada de veu" } } diff --git a/webview-ui/src/i18n/locales/ca/settings.json b/webview-ui/src/i18n/locales/ca/settings.json index 7de5287cfdd..01d4bfabe1f 100644 --- a/webview-ui/src/i18n/locales/ca/settings.json +++ b/webview-ui/src/i18n/locales/ca/settings.json @@ -574,7 +574,8 @@ "minimal": "Mínim (el més ràpid)", "high": "Alt", "medium": "Mitjà", - "low": "Baix" + "low": "Baix", + "xhigh": "Molt Alt" }, "verbosity": { "label": "Verbositat de la sortida", @@ -980,6 +981,10 @@ "MULTIPLE_NATIVE_TOOL_CALLS": { "name": "Crides paral·leles a eines", "description": "Quan està activat, el protocol natiu pot executar múltiples eines en un sol torn de missatge de l'assistent." + }, + "SPEECH_TO_TEXT": { + "name": "Veu a text (STT)", + "description": "Quan està activat, Kilo Code pot transcriure la teva veu a text utilitzant el model Whisper d'OpenAI. Fes clic al botó del micròfon al xat per començar a gravar. Requereix que FFmpeg estigui instal·lat i un proveïdor d'API d'OpenAI configurat." } }, "promptCaching": { diff --git a/webview-ui/src/i18n/locales/cs/kilocode.json b/webview-ui/src/i18n/locales/cs/kilocode.json index f4956b678e0..d01801b4cbe 100644 --- a/webview-ui/src/i18n/locales/cs/kilocode.json +++ b/webview-ui/src/i18n/locales/cs/kilocode.json @@ -311,5 +311,9 @@ "tryAgain": "Zkusit znovu", "cancelled": "Ověření zrušeno", "initiating": "Spouštění ověření..." + }, + "speechToText": { + "startRecording": "Spustit hlasový vstup", + "stopRecording": "Zastavit hlasový vstup" } } diff --git a/webview-ui/src/i18n/locales/cs/settings.json b/webview-ui/src/i18n/locales/cs/settings.json index eead17839ad..9a8c2f4387b 100644 --- a/webview-ui/src/i18n/locales/cs/settings.json +++ b/webview-ui/src/i18n/locales/cs/settings.json @@ -601,7 +601,8 @@ "minimal": "Minimální (nejrychlejší)", "low": "Nízké", "medium": "Střední", - "high": "Vysoké" + "high": "Vysoké", + "xhigh": "Extra vysoké" }, "verbosity": { "label": "Podrobnost výstupu", @@ -1060,6 +1061,10 @@ "MULTIPLE_NATIVE_TOOL_CALLS": { "name": "Paralelní volání nástrojů", "description": "Pokud je povoleno, nativní protokol může provádět více nástrojů v jednom obratu zprávy asistenta." + }, + "SPEECH_TO_TEXT": { + "name": "Převod řeči na text (STT)", + "description": "Když je povoleno, Kilo Code může přepsat váš hlas do textu pomocí modelu Whisper od OpenAI. Klikněte na tlačítko mikrofonu v chatu pro zahájení nahrávání. Vyžaduje nainstalovaný FFmpeg a nakonfigurovaného poskytovatele OpenAI API." } }, "promptCaching": { diff --git a/webview-ui/src/i18n/locales/de/kilocode.json b/webview-ui/src/i18n/locales/de/kilocode.json index aa3261c28f7..27f2589c360 100644 --- a/webview-ui/src/i18n/locales/de/kilocode.json +++ b/webview-ui/src/i18n/locales/de/kilocode.json @@ -310,5 +310,9 @@ "tryAgain": "Erneut versuchen", "cancelled": "Authentifizierung abgebrochen", "initiating": "Authentifizierung wird gestartet..." 
+ }, + "speechToText": { + "startRecording": "Spracheingabe starten", + "stopRecording": "Spracheingabe stoppen" } } diff --git a/webview-ui/src/i18n/locales/de/settings.json b/webview-ui/src/i18n/locales/de/settings.json index ab9c17a74f2..03e9db762a4 100644 --- a/webview-ui/src/i18n/locales/de/settings.json +++ b/webview-ui/src/i18n/locales/de/settings.json @@ -583,7 +583,8 @@ "minimal": "Minimal (schnellste)", "high": "Hoch", "medium": "Mittel", - "low": "Niedrig" + "low": "Niedrig", + "xhigh": "Sehr hoch" }, "verbosity": { "label": "Ausgabe-Ausführlichkeit", @@ -976,6 +977,10 @@ "MULTIPLE_NATIVE_TOOL_CALLS": { "name": "Parallele Tool-Aufrufe", "description": "Wenn aktiviert, kann das native Protokoll mehrere Tools in einer einzigen Assistenten-Nachrichtenrunde ausführen." + }, + "SPEECH_TO_TEXT": { + "name": "Sprache-zu-Text (STT)", + "description": "Wenn aktiviert, kann Kilo Code Ihre Sprache mit OpenAIs Whisper-Modell in Text umwandeln. Klicken Sie auf das Mikrofon-Symbol im Chat, um die Aufnahme zu starten. Erfordert eine installierte FFmpeg-Version und einen konfigurierten OpenAI API-Anbieter." } }, "promptCaching": { diff --git a/webview-ui/src/i18n/locales/en/kilocode.json b/webview-ui/src/i18n/locales/en/kilocode.json index 21f85eaf338..dd10300f0c1 100644 --- a/webview-ui/src/i18n/locales/en/kilocode.json +++ b/webview-ui/src/i18n/locales/en/kilocode.json @@ -309,5 +309,9 @@ "tryAgain": "Try Again", "cancelled": "Authentication Cancelled", "initiating": "Starting authentication..." + }, + "speechToText": { + "startRecording": "Start voice input", + "stopRecording": "Stop voice input" } } diff --git a/webview-ui/src/i18n/locales/en/settings.json b/webview-ui/src/i18n/locales/en/settings.json index 06b429f95ef..7005f3aaaa4 100644 --- a/webview-ui/src/i18n/locales/en/settings.json +++ b/webview-ui/src/i18n/locales/en/settings.json @@ -591,7 +591,8 @@ "minimal": "Minimal (Fastest)", "low": "Low", "medium": "Medium", - "high": "High" + "high": "High", + "xhigh": "Extra High" }, "verbosity": { "label": "Output Verbosity", @@ -956,6 +957,10 @@ "name": "Enable concurrent file edits", "description": "When enabled, Kilo Code can edit multiple files in a single request. When disabled, Kilo Code must edit files one at a time. Disabling this can help when working with less capable models or when you want more control over file modifications." }, + "SPEECH_TO_TEXT": { + "name": "Speech-to-Text (STT)", + "description": "When enabled, Kilo Code can transcribe your voice into text using OpenAI's Whisper model. Click the microphone button in the chat to start recording. Requires FFmpeg to be installed and a configured OpenAI API provider." + }, "MORPH_FAST_APPLY": { "name": "Enable Fast Apply", "description": "When enabled, Kilo Code can edit files using Fast Apply with specialized models optimized for code modifications. Requires the Kilo Gateway Provider, OpenRouter, or a Morph API key.", diff --git a/webview-ui/src/i18n/locales/es/kilocode.json b/webview-ui/src/i18n/locales/es/kilocode.json index 3ef0b498525..6f3b5a838e2 100644 --- a/webview-ui/src/i18n/locales/es/kilocode.json +++ b/webview-ui/src/i18n/locales/es/kilocode.json @@ -311,5 +311,9 @@ "tryAgain": "Intentar de nuevo", "cancelled": "Autenticación cancelada", "initiating": "Iniciando autenticación..." 
+ }, + "speechToText": { + "startRecording": "Iniciar entrada de voz", + "stopRecording": "Detener entrada de voz" } } diff --git a/webview-ui/src/i18n/locales/es/settings.json b/webview-ui/src/i18n/locales/es/settings.json index 0ee9baddf50..72d4e8990e3 100644 --- a/webview-ui/src/i18n/locales/es/settings.json +++ b/webview-ui/src/i18n/locales/es/settings.json @@ -210,17 +210,6 @@ "label": "Eliminar", "description": "Eliminar archivos y directorios automáticamente sin requerir aprobación" }, - "browser": { - "delayLabel": "Retraso después de escritura para permitir que los diagnósticos detecten posibles problemas", - "outsideWorkspace": { - "label": "Incluir archivos fuera del espacio de trabajo", - "description": "Permitir a Kilo Code crear y editar archivos fuera del espacio de trabajo actual sin requerir aprobación." - }, - "protected": { - "label": "Incluir archivos protegidos", - "description": "Permitir a Kilo Code crear y editar archivos protegidos (como .kilocodeignore y archivos de configuración .kilocode/) sin requerir aprobación." - } - }, "browser": { "label": "Navegador", "description": "Realizar acciones del navegador automáticamente sin requerir aprobación. Nota: Solo se aplica cuando el modelo admite el uso del ordenador" @@ -568,7 +557,8 @@ "minimal": "Mínimo (el más rápido)", "high": "Alto", "medium": "Medio", - "low": "Bajo" + "low": "Bajo", + "xhigh": "Extra Alto" }, "verbosity": { "label": "Verbosidad de la salida", @@ -991,6 +981,10 @@ "MULTIPLE_NATIVE_TOOL_CALLS": { "name": "Llamadas paralelas a herramientas", "description": "Cuando está habilitado, el protocolo nativo puede ejecutar múltiples herramientas en un solo turno de mensaje del asistente." + }, + "SPEECH_TO_TEXT": { + "name": "Voz a Texto (STT)", + "description": "Cuando esté habilitado, Kilo Code puede transcribir tu voz a texto usando el modelo Whisper de OpenAI. Haz clic en el botón del micrófono en el chat para comenzar a grabar. Requiere que FFmpeg esté instalado y un proveedor de API de OpenAI configurado." } }, "promptCaching": { diff --git a/webview-ui/src/i18n/locales/fr/kilocode.json b/webview-ui/src/i18n/locales/fr/kilocode.json index 813b4becd3f..b7f9f613ab2 100644 --- a/webview-ui/src/i18n/locales/fr/kilocode.json +++ b/webview-ui/src/i18n/locales/fr/kilocode.json @@ -311,5 +311,9 @@ "tryAgain": "Réessayer", "cancelled": "Authentification annulée", "initiating": "Démarrage de l'authentification..." + }, + "speechToText": { + "startRecording": "Démarrer la saisie vocale", + "stopRecording": "Arrêter la saisie vocale" } } diff --git a/webview-ui/src/i18n/locales/fr/settings.json b/webview-ui/src/i18n/locales/fr/settings.json index f2072d845f1..8cf42c74a43 100644 --- a/webview-ui/src/i18n/locales/fr/settings.json +++ b/webview-ui/src/i18n/locales/fr/settings.json @@ -557,7 +557,8 @@ "minimal": "Minimal (le plus rapide)", "high": "Élevé", "medium": "Moyen", - "low": "Faible" + "low": "Faible", + "xhigh": "Très élevé" }, "verbosity": { "label": "Verbosité de la sortie", @@ -980,6 +981,10 @@ "MULTIPLE_NATIVE_TOOL_CALLS": { "name": "Appels d'outils parallèles", "description": "Lorsqu'activé, le protocole natif peut exécuter plusieurs outils en un seul tour de message d'assistant." + }, + "SPEECH_TO_TEXT": { + "name": "Reconnaissance vocale (STT)", + "description": "Lorsque cette option est activée, Kilo Code peut transcrire votre voix en texte en utilisant le modèle Whisper d'OpenAI. Cliquez sur le bouton microphone dans le chat pour commencer l'enregistrement. 
Nécessite l'installation de FFmpeg et un fournisseur d'API OpenAI configuré." } }, "promptCaching": { diff --git a/webview-ui/src/i18n/locales/hi/kilocode.json b/webview-ui/src/i18n/locales/hi/kilocode.json index be7f045f456..7c1caf5ea7a 100644 --- a/webview-ui/src/i18n/locales/hi/kilocode.json +++ b/webview-ui/src/i18n/locales/hi/kilocode.json @@ -311,5 +311,9 @@ "tryAgain": "पुनः प्रयास करें", "cancelled": "प्रमाणीकरण रद्द किया गया", "initiating": "प्रमाणीकरण शुरू हो रहा है..." + }, + "speechToText": { + "stopRecording": "वॉइस इनपुट बंद करें", + "startRecording": "वॉयस इनपुट शुरू करें" } } diff --git a/webview-ui/src/i18n/locales/hi/settings.json b/webview-ui/src/i18n/locales/hi/settings.json index abe04d24924..59e85ab6950 100644 --- a/webview-ui/src/i18n/locales/hi/settings.json +++ b/webview-ui/src/i18n/locales/hi/settings.json @@ -197,7 +197,6 @@ "label": "लिखें", "description": "अनुमोदन की आवश्यकता के बिना स्वचालित रूप से फाइलें बनाएँ और संपादित करें", "delayLabel": "लिखने के बाद विलंब ताकि डायग्नोस्टिक संभावित समस्याओं का पता लगा सकें", - "delayLabel": "लिखने के बाद विलंब ताकि डायग्नोस्टिक संभावित समस्याओं का पता लगा सकें", "outsideWorkspace": { "label": "वर्कस्पेस के बाहर की फाइलें शामिल करें", "description": "Kilo Code को अनुमोदन की आवश्यकता के बिना वर्तमान वर्कस्पेस के बाहर फाइलें बनाने और संपादित करने की अनुमति दें।" @@ -575,7 +574,8 @@ "minimal": "न्यूनतम (सबसे तेज़)", "high": "उच्च", "medium": "मध्यम", - "low": "निम्न" + "low": "निम्न", + "xhigh": "अत्यंत उच्च" }, "verbosity": { "label": "आउटपुट वर्बोसिटी", @@ -982,6 +982,10 @@ "MULTIPLE_NATIVE_TOOL_CALLS": { "name": "समानांतर टूल कॉल", "description": "सक्षम होने पर, नेटिव प्रोटोकॉल एकल सहायक संदेश टर्न में एकाधिक टूल निष्पादित कर सकता है।" + }, + "SPEECH_TO_TEXT": { + "name": "वाक्-से-पाठ (STT)", + "description": "जब सक्षम किया जाता है, तो Kilo Code OpenAI के Whisper मॉडल का उपयोग करके आपकी आवाज़ को टेक्स्ट में बदल सकता है। रिकॉर्डिंग शुरू करने के लिए चैट में माइक्रोफ़ोन बटन पर क्लिक करें। इसके लिए FFmpeg इंस्टॉल होना और एक कॉन्फ़िगर किया गया OpenAI API प्रोवाइडर होना आवश्यक है।" } }, "promptCaching": { diff --git a/webview-ui/src/i18n/locales/id/kilocode.json b/webview-ui/src/i18n/locales/id/kilocode.json index a8e32b83651..fc641146f57 100644 --- a/webview-ui/src/i18n/locales/id/kilocode.json +++ b/webview-ui/src/i18n/locales/id/kilocode.json @@ -311,5 +311,9 @@ "tryAgain": "Coba Lagi", "cancelled": "Autentikasi Dibatalkan", "initiating": "Memulai autentikasi..." + }, + "speechToText": { + "startRecording": "Mulai input suara", + "stopRecording": "Hentikan input suara" } } diff --git a/webview-ui/src/i18n/locales/id/settings.json b/webview-ui/src/i18n/locales/id/settings.json index 3b4cdd8241f..ed33ae97543 100644 --- a/webview-ui/src/i18n/locales/id/settings.json +++ b/webview-ui/src/i18n/locales/id/settings.json @@ -574,7 +574,8 @@ "minimal": "Minimal (Tercepat)", "high": "Tinggi", "medium": "Sedang", - "low": "Rendah" + "low": "Rendah", + "xhigh": "Ekstra Tinggi" }, "verbosity": { "label": "Verbositas Output", @@ -1002,6 +1003,10 @@ "MULTIPLE_NATIVE_TOOL_CALLS": { "name": "Panggilan tool paralel", "description": "Ketika diaktifkan, protokol native dapat mengeksekusi beberapa tool dalam satu giliran pesan asisten." + }, + "SPEECH_TO_TEXT": { + "name": "Ucapan-ke-Teks (STT)", + "description": "Ketika diaktifkan, Kilo Code dapat mengubah suara Anda menjadi teks menggunakan model Whisper dari OpenAI. Klik tombol mikrofon di chat untuk mulai merekam. Memerlukan FFmpeg yang terinstal dan penyedia API OpenAI yang dikonfigurasi." 
} }, "promptCaching": { diff --git a/webview-ui/src/i18n/locales/it/kilocode.json b/webview-ui/src/i18n/locales/it/kilocode.json index bf6600e617b..f10125f23a3 100644 --- a/webview-ui/src/i18n/locales/it/kilocode.json +++ b/webview-ui/src/i18n/locales/it/kilocode.json @@ -311,5 +311,9 @@ "tryAgain": "Riprova", "cancelled": "Autenticazione annullata", "initiating": "Avvio autenticazione..." + }, + "speechToText": { + "startRecording": "Avvia input vocale", + "stopRecording": "Interrompi input vocale" } } diff --git a/webview-ui/src/i18n/locales/it/settings.json b/webview-ui/src/i18n/locales/it/settings.json index e7fdad10e79..0deb5b2275d 100644 --- a/webview-ui/src/i18n/locales/it/settings.json +++ b/webview-ui/src/i18n/locales/it/settings.json @@ -584,7 +584,8 @@ "minimal": "Minimo (più veloce)", "high": "Alto", "medium": "Medio", - "low": "Basso" + "low": "Basso", + "xhigh": "Extra Alto" }, "verbosity": { "label": "Verbosity dell'output", @@ -982,6 +983,10 @@ "MULTIPLE_NATIVE_TOOL_CALLS": { "name": "Chiamate parallele agli strumenti", "description": "Quando abilitato, il protocollo nativo può eseguire più strumenti in un singolo turno di messaggio dell'assistente." + }, + "SPEECH_TO_TEXT": { + "name": "Riconoscimento vocale (STT)", + "description": "Quando abilitato, Kilo Code può trascrivere la tua voce in testo utilizzando il modello Whisper di OpenAI. Clicca il pulsante del microfono nella chat per iniziare la registrazione. Richiede l'installazione di FFmpeg e un provider API OpenAI configurato." } }, "promptCaching": { diff --git a/webview-ui/src/i18n/locales/ja/kilocode.json b/webview-ui/src/i18n/locales/ja/kilocode.json index 3193b3abc3f..d471f741d9e 100644 --- a/webview-ui/src/i18n/locales/ja/kilocode.json +++ b/webview-ui/src/i18n/locales/ja/kilocode.json @@ -311,5 +311,9 @@ "tryAgain": "再試行", "cancelled": "認証がキャンセルされました", "initiating": "認証を開始しています..." + }, + "speechToText": { + "startRecording": "音声入力を開始", + "stopRecording": "音声入力を停止" } } diff --git a/webview-ui/src/i18n/locales/ja/settings.json b/webview-ui/src/i18n/locales/ja/settings.json index 31e66d9e89f..95483009cfe 100644 --- a/webview-ui/src/i18n/locales/ja/settings.json +++ b/webview-ui/src/i18n/locales/ja/settings.json @@ -575,7 +575,8 @@ "minimal": "最小 (最速)", "high": "高", "medium": "中", - "low": "低" + "low": "低", + "xhigh": "極高" }, "verbosity": { "label": "出力の冗長性", @@ -982,6 +983,10 @@ "MULTIPLE_NATIVE_TOOL_CALLS": { "name": "並列ツール呼び出し", "description": "有効にすると、ネイティブプロトコルは単一のアシスタントメッセージターンで複数のツールを実行できます。" + }, + "SPEECH_TO_TEXT": { + "name": "音声認識 (STT)", + "description": "有効にすると、Kilo CodeはOpenAIのWhisperモデルを使用して音声をテキストに変換できます。録音を開始するには、チャット内のマイクボタンをクリックしてください。FFmpegのインストールと、設定済みのOpenAI APIプロバイダーが必要です。" } }, "promptCaching": { diff --git a/webview-ui/src/i18n/locales/ko/kilocode.json b/webview-ui/src/i18n/locales/ko/kilocode.json index 26daff0b4c7..a504db7212d 100644 --- a/webview-ui/src/i18n/locales/ko/kilocode.json +++ b/webview-ui/src/i18n/locales/ko/kilocode.json @@ -311,5 +311,9 @@ "tryAgain": "다시 시도", "cancelled": "인증이 취소되었습니다", "initiating": "인증을 시작하는 중..." 
+	},
+	"speechToText": {
+		"stopRecording": "음성 입력 중지",
+		"startRecording": "음성 입력 시작"
	}
}
diff --git a/webview-ui/src/i18n/locales/ko/settings.json b/webview-ui/src/i18n/locales/ko/settings.json
index 1546273c42b..f43bd3660ad 100644
--- a/webview-ui/src/i18n/locales/ko/settings.json
+++ b/webview-ui/src/i18n/locales/ko/settings.json
@@ -574,7 +574,8 @@
		"minimal": "최소 (가장 빠름)",
		"high": "높음",
		"medium": "중간",
-		"low": "낮음"
+		"low": "낮음",
+		"xhigh": "매우 높음"
	},
	"verbosity": {
		"label": "출력 상세도",
@@ -981,6 +982,10 @@
		"MULTIPLE_NATIVE_TOOL_CALLS": {
			"name": "병렬 도구 호출",
			"description": "활성화되면 네이티브 프로토콜이 단일 어시스턴트 메시지 턴에서 여러 도구를 실행할 수 있습니다."
+		},
+		"SPEECH_TO_TEXT": {
+			"description": "활성화되면 Kilo Code가 OpenAI의 Whisper 모델을 사용하여 음성을 텍스트로 변환할 수 있습니다. 채팅에서 마이크 버튼을 클릭하여 녹음을 시작하세요. FFmpeg가 설치되어 있고 OpenAI API 제공자가 구성되어 있어야 합니다.",
+			"name": "음성 인식 (STT)"
		}
	},
	"promptCaching": {
diff --git a/webview-ui/src/i18n/locales/nl/kilocode.json b/webview-ui/src/i18n/locales/nl/kilocode.json
index 1c014ade6c5..77749ddea6e 100644
--- a/webview-ui/src/i18n/locales/nl/kilocode.json
+++ b/webview-ui/src/i18n/locales/nl/kilocode.json
@@ -311,5 +311,9 @@
		"tryAgain": "Opnieuw proberen",
		"cancelled": "Authenticatie geannuleerd",
		"initiating": "Authenticatie starten..."
+	},
+	"speechToText": {
+		"startRecording": "Start spraakinvoer",
+		"stopRecording": "Stop spraakinvoer"
	}
}
diff --git a/webview-ui/src/i18n/locales/nl/settings.json b/webview-ui/src/i18n/locales/nl/settings.json
index a3b9a20c0bf..f9990f741b2 100644
--- a/webview-ui/src/i18n/locales/nl/settings.json
+++ b/webview-ui/src/i18n/locales/nl/settings.json
@@ -574,7 +574,8 @@
		"minimal": "Minimaal (Snelst)",
		"high": "Hoog",
		"medium": "Middel",
-		"low": "Laag"
+		"low": "Laag",
+		"xhigh": "Extra Hoog"
	},
	"verbosity": {
		"label": "Uitvoerbaarheid",
@@ -981,6 +982,10 @@
		"MULTIPLE_NATIVE_TOOL_CALLS": {
			"name": "Parallelle tool-aanroepen",
			"description": "Wanneer ingeschakeld, kan het native protocol meerdere tools uitvoeren in één enkele assistent-berichtbeurt."
+		},
+		"SPEECH_TO_TEXT": {
+			"name": "Spraak-naar-Tekst (STT)",
+			"description": "Wanneer ingeschakeld, kan Kilo Code je stem naar tekst transcriberen met behulp van OpenAI's Whisper model. Klik op de microfoonknop in de chat om op te nemen. Vereist dat FFmpeg geïnstalleerd is en een geconfigureerde OpenAI API provider."
		}
	},
	"promptCaching": {
diff --git a/webview-ui/src/i18n/locales/pl/kilocode.json b/webview-ui/src/i18n/locales/pl/kilocode.json
index b78f64d7a63..c226d18c3d4 100644
--- a/webview-ui/src/i18n/locales/pl/kilocode.json
+++ b/webview-ui/src/i18n/locales/pl/kilocode.json
@@ -311,5 +311,9 @@
		"tryAgain": "Spróbuj ponownie",
		"cancelled": "Uwierzytelnianie anulowane",
		"initiating": "Rozpoczynanie uwierzytelniania..."
+ }, + "speechToText": { + "startRecording": "Rozpocznij wprowadzanie głosowe", + "stopRecording": "Zatrzymaj wprowadzanie głosowe" } } diff --git a/webview-ui/src/i18n/locales/pl/settings.json b/webview-ui/src/i18n/locales/pl/settings.json index a956f8f06b4..74dad4b4f9e 100644 --- a/webview-ui/src/i18n/locales/pl/settings.json +++ b/webview-ui/src/i18n/locales/pl/settings.json @@ -574,7 +574,8 @@ "minimal": "Minimalny (najszybszy)", "high": "Wysoki", "medium": "Średni", - "low": "Niski" + "low": "Niski", + "xhigh": "Bardzo wysoki" }, "verbosity": { "label": "Szczegółowość danych wyjściowych", @@ -981,6 +982,10 @@ "MULTIPLE_NATIVE_TOOL_CALLS": { "name": "Równoległe wywołania narzędzi", "description": "Po włączeniu protokół natywny może wykonywać wiele narzędzi w jednej turze wiadomości asystenta." + }, + "SPEECH_TO_TEXT": { + "name": "Zamiana mowy na tekst (STT)", + "description": "Gdy włączone, Kilo Code może transkrybować Twój głos na tekst używając modelu Whisper od OpenAI. Kliknij przycisk mikrofonu w czacie, aby rozpocząć nagrywanie. Wymaga zainstalowania FFmpeg i skonfigurowanego dostawcy API OpenAI." } }, "promptCaching": { diff --git a/webview-ui/src/i18n/locales/pt-BR/kilocode.json b/webview-ui/src/i18n/locales/pt-BR/kilocode.json index 22a0833c3fb..8a250c747e8 100644 --- a/webview-ui/src/i18n/locales/pt-BR/kilocode.json +++ b/webview-ui/src/i18n/locales/pt-BR/kilocode.json @@ -311,5 +311,9 @@ "tryAgain": "Tentar novamente", "cancelled": "Autenticação cancelada", "initiating": "Iniciando autenticação..." + }, + "speechToText": { + "startRecording": "Iniciar entrada de voz", + "stopRecording": "Parar entrada de voz" } } diff --git a/webview-ui/src/i18n/locales/pt-BR/settings.json b/webview-ui/src/i18n/locales/pt-BR/settings.json index d665a47b1ee..24d222653ee 100644 --- a/webview-ui/src/i18n/locales/pt-BR/settings.json +++ b/webview-ui/src/i18n/locales/pt-BR/settings.json @@ -548,7 +548,8 @@ "minimal": "Mínimo (mais rápido)", "high": "Alto", "medium": "Médio", - "low": "Baixo" + "low": "Baixo", + "xhigh": "Extra Alto" }, "verbosity": { "label": "Verbosidade da saída", @@ -981,6 +982,10 @@ "MULTIPLE_NATIVE_TOOL_CALLS": { "name": "Chamadas paralelas de ferramentas", "description": "Quando habilitado, o protocolo nativo pode executar múltiplas ferramentas em um único turno de mensagem do assistente." + }, + "SPEECH_TO_TEXT": { + "description": "Quando habilitado, o Kilo Code pode transcrever sua voz em texto usando o modelo Whisper da OpenAI. Clique no botão do microfone no chat para começar a gravação. Requer que o FFmpeg esteja instalado e um provedor de API OpenAI configurado.", + "name": "Fala-para-Texto (STT)" } }, "promptCaching": { diff --git a/webview-ui/src/i18n/locales/ru/kilocode.json b/webview-ui/src/i18n/locales/ru/kilocode.json index d9324b66c4a..48f8b0dae15 100644 --- a/webview-ui/src/i18n/locales/ru/kilocode.json +++ b/webview-ui/src/i18n/locales/ru/kilocode.json @@ -311,5 +311,9 @@ "tryAgain": "Попробовать снова", "cancelled": "Аутентификация отменена", "initiating": "Запуск аутентификации..." 
+ }, + "speechToText": { + "startRecording": "Начать голосовой ввод", + "stopRecording": "Остановить голосовой ввод" } } diff --git a/webview-ui/src/i18n/locales/ru/settings.json b/webview-ui/src/i18n/locales/ru/settings.json index d989df0a7a1..9081c0d0cf7 100644 --- a/webview-ui/src/i18n/locales/ru/settings.json +++ b/webview-ui/src/i18n/locales/ru/settings.json @@ -574,7 +574,8 @@ "minimal": "Минимальный (самый быстрый)", "high": "Высокие", "medium": "Средние", - "low": "Низкие" + "low": "Низкие", + "xhigh": "Очень высокие" }, "verbosity": { "label": "Подробность вывода", @@ -981,6 +982,10 @@ "MULTIPLE_NATIVE_TOOL_CALLS": { "name": "Параллельные вызовы инструментов", "description": "При включении нативный протокол может выполнять несколько инструментов в одном ходе сообщения ассистента." + }, + "SPEECH_TO_TEXT": { + "name": "Речь-в-текст (STT)", + "description": "При включении Kilo Code может преобразовывать вашу речь в текст с помощью модели Whisper от OpenAI. Нажмите кнопку микрофона в чате, чтобы начать запись. Требуется установленный FFmpeg и настроенный API-провайдер OpenAI." } }, "promptCaching": { diff --git a/webview-ui/src/i18n/locales/th/kilocode.json b/webview-ui/src/i18n/locales/th/kilocode.json index 1a2db287452..92936386c1d 100644 --- a/webview-ui/src/i18n/locales/th/kilocode.json +++ b/webview-ui/src/i18n/locales/th/kilocode.json @@ -311,5 +311,9 @@ "tryAgain": "ลองอีกครั้ง", "cancelled": "ยกเลิกการยืนยันตัวตนแล้ว", "initiating": "กำลังเริ่มการยืนยันตัวตน..." + }, + "speechToText": { + "startRecording": "เริ่มการป้อนเสียง", + "stopRecording": "หยุดการป้อนเสียง" } } diff --git a/webview-ui/src/i18n/locales/th/settings.json b/webview-ui/src/i18n/locales/th/settings.json index 4ffc9baa1f2..6c635f43343 100644 --- a/webview-ui/src/i18n/locales/th/settings.json +++ b/webview-ui/src/i18n/locales/th/settings.json @@ -569,7 +569,8 @@ "minimal": "น้อยที่สุด (เร็วที่สุด)", "low": "ต่ำ", "medium": "ปานกลาง", - "high": "สูง" + "high": "สูง", + "xhigh": "สูงพิเศษ" }, "verbosity": { "label": "ระดับความละเอียดของผลลัพธ์", @@ -1071,6 +1072,10 @@ "MULTIPLE_NATIVE_TOOL_CALLS": { "name": "การเรียกใช้เครื่องมือแบบขนาน", "description": "เมื่อเปิดใช้งาน โปรโตคอล native สามารถดำเนินการเครื่องมือหลายรายการในข้อความของผู้ช่วยเดียว" + }, + "SPEECH_TO_TEXT": { + "name": "การแปลงเสียงพูดเป็นข้อความ (STT)", + "description": "เมื่อเปิดใช้งาน Kilo Code สามารถแปลงเสียงของคุณเป็นข้อความได้โดยใช้โมเดล Whisper ของ OpenAI คลิกปุ่มไมโครโฟนในแชทเพื่อเริ่มบันทึกเสียง ต้องติดตั้ง FFmpeg และกำหนดค่าผู้ให้บริการ OpenAI API แล้ว" } }, "promptCaching": { diff --git a/webview-ui/src/i18n/locales/tr/kilocode.json b/webview-ui/src/i18n/locales/tr/kilocode.json index b102dae066e..a9ed5af92b8 100644 --- a/webview-ui/src/i18n/locales/tr/kilocode.json +++ b/webview-ui/src/i18n/locales/tr/kilocode.json @@ -311,5 +311,9 @@ "tryAgain": "Tekrar Dene", "cancelled": "Kimlik Doğrulama İptal Edildi", "initiating": "Kimlik doğrulama başlatılıyor..." 
+ }, + "speechToText": { + "startRecording": "Sesli girişi başlat", + "stopRecording": "Sesli girişi durdur" } } diff --git a/webview-ui/src/i18n/locales/tr/settings.json b/webview-ui/src/i18n/locales/tr/settings.json index 888f31b42d5..0759cc5e292 100644 --- a/webview-ui/src/i18n/locales/tr/settings.json +++ b/webview-ui/src/i18n/locales/tr/settings.json @@ -549,7 +549,8 @@ "minimal": "Minimal (en hızlı)", "high": "Yüksek", "medium": "Orta", - "low": "Düşük" + "low": "Düşük", + "xhigh": "Ekstra Yüksek" }, "verbosity": { "label": "Çıktı Ayrıntı Düzeyi", @@ -982,6 +983,10 @@ "MULTIPLE_NATIVE_TOOL_CALLS": { "name": "Paralel araç çağrıları", "description": "Etkinleştirildiğinde, yerel protokol tek bir asistan mesaj turunda birden fazla araç yürütebilir." + }, + "SPEECH_TO_TEXT": { + "description": "Etkinleştirildiğinde, Kilo Code sesinizi OpenAI'nin Whisper modeli kullanarak metne dönüştürebilir. Kayda başlamak için sohbetteki mikrofon düğmesine tıklayın. FFmpeg'in yüklü olması ve yapılandırılmış bir OpenAI API sağlayıcısı gereklidir.", + "name": "Konuşmadan Metne (STT)" } }, "promptCaching": { diff --git a/webview-ui/src/i18n/locales/uk/kilocode.json b/webview-ui/src/i18n/locales/uk/kilocode.json index b50faf79cc5..c7060c481ba 100644 --- a/webview-ui/src/i18n/locales/uk/kilocode.json +++ b/webview-ui/src/i18n/locales/uk/kilocode.json @@ -311,5 +311,9 @@ "tryAgain": "Спробувати знову", "cancelled": "Автентифікацію скасовано", "initiating": "Запуск автентифікації..." + }, + "speechToText": { + "startRecording": "Почати голосове введення", + "stopRecording": "Зупинити голосове введення" } } diff --git a/webview-ui/src/i18n/locales/uk/settings.json b/webview-ui/src/i18n/locales/uk/settings.json index 9c42dcc6dff..2306167f038 100644 --- a/webview-ui/src/i18n/locales/uk/settings.json +++ b/webview-ui/src/i18n/locales/uk/settings.json @@ -104,7 +104,6 @@ "vercelAiGatewayProvider": "Vercel AI Gateway", "vercelAiGatewayApiKeyLabel": "Ключ API", "vercelAiGatewayApiKeyPlaceholder": "Введіть ваш ключ API Vercel AI Gateway", - "bedrockProvider": "Amazon Bedrock", "bedrockConfigTitle": "Конфігурація Amazon Bedrock", "bedrockConfigDesc": "Налаштуйте Amazon Bedrock для використання моделей вбудовувань AWS", "bedrockEmbeddingModel": "Модель вбудовувань Bedrock", @@ -610,7 +609,8 @@ "minimal": "Мінімальний (найшвидший)", "high": "Високий", "medium": "Середній", - "low": "Низький" + "low": "Низький", + "xhigh": "Екстра високий" }, "verbosity": { "label": "Рівень деталізації виводу", @@ -1086,7 +1086,11 @@ "experimentalParallelToolCalls": "Експериментальні паралельні виклики інструментів", "experimentalParallelToolCallsDesc": "Дозволяє моделі викликати кілька інструментів одночасно для покращення ефективності", "experimentalParallelToolCallsInfo": "Ця функція може покращити швидкість виконання завдань, але може бути нестабільною з деякими моделями", - "experimentalParallelToolCallsWarning": "Використовуйте з обережністю: може призвести до неочікуваної поведінки" + "experimentalParallelToolCallsWarning": "Використовуйте з обережністю: може призвести до неочікуваної поведінки", + "SPEECH_TO_TEXT": { + "name": "Мовлення в текст (STT)", + "description": "Коли увімкнено, Kilo Code може транскрибувати ваш голос у текст за допомогою моделі Whisper від OpenAI. Натисніть кнопку мікрофона в чаті, щоб почати запис. Потребує встановленого FFmpeg та налаштованого провайдера OpenAI API." 
+ } }, "promptCaching": { "label": "Вимкнути кешування підказок", diff --git a/webview-ui/src/i18n/locales/vi/kilocode.json b/webview-ui/src/i18n/locales/vi/kilocode.json index 963f2d8652c..ab4aabd8c9a 100644 --- a/webview-ui/src/i18n/locales/vi/kilocode.json +++ b/webview-ui/src/i18n/locales/vi/kilocode.json @@ -311,5 +311,9 @@ "tryAgain": "Thử lại", "cancelled": "Đã hủy xác thực", "initiating": "Đang bắt đầu xác thực..." + }, + "speechToText": { + "stopRecording": "Dừng nhập bằng giọng nói", + "startRecording": "Bắt đầu nhập bằng giọng nói" } } diff --git a/webview-ui/src/i18n/locales/vi/settings.json b/webview-ui/src/i18n/locales/vi/settings.json index 9671225a239..856e61a7391 100644 --- a/webview-ui/src/i18n/locales/vi/settings.json +++ b/webview-ui/src/i18n/locales/vi/settings.json @@ -574,7 +574,8 @@ "minimal": "Tối thiểu (nhanh nhất)", "high": "Cao", "medium": "Trung bình", - "low": "Thấp" + "low": "Thấp", + "xhigh": "Rất cao" }, "verbosity": { "label": "Mức độ chi tiết đầu ra", @@ -981,6 +982,10 @@ "MULTIPLE_NATIVE_TOOL_CALLS": { "name": "Lệnh gọi công cụ song song", "description": "Khi được bật, giao thức native có thể thực thi nhiều công cụ trong một lượt tin nhắn của trợ lý." + }, + "SPEECH_TO_TEXT": { + "name": "Chuyển Giọng nói thành Văn bản (STT)", + "description": "Khi được bật, Kilo Code có thể chuyển đổi giọng nói của bạn thành văn bản bằng mô hình Whisper của OpenAI. Nhấp vào nút microphone trong chat để bắt đầu ghi âm. Yêu cầu cài đặt FFmpeg và cấu hình nhà cung cấp API OpenAI." } }, "promptCaching": { diff --git a/webview-ui/src/i18n/locales/zh-CN/kilocode.json b/webview-ui/src/i18n/locales/zh-CN/kilocode.json index 827f5a91db7..8ec1236db25 100644 --- a/webview-ui/src/i18n/locales/zh-CN/kilocode.json +++ b/webview-ui/src/i18n/locales/zh-CN/kilocode.json @@ -311,5 +311,9 @@ "tryAgain": "重试", "cancelled": "验证已取消", "initiating": "正在启动验证..." 
+ }, + "speechToText": { + "startRecording": "开始语音输入", + "stopRecording": "停止语音输入" } } diff --git a/webview-ui/src/i18n/locales/zh-CN/settings.json b/webview-ui/src/i18n/locales/zh-CN/settings.json index 5f01fe6aa73..96fe46c0ffc 100644 --- a/webview-ui/src/i18n/locales/zh-CN/settings.json +++ b/webview-ui/src/i18n/locales/zh-CN/settings.json @@ -23,7 +23,7 @@ "sections": { "providers": "提供商", "modes": "模式", - "mcp": "MCP 服务", + "mcp": "MCP 服务器", "autoApprove": "自动批准", "browser": "计算机交互", "checkpoints": "存档点", @@ -37,7 +37,6 @@ "experimental": "实验性", "language": "语言", "about": "关于 Kilo Code", - "mcp": "MCP 服务器", "autoPurge": "自动清理" }, "about": { @@ -372,9 +371,6 @@ "getZaiApiKey": "获取 Z AI API 密钥", "zaiEntrypoint": "Z AI 服务站点", "zaiEntrypointDescription": "请根据您的位置选择适当的 API 服务站点。如果您在中国,请选择 open.bigmodel.cn。否则,请选择 api.z.ai。", - "minimaxApiKey": "MiniMax API 密钥", - "getMiniMaxApiKey": "获取 MiniMax API 密钥", - "minimaxBaseUrl": "MiniMax 服务站点", "geminiApiKey": "Gemini API 密钥", "getGroqApiKey": "获取 Groq API 密钥", "groqApiKey": "Groq API 密钥", @@ -578,7 +574,8 @@ "minimal": "最小 (最快)", "high": "高", "medium": "中", - "low": "低" + "low": "低", + "xhigh": "极高" }, "verbosity": { "label": "输出详细程度", @@ -989,6 +986,10 @@ "MULTIPLE_NATIVE_TOOL_CALLS": { "name": "并行工具调用", "description": "启用后,原生协议可在单个助手消息轮次中执行多个工具。" + }, + "SPEECH_TO_TEXT": { + "description": "启用后,Kilo Code 可以使用 OpenAI 的 Whisper 模型将您的语音转录为文本。点击聊天中的麦克风按钮开始录音。需要安装 FFmpeg 并配置 OpenAI API 提供商。", + "name": "语音转文本 (STT)" } }, "promptCaching": { diff --git a/webview-ui/src/i18n/locales/zh-TW/kilocode.json b/webview-ui/src/i18n/locales/zh-TW/kilocode.json index d648193de25..a92e7171156 100644 --- a/webview-ui/src/i18n/locales/zh-TW/kilocode.json +++ b/webview-ui/src/i18n/locales/zh-TW/kilocode.json @@ -311,5 +311,9 @@ "tryAgain": "重試", "cancelled": "驗證已取消", "initiating": "正在啟動驗證..." + }, + "speechToText": { + "startRecording": "开始语音输入", + "stopRecording": "停止语音输入" } } diff --git a/webview-ui/src/i18n/locales/zh-TW/settings.json b/webview-ui/src/i18n/locales/zh-TW/settings.json index 2a1dc2a46f5..05de8ee9f48 100644 --- a/webview-ui/src/i18n/locales/zh-TW/settings.json +++ b/webview-ui/src/i18n/locales/zh-TW/settings.json @@ -549,7 +549,8 @@ "minimal": "最小 (最快)", "high": "高", "medium": "中", - "low": "低" + "low": "低", + "xhigh": "超高" }, "verbosity": { "label": "輸出詳細程度", @@ -982,6 +983,10 @@ "MULTIPLE_NATIVE_TOOL_CALLS": { "name": "並行工具呼叫", "description": "啟用後,原生協定可在單個助理訊息輪次中執行多個工具。" + }, + "SPEECH_TO_TEXT": { + "name": "语音转文本 (STT)", + "description": "启用后,Kilo Code 可以使用 OpenAI 的 Whisper 模型将您的语音转录为文本。点击聊天中的麦克风按钮开始录音。需要安装 FFmpeg 并配置 OpenAI API 提供商。" } }, "promptCaching": { diff --git a/webview-ui/src/index.css b/webview-ui/src/index.css index e0f74c9c5de..8f376580887 100644 --- a/webview-ui/src/index.css +++ b/webview-ui/src/index.css @@ -215,6 +215,18 @@ } } +/* kilocode_change start: Cursor blink animation for voice recording */ +@keyframes blink { + 0%, + 100% { + opacity: 1; + } + 50% { + opacity: 0; + } +} +/* kilocode_change end */ + /* Form Element Focus States */ textarea:focus { @@ -530,3 +542,11 @@ input[cmdk-input]:focus { .diff-content-context { background-color: color-mix(in srgb, var(--vscode-editorGroup-border) 100%, transparent); } + +/* kilocode_change start - STT preview text styling */ +.stt-preview-text { + color: var(--vscode-descriptionForeground); + font-style: italic; + opacity: 0.7; +} +/* kilocode_change end - STT preview text styling */