diff --git a/.changeset/brave-frogs-check.md b/.changeset/brave-frogs-check.md deleted file mode 100644 index 87d59d7dc70..00000000000 --- a/.changeset/brave-frogs-check.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -"@kilocode/cli": minor -"kilo-code": minor ---- - -improve session sync mechanism (event based instead of timer) diff --git a/.changeset/cli-auth-reminder.md b/.changeset/cli-auth-reminder.md deleted file mode 100644 index 73be0e9f982..00000000000 --- a/.changeset/cli-auth-reminder.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"kilo-code": patch ---- - -Agent Manager: remind first-time CLI installs to run `kilocode auth` after opening the install terminal, with translations. diff --git a/.changeset/fair-sloths-post.md b/.changeset/fair-sloths-post.md deleted file mode 100644 index cde6c3384b1..00000000000 --- a/.changeset/fair-sloths-post.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"@kilocode/cli": patch ---- - -flush cli session on completion diff --git a/.changeset/giant-spoons-jump.md b/.changeset/giant-spoons-jump.md deleted file mode 100644 index 0d88af33b38..00000000000 --- a/.changeset/giant-spoons-jump.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -"@kilocode/cli": patch -"kilo-code": patch ---- - -extract an extension message handler for extension/cli reuse diff --git a/.changeset/loud-ads-slide.md b/.changeset/loud-ads-slide.md deleted file mode 100644 index 41fe000374e..00000000000 --- a/.changeset/loud-ads-slide.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"kilo-code": patch ---- - -Fix Kilo Auth flow diff --git a/.changeset/multi-version-sessions.md b/.changeset/multi-version-sessions.md new file mode 100644 index 00000000000..540957b48f9 --- /dev/null +++ b/.changeset/multi-version-sessions.md @@ -0,0 +1,5 @@ +--- +"kilo-code": minor +--- + +Add multi-version feature to Agent Manager - launch 1-4 agents in parallel on git worktrees diff --git a/.changeset/ripe-bats-wish.md b/.changeset/ripe-bats-wish.md deleted file mode 100644 index 3fa8281fbe2..00000000000 --- a/.changeset/ripe-bats-wish.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"kilo-code": patch ---- - -Fix styling issue on task headers diff --git a/.changeset/seven-seas-show.md b/.changeset/seven-seas-show.md deleted file mode 100644 index 257741725ad..00000000000 --- a/.changeset/seven-seas-show.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"kilo-code": patch ---- - -Minor tuning to autocomplete diff --git a/.changeset/witty-books-taste.md b/.changeset/witty-books-taste.md deleted file mode 100644 index 7872ba4f674..00000000000 --- a/.changeset/witty-books-taste.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"kilo-code": patch ---- - -Agent-Manager - Fix Chat Input scroll diff --git a/CHANGELOG.md b/CHANGELOG.md index 262f8df45d8..f9aa5e51e37 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,87 @@ # kilo-code +## 4.135.0 + +### Minor Changes + +- [#4326](https://github.com/Kilo-Org/kilocode/pull/4326) [`6d62090`](https://github.com/Kilo-Org/kilocode/commit/6d620905dfc6d8419bdbc9ffcad54109057e709e) Thanks [@iscekic](https://github.com/iscekic)! - improve session sync mechanism (event based instead of timer) + +- [#4333](https://github.com/Kilo-Org/kilocode/pull/4333) [`0093fd1`](https://github.com/Kilo-Org/kilocode/commit/0093fd15e1a3baa80a872bc8889c5e219684004c) Thanks [@kevinvandijk](https://github.com/kevinvandijk)!
- Include changes from Roo Code v3.36.2 + + - Restrict GPT-5 tool set to apply_patch for improved compatibility (PR #9853 by @hannesrudolph) + - Fix: Resolve Chutes provider model fetching issue (PR #9854 by @cte) + - Add MessageManager layer for centralized history coordination, fixing message synchronization issues (PR #9842 by @hannesrudolph) + - Fix: Prevent cascading truncation loop by only truncating visible messages (PR #9844 by @hannesrudolph) + - Fix: Handle unknown/invalid native tool calls to prevent extension freeze (PR #9834 by @daniel-lxs) + - Always enable reasoning for models that require it (PR #9836 by @cte) + - ChatView: Smoother stick-to-bottom behavior during streaming (PR #8999 by @hannesrudolph) + - UX: Improved error messages and documentation links (PR #9777 by @brunobergher) + - Fix: Overly round follow-up question suggestions styling (PR #9829 by @brunobergher) + - Ignore input to the execa terminal process for safer command execution (PR #9827 by @mrubens) + - Be safer about large file reads (PR #9843 by @jr) + - Add gpt-5.1-codex-max model to OpenAI provider (PR #9848 by @hannesrudolph) + - Evals UI: Add filtering, bulk delete, tool consolidation, and run notes (PR #9837 by @hannesrudolph) + - Evals UI: Add multi-model launch and UI improvements (PR #9845 by @hannesrudolph) + - Web: New pricing page (PR #9821 by @brunobergher) + - Fix: Restore context when rewinding after condense (#8295 by @hannesrudolph, PR #9665 by @hannesrudolph) + - Enable search_and_replace for Minimax models (PR #9780 by @mrubens) + - Fix: Resolve Vercel AI Gateway model fetching issues (PR #9791 by @cte) + - Fix: Apply conservative max tokens for Cerebras provider (PR #9804 by @sebastiand-cerebras) + - Fix: Remove omission detection logic to eliminate false positives (#9785 by @Michaelzag, PR #9787 by @app/roomote) + - Refactor: Remove deprecated insert_content tool (PR #9751 by @daniel-lxs) + - Chore: Hide parallel tool calls experiment and disable feature (PR #9798 by @hannesrudolph) + - Update next.js documentation site dependencies (PR #9799 by @jr) + - Fix: Correct download count display on homepage (PR #9807 by @mrubens) + - Feat: Add provider routing selection for OpenRouter embeddings (#9144 by @SannidhyaSah, PR #9693 by @SannidhyaSah) + - Default Minimax M2 to native tool calling (PR #9778 by @mrubens) + - Sanitize the native tool calls to fix a bug with Gemini (PR #9769 by @mrubens) + - Fix: Handle malformed native tool calls to prevent hanging (PR #9758 by @daniel-lxs) + - Fix: Remove reasoning toggles for GLM-4.5 and GLM-4.6 on z.ai provider (PR #9752 by @roomote) + - Refactor: Remove line_count parameter from write_to_file tool (PR #9667 by @hannesrudolph) + - Switch to new welcome view for improved onboarding experience (PR #9741 by @mrubens) + - Update homepage with latest changes (PR #9675 by @brunobergher) + - Improve privacy for stealth models by adding vendor confidentiality section to system prompt (PR #9742 by @mrubens) + - Allow models to contain default temperature settings for provider-specific optimal defaults (PR #9734 by @mrubens) + - Enable native tool support for all LiteLLM models by default (PR #9736 by @mrubens) + - Pass app version to provider for improved request tracking (PR #9730 by @cte) + - Fix: Flush pending tool results before task delegation (PR #9726 by @daniel-lxs) + - Improve: Better IPC error logging for easier debugging (PR #9727 by @cte) + - Metadata-driven subtasks with automatic parent resume and single-open safety for improved task 
orchestration (#8081 by @hannesrudolph, PR #9090 by @hannesrudolph) + - Native tool calling support expanded across many providers: Bedrock (PR #9698 by @mrubens), Cerebras (PR #9692 by @mrubens), Chutes with auto-detection from API (PR #9715 by @daniel-lxs), DeepInfra (PR #9691 by @mrubens), DeepSeek and Doubao (PR #9671 by @daniel-lxs), Groq (PR #9673 by @daniel-lxs), LiteLLM (PR #9719 by @daniel-lxs), Ollama (PR #9696 by @mrubens), OpenAI-compatible providers (PR #9676 by @daniel-lxs), Requesty (PR #9672 by @daniel-lxs), Unbound (PR #9699 by @mrubens), Vercel AI Gateway (PR #9697 by @mrubens), Vertex Gemini (PR #9678 by @daniel-lxs), and xAI with new Grok 4 Fast and Grok 4.1 Fast models (PR #9690 by @mrubens) + - Fix: Preserve tool_use blocks in summary for parallel tool calls (#9700 by @SilentFlower, PR #9714 by @SilentFlower) + - Default Grok Code Fast to native tools for better performance (PR #9717 by @mrubens) + - UX toolbar cleanup and settings consolidation for a cleaner interface (PR #9710 by @brunobergher) + - Add model-specific tool customization via `excludedTools` and `includedTools` configuration (PR #9641 by @daniel-lxs) + - Add new `apply_patch` native tool for more efficient file editing operations (PR #9663 by @hannesrudolph) + - Add new `search_and_replace` tool for batch text replacements across files (PR #9549 by @hannesrudolph) + - Add debug buttons to view API and UI history for troubleshooting (PR #9684 by @hannesrudolph) + - Include tool format in environment details for better context awareness (PR #9661 by @mrubens) + - Fix: Display install count in millions instead of thousands (PR #9677 by @app/roomote) + - Fix: Prevent navigation buttons from wrapping on smaller screens (PR #9721 by @app/roomote) + - Fix: Race condition in new_task tool for native protocol (PR #9655 by @daniel-lxs) + +### Patch Changes + +- [#4379](https://github.com/Kilo-Org/kilocode/pull/4379) [`37b90be`](https://github.com/Kilo-Org/kilocode/commit/37b90be866111761dd90c3a0c8f179f5be16242c) Thanks [@marius-kilocode](https://github.com/marius-kilocode)! - Add todo list UI to Agent Manager, displaying task progress above the chat input with a collapsible list view + +- [#4266](https://github.com/Kilo-Org/kilocode/pull/4266) [`3ad7248`](https://github.com/Kilo-Org/kilocode/commit/3ad7248effa3b78f93b2f39c875735cd50b78d98) Thanks [@helloGitWorld-ctrl](https://github.com/helloGitWorld-ctrl)! - JetBrains - Improve multiproject conflicts + +- [#4366](https://github.com/Kilo-Org/kilocode/pull/4366) [`11c2f87`](https://github.com/Kilo-Org/kilocode/commit/11c2f870a82b39cbbb2d3e9bcdecc8bc13b44adb) Thanks [@marius-kilocode](https://github.com/marius-kilocode)! - Agent Manager: remind first-time CLI installs to run `kilocode auth` after opening the install terminal, with translations. + +- [#4389](https://github.com/Kilo-Org/kilocode/pull/4389) [`ac3350e`](https://github.com/Kilo-Org/kilocode/commit/ac3350e3caff0c3c93e9f3808633d776855cefa8) Thanks [@iscekic](https://github.com/iscekic)! - fix share url handling + +- [#4362](https://github.com/Kilo-Org/kilocode/pull/4362) [`d596a08`](https://github.com/Kilo-Org/kilocode/commit/d596a08d6fe5c1a719855616ba5f582407f6769a) Thanks [@iscekic](https://github.com/iscekic)! - extract an extension message handler for extension/cli reuse + +- [#4361](https://github.com/Kilo-Org/kilocode/pull/4361) [`24813e9`](https://github.com/Kilo-Org/kilocode/commit/24813e900e50bf63dbb553a951970467221ce73d) Thanks [@catrielmuller](https://github.com/catrielmuller)! 
- Fix Kilo Auth flow + +- [#4374](https://github.com/Kilo-Org/kilocode/pull/4374) [`612e472`](https://github.com/Kilo-Org/kilocode/commit/612e47277d32eb4c481e15fa47c4216015597e88) Thanks [@kevinvandijk](https://github.com/kevinvandijk)! - Fix styling issue on task headers + +- [#4308](https://github.com/Kilo-Org/kilocode/pull/4308) [`a9eab93`](https://github.com/Kilo-Org/kilocode/commit/a9eab931b11baf20e229dd328dd47557fa29fe49) Thanks [@markijbema](https://github.com/markijbema)! - Minor tuning to autocomplete + +- [#4375](https://github.com/Kilo-Org/kilocode/pull/4375) [`58c4096`](https://github.com/Kilo-Org/kilocode/commit/58c40964bb07135a0e9df29a253651a255ccffa2) Thanks [@catrielmuller](https://github.com/catrielmuller)! - Agent Manager - Local CLI install for immutable environments + +- [#4369](https://github.com/Kilo-Org/kilocode/pull/4369) [`5195bd0`](https://github.com/Kilo-Org/kilocode/commit/5195bd00067d83474606dfca0df71abfed13566a) Thanks [@catrielmuller](https://github.com/catrielmuller)! - Agent-Manager - Fix Chat Input scroll + ## 4.134.0 ### Minor Changes diff --git a/apps/storybook/src/mockData/chatMessages.ts b/apps/storybook/src/mockData/chatMessages.ts index ba05d8812e3..c5c6cf487f2 100644 --- a/apps/storybook/src/mockData/chatMessages.ts +++ b/apps/storybook/src/mockData/chatMessages.ts @@ -82,17 +82,6 @@ export const ASK_PRESETS = { }), } satisfies ClineMessageData, - tool_insert_content: { - type: "ask", - ask: "tool", - text: JSON.stringify({ - tool: "insert_content", - path: "src/app.ts", - lineNumber: 42, - content: "// New import added\nimport { newFunction } from './utils'", - }), - } satisfies ClineMessageData, - tool_read_file: { type: "ask", ask: "tool", diff --git a/apps/vscode-e2e/src/suite/extension.test.ts b/apps/vscode-e2e/src/suite/extension.test.ts index 1e8ede62a61..cbdb0cb7463 100644 --- a/apps/vscode-e2e/src/suite/extension.test.ts +++ b/apps/vscode-e2e/src/suite/extension.test.ts @@ -10,7 +10,9 @@ suite("Kilo Code Extension", function () { const expectedCommands = [ "activationCompleted", "plusButtonClicked", - "promptsButtonClicked", + "popoutButtonClicked", + "openInNewTab", + "settingsButtonClicked", "historyButtonClicked", "popoutButtonClicked", "accountButtonClicked", diff --git a/apps/vscode-e2e/src/suite/tools/insert-content.test.ts b/apps/vscode-e2e/src/suite/tools/insert-content.test.ts deleted file mode 100644 index a3a3abb1866..00000000000 --- a/apps/vscode-e2e/src/suite/tools/insert-content.test.ts +++ /dev/null @@ -1,628 +0,0 @@ -import * as assert from "assert" -import * as fs from "fs/promises" -import * as path from "path" -import * as vscode from "vscode" - -import { RooCodeEventName, type ClineMessage } from "@roo-code/types" - -import { waitFor, sleep } from "../utils" -import { setDefaultSuiteTimeout } from "../test-utils" - -suite.skip("Roo Code insert_content Tool", function () { - setDefaultSuiteTimeout(this) - - let workspaceDir: string - - // Pre-created test files that will be used across tests - const testFiles = { - simpleText: { - name: `test-insert-simple-${Date.now()}.txt`, - content: "Line 1\nLine 2\nLine 3", - path: "", - }, - jsFile: { - name: `test-insert-js-${Date.now()}.js`, - content: `function hello() { - console.log("Hello World") -} - -function goodbye() { - console.log("Goodbye World") -}`, - path: "", - }, - emptyFile: { - name: `test-insert-empty-${Date.now()}.txt`, - content: "", - path: "", - }, - pythonFile: { - name: `test-insert-python-${Date.now()}.py`, - content: `def main(): - print("Start") - 
print("End")`, - path: "", - }, - } - - // Get the actual workspace directory that VSCode is using and create all test files - suiteSetup(async function () { - // Get the workspace folder from VSCode - const workspaceFolders = vscode.workspace.workspaceFolders - if (!workspaceFolders || workspaceFolders.length === 0) { - throw new Error("No workspace folder found") - } - workspaceDir = workspaceFolders[0]!.uri.fsPath - console.log("Using workspace directory:", workspaceDir) - - // Create all test files before any tests run - console.log("Creating test files in workspace...") - for (const [key, file] of Object.entries(testFiles)) { - file.path = path.join(workspaceDir, file.name) - await fs.writeFile(file.path, file.content) - console.log(`Created ${key} test file at:`, file.path) - } - - // Verify all files exist - for (const [key, file] of Object.entries(testFiles)) { - const exists = await fs - .access(file.path) - .then(() => true) - .catch(() => false) - if (!exists) { - throw new Error(`Failed to create ${key} test file at ${file.path}`) - } - } - }) - - // Clean up after all tests - suiteTeardown(async () => { - // Cancel any running tasks before cleanup - test("Should insert content at the beginning of a file (line 1)", async function () { - const api = globalThis.api - // Clean up before each test - setup(async () => { - // Cancel any previous task - try { - await globalThis.api.cancelCurrentTask() - } catch { - // Task might not be running - } - - // Small delay to ensure clean state - await sleep(100) - }) - - // Clean up after each test - teardown(async () => { - // Cancel the current task - try { - await globalThis.api.cancelCurrentTask() - } catch { - // Task might not be running - } - - // Small delay to ensure clean state - await sleep(100) - }) - const messages: ClineMessage[] = [] - const testFile = testFiles.simpleText - const insertContent = "New first line" - const expectedContent = `${insertContent} -${testFile.content}` - let taskStarted = false - let taskCompleted = false - let errorOccurred: string | null = null - let insertContentExecuted = false - - // Listen for messages - const messageHandler = ({ message }: { message: ClineMessage }) => { - messages.push(message) - - // Log important messages for debugging - if (message.type === "say" && message.say === "error") { - errorOccurred = message.text || "Unknown error" - console.error("Error:", message.text) - } - if (message.type === "ask" && message.ask === "tool") { - console.log("Tool request:", message.text?.substring(0, 200)) - } - if (message.type === "say" && (message.say === "completion_result" || message.say === "text")) { - console.log("AI response:", message.text?.substring(0, 200)) - } - - // Check for tool execution - if (message.type === "say" && message.say === "api_req_started" && message.text) { - console.log("API request started:", message.text.substring(0, 200)) - try { - const requestData = JSON.parse(message.text) - if (requestData.request && requestData.request.includes("insert_content")) { - insertContentExecuted = true - console.log("insert_content tool executed!") - } - } catch (e) { - console.log("Failed to parse api_req_started message:", e) - } - } - } - api.on(RooCodeEventName.Message, messageHandler) - - // Listen for task events - const taskStartedHandler = (id: string) => { - if (id === taskId) { - taskStarted = true - console.log("Task started:", id) - } - } - api.on(RooCodeEventName.TaskStarted, taskStartedHandler) - - const taskCompletedHandler = (id: string) => { - if (id === 
taskId) { - taskCompleted = true - console.log("Task completed:", id) - } - } - api.on(RooCodeEventName.TaskCompleted, taskCompletedHandler) - - let taskId: string - try { - // Start the task - taskId = await api.startNewTask({ - configuration: { - mode: "code", - autoApprovalEnabled: true, - alwaysAllowWrite: true, - alwaysAllowReadOnly: true, - alwaysAllowReadOnlyOutsideWorkspace: true, - }, - text: `Use insert_content to add "${insertContent}" at line 1 (beginning) of the file ${testFile.name}. The file already exists with this content: -${testFile.content} - -Assume the file exists and you can modify it directly.`, - }) - - console.log("Task ID:", taskId) - console.log("Test filename:", testFile.name) - - // Wait for task to start - await waitFor(() => taskStarted, { timeout: 45_000 }) - - // Check for early errors - if (errorOccurred) { - console.error("Early error detected:", errorOccurred) - } - - // Wait for task completion - await waitFor(() => taskCompleted, { timeout: 45_000 }) - - // Give extra time for file system operations - await sleep(2000) - - // Check if the file was modified correctly - const actualContent = await fs.readFile(testFile.path, "utf-8") - console.log("File content after insertion:", actualContent) - - // Verify tool was executed - assert.strictEqual(insertContentExecuted, true, "insert_content tool should have been executed") - - // Verify file content - assert.strictEqual( - actualContent.trim(), - expectedContent.trim(), - "Content should be inserted at the beginning of the file", - ) - - // Verify no errors occurred - assert.strictEqual( - errorOccurred, - null, - `Task should complete without errors, but got: ${errorOccurred}`, - ) - - console.log("Test passed! insert_content tool executed and content inserted at beginning successfully") - } finally { - api.off(RooCodeEventName.Message, messageHandler) - api.off(RooCodeEventName.TaskStarted, taskStartedHandler) - api.off(RooCodeEventName.TaskCompleted, taskCompletedHandler) - } - }) - try { - await globalThis.api.cancelCurrentTask() - } catch { - // Task might not be running - } - - // Clean up all test files - console.log("Cleaning up test files...") - for (const [key, file] of Object.entries(testFiles)) { - try { - await fs.unlink(file.path) - console.log(`Cleaned up ${key} test file`) - } catch (error) { - console.log(`Failed to clean up ${key} test file:`, error) - } - } - }) - - test("Should insert content at the end of a file (line 0)", async function () { - const api = globalThis.api - const messages: ClineMessage[] = [] - const testFile = testFiles.simpleText - const insertContent = "New last line" - const expectedContent = `${testFile.content} -${insertContent}` - let taskStarted = false - let taskCompleted = false - let errorOccurred: string | null = null - let insertContentExecuted = false - - // Listen for messages - const messageHandler = ({ message }: { message: ClineMessage }) => { - messages.push(message) - - // Log important messages for debugging - if (message.type === "say" && message.say === "error") { - errorOccurred = message.text || "Unknown error" - console.error("Error:", message.text) - } - if (message.type === "ask" && message.ask === "tool") { - console.log("Tool request:", message.text?.substring(0, 200)) - } - if (message.type === "say" && (message.say === "completion_result" || message.say === "text")) { - console.log("AI response:", message.text?.substring(0, 200)) - } - - // Check for tool execution - if (message.type === "say" && message.say === "api_req_started" && 
message.text) { - console.log("API request started:", message.text.substring(0, 200)) - try { - const requestData = JSON.parse(message.text) - if (requestData.request && requestData.request.includes("insert_content")) { - insertContentExecuted = true - console.log("insert_content tool executed!") - } - } catch (e) { - console.log("Failed to parse api_req_started message:", e) - } - } - } - api.on(RooCodeEventName.Message, messageHandler) - - // Listen for task events - const taskStartedHandler = (id: string) => { - if (id === taskId) { - taskStarted = true - console.log("Task started:", id) - } - } - api.on(RooCodeEventName.TaskStarted, taskStartedHandler) - - const taskCompletedHandler = (id: string) => { - if (id === taskId) { - taskCompleted = true - console.log("Task completed:", id) - } - } - api.on(RooCodeEventName.TaskCompleted, taskCompletedHandler) - - let taskId: string - try { - // Start the task - taskId = await api.startNewTask({ - configuration: { - mode: "code", - autoApprovalEnabled: true, - alwaysAllowWrite: true, - alwaysAllowReadOnly: true, - alwaysAllowReadOnlyOutsideWorkspace: true, - }, - text: `Use insert_content to add "${insertContent}" at line 0 (end of file) of the file ${testFile.name}. The file already exists with this content: -${testFile.content} - -Assume the file exists and you can modify it directly.`, - }) - - console.log("Task ID:", taskId) - console.log("Test filename:", testFile.name) - - // Wait for task to start - await waitFor(() => taskStarted, { timeout: 45_000 }) - - // Check for early errors - if (errorOccurred) { - console.error("Early error detected:", errorOccurred) - } - - // Wait for task completion - await waitFor(() => taskCompleted, { timeout: 45_000 }) - - // Give extra time for file system operations - await sleep(2000) - - // Check if the file was modified correctly - const actualContent = await fs.readFile(testFile.path, "utf-8") - console.log("File content after insertion:", actualContent) - - // Verify tool was executed - test("Should insert multiline content into a JavaScript file", async function () { - const api = globalThis.api - const messages: ClineMessage[] = [] - const testFile = testFiles.jsFile - const insertContent = `// New import statements -import { utils } from './utils' -import { helpers } from './helpers'` - const expectedContent = `${insertContent} -${testFile.content}` - let taskStarted = false - let taskCompleted = false - let errorOccurred: string | null = null - let insertContentExecuted = false - - // Listen for messages - const messageHandler = ({ message }: { message: ClineMessage }) => { - messages.push(message) - - // Log important messages for debugging - if (message.type === "say" && message.say === "error") { - errorOccurred = message.text || "Unknown error" - console.error("Error:", message.text) - } - if (message.type === "ask" && message.ask === "tool") { - console.log("Tool request:", message.text?.substring(0, 200)) - } - if (message.type === "say" && (message.say === "completion_result" || message.say === "text")) { - console.log("AI response:", message.text?.substring(0, 200)) - } - - // Check for tool execution - if (message.type === "say" && message.say === "api_req_started" && message.text) { - console.log("API request started:", message.text.substring(0, 200)) - try { - const requestData = JSON.parse(message.text) - if (requestData.request && requestData.request.includes("insert_content")) { - insertContentExecuted = true - console.log("insert_content tool executed!") - } - } catch (e) { - 
console.log("Failed to parse api_req_started message:", e) - } - } - } - api.on(RooCodeEventName.Message, messageHandler) - - // Listen for task events - const taskStartedHandler = (id: string) => { - if (id === taskId) { - taskStarted = true - console.log("Task started:", id) - } - } - api.on(RooCodeEventName.TaskStarted, taskStartedHandler) - - const taskCompletedHandler = (id: string) => { - if (id === taskId) { - taskCompleted = true - console.log("Task completed:", id) - } - } - api.on(RooCodeEventName.TaskCompleted, taskCompletedHandler) - - let taskId: string - try { - // Start the task - taskId = await api.startNewTask({ - configuration: { - mode: "code", - autoApprovalEnabled: true, - alwaysAllowWrite: true, - alwaysAllowReadOnly: true, - alwaysAllowReadOnlyOutsideWorkspace: true, - }, - text: `Use insert_content to add import statements at the beginning (line 1) of the JavaScript file ${testFile.name}. Add these lines: -${insertContent} - -The file already exists with this content: -${testFile.content} - -Assume the file exists and you can modify it directly.`, - }) - - console.log("Task ID:", taskId) - console.log("Test filename:", testFile.name) - - // Wait for task to start - await waitFor(() => taskStarted, { timeout: 45_000 }) - - // Check for early errors - if (errorOccurred) { - console.error("Early error detected:", errorOccurred) - } - - // Wait for task completion - await waitFor(() => taskCompleted, { timeout: 45_000 }) - - // Give extra time for file system operations - await sleep(2000) - - test("Should insert content into an empty file", async function () { - const api = globalThis.api - const messages: ClineMessage[] = [] - const testFile = testFiles.emptyFile - const insertContent = `# My New File -This is the first line of content -And this is the second line` - const expectedContent = insertContent - let taskStarted = false - let taskCompleted = false - let errorOccurred: string | null = null - let insertContentExecuted = false - - // Listen for messages - const messageHandler = ({ message }: { message: ClineMessage }) => { - messages.push(message) - - // Log important messages for debugging - if (message.type === "say" && message.say === "error") { - errorOccurred = message.text || "Unknown error" - console.error("Error:", message.text) - } - if (message.type === "ask" && message.ask === "tool") { - console.log("Tool request:", message.text?.substring(0, 200)) - } - if ( - message.type === "say" && - (message.say === "completion_result" || message.say === "text") - ) { - console.log("AI response:", message.text?.substring(0, 200)) - } - - // Check for tool execution - if (message.type === "say" && message.say === "api_req_started" && message.text) { - console.log("API request started:", message.text.substring(0, 200)) - try { - const requestData = JSON.parse(message.text) - if (requestData.request && requestData.request.includes("insert_content")) { - insertContentExecuted = true - console.log("insert_content tool executed!") - } - } catch (e) { - console.log("Failed to parse api_req_started message:", e) - } - } - } - api.on(RooCodeEventName.Message, messageHandler) - - // Listen for task events - const taskStartedHandler = (id: string) => { - if (id === taskId) { - taskStarted = true - console.log("Task started:", id) - } - } - api.on(RooCodeEventName.TaskStarted, taskStartedHandler) - - const taskCompletedHandler = (id: string) => { - if (id === taskId) { - taskCompleted = true - console.log("Task completed:", id) - } - } - 
api.on(RooCodeEventName.TaskCompleted, taskCompletedHandler) - - let taskId: string - try { - // Start the task - taskId = await api.startNewTask({ - configuration: { - mode: "code", - autoApprovalEnabled: true, - alwaysAllowWrite: true, - alwaysAllowReadOnly: true, - alwaysAllowReadOnlyOutsideWorkspace: true, - }, - text: `Use insert_content to add content to the empty file ${testFile.name}. Add this content at line 0 (end of file): -${insertContent} - -The file is currently empty. Assume the file exists and you can modify it directly.`, - }) - - console.log("Task ID:", taskId) - console.log("Test filename:", testFile.name) - - // Wait for task to start - await waitFor(() => taskStarted, { timeout: 45_000 }) - - // Check for early errors - if (errorOccurred) { - console.error("Early error detected:", errorOccurred) - } - - // Wait for task completion - await waitFor(() => taskCompleted, { timeout: 45_000 }) - - // Give extra time for file system operations - await sleep(2000) - - // Check if the file was modified correctly - const actualContent = await fs.readFile(testFile.path, "utf-8") - console.log("File content after insertion:", actualContent) - - // Verify tool was executed - assert.strictEqual( - insertContentExecuted, - true, - "insert_content tool should have been executed", - ) - - // Verify file content - assert.strictEqual( - actualContent.trim(), - expectedContent.trim(), - "Content should be inserted into the empty file", - ) - - // Verify no errors occurred - assert.strictEqual( - errorOccurred, - null, - `Task should complete without errors, but got: ${errorOccurred}`, - ) - - console.log( - "Test passed! insert_content tool executed and content inserted into empty file successfully", - ) - } finally { - api.off(RooCodeEventName.Message, messageHandler) - api.off(RooCodeEventName.TaskStarted, taskStartedHandler) - api.off(RooCodeEventName.TaskCompleted, taskCompletedHandler) - } - }) - // Check if the file was modified correctly - const actualContent = await fs.readFile(testFile.path, "utf-8") - console.log("File content after insertion:", actualContent) - - // Verify tool was executed - assert.strictEqual(insertContentExecuted, true, "insert_content tool should have been executed") - - // Verify file content - assert.strictEqual( - actualContent.trim(), - expectedContent.trim(), - "Multiline content should be inserted at the beginning of the JavaScript file", - ) - - // Verify no errors occurred - assert.strictEqual( - errorOccurred, - null, - `Task should complete without errors, but got: ${errorOccurred}`, - ) - - console.log("Test passed! insert_content tool executed and multiline content inserted successfully") - } finally { - api.off(RooCodeEventName.Message, messageHandler) - api.off(RooCodeEventName.TaskStarted, taskStartedHandler) - api.off(RooCodeEventName.TaskCompleted, taskCompletedHandler) - } - }) - assert.strictEqual(insertContentExecuted, true, "insert_content tool should have been executed") - - // Verify file content - assert.strictEqual( - actualContent.trim(), - expectedContent.trim(), - "Content should be inserted at the end of the file", - ) - - // Verify no errors occurred - assert.strictEqual(errorOccurred, null, `Task should complete without errors, but got: ${errorOccurred}`) - - console.log("Test passed! 
insert_content tool executed and content inserted at end successfully") - } finally { - api.off(RooCodeEventName.Message, messageHandler) - api.off(RooCodeEventName.TaskStarted, taskStartedHandler) - api.off(RooCodeEventName.TaskCompleted, taskCompletedHandler) - } - }) - // Tests will be added here one by one -}) diff --git a/apps/web-evals/package.json b/apps/web-evals/package.json index f2d5043ec36..b2ac0d43460 100644 --- a/apps/web-evals/package.json +++ b/apps/web-evals/package.json @@ -29,12 +29,13 @@ "@roo-code/evals": "workspace:^", "@roo-code/types": "workspace:^", "@tanstack/react-query": "^5.69.0", + "archiver": "^7.0.1", "class-variance-authority": "^0.7.1", "clsx": "^2.1.1", "cmdk": "^1.1.0", "fuzzysort": "^3.1.0", "lucide-react": "^0.518.0", - "next": "^15.2.5", + "next": "~15.2.6", "next-themes": "^0.4.6", "p-map": "^7.0.3", "react": "^18.3.1", @@ -52,6 +53,8 @@ "@roo-code/config-eslint": "workspace:^", "@roo-code/config-typescript": "workspace:^", "@tailwindcss/postcss": "^4", + "@types/archiver": "^7.0.0", + "@types/ps-tree": "^1.1.6", "@types/react": "^18.3.23", "@types/react-dom": "^18.3.5", "tailwindcss": "^4", diff --git a/apps/web-evals/src/actions/__tests__/killRun.spec.ts b/apps/web-evals/src/actions/__tests__/killRun.spec.ts new file mode 100644 index 00000000000..814d70d9fca --- /dev/null +++ b/apps/web-evals/src/actions/__tests__/killRun.spec.ts @@ -0,0 +1,207 @@ +// npx vitest run src/actions/__tests__/killRun.spec.ts + +import { execFileSync } from "child_process" + +// Mock child_process +vi.mock("child_process", () => ({ + execFileSync: vi.fn(), + spawn: vi.fn(), +})) + +// Mock next/cache +vi.mock("next/cache", () => ({ + revalidatePath: vi.fn(), +})) + +// Mock redis client +vi.mock("@/lib/server/redis", () => ({ + redisClient: vi.fn().mockResolvedValue({ + del: vi.fn().mockResolvedValue(1), + }), +})) + +// Mock @roo-code/evals +vi.mock("@roo-code/evals", () => ({ + createRun: vi.fn(), + deleteRun: vi.fn(), + createTask: vi.fn(), + exerciseLanguages: [], + getExercisesForLanguage: vi.fn().mockResolvedValue([]), +})) + +// Mock timers to speed up tests +vi.useFakeTimers() + +// Import after mocks +import { killRun } from "../runs" + +const mockExecFileSync = execFileSync as ReturnType + +describe("killRun", () => { + beforeEach(() => { + vi.clearAllMocks() + }) + + afterEach(() => { + vi.clearAllTimers() + }) + + it("should kill controller first, wait, then kill task containers", async () => { + const runId = 123 + + // execFileSync is used for all docker commands + mockExecFileSync + .mockReturnValueOnce("") // docker kill controller + .mockReturnValueOnce("evals-task-123-456.0\nevals-task-123-789.1\n") // docker ps + .mockReturnValueOnce("") // docker kill evals-task-123-456.0 + .mockReturnValueOnce("") // docker kill evals-task-123-789.1 + + const resultPromise = killRun(runId) + + // Fast-forward past the 10 second sleep + await vi.advanceTimersByTimeAsync(10000) + + const result = await resultPromise + + expect(result.success).toBe(true) + expect(result.killedContainers).toContain("evals-controller-123") + expect(result.killedContainers).toContain("evals-task-123-456.0") + expect(result.killedContainers).toContain("evals-task-123-789.1") + expect(result.errors).toHaveLength(0) + + // Verify execFileSync was called for docker kill + expect(mockExecFileSync).toHaveBeenNthCalledWith( + 1, + "docker", + ["kill", "evals-controller-123"], + expect.any(Object), + ) + // Verify execFileSync was called for docker ps with run-specific filter + 
expect(mockExecFileSync).toHaveBeenNthCalledWith( + 2, + "docker", + ["ps", "--format", "{{.Names}}", "--filter", "name=evals-task-123-"], + expect.any(Object), + ) + }) + + it("should continue killing runners even if controller is not running", async () => { + const runId = 456 + + mockExecFileSync + .mockImplementationOnce(() => { + throw new Error("No such container") + }) // controller kill fails + .mockReturnValueOnce("evals-task-456-100.0\n") // docker ps + .mockReturnValueOnce("") // docker kill task + + const resultPromise = killRun(runId) + await vi.advanceTimersByTimeAsync(10000) + const result = await resultPromise + + expect(result.success).toBe(true) + expect(result.killedContainers).toContain("evals-task-456-100.0") + // Controller not in list since it failed + expect(result.killedContainers).not.toContain("evals-controller-456") + }) + + it("should clear Redis state after killing containers", async () => { + const runId = 789 + + const mockDel = vi.fn().mockResolvedValue(1) + const { redisClient } = await import("@/lib/server/redis") + vi.mocked(redisClient).mockResolvedValue({ del: mockDel } as never) + + mockExecFileSync + .mockReturnValueOnce("") // controller kill + .mockReturnValueOnce("") // docker ps (no tasks) + + const resultPromise = killRun(runId) + await vi.advanceTimersByTimeAsync(10000) + await resultPromise + + expect(mockDel).toHaveBeenCalledWith("heartbeat:789") + expect(mockDel).toHaveBeenCalledWith("runners:789") + }) + + it("should handle docker ps failure gracefully", async () => { + const runId = 111 + + mockExecFileSync + .mockReturnValueOnce("") // controller kill succeeds + .mockImplementationOnce(() => { + throw new Error("Docker error") + }) // docker ps fails + + const resultPromise = killRun(runId) + await vi.advanceTimersByTimeAsync(10000) + const result = await resultPromise + + // Should still be successful because controller was killed + expect(result.success).toBe(true) + expect(result.killedContainers).toContain("evals-controller-111") + expect(result.errors).toContain("Failed to list Docker task containers") + }) + + it("should handle individual task kill failures", async () => { + const runId = 222 + + mockExecFileSync + .mockReturnValueOnce("") // controller kill + .mockReturnValueOnce("evals-task-222-300.0\nevals-task-222-400.0\n") // docker ps + .mockImplementationOnce(() => { + throw new Error("Kill failed") + }) // first task kill fails + .mockReturnValueOnce("") // second task kill succeeds + + const resultPromise = killRun(runId) + await vi.advanceTimersByTimeAsync(10000) + const result = await resultPromise + + expect(result.success).toBe(true) + expect(result.killedContainers).toContain("evals-controller-222") + expect(result.killedContainers).toContain("evals-task-222-400.0") + expect(result.errors.length).toBe(1) + expect(result.errors[0]).toContain("evals-task-222-300.0") + }) + + it("should return success with no containers when nothing is running", async () => { + const runId = 333 + + mockExecFileSync + .mockImplementationOnce(() => { + throw new Error("No such container") + }) // controller not running + .mockReturnValueOnce("") // no task containers + + const resultPromise = killRun(runId) + await vi.advanceTimersByTimeAsync(10000) + const result = await resultPromise + + expect(result.success).toBe(true) + expect(result.killedContainers).toHaveLength(0) + expect(result.errors).toHaveLength(0) + }) + + it("should only kill containers belonging to the specific run", async () => { + const runId = 555 + + mockExecFileSync + 
.mockReturnValueOnce("") // controller kill + .mockReturnValueOnce("evals-task-555-100.0\n") // docker ps + .mockReturnValueOnce("") // docker kill task + + const resultPromise = killRun(runId) + await vi.advanceTimersByTimeAsync(10000) + const result = await resultPromise + + expect(result.success).toBe(true) + // Verify execFileSync was called for docker ps with run-specific filter + expect(mockExecFileSync).toHaveBeenNthCalledWith( + 2, + "docker", + ["ps", "--format", "{{.Names}}", "--filter", "name=evals-task-555-"], + expect.any(Object), + ) + }) +}) diff --git a/apps/web-evals/src/actions/runs.ts b/apps/web-evals/src/actions/runs.ts index 82a7ebfcbe5..9d213547cee 100644 --- a/apps/web-evals/src/actions/runs.ts +++ b/apps/web-evals/src/actions/runs.ts @@ -3,7 +3,7 @@ import * as path from "path" import fs from "fs" import { fileURLToPath } from "url" -import { spawn } from "child_process" +import { spawn, execFileSync } from "child_process" import { revalidatePath } from "next/cache" import pMap from "p-map" @@ -13,15 +13,22 @@ import { exerciseLanguages, createRun as _createRun, deleteRun as _deleteRun, + updateRun as _updateRun, + getIncompleteRuns as _getIncompleteRuns, + deleteRunsByIds as _deleteRunsByIds, createTask, getExercisesForLanguage, } from "@roo-code/evals" import { CreateRun } from "@/lib/schemas" +import { redisClient } from "@/lib/server/redis" + +// Storage base path for eval logs +const EVALS_STORAGE_PATH = "/tmp/evals/runs" const EVALS_REPO_PATH = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "../../../../../evals") -export async function createRun({ suite, exercises = [], timeout, ...values }: CreateRun) { +export async function createRun({ suite, exercises = [], timeout, iterations = 1, ...values }: CreateRun) { const run = await _createRun({ ...values, timeout, @@ -36,15 +43,34 @@ export async function createRun({ suite, exercises = [], timeout, ...values }: C throw new Error("Invalid exercise path: " + path) } - await createTask({ ...values, runId: run.id, language: language as ExerciseLanguage, exercise }) + // Create multiple tasks for each iteration + for (let iteration = 1; iteration <= iterations; iteration++) { + await createTask({ + ...values, + runId: run.id, + language: language as ExerciseLanguage, + exercise, + iteration, + }) + } } } else { for (const language of exerciseLanguages) { - const exercises = await getExercisesForLanguage(EVALS_REPO_PATH, language) + const languageExercises = await getExercisesForLanguage(EVALS_REPO_PATH, language) - await pMap(exercises, (exercise) => createTask({ runId: run.id, language, exercise }), { - concurrency: 10, - }) + // Create tasks for all iterations of each exercise + const tasksToCreate: Array<{ language: ExerciseLanguage; exercise: string; iteration: number }> = [] + for (const exercise of languageExercises) { + for (let iteration = 1; iteration <= iterations; iteration++) { + tasksToCreate.push({ language, exercise, iteration }) + } + } + + await pMap( + tasksToCreate, + ({ language, exercise, iteration }) => createTask({ runId: run.id, language, exercise, iteration }), + { concurrency: 10 }, + ) } } @@ -97,3 +123,247 @@ export async function deleteRun(runId: number) { await _deleteRun(runId) revalidatePath("/runs") } + +export type KillRunResult = { + success: boolean + killedContainers: string[] + errors: string[] +} + +const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)) + +/** + * Kill all Docker containers associated with a run (controller and task runners). 
+ * Kills the controller first, waits 10 seconds, then kills runners. + * Also clears Redis state for heartbeat and runners. + * + * Container naming conventions: + * - Controller: evals-controller-{runId} + * - Task runners: evals-task-{runId}-{taskId}.{attempt} + */ +export async function killRun(runId: number): Promise<KillRunResult> { + const killedContainers: string[] = [] + const errors: string[] = [] + const controllerPattern = `evals-controller-${runId}` + const taskPattern = `evals-task-${runId}-` + + try { + // Step 1: Kill the controller first + console.log(`Killing controller: ${controllerPattern}`) + try { + execFileSync("docker", ["kill", controllerPattern], { encoding: "utf-8", timeout: 10000 }) + killedContainers.push(controllerPattern) + console.log(`Killed controller container: ${controllerPattern}`) + } catch (_error) { + // Controller might not be running - that's ok, continue to kill runners + console.log(`Controller ${controllerPattern} not running or already stopped`) + } + + // Step 2: Wait 10 seconds before killing runners + console.log("Waiting 10 seconds before killing runners...") + await sleep(10000) + + // Step 3: Find and kill all task runner containers for THIS run only + let taskContainerNames: string[] = [] + + try { + const output = execFileSync("docker", ["ps", "--format", "{{.Names}}", "--filter", `name=${taskPattern}`], { + encoding: "utf-8", + timeout: 10000, + }) + taskContainerNames = output + .split("\n") + .map((name) => name.trim()) + .filter((name) => name.length > 0 && name.startsWith(taskPattern)) + } catch (error) { + console.error("Failed to list task containers:", error) + errors.push("Failed to list Docker task containers") + } + + // Kill each task runner container + for (const containerName of taskContainerNames) { + try { + execFileSync("docker", ["kill", containerName], { encoding: "utf-8", timeout: 10000 }) + killedContainers.push(containerName) + console.log(`Killed task container: ${containerName}`) + } catch (error) { + // Container might have already stopped + console.error(`Failed to kill container ${containerName}:`, error) + errors.push(`Failed to kill container: ${containerName}`) + } + } + + // Step 4: Clear Redis state + try { + const redis = await redisClient() + const heartbeatKey = `heartbeat:${runId}` + const runnersKey = `runners:${runId}` + + await redis.del(heartbeatKey) + await redis.del(runnersKey) + console.log(`Cleared Redis keys: ${heartbeatKey}, ${runnersKey}`) + } catch (error) { + console.error("Failed to clear Redis state:", error) + errors.push("Failed to clear Redis state") + } + } catch (error) { + console.error("Error in killRun:", error) + errors.push("Unexpected error while killing containers") + } + + revalidatePath(`/runs/${runId}`) + revalidatePath("/runs") + + return { + success: killedContainers.length > 0 || errors.length === 0, + killedContainers, + errors, + } +} + +export type DeleteIncompleteRunsResult = { + success: boolean + deletedCount: number + deletedRunIds: number[] + storageErrors: string[] +} + +/** + * Delete all incomplete runs (runs without a taskMetricsId/final score). + * Removes both database records and storage folders.
+ */ +export async function deleteIncompleteRuns(): Promise { + const storageErrors: string[] = [] + + // Get all incomplete runs + const incompleteRuns = await _getIncompleteRuns() + const runIds = incompleteRuns.map((run) => run.id) + + if (runIds.length === 0) { + return { + success: true, + deletedCount: 0, + deletedRunIds: [], + storageErrors: [], + } + } + + // Delete storage folders for each run + for (const runId of runIds) { + const storagePath = path.join(EVALS_STORAGE_PATH, String(runId)) + try { + if (fs.existsSync(storagePath)) { + fs.rmSync(storagePath, { recursive: true, force: true }) + console.log(`Deleted storage folder: ${storagePath}`) + } + } catch (error) { + console.error(`Failed to delete storage folder ${storagePath}:`, error) + storageErrors.push(`Failed to delete storage for run ${runId}`) + } + + // Also try to clear Redis state for any potentially running incomplete runs + try { + const redis = await redisClient() + await redis.del(`heartbeat:${runId}`) + await redis.del(`runners:${runId}`) + } catch (error) { + // Non-critical error, just log it + console.error(`Failed to clear Redis state for run ${runId}:`, error) + } + } + + // Delete from database + await _deleteRunsByIds(runIds) + + revalidatePath("/runs") + + return { + success: true, + deletedCount: runIds.length, + deletedRunIds: runIds, + storageErrors, + } +} + +/** + * Get count of incomplete runs (for UI display) + */ +export async function getIncompleteRunsCount(): Promise { + const incompleteRuns = await _getIncompleteRuns() + return incompleteRuns.length +} + +/** + * Delete all runs older than 30 days. + * Removes both database records and storage folders. + */ +export async function deleteOldRuns(): Promise { + const storageErrors: string[] = [] + + // Get all runs older than 30 days + const thirtyDaysAgo = new Date(Date.now() - 30 * 24 * 60 * 60 * 1000) + const { getRuns } = await import("@roo-code/evals") + const allRuns = await getRuns() + const oldRuns = allRuns.filter((run) => run.createdAt < thirtyDaysAgo) + const runIds = oldRuns.map((run) => run.id) + + if (runIds.length === 0) { + return { + success: true, + deletedCount: 0, + deletedRunIds: [], + storageErrors: [], + } + } + + // Delete storage folders for each run + for (const runId of runIds) { + const storagePath = path.join(EVALS_STORAGE_PATH, String(runId)) + try { + if (fs.existsSync(storagePath)) { + fs.rmSync(storagePath, { recursive: true, force: true }) + console.log(`Deleted storage folder: ${storagePath}`) + } + } catch (error) { + console.error(`Failed to delete storage folder ${storagePath}:`, error) + storageErrors.push(`Failed to delete storage for run ${runId}`) + } + + // Also try to clear Redis state + try { + const redis = await redisClient() + await redis.del(`heartbeat:${runId}`) + await redis.del(`runners:${runId}`) + } catch (error) { + // Non-critical error, just log it + console.error(`Failed to clear Redis state for run ${runId}:`, error) + } + } + + // Delete from database + await _deleteRunsByIds(runIds) + + revalidatePath("/runs") + + return { + success: true, + deletedCount: runIds.length, + deletedRunIds: runIds, + storageErrors, + } +} + +/** + * Update the description of a run. 
+ */ +export async function updateRunDescription(runId: number, description: string | null): Promise<{ success: boolean }> { + try { + await _updateRun(runId, { description }) + revalidatePath("/runs") + revalidatePath(`/runs/${runId}`) + return { success: true } + } catch (error) { + console.error("Failed to update run description:", error) + return { success: false } + } +} diff --git a/apps/web-evals/src/app/api/runs/[id]/logs/[taskId]/route.ts b/apps/web-evals/src/app/api/runs/[id]/logs/[taskId]/route.ts new file mode 100644 index 00000000000..e5ec8751ab0 --- /dev/null +++ b/apps/web-evals/src/app/api/runs/[id]/logs/[taskId]/route.ts @@ -0,0 +1,74 @@ +import { NextResponse } from "next/server" +import type { NextRequest } from "next/server" +import * as fs from "node:fs/promises" +import * as path from "node:path" + +import { findTask, findRun } from "@roo-code/evals" + +export const dynamic = "force-dynamic" + +const LOG_BASE_PATH = "/tmp/evals/runs" + +// Sanitize path components to prevent path traversal attacks +function sanitizePathComponent(component: string): string { + // Remove any path separators, null bytes, and other dangerous characters + return component.replace(/[/\\:\0*?"<>|]/g, "_") +} + +export async function GET(request: NextRequest, { params }: { params: Promise<{ id: string; taskId: string }> }) { + const { id, taskId } = await params + + try { + const runId = Number(id) + const taskIdNum = Number(taskId) + + if (isNaN(runId) || isNaN(taskIdNum)) { + return NextResponse.json({ error: "Invalid run ID or task ID" }, { status: 400 }) + } + + // Verify the run exists + await findRun(runId) + + // Get the task to find its language and exercise + const task = await findTask(taskIdNum) + + // Verify the task belongs to this run + if (task.runId !== runId) { + return NextResponse.json({ error: "Task does not belong to this run" }, { status: 404 }) + } + + // Sanitize language and exercise to prevent path traversal + const safeLanguage = sanitizePathComponent(task.language) + const safeExercise = sanitizePathComponent(task.exercise) + + // Construct the log file path + const logFileName = `${safeLanguage}-${safeExercise}.log` + const logFilePath = path.join(LOG_BASE_PATH, String(runId), logFileName) + + // Verify the resolved path is within the expected directory (defense in depth) + const resolvedPath = path.resolve(logFilePath) + const expectedBase = path.resolve(LOG_BASE_PATH) + if (!resolvedPath.startsWith(expectedBase)) { + return NextResponse.json({ error: "Invalid log path" }, { status: 400 }) + } + + // Check if the log file exists and read it (async) + try { + const logContent = await fs.readFile(logFilePath, "utf-8") + return NextResponse.json({ logContent }) + } catch (err) { + if ((err as NodeJS.ErrnoException).code === "ENOENT") { + return NextResponse.json({ error: "Log file not found", logContent: null }, { status: 200 }) + } + throw err + } + } catch (error) { + console.error("Error reading task log:", error) + + if (error instanceof Error && error.name === "RecordNotFoundError") { + return NextResponse.json({ error: "Task or run not found" }, { status: 404 }) + } + + return NextResponse.json({ error: "Failed to read log file" }, { status: 500 }) + } +} diff --git a/apps/web-evals/src/app/api/runs/[id]/logs/failed/route.ts b/apps/web-evals/src/app/api/runs/[id]/logs/failed/route.ts new file mode 100644 index 00000000000..f8c6cec06be --- /dev/null +++ b/apps/web-evals/src/app/api/runs/[id]/logs/failed/route.ts @@ -0,0 +1,129 @@ +import { NextResponse } from 
"next/server" +import type { NextRequest } from "next/server" +import * as fs from "node:fs" +import * as path from "node:path" +import archiver from "archiver" + +import { findRun, getTasks } from "@roo-code/evals" + +export const dynamic = "force-dynamic" + +const LOG_BASE_PATH = "/tmp/evals/runs" + +// Sanitize path components to prevent path traversal attacks +function sanitizePathComponent(component: string): string { + // Remove any path separators, null bytes, and other dangerous characters + return component.replace(/[/\\:\0*?"<>|]/g, "_") +} + +export async function GET(request: NextRequest, { params }: { params: Promise<{ id: string }> }) { + const { id } = await params + + try { + const runId = Number(id) + + if (isNaN(runId)) { + return NextResponse.json({ error: "Invalid run ID" }, { status: 400 }) + } + + // Verify the run exists + await findRun(runId) + + // Get all tasks for this run + const tasks = await getTasks(runId) + + // Filter for failed tasks only + const failedTasks = tasks.filter((task) => task.passed === false) + + if (failedTasks.length === 0) { + return NextResponse.json({ error: "No failed tasks to export" }, { status: 400 }) + } + + // Create a zip archive + const archive = archiver("zip", { zlib: { level: 9 } }) + + // Collect chunks to build the response + const chunks: Buffer[] = [] + + archive.on("data", (chunk: Buffer) => { + chunks.push(chunk) + }) + + // Track archive errors + let archiveError: Error | null = null + archive.on("error", (err: Error) => { + archiveError = err + }) + + // Set up the end promise before finalizing (proper event listener ordering) + const archiveEndPromise = new Promise((resolve, reject) => { + archive.on("end", resolve) + archive.on("error", reject) + }) + + // Add each failed task's log file to the archive + const logDir = path.join(LOG_BASE_PATH, String(runId)) + let filesAdded = 0 + + for (const task of failedTasks) { + // Sanitize language and exercise to prevent path traversal + const safeLanguage = sanitizePathComponent(task.language) + const safeExercise = sanitizePathComponent(task.exercise) + const logFileName = `${safeLanguage}-${safeExercise}.log` + const logFilePath = path.join(logDir, logFileName) + + // Verify the resolved path is within the expected directory (defense in depth) + const resolvedPath = path.resolve(logFilePath) + const expectedBase = path.resolve(LOG_BASE_PATH) + if (!resolvedPath.startsWith(expectedBase)) { + continue // Skip files with suspicious paths + } + + if (fs.existsSync(logFilePath)) { + archive.file(logFilePath, { name: logFileName }) + filesAdded++ + } + } + + // Check if any files were actually added + if (filesAdded === 0) { + archive.abort() + return NextResponse.json( + { error: "No log files found - they may have been cleared from disk" }, + { status: 404 }, + ) + } + + // Finalize the archive + await archive.finalize() + + // Wait for all data to be collected + await archiveEndPromise + + // Check for archive errors + if (archiveError) { + throw archiveError + } + + // Combine all chunks into a single buffer + const zipBuffer = Buffer.concat(chunks) + + // Return the zip file + return new NextResponse(zipBuffer, { + status: 200, + headers: { + "Content-Type": "application/zip", + "Content-Disposition": `attachment; filename="run-${runId}-failed-logs.zip"`, + "Content-Length": String(zipBuffer.length), + }, + }) + } catch (error) { + console.error("Error exporting failed logs:", error) + + if (error instanceof Error && error.name === "RecordNotFoundError") { + return 
NextResponse.json({ error: "Run not found" }, { status: 404 }) + } + + return NextResponse.json({ error: "Failed to export logs" }, { status: 500 }) + } +} diff --git a/apps/web-evals/src/app/runs/[id]/page.tsx b/apps/web-evals/src/app/runs/[id]/page.tsx index aae3fc70f9b..8b993eec8a0 100644 --- a/apps/web-evals/src/app/runs/[id]/page.tsx +++ b/apps/web-evals/src/app/runs/[id]/page.tsx @@ -7,7 +7,7 @@ export default async function Page({ params }: { params: Promise<{ id: string }> const run = await findRun(Number(id)) return ( -
+
) diff --git a/apps/web-evals/src/app/runs/[id]/run-status.tsx b/apps/web-evals/src/app/runs/[id]/run-status.tsx index 4b94ef14fab..e05b1b51ebe 100644 --- a/apps/web-evals/src/app/runs/[id]/run-status.tsx +++ b/apps/web-evals/src/app/runs/[id]/run-status.tsx @@ -1,55 +1,79 @@ "use client" +import { Link2, Link2Off, CheckCircle2 } from "lucide-react" import type { RunStatus as _RunStatus } from "@/hooks/use-run-status" import { cn } from "@/lib/utils" +import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui" -export const RunStatus = ({ runStatus: { sseStatus, heartbeat, runners = [] } }: { runStatus: _RunStatus }) => ( -
-
-
-
Task Stream:
-
{sseStatus}
-
-
-
-
-
-
-
-
-
Task Controller:
-
{heartbeat ?? "dead"}
-
-
-
-
-
-
-
-
Task Runners:
- {runners.length > 0 &&
{runners?.join(", ")}
} -
-
-) +function StreamIcon({ status }: { status: "connected" | "waiting" | "error" }) { + if (status === "connected") { + return + } + return +} + +export const RunStatus = ({ + runStatus: { sseStatus, heartbeat, runners = [] }, + isComplete = false, +}: { + runStatus: _RunStatus + isComplete?: boolean +}) => { + // For completed runs, show a simple "Complete" badge + if (isComplete) { + return ( + + +
+ +
+
+ + Run complete + +
+ ) + } + + return ( + + +
+ {/* Task Stream status icon */} + + + {/* Task Controller ID */} + {heartbeat ?? "-"} + + {/* Task Runners count */} + 0 ? "text-green-500" : "text-rose-500"}> + {runners.length > 0 ? `${runners.length}r` : "0r"} + +
+
+ +
+
+ + Task Stream: {sseStatus} +
+
+ + Task Controller: {heartbeat ?? "dead"} +
+
+ 0 ? "text-green-500" : "text-rose-500"}>● + Task Runners: {runners.length > 0 ? runners.length : "none"} +
+ {runners.length > 0 && ( +
+ {runners.map((runner) => ( +
{runner}
+ ))} +
+ )} +
+
+
+ ) +} diff --git a/apps/web-evals/src/app/runs/[id]/run.tsx b/apps/web-evals/src/app/runs/[id]/run.tsx index a8ff1484fe7..a4b39100245 100644 --- a/apps/web-evals/src/app/runs/[id]/run.tsx +++ b/apps/web-evals/src/app/runs/[id]/run.tsx @@ -1,12 +1,15 @@ "use client" -import { useMemo } from "react" -import { LoaderCircle } from "lucide-react" +import { useMemo, useState, useCallback, useEffect } from "react" +import { toast } from "sonner" +import { LoaderCircle, FileText, Copy, Check, StopCircle } from "lucide-react" -import type { Run, TaskMetrics as _TaskMetrics } from "@roo-code/evals" +import type { Run, TaskMetrics as _TaskMetrics, Task } from "@roo-code/evals" +import type { ToolName } from "@roo-code/types" import { formatCurrency, formatDuration, formatTokens, formatToolUsageSuccessRate } from "@/lib/formatters" import { useRunStatus } from "@/hooks/use-run-status" +import { killRun } from "@/actions/runs" import { Table, TableBody, @@ -17,6 +20,20 @@ import { Tooltip, TooltipContent, TooltipTrigger, + Dialog, + DialogContent, + DialogHeader, + DialogTitle, + ScrollArea, + Button, + AlertDialog, + AlertDialogAction, + AlertDialogCancel, + AlertDialogContent, + AlertDialogDescription, + AlertDialogFooter, + AlertDialogHeader, + AlertDialogTitle, } from "@/components/ui" import { TaskStatus } from "./task-status" @@ -35,9 +52,288 @@ function getToolAbbreviation(toolName: string): string { .join("") } +// Pattern definitions for syntax highlighting +type HighlightPattern = { + pattern: RegExp + className: string + // If true, wraps the entire match; if a number, wraps that capture group + wrapGroup?: number +} + +const HIGHLIGHT_PATTERNS: HighlightPattern[] = [ + // Log levels - styled as badges + { pattern: /\|\s*(INFO)\s*\|/g, className: "text-green-400", wrapGroup: 1 }, + { pattern: /\|\s*(WARN|WARNING)\s*\|/g, className: "text-yellow-400", wrapGroup: 1 }, + { pattern: /\|\s*(ERROR)\s*\|/g, className: "text-red-400 font-semibold", wrapGroup: 1 }, + { pattern: /\|\s*(DEBUG)\s*\|/g, className: "text-gray-400", wrapGroup: 1 }, + // Task identifiers - important events + { + pattern: /(taskCreated|taskFocused|taskStarted|taskCompleted|taskAborted|taskResumable)/g, + className: "text-purple-400 font-medium", + }, + // Tool failures - highlight in red + { pattern: /(taskToolFailed)/g, className: "text-red-400 font-bold" }, + { pattern: /(Tool execution failed|tool.*failed|failed.*tool)/gi, className: "text-red-400" }, + { pattern: /(EvalPass)/g, className: "text-green-400 font-bold" }, + { pattern: /(EvalFail)/g, className: "text-red-400 font-bold" }, + // Message arrows + { pattern: /→/g, className: "text-cyan-400" }, + // Tool names in quotes + { pattern: /"(tool)":\s*"([^"]+)"/g, className: "text-orange-400" }, + // JSON keys + { pattern: /"([^"]+)":/g, className: "text-sky-300" }, + // Boolean values + { pattern: /:\s*(true|false)/g, className: "text-amber-400", wrapGroup: 1 }, + // Numbers + { pattern: /:\s*(-?\d+\.?\d*)/g, className: "text-emerald-400", wrapGroup: 1 }, +] + +// Extract timestamp from a log line and return elapsed time from baseline +function formatElapsedTime(timestamp: string, baselineMs: number): string { + const currentMs = new Date(timestamp).getTime() + const elapsedMs = currentMs - baselineMs + const totalSeconds = Math.floor(elapsedMs / 1000) + const minutes = Math.floor(totalSeconds / 60) + const seconds = totalSeconds % 60 + return `${minutes.toString().padStart(2, "0")}:${seconds.toString().padStart(2, "0")}` +} + +// Extract the first timestamp from the 
log to use as baseline +function extractFirstTimestamp(log: string): number | null { + // Match timestamp at start of line: [2025-11-28T09:35:23.187Z | ... or [2025-11-28T09:35:23.187Z] + const match = log.match(/\[(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z)[\s|\]]/) + const isoString = match?.[1] + if (!isoString) return null + return new Date(isoString).getTime() +} + +// Simplify log line by removing redundant metadata +function simplifyLogLine(line: string, baselineMs: number | null): { timestamp: string; simplified: string } { + // Extract timestamp - matches [2025-11-28T09:35:23.187Z | ... format + const timestampMatch = line.match(/\[(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z)[\s|\]]/) + const isoTimestamp = timestampMatch?.[1] + if (!isoTimestamp) { + return { timestamp: "", simplified: line } + } + + const timestamp = baselineMs !== null ? formatElapsedTime(isoTimestamp, baselineMs) : isoTimestamp.slice(11, 19) + + // Remove the timestamp from the line (handles both [timestamp] and [timestamp | formats) + let simplified = line.replace(/\[\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z\s*\|?\s*/, "") + + // Remove redundant metadata: pid, run, task IDs (they're same for entire log) + simplified = simplified.replace(/\|\s*pid:\d+\s*/g, "") + simplified = simplified.replace(/\|\s*run:\d+\s*/g, "") + simplified = simplified.replace(/\|\s*task:\d+\s*/g, "") + simplified = simplified.replace(/runTask\s*\|\s*/g, "") + + // Clean up extra pipes, spaces, and trailing brackets + simplified = simplified.replace(/\|\s*\|/g, "|") + simplified = simplified.replace(/^\s*\|\s*/, "") + simplified = simplified.replace(/\]\s*$/, "") // Remove trailing bracket if present + + return { timestamp, simplified } +} + +// Format a single line with syntax highlighting using React elements (XSS-safe) +function formatLine(line: string): React.ReactNode[] { + // Find all matches with their positions + type Match = { start: number; end: number; text: string; className: string } + const matches: Match[] = [] + + for (const { pattern, className, wrapGroup } of HIGHLIGHT_PATTERNS) { + // Reset regex state + pattern.lastIndex = 0 + let regexMatch + while ((regexMatch = pattern.exec(line)) !== null) { + const capturedText = wrapGroup !== undefined ? regexMatch[wrapGroup] : regexMatch[0] + // Skip if capture group didn't match + if (!capturedText) continue + const start = + wrapGroup !== undefined ? regexMatch.index + regexMatch[0].indexOf(capturedText) : regexMatch.index + matches.push({ + start, + end: start + capturedText.length, + text: capturedText, + className, + }) + } + } + + // Sort matches by position and filter overlapping ones + matches.sort((a, b) => a.start - b.start) + const filteredMatches: Match[] = [] + for (const m of matches) { + const lastMatch = filteredMatches[filteredMatches.length - 1] + if (!lastMatch || m.start >= lastMatch.end) { + filteredMatches.push(m) + } + } + + // Build result with highlighted spans + const result: React.ReactNode[] = [] + let currentPos = 0 + + for (const [i, m] of filteredMatches.entries()) { + // Add text before this match + if (m.start > currentPos) { + result.push(line.slice(currentPos, m.start)) + } + // Add highlighted match + result.push( + + {m.text} + , + ) + currentPos = m.end + } + + // Add remaining text + if (currentPos < line.length) { + result.push(line.slice(currentPos)) + } + + return result.length > 0 ? 
result : [line] +} + +// Determine the visual style for a log line based on its content +function getLineStyle(line: string): string { + if (line.includes("ERROR")) return "bg-red-950/30 border-l-2 border-red-500" + if (line.includes("WARN") || line.includes("WARNING")) return "bg-yellow-950/20 border-l-2 border-yellow-500" + if (line.includes("taskToolFailed")) return "bg-red-950/30 border-l-2 border-red-500" + if (line.includes("taskStarted") || line.includes("taskCreated")) return "bg-purple-950/20" + if (line.includes("EvalPass")) return "bg-green-950/30 border-l-2 border-green-500" + if (line.includes("EvalFail")) return "bg-red-950/30 border-l-2 border-red-500" + if (line.includes("taskCompleted") || line.includes("taskAborted")) return "bg-blue-950/20" + return "" +} + +// Format log content with basic highlighting (XSS-safe - no dangerouslySetInnerHTML) +function formatLogContent(log: string): React.ReactNode[] { + const lines = log.split("\n") + const baselineMs = extractFirstTimestamp(log) + + return lines.map((line, index) => { + if (!line.trim()) { + return ( +
+ {" "} +
+ ) + } + + const parsed = simplifyLogLine(line, baselineMs) + const lineStyle = getLineStyle(line) + + return ( +
+ {/* Elapsed time */} + + {parsed.timestamp} + + {/* Log content - pl-12 ensures wrapped lines are indented under the timestamp */} + + {formatLine(parsed.simplified)} + +
+ ) + }) +} + export function Run({ run }: { run: Run }) { const runStatus = useRunStatus(run) - const { tasks, tokenUsage, usageUpdatedAt } = runStatus + const { tasks, tokenUsage, usageUpdatedAt, heartbeat, runners } = runStatus + + const [selectedTask, setSelectedTask] = useState(null) + const [taskLog, setTaskLog] = useState(null) + const [isLoadingLog, setIsLoadingLog] = useState(false) + const [copied, setCopied] = useState(false) + const [showKillDialog, setShowKillDialog] = useState(false) + const [isKilling, setIsKilling] = useState(false) + + // Determine if run is still active (has heartbeat or runners) + const isRunActive = !run.taskMetricsId && (!!heartbeat || (runners && runners.length > 0)) + + const onKillRun = useCallback(async () => { + setIsKilling(true) + try { + const result = await killRun(run.id) + if (result.killedContainers.length > 0) { + toast.success(`Killed ${result.killedContainers.length} container(s)`) + } else if (result.errors.length === 0) { + toast.info("No running containers found") + } else { + toast.error(result.errors.join(", ")) + } + } catch (error) { + console.error("Failed to kill run:", error) + toast.error("Failed to kill run") + } finally { + setIsKilling(false) + setShowKillDialog(false) + } + }, [run.id]) + + const onCopyLog = useCallback(async () => { + if (!taskLog) return + + try { + await navigator.clipboard.writeText(taskLog) + setCopied(true) + toast.success("Log copied to clipboard") + setTimeout(() => setCopied(false), 2000) + } catch (error) { + console.error("Failed to copy log:", error) + toast.error("Failed to copy log") + } + }, [taskLog]) + + // Handle ESC key to close the dialog + useEffect(() => { + const handleKeyDown = (e: KeyboardEvent) => { + if (e.key === "Escape" && selectedTask) { + setSelectedTask(null) + } + } + + document.addEventListener("keydown", handleKeyDown) + return () => document.removeEventListener("keydown", handleKeyDown) + }, [selectedTask]) + + const onViewTaskLog = useCallback( + async (task: Task) => { + // Only allow viewing logs for tasks that have started + if (!task.startedAt && !tokenUsage.get(task.id)) { + toast.error("Task has not started yet") + return + } + + setSelectedTask(task) + setIsLoadingLog(true) + setTaskLog(null) + + try { + const response = await fetch(`/api/runs/${run.id}/logs/${task.id}`) + + if (!response.ok) { + const error = await response.json() + toast.error(error.error || "Failed to load log") + setSelectedTask(null) + return + } + + const data = await response.json() + setTaskLog(data.logContent) + } catch (error) { + console.error("Error loading task log:", error) + toast.error("Failed to load log") + setSelectedTask(null) + } finally { + setIsLoadingLog(false) + } + }, + [run.id, tokenUsage], + ) const taskMetrics: Record = useMemo(() => { const metrics: Record = {} @@ -62,22 +358,34 @@ export function Run({ run }: { run: Run }) { // eslint-disable-next-line react-hooks/exhaustive-deps }, [tasks, tokenUsage, usageUpdatedAt]) + // Collect all unique tool names from all tasks and sort by total attempts + const toolColumns = useMemo(() => { + if (!tasks) return [] + + const toolTotals = new Map() + + for (const task of tasks) { + if (task.taskMetrics?.toolUsage) { + for (const [toolName, usage] of Object.entries(task.taskMetrics.toolUsage)) { + const tool = toolName as ToolName + const current = toolTotals.get(tool) ?? 
0 + toolTotals.set(tool, current + usage.attempts) + } + } + } + + // Sort by total attempts descending + return Array.from(toolTotals.entries()) + .sort((a, b) => b[1] - a[1]) + .map(([name]): ToolName => name) + }, [tasks]) + // Compute aggregate stats const stats = useMemo(() => { if (!tasks) return null const passed = tasks.filter((t) => t.passed === true).length const failed = tasks.filter((t) => t.passed === false).length - // Count running tasks exactly like TaskStatus shows spinner: - // - passed is not true and not false (null/undefined) - // - AND has activity (startedAt or tokenUsage) - const running = tasks.filter( - (t) => t.passed !== true && t.passed !== false && (t.startedAt || tokenUsage.get(t.id)), - ).length - const pending = tasks.filter( - (t) => t.passed !== true && t.passed !== false && !t.startedAt && !tokenUsage.get(t.id), - ).length - const total = tasks.length const completed = passed + failed let totalTokensIn = 0 @@ -113,9 +421,6 @@ export function Run({ run }: { run: Run }) { return { passed, failed, - running, - pending, - total, completed, passRate: completed > 0 ? ((passed / completed) * 100).toFixed(1) : null, totalTokensIn, @@ -127,42 +432,96 @@ export function Run({ run }: { run: Run }) { // eslint-disable-next-line react-hooks/exhaustive-deps }, [tasks, taskMetrics, tokenUsage, usageUpdatedAt]) + // Calculate elapsed time (wall-clock time from run creation to completion or now) + const elapsedTime = useMemo(() => { + if (!tasks || tasks.length === 0) return null + + const startTime = new Date(run.createdAt).getTime() + + // If run is complete, find the latest finishedAt from tasks + if (run.taskMetricsId) { + const latestFinish = tasks.reduce((latest, task) => { + if (task.finishedAt) { + const finishTime = new Date(task.finishedAt).getTime() + return finishTime > latest ? finishTime : latest + } + return latest + }, startTime) + return latestFinish - startTime + } + + // If still running, use current time + return Date.now() - startTime + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [tasks, run.createdAt, run.taskMetricsId, usageUpdatedAt]) + return ( <>
-
-
-
{run.model}
- {run.description &&
{run.description}
} -
- {!run.taskMetricsId && } -
- {stats && ( -
+
+ {/* Provider, Model title and status */} +
+ {run.settings?.apiProvider && ( + {run.settings.apiProvider} + )} +
{run.model}
+ + {run.description && ( + - {run.description} + )} + {isRunActive && ( + + + + + Stop all containers for this run + + )} +
{/* Main Stats Row */} -
+
{/* Passed/Failed */} -
+
{stats.passed} / {stats.failed} - {stats.running > 0 && ( - ({stats.running}) - )}
Passed / Failed
{/* Pass Rate */} -
-
{stats.passRate ? `${stats.passRate}%` : "-"}
+
+
= 80 + ? "text-yellow-500" + : "text-red-500" + }`}> + {stats.passRate ? `${stats.passRate}%` : "-"} +
Pass Rate
{/* Tokens */} -
+
{formatTokens(stats.totalTokensIn)} / @@ -172,58 +531,64 @@ export function Run({ run }: { run: Run }) {
{/* Cost */} -
+
{formatCurrency(stats.totalCost)}
Cost
{/* Duration */} -
+
{stats.totalDuration > 0 ? formatDuration(stats.totalDuration) : "-"}
Duration
- {/* Tool Usage - Inline */} - {Object.keys(stats.toolUsage).length > 0 && ( -
- {Object.entries(stats.toolUsage) - .sort(([, a], [, b]) => b.attempts - a.attempts) - .map(([toolName, usage]) => { - const abbr = getToolAbbreviation(toolName) - const successRate = - usage.attempts > 0 - ? ((usage.attempts - usage.failures) / usage.attempts) * 100 - : 100 - const rateColor = - successRate === 100 - ? "text-green-500" - : successRate >= 80 - ? "text-yellow-500" - : "text-red-500" - return ( - - -
- - {abbr} - - - {usage.attempts} - - - {formatToolUsageSuccessRate(usage)} - -
-
- {toolName} -
- ) - })} + {/* Elapsed Time */} +
+
+ {elapsedTime !== null ? formatDuration(elapsedTime) : "-"}
- )} +
Elapsed
+
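// The "Elapsed" tile above shows wall-clock time rather than summed task durations. A minimal
// sketch of that calculation, with Run and Task trimmed to just the fields it reads
// (createdAt, taskMetricsId, finishedAt); the real types come from @roo-code/evals.
function elapsedMs(
	run: { createdAt: string | Date; taskMetricsId: number | null },
	tasks: { finishedAt: string | Date | null }[],
): number {
	const start = new Date(run.createdAt).getTime()
	if (run.taskMetricsId) {
		// Completed run: measure to the latest task finish time.
		return (
			tasks.reduce(
				(latest, task) => (task.finishedAt ? Math.max(latest, new Date(task.finishedAt).getTime()) : latest),
				start,
			) - start
		)
	}
	// Run still in progress: measure up to now.
	return Date.now() - start
}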
+ + {/* Tool Usage Row */} + {Object.keys(stats.toolUsage).length > 0 && ( +
+ {Object.entries(stats.toolUsage) + .sort(([, a], [, b]) => b.attempts - a.attempts) + .map(([toolName, usage]) => { + const abbr = getToolAbbreviation(toolName) + const successRate = + usage.attempts > 0 + ? ((usage.attempts - usage.failures) / usage.attempts) * 100 + : 100 + const rateColor = + successRate === 100 + ? "text-green-500" + : successRate >= 80 + ? "text-yellow-500" + : "text-red-500" + return ( + + +
+ + {abbr} + + {usage.attempts} + + {formatToolUsageSuccessRate(usage)} + +
+
+ {toolName} +
+ ) + })} +
+ )}
)} {!tasks ? ( @@ -235,53 +600,206 @@ export function Run({ run }: { run: Run }) { Exercise Tokens In / Out Context + {toolColumns.map((toolName) => ( + + + {getToolAbbreviation(toolName)} + {toolName} + + + ))} Duration Cost - {tasks.map((task) => ( - - -
- -
- {task.language}/{task.exercise} -
-
-
- {taskMetrics[task.id] ? ( - <> - -
-
{formatTokens(taskMetrics[task.id]!.tokensIn)}
/ -
{formatTokens(taskMetrics[task.id]!.tokensOut)}
+ {tasks.map((task) => { + const hasStarted = !!task.startedAt || !!tokenUsage.get(task.id) + return ( + hasStarted && onViewTaskLog(task)}> + +
+ +
+ + {task.language}/{task.exercise} + {task.iteration > 1 && ( + + (#{task.iteration}) + + )} + + {hasStarted && ( + + + + + Click to view log + + )}
- - - {formatTokens(taskMetrics[task.id]!.tokensContext)} - - - {taskMetrics[task.id]!.duration - ? formatDuration(taskMetrics[task.id]!.duration) - : "-"} - - - {formatCurrency(taskMetrics[task.id]!.cost)} - - - ) : ( - - )} - - ))} +
+
+ {taskMetrics[task.id] ? ( + <> + +
+
{formatTokens(taskMetrics[task.id]!.tokensIn)}
/ +
{formatTokens(taskMetrics[task.id]!.tokensOut)}
+
+
+ + {formatTokens(taskMetrics[task.id]!.tokensContext)} + + {toolColumns.map((toolName) => { + const usage = task.taskMetrics?.toolUsage?.[toolName] + const successRate = + usage && usage.attempts > 0 + ? ((usage.attempts - usage.failures) / usage.attempts) * 100 + : 100 + const rateColor = + successRate === 100 + ? "text-muted-foreground" + : successRate >= 80 + ? "text-yellow-500" + : "text-red-500" + return ( + + {usage ? ( +
+ + {usage.attempts} + + + {formatToolUsageSuccessRate(usage)} + +
+ ) : ( + - + )} +
+ ) + })} + + {taskMetrics[task.id]!.duration + ? formatDuration(taskMetrics[task.id]!.duration) + : "-"} + + + {formatCurrency(taskMetrics[task.id]!.cost)} + + + ) : ( + + )} +
+ ) + })} )}
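// A standalone sketch of the mm:ss timestamps rendered by the task log dialog below: the first
// ISO timestamp in a log becomes the 00:00 baseline and every later line is shown relative to
// it. The sample timestamps here are illustrative only.
function elapsedLabel(isoTimestamp: string, baselineMs: number): string {
	const totalSeconds = Math.floor((new Date(isoTimestamp).getTime() - baselineMs) / 1000)
	const minutes = Math.floor(totalSeconds / 60)
	const seconds = totalSeconds % 60
	return `${String(minutes).padStart(2, "0")}:${String(seconds).padStart(2, "0")}`
}

const sampleBaselineMs = new Date("2025-11-28T09:35:23.187Z").getTime()
console.log(elapsedLabel("2025-11-28T09:36:48.187Z", sampleBaselineMs)) // "01:25"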
+ + {/* Task Log Dialog - Full Screen */} + setSelectedTask(null)}> + + +
+ + + {selectedTask?.language}/{selectedTask?.exercise} + {selectedTask?.iteration && selectedTask.iteration > 1 && ( + (#{selectedTask.iteration}) + )} + + ( + {selectedTask?.passed === true + ? "Passed" + : selectedTask?.passed === false + ? "Failed" + : "Running"} + ) + + + {taskLog && ( + + )} +
+
+
+ {isLoadingLog ? ( +
+ +
+ ) : taskLog ? ( + +
+ {formatLogContent(taskLog)} +
+
+ ) : ( +
+ Log file not available (it may have been cleared from disk) +
+ )} +
+
+
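// How the dialog above loads a log, sketched as a typed helper. The response shape
// ({ logContent } on success, { error } on failure) is inferred from the onViewTaskLog handler;
// the route may return additional fields not shown here.
async function fetchTaskLog(runId: number, taskId: number): Promise<string> {
	const response = await fetch(`/api/runs/${runId}/logs/${taskId}`)
	if (!response.ok) {
		const body = (await response.json()) as { error?: string }
		throw new Error(body.error || "Failed to load log")
	}
	const body = (await response.json()) as { logContent: string }
	return body.logContent
}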
+ + {/* Kill Run Confirmation Dialog */} + + + + Kill Run? + + This will stop the controller and all task runner containers for this run. Any running tasks + will be terminated immediately. This action cannot be undone. + + + + Cancel + + {isKilling ? ( + <> + + Killing... + + ) : ( + "Kill Run" + )} + + + + ) } diff --git a/apps/web-evals/src/app/runs/new/new-run.tsx b/apps/web-evals/src/app/runs/new/new-run.tsx index 5cba0058c8a..be015ac8ca3 100644 --- a/apps/web-evals/src/app/runs/new/new-run.tsx +++ b/apps/web-evals/src/app/runs/new/new-run.tsx @@ -7,7 +7,7 @@ import { useQuery } from "@tanstack/react-query" import { useForm, FormProvider } from "react-hook-form" import { zodResolver } from "@hookform/resolvers/zod" import { toast } from "sonner" -import { X, Rocket, Check, ChevronsUpDown, SlidersHorizontal } from "lucide-react" +import { X, Rocket, Check, ChevronsUpDown, SlidersHorizontal, Info, Plus, Minus } from "lucide-react" import { globalSettingsSchema, @@ -30,6 +30,9 @@ import { TIMEOUT_MIN, TIMEOUT_MAX, TIMEOUT_DEFAULT, + ITERATIONS_MIN, + ITERATIONS_MAX, + ITERATIONS_DEFAULT, } from "@/lib/schemas" import { cn } from "@/lib/utils" @@ -61,7 +64,9 @@ import { PopoverTrigger, Slider, Label, - FormDescription, + Tooltip, + TooltipContent, + TooltipTrigger, } from "@/components/ui" import { SettingsDiff } from "./settings-diff" @@ -72,17 +77,38 @@ type ImportedSettings = { currentApiConfigName: string } +// Type for a model selection entry +type ModelSelection = { + id: string + model: string + popoverOpen: boolean +} + +// Type for a config selection entry (for import mode) +type ConfigSelection = { + id: string + configName: string + popoverOpen: boolean +} + export function NewRun() { const router = useRouter() - const [provider, setModelSource] = useState<"roo" | "openrouter" | "other">("roo") - const [modelPopoverOpen, setModelPopoverOpen] = useState(false) + const [provider, setModelSource] = useState<"roo" | "openrouter" | "other">("other") const [useNativeToolProtocol, setUseNativeToolProtocol] = useState(true) + const [commandExecutionTimeout, setCommandExecutionTimeout] = useState(20) + const [terminalShellIntegrationTimeout, setTerminalShellIntegrationTimeout] = useState(30) // seconds + + // State for multiple model selections + const [modelSelections, setModelSelections] = useState([ + { id: crypto.randomUUID(), model: "", popoverOpen: false }, + ]) - // State for imported settings with config selection + // State for imported settings with multiple config selections const [importedSettings, setImportedSettings] = useState(null) - const [selectedConfigName, setSelectedConfigName] = useState("") - const [configPopoverOpen, setConfigPopoverOpen] = useState(false) + const [configSelections, setConfigSelections] = useState([ + { id: crypto.randomUUID(), configName: "", popoverOpen: false }, + ]) const openRouter = useOpenRouterModels() const rooCodeCloud = useRooCodeCloudModels() @@ -106,6 +132,7 @@ export function NewRun() { settings: undefined, concurrency: CONCURRENCY_DEFAULT, timeout: TIMEOUT_DEFAULT, + iterations: ITERATIONS_DEFAULT, jobToken: "", }, }) @@ -117,9 +144,9 @@ export function NewRun() { formState: { isSubmitting }, } = form - const [model, suite, settings] = watch(["model", "suite", "settings", "concurrency"]) + const [suite, settings] = watch(["suite", "settings", "concurrency"]) - // Load concurrency and timeout from localStorage on mount + // Load settings from localStorage on mount useEffect(() => { const savedConcurrency = 
localStorage.getItem("evals-concurrency") if (savedConcurrency) { @@ -135,6 +162,37 @@ export function NewRun() { setValue("timeout", parsed) } } + const savedCommandTimeout = localStorage.getItem("evals-command-execution-timeout") + if (savedCommandTimeout) { + const parsed = parseInt(savedCommandTimeout, 10) + if (!isNaN(parsed) && parsed >= 20 && parsed <= 60) { + setCommandExecutionTimeout(parsed) + } + } + const savedShellTimeout = localStorage.getItem("evals-shell-integration-timeout") + if (savedShellTimeout) { + const parsed = parseInt(savedShellTimeout, 10) + if (!isNaN(parsed) && parsed >= 30 && parsed <= 60) { + setTerminalShellIntegrationTimeout(parsed) + } + } + // Load saved exercises selection + const savedSuite = localStorage.getItem("evals-suite") + if (savedSuite === "partial") { + setValue("suite", "partial") + const savedExercises = localStorage.getItem("evals-exercises") + if (savedExercises) { + try { + const parsed = JSON.parse(savedExercises) as string[] + if (Array.isArray(parsed)) { + setSelectedExercises(parsed) + setValue("exercises", parsed) + } + } catch { + // Invalid JSON, ignore + } + } + } }, [setValue]) // Extract unique languages from exercises @@ -178,6 +236,7 @@ export function NewRun() { setSelectedExercises(newSelected) setValue("exercises", newSelected) + localStorage.setItem("evals-exercises", JSON.stringify(newSelected)) }, [getExercisesForLanguage, selectedExercises, setValue], ) @@ -201,40 +260,165 @@ export function NewRun() { [getExercisesForLanguage, selectedExercises], ) + // Add a new model selection + const addModelSelection = useCallback(() => { + setModelSelections((prev) => [...prev, { id: crypto.randomUUID(), model: "", popoverOpen: false }]) + }, []) + + // Remove a model selection + const removeModelSelection = useCallback((id: string) => { + setModelSelections((prev) => prev.filter((s) => s.id !== id)) + }, []) + + // Update a model selection + const updateModelSelection = useCallback( + (id: string, model: string) => { + setModelSelections((prev) => prev.map((s) => (s.id === id ? { ...s, model, popoverOpen: false } : s))) + // Also set the form model field for validation (use first non-empty model) + setValue("model", model) + }, + [setValue], + ) + + // Toggle popover for a model selection + const toggleModelPopover = useCallback((id: string, open: boolean) => { + setModelSelections((prev) => prev.map((s) => (s.id === id ? { ...s, popoverOpen: open } : s))) + }, []) + + // Add a new config selection + const addConfigSelection = useCallback(() => { + setConfigSelections((prev) => [...prev, { id: crypto.randomUUID(), configName: "", popoverOpen: false }]) + }, []) + + // Remove a config selection + const removeConfigSelection = useCallback((id: string) => { + setConfigSelections((prev) => prev.filter((s) => s.id !== id)) + }, []) + + // Update a config selection + const updateConfigSelection = useCallback( + (id: string, configName: string) => { + setConfigSelections((prev) => prev.map((s) => (s.id === id ? { ...s, configName, popoverOpen: false } : s))) + // Also update the form settings for the first config (for validation) + if (importedSettings) { + const providerSettings = importedSettings.apiConfigs[configName] ?? {} + setValue("model", getModelId(providerSettings) ?? 
"") + setValue("settings", { ...EVALS_SETTINGS, ...providerSettings, ...importedSettings.globalSettings }) + } + }, + [importedSettings, setValue], + ) + + // Toggle popover for a config selection + const toggleConfigPopover = useCallback((id: string, open: boolean) => { + setConfigSelections((prev) => prev.map((s) => (s.id === id ? { ...s, popoverOpen: open } : s))) + }, []) + const onSubmit = useCallback( async (values: CreateRun) => { try { - if (provider === "openrouter") { - values.settings = { - ...(values.settings || {}), - apiProvider: "openrouter", - openRouterModelId: model, - toolStyle: useNativeToolProtocol ? "json" : "xml", // kilocode_change + // Validate jobToken for Roo Code Cloud provider + if (provider === "roo" && !values.jobToken?.trim()) { + toast.error("Roo Code Cloud Token is required") + return + } + + // Determine which selections to use based on provider + const selectionsToLaunch: { model: string; configName?: string }[] = [] + + if (provider === "other") { + // For import mode, use config selections + for (const config of configSelections) { + if (config.configName) { + selectionsToLaunch.push({ model: "", configName: config.configName }) + } } - } else if (provider === "roo") { - values.settings = { - ...(values.settings || {}), - apiProvider: "roo", - apiModelId: model, - toolStyle: useNativeToolProtocol ? "json" : "xml", // kilocode_change + } else { + // For openrouter/roo, use model selections + for (const selection of modelSelections) { + if (selection.model) { + selectionsToLaunch.push({ model: selection.model }) + } + } + } + + if (selectionsToLaunch.length === 0) { + toast.error("Please select at least one model or config") + return + } + + // Show launching toast + const totalRuns = selectionsToLaunch.length + toast.info(totalRuns > 1 ? `Launching ${totalRuns} runs (every 20 seconds)...` : "Launching run...") + + // Launch runs with 20-second delay between each + for (let i = 0; i < selectionsToLaunch.length; i++) { + const selection = selectionsToLaunch[i]! + + // Wait 20 seconds between runs (except for the first one) + if (i > 0) { + await new Promise((resolve) => setTimeout(resolve, 20000)) + } + + const runValues = { ...values } + + if (provider === "openrouter") { + runValues.model = selection.model + runValues.settings = { + ...(runValues.settings || {}), + apiProvider: "openrouter", + openRouterModelId: selection.model, + toolProtocol: useNativeToolProtocol ? "native" : "xml", + commandExecutionTimeout, + terminalShellIntegrationTimeout: terminalShellIntegrationTimeout * 1000, + } + } else if (provider === "roo") { + runValues.model = selection.model + runValues.settings = { + ...(runValues.settings || {}), + apiProvider: "roo", + apiModelId: selection.model, + toolProtocol: useNativeToolProtocol ? "native" : "xml", + commandExecutionTimeout, + terminalShellIntegrationTimeout: terminalShellIntegrationTimeout * 1000, + } + } else if (provider === "other" && selection.configName && importedSettings) { + const providerSettings = importedSettings.apiConfigs[selection.configName] ?? {} + runValues.model = getModelId(providerSettings) ?? "" + runValues.settings = { + ...EVALS_SETTINGS, + ...providerSettings, + ...importedSettings.globalSettings, + toolProtocol: useNativeToolProtocol ? 
"native" : "xml", + commandExecutionTimeout, + terminalShellIntegrationTimeout: terminalShellIntegrationTimeout * 1000, + } + } + + try { + await createRun(runValues) + toast.success(`Run ${i + 1}/${totalRuns} launched`) + } catch (e) { + toast.error(`Run ${i + 1} failed: ${e instanceof Error ? e.message : "Unknown error"}`) } } - const { id } = await createRun(values) - router.push(`/runs/${id}`) + // Navigate back to main evals UI + router.push("/") } catch (e) { toast.error(e instanceof Error ? e.message : "An unknown error occurred.") } }, - [provider, model, router, useNativeToolProtocol], - ) - - const onSelectModel = useCallback( - (model: string) => { - setValue("model", model) - setModelPopoverOpen(false) - }, - [setValue, setModelPopoverOpen], + [ + provider, + modelSelections, + configSelections, + importedSettings, + router, + useNativeToolProtocol, + commandExecutionTimeout, + terminalShellIntegrationTimeout, + ], ) const onImportSettings = useCallback( @@ -265,9 +449,9 @@ export function NewRun() { currentApiConfigName: providerProfiles.currentApiConfigName, }) - // Default to the current config + // Default to the current config for the first selection const defaultConfigName = providerProfiles.currentApiConfigName - setSelectedConfigName(defaultConfigName) + setConfigSelections([{ id: crypto.randomUUID(), configName: defaultConfigName, popoverOpen: false }]) // Apply the default config const providerSettings = providerProfiles.apiConfigs[defaultConfigName] ?? {} @@ -283,22 +467,6 @@ export function NewRun() { [clearErrors, setValue], ) - const onSelectConfig = useCallback( - (configName: string) => { - if (!importedSettings) { - return - } - - setSelectedConfigName(configName) - setConfigPopoverOpen(false) - - const providerSettings = importedSettings.apiConfigs[configName] ?? {} - setValue("model", getModelId(providerSettings) ?? "") - setValue("settings", { ...EVALS_SETTINGS, ...providerSettings, ...importedSettings.globalSettings }) - }, - [importedSettings, setValue], - ) - return ( <> @@ -314,9 +482,9 @@ export function NewRun() { value={provider} onValueChange={(value) => setModelSource(value as "roo" | "openrouter" | "other")}> + Import Roo Code Cloud OpenRouter - Other @@ -338,121 +506,217 @@ export function NewRun() { onChange={onImportSettings} /> - {importedSettings && Object.keys(importedSettings.apiConfigs).length > 1 && ( -
- - - - - - - - - - No config found. - - {Object.keys(importedSettings.apiConfigs).map( - (configName) => ( - - {configName} - {configName === - importedSettings.currentApiConfigName && ( - - (default) - - )} - 0 && ( +
+ + {configSelections.map((selection, index) => ( +
+ + toggleConfigPopover(selection.id, open) + }> + + + + + + + + No config found. + + {Object.keys( + importedSettings.apiConfigs, + ).map((configName) => ( + + updateConfigSelection( + selection.id, + configName, + ) + }> + {configName} + {configName === + importedSettings.currentApiConfigName && ( + + (default) + )} - /> - - ), - )} - - - - - + + + ))} + + + + + + {index === configSelections.length - 1 ? ( + + ) : ( + + )} +
+ ))}
)} +
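// On submit, this form launches one run per selected model or config with a 20-second pause
// between launches. A generic sketch of that loop; launchOne stands in for the real createRun
// call plus per-run settings assembly.
async function launchStaggered<T>(selections: T[], launchOne: (selection: T, index: number) => Promise<void>) {
	for (let i = 0; i < selections.length; i++) {
		if (i > 0) {
			// Space launches out so the runs do not all start at the same moment.
			await new Promise((resolve) => setTimeout(resolve, 20_000))
		}
		await launchOne(selections[i]!, i)
	}
}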
+ +
+ +
+
+ {settings && ( )}
) : ( <> - - - - - - - + {modelSelections.map((selection, index) => ( +
+ toggleModelPopover(selection.id, open)}> + + + + + + + + No model found. + + {models?.map(({ id, name }) => ( + + updateModelSelection( + selection.id, + id, + ) + }> + {name} + + + ))} + + + + + + {index === modelSelections.length - 1 ? ( + + ) : ( + + )} +
+ ))} +
+ +
+ +
+
)} @@ -468,20 +732,28 @@ export function NewRun() { name="jobToken" render={({ field }) => ( - Roo Code Cloud Token +
+ Roo Code Cloud Token + + + + + +

+ If you have access to the Roo Code Cloud repository and the + decryption key for the .env.* files, generate a token with: +

+ + pnpm --filter @roo-code-cloud/auth production:create-auth-token + [email] [org] [ttl] + +
+
+
- + - - If you have access to the Roo Code Cloud repository then you can generate a - token with: -
- - pnpm --filter @roo-code-cloud/auth production:create-job-token [org] - [timeout] - -
)} /> @@ -495,12 +767,14 @@ export function NewRun() { Exercises
{ setValue("suite", value as "full" | "partial") + localStorage.setItem("evals-suite", value) if (value === "full") { setSelectedExercises([]) setValue("exercises", []) + localStorage.removeItem("evals-exercises") } }}> @@ -537,6 +811,7 @@ export function NewRun() { onValueChange={(value) => { setSelectedExercises(value) setValue("exercises", value) + localStorage.setItem("evals-exercises", JSON.stringify(value)) }} placeholder="Select" variant="inverted" @@ -548,57 +823,153 @@ export function NewRun() { )} /> - ( - - Concurrency - -
- { - field.onChange(value[0]) - localStorage.setItem("evals-concurrency", String(value[0])) - }} - /> -
{field.value}
-
-
- -
- )} - /> + {/* Concurrency, Timeout, and Iterations in a 3-column row */} +
+ ( + + Concurrency + +
+ { + field.onChange(value[0]) + localStorage.setItem("evals-concurrency", String(value[0])) + }} + /> +
{field.value}
+
+
+ +
+ )} + /> - ( - - Timeout (Minutes) - -
- { - field.onChange(value[0]) - localStorage.setItem("evals-timeout", String(value[0])) - }} - /> -
{field.value}
-
-
- -
- )} - /> + ( + + Timeout (Minutes) + +
+ { + field.onChange(value[0]) + localStorage.setItem("evals-timeout", String(value[0])) + }} + /> +
{field.value}
+
+
+ +
+ )} + /> + + ( + + Iterations + +
+ { + field.onChange(value[0]) + }} + /> +
{field.value}
+
+
+ +
+ )} + /> +
+ + {/* Terminal timeouts in a 2-column row */} +
+ +
+ + + + + + +

+ Maximum time in seconds to wait for terminal command execution to complete + before timing out. This applies to commands run via the execute_command + tool. +

+
+
+
+
+ { + if (value !== undefined) { + setCommandExecutionTimeout(value) + localStorage.setItem("evals-command-execution-timeout", String(value)) + } + }} + /> +
{commandExecutionTimeout}
+
+
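// The command-execution slider above (and the shell-integration slider that follows) persist
// their values to localStorage and re-validate them on load. A sketch of that load-and-clamp
// pattern; the key and 20-60 second bounds are those of evals-command-execution-timeout, while
// the shell-integration timeout uses 30-60 seconds.
function loadPersistedSeconds(key: string, min: number, max: number, fallback: number): number {
	const saved = localStorage.getItem(key)
	if (saved === null) return fallback
	const parsed = parseInt(saved, 10)
	return !isNaN(parsed) && parsed >= min && parsed <= max ? parsed : fallback
}

console.log(loadPersistedSeconds("evals-command-execution-timeout", 20, 60, 20))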
+ + +
+ + + + + + +

+ Maximum time in seconds to wait for shell integration to initialize when + opening a new terminal. +

+
+
+
+
+ { + if (value !== undefined) { + setTerminalShellIntegrationTimeout(value) + localStorage.setItem("evals-shell-integration-timeout", String(value)) + } + }} + /> +
{terminalShellIntegrationTimeout}
+
+
+
[] +export const ROO_CODE_SETTINGS_KEYS = [ + ...new Set([...GLOBAL_SETTINGS_KEYS, ...PROVIDER_SETTINGS_KEYS]), +] as Keys[] type SettingsDiffProps = { defaultSettings: RooCodeSettings diff --git a/apps/web-evals/src/components/home/run.tsx b/apps/web-evals/src/components/home/run.tsx index b37fff43247..99950bae436 100644 --- a/apps/web-evals/src/components/home/run.tsx +++ b/apps/web-evals/src/components/home/run.tsx @@ -1,12 +1,13 @@ import { useCallback, useState, useRef } from "react" import Link from "next/link" import { useRouter } from "next/navigation" -import { Ellipsis, ClipboardList, Copy, Check, LoaderCircle, Trash, Settings } from "lucide-react" +import { toast } from "sonner" +import { Ellipsis, ClipboardList, Copy, Check, LoaderCircle, Trash, Settings, FileDown, StickyNote } from "lucide-react" import type { Run as EvalsRun, TaskMetrics as EvalsTaskMetrics } from "@roo-code/evals" import type { ToolName } from "@roo-code/types" -import { deleteRun } from "@/actions/runs" +import { deleteRun, updateRunDescription } from "@/actions/runs" import { formatCurrency, formatDateTime, @@ -19,6 +20,10 @@ import { Button, TableCell, TableRow, + Textarea, + Tooltip, + TooltipContent, + TooltipTrigger, DropdownMenu, DropdownMenuContent, DropdownMenuItem, @@ -33,6 +38,7 @@ import { AlertDialogTitle, Dialog, DialogContent, + DialogFooter, DialogHeader, DialogTitle, ScrollArea, @@ -42,15 +48,77 @@ type RunProps = { run: EvalsRun taskMetrics: EvalsTaskMetrics | null toolColumns: ToolName[] + consolidatedToolColumns: string[] } -export function Run({ run, taskMetrics, toolColumns }: RunProps) { +export function Run({ run, taskMetrics, toolColumns, consolidatedToolColumns }: RunProps) { const router = useRouter() const [deleteRunId, setDeleteRunId] = useState() const [showSettings, setShowSettings] = useState(false) + const [isExportingLogs, setIsExportingLogs] = useState(false) + const [showNotesDialog, setShowNotesDialog] = useState(false) + const [editingDescription, setEditingDescription] = useState(run.description ?? 
"") + const [isSavingNotes, setIsSavingNotes] = useState(false) const continueRef = useRef(null) const { isPending, copyRun, copied } = useCopyRun(run.id) + const hasDescription = Boolean(run.description && run.description.trim().length > 0) + + const handleSaveDescription = useCallback(async () => { + setIsSavingNotes(true) + try { + const result = await updateRunDescription(run.id, editingDescription.trim() || null) + if (result.success) { + toast.success("Description saved") + setShowNotesDialog(false) + router.refresh() + } else { + toast.error("Failed to save description") + } + } catch (error) { + console.error("Error saving description:", error) + toast.error("Failed to save description") + } finally { + setIsSavingNotes(false) + } + }, [run.id, editingDescription, router]) + + const onExportFailedLogs = useCallback(async () => { + if (run.failed === 0) { + toast.error("No failed tasks to export") + return + } + + setIsExportingLogs(true) + try { + const response = await fetch(`/api/runs/${run.id}/logs/failed`) + + if (!response.ok) { + const error = await response.json() + toast.error(error.error || "Failed to export logs") + return + } + + // Download the zip file + const blob = await response.blob() + const url = window.URL.createObjectURL(blob) + const a = document.createElement("a") + a.href = url + a.download = `run-${run.id}-failed-logs.zip` + document.body.appendChild(a) + a.click() + window.URL.revokeObjectURL(url) + document.body.removeChild(a) + + toast.success("Failed logs exported successfully") + } catch (error) { + console.error("Error exporting logs:", error) + toast.error("Failed to export logs") + } finally { + setIsExportingLogs(false) + } + }, [run.id, run.failed]) + const onConfirmDelete = useCallback(async () => { if (!deleteRunId) { return @@ -86,9 +154,13 @@ export function Run({ run, taskMetrics, toolColumns }: RunProps) { {run.passed} {run.failed} - {run.passed + run.failed > 0 && ( - {((run.passed / (run.passed + run.failed)) * 100).toFixed(1)}% - )} + {run.passed + run.failed > 0 && + (() => { + const percent = (run.passed / (run.passed + run.failed)) * 100 + const colorClass = + percent === 100 ? "text-green-500" : percent >= 80 ? "text-yellow-500" : "text-red-500" + return {percent.toFixed(1)}% + })()} {taskMetrics && ( @@ -98,14 +170,84 @@ export function Run({ run, taskMetrics, toolColumns }: RunProps) {
)} + {consolidatedToolColumns.length > 0 && ( + + {taskMetrics?.toolUsage ? ( + (() => { + // Calculate aggregated stats for consolidated tools + let totalAttempts = 0 + let totalFailures = 0 + const breakdown: Array<{ tool: string; attempts: number; rate: string }> = [] + + for (const toolName of consolidatedToolColumns) { + const usage = taskMetrics.toolUsage[toolName as ToolName] + if (usage) { + totalAttempts += usage.attempts + totalFailures += usage.failures + const rate = + usage.attempts > 0 + ? `${Math.round(((usage.attempts - usage.failures) / usage.attempts) * 100)}%` + : "0%" + breakdown.push({ tool: toolName, attempts: usage.attempts, rate }) + } + } + + const consolidatedRate = + totalAttempts > 0 ? ((totalAttempts - totalFailures) / totalAttempts) * 100 : 100 + const rateColor = + consolidatedRate === 100 + ? "text-muted-foreground" + : consolidatedRate >= 80 + ? "text-yellow-500" + : "text-red-500" + + return totalAttempts > 0 ? ( + + +
+ {totalAttempts} + {Math.round(consolidatedRate)}% +
+
+ +
+
Consolidated Tools:
+ {breakdown.map(({ tool, attempts, rate }) => ( +
+ {tool}: + + {attempts} ({rate}) + +
+ ))} +
+
+
+ ) : ( + - + ) + })() + ) : ( + - + )} +
+ )} {toolColumns.map((toolName) => { const usage = taskMetrics?.toolUsage?.[toolName] + const successRate = + usage && usage.attempts > 0 ? ((usage.attempts - usage.failures) / usage.attempts) * 100 : 100 + const rateColor = + successRate === 100 + ? "text-muted-foreground" + : successRate >= 80 + ? "text-yellow-500" + : "text-red-500" return ( {usage ? (
{usage.attempts} - {formatToolUsageSuccessRate(usage)} + {formatToolUsageSuccessRate(usage)}
) : ( - @@ -116,63 +258,107 @@ export function Run({ run, taskMetrics, toolColumns }: RunProps) { {taskMetrics && formatCurrency(taskMetrics.cost)} {taskMetrics && formatDuration(taskMetrics.duration)} e.stopPropagation()}> - - - - - -
- -
View Tasks
-
- -
- {run.settings && ( - setShowSettings(true)}> -
- -
View Settings
-
+
+ {/* Note Icon */} + + + + + + {hasDescription ? ( +
{run.description}
+ ) : ( +
No description. Click to add one.
+ )} +
+
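// The 100% / >=80% / <80% color thresholds for tool success rates and pass rates recur in
// several cells of this table. A hypothetical shared helper (illustrative only; the neutral
// color is a parameter because the per-tool cells use text-muted-foreground while the
// pass-rate badge uses text-green-500) would express the same mapping:
function successRateColor(successRate: number, fullColor = "text-muted-foreground"): string {
	if (successRate === 100) return fullColor
	return successRate >= 80 ? "text-yellow-500" : "text-red-500"
}

console.log(successRateColor(100)) // "text-muted-foreground"
console.log(successRateColor(100, "text-green-500")) // "text-green-500"
console.log(successRateColor(85)) // "text-yellow-500"
console.log(successRateColor(60)) // "text-red-500"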
+ + {/* More Actions Menu */} + + + + + +
+ +
View Tasks
+
+
- )} - {run.taskMetricsId && ( - copyRun()} disabled={isPending || copied}> + {run.settings && ( + setShowSettings(true)}> +
+ +
View Settings
+
+
+ )} + {run.taskMetricsId && ( + copyRun()} disabled={isPending || copied}> +
+ {isPending ? ( + <> + + Copying... + + ) : copied ? ( + <> + + Copied! + + ) : ( + <> + + Copy to Production + + )} +
+
+ )} + {run.failed > 0 && ( + +
+ {isExportingLogs ? ( + <> + + Exporting... + + ) : ( + <> + + Export Failed Logs + + )} +
+
+ )} + { + setDeleteRunId(run.id) + setTimeout(() => continueRef.current?.focus(), 0) + }}>
- {isPending ? ( - <> - - Copying... - - ) : copied ? ( - <> - - Copied! - - ) : ( - <> - - Copy to Production - - )} + +
Delete
- )} - { - setDeleteRunId(run.id) - setTimeout(() => continueRef.current?.focus(), 0) - }}> -
- -
Delete
-
-
-
-
+ + +
setDeleteRunId(undefined)}> @@ -201,6 +387,39 @@ export function Run({ run, taskMetrics, toolColumns }: RunProps) { + + {/* Notes/Description Dialog */} + + + + Run Description + +
+
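// Usage sketch for the path sanitization in the failed-log export route earlier in this patch:
// path separators and other unsafe characters become "_" before the route's defense-in-depth
// path.resolve containment check. The sample inputs are illustrative.
function sanitizePathComponent(component: string): string {
	return component.replace(/[/\\:\0*?"<>|]/g, "_")
}

console.log(sanitizePathComponent("../../etc/passwd")) // ".._.._etc_passwd"
console.log(sanitizePathComponent("python")) // "python"
console.log(sanitizePathComponent("two-fer")) // "two-fer"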