Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
116 commits
Select commit Hold shift + click to select a range
65c750d
Changeset version bump (#9856)
github-actions[bot] Dec 5, 2025
5c50160
Better error logs for parseToolCall exceptions (#9857)
cte Dec 5, 2025
642a187
(update): Add DeepSeek V3-2 Support for Baseten Provider (#9861)
AlexKer Dec 5, 2025
d285d01
web: Product pages (#9865)
brunobergher Dec 5, 2025
9f4dcfc
fix: sanitize removed/invalid API providers to prevent infinite loop …
hannesrudolph Dec 5, 2025
9d5eca9
Update xAI models catalog (#9872)
hannesrudolph Dec 5, 2025
dd92453
refactor: decouple tools from system prompt (#9784)
daniel-lxs Dec 5, 2025
4a5cbcb
Stop making count_tokens requests (#9884)
mrubens Dec 6, 2025
2eae321
Default to using native tools when supported on openrouter (#9878)
mrubens Dec 6, 2025
946fd03
feat: change defaultToolProtocol default from xml to native (#9892)
roomote[bot] Dec 6, 2025
8aa1346
Refactor: Unified context-management architecture with improved UX (#…
hannesrudolph Dec 7, 2025
1f7e1ee
Make eval runs deleteable (#9909)
mrubens Dec 8, 2025
bea7626
fix: add Kimi, MiniMax, and Qwen model configurations for Bedrock (#9…
roomote[bot] Dec 8, 2025
1370cb0
fix: use foreground color for context-management icons (#9912)
hannesrudolph Dec 8, 2025
efbf427
feat: add xhigh reasoning effort for gpt-5.1-codex-max (#9900)
andrewginns Dec 8, 2025
fba8508
feat: add search_replace native tool for single-replacement operation…
hannesrudolph Dec 8, 2025
6f602fc
Improve cloud job error logging for RCC provider errors (#9924)
cte Dec 8, 2025
754b701
feat: configure tool preferences for xAI models (#9923)
hannesrudolph Dec 8, 2025
88a0bed
fix: process finish_reason to emit tool_call_end events (#9927)
daniel-lxs Dec 8, 2025
ee48b3a
fix: suppress 'ask promise was ignored' error in handleError (#9914)
daniel-lxs Dec 9, 2025
375c103
fix: exclude apply_diff from native tools when diffEnabled is false (…
roomote[bot] Dec 9, 2025
93a43e4
Try to make OpenAI errors more useful (#9639)
mrubens Dec 9, 2025
de00ab1
refactor: consolidate ThinkingBudget components and fix disable handl…
hannesrudolph Dec 9, 2025
3356267
Add timeout to OpenAI Compatible Provider Client (#9898)
dcbartlett Dec 9, 2025
2efebf5
fix: add finish_reason processing to xai.ts provider (#9929)
daniel-lxs Dec 9, 2025
5bde2e5
Remove defaultTemperature from Roo provider configuration (#9932)
mrubens Dec 9, 2025
54a5265
feat: forbid time estimates in architect mode (#9931)
roomote[bot] Dec 9, 2025
c103a4a
feat: streaming tool stats + token usage throttling (#9926)
hannesrudolph Dec 9, 2025
8a98f14
feat: Make Architect save to `/plans` and gitignore it (#9944)
brunobergher Dec 9, 2025
e142906
feat: add announcement support CTA and social icons (#9945)
hannesrudolph Dec 9, 2025
f89a6be
fix: display actual API error message instead of generic text on retr…
hannesrudolph Dec 9, 2025
83787a7
feat(roo): add versioned settings support with minPluginVersion gatin…
hannesrudolph Dec 9, 2025
0068d1f
Revert "feat: change defaultToolProtocol default from xml to native" …
mrubens Dec 9, 2025
4608c97
fix: return undefined instead of 0 for disabled API timeout (#9960)
hannesrudolph Dec 9, 2025
1898848
feat(deepseek): update DeepSeek models to V3.2 with new pricing (#9962)
hannesrudolph Dec 9, 2025
721b02e
Add a way to save screenshots from the browser tool (#9963)
mrubens Dec 10, 2025
ada7411
Tweaks to baseten model definitions (#9866)
mrubens Dec 10, 2025
29d6f6d
fix: always show tool protocol selector for openai-compatible (#9966)
hannesrudolph Dec 10, 2025
24eb6ae
feat: add API error telemetry to OpenRouter provider (#9953)
daniel-lxs Dec 10, 2025
f472a82
fix: validate and fix tool_result IDs before API requests (#9952)
daniel-lxs Dec 10, 2025
df5fdef
fix: respect explicit supportsReasoningEffort array values (#9970)
hannesrudolph Dec 10, 2025
0cf5b28
v3.36.3 (#9972)
cte Dec 10, 2025
048e7f3
feat(gemini): add minimal and medium reasoning effort levels (#9973)
hannesrudolph Dec 10, 2025
03912d8
Delete changeset files (#9977)
cte Dec 10, 2025
36ef603
Add missing release notes for v3.36.3 (#9979)
cte Dec 10, 2025
ab18bf3
feat: add error details modal with on-demand display (#9985)
roomote[bot] Dec 10, 2025
e092e77
Fix: Correct TODO list display order in chat view (ROO-107) (#9991)
roomote[bot] Dec 10, 2025
5a4315f
fix: prevent premature rawChunkTracker clearing for MCP tools (#9993)
daniel-lxs Dec 10, 2025
fda020a
fix: filter out 429 rate limit errors from API error telemetry (#9987)
daniel-lxs Dec 10, 2025
2cd772c
Release v3.36.4 (#9994)
cte Dec 10, 2025
380a578
Changeset version bump (#9995)
github-actions[bot] Dec 10, 2025
1cf6ae6
feat(telemetry): add app version to exception captures and filter 402…
daniel-lxs Dec 10, 2025
f05dd59
Remove Glama provider (#9801)
hannesrudolph Dec 10, 2025
2a70a2e
@roo-code/types v1.90.0 (#9998)
cte Dec 10, 2025
483e70c
fix: apply versioned settings on nightly builds (#9997)
hannesrudolph Dec 10, 2025
0cbaed7
feat: add toggle for Enter key behavior in chat input (#10002)
hannesrudolph Dec 11, 2025
6a30d94
chore: remove list_code_definition_names tool (#10005)
hannesrudolph Dec 11, 2025
a1d3a43
Update roomotes.yml (#10008)
cte Dec 11, 2025
f9cfc66
fix: add general API endpoints for Z.ai provider (#9894)
roomote[bot] Dec 11, 2025
47320dc
fix: handle empty Gemini responses and reasoning loops (#10007)
hannesrudolph Dec 11, 2025
8731709
fix: add missing tool_result blocks to prevent API errors (#10015)
daniel-lxs Dec 11, 2025
51dbccf
feat: add gpt-5.2 model to openai-native provider (#10024)
hannesrudolph Dec 11, 2025
8a68b04
fix: filter orphaned tool_results when more results than tool_uses (#…
daniel-lxs Dec 11, 2025
526e195
Release v3.36.5 (#10029)
cte Dec 11, 2025
21c2d93
Changeset version bump (#10032)
github-actions[bot] Dec 11, 2025
c513df5
fix: merge settings and versionedSettings for Roo provider models (#1…
hannesrudolph Dec 11, 2025
7766b91
Revert "fix: merge settings and versionedSettings for Roo provider mo…
cte Dec 11, 2025
5072ff1
Revert the 3.36.5 release (we halted it) (#10036)
cte Dec 11, 2025
4dabd52
Release v3.36.5 (#10037)
cte Dec 11, 2025
f97b515
Changeset version bump (#10038)
github-actions[bot] Dec 11, 2025
495b5c6
ux: improve auto-approve timer visibility in follow-up suggestions (#…
brunobergher Dec 12, 2025
d976a9b
fix: cancel auto-approval timeout when user starts typing (#9937)
roomote[bot] Dec 12, 2025
23a214c
fix: extract raw error message from OpenRouter metadata (#10039)
daniel-lxs Dec 12, 2025
ba7c553
feat: add tool alias support for model-specific tool customization (#…
daniel-lxs Dec 12, 2025
0f8fac9
fix: show tool protocol dropdown for LiteLLM provider (#10053)
daniel-lxs Dec 12, 2025
8da4d3d
feat: add WorkspaceTaskVisibility type for organization cloud setting…
roomote[bot] Dec 12, 2025
f60c14e
Release: v1.91.0 (#10055)
jr Dec 12, 2025
3521270
feat: sanitize MCP server/tool names for API compatibility (#10054)
daniel-lxs Dec 12, 2025
0742335
Release v3.36.6 (#10057)
cte Dec 12, 2025
f961b73
changeset version bump
github-actions[bot] Dec 12, 2025
0b112ce
Update CHANGELOG.md
cte Dec 12, 2025
93e8ed3
Merge branch 'upstream-at-v3.36.6' into roo-v3.36.6
kevinvandijk Dec 16, 2025
e8792c8
Revive glama which was deleted upstream
kevinvandijk Dec 16, 2025
108a4f4
Fix type for xhigh reasoning
kevinvandijk Dec 16, 2025
2ffd221
Fix types
kevinvandijk Dec 16, 2025
7b17ae2
Fix typing in tests
kevinvandijk Dec 16, 2025
aad9fed
Fix remaining merge conflicts
kevinvandijk Dec 16, 2025
878703c
Update locales
kevinvandijk Dec 17, 2025
2da872f
Fix build failing in file we don't care about
kevinvandijk Dec 17, 2025
8ad3df4
Fix throttle events
kevinvandijk Dec 17, 2025
dc73bb5
Fix snapshots
kevinvandijk Dec 17, 2025
713c811
Fix mock in unit test
kevinvandijk Dec 17, 2025
03de7b4
Update snapshots because we use a different default mode
kevinvandijk Dec 17, 2025
2fbc511
Fix more mocks and skip tests we don't need
kevinvandijk Dec 17, 2025
a6c0036
Re-add countTokens for anthropic handler because of our custom changes
kevinvandijk Dec 17, 2025
cf70498
Disable irrelevant tests
kevinvandijk Dec 17, 2025
31bef3d
Resolve circular dependencies causing failing tests
kevinvandijk Dec 17, 2025
fa53ec9
Re-add Morph Fast Apply to refactored tools
kevinvandijk Dec 17, 2025
8668b72
Re-add compatibility with Morph Fast Apply to refactored native tools
kevinvandijk Dec 17, 2025
0379019
Merge branch 'main' into roo-v3.36.6
kevinvandijk Dec 17, 2025
e5c0687
Disable tests because we have a different method
kevinvandijk Dec 17, 2025
4eea059
Adjust snapshots for fast apply rule
kevinvandijk Dec 17, 2025
2d4a07e
Fix webview test
kevinvandijk Dec 18, 2025
1d2e27e
Align test to actual expected behavior
kevinvandijk Dec 18, 2025
4ca152d
Merge branch 'main' into roo-v3.36.6
kevinvandijk Dec 18, 2025
8a9fddd
Add changeset
kevinvandijk Dec 18, 2025
4adeb64
Set Anthropic headers when calling OpenRouter
chrarnoldus Dec 18, 2025
02baa58
Parse versioned model settings
chrarnoldus Dec 18, 2025
59d348b
Add comment
chrarnoldus Dec 18, 2025
b85c463
Also fix in the other place this is duplicated
chrarnoldus Dec 18, 2025
24e12b5
Merge pull request #4555 from Kilo-Org/christiaan/parse-versioned-set…
chrarnoldus Dec 18, 2025
04fa140
Merge pull request #4554 from Kilo-Org/christiaan/ant-headers
chrarnoldus Dec 18, 2025
7345904
Fix unit tests for added headers
kevinvandijk Dec 18, 2025
1c77e28
Merge branch 'main' into roo-v3.36.6
kevinvandijk Dec 18, 2025
2dd1b27
Update locales
kevinvandijk Dec 18, 2025
0642041
Merge pull request #4509 from Kilo-Org/roo-v3.36.6
kevinvandijk Dec 18, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 59 additions & 0 deletions .changeset/polite-games-arrive.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
---
"kilo-code": patch
---

Include changes from Roo Code v3.36.6

- Add tool alias support for model-specific tool customization, allowing users to configure how tools are presented to different AI models (PR #9989 by @daniel-lxs)
- Sanitize MCP server and tool names for API compatibility, ensuring special characters don't cause issues with API calls (PR #10054 by @daniel-lxs)
- Improve auto-approve timer visibility in follow-up suggestions for better user awareness of pending actions (PR #10048 by @brunobergher)
- Fix: Cancel auto-approval timeout when user starts typing, preventing accidental auto-approvals during user interaction (PR #9937 by @roomote)
- Add WorkspaceTaskVisibility type for organization cloud settings to support team visibility controls (PR #10020 by @roomote)
- Fix: Extract raw error message from OpenRouter metadata for clearer error reporting (PR #10039 by @daniel-lxs)
- Fix: Show tool protocol dropdown for LiteLLM provider, restoring missing configuration option (PR #10053 by @daniel-lxs)
- Add: GPT-5.2 model to openai-native provider (PR #10024 by @hannesrudolph)
- Fix: Handle empty Gemini responses and reasoning loops to prevent infinite retries (PR #10007 by @hannesrudolph)
- Fix: Add missing tool_result blocks to prevent API errors when tool results are expected (PR #10015 by @daniel-lxs)
- Fix: Filter orphaned tool_results when more results than tool_uses to prevent message validation errors (PR #10027 by @daniel-lxs)
- Fix: Add general API endpoints for Z.ai provider (#9879 by @richtong, PR #9894 by @roomote)
- Remove: Deprecated list_code_definition_names tool (PR #10005 by @hannesrudolph)
- Add error details modal with on-demand display for improved error visibility when debugging issues (PR #9985 by @roomote)
- Fix: Prevent premature rawChunkTracker clearing for MCP tools, improving reliability of MCP tool streaming (PR #9993 by @daniel-lxs)
- Fix: Filter out 429 rate limit errors from API error telemetry for cleaner metrics (PR #9987 by @daniel-lxs)
- Fix: Correct TODO list display order in chat view to show items in proper sequence (PR #9991 by @roomote)
- Refactor: Unified context-management architecture with improved UX for better context control (PR #9795 by @hannesrudolph)
- Add new `search_replace` native tool for single-replacement operations with improved editing precision (PR #9918 by @hannesrudolph)
- Streaming tool stats and token usage throttling for better real-time feedback during generation (PR #9926 by @hannesrudolph)
- Add versioned settings support with minPluginVersion gating for Roo provider (PR #9934 by @hannesrudolph)
- Make Architect mode save plans to `/plans` directory and gitignore it (PR #9944 by @brunobergher)
- Add ability to save screenshots from the browser tool (PR #9963 by @mrubens)
- Refactor: Decouple tools from system prompt for cleaner architecture (PR #9784 by @daniel-lxs)
- Update DeepSeek models to V3.2 with new pricing (PR #9962 by @hannesrudolph)
- Add minimal and medium reasoning effort levels for Gemini models (PR #9973 by @hannesrudolph)
- Update xAI models catalog with latest model options (PR #9872 by @hannesrudolph)
- Add DeepSeek V3-2 support for Baseten provider (PR #9861 by @AlexKer)
- Tweaks to Baseten model definitions for better defaults (PR #9866 by @mrubens)
- Fix: Add xhigh reasoning effort support for gpt-5.1-codex-max (#9891 by @andrewginns, PR #9900 by @andrewginns)
- Fix: Add Kimi, MiniMax, and Qwen model configurations for Bedrock (#9902 by @jbearak, PR #9905 by @app/roomote)
- Configure tool preferences for xAI models (PR #9923 by @hannesrudolph)
- Default to using native tools when supported on OpenRouter (PR #9878 by @mrubens)
- Fix: Exclude apply_diff from native tools when diffEnabled is false (#9919 by @denis-kudelin, PR #9920 by @app/roomote)
- Fix: Always show tool protocol selector for openai-compatible provider (#9965 by @bozoweed, PR #9966 by @hannesrudolph)
- Fix: Respect explicit supportsReasoningEffort array values for proper model configuration (PR #9970 by @hannesrudolph)
- Add timeout configuration to OpenAI Compatible Provider Client (PR #9898 by @dcbartlett)
- Revert default tool protocol change from xml to native for stability (PR #9956 by @mrubens)
- Improve OpenAI error messages to be more useful for debugging (PR #9639 by @mrubens)
- Better error logs for parseToolCall exceptions (PR #9857 by @cte)
- Improve cloud job error logging for RCC provider errors (PR #9924 by @cte)
- Fix: Display actual API error message instead of generic text on retry (PR #9954 by @hannesrudolph)
- Add API error telemetry to OpenRouter provider for better diagnostics (PR #9953 by @daniel-lxs)
- Fix: Sanitize removed/invalid API providers to prevent infinite loop (PR #9869 by @hannesrudolph)
- Fix: Use foreground color for context-management icons (PR #9912 by @hannesrudolph)
- Fix: Suppress 'ask promise was ignored' error in handleError (PR #9914 by @daniel-lxs)
- Fix: Process finish_reason to emit tool_call_end events properly (PR #9927 by @daniel-lxs)
- Fix: Add finish_reason processing to xai.ts provider (PR #9929 by @daniel-lxs)
- Fix: Validate and fix tool_result IDs before API requests (PR #9952 by @daniel-lxs)
- Fix: Return undefined instead of 0 for disabled API timeout (PR #9960 by @hannesrudolph)
- Stop making unnecessary count_tokens requests for better performance (PR #9884 by @mrubens)
- Refactor: Consolidate ThinkingBudget components and fix disable handling (PR #9930 by @hannesrudolph)
- Forbid time estimates in architect mode for more focused planning (PR #9931 by @app/roomote)
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -68,4 +68,6 @@ qdrant_storage/
*.code-workspace

# Act Secret Files
.secrets
.secrets
# Architect plans
plans/
15 changes: 10 additions & 5 deletions .roo/rules-translate/AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -316,31 +316,36 @@ For each language that is missing translations:
"dragFiles": "按住shift拖动文件"
=======
"dragFiles": "Shift+拖拽文件"
>>>>>>> AFTER

> > > > > > > AFTER

<<<<<<< BEFORE
"description": "启用后,Kilo Code 将能够与 MCP 服务器交互以获取高级功能。"
=======
"description": "启用后 Kilo Code 可与 MCP 服务交互获取高级功能。"
>>>>>>> AFTER

> > > > > > > AFTER

<<<<<<< BEFORE
"cannotUndo": "此操作无法撤消。"
=======
"cannotUndo": "此操作不可逆。"
>>>>>>> AFTER

> > > > > > > AFTER

<<<<<<< BEFORE
"hold shift to drag in files" → "按住shift拖动文件"
=======
"hold shift to drag in files" → "Shift+拖拽文件"
>>>>>>> AFTER

> > > > > > > AFTER

<<<<<<< BEFORE
"Double click to edit" → "双击进行编辑"
=======
"Double click to edit" → "双击编辑"
>>>>>>> AFTER

> > > > > > > AFTER
```

### Common Pitfalls
Expand Down
23 changes: 12 additions & 11 deletions apps/kilocode-docs/docs/advanced-usage/appbuilder.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,26 +39,27 @@ Before using App Builder:
1. Navigate to **[App Builder](https://app.kilo.ai/app-builder)** from your Kilo dashboard.
2. Choose an **AI Model** for development (e.g., Grok Code Fast 1, Claude Sonnet 4.5, GPT-5.2).
3. Describe your application in plain language:
- What it should do
- Key features and functionality
- Design preferences or constraints
- What it should do
- Key features and functionality
- Design preferences or constraints
4. Watch the **live preview** update as the AI generates your app.
5. Provide feedback to refine:
- "Make the header sticky"
- "Add a dark mode toggle"
- "Connect this form to a database"
- "Make the header sticky"
- "Add a dark mode toggle"
- "Connect this form to a database"
6. When satisfied, click **Deploy** to push your app live.

---

## How App Builder Works

- When you describe your application:
1. The AI model interprets your requirements and generates an initial implementation.
2. Code is rendered in real-time in the live preview panel.
3. You can interact with the preview as if it were the deployed app.
4. Each refinement request triggers targeted updates to the codebase.
5. The AI maintains context across your entire conversation for coherent iteration.

1. The AI model interprets your requirements and generates an initial implementation.
2. Code is rendered in real-time in the live preview panel.
3. You can interact with the preview as if it were the deployed app.
4. Each refinement request triggers targeted updates to the codebase.
5. The AI maintains context across your entire conversation for coherent iteration.

- Deployment packages your application and provisions hosting automatically.

Expand Down
111 changes: 82 additions & 29 deletions apps/web-evals/src/app/runs/[id]/run.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ function formatLogContent(log: string): React.ReactNode[] {

export function Run({ run }: { run: Run }) {
const runStatus = useRunStatus(run)
const { tasks, tokenUsage, usageUpdatedAt, heartbeat, runners } = runStatus
const { tasks, tokenUsage, toolUsage, usageUpdatedAt, heartbeat, runners } = runStatus

const [selectedTask, setSelectedTask] = useState<Task | null>(null)
const [taskLog, setTaskLog] = useState<string | null>(null)
Expand Down Expand Up @@ -336,37 +336,70 @@ export function Run({ run }: { run: Run }) {
)

const taskMetrics: Record<number, TaskMetrics> = useMemo(() => {
// Reference usageUpdatedAt to trigger recomputation when Map contents change
void usageUpdatedAt
const metrics: Record<number, TaskMetrics> = {}

tasks?.forEach((task) => {
const usage = tokenUsage.get(task.id)

if (task.finishedAt && task.taskMetrics) {
metrics[task.id] = task.taskMetrics
} else if (usage) {
const streamingUsage = tokenUsage.get(task.id)
const dbMetrics = task.taskMetrics

// For finished tasks, prefer DB values but fall back to streaming values
// This handles race conditions during timeout where DB might not have latest data
if (task.finishedAt) {
// Check if DB metrics have meaningful values (not just default/empty)
const dbHasData = dbMetrics && (dbMetrics.tokensIn > 0 || dbMetrics.tokensOut > 0 || dbMetrics.cost > 0)
if (dbHasData) {
metrics[task.id] = dbMetrics
} else if (streamingUsage) {
// Fall back to streaming values if DB is empty/stale
metrics[task.id] = {
tokensIn: streamingUsage.totalTokensIn,
tokensOut: streamingUsage.totalTokensOut,
tokensContext: streamingUsage.contextTokens,
duration: streamingUsage.duration ?? 0,
cost: streamingUsage.totalCost,
}
}
} else if (streamingUsage) {
// For running tasks, use streaming values
metrics[task.id] = {
tokensIn: usage.totalTokensIn,
tokensOut: usage.totalTokensOut,
tokensContext: usage.contextTokens,
duration: usage.duration ?? 0,
cost: usage.totalCost,
tokensIn: streamingUsage.totalTokensIn,
tokensOut: streamingUsage.totalTokensOut,
tokensContext: streamingUsage.contextTokens,
duration: streamingUsage.duration ?? 0,
cost: streamingUsage.totalCost,
}
}
})

return metrics
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [tasks, tokenUsage, usageUpdatedAt])

// Collect all unique tool names from all tasks and sort by total attempts
const toolColumns = useMemo<ToolName[]>(() => {
// Reference usageUpdatedAt to trigger recomputation when Map contents change
void usageUpdatedAt
if (!tasks) return []

const toolTotals = new Map<ToolName, number>()

for (const task of tasks) {
if (task.taskMetrics?.toolUsage) {
for (const [toolName, usage] of Object.entries(task.taskMetrics.toolUsage)) {
// Get both DB and streaming values
const dbToolUsage = task.taskMetrics?.toolUsage
const streamingToolUsage = toolUsage.get(task.id)

// For finished tasks, prefer DB values but fall back to streaming values
// For running tasks, use streaming values
// This handles race conditions during timeout where DB might not have latest data
const taskToolUsage = task.finishedAt
? dbToolUsage && Object.keys(dbToolUsage).length > 0
? dbToolUsage
: streamingToolUsage
: streamingToolUsage

if (taskToolUsage) {
for (const [toolName, usage] of Object.entries(taskToolUsage)) {
const tool = toolName as ToolName
const current = toolTotals.get(tool) ?? 0
toolTotals.set(tool, current + usage.attempts)
Expand All @@ -378,10 +411,13 @@ export function Run({ run }: { run: Run }) {
return Array.from(toolTotals.entries())
.sort((a, b) => b[1] - a[1])
.map(([name]): ToolName => name)
}, [tasks])
// toolUsage ref is stable; usageUpdatedAt triggers recomputation when Map contents change
}, [tasks, toolUsage, usageUpdatedAt])

// Compute aggregate stats
const stats = useMemo(() => {
// Reference usageUpdatedAt to trigger recomputation when Map contents change
void usageUpdatedAt
if (!tasks) return null

const passed = tasks.filter((t) => t.passed === true).length
Expand All @@ -393,8 +429,8 @@ export function Run({ run }: { run: Run }) {
let totalCost = 0
let totalDuration = 0

// Aggregate tool usage from completed tasks
const toolUsage: ToolUsage = {}
// Aggregate tool usage from all tasks (both finished and running)
const toolUsageAggregate: ToolUsage = {}

for (const task of tasks) {
const metrics = taskMetrics[task.id]
Expand All @@ -405,15 +441,24 @@ export function Run({ run }: { run: Run }) {
totalDuration += metrics.duration
}

// Aggregate tool usage from finished tasks with taskMetrics
if (task.finishedAt && task.taskMetrics?.toolUsage) {
for (const [key, usage] of Object.entries(task.taskMetrics.toolUsage)) {
// Aggregate tool usage: prefer DB values for finished tasks, fall back to streaming values
// This handles race conditions during timeout where DB might not have latest data
const dbToolUsage = task.taskMetrics?.toolUsage
const streamingToolUsage = toolUsage.get(task.id)
const taskToolUsage = task.finishedAt
? dbToolUsage && Object.keys(dbToolUsage).length > 0
? dbToolUsage
: streamingToolUsage
: streamingToolUsage

if (taskToolUsage) {
for (const [key, usage] of Object.entries(taskToolUsage)) {
const tool = key as keyof ToolUsage
if (!toolUsage[tool]) {
toolUsage[tool] = { attempts: 0, failures: 0 }
if (!toolUsageAggregate[tool]) {
toolUsageAggregate[tool] = { attempts: 0, failures: 0 }
}
toolUsage[tool].attempts += usage.attempts
toolUsage[tool].failures += usage.failures
toolUsageAggregate[tool].attempts += usage.attempts
toolUsageAggregate[tool].failures += usage.failures
}
}
}
Expand All @@ -427,13 +472,15 @@ export function Run({ run }: { run: Run }) {
totalTokensOut,
totalCost,
totalDuration,
toolUsage,
toolUsage: toolUsageAggregate,
}
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [tasks, taskMetrics, tokenUsage, usageUpdatedAt])
// Map refs are stable; usageUpdatedAt triggers recomputation when Map contents change
}, [tasks, taskMetrics, toolUsage, usageUpdatedAt])

// Calculate elapsed time (wall-clock time from run creation to completion or now)
const elapsedTime = useMemo(() => {
// Reference usageUpdatedAt to trigger recomputation for live elapsed time updates
void usageUpdatedAt
if (!tasks || tasks.length === 0) return null

const startTime = new Date(run.createdAt).getTime()
Expand All @@ -452,7 +499,6 @@ export function Run({ run }: { run: Run }) {

// If still running, use current time
return Date.now() - startTime
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [tasks, run.createdAt, run.taskMetricsId, usageUpdatedAt])

return (
Expand Down Expand Up @@ -655,7 +701,14 @@ export function Run({ run }: { run: Run }) {
{formatTokens(taskMetrics[task.id]!.tokensContext)}
</TableCell>
{toolColumns.map((toolName) => {
const usage = task.taskMetrics?.toolUsage?.[toolName]
// Use DB values for finished tasks, but fall back to streaming values
// if DB values are missing (handles race condition during timeout)
const dbUsage = task.taskMetrics?.toolUsage?.[toolName]
const streamingUsage = toolUsage.get(task.id)?.[toolName]
const usage = task.finishedAt
? (dbUsage ?? streamingUsage)
: streamingUsage

const successRate =
usage && usage.attempts > 0
? ((usage.attempts - usage.failures) / usage.attempts) * 100
Expand Down
11 changes: 10 additions & 1 deletion apps/web-evals/src/hooks/use-run-status.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { useState, useCallback, useRef } from "react"
import { useQuery, keepPreviousData } from "@tanstack/react-query"

import { type TokenUsage, RooCodeEventName, taskEventSchema } from "@roo-code/types"
import { type TokenUsage, type ToolUsage, RooCodeEventName, taskEventSchema } from "@roo-code/types"
import type { Run, Task, TaskMetrics } from "@roo-code/evals"

import { getHeartbeat } from "@/actions/heartbeat"
Expand All @@ -15,6 +15,7 @@ export type RunStatus = {
runners: string[] | undefined
tasks: (Task & { taskMetrics: TaskMetrics | null })[] | undefined
tokenUsage: Map<number, TokenUsage & { duration?: number }>
toolUsage: Map<number, ToolUsage>
usageUpdatedAt: number | undefined
}

Expand All @@ -23,6 +24,7 @@ export const useRunStatus = (run: Run): RunStatus => {
const [usageUpdatedAt, setUsageUpdatedAt] = useState<number>()

const tokenUsage = useRef<Map<number, TokenUsage & { duration?: number }>>(new Map())
const toolUsage = useRef<Map<number, ToolUsage>>(new Map())
const startTimes = useRef<Map<number, number>>(new Map())

const { data: heartbeat } = useQuery({
Expand Down Expand Up @@ -78,6 +80,12 @@ export const useRunStatus = (run: Run): RunStatus => {
const startTime = startTimes.current.get(taskId)
const duration = startTime ? Date.now() - startTime : undefined
tokenUsage.current.set(taskId, { ...payload[1], duration })

// Track tool usage from streaming updates
if (payload[2]) {
toolUsage.current.set(taskId, payload[2])
}

setUsageUpdatedAt(Date.now())
break
}
Expand All @@ -96,6 +104,7 @@ export const useRunStatus = (run: Run): RunStatus => {
runners,
tasks,
tokenUsage: tokenUsage.current,
toolUsage: toolUsage.current,
usageUpdatedAt,
}
}
Loading