From 0cabddb07270764212c48fc0bc5c8b05bd85f60f Mon Sep 17 00:00:00 2001 From: Hannes Rudolph Date: Tue, 16 Dec 2025 09:22:58 -0700 Subject: [PATCH 1/4] fix: duration not reported in evals UI - Fix backend race condition in runTask.ts where TaskTokenUsageUpdated could arrive before TaskStarted handler set taskMetricsId - Add Promise-based synchronization (taskMetricsReady) for event handlers - Fix UI to fall back to database timestamps (startedAt/finishedAt) when streaming duration is unavailable (e.g., page loaded after TaskStarted) --- apps/web-evals/src/app/runs/[id]/run.tsx | 32 +++++++++++++++++++++--- packages/evals/src/cli/runTask.ts | 22 ++++++++++++---- 2 files changed, 46 insertions(+), 8 deletions(-) diff --git a/apps/web-evals/src/app/runs/[id]/run.tsx b/apps/web-evals/src/app/runs/[id]/run.tsx index e2079343d32..0aabc8bcdce 100644 --- a/apps/web-evals/src/app/runs/[id]/run.tsx +++ b/apps/web-evals/src/app/runs/[id]/run.tsx @@ -321,6 +321,15 @@ export function Run({ run }: { run: Run }) { void usageUpdatedAt const metrics: Record = {} + // Helper to calculate duration from database timestamps when streaming duration + // is unavailable (e.g., page was loaded after TaskStarted event was published) + const calculateDurationFromTimestamps = (task: TaskWithMetrics): number => { + if (!task.startedAt) return 0 + const startTime = new Date(task.startedAt).getTime() + const endTime = task.finishedAt ? new Date(task.finishedAt).getTime() : Date.now() + return endTime - startTime + } + tasks?.forEach((task) => { const streamingUsage = tokenUsage.get(task.id) const dbMetrics = task.taskMetrics @@ -331,26 +340,43 @@ export function Run({ run }: { run: Run }) { // Check if DB metrics have meaningful values (not just default/empty) const dbHasData = dbMetrics && (dbMetrics.tokensIn > 0 || dbMetrics.tokensOut > 0 || dbMetrics.cost > 0) if (dbHasData) { - metrics[task.id] = dbMetrics + // If DB duration is 0 but we have timestamps, calculate from timestamps + const duration = dbMetrics.duration || calculateDurationFromTimestamps(task) + metrics[task.id] = { ...dbMetrics, duration } } else if (streamingUsage) { // Fall back to streaming values if DB is empty/stale + // Use streaming duration, or calculate from timestamps if not available + const duration = streamingUsage.duration || calculateDurationFromTimestamps(task) metrics[task.id] = { tokensIn: streamingUsage.totalTokensIn, tokensOut: streamingUsage.totalTokensOut, tokensContext: streamingUsage.contextTokens, - duration: streamingUsage.duration ?? 0, + duration, cost: streamingUsage.totalCost, } } } else if (streamingUsage) { // For running tasks, use streaming values + // Use streaming duration, or calculate from task.startedAt if not available + // (happens when page loads after TaskStarted event was already published) + const duration = streamingUsage.duration || calculateDurationFromTimestamps(task) metrics[task.id] = { tokensIn: streamingUsage.totalTokensIn, tokensOut: streamingUsage.totalTokensOut, tokensContext: streamingUsage.contextTokens, - duration: streamingUsage.duration ?? 0, + duration, cost: streamingUsage.totalCost, } + } else if (task.startedAt) { + // Task has started (has startedAt in DB) but no streaming data yet + // This can happen when page loads after TaskStarted but before TokenUsageUpdated + metrics[task.id] = { + tokensIn: 0, + tokensOut: 0, + tokensContext: 0, + duration: calculateDurationFromTimestamps(task), + cost: 0, + } } }) diff --git a/packages/evals/src/cli/runTask.ts b/packages/evals/src/cli/runTask.ts index 5f737c1ad5d..bb3187a9ad8 100644 --- a/packages/evals/src/cli/runTask.ts +++ b/packages/evals/src/cli/runTask.ts @@ -281,6 +281,13 @@ export const runTask = async ({ run, task, publish, logger, jobToken }: RunTaskO // Track accumulated tool usage across task instances (handles rehydration after abort) const accumulatedToolUsage: ToolUsage = {} + // Promise that resolves when taskMetricsId is set, preventing race conditions + // where TaskTokenUsageUpdated arrives before TaskStarted handler completes + let resolveTaskMetricsReady: () => void + const taskMetricsReady = new Promise((resolve) => { + resolveTaskMetricsReady = resolve + }) + const ignoreEvents: Record<"broadcast" | "log", RooCodeEventName[]> = { broadcast: [RooCodeEventName.Message], log: [RooCodeEventName.TaskTokenUsageUpdated, RooCodeEventName.TaskAskResponded], @@ -360,6 +367,9 @@ export const runTask = async ({ run, task, publish, logger, jobToken }: RunTaskO taskStartedAt = Date.now() taskMetricsId = taskMetrics.id rooTaskId = payload[0] + + // Signal that taskMetricsId is now ready for other handlers + resolveTaskMetricsReady() } if (eventName === RooCodeEventName.TaskToolFailed) { @@ -367,10 +377,12 @@ export const runTask = async ({ run, task, publish, logger, jobToken }: RunTaskO await createToolError({ taskId: task.id, toolName, error }) } - if ( - (eventName === RooCodeEventName.TaskTokenUsageUpdated || eventName === RooCodeEventName.TaskCompleted) && - taskMetricsId - ) { + if (eventName === RooCodeEventName.TaskTokenUsageUpdated || eventName === RooCodeEventName.TaskCompleted) { + // Wait for taskMetricsId to be set by the TaskStarted handler. + // This prevents a race condition where these events arrive before + // the TaskStarted handler finishes its async database operations. + await taskMetricsReady + const duration = Date.now() - taskStartedAt const { totalCost, totalTokensIn, totalTokensOut, contextTokens, totalCacheWrites, totalCacheReads } = @@ -397,7 +409,7 @@ export const runTask = async ({ run, task, publish, logger, jobToken }: RunTaskO } } - await updateTaskMetrics(taskMetricsId, { + await updateTaskMetrics(taskMetricsId!, { cost: totalCost, tokensIn: totalTokensIn, tokensOut: totalTokensOut, From 16d5f7c376b1ec7d9883dbcd00a9540b4b853194 Mon Sep 17 00:00:00 2001 From: Hannes Rudolph Date: Tue, 16 Dec 2025 10:40:11 -0700 Subject: [PATCH 2/4] feat(evals): add tool groups for aggregating tool usage stats - Add tool groups feature with customizable name and icon - Groups aggregate tool usage stats in table columns - Persist groups to localStorage - Tools can only belong to one group - Each group displays only icon in header with tooltip showing name and tools --- apps/web-evals/src/components/home/run.tsx | 134 +++--- apps/web-evals/src/components/home/runs.tsx | 475 ++++++++++++++++---- 2 files changed, 455 insertions(+), 154 deletions(-) diff --git a/apps/web-evals/src/components/home/run.tsx b/apps/web-evals/src/components/home/run.tsx index 99950bae436..379daf48a40 100644 --- a/apps/web-evals/src/components/home/run.tsx +++ b/apps/web-evals/src/components/home/run.tsx @@ -44,14 +44,22 @@ import { ScrollArea, } from "@/components/ui" +// Tool group type (same as in runs.tsx) +type ToolGroup = { + id: string + name: string + icon: string + tools: string[] +} + type RunProps = { run: EvalsRun taskMetrics: EvalsTaskMetrics | null toolColumns: ToolName[] - consolidatedToolColumns: string[] + toolGroups: ToolGroup[] } -export function Run({ run, taskMetrics, toolColumns, consolidatedToolColumns }: RunProps) { +export function Run({ run, taskMetrics, toolColumns, toolGroups }: RunProps) { const router = useRouter() const [deleteRunId, setDeleteRunId] = useState() const [showSettings, setShowSettings] = useState(false) @@ -143,6 +151,62 @@ export function Run({ run, taskMetrics, toolColumns, consolidatedToolColumns }: [router, run.id], ) + // Helper to render a tool group cell + const renderToolGroupCell = (group: ToolGroup) => { + if (!taskMetrics?.toolUsage) { + return - + } + + let totalAttempts = 0 + let totalFailures = 0 + const breakdown: Array<{ tool: string; attempts: number; rate: string }> = [] + + for (const toolName of group.tools) { + const usage = taskMetrics.toolUsage[toolName as ToolName] + if (usage) { + totalAttempts += usage.attempts + totalFailures += usage.failures + const rate = + usage.attempts > 0 + ? `${Math.round(((usage.attempts - usage.failures) / usage.attempts) * 100)}%` + : "0%" + breakdown.push({ tool: toolName, attempts: usage.attempts, rate }) + } + } + + if (totalAttempts === 0) { + return - + } + + const successRate = ((totalAttempts - totalFailures) / totalAttempts) * 100 + const rateColor = + successRate === 100 ? "text-muted-foreground" : successRate >= 80 ? "text-yellow-500" : "text-red-500" + + return ( + + +
+ {totalAttempts} + {Math.round(successRate)}% +
+
+ +
+
{group.name}
+ {breakdown.map(({ tool, attempts, rate }) => ( +
+ {tool}: + + {attempts} ({rate}) + +
+ ))} +
+
+
+ ) + } + return ( <> @@ -170,68 +234,12 @@ export function Run({ run, taskMetrics, toolColumns, consolidatedToolColumns }: )} - {consolidatedToolColumns.length > 0 && ( - - {taskMetrics?.toolUsage ? ( - (() => { - // Calculate aggregated stats for consolidated tools - let totalAttempts = 0 - let totalFailures = 0 - const breakdown: Array<{ tool: string; attempts: number; rate: string }> = [] - - for (const toolName of consolidatedToolColumns) { - const usage = taskMetrics.toolUsage[toolName as ToolName] - if (usage) { - totalAttempts += usage.attempts - totalFailures += usage.failures - const rate = - usage.attempts > 0 - ? `${Math.round(((usage.attempts - usage.failures) / usage.attempts) * 100)}%` - : "0%" - breakdown.push({ tool: toolName, attempts: usage.attempts, rate }) - } - } - - const consolidatedRate = - totalAttempts > 0 ? ((totalAttempts - totalFailures) / totalAttempts) * 100 : 100 - const rateColor = - consolidatedRate === 100 - ? "text-muted-foreground" - : consolidatedRate >= 80 - ? "text-yellow-500" - : "text-red-500" - - return totalAttempts > 0 ? ( - - -
- {totalAttempts} - {Math.round(consolidatedRate)}% -
-
- -
-
Consolidated Tools:
- {breakdown.map(({ tool, attempts, rate }) => ( -
- {tool}: - - {attempts} ({rate}) - -
- ))} -
-
-
- ) : ( - - - ) - })() - ) : ( - - - )} + {/* Tool Group Columns */} + {toolGroups.map((group) => ( + + {renderToolGroupCell(group)} - )} + ))} {toolColumns.map((toolName) => { const usage = taskMetrics?.toolUsage?.[toolName] const successRate = diff --git a/apps/web-evals/src/components/home/runs.tsx b/apps/web-evals/src/components/home/runs.tsx index f3cd0e39dab..0ac333f2cea 100644 --- a/apps/web-evals/src/components/home/runs.tsx +++ b/apps/web-evals/src/components/home/runs.tsx @@ -1,19 +1,49 @@ "use client" -import { useCallback, useEffect, useMemo, useState } from "react" +import { useCallback, useEffect, useMemo, useState, memo } from "react" import { useRouter } from "next/navigation" import { ArrowDown, ArrowUp, ArrowUpDown, + Box, + Boxes, + Check, + CheckCircle, + CircleDot, + ClipboardList, + Cog, Combine, Ellipsis, + File, + FileText, + Folder, + FolderOpen, + Hammer, + Hexagon, + Layers, + List, + ListChecks, + ListTodo, LoaderCircle, + Package, + Pencil, + PencilLine, + Plus, Rocket, - RotateCcw, + Search, + Settings2, + Shapes, + Square, + Star, + Tag, + Terminal, Trash2, + Wrench, X, + Zap, } from "lucide-react" +import type { LucideIcon } from "lucide-react" import { toast } from "sonner" import type { Run, TaskMetrics } from "@roo-code/evals" @@ -30,10 +60,17 @@ import { AlertDialogHeader, AlertDialogTitle, Button, + Dialog, + DialogContent, + DialogFooter, + DialogHeader, + DialogTitle, DropdownMenu, DropdownMenuContent, DropdownMenuItem, + DropdownMenuSeparator, DropdownMenuTrigger, + Input, MultiSelect, Select, SelectContent, @@ -52,6 +89,166 @@ import { } from "@/components/ui" import { Run as Row } from "@/components/home/run" +// Available icons for tool groups +const TOOL_GROUP_ICONS: { name: string; icon: LucideIcon }[] = [ + { name: "combine", icon: Combine }, + { name: "layers", icon: Layers }, + { name: "box", icon: Box }, + { name: "boxes", icon: Boxes }, + { name: "package", icon: Package }, + { name: "folder", icon: Folder }, + { name: "folder-open", icon: FolderOpen }, + { name: "file", icon: File }, + { name: "file-text", icon: FileText }, + { name: "list", icon: List }, + { name: "list-todo", icon: ListTodo }, + { name: "list-checks", icon: ListChecks }, + { name: "clipboard-list", icon: ClipboardList }, + { name: "check", icon: Check }, + { name: "check-circle", icon: CheckCircle }, + { name: "pencil", icon: PencilLine }, + { name: "trash", icon: Trash2 }, + { name: "x", icon: X }, + { name: "search", icon: Search }, + { name: "terminal", icon: Terminal }, + { name: "shapes", icon: Shapes }, + { name: "hexagon", icon: Hexagon }, + { name: "square", icon: Square }, + { name: "circle-dot", icon: CircleDot }, + { name: "star", icon: Star }, + { name: "zap", icon: Zap }, + { name: "hammer", icon: Hammer }, + { name: "wrench", icon: Wrench }, + { name: "cog", icon: Cog }, + { name: "settings", icon: Settings2 }, + { name: "tag", icon: Tag }, +] + +// Tool group type +export type ToolGroup = { + id: string + name: string + icon: string + tools: string[] +} + +// Helper to get icon component by name +function getIconByName(name: string): LucideIcon { + return TOOL_GROUP_ICONS.find((i) => i.name === name)?.icon ?? Combine +} + +// Generate a unique ID for tool groups +function generateGroupId(): string { + return `group-${Date.now()}-${Math.random().toString(36).substring(2, 9)}` +} + +// Isolated dialog component to prevent parent re-renders on state changes +const ToolGroupEditorDialog = memo(function ToolGroupEditorDialog({ + open, + onOpenChange, + editingGroup, + availableTools, + onSave, +}: { + open: boolean + onOpenChange: (open: boolean) => void + editingGroup: ToolGroup | null + availableTools: { label: string; value: string }[] + onSave: (group: ToolGroup) => void +}) { + const [groupName, setGroupName] = useState(editingGroup?.name ?? "") + const [groupIcon, setGroupIcon] = useState(editingGroup?.icon ?? "combine") + const [groupTools, setGroupTools] = useState(editingGroup?.tools ?? []) + + // Reset form when dialog opens or editingGroup changes + useEffect(() => { + if (open) { + setGroupName(editingGroup?.name ?? "") + setGroupIcon(editingGroup?.icon ?? "combine") + setGroupTools(editingGroup?.tools ?? []) + } + }, [open, editingGroup]) + + const canSaveGroup = groupName.trim().length > 0 && groupTools.length > 0 + + const handleSave = () => { + if (!canSaveGroup) return + const group: ToolGroup = { + id: editingGroup?.id ?? generateGroupId(), + name: groupName.trim(), + icon: groupIcon, + tools: groupTools, + } + onSave(group) + onOpenChange(false) + } + + return ( + + + + {editingGroup ? "Edit Tool Group" : "Create Tool Group"} + +
+
+ + setGroupName(e.target.value)} + className={!groupName.trim() ? "border-muted-foreground/30" : ""} + /> +
+
+ +
+ {TOOL_GROUP_ICONS.map(({ name, icon: IconComponent }) => ( + + ))} +
+
+
+ + +
+ {groupTools.length > 0 + ? `${groupTools.length} tool${groupTools.length !== 1 ? "s" : ""} selected` + : "Select at least one tool"} +
+
+
+ + + + +
+
+ ) +}) + type RunWithTaskMetrics = Run & { taskMetrics: TaskMetrics | null } type SortColumn = "model" | "provider" | "passed" | "failed" | "percent" | "cost" | "duration" | "createdAt" @@ -72,7 +269,7 @@ const STORAGE_KEYS = { TIMEFRAME: "evals-runs-timeframe", MODEL_FILTER: "evals-runs-model-filter", PROVIDER_FILTER: "evals-runs-provider-filter", - CONSOLIDATED_TOOLS: "evals-runs-consolidated-tools", + TOOL_GROUPS: "evals-runs-tool-groups", } function getTimeframeStartDate(timeframe: TimeframeOption): Date | null { @@ -137,13 +334,24 @@ export function Runs({ runs }: { runs: RunWithTaskMetrics[] }) { return stored ? JSON.parse(stored) : [] }) - // Tool column consolidation state - initialize from localStorage - const [consolidatedToolColumns, setConsolidatedToolColumns] = useState(() => { + // Tool groups state - initialize from localStorage + const [toolGroups, setToolGroups] = useState(() => { if (typeof window === "undefined") return [] - const stored = localStorage.getItem(STORAGE_KEYS.CONSOLIDATED_TOOLS) - return stored ? JSON.parse(stored) : [] + const stored = localStorage.getItem(STORAGE_KEYS.TOOL_GROUPS) + if (stored) { + try { + return JSON.parse(stored) + } catch { + return [] + } + } + return [] }) + // Tool group editor dialog state + const [showGroupDialog, setShowGroupDialog] = useState(false) + const [editingGroup, setEditingGroup] = useState(null) + // Delete runs state const [showDeleteConfirm, setShowDeleteConfirm] = useState(false) const [showDeleteOldConfirm, setShowDeleteOldConfirm] = useState(false) @@ -163,8 +371,8 @@ export function Runs({ runs }: { runs: RunWithTaskMetrics[] }) { }, [providerFilter]) useEffect(() => { - localStorage.setItem(STORAGE_KEYS.CONSOLIDATED_TOOLS, JSON.stringify(consolidatedToolColumns)) - }, [consolidatedToolColumns]) + localStorage.setItem(STORAGE_KEYS.TOOL_GROUPS, JSON.stringify(toolGroups)) + }, [toolGroups]) // Count incomplete runs (runs without taskMetricsId) const incompleteRunsCount = useMemo(() => { @@ -300,7 +508,7 @@ export function Runs({ runs }: { runs: RunWithTaskMetrics[] }) { .map(([name]): ToolName => name) }, [filteredRuns]) - // Tool column options for the consolidation dropdown + // Tool column options for the group editor const toolColumnOptions = useMemo(() => { return allToolColumns.map((tool) => ({ label: tool, @@ -308,13 +516,21 @@ export function Runs({ runs }: { runs: RunWithTaskMetrics[] }) { })) }, [allToolColumns]) - // Separate consolidated and individual tool columns - const individualToolColumns = useMemo(() => { - return allToolColumns.filter((tool) => !consolidatedToolColumns.includes(tool)) - }, [allToolColumns, consolidatedToolColumns]) + // Get all tools that are in any group + const groupedTools = useMemo(() => { + const grouped = new Set() + for (const group of toolGroups) { + for (const tool of group.tools) { + grouped.add(tool) + } + } + return grouped + }, [toolGroups]) - // Create a "consolidated" column if any tools are selected for consolidation - const hasConsolidatedColumn = consolidatedToolColumns.length > 0 + // Separate grouped and individual tool columns + const individualToolColumns = useMemo(() => { + return allToolColumns.filter((tool) => !groupedTools.has(tool)) + }, [allToolColumns, groupedTools]) // Use individualToolColumns for rendering const toolColumns = individualToolColumns @@ -377,13 +593,11 @@ export function Runs({ runs }: { runs: RunWithTaskMetrics[] }) { }) }, [filteredRuns, sortColumn, sortDirection]) - // Calculate colSpan for empty state (7 base columns + dynamic tools + consolidated column + 3 end columns) - const totalColumns = 7 + toolColumns.length + (hasConsolidatedColumn ? 1 : 0) + 3 + // Calculate colSpan for empty state (7 base columns + tool groups + dynamic tools + 3 end columns) + const totalColumns = 7 + toolGroups.length + toolColumns.length + 3 - // Check if any filters or settings are active + // Check if any filters are active const hasActiveFilters = timeframeFilter !== "all" || modelFilter.length > 0 || providerFilter.length > 0 - const hasConsolidatedTools = consolidatedToolColumns.length > 0 - const hasAnyCustomization = hasActiveFilters || hasConsolidatedTools const clearAllFilters = () => { setTimeframeFilter("all") @@ -391,16 +605,52 @@ export function Runs({ runs }: { runs: RunWithTaskMetrics[] }) { setProviderFilter([]) } - const resetAll = () => { - setTimeframeFilter("all") - setModelFilter([]) - setProviderFilter([]) - setConsolidatedToolColumns([]) - localStorage.removeItem(STORAGE_KEYS.TIMEFRAME) - localStorage.removeItem(STORAGE_KEYS.MODEL_FILTER) - localStorage.removeItem(STORAGE_KEYS.PROVIDER_FILTER) - localStorage.removeItem(STORAGE_KEYS.CONSOLIDATED_TOOLS) - } + // Tool group management handlers + const openNewGroupDialog = useCallback(() => { + setEditingGroup(null) + setShowGroupDialog(true) + }, []) + + const openEditGroupDialog = useCallback((group: ToolGroup) => { + setEditingGroup(group) + setShowGroupDialog(true) + }, []) + + const handleSaveGroup = useCallback( + (group: ToolGroup) => { + setToolGroups((prev) => { + const existingIndex = prev.findIndex((g) => g.id === group.id) + if (existingIndex >= 0) { + // Update existing group + const newGroups = [...prev] + newGroups[existingIndex] = group + return newGroups + } else { + // Add new group + return [...prev, group] + } + }) + toast.success(editingGroup ? "Group updated" : "Group created") + }, + [editingGroup], + ) + + const handleDeleteGroup = useCallback((groupId: string) => { + setToolGroups((prev) => prev.filter((g) => g.id !== groupId)) + toast.success("Group deleted") + }, []) + + // Get available tools for group editor (tools not in other groups) + const availableToolsForEditor = useMemo(() => { + const usedInOtherGroups = new Set() + for (const group of toolGroups) { + if (editingGroup && group.id === editingGroup.id) continue + for (const tool of group.tools) { + usedInOtherGroups.add(tool) + } + } + return toolColumnOptions.filter((opt) => !usedInOtherGroups.has(opt.value)) + }, [toolColumnOptions, toolGroups, editingGroup]) return ( <> @@ -448,49 +698,76 @@ export function Runs({ runs }: { runs: RunWithTaskMetrics[] }) { /> + {/* Tool Groups Dropdown */}
- - -
- - Consolidate: -
-
- Select tool columns to consolidate into a combined column -
-
-
0 ? "[&>div>div]:invisible" : ""}> - - - Reset all filters & consolidation - - ) - } - /> -
- {consolidatedToolColumns.length > 0 && ( -
- - {consolidatedToolColumns.length} tool - {consolidatedToolColumns.length !== 1 ? "s" : ""} - -
- )} -
+ + + + + + {toolGroups.length > 0 ? ( + <> + {toolGroups.map((group) => { + const IconComponent = getIconByName(group.icon) + return ( + { + e.preventDefault() + openEditGroupDialog(group) + }}> +
+ + {group.name} + + ({group.tools.length}) + +
+
+ + +
+
+ ) + })} + + + ) : ( +
No groups yet
+ )} + + + Add Group + +
+
{hasActiveFilters && ( @@ -580,23 +857,30 @@ export function Runs({ runs }: { runs: RunWithTaskMetrics[] }) { Tokens - {hasConsolidatedColumn && ( - - - - - - -
-
Consolidated Tools:
- {consolidatedToolColumns.map((tool) => ( -
{tool}
- ))} -
-
-
-
- )} + {/* Tool Group Columns */} + {toolGroups.map((group) => { + const IconComponent = getIconByName(group.icon) + return ( + +
+ + + + + +
+
{group.name}
+ {group.tools.map((tool) => ( +
{tool}
+ ))} +
+
+
+
+
+ ) + })} + {/* Individual Tool Columns */} {toolColumns.map((toolName) => ( @@ -628,7 +912,7 @@ export function Runs({ runs }: { runs: RunWithTaskMetrics[] }) { run={run} taskMetrics={taskMetrics} toolColumns={toolColumns} - consolidatedToolColumns={consolidatedToolColumns} + toolGroups={toolGroups} /> )) ) : ( @@ -663,6 +947,15 @@ export function Runs({ runs }: { runs: RunWithTaskMetrics[] }) { + {/* Tool Group Editor Dialog */} + + {/* Delete Incomplete Runs Confirmation Dialog */} From 99cfb0479d0f50e8d38fb2c0768f0c92a8d9434d Mon Sep 17 00:00:00 2001 From: Roo Code Date: Tue, 16 Dec 2025 18:18:16 +0000 Subject: [PATCH 3/4] fix(evals): prevent taskMetricsReady deadlock on disconnect --- packages/evals/src/cli/runTask.ts | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/packages/evals/src/cli/runTask.ts b/packages/evals/src/cli/runTask.ts index bb3187a9ad8..d7f37e72a1f 100644 --- a/packages/evals/src/cli/runTask.ts +++ b/packages/evals/src/cli/runTask.ts @@ -381,8 +381,16 @@ export const runTask = async ({ run, task, publish, logger, jobToken }: RunTaskO // Wait for taskMetricsId to be set by the TaskStarted handler. // This prevents a race condition where these events arrive before // the TaskStarted handler finishes its async database operations. + // Note: taskMetricsReady is also resolved on disconnect to prevent deadlock. await taskMetricsReady + // Guard: taskMetricsReady may have been resolved due to disconnect + // without taskMetricsId being set. Skip metrics update in this case. + if (!taskMetricsId) { + logger.info(`skipping metrics update: taskMetricsId not set (event: ${eventName})`) + return + } + const duration = Date.now() - taskStartedAt const { totalCost, totalTokensIn, totalTokensOut, contextTokens, totalCacheWrites, totalCacheReads } = @@ -409,7 +417,7 @@ export const runTask = async ({ run, task, publish, logger, jobToken }: RunTaskO } } - await updateTaskMetrics(taskMetricsId!, { + await updateTaskMetrics(taskMetricsId, { cost: totalCost, tokensIn: totalTokensIn, tokensOut: totalTokensOut, @@ -433,6 +441,10 @@ export const runTask = async ({ run, task, publish, logger, jobToken }: RunTaskO client.on(IpcMessageType.Disconnect, async () => { logger.info(`disconnected from IPC socket -> ${ipcSocketPath}`) isClientDisconnected = true + // Resolve taskMetricsReady to unblock any handlers waiting on it. + // This prevents deadlock if TaskStarted never fired or threw before resolving. + // The handlers check for taskMetricsId being set before proceeding. + resolveTaskMetricsReady() }) client.sendCommand({ From b9aa6b57c2fbf8d1abf6f944eb7e3d575a814aee Mon Sep 17 00:00:00 2001 From: Hannes Rudolph Date: Tue, 16 Dec 2025 13:09:06 -0700 Subject: [PATCH 4/4] fix(evals): ensure finished tasks get duration even without streaming data Add fallback case for finished tasks where DB metrics are empty and streaming usage is unavailable. Duration is now calculated from startedAt/finishedAt timestamps in all cases. --- apps/web-evals/src/app/runs/[id]/run.tsx | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/apps/web-evals/src/app/runs/[id]/run.tsx b/apps/web-evals/src/app/runs/[id]/run.tsx index 0aabc8bcdce..badd77741e0 100644 --- a/apps/web-evals/src/app/runs/[id]/run.tsx +++ b/apps/web-evals/src/app/runs/[id]/run.tsx @@ -354,6 +354,17 @@ export function Run({ run }: { run: Run }) { duration, cost: streamingUsage.totalCost, } + } else { + // Task finished but no DB metrics and no streaming data + // (e.g., page loaded after task completed, metrics not persisted) + // Still provide duration calculated from timestamps + metrics[task.id] = { + tokensIn: 0, + tokensOut: 0, + tokensContext: 0, + duration: calculateDurationFromTimestamps(task), + cost: 0, + } } } else if (streamingUsage) { // For running tasks, use streaming values