diff --git a/apps/web-evals/src/app/runs/[id]/run.tsx b/apps/web-evals/src/app/runs/[id]/run.tsx index e2079343d32..badd77741e0 100644 --- a/apps/web-evals/src/app/runs/[id]/run.tsx +++ b/apps/web-evals/src/app/runs/[id]/run.tsx @@ -321,6 +321,15 @@ export function Run({ run }: { run: Run }) { void usageUpdatedAt const metrics: Record = {} + // Helper to calculate duration from database timestamps when streaming duration + // is unavailable (e.g., page was loaded after TaskStarted event was published) + const calculateDurationFromTimestamps = (task: TaskWithMetrics): number => { + if (!task.startedAt) return 0 + const startTime = new Date(task.startedAt).getTime() + const endTime = task.finishedAt ? new Date(task.finishedAt).getTime() : Date.now() + return endTime - startTime + } + tasks?.forEach((task) => { const streamingUsage = tokenUsage.get(task.id) const dbMetrics = task.taskMetrics @@ -331,26 +340,54 @@ export function Run({ run }: { run: Run }) { // Check if DB metrics have meaningful values (not just default/empty) const dbHasData = dbMetrics && (dbMetrics.tokensIn > 0 || dbMetrics.tokensOut > 0 || dbMetrics.cost > 0) if (dbHasData) { - metrics[task.id] = dbMetrics + // If DB duration is 0 but we have timestamps, calculate from timestamps + const duration = dbMetrics.duration || calculateDurationFromTimestamps(task) + metrics[task.id] = { ...dbMetrics, duration } } else if (streamingUsage) { // Fall back to streaming values if DB is empty/stale + // Use streaming duration, or calculate from timestamps if not available + const duration = streamingUsage.duration || calculateDurationFromTimestamps(task) metrics[task.id] = { tokensIn: streamingUsage.totalTokensIn, tokensOut: streamingUsage.totalTokensOut, tokensContext: streamingUsage.contextTokens, - duration: streamingUsage.duration ?? 0, + duration, cost: streamingUsage.totalCost, } + } else { + // Task finished but no DB metrics and no streaming data + // (e.g., page loaded after task completed, metrics not persisted) + // Still provide duration calculated from timestamps + metrics[task.id] = { + tokensIn: 0, + tokensOut: 0, + tokensContext: 0, + duration: calculateDurationFromTimestamps(task), + cost: 0, + } } } else if (streamingUsage) { // For running tasks, use streaming values + // Use streaming duration, or calculate from task.startedAt if not available + // (happens when page loads after TaskStarted event was already published) + const duration = streamingUsage.duration || calculateDurationFromTimestamps(task) metrics[task.id] = { tokensIn: streamingUsage.totalTokensIn, tokensOut: streamingUsage.totalTokensOut, tokensContext: streamingUsage.contextTokens, - duration: streamingUsage.duration ?? 0, + duration, cost: streamingUsage.totalCost, } + } else if (task.startedAt) { + // Task has started (has startedAt in DB) but no streaming data yet + // This can happen when page loads after TaskStarted but before TokenUsageUpdated + metrics[task.id] = { + tokensIn: 0, + tokensOut: 0, + tokensContext: 0, + duration: calculateDurationFromTimestamps(task), + cost: 0, + } } }) diff --git a/apps/web-evals/src/components/home/run.tsx b/apps/web-evals/src/components/home/run.tsx index 99950bae436..379daf48a40 100644 --- a/apps/web-evals/src/components/home/run.tsx +++ b/apps/web-evals/src/components/home/run.tsx @@ -44,14 +44,22 @@ import { ScrollArea, } from "@/components/ui" +// Tool group type (same as in runs.tsx) +type ToolGroup = { + id: string + name: string + icon: string + tools: string[] +} + type RunProps = { run: EvalsRun taskMetrics: EvalsTaskMetrics | null toolColumns: ToolName[] - consolidatedToolColumns: string[] + toolGroups: ToolGroup[] } -export function Run({ run, taskMetrics, toolColumns, consolidatedToolColumns }: RunProps) { +export function Run({ run, taskMetrics, toolColumns, toolGroups }: RunProps) { const router = useRouter() const [deleteRunId, setDeleteRunId] = useState() const [showSettings, setShowSettings] = useState(false) @@ -143,6 +151,62 @@ export function Run({ run, taskMetrics, toolColumns, consolidatedToolColumns }: [router, run.id], ) + // Helper to render a tool group cell + const renderToolGroupCell = (group: ToolGroup) => { + if (!taskMetrics?.toolUsage) { + return - + } + + let totalAttempts = 0 + let totalFailures = 0 + const breakdown: Array<{ tool: string; attempts: number; rate: string }> = [] + + for (const toolName of group.tools) { + const usage = taskMetrics.toolUsage[toolName as ToolName] + if (usage) { + totalAttempts += usage.attempts + totalFailures += usage.failures + const rate = + usage.attempts > 0 + ? `${Math.round(((usage.attempts - usage.failures) / usage.attempts) * 100)}%` + : "0%" + breakdown.push({ tool: toolName, attempts: usage.attempts, rate }) + } + } + + if (totalAttempts === 0) { + return - + } + + const successRate = ((totalAttempts - totalFailures) / totalAttempts) * 100 + const rateColor = + successRate === 100 ? "text-muted-foreground" : successRate >= 80 ? "text-yellow-500" : "text-red-500" + + return ( + + +
+ {totalAttempts} + {Math.round(successRate)}% +
+
+ +
+
{group.name}
+ {breakdown.map(({ tool, attempts, rate }) => ( +
+ {tool}: + + {attempts} ({rate}) + +
+ ))} +
+
+
+ ) + } + return ( <> @@ -170,68 +234,12 @@ export function Run({ run, taskMetrics, toolColumns, consolidatedToolColumns }: )} - {consolidatedToolColumns.length > 0 && ( - - {taskMetrics?.toolUsage ? ( - (() => { - // Calculate aggregated stats for consolidated tools - let totalAttempts = 0 - let totalFailures = 0 - const breakdown: Array<{ tool: string; attempts: number; rate: string }> = [] - - for (const toolName of consolidatedToolColumns) { - const usage = taskMetrics.toolUsage[toolName as ToolName] - if (usage) { - totalAttempts += usage.attempts - totalFailures += usage.failures - const rate = - usage.attempts > 0 - ? `${Math.round(((usage.attempts - usage.failures) / usage.attempts) * 100)}%` - : "0%" - breakdown.push({ tool: toolName, attempts: usage.attempts, rate }) - } - } - - const consolidatedRate = - totalAttempts > 0 ? ((totalAttempts - totalFailures) / totalAttempts) * 100 : 100 - const rateColor = - consolidatedRate === 100 - ? "text-muted-foreground" - : consolidatedRate >= 80 - ? "text-yellow-500" - : "text-red-500" - - return totalAttempts > 0 ? ( - - -
- {totalAttempts} - {Math.round(consolidatedRate)}% -
-
- -
-
Consolidated Tools:
- {breakdown.map(({ tool, attempts, rate }) => ( -
- {tool}: - - {attempts} ({rate}) - -
- ))} -
-
-
- ) : ( - - - ) - })() - ) : ( - - - )} + {/* Tool Group Columns */} + {toolGroups.map((group) => ( + + {renderToolGroupCell(group)} - )} + ))} {toolColumns.map((toolName) => { const usage = taskMetrics?.toolUsage?.[toolName] const successRate = diff --git a/apps/web-evals/src/components/home/runs.tsx b/apps/web-evals/src/components/home/runs.tsx index f3cd0e39dab..0ac333f2cea 100644 --- a/apps/web-evals/src/components/home/runs.tsx +++ b/apps/web-evals/src/components/home/runs.tsx @@ -1,19 +1,49 @@ "use client" -import { useCallback, useEffect, useMemo, useState } from "react" +import { useCallback, useEffect, useMemo, useState, memo } from "react" import { useRouter } from "next/navigation" import { ArrowDown, ArrowUp, ArrowUpDown, + Box, + Boxes, + Check, + CheckCircle, + CircleDot, + ClipboardList, + Cog, Combine, Ellipsis, + File, + FileText, + Folder, + FolderOpen, + Hammer, + Hexagon, + Layers, + List, + ListChecks, + ListTodo, LoaderCircle, + Package, + Pencil, + PencilLine, + Plus, Rocket, - RotateCcw, + Search, + Settings2, + Shapes, + Square, + Star, + Tag, + Terminal, Trash2, + Wrench, X, + Zap, } from "lucide-react" +import type { LucideIcon } from "lucide-react" import { toast } from "sonner" import type { Run, TaskMetrics } from "@roo-code/evals" @@ -30,10 +60,17 @@ import { AlertDialogHeader, AlertDialogTitle, Button, + Dialog, + DialogContent, + DialogFooter, + DialogHeader, + DialogTitle, DropdownMenu, DropdownMenuContent, DropdownMenuItem, + DropdownMenuSeparator, DropdownMenuTrigger, + Input, MultiSelect, Select, SelectContent, @@ -52,6 +89,166 @@ import { } from "@/components/ui" import { Run as Row } from "@/components/home/run" +// Available icons for tool groups +const TOOL_GROUP_ICONS: { name: string; icon: LucideIcon }[] = [ + { name: "combine", icon: Combine }, + { name: "layers", icon: Layers }, + { name: "box", icon: Box }, + { name: "boxes", icon: Boxes }, + { name: "package", icon: Package }, + { name: "folder", icon: Folder }, + { name: "folder-open", icon: FolderOpen }, + { name: "file", icon: File }, + { name: "file-text", icon: FileText }, + { name: "list", icon: List }, + { name: "list-todo", icon: ListTodo }, + { name: "list-checks", icon: ListChecks }, + { name: "clipboard-list", icon: ClipboardList }, + { name: "check", icon: Check }, + { name: "check-circle", icon: CheckCircle }, + { name: "pencil", icon: PencilLine }, + { name: "trash", icon: Trash2 }, + { name: "x", icon: X }, + { name: "search", icon: Search }, + { name: "terminal", icon: Terminal }, + { name: "shapes", icon: Shapes }, + { name: "hexagon", icon: Hexagon }, + { name: "square", icon: Square }, + { name: "circle-dot", icon: CircleDot }, + { name: "star", icon: Star }, + { name: "zap", icon: Zap }, + { name: "hammer", icon: Hammer }, + { name: "wrench", icon: Wrench }, + { name: "cog", icon: Cog }, + { name: "settings", icon: Settings2 }, + { name: "tag", icon: Tag }, +] + +// Tool group type +export type ToolGroup = { + id: string + name: string + icon: string + tools: string[] +} + +// Helper to get icon component by name +function getIconByName(name: string): LucideIcon { + return TOOL_GROUP_ICONS.find((i) => i.name === name)?.icon ?? Combine +} + +// Generate a unique ID for tool groups +function generateGroupId(): string { + return `group-${Date.now()}-${Math.random().toString(36).substring(2, 9)}` +} + +// Isolated dialog component to prevent parent re-renders on state changes +const ToolGroupEditorDialog = memo(function ToolGroupEditorDialog({ + open, + onOpenChange, + editingGroup, + availableTools, + onSave, +}: { + open: boolean + onOpenChange: (open: boolean) => void + editingGroup: ToolGroup | null + availableTools: { label: string; value: string }[] + onSave: (group: ToolGroup) => void +}) { + const [groupName, setGroupName] = useState(editingGroup?.name ?? "") + const [groupIcon, setGroupIcon] = useState(editingGroup?.icon ?? "combine") + const [groupTools, setGroupTools] = useState(editingGroup?.tools ?? []) + + // Reset form when dialog opens or editingGroup changes + useEffect(() => { + if (open) { + setGroupName(editingGroup?.name ?? "") + setGroupIcon(editingGroup?.icon ?? "combine") + setGroupTools(editingGroup?.tools ?? []) + } + }, [open, editingGroup]) + + const canSaveGroup = groupName.trim().length > 0 && groupTools.length > 0 + + const handleSave = () => { + if (!canSaveGroup) return + const group: ToolGroup = { + id: editingGroup?.id ?? generateGroupId(), + name: groupName.trim(), + icon: groupIcon, + tools: groupTools, + } + onSave(group) + onOpenChange(false) + } + + return ( + + + + {editingGroup ? "Edit Tool Group" : "Create Tool Group"} + +
+
+ + setGroupName(e.target.value)} + className={!groupName.trim() ? "border-muted-foreground/30" : ""} + /> +
+
+ +
+ {TOOL_GROUP_ICONS.map(({ name, icon: IconComponent }) => ( + + ))} +
+
+
+ + +
+ {groupTools.length > 0 + ? `${groupTools.length} tool${groupTools.length !== 1 ? "s" : ""} selected` + : "Select at least one tool"} +
+
+
+ + + + +
+
+ ) +}) + type RunWithTaskMetrics = Run & { taskMetrics: TaskMetrics | null } type SortColumn = "model" | "provider" | "passed" | "failed" | "percent" | "cost" | "duration" | "createdAt" @@ -72,7 +269,7 @@ const STORAGE_KEYS = { TIMEFRAME: "evals-runs-timeframe", MODEL_FILTER: "evals-runs-model-filter", PROVIDER_FILTER: "evals-runs-provider-filter", - CONSOLIDATED_TOOLS: "evals-runs-consolidated-tools", + TOOL_GROUPS: "evals-runs-tool-groups", } function getTimeframeStartDate(timeframe: TimeframeOption): Date | null { @@ -137,13 +334,24 @@ export function Runs({ runs }: { runs: RunWithTaskMetrics[] }) { return stored ? JSON.parse(stored) : [] }) - // Tool column consolidation state - initialize from localStorage - const [consolidatedToolColumns, setConsolidatedToolColumns] = useState(() => { + // Tool groups state - initialize from localStorage + const [toolGroups, setToolGroups] = useState(() => { if (typeof window === "undefined") return [] - const stored = localStorage.getItem(STORAGE_KEYS.CONSOLIDATED_TOOLS) - return stored ? JSON.parse(stored) : [] + const stored = localStorage.getItem(STORAGE_KEYS.TOOL_GROUPS) + if (stored) { + try { + return JSON.parse(stored) + } catch { + return [] + } + } + return [] }) + // Tool group editor dialog state + const [showGroupDialog, setShowGroupDialog] = useState(false) + const [editingGroup, setEditingGroup] = useState(null) + // Delete runs state const [showDeleteConfirm, setShowDeleteConfirm] = useState(false) const [showDeleteOldConfirm, setShowDeleteOldConfirm] = useState(false) @@ -163,8 +371,8 @@ export function Runs({ runs }: { runs: RunWithTaskMetrics[] }) { }, [providerFilter]) useEffect(() => { - localStorage.setItem(STORAGE_KEYS.CONSOLIDATED_TOOLS, JSON.stringify(consolidatedToolColumns)) - }, [consolidatedToolColumns]) + localStorage.setItem(STORAGE_KEYS.TOOL_GROUPS, JSON.stringify(toolGroups)) + }, [toolGroups]) // Count incomplete runs (runs without taskMetricsId) const incompleteRunsCount = useMemo(() => { @@ -300,7 +508,7 @@ export function Runs({ runs }: { runs: RunWithTaskMetrics[] }) { .map(([name]): ToolName => name) }, [filteredRuns]) - // Tool column options for the consolidation dropdown + // Tool column options for the group editor const toolColumnOptions = useMemo(() => { return allToolColumns.map((tool) => ({ label: tool, @@ -308,13 +516,21 @@ export function Runs({ runs }: { runs: RunWithTaskMetrics[] }) { })) }, [allToolColumns]) - // Separate consolidated and individual tool columns - const individualToolColumns = useMemo(() => { - return allToolColumns.filter((tool) => !consolidatedToolColumns.includes(tool)) - }, [allToolColumns, consolidatedToolColumns]) + // Get all tools that are in any group + const groupedTools = useMemo(() => { + const grouped = new Set() + for (const group of toolGroups) { + for (const tool of group.tools) { + grouped.add(tool) + } + } + return grouped + }, [toolGroups]) - // Create a "consolidated" column if any tools are selected for consolidation - const hasConsolidatedColumn = consolidatedToolColumns.length > 0 + // Separate grouped and individual tool columns + const individualToolColumns = useMemo(() => { + return allToolColumns.filter((tool) => !groupedTools.has(tool)) + }, [allToolColumns, groupedTools]) // Use individualToolColumns for rendering const toolColumns = individualToolColumns @@ -377,13 +593,11 @@ export function Runs({ runs }: { runs: RunWithTaskMetrics[] }) { }) }, [filteredRuns, sortColumn, sortDirection]) - // Calculate colSpan for empty state (7 base columns + dynamic tools + consolidated column + 3 end columns) - const totalColumns = 7 + toolColumns.length + (hasConsolidatedColumn ? 1 : 0) + 3 + // Calculate colSpan for empty state (7 base columns + tool groups + dynamic tools + 3 end columns) + const totalColumns = 7 + toolGroups.length + toolColumns.length + 3 - // Check if any filters or settings are active + // Check if any filters are active const hasActiveFilters = timeframeFilter !== "all" || modelFilter.length > 0 || providerFilter.length > 0 - const hasConsolidatedTools = consolidatedToolColumns.length > 0 - const hasAnyCustomization = hasActiveFilters || hasConsolidatedTools const clearAllFilters = () => { setTimeframeFilter("all") @@ -391,16 +605,52 @@ export function Runs({ runs }: { runs: RunWithTaskMetrics[] }) { setProviderFilter([]) } - const resetAll = () => { - setTimeframeFilter("all") - setModelFilter([]) - setProviderFilter([]) - setConsolidatedToolColumns([]) - localStorage.removeItem(STORAGE_KEYS.TIMEFRAME) - localStorage.removeItem(STORAGE_KEYS.MODEL_FILTER) - localStorage.removeItem(STORAGE_KEYS.PROVIDER_FILTER) - localStorage.removeItem(STORAGE_KEYS.CONSOLIDATED_TOOLS) - } + // Tool group management handlers + const openNewGroupDialog = useCallback(() => { + setEditingGroup(null) + setShowGroupDialog(true) + }, []) + + const openEditGroupDialog = useCallback((group: ToolGroup) => { + setEditingGroup(group) + setShowGroupDialog(true) + }, []) + + const handleSaveGroup = useCallback( + (group: ToolGroup) => { + setToolGroups((prev) => { + const existingIndex = prev.findIndex((g) => g.id === group.id) + if (existingIndex >= 0) { + // Update existing group + const newGroups = [...prev] + newGroups[existingIndex] = group + return newGroups + } else { + // Add new group + return [...prev, group] + } + }) + toast.success(editingGroup ? "Group updated" : "Group created") + }, + [editingGroup], + ) + + const handleDeleteGroup = useCallback((groupId: string) => { + setToolGroups((prev) => prev.filter((g) => g.id !== groupId)) + toast.success("Group deleted") + }, []) + + // Get available tools for group editor (tools not in other groups) + const availableToolsForEditor = useMemo(() => { + const usedInOtherGroups = new Set() + for (const group of toolGroups) { + if (editingGroup && group.id === editingGroup.id) continue + for (const tool of group.tools) { + usedInOtherGroups.add(tool) + } + } + return toolColumnOptions.filter((opt) => !usedInOtherGroups.has(opt.value)) + }, [toolColumnOptions, toolGroups, editingGroup]) return ( <> @@ -448,49 +698,76 @@ export function Runs({ runs }: { runs: RunWithTaskMetrics[] }) { /> + {/* Tool Groups Dropdown */}
- - -
- - Consolidate: -
-
- Select tool columns to consolidate into a combined column -
-
-
0 ? "[&>div>div]:invisible" : ""}> - - - Reset all filters & consolidation - - ) - } - /> -
- {consolidatedToolColumns.length > 0 && ( -
- - {consolidatedToolColumns.length} tool - {consolidatedToolColumns.length !== 1 ? "s" : ""} - -
- )} -
+ + + + + + {toolGroups.length > 0 ? ( + <> + {toolGroups.map((group) => { + const IconComponent = getIconByName(group.icon) + return ( + { + e.preventDefault() + openEditGroupDialog(group) + }}> +
+ + {group.name} + + ({group.tools.length}) + +
+
+ + +
+
+ ) + })} + + + ) : ( +
No groups yet
+ )} + + + Add Group + +
+
{hasActiveFilters && ( @@ -580,23 +857,30 @@ export function Runs({ runs }: { runs: RunWithTaskMetrics[] }) { Tokens - {hasConsolidatedColumn && ( - - - - - - -
-
Consolidated Tools:
- {consolidatedToolColumns.map((tool) => ( -
{tool}
- ))} -
-
-
-
- )} + {/* Tool Group Columns */} + {toolGroups.map((group) => { + const IconComponent = getIconByName(group.icon) + return ( + +
+ + + + + +
+
{group.name}
+ {group.tools.map((tool) => ( +
{tool}
+ ))} +
+
+
+
+
+ ) + })} + {/* Individual Tool Columns */} {toolColumns.map((toolName) => ( @@ -628,7 +912,7 @@ export function Runs({ runs }: { runs: RunWithTaskMetrics[] }) { run={run} taskMetrics={taskMetrics} toolColumns={toolColumns} - consolidatedToolColumns={consolidatedToolColumns} + toolGroups={toolGroups} /> )) ) : ( @@ -663,6 +947,15 @@ export function Runs({ runs }: { runs: RunWithTaskMetrics[] }) { + {/* Tool Group Editor Dialog */} + + {/* Delete Incomplete Runs Confirmation Dialog */} diff --git a/packages/evals/src/cli/runTask.ts b/packages/evals/src/cli/runTask.ts index 5f737c1ad5d..d7f37e72a1f 100644 --- a/packages/evals/src/cli/runTask.ts +++ b/packages/evals/src/cli/runTask.ts @@ -281,6 +281,13 @@ export const runTask = async ({ run, task, publish, logger, jobToken }: RunTaskO // Track accumulated tool usage across task instances (handles rehydration after abort) const accumulatedToolUsage: ToolUsage = {} + // Promise that resolves when taskMetricsId is set, preventing race conditions + // where TaskTokenUsageUpdated arrives before TaskStarted handler completes + let resolveTaskMetricsReady: () => void + const taskMetricsReady = new Promise((resolve) => { + resolveTaskMetricsReady = resolve + }) + const ignoreEvents: Record<"broadcast" | "log", RooCodeEventName[]> = { broadcast: [RooCodeEventName.Message], log: [RooCodeEventName.TaskTokenUsageUpdated, RooCodeEventName.TaskAskResponded], @@ -360,6 +367,9 @@ export const runTask = async ({ run, task, publish, logger, jobToken }: RunTaskO taskStartedAt = Date.now() taskMetricsId = taskMetrics.id rooTaskId = payload[0] + + // Signal that taskMetricsId is now ready for other handlers + resolveTaskMetricsReady() } if (eventName === RooCodeEventName.TaskToolFailed) { @@ -367,10 +377,20 @@ export const runTask = async ({ run, task, publish, logger, jobToken }: RunTaskO await createToolError({ taskId: task.id, toolName, error }) } - if ( - (eventName === RooCodeEventName.TaskTokenUsageUpdated || eventName === RooCodeEventName.TaskCompleted) && - taskMetricsId - ) { + if (eventName === RooCodeEventName.TaskTokenUsageUpdated || eventName === RooCodeEventName.TaskCompleted) { + // Wait for taskMetricsId to be set by the TaskStarted handler. + // This prevents a race condition where these events arrive before + // the TaskStarted handler finishes its async database operations. + // Note: taskMetricsReady is also resolved on disconnect to prevent deadlock. + await taskMetricsReady + + // Guard: taskMetricsReady may have been resolved due to disconnect + // without taskMetricsId being set. Skip metrics update in this case. + if (!taskMetricsId) { + logger.info(`skipping metrics update: taskMetricsId not set (event: ${eventName})`) + return + } + const duration = Date.now() - taskStartedAt const { totalCost, totalTokensIn, totalTokensOut, contextTokens, totalCacheWrites, totalCacheReads } = @@ -421,6 +441,10 @@ export const runTask = async ({ run, task, publish, logger, jobToken }: RunTaskO client.on(IpcMessageType.Disconnect, async () => { logger.info(`disconnected from IPC socket -> ${ipcSocketPath}`) isClientDisconnected = true + // Resolve taskMetricsReady to unblock any handlers waiting on it. + // This prevents deadlock if TaskStarted never fired or threw before resolving. + // The handlers check for taskMetricsId being set before proceeding. + resolveTaskMetricsReady() }) client.sendCommand({