Skip to content

Commit d928b80

Browse files
committed
feat(web-evals): enhance dashboard with dynamic tool columns and UX improvements
- Add aggregate statistics panel on run details page
- Add dynamic tool usage columns sorted by total usage
- Add API config selector for multi-config imports
- Add language toggle buttons for exercise selection
- Persist concurrency/timeout settings to localStorage
- Make table rows clickable for faster navigation
- Add View Settings option in dropdown menu
- Support controlled mode for MultiSelect component
- Filter deprecated models from Roo Code Cloud list
1 parent f173c9c commit d928b80

File tree

6 files changed

+557
-50
lines changed

6 files changed

+557
-50
lines changed

apps/web-evals/src/app/runs/[id]/run.tsx

Lines changed: 170 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -5,15 +5,36 @@ import { LoaderCircle } from "lucide-react"
55

66
import type { Run, TaskMetrics as _TaskMetrics } from "@roo-code/evals"
77

8-
import { formatCurrency, formatDuration, formatTokens } from "@/lib/formatters"
8+
import { formatCurrency, formatDuration, formatTokens, formatToolUsageSuccessRate } from "@/lib/formatters"
99
import { useRunStatus } from "@/hooks/use-run-status"
10-
import { Table, TableBody, TableCell, TableHead, TableHeader, TableRow } from "@/components/ui"
10+
import {
11+
Table,
12+
TableBody,
13+
TableCell,
14+
TableHead,
15+
TableHeader,
16+
TableRow,
17+
Tooltip,
18+
TooltipContent,
19+
TooltipTrigger,
20+
} from "@/components/ui"
1121

1222
import { TaskStatus } from "./task-status"
1323
import { RunStatus } from "./run-status"
1424

1525
type TaskMetrics = Pick<_TaskMetrics, "tokensIn" | "tokensOut" | "tokensContext" | "duration" | "cost">
1626

27+
// Attempt and failure counters recorded for a single tool.
type ToolUsageEntry = { attempts: number; failures: number }
28+
// Map from tool name to that tool's attempt/failure counters.
type ToolUsage = Record<string, ToolUsageEntry>
29+
30+
// Generate a short label from a tool name by upper-casing the first character of
// each underscore-separated segment
// (e.g., "read_file" -> "RF", "list_code_definition_names" -> "LCDN").
// Empty segments (leading, trailing, or doubled underscores) contribute nothing, so
// an empty name yields an empty string. Distinct tool names can produce the same
// abbreviation, so the result is not a unique identifier.
31+
function getToolAbbreviation(toolName: string): string {
32+
return toolName
33+
.split("_")
34+
// word[0] is undefined for an empty segment; the `?? ""` fallback drops it from the output.
.map((word) => word[0]?.toUpperCase() ?? "")
35+
.join("")
36+
}
37+
1738
export function Run({ run }: { run: Run }) {
1839
const runStatus = useRunStatus(run)
1940
const { tasks, tokenUsage, usageUpdatedAt } = runStatus
@@ -41,16 +62,162 @@ export function Run({ run }: { run: Run }) {
4162
// eslint-disable-next-line react-hooks/exhaustive-deps
4263
}, [tasks, tokenUsage, usageUpdatedAt])
4364

65+
// Compute aggregate stats
66+
const stats = useMemo(() => {
67+
if (!tasks) return null
68+
69+
const passed = tasks.filter((t) => t.passed === true).length
70+
const failed = tasks.filter((t) => t.passed === false).length
71+
const running = tasks.filter((t) => t.startedAt && !t.finishedAt).length
72+
const pending = tasks.filter((t) => !t.startedAt && !t.finishedAt).length
73+
const total = tasks.length
74+
const completed = passed + failed
75+
76+
let totalTokensIn = 0
77+
let totalTokensOut = 0
78+
let totalCost = 0
79+
let totalDuration = 0
80+
81+
// Aggregate tool usage from completed tasks
82+
const toolUsage: ToolUsage = {}
83+
84+
for (const task of tasks) {
85+
const metrics = taskMetrics[task.id]
86+
if (metrics) {
87+
totalTokensIn += metrics.tokensIn
88+
totalTokensOut += metrics.tokensOut
89+
totalCost += metrics.cost
90+
totalDuration += metrics.duration
91+
}
92+
93+
// Aggregate tool usage from finished tasks with taskMetrics
94+
if (task.finishedAt && task.taskMetrics?.toolUsage) {
95+
for (const [key, usage] of Object.entries(task.taskMetrics.toolUsage)) {
96+
const tool = key as keyof ToolUsage
97+
if (!toolUsage[tool]) {
98+
toolUsage[tool] = { attempts: 0, failures: 0 }
99+
}
100+
toolUsage[tool].attempts += usage.attempts
101+
toolUsage[tool].failures += usage.failures
102+
}
103+
}
104+
}
105+
106+
return {
107+
passed,
108+
failed,
109+
running,
110+
pending,
111+
total,
112+
completed,
113+
passRate: completed > 0 ? ((passed / completed) * 100).toFixed(1) : null,
114+
totalTokensIn,
115+
totalTokensOut,
116+
totalCost,
117+
totalDuration,
118+
toolUsage,
119+
}
120+
}, [tasks, taskMetrics])
121+
44122
return (
45123
<>
46124
<div>
47-
<div className="mb-2">
125+
<div className="mb-4">
48126
<div>
49127
<div className="font-mono">{run.model}</div>
50128
{run.description && <div className="text-sm text-muted-foreground">{run.description}</div>}
51129
</div>
52130
{!run.taskMetricsId && <RunStatus runStatus={runStatus} />}
53131
</div>
132+
133+
{stats && (
134+
<div className="mb-4 p-4 border rounded-lg bg-muted/50">
135+
{/* Main Stats Row */}
136+
<div className="flex flex-wrap items-start justify-between gap-x-6 gap-y-3">
137+
{/* Passed/Failed */}
138+
<div className="text-center">
139+
<div className="text-2xl font-bold whitespace-nowrap">
140+
<span className="text-green-600">{stats.passed}</span>
141+
<span className="text-muted-foreground mx-1">/</span>
142+
<span className="text-red-600">{stats.failed}</span>
143+
{stats.running > 0 && (
144+
<span className="text-yellow-600 text-sm ml-2">({stats.running})</span>
145+
)}
146+
</div>
147+
<div className="text-xs text-muted-foreground">Passed / Failed</div>
148+
</div>
149+
150+
{/* Pass Rate */}
151+
<div className="text-center">
152+
<div className="text-2xl font-bold">{stats.passRate ? `${stats.passRate}%` : "-"}</div>
153+
<div className="text-xs text-muted-foreground">Pass Rate</div>
154+
</div>
155+
156+
{/* Tokens */}
157+
<div className="text-center">
158+
<div className="text-xl font-bold font-mono whitespace-nowrap">
159+
{formatTokens(stats.totalTokensIn)}
160+
<span className="text-muted-foreground mx-1">/</span>
161+
{formatTokens(stats.totalTokensOut)}
162+
</div>
163+
<div className="text-xs text-muted-foreground">Tokens In / Out</div>
164+
</div>
165+
166+
{/* Cost */}
167+
<div className="text-center">
168+
<div className="text-2xl font-bold font-mono">{formatCurrency(stats.totalCost)}</div>
169+
<div className="text-xs text-muted-foreground">Cost</div>
170+
</div>
171+
172+
{/* Duration */}
173+
<div className="text-center">
174+
<div className="text-2xl font-bold font-mono whitespace-nowrap">
175+
{stats.totalDuration > 0 ? formatDuration(stats.totalDuration) : "-"}
176+
</div>
177+
<div className="text-xs text-muted-foreground">Duration</div>
178+
</div>
179+
180+
{/* Tool Usage - Inline */}
181+
{Object.keys(stats.toolUsage).length > 0 && (
182+
<div className="flex items-center gap-2 flex-wrap">
183+
{Object.entries(stats.toolUsage)
184+
.sort(([, a], [, b]) => b.attempts - a.attempts)
185+
.map(([toolName, usage]) => {
186+
const abbr = getToolAbbreviation(toolName)
187+
const successRate =
188+
usage.attempts > 0
189+
? ((usage.attempts - usage.failures) / usage.attempts) * 100
190+
: 100
191+
const rateColor =
192+
successRate === 100
193+
? "text-green-500"
194+
: successRate >= 80
195+
? "text-yellow-500"
196+
: "text-red-500"
197+
return (
198+
<Tooltip key={toolName}>
199+
<TooltipTrigger asChild>
200+
<div className="flex items-center gap-1 px-2 py-1 rounded bg-background/50 border border-border/50 hover:border-border transition-colors cursor-default text-xs">
201+
<span className="font-medium text-muted-foreground">
202+
{abbr}
203+
</span>
204+
<span className="font-bold tabular-nums">
205+
{usage.attempts}
206+
</span>
207+
<span className={`${rateColor}`}>
208+
{formatToolUsageSuccessRate(usage)}
209+
</span>
210+
</div>
211+
</TooltipTrigger>
212+
<TooltipContent side="bottom">{toolName}</TooltipContent>
213+
</Tooltip>
214+
)
215+
})}
216+
</div>
217+
)}
218+
</div>
219+
</div>
220+
)}
54221
{!tasks ? (
55222
<LoaderCircle className="size-4 animate-spin" />
56223
) : (

0 commit comments

Comments
 (0)