diff --git a/apps/web-evals/src/app/api/runs/[id]/logs/failed/route.ts b/apps/web-evals/src/app/api/runs/[id]/logs/failed/route.ts index f8c6cec06be..8b2760df987 100644 --- a/apps/web-evals/src/app/api/runs/[id]/logs/failed/route.ts +++ b/apps/web-evals/src/app/api/runs/[id]/logs/failed/route.ts @@ -61,7 +61,7 @@ export async function GET(request: NextRequest, { params }: { params: Promise<{ archive.on("error", reject) }) - // Add each failed task's log file to the archive + // Add each failed task's log file and history files to the archive const logDir = path.join(LOG_BASE_PATH, String(runId)) let filesAdded = 0 @@ -69,18 +69,36 @@ export async function GET(request: NextRequest, { params }: { params: Promise<{ // Sanitize language and exercise to prevent path traversal const safeLanguage = sanitizePathComponent(task.language) const safeExercise = sanitizePathComponent(task.exercise) + const expectedBase = path.resolve(LOG_BASE_PATH) + + // Add the log file const logFileName = `${safeLanguage}-${safeExercise}.log` const logFilePath = path.join(logDir, logFileName) // Verify the resolved path is within the expected directory (defense in depth) - const resolvedPath = path.resolve(logFilePath) - const expectedBase = path.resolve(LOG_BASE_PATH) - if (!resolvedPath.startsWith(expectedBase)) { - continue // Skip files with suspicious paths + const resolvedLogPath = path.resolve(logFilePath) + if (resolvedLogPath.startsWith(expectedBase) && fs.existsSync(logFilePath)) { + archive.file(logFilePath, { name: logFileName }) + filesAdded++ } - if (fs.existsSync(logFilePath)) { - archive.file(logFilePath, { name: logFileName }) + // Add the API conversation history file + // Format: {language}-{exercise}.{iteration}_api_conversation_history.json + const apiHistoryFileName = `${safeLanguage}-${safeExercise}.${task.iteration}_api_conversation_history.json` + const apiHistoryFilePath = path.join(logDir, apiHistoryFileName) + const resolvedApiHistoryPath = path.resolve(apiHistoryFilePath) + if (resolvedApiHistoryPath.startsWith(expectedBase) && fs.existsSync(apiHistoryFilePath)) { + archive.file(apiHistoryFilePath, { name: apiHistoryFileName }) + filesAdded++ + } + + // Add the UI messages file + // Format: {language}-{exercise}.{iteration}_ui_messages.json + const uiMessagesFileName = `${safeLanguage}-${safeExercise}.${task.iteration}_ui_messages.json` + const uiMessagesFilePath = path.join(logDir, uiMessagesFileName) + const resolvedUiMessagesPath = path.resolve(uiMessagesFilePath) + if (resolvedUiMessagesPath.startsWith(expectedBase) && fs.existsSync(uiMessagesFilePath)) { + archive.file(uiMessagesFilePath, { name: uiMessagesFileName }) filesAdded++ } } diff --git a/apps/web-evals/src/app/runs/[id]/run.tsx b/apps/web-evals/src/app/runs/[id]/run.tsx index 41581a21c41..e2079343d32 100644 --- a/apps/web-evals/src/app/runs/[id]/run.tsx +++ b/apps/web-evals/src/app/runs/[id]/run.tsx @@ -1,8 +1,8 @@ "use client" -import { useMemo, useState, useCallback, useEffect } from "react" +import { useMemo, useState, useCallback, useEffect, Fragment } from "react" import { toast } from "sonner" -import { LoaderCircle, FileText, Copy, Check, StopCircle } from "lucide-react" +import { LoaderCircle, FileText, Copy, Check, StopCircle, List, Layers } from "lucide-react" import type { Run, TaskMetrics as _TaskMetrics, Task } from "@roo-code/evals" import type { ToolName } from "@roo-code/types" @@ -41,6 +41,9 @@ import { RunStatus } from "./run-status" type TaskMetrics = Pick<_TaskMetrics, "tokensIn" | "tokensOut" | "tokensContext" | "duration" | "cost"> +// Extended Task type with taskMetrics from useRunStatus +type TaskWithMetrics = Task & { taskMetrics: _TaskMetrics | null } + type ToolUsageEntry = { attempts: number; failures: number } type ToolUsage = Record @@ -250,6 +253,19 @@ export function Run({ run }: { run: Run }) { const [copied, setCopied] = useState(false) const [showKillDialog, setShowKillDialog] = useState(false) const [isKilling, setIsKilling] = useState(false) + const [groupByStatus, setGroupByStatus] = useState(() => { + // Initialize from localStorage if available (client-side only) + if (typeof window !== "undefined") { + const stored = localStorage.getItem("evals-group-by-status") + return stored === "true" + } + return false + }) + + // Persist groupByStatus to localStorage + useEffect(() => { + localStorage.setItem("evals-group-by-status", String(groupByStatus)) + }, [groupByStatus]) // Determine if run is still active (has heartbeat or runners) const isRunActive = !run.taskMetricsId && (!!heartbeat || (runners && runners.length > 0)) @@ -300,41 +316,6 @@ export function Run({ run }: { run: Run }) { return () => document.removeEventListener("keydown", handleKeyDown) }, [selectedTask]) - const onViewTaskLog = useCallback( - async (task: Task) => { - // Only allow viewing logs for tasks that have started - if (!task.startedAt && !tokenUsage.get(task.id)) { - toast.error("Task has not started yet") - return - } - - setSelectedTask(task) - setIsLoadingLog(true) - setTaskLog(null) - - try { - const response = await fetch(`/api/runs/${run.id}/logs/${task.id}`) - - if (!response.ok) { - const error = await response.json() - toast.error(error.error || "Failed to load log") - setSelectedTask(null) - return - } - - const data = await response.json() - setTaskLog(data.logContent) - } catch (error) { - console.error("Error loading task log:", error) - toast.error("Failed to load log") - setSelectedTask(null) - } finally { - setIsLoadingLog(false) - } - }, - [run.id, tokenUsage], - ) - const taskMetrics: Record = useMemo(() => { // Reference usageUpdatedAt to trigger recomputation when Map contents change void usageUpdatedAt @@ -376,6 +357,44 @@ export function Run({ run }: { run: Run }) { return metrics }, [tasks, tokenUsage, usageUpdatedAt]) + const onViewTaskLog = useCallback( + async (task: Task) => { + // Only allow viewing logs for tasks that have started. + // Note: we treat presence of derived metrics as evidence of a started task, + // since this page may be rendered without streaming `tokenUsage` populated. + const hasStarted = !!task.startedAt || !!tokenUsage.get(task.id) || !!taskMetrics[task.id] + if (!hasStarted) { + toast.error("Task has not started yet") + return + } + + setSelectedTask(task) + setIsLoadingLog(true) + setTaskLog(null) + + try { + const response = await fetch(`/api/runs/${run.id}/logs/${task.id}`) + + if (!response.ok) { + const error = await response.json() + toast.error(error.error || "Failed to load log") + setSelectedTask(null) + return + } + + const data = await response.json() + setTaskLog(data.logContent) + } catch (error) { + console.error("Error loading task log:", error) + toast.error("Failed to load log") + setSelectedTask(null) + } finally { + setIsLoadingLog(false) + } + }, + [run.id, tokenUsage, taskMetrics], + ) + // Collect all unique tool names from all tasks and sort by total attempts const toolColumns = useMemo(() => { // Reference usageUpdatedAt to trigger recomputation when Map contents change @@ -463,10 +482,13 @@ export function Run({ run }: { run: Run }) { } } + const remaining = tasks.length - completed + return { passed, failed, completed, + remaining, passRate: completed > 0 ? ((passed / completed) * 100).toFixed(1) : null, totalTokensIn, totalTokensOut, @@ -501,258 +523,399 @@ export function Run({ run }: { run: Run }) { return Date.now() - startTime }, [tasks, run.createdAt, run.taskMetricsId, usageUpdatedAt]) - return ( - <> -
- {stats && ( -
- {/* Provider, Model title and status */} -
- {run.settings?.apiProvider && ( - {run.settings.apiProvider} - )} -
{run.model}
- - {run.description && ( - - {run.description} - )} - {isRunActive && ( - - - - - Stop all containers for this run - - )} -
- {/* Main Stats Row */} -
- {/* Passed/Failed */} -
-
- {stats.passed} - / - {stats.failed} -
-
Passed / Failed
-
+ // Task status categories + type TaskStatusCategory = "failed" | "in_progress" | "passed" | "not_started" + + const getTaskStatusCategory = useCallback( + (task: TaskWithMetrics): TaskStatusCategory => { + if (task.passed === false) return "failed" + if (task.passed === true) return "passed" + // Check streaming data, DB metrics, or startedAt timestamp + const hasStarted = !!task.startedAt || !!tokenUsage.get(task.id) || !!taskMetrics[task.id] + if (hasStarted) return "in_progress" + return "not_started" + }, + [tokenUsage, taskMetrics], + ) - {/* Pass Rate */} -
-
= 80 - ? "text-yellow-500" - : "text-red-500" - }`}> - {stats.passRate ? `${stats.passRate}%` : "-"} -
-
Pass Rate
-
+ // Group tasks by status while preserving original index + const groupedTasks = useMemo(() => { + if (!tasks || !groupByStatus) return null - {/* Tokens */} -
-
- {formatTokens(stats.totalTokensIn)} - / - {formatTokens(stats.totalTokensOut)} -
-
Tokens In / Out
-
+ const groups: Record> = { + failed: [], + in_progress: [], + passed: [], + not_started: [], + } - {/* Cost */} -
-
{formatCurrency(stats.totalCost)}
-
Cost
-
+ tasks.forEach((task, index) => { + const status = getTaskStatusCategory(task) + groups[status].push({ task, originalIndex: index }) + }) - {/* Duration */} -
-
- {stats.totalDuration > 0 ? formatDuration(stats.totalDuration) : "-"} -
-
Duration
-
+ return groups + }, [tasks, groupByStatus, getTaskStatusCategory]) + + const statusLabels = useMemo( + (): Record => ({ + failed: { label: "Failed", className: "text-red-500", count: groupedTasks?.failed.length ?? 0 }, + in_progress: { + label: "In Progress", + className: "text-yellow-500", + count: groupedTasks?.in_progress.length ?? 0, + }, + passed: { label: "Passed", className: "text-green-500", count: groupedTasks?.passed.length ?? 0 }, + not_started: { + label: "Not Started", + className: "text-muted-foreground", + count: groupedTasks?.not_started.length ?? 0, + }, + }), + [groupedTasks], + ) - {/* Elapsed Time */} -
-
- {elapsedTime !== null ? formatDuration(elapsedTime) : "-"} -
-
Elapsed
-
-
+ const statusOrder: TaskStatusCategory[] = ["failed", "in_progress", "passed", "not_started"] - {/* Tool Usage Row */} - {Object.keys(stats.toolUsage).length > 0 && ( -
- {Object.entries(stats.toolUsage) - .sort(([, a], [, b]) => b.attempts - a.attempts) - .map(([toolName, usage]) => { - const abbr = getToolAbbreviation(toolName) - const successRate = - usage.attempts > 0 - ? ((usage.attempts - usage.failures) / usage.attempts) * 100 - : 100 - const rateColor = - successRate === 100 - ? "text-green-500" - : successRate >= 80 - ? "text-yellow-500" - : "text-red-500" - return ( - - -
- - {abbr} - - {usage.attempts} - - {formatToolUsageSuccessRate(usage)} - -
-
- {toolName} -
- ) - })} -
- )} + // Helper to render a task row + const renderTaskRow = (task: TaskWithMetrics, originalIndex: number) => { + const hasStarted = !!task.startedAt || !!tokenUsage.get(task.id) || !!taskMetrics[task.id] + return ( + hasStarted && onViewTaskLog(task)}> + + {originalIndex + 1} + + +
+ +
+ + {task.language}/{task.exercise} + {task.iteration > 1 && ( + (#{task.iteration}) + )} + + {hasStarted && ( + + + + + Click to view log + + )} +
+
+ {taskMetrics[task.id] ? ( + <> + +
+
{formatTokens(taskMetrics[task.id]!.tokensIn)}
/ +
{formatTokens(taskMetrics[task.id]!.tokensOut)}
+
+
+ + {formatTokens(taskMetrics[task.id]!.tokensContext)} + + {toolColumns.map((toolName) => { + const dbUsage = task.taskMetrics?.toolUsage?.[toolName] + const streamingUsage = toolUsage.get(task.id)?.[toolName] + const usage = task.finishedAt ? (dbUsage ?? streamingUsage) : streamingUsage + + const successRate = + usage && usage.attempts > 0 + ? ((usage.attempts - usage.failures) / usage.attempts) * 100 + : 100 + const rateColor = + successRate === 100 + ? "text-muted-foreground" + : successRate >= 80 + ? "text-yellow-500" + : "text-red-500" + return ( + + {usage ? ( +
+ {usage.attempts} + {formatToolUsageSuccessRate(usage)} +
+ ) : ( + - + )} +
+ ) + })} + + {taskMetrics[task.id]!.duration ? formatDuration(taskMetrics[task.id]!.duration) : "-"} + + + {formatCurrency(taskMetrics[task.id]!.cost)} + + + ) : ( + )} +
+ ) + } + + return ( + <> +
{!tasks ? ( ) : ( - - - - Exercise - Tokens In / Out - Context - {toolColumns.map((toolName) => ( - - - {getToolAbbreviation(toolName)} - {toolName} - - - ))} - Duration - Cost - - - - {tasks.map((task) => { - const hasStarted = !!task.startedAt || !!tokenUsage.get(task.id) - return ( - hasStarted && onViewTaskLog(task)}> - -
- -
- - {task.language}/{task.exercise} - {task.iteration > 1 && ( - - (#{task.iteration}) - - )} - - {hasStarted && ( - - - - - Click to view log - - )} -
-
-
- {taskMetrics[task.id] ? ( + <> + {/* View Toggle */} +
+ + + + + + {groupByStatus ? "Show tasks in run order" : "Group tasks by status"} + + +
+
+ + {stats && ( + + + {/* Provider, Model title and status */} +
+ {run.settings?.apiProvider && ( + + {run.settings.apiProvider} + + )} +
{run.model}
+ + {run.description && ( + + - {run.description} + + )} + {isRunActive && ( + + + + + + Stop all containers for this run + + + )} +
+ {/* Main Stats Row */} +
+ {/* Pass Rate / Fail Rate / Remaining % */} +
+
+ + {stats.completed > 0 + ? `${((stats.passed / stats.completed) * 100).toFixed(1)}%` + : "-"} + + / + + {stats.completed > 0 + ? `${((stats.failed / stats.completed) * 100).toFixed(1)}%` + : "-"} + + / + + {tasks.length > 0 + ? `${((stats.remaining / tasks.length) * 100).toFixed(1)}%` + : "-"} + +
+
+ {stats.passed} + {" / "} + {stats.failed} + {" / "} + {stats.remaining} + {" of "} + {tasks.length} +
+
+ + {/* Tokens */} +
+
+ {formatTokens(stats.totalTokensIn)} + / + {formatTokens(stats.totalTokensOut)} +
+
Tokens In / Out
+
+ + {/* Cost */} +
+
+ {formatCurrency(stats.totalCost)} +
+
Cost
+
+ + {/* Duration */} +
+
+ {stats.totalDuration > 0 + ? formatDuration(stats.totalDuration) + : "-"} +
+
Duration
+
+ + {/* Elapsed Time */} +
+
+ {elapsedTime !== null ? formatDuration(elapsedTime) : "-"} +
+
Elapsed
+
+ + {/* Estimated Time Remaining - only show if run is active and we have data */} + {!run.taskMetricsId && + elapsedTime !== null && + stats.completed > 0 && + stats.remaining > 0 && ( +
+
+ ~ + {formatDuration( + (elapsedTime / stats.completed) * stats.remaining, + )} +
+
+ Est. Remaining +
+
+ )} +
+ + {/* Tool Usage Row */} + {Object.keys(stats.toolUsage).length > 0 && ( +
+ {Object.entries(stats.toolUsage) + .sort(([, a], [, b]) => b.attempts - a.attempts) + .map(([toolName, usage]) => { + const abbr = getToolAbbreviation(toolName) + const successRate = + usage.attempts > 0 + ? ((usage.attempts - usage.failures) / + usage.attempts) * + 100 + : 100 + const rateColor = + successRate === 100 + ? "text-green-500" + : successRate >= 80 + ? "text-yellow-500" + : "text-red-500" + return ( + + +
+ + {abbr} + + + {usage.attempts} + + + {formatToolUsageSuccessRate(usage)} + +
+
+ + {toolName} + +
+ ) + })} +
+ )} +
- ) - })} - -
+ )} + + # + Exercise + Tokens In / Out + Context + {toolColumns.map((toolName) => ( + + + {getToolAbbreviation(toolName)} + {toolName} + + + ))} + Duration + Cost + + + + {groupByStatus && groupedTasks + ? // Grouped view + statusOrder.map((status) => { + const group = groupedTasks[status] + if (group.length === 0) return null + const { label, className } = statusLabels[status] + return ( + + + + + {label} ({group.length}) + + + + {group.map(({ task, originalIndex }) => + renderTaskRow(task, originalIndex), + )} + + ) + }) + : // Default order view + tasks.map((task, index) => renderTaskRow(task, index))} + + + )}
diff --git a/apps/web-evals/src/lib/__tests__/formatters.spec.ts b/apps/web-evals/src/lib/__tests__/formatters.spec.ts new file mode 100644 index 00000000000..88c8f94af9b --- /dev/null +++ b/apps/web-evals/src/lib/__tests__/formatters.spec.ts @@ -0,0 +1,30 @@ +import { formatDuration, formatTokens } from "../formatters" + +describe("formatDuration()", () => { + it("formats as H:MM:SS", () => { + expect(formatDuration(0)).toBe("0:00:00") + expect(formatDuration(1_000)).toBe("0:00:01") + expect(formatDuration(61_000)).toBe("0:01:01") + expect(formatDuration(3_661_000)).toBe("1:01:01") + }) +}) + +describe("formatTokens()", () => { + it("formats small numbers without suffix", () => { + expect(formatTokens(0)).toBe("0") + expect(formatTokens(999)).toBe("999") + }) + + it("formats thousands without decimals and clamps to 1.0M at boundary", () => { + expect(formatTokens(1_000)).toBe("1k") + expect(formatTokens(72_500)).toBe("73k") + expect(formatTokens(999_499)).toBe("999k") + expect(formatTokens(999_500)).toBe("1.0M") + }) + + it("formats millions with one decimal and clamps to 1.0B at boundary", () => { + expect(formatTokens(1_000_000)).toBe("1.0M") + expect(formatTokens(3_240_000)).toBe("3.2M") + expect(formatTokens(999_950_000)).toBe("1.0B") + }) +}) diff --git a/apps/web-evals/src/lib/formatters.ts b/apps/web-evals/src/lib/formatters.ts index e082e4f02ac..155f27dd86e 100644 --- a/apps/web-evals/src/lib/formatters.ts +++ b/apps/web-evals/src/lib/formatters.ts @@ -11,21 +11,10 @@ export const formatDuration = (durationMs: number) => { const minutes = Math.floor((seconds % 3600) / 60) const remainingSeconds = seconds % 60 - const parts = [] - - if (hours > 0) { - parts.push(`${hours}h`) - } - - if (minutes > 0) { - parts.push(`${minutes}m`) - } - - if (remainingSeconds > 0 || parts.length === 0) { - parts.push(`${remainingSeconds}s`) - } - - return parts.join(" ") + // Format as H:MM:SS + const mm = minutes.toString().padStart(2, "0") + const ss = remainingSeconds.toString().padStart(2, "0") + return `${hours}:${mm}:${ss}` } export const formatTokens = (tokens: number) => { @@ -34,11 +23,23 @@ export const formatTokens = (tokens: number) => { } if (tokens < 1000000) { - return `${(tokens / 1000).toFixed(1)}k` + // No decimal for thousands (e.g., 72k not 72.5k) + const rounded = Math.round(tokens / 1000) + // If rounding crosses the boundary to 1000k, show as 1.0M instead + if (rounded >= 1000) { + return "1.0M" + } + return `${rounded}k` } if (tokens < 1000000000) { - return `${(tokens / 1000000).toFixed(1)}M` + // Keep decimal for millions (e.g., 3.2M) + const rounded = Math.round(tokens / 100000) / 10 // Round to 1 decimal + // If rounding crosses the boundary to 1000M, show as 1.0B instead + if (rounded >= 1000) { + return "1.0B" + } + return `${rounded.toFixed(1)}M` } return `${(tokens / 1000000000).toFixed(1)}B`