// npx vitest run src/actions/__tests__/killRun.spec.ts

import { execFileSync } from "child_process"

// Mock child_process so no real docker command is ever executed.
vi.mock("child_process", () => ({
	execFileSync: vi.fn(),
	spawn: vi.fn(),
}))

// Mock next/cache
vi.mock("next/cache", () => ({
	revalidatePath: vi.fn(),
}))

// Mock redis client
vi.mock("@/lib/server/redis", () => ({
	redisClient: vi.fn().mockResolvedValue({
		del: vi.fn().mockResolvedValue(1),
	}),
}))

// Mock @roo-code/evals
vi.mock("@roo-code/evals", () => ({
	createRun: vi.fn(),
	deleteRun: vi.fn(),
	createTask: vi.fn(),
	exerciseLanguages: [],
	getExercisesForLanguage: vi.fn().mockResolvedValue([]),
}))

// Mock timers so the 10 second pause inside killRun does not slow the suite down.
vi.useFakeTimers()

// Import after mocks
import { killRun } from "../runs"

const mockExecFileSync = execFileSync as ReturnType<typeof vi.fn>

/**
 * Starts killRun and fast-forwards past the 10 second pause it takes between
 * killing the controller and killing the task runners.
 */
async function runKill(runId: number) {
	const pending = killRun(runId)
	await vi.advanceTimersByTimeAsync(10000)
	return pending
}

describe("killRun", () => {
	beforeEach(() => {
		vi.clearAllMocks()
	})

	afterEach(() => {
		vi.clearAllTimers()
	})

	afterAll(() => {
		// Fake timers were installed at module scope; hand real timers back to
		// whatever runs after this file in the same worker.
		vi.useRealTimers()
	})

	it("should kill controller first, wait, then kill task containers", async () => {
		// execFileSync is used for all docker commands
		mockExecFileSync
			.mockReturnValueOnce("") // docker kill controller
			.mockReturnValueOnce("evals-task-123-456.0\nevals-task-123-789.1\n") // docker ps
			.mockReturnValueOnce("") // docker kill evals-task-123-456.0
			.mockReturnValueOnce("") // docker kill evals-task-123-789.1

		const result = await runKill(123)

		expect(result.success).toBe(true)
		expect(result.killedContainers).toContain("evals-controller-123")
		expect(result.killedContainers).toContain("evals-task-123-456.0")
		expect(result.killedContainers).toContain("evals-task-123-789.1")
		expect(result.errors).toHaveLength(0)

		// Verify execFileSync was called for docker kill
		expect(mockExecFileSync).toHaveBeenNthCalledWith(
			1,
			"docker",
			["kill", "evals-controller-123"],
			expect.any(Object),
		)
		// Verify execFileSync was called for docker ps with run-specific filter
		expect(mockExecFileSync).toHaveBeenNthCalledWith(
			2,
			"docker",
			["ps", "--format", "{{.Names}}", "--filter", "name=evals-task-123-"],
			expect.any(Object),
		)
	})

	it("should continue killing runners even if controller is not running", async () => {
		mockExecFileSync
			.mockImplementationOnce(() => {
				throw new Error("No such container")
			}) // controller kill fails
			.mockReturnValueOnce("evals-task-456-100.0\n") // docker ps
			.mockReturnValueOnce("") // docker kill task

		const result = await runKill(456)

		expect(result.success).toBe(true)
		expect(result.killedContainers).toContain("evals-task-456-100.0")
		// Controller not in list since it failed
		expect(result.killedContainers).not.toContain("evals-controller-456")
	})

	it("should clear Redis state after killing containers", async () => {
		const mockDel = vi.fn().mockResolvedValue(1)
		const { redisClient } = await import("@/lib/server/redis")
		vi.mocked(redisClient).mockResolvedValue({ del: mockDel } as never)

		mockExecFileSync
			.mockReturnValueOnce("") // controller kill
			.mockReturnValueOnce("") // docker ps (no tasks)

		await runKill(789)

		expect(mockDel).toHaveBeenCalledWith("heartbeat:789")
		expect(mockDel).toHaveBeenCalledWith("runners:789")
	})

	it("should handle docker ps failure gracefully", async () => {
		mockExecFileSync
			.mockReturnValueOnce("") // controller kill succeeds
			.mockImplementationOnce(() => {
				throw new Error("Docker error")
			}) // docker ps fails

		const result = await runKill(111)

		// Should still be successful because controller was killed
		expect(result.success).toBe(true)
		expect(result.killedContainers).toContain("evals-controller-111")
		expect(result.errors).toContain("Failed to list Docker task containers")
	})

	it("should handle individual task kill failures", async () => {
		mockExecFileSync
			.mockReturnValueOnce("") // controller kill
			.mockReturnValueOnce("evals-task-222-300.0\nevals-task-222-400.0\n") // docker ps
			.mockImplementationOnce(() => {
				throw new Error("Kill failed")
			}) // first task kill fails
			.mockReturnValueOnce("") // second task kill succeeds

		const result = await runKill(222)

		expect(result.success).toBe(true)
		expect(result.killedContainers).toContain("evals-controller-222")
		expect(result.killedContainers).toContain("evals-task-222-400.0")
		expect(result.errors.length).toBe(1)
		expect(result.errors[0]).toContain("evals-task-222-300.0")
	})

	it("should return success with no containers when nothing is running", async () => {
		mockExecFileSync
			.mockImplementationOnce(() => {
				throw new Error("No such container")
			}) // controller not running
			.mockReturnValueOnce("") // no task containers

		const result = await runKill(333)

		expect(result.success).toBe(true)
		expect(result.killedContainers).toHaveLength(0)
		expect(result.errors).toHaveLength(0)
	})

	it("should only kill containers belonging to the specific run", async () => {
		mockExecFileSync
			.mockReturnValueOnce("") // controller kill
			// docker ps also reports a container of run 5555, whose name merely
			// contains the run-555 digits; it must be left alone.
			.mockReturnValueOnce("evals-task-555-100.0\nevals-task-5555-1.0\n")
			.mockReturnValueOnce("") // docker kill task

		const result = await runKill(555)

		expect(result.success).toBe(true)
		expect(result.killedContainers).toContain("evals-task-555-100.0")
		expect(result.killedContainers).not.toContain("evals-task-5555-1.0")
		// controller kill + docker ps + exactly one task kill
		expect(mockExecFileSync).toHaveBeenCalledTimes(3)
		// Verify execFileSync was called for docker ps with run-specific filter
		expect(mockExecFileSync).toHaveBeenNthCalledWith(
			2,
			"docker",
			["ps", "--format", "{{.Names}}", "--filter", "name=evals-task-555-"],
			expect.any(Object),
		)
	})
})
export type KillRunResult = {
	success: boolean
	killedContainers: string[]
	errors: string[]
}

/** Timeout, in milliseconds, applied to each individual docker CLI call. */
const DOCKER_COMMAND_TIMEOUT_MS = 10000

/** Pause, in milliseconds, between killing the controller and killing its task runners. */
const RUNNER_KILL_DELAY_MS = 10000

const sleep = (ms: number): Promise<void> => new Promise((resolve) => setTimeout(resolve, ms))

/**
 * Runs a docker CLI command synchronously and returns its stdout as UTF-8 text.
 * Arguments are passed as an argv array (no shell), and the call throws when the
 * command fails or exceeds DOCKER_COMMAND_TIMEOUT_MS.
 */
function runDocker(args: string[]): string {
	return execFileSync("docker", args, { encoding: "utf-8", timeout: DOCKER_COMMAND_TIMEOUT_MS })
}

/**
 * Kill all Docker containers associated with a run (controller and task runners).
 * Kills the controller first, waits 10 seconds, then kills runners.
 * Also clears Redis state for heartbeat and runners.
 *
 * Container naming conventions:
 * - Controller: evals-controller-{runId}
 * - Task runners: evals-task-{runId}-{taskId}.{attempt}
 *
 * @param runId - Id of the run to stop; must be a non-negative safe integer.
 * @returns The names of the containers that were killed and a list of
 *   human-readable errors. `success` is true when at least one container was
 *   killed or when nothing went wrong. A controller that is not running is
 *   not treated as an error. This function does not throw for docker or Redis
 *   failures; they are reported through `errors`.
 */
export async function killRun(runId: number): Promise<KillRunResult> {
	const killedContainers: string[] = []
	const errors: string[] = []

	// This is a server action, so the argument arrives from the client and its
	// static type is not enforced at runtime. Refuse anything that is not a
	// plain integer before it is interpolated into container names.
	if (!Number.isSafeInteger(runId) || runId < 0) {
		console.error(`Refusing to kill run with invalid id: ${String(runId)}`)
		return { success: false, killedContainers, errors: ["Invalid run id"] }
	}

	const controllerPattern = `evals-controller-${runId}`
	const taskPattern = `evals-task-${runId}-`

	try {
		// Step 1: Kill the controller first
		console.log(`Killing controller: ${controllerPattern}`)
		try {
			runDocker(["kill", controllerPattern])
			killedContainers.push(controllerPattern)
			console.log(`Killed controller container: ${controllerPattern}`)
		} catch (error) {
			// Controller might not be running - that's ok, continue to kill runners.
			// Deliberately not added to `errors`, but the reason is logged so a
			// docker failure is distinguishable from "already stopped".
			const reason = error instanceof Error ? error.message : String(error)
			console.log(`Controller ${controllerPattern} not running or already stopped (${reason})`)
		}

		// Step 2: Wait 10 seconds before killing runners
		console.log("Waiting 10 seconds before killing runners...")
		await sleep(RUNNER_KILL_DELAY_MS)

		// Step 3: Find and kill all task runner containers for THIS run only
		let taskContainerNames: string[] = []

		try {
			taskContainerNames = runDocker(["ps", "--format", "{{.Names}}", "--filter", `name=${taskPattern}`])
				.split("\n")
				.map((name) => name.trim())
				// Re-check the prefix locally rather than trusting the docker filter
				// alone; this also discards the empty trailing line.
				.filter((name) => name.startsWith(taskPattern))
		} catch (error) {
			console.error("Failed to list task containers:", error)
			errors.push("Failed to list Docker task containers")
		}

		// Kill each task runner container; one failure must not stop the others.
		for (const containerName of taskContainerNames) {
			try {
				runDocker(["kill", containerName])
				killedContainers.push(containerName)
				console.log(`Killed task container: ${containerName}`)
			} catch (error) {
				// Container might have already stopped
				console.error(`Failed to kill container ${containerName}:`, error)
				errors.push(`Failed to kill container: ${containerName}`)
			}
		}

		// Step 4: Clear Redis state
		try {
			const redis = await redisClient()
			const heartbeatKey = `heartbeat:${runId}`
			const runnersKey = `runners:${runId}`

			await redis.del(heartbeatKey)
			await redis.del(runnersKey)
			console.log(`Cleared Redis keys: ${heartbeatKey}, ${runnersKey}`)
		} catch (error) {
			console.error("Failed to clear Redis state:", error)
			errors.push("Failed to clear Redis state")
		}
	} catch (error) {
		// Safety net: every step above handles its own failures.
		console.error("Error in killRun:", error)
		errors.push("Unexpected error while killing containers")
	}

	revalidatePath(`/runs/${runId}`)
	revalidatePath("/runs")

	return {
		success: killedContainers.length > 0 || errors.length === 0,
		killedContainers,
		errors,
	}
}
+
) diff --git a/apps/web-evals/src/app/runs/[id]/run-status.tsx b/apps/web-evals/src/app/runs/[id]/run-status.tsx index 4b94ef14fab..e05b1b51ebe 100644 --- a/apps/web-evals/src/app/runs/[id]/run-status.tsx +++ b/apps/web-evals/src/app/runs/[id]/run-status.tsx @@ -1,55 +1,79 @@ "use client" +import { Link2, Link2Off, CheckCircle2 } from "lucide-react" import type { RunStatus as _RunStatus } from "@/hooks/use-run-status" import { cn } from "@/lib/utils" +import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui" -export const RunStatus = ({ runStatus: { sseStatus, heartbeat, runners = [] } }: { runStatus: _RunStatus }) => ( -
-
-
-
Task Stream:
-
{sseStatus}
-
-
-
-
-
-
-
-
-
Task Controller:
-
{heartbeat ?? "dead"}
-
-
-
-
-
-
-
-
Task Runners:
- {runners.length > 0 &&
{runners?.join(", ")}
} -
-
-) +function StreamIcon({ status }: { status: "connected" | "waiting" | "error" }) { + if (status === "connected") { + return + } + return +} + +export const RunStatus = ({ + runStatus: { sseStatus, heartbeat, runners = [] }, + isComplete = false, +}: { + runStatus: _RunStatus + isComplete?: boolean +}) => { + // For completed runs, show a simple "Complete" badge + if (isComplete) { + return ( + + +
+ +
+
+ + Run complete + +
+ ) + } + + return ( + + +
+ {/* Task Stream status icon */} + + + {/* Task Controller ID */} + {heartbeat ?? "-"} + + {/* Task Runners count */} + 0 ? "text-green-500" : "text-rose-500"}> + {runners.length > 0 ? `${runners.length}r` : "0r"} + +
+
+ +
+
+ + Task Stream: {sseStatus} +
+
+ + Task Controller: {heartbeat ?? "dead"} +
+
+ 0 ? "text-green-500" : "text-rose-500"}>● + Task Runners: {runners.length > 0 ? runners.length : "none"} +
+ {runners.length > 0 && ( +
+ {runners.map((runner) => ( +
{runner}
+ ))} +
+ )} +
+
+
+ ) +} diff --git a/apps/web-evals/src/app/runs/[id]/run.tsx b/apps/web-evals/src/app/runs/[id]/run.tsx index bd528884792..a4b39100245 100644 --- a/apps/web-evals/src/app/runs/[id]/run.tsx +++ b/apps/web-evals/src/app/runs/[id]/run.tsx @@ -2,12 +2,14 @@ import { useMemo, useState, useCallback, useEffect } from "react" import { toast } from "sonner" -import { LoaderCircle, FileText, Copy, Check } from "lucide-react" +import { LoaderCircle, FileText, Copy, Check, StopCircle } from "lucide-react" import type { Run, TaskMetrics as _TaskMetrics, Task } from "@roo-code/evals" +import type { ToolName } from "@roo-code/types" import { formatCurrency, formatDuration, formatTokens, formatToolUsageSuccessRate } from "@/lib/formatters" import { useRunStatus } from "@/hooks/use-run-status" +import { killRun } from "@/actions/runs" import { Table, TableBody, @@ -24,6 +26,14 @@ import { DialogTitle, ScrollArea, Button, + AlertDialog, + AlertDialogAction, + AlertDialogCancel, + AlertDialogContent, + AlertDialogDescription, + AlertDialogFooter, + AlertDialogHeader, + AlertDialogTitle, } from "@/components/ui" import { TaskStatus } from "./task-status" @@ -51,19 +61,80 @@ type HighlightPattern = { } const HIGHLIGHT_PATTERNS: HighlightPattern[] = [ - // Timestamps [YYYY-MM-DDTHH:MM:SS.sssZ] - { pattern: /\[(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z)\]/g, className: "text-blue-400" }, - // Log levels + // Log levels - styled as badges { pattern: /\|\s*(INFO)\s*\|/g, className: "text-green-400", wrapGroup: 1 }, { pattern: /\|\s*(WARN|WARNING)\s*\|/g, className: "text-yellow-400", wrapGroup: 1 }, - { pattern: /\|\s*(ERROR)\s*\|/g, className: "text-red-400", wrapGroup: 1 }, + { pattern: /\|\s*(ERROR)\s*\|/g, className: "text-red-400 font-semibold", wrapGroup: 1 }, { pattern: /\|\s*(DEBUG)\s*\|/g, className: "text-gray-400", wrapGroup: 1 }, - // Task identifiers - { pattern: /(taskCreated|taskFocused|taskStarted|taskCompleted|EvalPass|EvalFail)/g, className: "text-purple-400" }, + // 
// Matches the ISO-8601 UTC timestamp that opens a log entry and captures it.
// Accepts both `[2025-11-28T09:35:23.187Z | ...` and `[2025-11-28T09:35:23.187Z]`.
const LOG_TIMESTAMP_PATTERN = /\[(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z)[\s|\]]/

// Matches the timestamp plus whatever delimits it from the rest of the line:
// an optional closing bracket, an optional pipe, and surrounding whitespace.
const LOG_TIMESTAMP_PREFIX_PATTERN = /\[\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z\]?\s*\|?\s*/

/**
 * Formats the time elapsed between `baselineMs` and `timestamp` as `MM:SS`.
 *
 * @param timestamp - ISO-8601 timestamp of the current log line.
 * @param baselineMs - Epoch milliseconds of the reference (first) log line.
 * @returns Zero-padded minutes and seconds. Minutes are not wrapped at 60, so
 *   long logs render as e.g. `75:03`. A line that predates the baseline is
 *   clamped to `00:00`, and an unparseable timestamp renders as `--:--`.
 */
function formatElapsedTime(timestamp: string, baselineMs: number): string {
	const elapsedMs = new Date(timestamp).getTime() - baselineMs
	if (!Number.isFinite(elapsedMs)) return "--:--"

	// Clamp so an out-of-order line cannot produce output such as "-1:-1".
	const totalSeconds = Math.max(0, Math.floor(elapsedMs / 1000))
	const minutes = Math.floor(totalSeconds / 60)
	const seconds = totalSeconds % 60
	return `${minutes.toString().padStart(2, "0")}:${seconds.toString().padStart(2, "0")}`
}

/**
 * Finds the first timestamp anywhere in `log` to use as the elapsed-time baseline.
 *
 * @returns Epoch milliseconds, or `null` when the log contains no timestamp or
 *   the matched text is not a real date (for example month 13).
 */
function extractFirstTimestamp(log: string): number | null {
	const isoString = log.match(LOG_TIMESTAMP_PATTERN)?.[1]
	if (!isoString) return null

	const baselineMs = new Date(isoString).getTime()
	return Number.isNaN(baselineMs) ? null : baselineMs
}

/**
 * Splits a log line into a short display timestamp and the line content with
 * redundant metadata removed.
 *
 * @param line - A single raw log line.
 * @param baselineMs - Baseline from `extractFirstTimestamp`; when `null` the
 *   wall-clock `HH:MM:SS` part of the line's own timestamp is shown instead.
 * @returns `timestamp` is empty and `simplified` is the untouched line when the
 *   line carries no timestamp.
 */
function simplifyLogLine(line: string, baselineMs: number | null): { timestamp: string; simplified: string } {
	const isoTimestamp = line.match(LOG_TIMESTAMP_PATTERN)?.[1]
	if (!isoTimestamp) {
		return { timestamp: "", simplified: line }
	}

	const timestamp = baselineMs !== null ? formatElapsedTime(isoTimestamp, baselineMs) : isoTimestamp.slice(11, 19)

	// Remove the timestamp from the line. The optional `]` makes the
	// `[timestamp] message` form come out as `message` instead of `] message`.
	let simplified = line.replace(LOG_TIMESTAMP_PREFIX_PATTERN, "")

	// Remove redundant metadata: pid, run, task IDs (they're same for entire log)
	simplified = simplified.replace(/\|\s*pid:\d+\s*/g, "")
	simplified = simplified.replace(/\|\s*run:\d+\s*/g, "")
	simplified = simplified.replace(/\|\s*task:\d+\s*/g, "")
	simplified = simplified.replace(/runTask\s*\|\s*/g, "")

	// Clean up extra pipes, spaces, and trailing brackets
	simplified = simplified.replace(/\|\s*\|/g, "|")
	simplified = simplified.replace(/^\s*\|\s*/, "")
	// NOTE(review): this also strips a `]` that belongs to the message itself
	// (e.g. a line ending in a JSON array) - confirm against the real log format.
	simplified = simplified.replace(/\]\s*$/, "")

	return { timestamp, simplified }
}
// Ordered highlight rules: the first rule with any marker present in the line
// wins, so severity markers take precedence over task lifecycle events.
const LINE_STYLE_RULES: ReadonlyArray<{ markers: readonly string[]; style: string }> = [
	{ markers: ["ERROR"], style: "bg-red-950/30 border-l-2 border-red-500" },
	{ markers: ["WARN", "WARNING"], style: "bg-yellow-950/20 border-l-2 border-yellow-500" },
	{ markers: ["taskToolFailed"], style: "bg-red-950/30 border-l-2 border-red-500" },
	{ markers: ["taskStarted", "taskCreated"], style: "bg-purple-950/20" },
	{ markers: ["EvalPass"], style: "bg-green-950/30 border-l-2 border-green-500" },
	{ markers: ["EvalFail"], style: "bg-red-950/30 border-l-2 border-red-500" },
	{ markers: ["taskCompleted", "taskAborted"], style: "bg-blue-950/20" },
]

/**
 * Picks the row styling (CSS class string) for a log line from the markers it
 * contains. Matching is a case-sensitive substring test anywhere in the line.
 *
 * @returns The class string of the first matching rule, or "" when none match.
 */
function getLineStyle(line: string): string {
	const matched = LINE_STYLE_RULES.find(({ markers }) => markers.some((marker) => line.includes(marker)))
	return matched ? matched.style : ""
}
- {line ? formatLine(line) : " "} -
- )) + const baselineMs = extractFirstTimestamp(log) + + return lines.map((line, index) => { + if (!line.trim()) { + return ( +
+ {" "} +
+ ) + } + + const parsed = simplifyLogLine(line, baselineMs) + const lineStyle = getLineStyle(line) + + return ( +
+ {/* Elapsed time */} + + {parsed.timestamp} + + {/* Log content - pl-12 ensures wrapped lines are indented under the timestamp */} + + {formatLine(parsed.simplified)} + +
+ ) + }) } export function Run({ run }: { run: Run }) { const runStatus = useRunStatus(run) - const { tasks, tokenUsage, usageUpdatedAt } = runStatus + const { tasks, tokenUsage, usageUpdatedAt, heartbeat, runners } = runStatus const [selectedTask, setSelectedTask] = useState(null) const [taskLog, setTaskLog] = useState(null) const [isLoadingLog, setIsLoadingLog] = useState(false) const [copied, setCopied] = useState(false) + const [showKillDialog, setShowKillDialog] = useState(false) + const [isKilling, setIsKilling] = useState(false) + + // Determine if run is still active (has heartbeat or runners) + const isRunActive = !run.taskMetricsId && (!!heartbeat || (runners && runners.length > 0)) + + const onKillRun = useCallback(async () => { + setIsKilling(true) + try { + const result = await killRun(run.id) + if (result.killedContainers.length > 0) { + toast.success(`Killed ${result.killedContainers.length} container(s)`) + } else if (result.errors.length === 0) { + toast.info("No running containers found") + } else { + toast.error(result.errors.join(", ")) + } + } catch (error) { + console.error("Failed to kill run:", error) + toast.error("Failed to kill run") + } finally { + setIsKilling(false) + setShowKillDialog(false) + } + }, [run.id]) const onCopyLog = useCallback(async () => { if (!taskLog) return @@ -172,9 +302,9 @@ export function Run({ run }: { run: Run }) { const onViewTaskLog = useCallback( async (task: Task) => { - // Only allow viewing logs for completed tasks - if (task.passed === null || task.passed === undefined) { - toast.error("Task is still running") + // Only allow viewing logs for tasks that have started + if (!task.startedAt && !tokenUsage.get(task.id)) { + toast.error("Task has not started yet") return } @@ -202,7 +332,7 @@ export function Run({ run }: { run: Run }) { setIsLoadingLog(false) } }, - [run.id], + [run.id, tokenUsage], ) const taskMetrics: Record = useMemo(() => { @@ -228,22 +358,34 @@ export function Run({ run }: { run: Run }) { 
// eslint-disable-next-line react-hooks/exhaustive-deps }, [tasks, tokenUsage, usageUpdatedAt]) + // Collect all unique tool names from all tasks and sort by total attempts + const toolColumns = useMemo(() => { + if (!tasks) return [] + + const toolTotals = new Map() + + for (const task of tasks) { + if (task.taskMetrics?.toolUsage) { + for (const [toolName, usage] of Object.entries(task.taskMetrics.toolUsage)) { + const tool = toolName as ToolName + const current = toolTotals.get(tool) ?? 0 + toolTotals.set(tool, current + usage.attempts) + } + } + } + + // Sort by total attempts descending + return Array.from(toolTotals.entries()) + .sort((a, b) => b[1] - a[1]) + .map(([name]): ToolName => name) + }, [tasks]) + // Compute aggregate stats const stats = useMemo(() => { if (!tasks) return null const passed = tasks.filter((t) => t.passed === true).length const failed = tasks.filter((t) => t.passed === false).length - // Count running tasks exactly like TaskStatus shows spinner: - // - passed is not true and not false (null/undefined) - // - AND has activity (startedAt or tokenUsage) - const running = tasks.filter( - (t) => t.passed !== true && t.passed !== false && (t.startedAt || tokenUsage.get(t.id)), - ).length - const pending = tasks.filter( - (t) => t.passed !== true && t.passed !== false && !t.startedAt && !tokenUsage.get(t.id), - ).length - const total = tasks.length const completed = passed + failed let totalTokensIn = 0 @@ -279,9 +421,6 @@ export function Run({ run }: { run: Run }) { return { passed, failed, - running, - pending, - total, completed, passRate: completed > 0 ? 
((passed / completed) * 100).toFixed(1) : null, totalTokensIn, @@ -293,42 +432,96 @@ export function Run({ run }: { run: Run }) { // eslint-disable-next-line react-hooks/exhaustive-deps }, [tasks, taskMetrics, tokenUsage, usageUpdatedAt]) + // Calculate elapsed time (wall-clock time from run creation to completion or now) + const elapsedTime = useMemo(() => { + if (!tasks || tasks.length === 0) return null + + const startTime = new Date(run.createdAt).getTime() + + // If run is complete, find the latest finishedAt from tasks + if (run.taskMetricsId) { + const latestFinish = tasks.reduce((latest, task) => { + if (task.finishedAt) { + const finishTime = new Date(task.finishedAt).getTime() + return finishTime > latest ? finishTime : latest + } + return latest + }, startTime) + return latestFinish - startTime + } + + // If still running, use current time + return Date.now() - startTime + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [tasks, run.createdAt, run.taskMetricsId, usageUpdatedAt]) + return ( <>
-
-
-
{run.model}
- {run.description &&
{run.description}
} -
- {!run.taskMetricsId && } -
- {stats && ( -
+
+ {/* Provider, Model title and status */} +
+ {run.settings?.apiProvider && ( + {run.settings.apiProvider} + )} +
{run.model}
+ + {run.description && ( + - {run.description} + )} + {isRunActive && ( + + + + + Stop all containers for this run + + )} +
{/* Main Stats Row */} -
+
{/* Passed/Failed */} -
+
{stats.passed} / {stats.failed} - {stats.running > 0 && ( - ({stats.running}) - )}
Passed / Failed
{/* Pass Rate */} -
-
{stats.passRate ? `${stats.passRate}%` : "-"}
+
+
= 80 + ? "text-yellow-500" + : "text-red-500" + }`}> + {stats.passRate ? `${stats.passRate}%` : "-"} +
Pass Rate
{/* Tokens */} -
+
{formatTokens(stats.totalTokensIn)} / @@ -338,58 +531,64 @@ export function Run({ run }: { run: Run }) {
{/* Cost */} -
+
{formatCurrency(stats.totalCost)}
Cost
{/* Duration */} -
+
{stats.totalDuration > 0 ? formatDuration(stats.totalDuration) : "-"}
Duration
- {/* Tool Usage - Inline */} - {Object.keys(stats.toolUsage).length > 0 && ( -
- {Object.entries(stats.toolUsage) - .sort(([, a], [, b]) => b.attempts - a.attempts) - .map(([toolName, usage]) => { - const abbr = getToolAbbreviation(toolName) - const successRate = - usage.attempts > 0 - ? ((usage.attempts - usage.failures) / usage.attempts) * 100 - : 100 - const rateColor = - successRate === 100 - ? "text-green-500" - : successRate >= 80 - ? "text-yellow-500" - : "text-red-500" - return ( - - -
- - {abbr} - - - {usage.attempts} - - - {formatToolUsageSuccessRate(usage)} - -
-
- {toolName} -
- ) - })} + {/* Elapsed Time */} +
+
+ {elapsedTime !== null ? formatDuration(elapsedTime) : "-"}
- )} +
Elapsed
+
+ + {/* Tool Usage Row */} + {Object.keys(stats.toolUsage).length > 0 && ( +
+ {Object.entries(stats.toolUsage) + .sort(([, a], [, b]) => b.attempts - a.attempts) + .map(([toolName, usage]) => { + const abbr = getToolAbbreviation(toolName) + const successRate = + usage.attempts > 0 + ? ((usage.attempts - usage.failures) / usage.attempts) * 100 + : 100 + const rateColor = + successRate === 100 + ? "text-green-500" + : successRate >= 80 + ? "text-yellow-500" + : "text-red-500" + return ( + + +
+ + {abbr} + + {usage.attempts} + + {formatToolUsageSuccessRate(usage)} + +
+
+ {toolName} +
+ ) + })} +
+ )}
)} {!tasks ? ( @@ -401,67 +600,104 @@ export function Run({ run }: { run: Run }) { Exercise Tokens In / Out Context + {toolColumns.map((toolName) => ( + + + {getToolAbbreviation(toolName)} + {toolName} + + + ))} Duration Cost - {tasks.map((task) => ( - task.finishedAt && onViewTaskLog(task)}> - -
- + {tasks.map((task) => { + const hasStarted = !!task.startedAt || !!tokenUsage.get(task.id) + return ( + hasStarted && onViewTaskLog(task)}> +
- - {task.language}/{task.exercise} - {task.iteration > 1 && ( - - (#{task.iteration}) - + +
+ + {task.language}/{task.exercise} + {task.iteration > 1 && ( + + (#{task.iteration}) + + )} + + {hasStarted && ( + + + + + Click to view log + )} - - {task.finishedAt && ( - - - - - Click to view log - - )} -
-
-
- {taskMetrics[task.id] ? ( - <> - -
-
{formatTokens(taskMetrics[task.id]!.tokensIn)}
/ -
{formatTokens(taskMetrics[task.id]!.tokensOut)}
-
- - {formatTokens(taskMetrics[task.id]!.tokensContext)} - - - {taskMetrics[task.id]!.duration - ? formatDuration(taskMetrics[task.id]!.duration) - : "-"} - - - {formatCurrency(taskMetrics[task.id]!.cost)} - - - ) : ( - - )} -
- ))} +
+
+ {taskMetrics[task.id] ? ( + <> + +
+
{formatTokens(taskMetrics[task.id]!.tokensIn)}
/ +
{formatTokens(taskMetrics[task.id]!.tokensOut)}
+
+
+ + {formatTokens(taskMetrics[task.id]!.tokensContext)} + + {toolColumns.map((toolName) => { + const usage = task.taskMetrics?.toolUsage?.[toolName] + const successRate = + usage && usage.attempts > 0 + ? ((usage.attempts - usage.failures) / usage.attempts) * 100 + : 100 + const rateColor = + successRate === 100 + ? "text-muted-foreground" + : successRate >= 80 + ? "text-yellow-500" + : "text-red-500" + return ( + + {usage ? ( +
+ + {usage.attempts} + + + {formatToolUsageSuccessRate(usage)} + +
+ ) : ( + - + )} +
+ ) + })} + + {taskMetrics[task.id]!.duration + ? formatDuration(taskMetrics[task.id]!.duration) + : "-"} + + + {formatCurrency(taskMetrics[task.id]!.cost)} + + + ) : ( + + )} +
+ ) + })}
)} @@ -479,8 +715,20 @@ export function Run({ run }: { run: Run }) { (#{selectedTask.iteration}) )} - ({selectedTask?.passed ? "Passed" : "Failed"}) + className={`ml-2 text-sm ${ + selectedTask?.passed === true + ? "text-green-600" + : selectedTask?.passed === false + ? "text-red-600" + : "text-yellow-500" + }`}> + ( + {selectedTask?.passed === true + ? "Passed" + : selectedTask?.passed === false + ? "Failed" + : "Running"} + ) {taskLog && ( @@ -523,6 +771,35 @@ export function Run({ run }: { run: Run }) {
+ + {/* Kill Run Confirmation Dialog */} + + + + Kill Run? + + This will stop the controller and all task runner containers for this run. Any running tasks + will be terminated immediately. This action cannot be undone. + + + + Cancel + + {isKilling ? ( + <> + + Killing... + + ) : ( + "Kill Run" + )} + + + + ) } diff --git a/apps/web-evals/src/app/runs/new/new-run.tsx b/apps/web-evals/src/app/runs/new/new-run.tsx index cb7dafd9922..80a921e9c10 100644 --- a/apps/web-evals/src/app/runs/new/new-run.tsx +++ b/apps/web-evals/src/app/runs/new/new-run.tsx @@ -87,11 +87,13 @@ type ImportedSettings = { export function NewRun() { const router = useRouter() - const [provider, setModelSource] = useState<"roo" | "openrouter" | "other">("roo") + const [provider, setModelSource] = useState<"roo" | "openrouter" | "other">("other") const [modelPopoverOpen, setModelPopoverOpen] = useState(false) const [useNativeToolProtocol, setUseNativeToolProtocol] = useState(true) - const [useMultipleNativeToolCalls, setUseMultipleNativeToolCalls] = useState(true) + const [useMultipleNativeToolCalls, setUseMultipleNativeToolCalls] = useState(false) const [reasoningEffort, setReasoningEffort] = useState("") + const [commandExecutionTimeout, setCommandExecutionTimeout] = useState(20) + const [terminalShellIntegrationTimeout, setTerminalShellIntegrationTimeout] = useState(30) // seconds // State for imported settings with config selection const [importedSettings, setImportedSettings] = useState(null) @@ -134,7 +136,7 @@ export function NewRun() { const [model, suite, settings] = watch(["model", "suite", "settings", "concurrency"]) - // Load concurrency and timeout from localStorage on mount + // Load settings from localStorage on mount useEffect(() => { const savedConcurrency = localStorage.getItem("evals-concurrency") if (savedConcurrency) { @@ -150,6 +152,37 @@ export function NewRun() { setValue("timeout", parsed) } } + const savedCommandTimeout = 
localStorage.getItem("evals-command-execution-timeout") + if (savedCommandTimeout) { + const parsed = parseInt(savedCommandTimeout, 10) + if (!isNaN(parsed) && parsed >= 20 && parsed <= 60) { + setCommandExecutionTimeout(parsed) + } + } + const savedShellTimeout = localStorage.getItem("evals-shell-integration-timeout") + if (savedShellTimeout) { + const parsed = parseInt(savedShellTimeout, 10) + if (!isNaN(parsed) && parsed >= 30 && parsed <= 60) { + setTerminalShellIntegrationTimeout(parsed) + } + } + // Load saved exercises selection + const savedSuite = localStorage.getItem("evals-suite") + if (savedSuite === "partial") { + setValue("suite", "partial") + const savedExercises = localStorage.getItem("evals-exercises") + if (savedExercises) { + try { + const parsed = JSON.parse(savedExercises) as string[] + if (Array.isArray(parsed)) { + setSelectedExercises(parsed) + setValue("exercises", parsed) + } + } catch { + // Invalid JSON, ignore + } + } + } }, [setValue]) // Extract unique languages from exercises @@ -193,6 +226,7 @@ export function NewRun() { setSelectedExercises(newSelected) setValue("exercises", newSelected) + localStorage.setItem("evals-exercises", JSON.stringify(newSelected)) }, [getExercisesForLanguage, selectedExercises, setValue], ) @@ -236,6 +270,8 @@ export function NewRun() { apiProvider: "openrouter", openRouterModelId: model, toolProtocol: useNativeToolProtocol ? "native" : "xml", + commandExecutionTimeout, + terminalShellIntegrationTimeout: terminalShellIntegrationTimeout * 1000, // Convert to ms ...experimentsSettings, } } else if (provider === "roo") { @@ -244,6 +280,8 @@ export function NewRun() { apiProvider: "roo", apiModelId: model, toolProtocol: useNativeToolProtocol ? "native" : "xml", + commandExecutionTimeout, + terminalShellIntegrationTimeout: terminalShellIntegrationTimeout * 1000, // Convert to ms ...experimentsSettings, ...(reasoningEffort ? 
{ @@ -257,6 +295,8 @@ export function NewRun() { values.settings = { ...values.settings, toolProtocol: useNativeToolProtocol ? "native" : "xml", + commandExecutionTimeout, + terminalShellIntegrationTimeout: terminalShellIntegrationTimeout * 1000, // Convert to ms ...experimentsSettings, } } @@ -267,7 +307,16 @@ export function NewRun() { toast.error(e instanceof Error ? e.message : "An unknown error occurred.") } }, - [provider, model, router, useNativeToolProtocol, useMultipleNativeToolCalls, reasoningEffort], + [ + provider, + model, + router, + useNativeToolProtocol, + useMultipleNativeToolCalls, + reasoningEffort, + commandExecutionTimeout, + terminalShellIntegrationTimeout, + ], ) const onSelectModel = useCallback( @@ -355,9 +404,9 @@ export function NewRun() { value={provider} onValueChange={(value) => setModelSource(value as "roo" | "openrouter" | "other")}> + Import Roo Code Cloud OpenRouter - Other @@ -446,8 +495,8 @@ export function NewRun() { - setUseNativeToolProtocol(checked === true) + onCheckedChange={(checked: boolean) => + setUseNativeToolProtocol(checked) } /> Use Native Tool Calls @@ -458,8 +507,8 @@ export function NewRun() { - setUseMultipleNativeToolCalls(checked === true) + onCheckedChange={(checked: boolean) => + setUseMultipleNativeToolCalls(checked) } /> Use Multiple Native Tool Calls @@ -529,8 +578,8 @@ export function NewRun() { - setUseNativeToolProtocol(checked === true) + onCheckedChange={(checked: boolean) => + setUseNativeToolProtocol(checked) } /> Use Native Tool Calls @@ -541,8 +590,8 @@ export function NewRun() { - setUseMultipleNativeToolCalls(checked === true) + onCheckedChange={(checked: boolean) => + setUseMultipleNativeToolCalls(checked) } /> Use Multiple Native Tool Calls @@ -627,12 +676,14 @@ export function NewRun() { Exercises
{ setValue("suite", value as "full" | "partial") + localStorage.setItem("evals-suite", value) if (value === "full") { setSelectedExercises([]) setValue("exercises", []) + localStorage.removeItem("evals-exercises") } }}> @@ -669,6 +720,7 @@ export function NewRun() { onValueChange={(value) => { setSelectedExercises(value) setValue("exercises", value) + localStorage.setItem("evals-exercises", JSON.stringify(value)) }} placeholder="Select" variant="inverted" @@ -758,6 +810,70 @@ export function NewRun() { )} /> + +
+ + + + + + +

+ Maximum time in seconds to wait for terminal command execution to complete + before timing out. This applies to commands run via the execute_command tool. +

+
+
+
+
+ { + if (value !== undefined) { + setCommandExecutionTimeout(value) + localStorage.setItem("evals-command-execution-timeout", String(value)) + } + }} + /> +
{commandExecutionTimeout}
+
+
+ + +
+ + + + + + +

+ Maximum time in seconds to wait for shell integration to initialize when opening + a new terminal. +

+
+
+
+
+ { + if (value !== undefined) { + setTerminalShellIntegrationTimeout(value) + localStorage.setItem("evals-shell-integration-timeout", String(value)) + } + }} + /> +
{terminalShellIntegrationTimeout}
+
+
+ [] +export const ROO_CODE_SETTINGS_KEYS = [ + ...new Set([...GLOBAL_SETTINGS_KEYS, ...PROVIDER_SETTINGS_KEYS]), +] as Keys[] type SettingsDiffProps = { defaultSettings: RooCodeSettings diff --git a/apps/web-evals/src/components/home/run.tsx b/apps/web-evals/src/components/home/run.tsx index 5e356923471..4abbfc67b65 100644 --- a/apps/web-evals/src/components/home/run.tsx +++ b/apps/web-evals/src/components/home/run.tsx @@ -124,9 +124,13 @@ export function Run({ run, taskMetrics, toolColumns }: RunProps) { {run.passed} {run.failed} - {run.passed + run.failed > 0 && ( - {((run.passed / (run.passed + run.failed)) * 100).toFixed(1)}% - )} + {run.passed + run.failed > 0 && + (() => { + const percent = (run.passed / (run.passed + run.failed)) * 100 + const colorClass = + percent === 100 ? "text-green-500" : percent >= 80 ? "text-yellow-500" : "text-red-500" + return {percent.toFixed(1)}% + })()} {taskMetrics && ( @@ -138,12 +142,20 @@ export function Run({ run, taskMetrics, toolColumns }: RunProps) { {toolColumns.map((toolName) => { const usage = taskMetrics?.toolUsage?.[toolName] + const successRate = + usage && usage.attempts > 0 ? ((usage.attempts - usage.failures) / usage.attempts) * 100 : 100 + const rateColor = + successRate === 100 + ? "text-muted-foreground" + : successRate >= 80 + ? "text-yellow-500" + : "text-red-500" return ( {usage ? (
{usage.attempts} - {formatToolUsageSuccessRate(usage)} + {formatToolUsageSuccessRate(usage)}
) : ( - diff --git a/packages/evals/src/cli/runEvals.ts b/packages/evals/src/cli/runEvals.ts index b6259581cf9..7fe6d7ea4e1 100644 --- a/packages/evals/src/cli/runEvals.ts +++ b/packages/evals/src/cli/runEvals.ts @@ -37,22 +37,33 @@ export const runEvals = async (runId: number) => { const heartbeat = await startHeartbeat(run.id) const queue = new PQueue({ concurrency: run.concurrency }) + const STAGGER_DELAY_MS = 5000 + const filteredTasks = tasks.filter((task) => task.finishedAt === null) + + const createTaskRunner = (task: (typeof filteredTasks)[number]) => async () => { + try { + if (containerized) { + await processTaskInContainer({ taskId: task.id, jobToken: run.jobToken, logger }) + } else { + await processTask({ taskId: task.id, jobToken: run.jobToken, logger }) + } + } catch (error) { + logger.error("error processing task", error) + } + } + try { - await queue.addAll( - tasks - .filter((task) => task.finishedAt === null) - .map((task) => async () => { - try { - if (containerized) { - await processTaskInContainer({ taskId: task.id, jobToken: run.jobToken, logger }) - } else { - await processTask({ taskId: task.id, jobToken: run.jobToken, logger }) - } - } catch (error) { - logger.error("error processing task", error) - } - }), - ) + // Add tasks with staggered start times when concurrency > 1 + for (let i = 0; i < filteredTasks.length; i++) { + const task = filteredTasks[i] + if (!task) continue + if (run.concurrency > 1 && i > 0) { + await new Promise((resolve) => setTimeout(resolve, STAGGER_DELAY_MS)) + } + queue.add(createTaskRunner(task)) + } + + await queue.onIdle() logger.info("finishRun") const result = await finishRun(run.id) diff --git a/packages/evals/src/cli/runTask.ts b/packages/evals/src/cli/runTask.ts index c507dd68403..65b9633338c 100644 --- a/packages/evals/src/cli/runTask.ts +++ b/packages/evals/src/cli/runTask.ts @@ -1,4 +1,5 @@ import * as fs from "fs" +import * as fsp from "fs/promises" import * as path from "path" import * as os from 
/**
 * Copy conversation history files from VS Code extension storage to the log directory.
 *
 * Preserves `api_conversation_history.json` and `ui_messages.json` next to the run's
 * log files for post-mortem analysis. Each file is written to `logDir` as
 * `<language>-<exercise with "/" replaced by "-">.<iteration>_<original filename>`.
 *
 * This is a best-effort operation: it never throws. A missing source file is logged
 * at info level and skipped; any other failure is logged at error level.
 *
 * @param rooTaskId - Task id used as the sub-directory name under the extension's `tasks` folder.
 * @param logDir - Destination directory; created (recursively) if it does not exist yet.
 * @param language - Exercise language, used as the destination filename prefix.
 * @param exercise - Exercise name; slashes are replaced with dashes in the destination filename.
 * @param iteration - Iteration number, included so repeated attempts do not overwrite each other.
 * @param logger - Receives progress (`info`) and failure (`error`) messages.
 */
async function copyConversationHistory({
	rooTaskId,
	logDir,
	language,
	exercise,
	iteration,
	logger,
}: {
	rooTaskId: string
	logDir: string
	language: string
	exercise: string
	iteration: number
	logger: Logger
}): Promise<void> {
	// VS Code extension global storage path within the container
	const extensionStoragePath = "/roo/.vscode/User/globalStorage/rooveterinaryinc.roo-cline"
	const taskStoragePath = path.join(extensionStoragePath, "tasks", rooTaskId)

	const filesToCopy = ["api_conversation_history.json", "ui_messages.json"]

	// The destination prefix is identical for every file, so build it once.
	// Slashes in the exercise name are replaced so the result is a single path segment.
	const destPrefix = `${language}-${exercise.replace(/\//g, "-")}.${iteration}_`

	// Make sure the destination exists up front. Without this, a missing log directory
	// would surface below as ENOENT and be misreported as a missing source file.
	try {
		await fsp.mkdir(logDir, { recursive: true })
	} catch (error) {
		logger.error(`failed to create log directory ${logDir}:`, error)
		return
	}

	for (const filename of filesToCopy) {
		const sourcePath = path.join(taskStoragePath, filename)
		const destPath = path.join(logDir, `${destPrefix}${filename}`)

		try {
			// No separate existence check: copyFile itself rejects with ENOENT when the
			// source is missing, which avoids a redundant syscall and a check-then-act race.
			await fsp.copyFile(sourcePath, destPath)
			logger.info(`copied ${filename} to ${destPath}`)
		} catch (error) {
			const code = error instanceof Error ? (error as NodeJS.ErrnoException).code : undefined

			// File may not exist if task didn't complete properly - this is not fatal
			if (code === "ENOENT") {
				logger.info(`${filename} not found at ${sourcePath} - skipping`)
			} else {
				logger.error(`failed to copy ${filename}:`, error)
			}
		}
	}
}
maxRetries; attempt++) { const containerName = `evals-task-${taskId}.${attempt}` - const args = [`--name ${containerName}`, ...baseArgs] + const args = [`--name ${containerName}`, `-e EVALS_ATTEMPT=${attempt}`, ...baseArgs] const isRetry = attempt > 0 if (isRetry) { @@ -172,6 +225,7 @@ export const runTask = async ({ run, task, publish, logger, jobToken }: RunTaskO const controller = new AbortController() const cancelSignal = controller.signal const containerized = isDockerContainer() + const logDir = containerized ? `/var/log/evals/runs/${run.id}` : `/tmp/evals/runs/${run.id}` let codeCommand = containerized ? `xvfb-run --auto-servernum --server-num=1 code --wait --log trace --disable-workspace-trust --disable-gpu --disable-lcd-text --no-sandbox --user-data-dir /roo/.vscode --password-store="basic" -n ${workspacePath}` @@ -266,7 +320,23 @@ export const runTask = async ({ run, task, publish, logger, jobToken }: RunTaskO (payload[0].message.say && loggableSays.includes(payload[0].message.say)) || payload[0].message.partial !== true) ) { - logger.info(`${eventName} ->`, payload) + // Extract tool name for tool-related messages for clearer logging + let logEventName: string = eventName + if (eventName === RooCodeEventName.Message && payload[0]?.message?.ask === "tool") { + try { + const textJson = JSON.parse(payload[0].message.text ?? 
"{}") + if (textJson.tool) { + logEventName = `${eventName} (tool: ${textJson.tool})` + } + } catch { + // If parsing fails, use the default event name + } + } else if (eventName === RooCodeEventName.Message && payload[0]?.message?.ask === "command") { + logEventName = `${eventName} (command)` + } else if (eventName === RooCodeEventName.Message && payload[0]?.message?.ask === "completion_result") { + logEventName = `${eventName} (completion_result)` + } + logger.info(`${logEventName} ->`, payload) } if (eventName === RooCodeEventName.TaskStarted) { @@ -418,9 +488,25 @@ export const runTask = async ({ run, task, publish, logger, jobToken }: RunTaskO } } + // Copy conversation history files from VS Code extension storage to the log directory + // for post-mortem analysis. Only do this in containerized mode where we have a known path. + if (containerized && rooTaskId) { + await copyConversationHistory({ + rooTaskId, + logDir, + language, + exercise, + iteration: task.iteration, + logger, + }) + } + logger.close() - if (isApiUnstable) { + // Only throw for API instability if the task didn't complete successfully. + // If taskFinishedAt is set via TaskCompleted event, the task succeeded despite + // API retries, so re-running from scratch would waste resources. + if (isApiUnstable && !taskFinishedAt) { throw new Error("API is unstable, throwing to trigger a retry.") } }