Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion apps/web-evals/src/actions/runs.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,10 @@ import { CreateRun } from "@/lib/schemas"
const EVALS_REPO_PATH = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "../../../../../evals")

// eslint-disable-next-line @typescript-eslint/no-unused-vars
export async function createRun({ suite, exercises = [], systemPrompt, ...values }: CreateRun) {
export async function createRun({ suite, exercises = [], systemPrompt, timeout, ...values }: CreateRun) {
const run = await _createRun({
...values,
timeout,
socketPath: "", // TODO: Get rid of this.
})

Expand Down
27 changes: 27 additions & 0 deletions apps/web-evals/src/app/runs/new/new-run.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ import {
CONCURRENCY_MIN,
CONCURRENCY_MAX,
CONCURRENCY_DEFAULT,
TIMEOUT_MIN,
TIMEOUT_MAX,
TIMEOUT_DEFAULT,
} from "@/lib/schemas"
import { cn } from "@/lib/utils"
import { useOpenRouterModels } from "@/hooks/use-open-router-models"
Expand Down Expand Up @@ -77,6 +80,7 @@ export function NewRun() {
exercises: [],
settings: undefined,
concurrency: CONCURRENCY_DEFAULT,
timeout: TIMEOUT_DEFAULT,
},
})

Expand Down Expand Up @@ -341,6 +345,29 @@ export function NewRun() {
)}
/>

<FormField
control={form.control}
name="timeout"
render={({ field }) => (
<FormItem>
<FormLabel>Timeout (minutes)</FormLabel>
<FormControl>
<div className="flex flex-row items-center gap-2">
<Slider
defaultValue={[field.value]}
min={TIMEOUT_MIN}
max={TIMEOUT_MAX}
step={1}
onValueChange={(value) => field.onChange(value[0])}
/>
<div>{field.value} min</div>
</div>
</FormControl>
<FormMessage />
</FormItem>
)}
/>

<FormField
control={form.control}
name="description"
Expand Down
5 changes: 5 additions & 0 deletions apps/web-evals/src/lib/schemas.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@ export const CONCURRENCY_MIN = 1
export const CONCURRENCY_MAX = 25
export const CONCURRENCY_DEFAULT = 1

// Bounds and default for a run's timeout, expressed in minutes.
// The new-run form's slider uses MIN/MAX as its range and DEFAULT as its initial value,
// and createRunSchema validates the submitted value against the same MIN/MAX.
export const TIMEOUT_MIN = 5
export const TIMEOUT_MAX = 10
export const TIMEOUT_DEFAULT = 5

export const createRunSchema = z
.object({
model: z.string().min(1, { message: "Model is required." }),
Expand All @@ -20,6 +24,7 @@ export const createRunSchema = z
exercises: z.array(z.string()).optional(),
settings: rooCodeSettingsSchema.optional(),
concurrency: z.number().int().min(CONCURRENCY_MIN).max(CONCURRENCY_MAX),
timeout: z.number().int().min(TIMEOUT_MIN).max(TIMEOUT_MAX),
systemPrompt: z.string().optional(),
})
.refine((data) => data.suite === "full" || (data.exercises || []).length > 0, {
Expand Down
14 changes: 10 additions & 4 deletions packages/evals/src/cli/redis.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import { createClient, type RedisClientType } from "redis"

import { EVALS_TIMEOUT } from "@roo-code/types"

let redis: RedisClientType | undefined

export const redisClient = async () => {
Expand All @@ -18,11 +16,19 @@ export const getPubSubKey = (runId: number) => `evals:${runId}`
/**
 * Builds the Redis key of the set in which a run's runners are registered.
 *
 * @param runId - Numeric id of the run.
 * @returns The key in the form `runners:<runId>`.
 */
export const getRunnersKey = (runId: number): string => {
	return `runners:${runId}`
}
/**
 * Formats the heartbeat key for a run.
 *
 * @param runId - Numeric id of the run.
 * @returns The key in the form `heartbeat:<runId>`.
 */
export const getHeartbeatKey = (runId: number): string => {
	return `heartbeat:${runId}`
}

export const registerRunner = async ({ runId, taskId }: { runId: number; taskId: number }) => {
export const registerRunner = async ({
runId,
taskId,
timeoutSeconds,
}: {
runId: number
taskId: number
timeoutSeconds: number
}) => {
const redis = await redisClient()
const runnersKey = getRunnersKey(runId)
await redis.sAdd(runnersKey, `task-${taskId}:${process.env.HOSTNAME ?? process.pid}`)
await redis.expire(runnersKey, EVALS_TIMEOUT / 1_000)
await redis.expire(runnersKey, timeoutSeconds)
}

export const deregisterRunner = async ({ runId, taskId }: { runId: number; taskId: number }) => {
Expand Down
14 changes: 4 additions & 10 deletions packages/evals/src/cli/runTask.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,7 @@ import * as os from "node:os"
import pWaitFor from "p-wait-for"
import { execa } from "execa"

import {
type TaskEvent,
TaskCommandName,
RooCodeEventName,
IpcMessageType,
EVALS_SETTINGS,
EVALS_TIMEOUT,
} from "@roo-code/types"
import { type TaskEvent, TaskCommandName, RooCodeEventName, IpcMessageType, EVALS_SETTINGS } from "@roo-code/types"
import { IpcClient } from "@roo-code/ipc"

import {
Expand Down Expand Up @@ -42,7 +35,7 @@ export const processTask = async ({ taskId, logger }: { taskId: number; logger?:
const task = await findTask(taskId)
const { language, exercise } = task
const run = await findRun(task.runId)
await registerRunner({ runId: run.id, taskId })
await registerRunner({ runId: run.id, taskId, timeoutSeconds: (run.timeout || 5) * 60 })

const containerized = isDockerContainer()

Expand Down Expand Up @@ -304,9 +297,10 @@ export const runTask = async ({ run, task, publish, logger }: RunTaskOptions) =>
})

try {
const timeoutMs = (run.timeout || 5) * 60 * 1_000 // Convert minutes to milliseconds
await pWaitFor(() => !!taskFinishedAt || !!taskAbortedAt || isClientDisconnected, {
interval: 1_000,
timeout: EVALS_TIMEOUT,
timeout: timeoutMs,
})
} catch (_error) {
taskTimedOut = true
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
-- Adds the per-run "timeout" column (non-null integer, default 5); the evals runner interprets the value as minutes.
ALTER TABLE "runs" ADD COLUMN "timeout" integer DEFAULT 5 NOT NULL;
3 changes: 2 additions & 1 deletion packages/evals/src/db/queries/__tests__/copyRun.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ describe("copyRun", () => {
socketPath: "/tmp/roo.sock",
description: "Test run for copying",
concurrency: 4,
timeout: 5,
})

sourceRunId = run.id
Expand Down Expand Up @@ -271,7 +272,7 @@ describe("copyRun", () => {
})

it("should copy run without task metrics", async () => {
const minimalRun = await createRun({ model: "gpt-3.5-turbo", socketPath: "/tmp/minimal.sock" })
const minimalRun = await createRun({ model: "gpt-3.5-turbo", socketPath: "/tmp/minimal.sock", timeout: 5 })

const newRunId = await copyRun({ sourceDb: db, targetDb: db, runId: minimalRun.id })

Expand Down
1 change: 1 addition & 0 deletions packages/evals/src/db/schema.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ export const runs = pgTable("runs", {
pid: integer(),
socketPath: text("socket_path").notNull(),
concurrency: integer().default(2).notNull(),
timeout: integer().default(5).notNull(),
passed: integer().default(0).notNull(),
failed: integer().default(0).notNull(),
createdAt: timestamp("created_at").notNull(),
Expand Down