diff --git a/apps/web-evals/package.json b/apps/web-evals/package.json index 37740163323..446582a5d7b 100644 --- a/apps/web-evals/package.json +++ b/apps/web-evals/package.json @@ -14,6 +14,7 @@ "dependencies": { "@hookform/resolvers": "^5.1.1", "@radix-ui/react-alert-dialog": "^1.1.7", + "@radix-ui/react-checkbox": "^1.1.5", "@radix-ui/react-dialog": "^1.1.6", "@radix-ui/react-dropdown-menu": "^2.1.7", "@radix-ui/react-label": "^2.1.2", diff --git a/apps/web-evals/src/actions/runs.ts b/apps/web-evals/src/actions/runs.ts index 2eae1f6804a..82a7ebfcbe5 100644 --- a/apps/web-evals/src/actions/runs.ts +++ b/apps/web-evals/src/actions/runs.ts @@ -21,8 +21,7 @@ import { CreateRun } from "@/lib/schemas" const EVALS_REPO_PATH = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "../../../../../evals") -// eslint-disable-next-line @typescript-eslint/no-unused-vars -export async function createRun({ suite, exercises = [], systemPrompt, timeout, ...values }: CreateRun) { +export async function createRun({ suite, exercises = [], timeout, ...values }: CreateRun) { const run = await _createRun({ ...values, timeout, diff --git a/apps/web-evals/src/app/api/health/route.ts b/apps/web-evals/src/app/api/health/route.ts deleted file mode 100644 index ca8a833942f..00000000000 --- a/apps/web-evals/src/app/api/health/route.ts +++ /dev/null @@ -1,24 +0,0 @@ -import { NextResponse } from "next/server" - -export async function GET() { - try { - return NextResponse.json( - { - status: "healthy", - timestamp: new Date().toISOString(), - uptime: process.uptime(), - environment: process.env.NODE_ENV || "production", - }, - { status: 200 }, - ) - } catch (error) { - return NextResponse.json( - { - status: "unhealthy", - timestamp: new Date().toISOString(), - error: error instanceof Error ? error.message : "Unknown error", - }, - { status: 503 }, - ) - } -} diff --git a/apps/web-evals/src/app/runs/new/new-run.tsx b/apps/web-evals/src/app/runs/new/new-run.tsx index 41d35f3c4c1..2d424e35f72 100644 --- a/apps/web-evals/src/app/runs/new/new-run.tsx +++ b/apps/web-evals/src/app/runs/new/new-run.tsx @@ -1,23 +1,22 @@ "use client" -import { useCallback, useRef, useState } from "react" +import { useCallback, useState } from "react" import { useRouter } from "next/navigation" import { z } from "zod" import { useQuery } from "@tanstack/react-query" import { useForm, FormProvider } from "react-hook-form" import { zodResolver } from "@hookform/resolvers/zod" -import fuzzysort from "fuzzysort" import { toast } from "sonner" -import { X, Rocket, Check, ChevronsUpDown, SlidersHorizontal, CircleCheck } from "lucide-react" +import { X, Rocket, Check, ChevronsUpDown, SlidersHorizontal } from "lucide-react" import { globalSettingsSchema, providerSettingsSchema, EVALS_SETTINGS, getModelId } from "@roo-code/types" import { createRun } from "@/actions/runs" import { getExercises } from "@/actions/exercises" + import { - createRunSchema, type CreateRun, - MODEL_DEFAULT, + createRunSchema, CONCURRENCY_MIN, CONCURRENCY_MAX, CONCURRENCY_DEFAULT, @@ -26,14 +25,19 @@ import { TIMEOUT_DEFAULT, } from "@/lib/schemas" import { cn } from "@/lib/utils" + import { useOpenRouterModels } from "@/hooks/use-open-router-models" +import { useRooCodeCloudModels } from "@/hooks/use-roo-code-cloud-models" + import { Button, + Checkbox, FormControl, FormField, FormItem, FormLabel, FormMessage, + Input, Textarea, Tabs, TabsList, @@ -48,9 +52,9 @@ import { Popover, PopoverContent, PopoverTrigger, - ScrollArea, - ScrollBar, Slider, + Label, + FormDescription, } from "@/components/ui" import { SettingsDiff } from "./settings-diff" @@ -58,26 +62,30 @@ import { SettingsDiff } from "./settings-diff" export function NewRun() { const router = useRouter() - const [mode, setMode] = useState<"openrouter" | "settings">("openrouter") - const [modelSearchValue, setModelSearchValue] = useState("") + const [provider, setModelSource] = useState<"roo" | "openrouter" | "other">("roo") const [modelPopoverOpen, setModelPopoverOpen] = useState(false) + const [useNativeToolProtocol, setUseNativeToolProtocol] = useState(true) - const modelSearchResultsRef = useRef>(new Map()) - const modelSearchValueRef = useRef("") + const openRouter = useOpenRouterModels() + const rooCodeCloud = useRooCodeCloudModels() + const models = provider === "openrouter" ? openRouter.data : rooCodeCloud.data + const searchValue = provider === "openrouter" ? openRouter.searchValue : rooCodeCloud.searchValue + const setSearchValue = provider === "openrouter" ? openRouter.setSearchValue : rooCodeCloud.setSearchValue + const onFilter = provider === "openrouter" ? openRouter.onFilter : rooCodeCloud.onFilter - const models = useOpenRouterModels() const exercises = useQuery({ queryKey: ["getExercises"], queryFn: () => getExercises() }) const form = useForm({ resolver: zodResolver(createRunSchema), defaultValues: { - model: MODEL_DEFAULT, + model: "", description: "", suite: "full", exercises: [], settings: undefined, concurrency: CONCURRENCY_DEFAULT, timeout: TIMEOUT_DEFAULT, + jobToken: "", }, }) @@ -93,8 +101,20 @@ export function NewRun() { const onSubmit = useCallback( async (values: CreateRun) => { try { - if (mode === "openrouter") { - values.settings = { ...(values.settings || {}), openRouterModelId: model } + if (provider === "openrouter") { + values.settings = { + ...(values.settings || {}), + apiProvider: "openrouter", + openRouterModelId: model, + toolProtocol: useNativeToolProtocol ? "native" : "xml", + } + } else if (provider === "roo") { + values.settings = { + ...(values.settings || {}), + apiProvider: "roo", + apiModelId: model, + toolProtocol: useNativeToolProtocol ? "native" : "xml", + } } const { id } = await createRun(values) @@ -103,28 +123,7 @@ export function NewRun() { toast.error(e instanceof Error ? e.message : "An unknown error occurred.") } }, - [mode, model, router], - ) - - const onFilterModels = useCallback( - (value: string, search: string) => { - if (modelSearchValueRef.current !== search) { - modelSearchValueRef.current = search - modelSearchResultsRef.current.clear() - - for (const { - obj: { id }, - score, - } of fuzzysort.go(search, models.data || [], { - key: "name", - })) { - modelSearchResultsRef.current.set(id, score) - } - } - - return modelSearchResultsRef.current.get(value) ?? 0 - }, - [models.data], + [provider, model, router, useNativeToolProtocol], ) const onSelectModel = useCallback( @@ -132,7 +131,7 @@ export function NewRun() { setValue("model", model) setModelPopoverOpen(false) }, - [setValue], + [setValue, setModelPopoverOpen], ) const onImportSettings = useCallback( @@ -160,7 +159,6 @@ export function NewRun() { setValue("model", getModelId(providerSettings) ?? "") setValue("settings", { ...EVALS_SETTINGS, ...providerSettings, ...globalSettings }) - setMode("settings") event.target.value = "" } catch (e) { @@ -177,13 +175,44 @@ export function NewRun() {
-
- {mode === "openrouter" && ( - ( - + ( + + setModelSource(value as "roo" | "openrouter" | "other")}> + + Roo Code Cloud + OpenRouter + Other + + + + {provider === "other" ? ( +
+ + + {settings && ( + + )} +
+ ) : ( + <> - + No model found. - {models.data?.map(({ id, name }) => ( + {models?.map(({ id, name }) => ( - -
- )} - /> - )} - - - - {settings && ( - - <> -
- -
- Imported valid Roo Code settings. Showing differences from default - settings. -
+
+ + setUseNativeToolProtocol(checked === true) + } + /> +
- - - + )} + + + + )} + /> + + {provider === "roo" && ( + ( + + Roo Code Cloud Token + + + + + + If you have access to the Roo Code Cloud repository then you can generate a + token with: +
+ + pnpm --filter @roo-code-cloud/auth production:create-job-token [org] + [timeout] + +
+
)} - - -
+ /> + )} [] -type SettingsDiffProps = HTMLAttributes & { +type SettingsDiffProps = { defaultSettings: RooCodeSettings customSettings: RooCodeSettings } @@ -14,53 +12,45 @@ type SettingsDiffProps = HTMLAttributes & { export function SettingsDiff({ customSettings: { experiments: customExperiments, ...customSettings }, defaultSettings: { experiments: defaultExperiments, ...defaultSettings }, - className, - ...props }: SettingsDiffProps) { const defaults = { ...defaultSettings, ...defaultExperiments } const custom = { ...customSettings, ...customExperiments } return ( -
-
Setting
-
Default
-
Custom
- {ROO_CODE_SETTINGS_KEYS.map((key) => { - const defaultValue = defaults[key as keyof typeof defaults] - const customValue = custom[key as keyof typeof custom] - const isDefault = JSON.stringify(defaultValue) === JSON.stringify(customValue) - - return isDefault ? null : ( - - ) - })} +
+ + + + Setting + Default + Custom + + + + {ROO_CODE_SETTINGS_KEYS.map((key) => { + const defaultValue = JSON.stringify(defaults[key as keyof typeof defaults], null, 2) + const customValue = JSON.stringify(custom[key as keyof typeof custom], null, 2) + + return defaultValue === customValue || + (isEmpty(defaultValue) && isEmpty(customValue)) ? null : ( + + + {key} + + + {defaultValue} + + + {customValue} + + + ) + })} + +
) } -type SettingDiffProps = HTMLAttributes & { - name: string - defaultValue?: string - customValue?: string -} - -export function SettingDiff({ name, defaultValue, customValue, ...props }: SettingDiffProps) { - return ( - -
- {name} -
-
-				{defaultValue}
-			
-
-				{customValue}
-			
-
- ) -} +const isEmpty = (value: string | undefined) => + value === undefined || value === "" || value === "null" || value === '""' || value === "[]" || value === "{}" diff --git a/apps/web-evals/src/components/ui/checkbox.tsx b/apps/web-evals/src/components/ui/checkbox.tsx new file mode 100644 index 00000000000..543ae2952b5 --- /dev/null +++ b/apps/web-evals/src/components/ui/checkbox.tsx @@ -0,0 +1,27 @@ +"use client" + +import * as React from "react" +import * as CheckboxPrimitive from "@radix-ui/react-checkbox" +import { CheckIcon } from "lucide-react" + +import { cn } from "@/lib/utils" + +function Checkbox({ className, ...props }: React.ComponentProps) { + return ( + + + + + + ) +} + +export { Checkbox } diff --git a/apps/web-evals/src/components/ui/index.ts b/apps/web-evals/src/components/ui/index.ts index f09397ece6d..390569074e2 100644 --- a/apps/web-evals/src/components/ui/index.ts +++ b/apps/web-evals/src/components/ui/index.ts @@ -1,6 +1,7 @@ export * from "./alert-dialog" export * from "./badge" export * from "./button" +export * from "./checkbox" export * from "./command" export * from "./dialog" export * from "./drawer" diff --git a/apps/web-evals/src/hooks/use-fuzzy-model-search.ts b/apps/web-evals/src/hooks/use-fuzzy-model-search.ts new file mode 100644 index 00000000000..a48b38606ac --- /dev/null +++ b/apps/web-evals/src/hooks/use-fuzzy-model-search.ts @@ -0,0 +1,37 @@ +import { useCallback, useRef, useState } from "react" +import fuzzysort from "fuzzysort" + +interface ModelWithId { + id: string + name: string +} + +export const useFuzzyModelSearch = (data: T[] | undefined) => { + const [searchValue, setSearchValue] = useState("") + + const searchResultsRef = useRef>(new Map()) + const searchValueRef = useRef("") + + const onFilter = useCallback( + (value: string, search: string) => { + if (searchValueRef.current !== search) { + searchValueRef.current = search + searchResultsRef.current.clear() + + for (const { + obj: { id }, + score, + } of fuzzysort.go(search, data || [], { + key: "name", + })) { + searchResultsRef.current.set(id, score) + } + } + + return searchResultsRef.current.get(value) ?? 0 + }, + [data], + ) + + return { searchValue, setSearchValue, onFilter } +} diff --git a/apps/web-evals/src/hooks/use-open-router-models.ts b/apps/web-evals/src/hooks/use-open-router-models.ts index 27800f90f21..27a4e96150a 100644 --- a/apps/web-evals/src/hooks/use-open-router-models.ts +++ b/apps/web-evals/src/hooks/use-open-router-models.ts @@ -1,5 +1,6 @@ import { z } from "zod" import { useQuery } from "@tanstack/react-query" +import { useFuzzyModelSearch } from "./use-fuzzy-model-search" export const openRouterModelSchema = z.object({ id: z.string(), @@ -25,8 +26,13 @@ export const getOpenRouterModels = async (): Promise => { return result.data.data.sort((a, b) => a.name.localeCompare(b.name)) } -export const useOpenRouterModels = () => - useQuery({ +export const useOpenRouterModels = () => { + const query = useQuery({ queryKey: ["getOpenRouterModels"], queryFn: getOpenRouterModels, }) + + const { searchValue, setSearchValue, onFilter } = useFuzzyModelSearch(query.data) + + return { ...query, searchValue, setSearchValue, onFilter } +} diff --git a/apps/web-evals/src/hooks/use-roo-code-cloud-models.ts b/apps/web-evals/src/hooks/use-roo-code-cloud-models.ts new file mode 100644 index 00000000000..034b6f04cf9 --- /dev/null +++ b/apps/web-evals/src/hooks/use-roo-code-cloud-models.ts @@ -0,0 +1,66 @@ +import { z } from "zod" +import { useQuery } from "@tanstack/react-query" +import { useFuzzyModelSearch } from "./use-fuzzy-model-search" + +export const rooCodeCloudModelSchema = z.object({ + object: z.literal("model"), + id: z.string(), + name: z.string(), + description: z.string().optional(), + context_window: z.number(), + max_tokens: z.number(), + supports_images: z.boolean().optional(), + supports_prompt_cache: z.boolean().optional(), + type: z.literal("language"), + tags: z.array(z.string()).optional(), + deprecationMessage: z.string().optional(), + owned_by: z.string(), + pricing: z.object({ + input: z.string(), + output: z.string(), + input_cache_read: z.string().optional(), + input_cache_write: z.string().optional(), + }), + evals: z + .object({ + score: z.number().min(0).max(100), + }) + .optional(), + created: z.number(), + deprecated: z.boolean().optional(), +}) + +export type RooCodeCloudModel = z.infer + +export const getRooCodeCloudModels = async (): Promise => { + const response = await fetch("https://api.roocode.com/proxy/v1/models") + + if (!response.ok) { + return [] + } + + const result = z + .object({ + object: z.literal("list"), + data: z.array(rooCodeCloudModelSchema), + }) + .safeParse(await response.json()) + + if (!result.success) { + console.error(result.error) + return [] + } + + return result.data.data.sort((a, b) => a.name.localeCompare(b.name)) +} + +export const useRooCodeCloudModels = () => { + const query = useQuery({ + queryKey: ["getRooCodeCloudModels"], + queryFn: getRooCodeCloudModels, + }) + + const { searchValue, setSearchValue, onFilter } = useFuzzyModelSearch(query.data) + + return { ...query, searchValue, setSearchValue, onFilter } +} diff --git a/apps/web-evals/src/lib/schemas.ts b/apps/web-evals/src/lib/schemas.ts index 0a460e659d6..63c5fa7de50 100644 --- a/apps/web-evals/src/lib/schemas.ts +++ b/apps/web-evals/src/lib/schemas.ts @@ -6,8 +6,6 @@ import { rooCodeSettingsSchema } from "@roo-code/types" * CreateRun */ -export const MODEL_DEFAULT = "anthropic/claude-sonnet-4" - export const CONCURRENCY_MIN = 1 export const CONCURRENCY_MAX = 25 export const CONCURRENCY_DEFAULT = 1 @@ -25,7 +23,7 @@ export const createRunSchema = z settings: rooCodeSettingsSchema.optional(), concurrency: z.number().int().min(CONCURRENCY_MIN).max(CONCURRENCY_MAX), timeout: z.number().int().min(TIMEOUT_MIN).max(TIMEOUT_MAX), - systemPrompt: z.string().optional(), + jobToken: z.string().optional(), }) .refine((data) => data.suite === "full" || (data.exercises || []).length > 0, { message: "Exercises are required when running a partial suite.", diff --git a/packages/evals/src/cli/index.ts b/packages/evals/src/cli/index.ts index de62be8ae08..f7c343de2f0 100644 --- a/packages/evals/src/cli/index.ts +++ b/packages/evals/src/cli/index.ts @@ -28,7 +28,7 @@ const main = async () => { } else if (runId !== -1) { await runEvals(runId) } else if (taskId !== -1) { - await processTask({ taskId }) + await processTask({ taskId, jobToken: process.env.ROO_CODE_CLOUD_TOKEN || null }) } else { throw new Error("Either runId or taskId must be provided.") } diff --git a/packages/evals/src/cli/runEvals.ts b/packages/evals/src/cli/runEvals.ts index 6d13abf5a8f..b6259581cf9 100644 --- a/packages/evals/src/cli/runEvals.ts +++ b/packages/evals/src/cli/runEvals.ts @@ -44,9 +44,9 @@ export const runEvals = async (runId: number) => { .map((task) => async () => { try { if (containerized) { - await processTaskInContainer({ taskId: task.id, logger }) + await processTaskInContainer({ taskId: task.id, jobToken: run.jobToken, logger }) } else { - await processTask({ taskId: task.id, logger }) + await processTask({ taskId: task.id, jobToken: run.jobToken, logger }) } } catch (error) { logger.error("error processing task", error) diff --git a/packages/evals/src/cli/runTask.ts b/packages/evals/src/cli/runTask.ts index e9149463537..c507dd68403 100644 --- a/packages/evals/src/cli/runTask.ts +++ b/packages/evals/src/cli/runTask.ts @@ -38,7 +38,15 @@ class SubprocessTimeoutError extends Error { } } -export const processTask = async ({ taskId, logger }: { taskId: number; logger?: Logger }) => { +export const processTask = async ({ + taskId, + jobToken, + logger, +}: { + taskId: number + jobToken: string | null + logger?: Logger +}) => { const task = await findTask(taskId) const { language, exercise } = task const run = await findRun(task.runId) @@ -61,7 +69,7 @@ export const processTask = async ({ taskId, logger }: { taskId: number; logger?: } logger.info(`running task ${task.id} (${language}/${exercise})...`) - await runTask({ run, task, publish, logger }) + await runTask({ run, task, jobToken, publish, logger }) logger.info(`testing task ${task.id} (${language}/${exercise})...`) const passed = await runUnitTest({ task, logger }) @@ -80,10 +88,12 @@ export const processTask = async ({ taskId, logger }: { taskId: number; logger?: export const processTaskInContainer = async ({ taskId, + jobToken, logger, maxRetries = 10, }: { taskId: number + jobToken: string | null logger: Logger maxRetries?: number }) => { @@ -95,6 +105,10 @@ export const processTaskInContainer = async ({ "-e HOST_EXECUTION_METHOD=docker", ] + if (jobToken) { + baseArgs.push(`-e ROO_CODE_CLOUD_TOKEN=${jobToken}`) + } + const command = `pnpm --filter @roo-code/evals cli --taskId ${taskId}` logger.info(command) @@ -144,11 +158,12 @@ export const processTaskInContainer = async ({ type RunTaskOptions = { run: Run task: Task + jobToken: string | null publish: (taskEvent: TaskEvent) => Promise logger: Logger } -export const runTask = async ({ run, task, publish, logger }: RunTaskOptions) => { +export const runTask = async ({ run, task, publish, logger, jobToken }: RunTaskOptions) => { const { language, exercise } = task const prompt = fs.readFileSync(path.resolve(EVALS_REPO_PATH, `prompts/${language}.md`), "utf-8") const workspacePath = path.resolve(EVALS_REPO_PATH, language, exercise) @@ -158,10 +173,14 @@ export const runTask = async ({ run, task, publish, logger }: RunTaskOptions) => const cancelSignal = controller.signal const containerized = isDockerContainer() - const codeCommand = containerized + let codeCommand = containerized ? `xvfb-run --auto-servernum --server-num=1 code --wait --log trace --disable-workspace-trust --disable-gpu --disable-lcd-text --no-sandbox --user-data-dir /roo/.vscode --password-store="basic" -n ${workspacePath}` : `code --disable-workspace-trust -n ${workspacePath}` + if (jobToken) { + codeCommand = `ROO_CODE_CLOUD_TOKEN=${jobToken} ${codeCommand}` + } + logger.info(codeCommand) // Sleep for a random amount of time between 5 and 10 seconds, unless we're diff --git a/packages/evals/src/db/migrations/0003_simple_retro_girl.sql b/packages/evals/src/db/migrations/0003_simple_retro_girl.sql new file mode 100644 index 00000000000..93b7d8f9455 --- /dev/null +++ b/packages/evals/src/db/migrations/0003_simple_retro_girl.sql @@ -0,0 +1 @@ +ALTER TABLE "runs" ADD COLUMN "jobToken" text; \ No newline at end of file diff --git a/packages/evals/src/db/migrations/meta/0003_snapshot.json b/packages/evals/src/db/migrations/meta/0003_snapshot.json new file mode 100644 index 00000000000..f660f870257 --- /dev/null +++ b/packages/evals/src/db/migrations/meta/0003_snapshot.json @@ -0,0 +1,459 @@ +{ + "id": "853d308a-3946-4ea8-9039-236bfce3c6c0", + "prevId": "3d2b8423-6170-4cb2-9f62-1c86756da97a", + "version": "7", + "dialect": "postgresql", + "tables": { + "public.runs": { + "name": "runs", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "integer", + "primaryKey": true, + "notNull": true, + "identity": { + "type": "always", + "name": "runs_id_seq", + "schema": "public", + "increment": "1", + "startWith": "1", + "minValue": "1", + "maxValue": "2147483647", + "cache": "1", + "cycle": false + } + }, + "task_metrics_id": { + "name": "task_metrics_id", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "model": { + "name": "model", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "description": { + "name": "description", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "contextWindow": { + "name": "contextWindow", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "inputPrice": { + "name": "inputPrice", + "type": "real", + "primaryKey": false, + "notNull": false + }, + "outputPrice": { + "name": "outputPrice", + "type": "real", + "primaryKey": false, + "notNull": false + }, + "cacheWritesPrice": { + "name": "cacheWritesPrice", + "type": "real", + "primaryKey": false, + "notNull": false + }, + "cacheReadsPrice": { + "name": "cacheReadsPrice", + "type": "real", + "primaryKey": false, + "notNull": false + }, + "settings": { + "name": "settings", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "jobToken": { + "name": "jobToken", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "pid": { + "name": "pid", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "socket_path": { + "name": "socket_path", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "concurrency": { + "name": "concurrency", + "type": "integer", + "primaryKey": false, + "notNull": true, + "default": 2 + }, + "timeout": { + "name": "timeout", + "type": "integer", + "primaryKey": false, + "notNull": true, + "default": 5 + }, + "passed": { + "name": "passed", + "type": "integer", + "primaryKey": false, + "notNull": true, + "default": 0 + }, + "failed": { + "name": "failed", + "type": "integer", + "primaryKey": false, + "notNull": true, + "default": 0 + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true + } + }, + "indexes": {}, + "foreignKeys": { + "runs_task_metrics_id_taskMetrics_id_fk": { + "name": "runs_task_metrics_id_taskMetrics_id_fk", + "tableFrom": "runs", + "tableTo": "taskMetrics", + "columnsFrom": ["task_metrics_id"], + "columnsTo": ["id"], + "onDelete": "no action", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.taskMetrics": { + "name": "taskMetrics", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "integer", + "primaryKey": true, + "notNull": true, + "identity": { + "type": "always", + "name": "taskMetrics_id_seq", + "schema": "public", + "increment": "1", + "startWith": "1", + "minValue": "1", + "maxValue": "2147483647", + "cache": "1", + "cycle": false + } + }, + "tokens_in": { + "name": "tokens_in", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "tokens_out": { + "name": "tokens_out", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "tokens_context": { + "name": "tokens_context", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "cache_writes": { + "name": "cache_writes", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "cache_reads": { + "name": "cache_reads", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "cost": { + "name": "cost", + "type": "real", + "primaryKey": false, + "notNull": true + }, + "duration": { + "name": "duration", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "tool_usage": { + "name": "tool_usage", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.tasks": { + "name": "tasks", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "integer", + "primaryKey": true, + "notNull": true, + "identity": { + "type": "always", + "name": "tasks_id_seq", + "schema": "public", + "increment": "1", + "startWith": "1", + "minValue": "1", + "maxValue": "2147483647", + "cache": "1", + "cycle": false + } + }, + "run_id": { + "name": "run_id", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "task_metrics_id": { + "name": "task_metrics_id", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "language": { + "name": "language", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "exercise": { + "name": "exercise", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "passed": { + "name": "passed", + "type": "boolean", + "primaryKey": false, + "notNull": false + }, + "started_at": { + "name": "started_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "finished_at": { + "name": "finished_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true + } + }, + "indexes": { + "tasks_language_exercise_idx": { + "name": "tasks_language_exercise_idx", + "columns": [ + { + "expression": "run_id", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "language", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "exercise", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": true, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "tasks_run_id_runs_id_fk": { + "name": "tasks_run_id_runs_id_fk", + "tableFrom": "tasks", + "tableTo": "runs", + "columnsFrom": ["run_id"], + "columnsTo": ["id"], + "onDelete": "no action", + "onUpdate": "no action" + }, + "tasks_task_metrics_id_taskMetrics_id_fk": { + "name": "tasks_task_metrics_id_taskMetrics_id_fk", + "tableFrom": "tasks", + "tableTo": "taskMetrics", + "columnsFrom": ["task_metrics_id"], + "columnsTo": ["id"], + "onDelete": "no action", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.toolErrors": { + "name": "toolErrors", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "integer", + "primaryKey": true, + "notNull": true, + "identity": { + "type": "always", + "name": "toolErrors_id_seq", + "schema": "public", + "increment": "1", + "startWith": "1", + "minValue": "1", + "maxValue": "2147483647", + "cache": "1", + "cycle": false + } + }, + "run_id": { + "name": "run_id", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "task_id": { + "name": "task_id", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "tool_name": { + "name": "tool_name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "error": { + "name": "error", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true + } + }, + "indexes": {}, + "foreignKeys": { + "toolErrors_run_id_runs_id_fk": { + "name": "toolErrors_run_id_runs_id_fk", + "tableFrom": "toolErrors", + "tableTo": "runs", + "columnsFrom": ["run_id"], + "columnsTo": ["id"], + "onDelete": "no action", + "onUpdate": "no action" + }, + "toolErrors_task_id_tasks_id_fk": { + "name": "toolErrors_task_id_tasks_id_fk", + "tableFrom": "toolErrors", + "tableTo": "tasks", + "columnsFrom": ["task_id"], + "columnsTo": ["id"], + "onDelete": "no action", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + } + }, + "enums": {}, + "schemas": {}, + "sequences": {}, + "roles": {}, + "policies": {}, + "views": {}, + "_meta": { + "columns": {}, + "schemas": {}, + "tables": {} + } +} diff --git a/packages/evals/src/db/migrations/meta/_journal.json b/packages/evals/src/db/migrations/meta/_journal.json index 38543557f39..9be55aecb8a 100644 --- a/packages/evals/src/db/migrations/meta/_journal.json +++ b/packages/evals/src/db/migrations/meta/_journal.json @@ -22,6 +22,13 @@ "when": 1757191027855, "tag": "0002_bouncy_blazing_skull", "breakpoints": true + }, + { + "idx": 3, + "version": "7", + "when": 1763797232454, + "tag": "0003_simple_retro_girl", + "breakpoints": true } ] } diff --git a/packages/evals/src/db/schema.ts b/packages/evals/src/db/schema.ts index 66588c792c3..d8d4c3ea0a6 100644 --- a/packages/evals/src/db/schema.ts +++ b/packages/evals/src/db/schema.ts @@ -21,6 +21,7 @@ export const runs = pgTable("runs", { cacheWritesPrice: real(), cacheReadsPrice: real(), settings: jsonb().$type(), + jobToken: text(), pid: integer(), socketPath: text("socket_path").notNull(), concurrency: integer().default(2).notNull(), diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 92719c266bd..d8476240608 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -131,6 +131,9 @@ importers: '@radix-ui/react-alert-dialog': specifier: ^1.1.7 version: 1.1.13(@types/react-dom@18.3.7(@types/react@18.3.23))(@types/react@18.3.23)(react-dom@18.3.1(react@18.3.1))(react@18.3.1) + '@radix-ui/react-checkbox': + specifier: ^1.1.5 + version: 1.3.1(@types/react-dom@18.3.7(@types/react@18.3.23))(@types/react@18.3.23)(react-dom@18.3.1(react@18.3.1))(react@18.3.1) '@radix-ui/react-dialog': specifier: ^1.1.6 version: 1.1.13(@types/react-dom@18.3.7(@types/react@18.3.23))(@types/react@18.3.23)(react-dom@18.3.1(react@18.3.1))(react@18.3.1)