diff --git a/genkit-tools/cli/src/cli.ts b/genkit-tools/cli/src/cli.ts index 431e250c5d..e1a93472b9 100644 --- a/genkit-tools/cli/src/cli.ts +++ b/genkit-tools/cli/src/cli.ts @@ -25,7 +25,7 @@ import * as clc from 'colorette'; import { Command, program } from 'commander'; import { config } from './commands/config'; import { evalExtractData } from './commands/eval-extract-data'; -import { evalFlowRun } from './commands/eval-flow-run'; +import { evalFlow } from './commands/eval-flow'; import { evalRun } from './commands/eval-run'; import { flowBatchRun } from './commands/flow-batch-run'; import { flowResume } from './commands/flow-resume'; @@ -48,7 +48,7 @@ const commands: Command[] = [ flowResume, evalExtractData, evalRun, - evalFlowRun, + evalFlow, init, config, ]; diff --git a/genkit-tools/cli/src/commands/eval-flow-run.ts b/genkit-tools/cli/src/commands/eval-flow.ts similarity index 85% rename from genkit-tools/cli/src/commands/eval-flow-run.ts rename to genkit-tools/cli/src/commands/eval-flow.ts index 74d8e49af2..551b97a3ae 100644 --- a/genkit-tools/cli/src/commands/eval-flow-run.ts +++ b/genkit-tools/cli/src/commands/eval-flow.ts @@ -21,6 +21,8 @@ import { } from '@genkit-ai/tools-common'; import { EvalExporter, + EvalFlowInput, + EvalFlowInputSchema, enrichResultsWithScoring, extractMetricsMetadata, getEvalStore, @@ -50,8 +52,10 @@ interface EvalFlowRunOptions { outputFormat: string; } +const EVAL_FLOW_SCHEMA = '{samples: Array<{input: any; reference?: any;}>}'; + /** Command to run a flow and evaluate the output */ -export const evalFlowRun = new Command('eval:flow') +export const evalFlow = new Command('eval:flow') .argument('', 'Name of the flow to run') .argument('[data]', 'JSON data to use to start the flow') .option('--input ', 'JSON batch data to use to run the flow') @@ -138,7 +142,12 @@ export const evalFlowRun = new Command('eval:flow') return; } - const evalDataset = await fetchDataSet(runner, flowName, states); + const evalDataset = await 
fetchDataSet( + runner, + flowName, + states, + parsedData + ); const evalRunId = randomUUID(); const scores: Record = {}; for (const action of filteredEvaluatorActions) { @@ -178,7 +187,10 @@ export const evalFlowRun = new Command('eval:flow') } ); -async function readInputs(data: string, filePath: string): Promise { +async function readInputs( + data: string, + filePath: string +): Promise { const parsedData = JSON.parse( data ? data : await readFile(filePath!, 'utf8') ); @@ -186,17 +198,26 @@ async function readInputs(data: string, filePath: string): Promise { return parsedData as any[]; } - return [parsedData]; + try { + return EvalFlowInputSchema.parse(parsedData); + } catch (e) { + throw new Error( + `Error parsing the input. Please provide an array of inputs for the flow or a ${EVAL_FLOW_SCHEMA} object. Error: ${e}` + ); + } } async function runFlows( runner: Runner, flowName: string, - data: any[] + data: EvalFlowInput ): Promise { const states: FlowState[] = []; + let inputs: any[] = Array.isArray(data) + ? (data as any[]) + : data.samples.map((c) => c.input); - for (const d of data) { + for (const d of inputs) { logger.info(`Running '/flow/${flowName}' ...`); let state = ( await runner.runAction({ @@ -227,11 +248,23 @@ async function runFlows( async function fetchDataSet( runner: Runner, flowName: string, - states: FlowState[] + states: FlowState[], + parsedData: EvalFlowInput ): Promise { + let references: any[] | undefined = undefined; + if (!Array.isArray(parsedData)) { + const maybeReferences = parsedData.samples.map((c: any) => c.reference); + if (maybeReferences.length === states.length) { + references = maybeReferences; + } else { + logger.warn( + 'The input size does not match the flow states generated. Ignoring reference mapping...' 
+ ); + } + } const extractors = await getEvalExtractors(flowName); return await Promise.all( - states.map(async (s) => { + states.map(async (s, i) => { const traceIds = s.executions.flatMap((e) => e.traceIds); if (traceIds.length > 1) { logger.warn('The flow is split across multiple traces'); @@ -264,6 +297,7 @@ async function fetchDataSet( input: inputs[0], output: outputs[0], context: contexts, + reference: references?.at(i), traceIds, }; }) diff --git a/genkit-tools/cli/tests/commands/eval-flow-run_test.ts b/genkit-tools/cli/tests/commands/eval-flow_test.ts similarity index 87% rename from genkit-tools/cli/tests/commands/eval-flow-run_test.ts rename to genkit-tools/cli/tests/commands/eval-flow_test.ts index ddf5edab41..96eff5d1ed 100644 --- a/genkit-tools/cli/tests/commands/eval-flow-run_test.ts +++ b/genkit-tools/cli/tests/commands/eval-flow_test.ts @@ -15,10 +15,10 @@ */ import { describe, expect, it } from '@jest/globals'; -import { evalFlowRun } from '../../src/commands/eval-flow-run'; +import { evalFlow } from '../../src/commands/eval-flow'; describe('eval:flow', () => { - const command = evalFlowRun.exitOverride().configureOutput({ + const command = evalFlow.exitOverride().configureOutput({ writeOut: () => {}, writeErr: () => {}, }); diff --git a/genkit-tools/common/src/eval/index.ts b/genkit-tools/common/src/eval/index.ts index cf3f0e0000..28f2374b9b 100644 --- a/genkit-tools/common/src/eval/index.ts +++ b/genkit-tools/common/src/eval/index.ts @@ -16,6 +16,7 @@ import { EvalStore } from '../types/eval'; import { LocalFileEvalStore } from './localFileEvalStore'; +export { EvalFlowInput, EvalFlowInputSchema } from '../types/eval'; export * from './exporter'; export * from './parser'; diff --git a/genkit-tools/common/src/types/eval.ts b/genkit-tools/common/src/types/eval.ts index 239cf6f50e..4cff861792 100644 --- a/genkit-tools/common/src/types/eval.ts +++ b/genkit-tools/common/src/types/eval.ts @@ -21,6 +21,33 @@ import { ListEvalKeysRequest, 
ListEvalKeysResponse } from './apis'; * This file defines schema and types that are used by the Eval store. */ +/** + * Structured input for eval:flow + */ +export const EvalFlowStructuredInputSchema = z.object({ + samples: z.array( + z.object({ + input: z.any(), + reference: z.any().optional(), + }) + ), +}); +export type EvalFlowStructuredInput = z.infer< + typeof EvalFlowStructuredInputSchema +>; + +/** + * A dataset that is ready for eval:flow. + * + * This is either a plain array of input objects for the target flow, or + * a JSON object of the form {samples: [...]}, with support for references. + */ +export const EvalFlowInputSchema = z.union([ + z.array(z.any()), + EvalFlowStructuredInputSchema, +]); +export type EvalFlowInput = z.infer<typeof EvalFlowInputSchema>; + /** * A record that is ready for evaluation. * diff --git a/js/samples/cat-eval/eval/cat_adoption_qna.json b/js/samples/cat-eval/eval/cat_adoption_qna.json new file mode 100644 index 0000000000..36f93b965f --- /dev/null +++ b/js/samples/cat-eval/eval/cat_adoption_qna.json @@ -0,0 +1,20 @@ +{ + "samples": [ + { + "input": "What are typical cat behaviors?", + "reference": "Cats like to purr, push things away and cuddle." + }, + { + "input": "What supplies do you need when bringing home a new cat?", + "reference": "Litter box, cat food and plenty of yarn" + }, + { + "input": "How often should you trim your cat's nails?", + "reference": "Trim your cat's nails only when you feel like they're overgrown" + }, + { + "input": "What are some plants that are toxic to cats?", + "reference": "I don't know, maybe poison ivy?" + } + ] +}