Skip to content

Commit c0aa9ba

Browse files
authored
Support references in eval:flow (#63)
* Support references * Feedback * Feedback
1 parent 456e1e3 commit c0aa9ba

File tree

6 files changed

+94
-12
lines changed

6 files changed

+94
-12
lines changed

genkit-tools/cli/src/cli.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ import * as clc from 'colorette';
2525
import { Command, program } from 'commander';
2626
import { config } from './commands/config';
2727
import { evalExtractData } from './commands/eval-extract-data';
28-
import { evalFlowRun } from './commands/eval-flow-run';
28+
import { evalFlow } from './commands/eval-flow';
2929
import { evalRun } from './commands/eval-run';
3030
import { flowBatchRun } from './commands/flow-batch-run';
3131
import { flowResume } from './commands/flow-resume';
@@ -48,7 +48,7 @@ const commands: Command[] = [
4848
flowResume,
4949
evalExtractData,
5050
evalRun,
51-
evalFlowRun,
51+
evalFlow,
5252
init,
5353
config,
5454
];

genkit-tools/cli/src/commands/eval-flow-run.ts renamed to genkit-tools/cli/src/commands/eval-flow.ts

Lines changed: 42 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ import {
2121
} from '@genkit-ai/tools-common';
2222
import {
2323
EvalExporter,
24+
EvalFlowInput,
25+
EvalFlowInputSchema,
2426
enrichResultsWithScoring,
2527
extractMetricsMetadata,
2628
getEvalStore,
@@ -50,8 +52,10 @@ interface EvalFlowRunOptions {
5052
outputFormat: string;
5153
}
5254

55+
const EVAL_FLOW_SCHEMA = '{samples: Array<{input: any; reference?: any;}>}';
56+
5357
/** Command to run a flow and evaluate the output */
54-
export const evalFlowRun = new Command('eval:flow')
58+
export const evalFlow = new Command('eval:flow')
5559
.argument('<flowName>', 'Name of the flow to run')
5660
.argument('[data]', 'JSON data to use to start the flow')
5761
.option('--input <filename>', 'JSON batch data to use to run the flow')
@@ -138,7 +142,12 @@ export const evalFlowRun = new Command('eval:flow')
138142
return;
139143
}
140144

141-
const evalDataset = await fetchDataSet(runner, flowName, states);
145+
const evalDataset = await fetchDataSet(
146+
runner,
147+
flowName,
148+
states,
149+
parsedData
150+
);
142151
const evalRunId = randomUUID();
143152
const scores: Record<string, any> = {};
144153
for (const action of filteredEvaluatorActions) {
@@ -178,25 +187,37 @@ export const evalFlowRun = new Command('eval:flow')
178187
}
179188
);
180189

181-
async function readInputs(data: string, filePath: string): Promise<any[]> {
190+
async function readInputs(
191+
data: string,
192+
filePath: string
193+
): Promise<EvalFlowInput> {
182194
const parsedData = JSON.parse(
183195
data ? data : await readFile(filePath!, 'utf8')
184196
);
185197
if (Array.isArray(parsedData)) {
186198
return parsedData as any[];
187199
}
188200

189-
return [parsedData];
201+
try {
202+
return EvalFlowInputSchema.parse(parsedData);
203+
} catch (e) {
204+
throw new Error(
205+
`Error parsing the input. Please provide an array of inputs for the flow or a ${EVAL_FLOW_SCHEMA} object. Error: ${e}`
206+
);
207+
}
190208
}
191209

192210
async function runFlows(
193211
runner: Runner,
194212
flowName: string,
195-
data: any[]
213+
data: EvalFlowInput
196214
): Promise<FlowState[]> {
197215
const states: FlowState[] = [];
216+
let inputs: any[] = Array.isArray(data)
217+
? (data as any[])
218+
: data.samples.map((c) => c.input);
198219

199-
for (const d of data) {
220+
for (const d of inputs) {
200221
logger.info(`Running '/flow/${flowName}' ...`);
201222
let state = (
202223
await runner.runAction({
@@ -227,11 +248,23 @@ async function runFlows(
227248
async function fetchDataSet(
228249
runner: Runner,
229250
flowName: string,
230-
states: FlowState[]
251+
states: FlowState[],
252+
parsedData: EvalFlowInput
231253
): Promise<EvalInput[]> {
254+
let references: any[] | undefined = undefined;
255+
if (!Array.isArray(parsedData)) {
256+
const maybeReferences = parsedData.samples.map((c: any) => c.reference);
257+
if (maybeReferences.length === states.length) {
258+
references = maybeReferences;
259+
} else {
260+
logger.warn(
261+
'The input size does not match the flow states generated. Ignoring reference mapping...'
262+
);
263+
}
264+
}
232265
const extractors = await getEvalExtractors(flowName);
233266
return await Promise.all(
234-
states.map(async (s) => {
267+
states.map(async (s, i) => {
235268
const traceIds = s.executions.flatMap((e) => e.traceIds);
236269
if (traceIds.length > 1) {
237270
logger.warn('The flow is split across multiple traces');
@@ -264,6 +297,7 @@ async function fetchDataSet(
264297
input: inputs[0],
265298
output: outputs[0],
266299
context: contexts,
300+
reference: references?.at(i),
267301
traceIds,
268302
};
269303
})

genkit-tools/cli/tests/commands/eval-flow-run_test.ts renamed to genkit-tools/cli/tests/commands/eval-flow_test.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@
1515
*/
1616

1717
import { describe, expect, it } from '@jest/globals';
18-
import { evalFlowRun } from '../../src/commands/eval-flow-run';
18+
import { evalFlow } from '../../src/commands/eval-flow';
1919

2020
describe('eval:flow', () => {
21-
const command = evalFlowRun.exitOverride().configureOutput({
21+
const command = evalFlow.exitOverride().configureOutput({
2222
writeOut: () => {},
2323
writeErr: () => {},
2424
});

genkit-tools/common/src/eval/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
import { EvalStore } from '../types/eval';
1818
import { LocalFileEvalStore } from './localFileEvalStore';
19+
export { EvalFlowInput, EvalFlowInputSchema } from '../types/eval';
1920
export * from './exporter';
2021
export * from './parser';
2122

genkit-tools/common/src/types/eval.ts

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,33 @@ import { ListEvalKeysRequest, ListEvalKeysResponse } from './apis';
2121
* This file defines schema and types that are used by the Eval store.
2222
*/
2323

24+
/**
25+
* Structured input for eval:flow
26+
*/
27+
export const EvalFlowStructuredInputSchema = z.object({
28+
samples: z.array(
29+
z.object({
30+
input: z.any(),
31+
reference: z.any().optional(),
32+
})
33+
),
34+
});
35+
export type EvalFlowStructuredInput = z.infer<
36+
typeof EvalFlowStructuredInputSchema
37+
>;
38+
39+
/**
40+
* A dataset that is ready for eval:flow.
41+
*
42+
* This could be an array of input objects to the target flow, or
43+
* a JSON object of the form {samples: [{input, reference?}]}, which adds support for references.
44+
*/
45+
export const EvalFlowInputSchema = z.union([
46+
z.array(z.any()),
47+
EvalFlowStructuredInputSchema,
48+
]);
49+
export type EvalFlowInput = z.infer<typeof EvalFlowInputSchema>;
50+
2451
/**
2552
* A record that is ready for evaluation.
2653
*
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
{
2+
"samples": [
3+
{
4+
"input": "What are typical cat behaviors?",
5+
"reference": "Cats like to purr, push things away and cuddle."
6+
},
7+
{
8+
"input": "What supplies do you need when bringing home a new cat?",
9+
"reference": "Litter box, cat food and plenty of yarn"
10+
},
11+
{
12+
"input": "How often should you trim your cat's nails?",
13+
"reference": "Trim your cat's nails only when you feel like they're overgrown"
14+
},
15+
{
16+
"input": "What are some plants that are toxic to cats?",
17+
"reference": "I don't know, maybe poison ivy?"
18+
}
19+
]
20+
}

0 commit comments

Comments
 (0)