Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions genkit-tools/cli/src/cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ import * as clc from 'colorette';
import { Command, program } from 'commander';
import { config } from './commands/config';
import { evalExtractData } from './commands/eval-extract-data';
import { evalFlowRun } from './commands/eval-flow-run';
import { evalFlow } from './commands/eval-flow';
import { evalRun } from './commands/eval-run';
import { flowBatchRun } from './commands/flow-batch-run';
import { flowResume } from './commands/flow-resume';
Expand All @@ -48,7 +48,7 @@ const commands: Command[] = [
flowResume,
evalExtractData,
evalRun,
evalFlowRun,
evalFlow,
init,
config,
];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ import {
} from '@genkit-ai/tools-common';
import {
EvalExporter,
EvalFlowInput,
EvalFlowInputSchema,
enrichResultsWithScoring,
extractMetricsMetadata,
getEvalStore,
Expand Down Expand Up @@ -50,8 +52,10 @@ interface EvalFlowRunOptions {
outputFormat: string;
}

/**
* Human-readable description of the structured input shape accepted by
* eval:flow. It is only interpolated into the error message raised when the
* provided input fails to parse; it is not used for validation itself.
*/
const EVAL_FLOW_SCHEMA = '{samples: Array<{input: any; reference?: any;}>}';

/** Command to run a flow and evaluate the output */
export const evalFlowRun = new Command('eval:flow')
export const evalFlow = new Command('eval:flow')
.argument('<flowName>', 'Name of the flow to run')
.argument('[data]', 'JSON data to use to start the flow')
.option('--input <filename>', 'JSON batch data to use to run the flow')
Expand Down Expand Up @@ -138,7 +142,12 @@ export const evalFlowRun = new Command('eval:flow')
return;
}

const evalDataset = await fetchDataSet(runner, flowName, states);
const evalDataset = await fetchDataSet(
runner,
flowName,
states,
parsedData
);
const evalRunId = randomUUID();
const scores: Record<string, any> = {};
for (const action of filteredEvaluatorActions) {
Expand Down Expand Up @@ -178,25 +187,37 @@ export const evalFlowRun = new Command('eval:flow')
}
);

async function readInputs(data: string, filePath: string): Promise<any[]> {
async function readInputs(
data: string,
filePath: string
): Promise<EvalFlowInput> {
const parsedData = JSON.parse(
data ? data : await readFile(filePath!, 'utf8')
);
if (Array.isArray(parsedData)) {
return parsedData as any[];
}

return [parsedData];
try {
return EvalFlowInputSchema.parse(parsedData);
} catch (e) {
throw new Error(
`Error parsing the input. Please provide an array of inputs for the flow or a ${EVAL_FLOW_SCHEMA} object. Error: ${e}`
);
}
}

async function runFlows(
runner: Runner,
flowName: string,
data: any[]
data: EvalFlowInput
): Promise<FlowState[]> {
const states: FlowState[] = [];
let inputs: any[] = Array.isArray(data)
? (data as any[])
: data.samples.map((c) => c.input);

for (const d of data) {
for (const d of inputs) {
logger.info(`Running '/flow/${flowName}' ...`);
let state = (
await runner.runAction({
Expand Down Expand Up @@ -227,11 +248,23 @@ async function runFlows(
async function fetchDataSet(
runner: Runner,
flowName: string,
states: FlowState[]
states: FlowState[],
parsedData: EvalFlowInput
): Promise<EvalInput[]> {
let references: any[] | undefined = undefined;
if (!Array.isArray(parsedData)) {
const maybeReferences = parsedData.samples.map((c: any) => c.reference);
if (maybeReferences.length === states.length) {
references = maybeReferences;
} else {
logger.warn(
'The input size does not match the flow states generated. Ignoring reference mapping...'
);
}
}
const extractors = await getEvalExtractors(flowName);
return await Promise.all(
states.map(async (s) => {
states.map(async (s, i) => {
const traceIds = s.executions.flatMap((e) => e.traceIds);
if (traceIds.length > 1) {
logger.warn('The flow is split across multiple traces');
Expand Down Expand Up @@ -264,6 +297,7 @@ async function fetchDataSet(
input: inputs[0],
output: outputs[0],
context: contexts,
reference: references?.at(i),
traceIds,
};
})
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@
*/

import { describe, expect, it } from '@jest/globals';
import { evalFlowRun } from '../../src/commands/eval-flow-run';
import { evalFlow } from '../../src/commands/eval-flow';

describe('eval:flow', () => {
const command = evalFlowRun.exitOverride().configureOutput({
const command = evalFlow.exitOverride().configureOutput({
writeOut: () => {},
writeErr: () => {},
});
Expand Down
1 change: 1 addition & 0 deletions genkit-tools/common/src/eval/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

import { EvalStore } from '../types/eval';
import { LocalFileEvalStore } from './localFileEvalStore';
export { EvalFlowInput, EvalFlowInputSchema } from '../types/eval';
export * from './exporter';
export * from './parser';

Expand Down
27 changes: 27 additions & 0 deletions genkit-tools/common/src/types/eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,33 @@ import { ListEvalKeysRequest, ListEvalKeysResponse } from './apis';
* This file defines schema and types that are used by the Eval store.
*/

/**
* Structured input for eval:flow.
*
* An object with a single `samples` array. Each sample holds the `input` to
* run the flow with and, optionally, a `reference` value associated with that
* input. Both fields are unconstrained (`z.any()`).
*/
export const EvalFlowStructuredInputSchema = z.object({
samples: z.array(
z.object({
input: z.any(),
reference: z.any().optional(),
})
),
});
/** Static type inferred from EvalFlowStructuredInputSchema. */
export type EvalFlowStructuredInput = z.infer<
typeof EvalFlowStructuredInputSchema
>;

/**
* A dataset that is ready for eval:flow.
*
* This is either a plain array of inputs for the target flow, or a
* structured object matching EvalFlowStructuredInputSchema, whose samples
* can also carry references.
*/
export const EvalFlowInputSchema = z.union([
z.array(z.any()),
EvalFlowStructuredInputSchema,
]);
/** Static type inferred from EvalFlowInputSchema. */
export type EvalFlowInput = z.infer<typeof EvalFlowInputSchema>;

/**
* A record that is ready for evaluation.
*
Expand Down
20 changes: 20 additions & 0 deletions js/samples/cat-eval/eval/cat_adoption_qna.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"samples": [
{
"input": "What are typical cat behaviors?",
"reference": "Cats like to purr, push things away and cuddle."
},
{
"input": "What supplies do you need when bringing home a new cat?",
"reference": "Litter box, cat food and plenty of yarn"
},
{
"input": "How often should you trim your cat's nails?",
"reference": "Trim your cat's nails only when you feel like they're overgrown"
},
{
"input": "What are some plants that are toxic to cats?",
"reference": "I don't know, maybe poison ivy?"
}
]
}