Support references in eval:flow (#63)

ssbushi · web-flow · commit c0aa9babbb07 · 2024-05-08T20:05:23.000Z
* Support references

* Feedback

* Feedback
diff --git a/genkit-tools/cli/src/cli.ts b/genkit-tools/cli/src/cli.ts
@@ -25,7 +25,7 @@ import * as clc from 'colorette';
 import { Command, program } from 'commander';
 import { config } from './commands/config';
 import { evalExtractData } from './commands/eval-extract-data';
-import { evalFlowRun } from './commands/eval-flow-run';
+import { evalFlow } from './commands/eval-flow';
 import { evalRun } from './commands/eval-run';
 import { flowBatchRun } from './commands/flow-batch-run';
 import { flowResume } from './commands/flow-resume';
@@ -48,7 +48,7 @@ const commands: Command[] = [
   flowResume,
   evalExtractData,
   evalRun,
-  evalFlowRun,
+  evalFlow,
   init,
   config,
 ];
diff --git a/genkit-tools/cli/src/commands/eval-flow.ts b/genkit-tools/cli/src/commands/eval-flow.ts
@@ -21,6 +21,8 @@ import {
 } from '@genkit-ai/tools-common';
 import {
   EvalExporter,
+  EvalFlowInput,
+  EvalFlowInputSchema,
   enrichResultsWithScoring,
   extractMetricsMetadata,
   getEvalStore,
@@ -50,8 +52,10 @@ interface EvalFlowRunOptions {
   outputFormat: string;
 }
 
+const EVAL_FLOW_SCHEMA = '{samples: Array<{input: any; reference?: any;}>}';
+
 /** Command to run a flow and evaluate the output */
-export const evalFlowRun = new Command('eval:flow')
+export const evalFlow = new Command('eval:flow')
   .argument('<flowName>', 'Name of the flow to run')
   .argument('[data]', 'JSON data to use to start the flow')
   .option('--input <filename>', 'JSON batch data to use to run the flow')
@@ -138,7 +142,12 @@ export const evalFlowRun = new Command('eval:flow')
           return;
         }
 
-        const evalDataset = await fetchDataSet(runner, flowName, states);
+        const evalDataset = await fetchDataSet(
+          runner,
+          flowName,
+          states,
+          parsedData
+        );
         const evalRunId = randomUUID();
         const scores: Record<string, any> = {};
         for (const action of filteredEvaluatorActions) {
@@ -178,25 +187,37 @@ export const evalFlowRun = new Command('eval:flow')
     }
   );
 
-async function readInputs(data: string, filePath: string): Promise<any[]> {
+async function readInputs(
+  data: string,
+  filePath: string
+): Promise<EvalFlowInput> {
   const parsedData = JSON.parse(
     data ? data : await readFile(filePath!, 'utf8')
   );
   if (Array.isArray(parsedData)) {
     return parsedData as any[];
   }
 
-  return [parsedData];
+  try {
+    return EvalFlowInputSchema.parse(parsedData);
+  } catch (e) {
+    throw new Error(
+      `Error parsing the input. Please provide an array of inputs for the flow or a ${EVAL_FLOW_SCHEMA} object. Error: ${e}`
+    );
+  }
 }
 
 async function runFlows(
   runner: Runner,
   flowName: string,
-  data: any[]
+  data: EvalFlowInput
 ): Promise<FlowState[]> {
   const states: FlowState[] = [];
+  let inputs: any[] = Array.isArray(data)
+    ? (data as any[])
+    : data.samples.map((c) => c.input);
 
-  for (const d of data) {
+  for (const d of inputs) {
     logger.info(`Running '/flow/${flowName}' ...`);
     let state = (
       await runner.runAction({
@@ -227,11 +248,23 @@ async function runFlows(
 async function fetchDataSet(
   runner: Runner,
   flowName: string,
-  states: FlowState[]
+  states: FlowState[],
+  parsedData: EvalFlowInput
 ): Promise<EvalInput[]> {
+  let references: any[] | undefined = undefined;
+  if (!Array.isArray(parsedData)) {
+    const maybeReferences = parsedData.samples.map((c: any) => c.reference);
+    if (maybeReferences.length === states.length) {
+      references = maybeReferences;
+    } else {
+      logger.warn(
+        'The input size does not match the flow states generated. Ignoring reference mapping...'
+      );
+    }
+  }
   const extractors = await getEvalExtractors(flowName);
   return await Promise.all(
-    states.map(async (s) => {
+    states.map(async (s, i) => {
       const traceIds = s.executions.flatMap((e) => e.traceIds);
       if (traceIds.length > 1) {
         logger.warn('The flow is split across multiple traces');
@@ -264,6 +297,7 @@ async function fetchDataSet(
         input: inputs[0],
         output: outputs[0],
         context: contexts,
+        reference: references?.at(i),
         traceIds,
       };
     })
diff --git a/genkit-tools/cli/tests/commands/eval-flow_test.ts b/genkit-tools/cli/tests/commands/eval-flow_test.ts
@@ -15,10 +15,10 @@
  */
 
 import { describe, expect, it } from '@jest/globals';
-import { evalFlowRun } from '../../src/commands/eval-flow-run';
+import { evalFlow } from '../../src/commands/eval-flow';
 
 describe('eval:flow', () => {
-  const command = evalFlowRun.exitOverride().configureOutput({
+  const command = evalFlow.exitOverride().configureOutput({
     writeOut: () => {},
     writeErr: () => {},
   });
diff --git a/genkit-tools/common/src/eval/index.ts b/genkit-tools/common/src/eval/index.ts
@@ -16,6 +16,7 @@
 
 import { EvalStore } from '../types/eval';
 import { LocalFileEvalStore } from './localFileEvalStore';
+export { EvalFlowInput, EvalFlowInputSchema } from '../types/eval';
 export * from './exporter';
 export * from './parser';
 
diff --git a/genkit-tools/common/src/types/eval.ts b/genkit-tools/common/src/types/eval.ts
@@ -21,6 +21,33 @@ import { ListEvalKeysRequest, ListEvalKeysResponse } from './apis';
  * This file defines schema and types that are used by the Eval store.
  */
 
+/**
+ * Structured input for eval:flow
+ */
+export const EvalFlowStructuredInputSchema = z.object({
+  samples: z.array(
+    z.object({
+      input: z.any(),
+      reference: z.any().optional(),
+    })
+  ),
+});
+export type EvalFlowStructuredInput = z.infer<
+  typeof EvalFlowStructuredInputSchema
+>;
+
+/**
+ * A dataset that is ready for eval:flow.
+ *
+ * This is either a plain array of input objects for the target flow, or a
+ * JSON object matching EvalFlowStructuredInputSchema (supports references).
+ */
+export const EvalFlowInputSchema = z.union([
+  z.array(z.any()),
+  EvalFlowStructuredInputSchema,
+]);
+export type EvalFlowInput = z.infer<typeof EvalFlowInputSchema>;
+
 /**
  * A record that is ready for evaluation.
  *
diff --git a/js/samples/cat-eval/eval/cat_adoption_qna.json b/js/samples/cat-eval/eval/cat_adoption_qna.json
@@ -0,0 +1,20 @@
+{
+  "samples": [
+    {
+      "input": "What are typical cat behaviors?",
+      "reference": "Cats like to purr, push things away and cuddle."
+    },
+    {
+      "input": "What supplies do you need when bringing home a new cat?",
+      "reference": "Litter box, cat food and plenty of yarn"
+    },
+    {
+      "input": "How often should you trim your cat's nails?",
+      "reference": "Trim your cat's nails only when you feel like they're overgrown"
+    },
+    {
+      "input": "What are some plants that are toxic to cats?",
+      "reference": "I don't know, maybe poison ivy?"
+    }
+  ]
+}