docs: update evals docs (#1394)
---------

Co-authored-by: Chris Ray Gill <chgill@google.com>
ssbushi and chrisraygill authored Nov 27, 2024
1 parent 71b7bcb commit 6a2b296
Showing 2 changed files with 106 additions and 23 deletions.
37 changes: 37 additions & 0 deletions docs/evaluation.md
@@ -49,13 +49,39 @@ use to generate output for evaluation.
["Cheese", "Broccoli", "Spinach and Kale"]
```

If the evaluator requires a reference output for evaluating a flow, you can pass both
input and reference output using this format instead:

```json
{
"samples": [
{
"input": "What is the French word for Cheese?",
"reference": "Fromage"
},
{
"input": "What green vegetable looks like cauliflower?",
"reference": "Broccoli"
}
]
}
```

Note that you can use any JSON data type in the input JSON file. Genkit passes them to your flow with the same data types.
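
For example, if your flow accepted a structured object instead of a string (the field names here are purely illustrative), the input file could look like this:

```json
[
  { "dietaryRestrictions": ["vegetarian"], "maxResults": 3 },
  { "dietaryRestrictions": ["vegan", "nut-free"], "maxResults": 2 }
]
```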

You can then use the `eval:flow` command to evaluate your flow against the test
cases provided in `testInputs.json`.

```posix-terminal
genkit eval:flow menuSuggestionFlow --input testInputs.json
```

If your flow requires auth, you may specify it using the `--auth` argument:

```posix-terminal
genkit eval:flow menuSuggestionFlow --input testInputs.json --auth "{\"email_verified\": true}"
```

You can then see evaluation results in the Developer UI by running:

```posix-terminal
@@ -222,9 +248,18 @@ might be asking about it.
import { genkit, run, z } from "genkit";
import { googleAI, gemini15Flash } from "@genkit-ai/googleai";
import { chunk } from "llm-chunk";
import path from 'path';

const ai = genkit({ plugins: [googleAI()] });

const chunkingConfig = {
minLength: 1000, // minimum number of characters per chunk
maxLength: 2000, // maximum number of characters per chunk
splitter: 'sentence', // paragraph | sentence
overlap: 100, // number of overlapping characters
delimiters: '', // regex for base split method
} as any;

export const synthesizeQuestions = ai.defineFlow(
{
name: "synthesizeQuestions",
@@ -233,6 +268,8 @@ export const synthesizeQuestions = ai.defineFlow(
},
async (filePath) => {
filePath = path.resolve(filePath);
// `extractText` loads the PDF and extracts its contents as text.
// See our RAG documentation for more details.
const pdfTxt = await run("extract-text", () => extractText(filePath));

const chunks = await run("chunk-it", async () =>
92 changes: 69 additions & 23 deletions docs/plugin-authoring-evaluator.md
@@ -6,6 +6,8 @@ Firebase Genkit can be extended to support custom evaluation of test case output

Evaluators are functions that assess the content given to and generated by an LLM. There are two main approaches to automated evaluation (testing): heuristic assessment and LLM-based assessment. In the heuristic approach, you define a deterministic function like those of traditional software development. In an LLM-based assessment, the content is fed back to an LLM and the LLM is asked to score the output according to criteria set in a prompt.

Regardless of the approach you take, you need to use the `ai.defineEvaluator` method to define an evaluator action in Genkit. We will see a couple of examples of how to use this method in this document.

### LLM-based Evaluators

An LLM-based evaluator leverages an LLM to evaluate the input, context, or output of your generative AI feature.
@@ -88,7 +90,7 @@ export async function deliciousnessScore<
});

// Call the LLM to generate an evaluation result
const response = await ai.generate({
model: judgeLlm,
prompt: finalPrompt,
config: judgeConfig,
@@ -113,6 +115,7 @@ export async function deliciousnessScore<
The final step is to write a function that defines the evaluator action itself.

```ts
import { Genkit, ModelReference, z } from 'genkit';
import { BaseEvalDataPoint, EvaluatorAction } from 'genkit/evaluator';

/**
@@ -121,14 +124,16 @@ import { BaseEvalDataPoint, EvaluatorAction } from 'genkit/evaluator';
export function createDeliciousnessEvaluator<
ModelCustomOptions extends z.ZodTypeAny,
>(
ai: Genkit,
judge: ModelReference<ModelCustomOptions>,
judgeConfig: z.infer<ModelCustomOptions>
): EvaluatorAction {
return ai.defineEvaluator(
{
name: `myAwesomeEval/deliciousness`,
displayName: 'Deliciousness',
definition: 'Determines if output is considered delicious.',
isBilled: true,
},
async (datapoint: BaseEvalDataPoint) => {
const score = await deliciousnessScore(judge, datapoint, judgeConfig);
@@ -141,6 +146,47 @@ export function createDeliciousnessEvaluator<
}
```

The `defineEvaluator` method is similar to other Genkit constructors such as `defineFlow` and `defineRetriever`. The user provides an `EvaluatorFn` as the `defineEvaluator` callback. The `EvaluatorFn` accepts a `BaseEvalDataPoint`, which corresponds to a single entry in the dataset under evaluation, along with an optional custom-options parameter if one was specified. The function should process the datapoint and return an `EvalResponse` object.

Here are the Zod Schemas for `BaseEvalDataPoint` and `EvalResponse`:

```ts
export const BaseEvalDataPoint = z.object({
testCaseId: z.string(),
input: z.unknown(),
output: z.unknown().optional(),
context: z.array(z.unknown()).optional(),
reference: z.unknown().optional(),
traceIds: z.array(z.string()).optional(),
});

export const EvalResponse = z.object({
sampleIndex: z.number().optional(),
testCaseId: z.string(),
traceId: z.string().optional(),
spanId: z.string().optional(),
evaluation: z.union([ScoreSchema, z.array(ScoreSchema)]),
});
```
where `ScoreSchema` is defined as:

```ts
const ScoreSchema = z.object({
id: z.string().describe('Optional ID to differentiate multiple scores').optional(),
score: z.union([z.number(), z.string(), z.boolean()]).optional(),
error: z.string().optional(),
details: z
.object({
reasoning: z.string().optional(),
})
.passthrough()
.optional(),
});
```

`defineEvaluator` lets the user provide a name, a user-readable display name, and a definition for the evaluator. The display name and definition are displayed alongside evaluation runs in the Dev UI. It also has an optional `isBilled` option, which marks whether this evaluator may result in billing (e.g., if it uses a billed LLM or API). If an evaluator is billed, the user is prompted for confirmation in the CLI before they can run an evaluation, to help guard against unintended expenses.
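
To see how these pieces fit together, here is a minimal, hypothetical sketch of a self-contained evaluator (a simple exact-match check, separate from the deliciousness example above) whose configuration and return value follow the `BaseEvalDataPoint`, `ScoreSchema`, and `EvalResponse` shapes just described:

```ts
import { Genkit } from 'genkit';
import { BaseEvalDataPoint, EvaluatorAction } from 'genkit/evaluator';

/**
 * Hypothetical example: a minimal exact-match evaluator.
 */
export function createExactMatchEvaluator(ai: Genkit): EvaluatorAction {
  return ai.defineEvaluator(
    {
      name: 'myAwesomeEval/exact_match',
      displayName: 'Exact Match',
      definition: 'Checks whether the output is identical to the reference.',
      isBilled: false, // no LLM or paid API involved
    },
    async (datapoint: BaseEvalDataPoint) => {
      const matches =
        JSON.stringify(datapoint.output) === JSON.stringify(datapoint.reference);
      // Return an EvalResponse: a required testCaseId plus a Score.
      return {
        testCaseId: datapoint.testCaseId,
        evaluation: {
          score: matches,
          details: {
            reasoning: matches
              ? 'Output matches the reference exactly.'
              : 'Output differs from the reference.',
          },
        },
      };
    }
  );
}
```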

### Heuristic Evaluators

A heuristic evaluator can be any function used to evaluate the input, context, or output of your generative AI feature.
@@ -179,6 +225,16 @@ export async function usPhoneRegexScore(
details: { reasoning },
};
}

/**
* Create an EvalResponse from an individual scored datapoint.
*/
function fillScores(dataPoint: BaseEvalDataPoint, score: Score): EvalResponse {
return {
testCaseId: dataPoint.testCaseId,
evaluation: score,
};
}
```
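
Most of the body of `usPhoneRegexScore` is collapsed in the diff above. As a rough sketch only — the regex, error handling, and the `Score` import are illustrative assumptions, not necessarily what the sample code uses — a complete implementation could look like this:

```ts
import { BaseEvalDataPoint, Score } from 'genkit/evaluator';

// Illustrative pattern only; the real sample may use a different regex.
const US_PHONE_REGEX = /^[+]?[(]?[0-9]{3}[)]?[-\s.]?[0-9]{3}[-\s.]?[0-9]{4}$/;

/**
 * Scores whether an individual datapoint's output looks like a US phone number.
 */
export async function usPhoneRegexScore(
  dataPoint: BaseEvalDataPoint
): Promise<Score> {
  const output = dataPoint.output;
  if (typeof output !== 'string') {
    throw new Error('String output is required for regex matching.');
  }
  const matches = US_PHONE_REGEX.test(output);
  const reasoning = matches
    ? 'Output matched the US phone number regex.'
    : 'Output did not match the US phone number regex.';
  return {
    score: matches,
    details: { reasoning },
  };
}
```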

#### Define the evaluator action
@@ -199,11 +255,11 @@ export function createUSPhoneRegexEvaluator(
name: `myAwesomeEval/${metric.name.toLocaleLowerCase()}`,
displayName: 'Regex Match',
definition:
'Runs the output against a regex and responds with true if a match is found and false otherwise.',
isBilled: false,
},
async (datapoint: BaseEvalDataPoint) => {
const score = await usPhoneRegexScore(datapoint);
return fillScores(datapoint, score);
}
);
@@ -233,11 +289,6 @@ export interface PluginOptions {
If this new plugin uses an LLM as a judge and the plugin supports swapping out which LLM to use, define additional parameters in the `PluginOptions` object.

```ts
export enum MyAwesomeMetric {
DELICIOUSNESS = 'DELICIOUSNESS',
US_PHONE_REGEX_MATCH = 'US_PHONE_REGEX_MATCH',
}

export interface PluginOptions<ModelCustomOptions extends z.ZodTypeAny> {
judge: ModelReference<ModelCustomOptions>;
judgeConfig?: z.infer<ModelCustomOptions>;
@@ -252,11 +303,12 @@ Plugins are registered with the framework via the `genkit.config.ts` file in a p
In this case we have two evaluators `DELICIOUSNESS` and `US_PHONE_REGEX_MATCH`. This is where those evaluators are registered with the plugin and with Firebase Genkit.

```ts
import { GenkitPlugin, genkitPlugin } from 'genkit/plugin';

export function myAwesomeEval<ModelCustomOptions extends z.ZodTypeAny>(
options: PluginOptions<ModelCustomOptions>
): GenkitPlugin {
// Define the new plugin
return genkitPlugin(
'myAwesomeEval',
async (ai: Genkit) => {
Expand All @@ -272,10 +324,7 @@ export function myAwesomeEval<ModelCustomOptions extends z.ZodTypeAny>(
}
});
return { evaluators };
});
}
export default myAwesomeEval;
```
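
Once exported, the plugin is enabled like any other Genkit plugin. A hypothetical app configuration — the local import path and judge model are illustrative, and additional options may be needed depending on how you finish `PluginOptions` — might look like this:

```ts
import { genkit } from 'genkit';
import { googleAI, gemini15Flash } from '@genkit-ai/googleai';
import { myAwesomeEval } from './my-awesome-eval'; // hypothetical local module

const ai = genkit({
  plugins: [
    googleAI(),
    myAwesomeEval({
      judge: gemini15Flash, // model used by the LLM-based evaluator
    }),
  ],
});
```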
@@ -351,14 +400,11 @@ These examples can be human generated or you can ask an LLM to help create a set
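
For reference, a hypothetical `deliciousness_dataset.json` might look like the following. The field names mirror `BaseEvalDataPoint`; the exact fields your dataset needs depend on what your evaluator reads:

```json
[
  {
    "testCaseId": "delicious_mango",
    "input": "Suggest a dessert.",
    "output": "Fresh mango sticky rice with coconut cream."
  },
  {
    "testCaseId": "plain_rice_cake",
    "input": "Suggest a snack.",
    "output": "A plain, unsalted rice cake."
  }
]
```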

Then use the Genkit CLI to run the evaluator against these test cases.

```posix-terminal
# Start your genkit runtime
genkit start -- <command to start your app>
```

```posix-terminal
genkit eval:run deliciousness_dataset.json
```

Navigate to `localhost:4000/evaluate` to view your results in the Genkit UI.
