docs: update evals docs #1394

Merged: 4 commits, Nov 27, 2024
Changes from 2 commits
26 changes: 26 additions & 0 deletions docs/evaluation.md
@@ -49,13 +49,39 @@ use to generate output for evaluation.
["Cheese", "Broccoli", "Spinach and Kale"]
```

If the evaluator requires a reference output for evaluating a flow, you can pass both
input and reference output using this format instead:

```json
{
"samples": [
{
"input": "What is the French word for Cheese?",
"reference": "Fromage"
},
{
"input": "What green vegetable looks like cauliflower?",
"reference": "Broccoli"
}
]
}
```

Note that you can use any JSON data type in the input JSON file. Genkit passes these values to your flow with the same data types.
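
For example, a dataset whose inputs are objects and whose references are arrays might look like the following (the field names here are purely illustrative; the shape of `input` is determined by your flow's input schema):

```json
{
  "samples": [
    {
      "input": { "theme": "Italian", "maxItems": 3 },
      "reference": ["Margherita Pizza", "Tiramisu", "Espresso"]
    }
  ]
}
```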

You can then use the `eval:flow` command to evaluate your flow against the test
cases provided in `testInputs.json`.

```posix-terminal
genkit eval:flow menuSuggestionFlow --input testInputs.json
```

If your flow requires auth, you may specify it using the `--auth` argument:

```posix-terminal
genkit eval:flow menuSuggestionFlow --input testInputs.json --auth "{\"email_verified\": true}"
```

You can then see evaluation results in the Developer UI by running:

```posix-terminal
Expand Down
78 changes: 62 additions & 16 deletions docs/plugin-authoring-evaluator.md
@@ -6,6 +6,8 @@ Firebase Genkit can be extended to support custom evaluation of test case output

Evaluators are functions that assess the content given to and generated by an LLM. There are two main approaches to automated evaluation (testing): heuristic assessment and LLM-based assessment. In the heuristic approach, you define a deterministic function, like those used in traditional software development. In an LLM-based assessment, the content is fed back to an LLM, and the LLM is asked to score the output according to criteria set in a prompt.

Regardless of the approach you take, you use the `ai.defineEvaluator` method to define an evaluator action in Genkit. This document walks through a couple of examples of how to use this method.
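
For orientation, here is a minimal sketch of the pattern. The metric name and scoring logic are placeholders, and it assumes `ai` is your configured Genkit instance and that `BaseEvalDataPoint` is available from Genkit's evaluator module; complete examples follow in the sections below.

```ts
import { BaseEvalDataPoint } from 'genkit/evaluator';

const responseLengthEvaluator = ai.defineEvaluator(
  {
    name: 'myAwesomeEval/responseLength',
    displayName: 'Response Length',
    definition: 'Scores the character length of the flow output.',
  },
  async (datapoint: BaseEvalDataPoint) => {
    // Compute a score for this single datapoint and return it as an EvalResponse.
    const length = String(datapoint.output ?? '').length;
    return {
      testCaseId: datapoint.testCaseId,
      evaluation: { score: length },
    };
  }
);
```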

### LLM-based Evaluators

An LLM-based evaluator leverages an LLM to evaluate the input, context, or output of your generative AI feature.
@@ -87,7 +89,7 @@ export async function deliciousnessScore<
});

// Call the LLM to generate an evaluation result
const response = await generate({
const response = await ai.generate({
model: judgeLlm,
prompt: finalPrompt,
config: judgeConfig,
@@ -128,6 +130,7 @@ export function createDeliciousnessEvaluator<
name: `myAwesomeEval/deliciousness`,
displayName: 'Deliciousness',
definition: 'Determines if output is considered delicious.',
isBilled: true,
},
async (datapoint: BaseEvalDataPoint) => {
const score = await deliciousnessScore(judge, datapoint, judgeConfig);
@@ -140,6 +143,47 @@
}
```

The `defineEvaluator` method is similar to other Genkit constructors like `defineFlow` and `defineRetriever`. You provide an `EvaluatorFn` callback to `defineEvaluator`. The `EvaluatorFn` accepts a `BaseEvalDataPoint`, which corresponds to a single entry in the dataset under evaluation, along with an optional custom-options parameter if one is specified. The function should process the datapoint and return an `EvalResponse` object.

Here are the Zod Schemas for `BaseEvalDataPoint` and `EvalResponse`:

```ts
export const BaseEvalDataPoint = z.object({
testCaseId: z.string(),
input: z.unknown(),
output: z.unknown().optional(),
context: z.array(z.unknown()).optional(),
reference: z.unknown().optional(),
traceIds: z.array(z.string()).optional(),
});

export const EvalResponse = z.object({
sampleIndex: z.number().optional(),
testCaseId: z.string(),
traceId: z.string().optional(),
spanId: z.string().optional(),
evaluation: z.union([ScoreSchema, z.array(ScoreSchema)]),
});
```

where `ScoreSchema` is defined as:

```ts
const ScoreSchema = z.object({
id: z.string().describe('Optional ID to differentiate multiple scores').optional(),
score: z.union([z.number(), z.string(), z.boolean()]).optional(),
error: z.string().optional(),
details: z
.object({
reasoning: z.string().optional(),
})
.passthrough()
.optional(),
});
```
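
As an illustration, a single scored datapoint might produce an `EvalResponse` shaped like this (the values are made up for the example):

```ts
const exampleResponse = {
  testCaseId: 'example-case-01',
  evaluation: {
    id: 'DELICIOUSNESS',
    score: 'yes',
    details: { reasoning: 'The menu items sound rich and flavorful.' },
  },
};
```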

`defineEvaluator` lets the user provide a name, a user-readable display name, and a definition for the evaluator. The display name and definition are shown in evaluation runs in the Dev UI. It also has an optional `isBilled` option, which marks whether the evaluator may result in billing (for example, if it uses a billed LLM or API). If an evaluator is billed, the user is prompted for confirmation in the CLI before they can run an evaluation, to help guard against unintended expenses.

### Heuristic Evaluators

A heuristic evaluator can be any function used to evaluate the input, context, or output of your generative AI feature.
@@ -178,6 +222,16 @@ export async function usPhoneRegexScore(
details: { reasoning },
};
}

/**
* Create an EvalResponse from an individual scored datapoint.
*/
function fillScores(dataPoint: BaseEvalDataPoint, score: Score): EvalResponse {
return {
testCaseId: dataPoint.testCaseId,
evaluation: score,
};
}
```

#### Define the evaluator action
@@ -198,11 +252,11 @@ export function createUSPhoneRegexEvaluator(
name: `myAwesomeEval/${metric.name.toLocaleLowerCase()}`,
displayName: 'Regex Match',
definition:
'Runs the output against a regex and responds with 1 if a match is found and 0 otherwise.',
'Runs the output against a regex and responds with true if a match is found and false otherwise.',
isBilled: false,
},
async (datapoint: BaseEvalDataPoint) => {
const score = await regexMatchScore(datapoint, regexMetric.regex);
const score = await usPhoneRegexScore(datapoint);
return fillScores(datapoint, score);
}
);
@@ -232,11 +286,6 @@ export interface PluginOptions {
If this new plugin uses an LLM as a judge and the plugin supports swapping out which LLM to use, define additional parameters in the `PluginOptions` object.

```ts
export enum MyAwesomeMetric {
DELICIOUSNESS = 'DELICIOUSNESS',
US_PHONE_REGEX_MATCH = 'US_PHONE_REGEX_MATCH',
}

export interface PluginOptions<ModelCustomOptions extends z.ZodTypeAny> {
judge: ModelReference<ModelCustomOptions>;
judgeConfig?: z.infer<ModelCustomOptions>;
@@ -350,14 +399,11 @@ These examples can be human generated or you can ask an LLM to help create a set

Then use the Genkit CLI to run the evaluator against these test cases.

```bash
genkit eval:run deliciousness_dataset.json
```

View your results in the Genkit UI.
```posix-terminal
# Start your genkit runtime
genkit start -- <command to start your app>

```bash
genkit start
genkit eval:run deliciousness_dataset.json
```

Navigate to `localhost:4000/evaluate`.
Navigate to `localhost:4000/evaluate` to view your results in the Genkit UI.