feat: ability to add global scorers (#178)

empirical-run · Apr 26, 2024 · db945c2 · db945c2
1 parent 93f995b
commit db945c2
Show file tree

Hide file tree

Showing 9 changed files with 78 additions and 96 deletions.
diff --git a/.changeset/yellow-trees-pull.md b/.changeset/yellow-trees-pull.md
@@ -0,0 +1,5 @@
+---
+"@empiricalrun/cli": minor
+---
+
+feat: ability to add global scorers
diff --git a/docs/quickstart.mdx b/docs/quickstart.mdx
@@ -80,12 +80,7 @@ Our test will succeed if the model outputs valid JSON.
           "type": "model",
           "provider": "openai",
           "model": "gpt-3.5-turbo",
-          "prompt": "Extract the name, age and location from the message, and respond with a JSON object. If an entity is missing, respond with null.\n\nMessage: {{user_message}}",
-          "scorers": [
-            {
-              "type": "is-json"
-            }
-          ]
+          "prompt": "Extract the name, age and location from the message, and respond with a JSON object. If an entity is missing, respond with null.\n\nMessage: {{user_message}}"
         },
         {
           "type": "model",
@@ -96,12 +91,7 @@ Our test will succeed if the model outputs valid JSON.
               "type": "json_object"
             }
           },
-          "prompt": "Extract the name, age and location from the message, and respond with a JSON object. If an entity is missing, respond with null.\n\nMessage: {{user_message}}",
-          "scorers": [
-            {
-              "type": "is-json"
-            }
-          ]
+          "prompt": "Extract the name, age and location from the message, and respond with a JSON object. If an entity is missing, respond with null.\n\nMessage: {{user_message}}"
         }
       ],
       "dataset": {
@@ -117,7 +107,12 @@ Our test will succeed if the model outputs valid JSON.
             }
           }
         ]
-      }
+      },
+      "scorers": [
+        {
+          "type": "is-json"
+        }
+      ]
     }
     ```
     </Accordion>

diff --git a/docs/scoring/basics.mdx b/docs/scoring/basics.mdx
@@ -12,11 +12,6 @@ as you like.
 
 ```json empiricalrc.json
 {
-    "type": "model",
-    "name": "gpt-3.5-turbo run",
-    "provider": "openai",
-    "model": "gpt-3.5-turbo",
-    "prompt": "Always respond with a JSON object.",
     "scorers": [
         {
             "type": "is-json"

diff --git a/examples/basic/empiricalrc.json b/examples/basic/empiricalrc.json
@@ -5,12 +5,7 @@
       "type": "model",
       "provider": "openai",
       "model": "gpt-3.5-turbo",
-      "prompt": "Extract the name, age and location from the message, and respond with a JSON object. If an entity is missing, respond with null.\n\nMessage: {{user_message}}",
-      "scorers": [
-        {
-          "type": "is-json"
-        }
-      ]
+      "prompt": "Extract the name, age and location from the message, and respond with a JSON object. If an entity is missing, respond with null.\n\nMessage: {{user_message}}"
     },
     {
       "type": "model",
@@ -21,12 +16,7 @@
         "response_format": {
           "type": "json_object"
         }
-      },
-      "scorers": [
-        {
-          "type": "is-json"
-        }
-      ]
+      }
     }
   ],
   "dataset": {
@@ -42,5 +32,10 @@
         }
       }
     ]
-  }
+  },
+  "scorers": [
+    {
+      "type": "is-json"
+    }
+  ]
 }
diff --git a/examples/humaneval/empiricalrc.json b/examples/humaneval/empiricalrc.json
@@ -8,17 +8,17 @@
       "prompt": "Complete the following python function. Return only the completed function so that it can be directly run on a Python shell, including imports like from typing import List.\n```python\n{{prompt}}\n```",
       "parameters": {
         "temperature": 0.1
-      },
-      "scorers": [
-        {
-          "type": "py-script",
-          "path": "score.py",
-          "name": "unit-tests"
-        }
-      ]
+      }
     }
   ],
   "dataset": {
     "path": "HumanEval.jsonl"
-  }
+  },
+  "scorers": [
+    {
+      "type": "py-script",
+      "path": "score.py",
+      "name": "unit-tests"
+    }
+  ]
 }
diff --git a/examples/rag/empiricalrc.json b/examples/rag/empiricalrc.json
@@ -6,29 +6,23 @@
       "path": "rag.py",
       "parameters": {
         "model": "gpt-3.5-turbo"
-      },
-      "scorers": [
-        {
-          "type": "py-script",
-          "path": "score.py"
-        }
-      ]
+      }
     },
     {
       "type": "py-script",
       "path": "rag.py",
       "parameters": {
         "model": "gpt-4-turbo-preview"
-      },
-      "scorers": [
-        {
-          "type": "py-script",
-          "path": "score.py"
-        }
-      ]
+      }
     }
   ],
   "dataset": {
     "path": ".empiricalrun/dataset.jsonl"
-  }
+  },
+  "scorers": [
+    {
+      "type": "py-script",
+      "path": "score.py"
+    }
+  ]
 }
diff --git a/examples/spider/empiricalrc.json b/examples/spider/empiricalrc.json
@@ -14,15 +14,6 @@
                     "role": "user",
                     "content": "Question: {{question}} \n\nAnswer the above question with only the SQL query."
                 }
-            ],
-            "scorers": [
-                {
-                    "type": "sql-syntax"
-                },
-                {
-                    "type": "py-script",
-                    "path": "execution_accuracy.py"
-                }
             ]
         },
         {
@@ -38,15 +29,6 @@
                     "role": "user",
                     "content": "Question: {{question}} \n\nAnswer the above question with only the SQL query."
                 }
-            ],
-            "scorers": [
-                {
-                    "type": "sql-syntax"
-                },
-                {
-                    "type": "py-script",
-                    "path": "execution_accuracy.py"
-                }
             ]
         },
         {
@@ -62,19 +44,19 @@
                     "role": "user",
                     "content": "Question: {{question}} \n\nAnswer the above question with only the SQL query."
                 }
-            ],
-            "scorers": [
-                {
-                    "type": "sql-syntax"
-                },
-                {
-                    "type": "py-script",
-                    "path": "execution_accuracy.py"
-                }
             ]
         }
     ],
     "dataset": {
         "path": "https://docs.google.com/spreadsheets/d/1x_p0lX2pJEyGkFoe1A9nY3q87qOJUd547f2lz99ugiM/edit#gid=0"
-    }
+    },
+    "scorers": [
+        {
+            "type": "sql-syntax"
+        },
+        {
+            "type": "py-script",
+            "path": "execution_accuracy.py"
+        }
+    ]
 }
diff --git a/packages/cli/src/bin/index.ts b/packages/cli/src/bin/index.ts
@@ -49,6 +49,31 @@ const cacheDir = ".empiricalrun";
 const outputFilePath = `${cwd}/${cacheDir}/${outputFileName}`;
 const runtimeOptionsPath = `${cwd}/${cacheDir}/runtime.json`;
 
+const readConfig = async (): Promise<RunsConfig> => { // reads the config file; runs without scorers get the top-level ones
+  let data: string; // raw text of the config file
+  try {
+    data = (await fs.readFile(configFileFullPath)).toString();
+    console.log(buildSuccessLog(`read ${configFileName} file successfully`));
+  } catch (err) {
+    console.log(buildErrorLog(`Failed to read ${configFileName} file`));
+    console.log(yellow("Please ensure running init command first"));
+    process.exit(1); // exit with status 1 when the file cannot be read
+  }
+  const { runs, dataset, scorers } = JSON.parse(data) as RunsConfig; // cast only; the parsed shape is not validated
+  // NOTE(review): JSON.parse sits outside the try/catch above, so malformed JSON throws from readConfig.
+  runs.forEach((r) => {
+    // A run's own `scorers` win (even an empty array, which is truthy); the top-level `scorers` are only a fallback.
+    if (!r.scorers && scorers) {
+      r.scorers = scorers; // the same array reference is assigned to every such run (not a copy)
+    }
+  });
+
+  return { // top-level `scorers` and `$schema` are not part of the returned object
+    runs,
+    dataset,
+  };
+};
+
 program
   .name("Empirical.run CLI")
   .description(
@@ -90,19 +115,9 @@ program
     dotenv.config({ path: runTimeOptions.envFilePath });
     console.log(yellow("Initiating run..."));
 
-    let data;
     const startTime = performance.now();
-    try {
-      data = await fs.readFile(configFileFullPath);
-    } catch (err) {
-      console.log(buildErrorLog(`Failed to read ${configFileName} file`));
-      console.log(yellow("Please ensure running init command first"));
-      process.exit(1);
-    }
+    const { runs, dataset: datasetConfig } = await readConfig();
 
-    console.log(buildSuccessLog(`read ${configFileName} file successfully`));
-    const jsonStr = data.toString();
-    const { runs, dataset: datasetConfig } = JSON.parse(jsonStr) as RunsConfig;
     // TODO: add check here for empty runs config. Add validator of the file
     let dataset: Dataset;
     const store = new EmpiricalStore();

diff --git a/packages/cli/src/types/index.ts b/packages/cli/src/types/index.ts
@@ -1,7 +1,8 @@
-import { RunConfig, DatasetConfig } from "@empiricalrun/types";
+import { RunConfig, DatasetConfig, Scorer } from "@empiricalrun/types";
 
 export type RunsConfig = {
+  $schema?: string; // optional; presumably the JSON schema reference of the config file — TODO confirm
   runs: RunConfig[];
   dataset: DatasetConfig;
-  $schema?: string;
+  scorers?: Scorer[]; // optional top-level scorers; readConfig (packages/cli/src/bin/index.ts) assigns them to runs that define none
 };