feat: add support for js functions as scorer (#230)

empirical-run · May 22, 2024 · f61545c · f61545c
1 parent 5cae1df
commit f61545c
Show file tree

Hide file tree

Showing 30 changed files with 4,233 additions and 175 deletions.
diff --git a/.changeset/thin-phones-destroy.md b/.changeset/thin-phones-destroy.md
@@ -0,0 +1,8 @@
+---
+"empiricalrun": minor
+"@empiricalrun/scorer": minor
+"@empiricalrun/types": minor
+"web": patch
+---
+
+feat: add support for js/ts scripts as scorers
diff --git a/apps/web/components/run-config-view.tsx b/apps/web/components/run-config-view.tsx
@@ -16,7 +16,6 @@ enum RunConfigTab {
 const defaultTabMap: Record<RunConfigType, RunConfigTab> = {
   model: RunConfigTab.prompt,
   "py-script": RunConfigTab.parameters,
-  "js-script": RunConfigTab.parameters,
   assistant: RunConfigTab.instructions,
 };
 

diff --git a/apps/web/components/sample-output-card.tsx b/apps/web/components/sample-output-card.tsx
@@ -58,18 +58,12 @@ export default function SampleOutputCard({
   const showCompareAgainst = useMemo(
     () =>
       !baseSample?.output.tool_calls?.length &&
-      (baseSample?.expected?.value ||
-        comparisonSamples?.some(
-          (comparisonSample, index) =>
-            comparisonSample?.output &&
-            comparisonResults?.[index]?.id !== baseResult?.id,
-        )),
-    [
-      baseResult?.id,
-      baseSample?.expected?.value,
-      comparisonResults,
-      comparisonSamples,
-    ],
+      comparisonSamples?.some(
+        (comparisonSample, index) =>
+          comparisonSample?.output &&
+          comparisonResults?.[index]?.id !== baseResult?.id,
+      ),
+    [baseResult?.id, comparisonResults, comparisonSamples],
   );
 
   const clearDiffView = useCallback(() => {
@@ -163,21 +157,6 @@ export default function SampleOutputCard({
                         >
                           <span className="text-xs">none</span>
                         </DropdownMenuCheckboxItem>
-                        {baseSample?.expected?.value && (
-                          <DropdownMenuCheckboxItem
-                            checked={
-                              diffView.enabled && diffView.type === "expected"
-                            }
-                            onCheckedChange={() => {
-                              enableDiffView({
-                                type: "expected",
-                                text: baseSample?.expected?.value || "",
-                              });
-                            }}
-                          >
-                            <span className="text-xs">expected</span>
-                          </DropdownMenuCheckboxItem>
-                        )}
                         {comparisonSamples?.map((s, i) => {
                           const result = comparisonResults?.[i];
                           if (result?.id === baseResult.id) {

diff --git a/examples/spider-using-ts/.gitignore b/examples/spider-using-ts/.gitignore
@@ -0,0 +1,4 @@
+
+# Ignore outputs from Empirical
+.empiricalrun
+db_files
diff --git a/examples/spider-using-ts/README.md b/examples/spider-using-ts/README.md
@@ -0,0 +1,38 @@
+# Scoring Text-to-SQL outputs using Typescript
+
+LLMs are good at converting natural language questions to SQL queries. This example uses that
+scenario to demo Empirical. 
+This example is based on the [Spider](https://github.com/taoyds/spider) dataset and uses a TypeScript config to score the outputs.
+
+In this example, we generate SQL queries, and score them on
+
+1. SQL syntax (with the `sql-syntax` scorer): Checks if the output syntax is valid SQL. For example, if the output is in
+   markdown syntax (with backticks), it is not a valid SQL query.
+2. Execution accuracy: We run the generated SQL query against a test database, and check
+   if the query returns a result. This scorer cleans query outputs that have backticks
+   ([see code](./empiricalrc.ts)).
+
+This example requires TypeScript.
+
+## Usage
+
+1. Install npm dependencies
+  ```sh
+  npm i
+  ```
+
+1. Review the `empiricalrc.ts` configuration, and make changes if any. The current configuration runs models
+   from OpenAI and Llama, and thus requires the [relevant environment variables](https://docs.empirical.run/models/basic).
+  ```sh
+  cat empiricalrc.ts
+  ```
+
+1. Run with Empirical
+  ```sh
+  npx empiricalrun
+  ```
+
+1. See results on the Empirical web reporter
+  ```sh
+  npx empiricalrun ui
+  ```
diff --git a/examples/spider-using-ts/empiricalrc.ts b/examples/spider-using-ts/empiricalrc.ts
@@ -0,0 +1,62 @@
+import { Config, loadDataset, JSScriptScorer } from "empiricalrun";
+import { executeQuery, getConnection, getSchema } from "./src/db";
+
+async function datasetLoader() { // Loads the dataset via loadDataset and adds a `schema` input to every sample.
+  let dataset = await loadDataset({
+    path: "https://docs.google.com/spreadsheets/d/1x_p0lX2pJEyGkFoe1A9nY3q87qOJUd547f2lz99ugiM/edit#gid=1000015421"
+  })
+  dataset.samples = dataset.samples.map(sample => { // each sample object is mutated in place and then returned
+    // get DB schema for the mentioned database name
+    sample.inputs.schema = getSchema(sample.inputs.database_name)
+    return sample;
+  })
+  return dataset // the loaded dataset object, with samples now carrying inputs.schema
+}
+
+async function execAccuracy ({ output, inputs }) { // Scorer: executes output.value as a query on the database named by inputs.database_name.
+  let score = 0;
+  let message: string;
+  try {
+    const dbName = inputs.database_name;
+    const con = await getConnection(dbName);
+    const res = await executeQuery(con, output.value!);
+    const [firstRow] = res; // only the first element of the query result is inspected
+    score = firstRow ? 1 : 0.5; // 1 when a first row exists; 0.5 when the query ran without throwing but firstRow is falsy
+    message = firstRow ? "Result preview: " + firstRow.join(", "): "No results found"
+  } catch (e) { // any error thrown inside the try block yields score 0, with the stringified error as the message
+    score = 0;
+    message = String(e);
+  }
+  return { // result object: numeric score plus a human-readable message
+    score,
+    message
+  };
+}
+
+const config: Config = { // Run configuration: two model runs, a function-based dataset, and two scorers.
+  runs: [
+    {
+      provider: "openai",
+      type: "model",
+      model: "gpt-3.5-turbo",
+      prompt:
+        "You are an SQLite expert who can convert natural language questions to SQL queries for the database schema given below.\n\nDatabase schema:\n{{schema}}\n\nAnswer the following question with only the SQL query.\n\nQuestion: {{question}}",
+    },
+    { // second run uses the same prompt text as the first, with a different provider and model
+      type: "model",
+      provider: "fireworks",
+      model: "llama-v3-8b-instruct",
+      prompt:
+        "You are an SQLite expert who can convert natural language questions to SQL queries for the database schema given below.\n\nDatabase schema:\n{{schema}}\n\nAnswer the following question with only the SQL query.\n\nQuestion: {{question}}",
+    }
+  ],
+  dataset: datasetLoader, // the dataset is supplied as a loader function rather than inline data
+  scorers: [
+    {
+      type: "sql-syntax",
+    },
+    execAccuracy // the function scorer defined earlier in this file
+  ]
+};
+
+export default config;