Skip to content

Commit

Permalink
feat: add support for js functions as scorer (#230)
Browse files Browse the repository at this point in the history
  • Loading branch information
saikatmitra91 authored May 22, 2024
1 parent 5cae1df commit f61545c
Show file tree
Hide file tree
Showing 30 changed files with 4,233 additions and 175 deletions.
8 changes: 8 additions & 0 deletions .changeset/thin-phones-destroy.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
"empiricalrun": minor
"@empiricalrun/scorer": minor
"@empiricalrun/types": minor
"web": patch
---

feat: add support for js/ts scripts as scorers
1 change: 0 additions & 1 deletion apps/web/components/run-config-view.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ enum RunConfigTab {
const defaultTabMap: Record<RunConfigType, RunConfigTab> = {
model: RunConfigTab.prompt,
"py-script": RunConfigTab.parameters,
"js-script": RunConfigTab.parameters,
assistant: RunConfigTab.instructions,
};

Expand Down
33 changes: 6 additions & 27 deletions apps/web/components/sample-output-card.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -58,18 +58,12 @@ export default function SampleOutputCard({
const showCompareAgainst = useMemo(
() =>
!baseSample?.output.tool_calls?.length &&
(baseSample?.expected?.value ||
comparisonSamples?.some(
(comparisonSample, index) =>
comparisonSample?.output &&
comparisonResults?.[index]?.id !== baseResult?.id,
)),
[
baseResult?.id,
baseSample?.expected?.value,
comparisonResults,
comparisonSamples,
],
comparisonSamples?.some(
(comparisonSample, index) =>
comparisonSample?.output &&
comparisonResults?.[index]?.id !== baseResult?.id,
),
[baseResult?.id, comparisonResults, comparisonSamples],
);

const clearDiffView = useCallback(() => {
Expand Down Expand Up @@ -163,21 +157,6 @@ export default function SampleOutputCard({
>
<span className="text-xs">none</span>
</DropdownMenuCheckboxItem>
{baseSample?.expected?.value && (
<DropdownMenuCheckboxItem
checked={
diffView.enabled && diffView.type === "expected"
}
onCheckedChange={() => {
enableDiffView({
type: "expected",
text: baseSample?.expected?.value || "",
});
}}
>
<span className="text-xs">expected</span>
</DropdownMenuCheckboxItem>
)}
{comparisonSamples?.map((s, i) => {
const result = comparisonResults?.[i];
if (result?.id === baseResult.id) {
Expand Down
4 changes: 4 additions & 0 deletions examples/spider-using-ts/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@

# Ignore outputs from Empirical
.empiricalrun
db_files
38 changes: 38 additions & 0 deletions examples/spider-using-ts/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Scoring Text-to-SQL outputs using Typescript

LLMs are good at converting natural language questions to SQL queries. This example uses that
scenario to demo Empirical.
This example is based on the [Spider](https://github.com/taoyds/spider) dataset and uses a TypeScript config to score the outputs.

In this example, we generate SQL queries, and score them on

1. SQL syntax (with the `sql-syntax` scorer): Checks if the output syntax is valid SQL. For example, if the output is in
markdown syntax (with backticks), it is not a valid SQL query.
2. Execution accuracy: We run the generated SQL query against a test database, and check
if the query returns a result. This scorer cleans query outputs that have backticks
([see code](./empiricalrc.ts)).

This example requires TypeScript.

## Usage

1. Install npm dependencies
```sh
npm i
```

1. Review the `empiricalrc.ts` configuration, and make changes if needed. The current configuration runs models
from OpenAI and Fireworks (Llama 3), and thus requires [relevant environment variables](https://docs.empirical.run/models/basic).
```sh
cat empiricalrc.ts
```

1. Run with Empirical
```sh
npx empiricalrun
```

1. See results on the Empirical web reporter
```sh
npx empiricalrun ui
```
62 changes: 62 additions & 0 deletions examples/spider-using-ts/empiricalrc.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import { Config, loadDataset, JSScriptScorer } from "empiricalrun";
import { executeQuery, getConnection, getSchema } from "./src/db";

/**
 * Loads the example dataset from its Google Sheets URL and attaches the
 * database schema to every sample.
 *
 * Each sample's `inputs.schema` is set to the result of
 * `getSchema(sample.inputs.database_name)`, so the prompt template can
 * reference `{{schema}}`.
 *
 * @returns The loaded dataset with schema-enriched samples.
 */
async function datasetLoader() {
  const loaded = await loadDataset({
    path: "https://docs.google.com/spreadsheets/d/1x_p0lX2pJEyGkFoe1A9nY3q87qOJUd547f2lz99ugiM/edit#gid=1000015421"
  });
  const enrichedSamples = [];
  for (const entry of loaded.samples) {
    // Look up the DB schema for the database this sample refers to.
    entry.inputs.schema = getSchema(entry.inputs.database_name);
    enrichedSamples.push(entry);
  }
  loaded.samples = enrichedSamples;
  return loaded;
}

/**
 * Extracts the query text from the first markdown code fence in `text`.
 *
 * An optional `sql`/`sqlite` language tag after the opening backticks is
 * dropped. When `text` contains no complete fence it is returned as-is
 * (trimmed).
 */
function stripCodeFence(text: string): string {
  const fenced = /```(?:sqlite|sql)?\s*([\s\S]*?)```/i.exec(text);
  return (fenced?.[1] ?? text).trim();
}

/**
 * Execution-accuracy scorer: runs the generated SQL against the sample's
 * database and scores it on whether it executes and returns rows.
 *
 * The model output is cleaned of markdown backticks before execution, so a
 * fenced answer is judged on the query itself rather than on its formatting.
 *
 * @param output - Model output; `output.value` holds the generated SQL text.
 * @param inputs - Sample inputs; `inputs.database_name` selects the database.
 * @returns `score` 1 with a preview of the first row when the query returns
 *   rows, 0.5 when it runs but returns nothing, and 0 when there is no query
 *   to run or execution throws (the error text becomes `message`).
 */
async function execAccuracy ({ output, inputs }) {
  const rawQuery = output?.value;
  // Guard instead of asserting non-null: an empty or missing output cannot be executed.
  if (typeof rawQuery !== "string" || rawQuery.trim() === "") {
    return {
      score: 0,
      message: "No SQL query found in the model output"
    };
  }
  try {
    const dbName = inputs.database_name;
    // NOTE(review): the connection is not released here; confirm whether
    // getConnection caches/reuses connections or whether a close call is needed.
    const con = await getConnection(dbName);
    const res = await executeQuery(con, stripCodeFence(rawQuery));
    const [firstRow] = res;
    if (firstRow) {
      return {
        score: 1,
        message: "Result preview: " + firstRow.join(", ")
      };
    }
    // The query ran without error but produced no rows: partial credit.
    return {
      score: 0.5,
      message: "No results found"
    };
  } catch (e) {
    // Any failure (connection, invalid SQL, unexpected row shape) scores zero.
    return {
      score: 0,
      message: String(e)
    };
  }
}

// Prompt shared by every model run; `{{schema}}` and `{{question}}` are
// placeholders that appear verbatim in the template text.
const textToSqlPrompt =
  "You are an SQLite expert who can convert natural language questions to SQL queries for the database schema given below.\n\nDatabase schema:\n{{schema}}\n\nAnswer the following question with only the SQL query.\n\nQuestion: {{question}}";

/**
 * Empirical run configuration for the Spider text-to-SQL example.
 *
 * Compares two model runs (OpenAI `gpt-3.5-turbo` and Fireworks
 * `llama-v3-8b-instruct`) using the same prompt, loads samples through
 * `datasetLoader`, and scores outputs with the `sql-syntax` scorer plus the
 * custom `execAccuracy` function.
 */
const config: Config = {
  runs: [
    {
      provider: "openai",
      type: "model",
      model: "gpt-3.5-turbo",
      prompt: textToSqlPrompt,
    },
    {
      type: "model",
      provider: "fireworks",
      model: "llama-v3-8b-instruct",
      prompt: textToSqlPrompt,
    },
  ],
  dataset: datasetLoader,
  scorers: [{ type: "sql-syntax" }, execAccuracy],
};

export default config;
Loading

0 comments on commit f61545c

Please sign in to comment.