-
Notifications
You must be signed in to change notification settings - Fork 12
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add support for js functions as scorer (#230)
- Loading branch information
1 parent
5cae1df
commit f61545c
Showing
30 changed files
with
4,233 additions
and
175 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
--- | ||
"empiricalrun": minor | ||
"@empiricalrun/scorer": minor | ||
"@empiricalrun/types": minor | ||
"web": patch | ||
--- | ||
|
||
feat: add support for js/ts scripts as scorers |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
|
||
# Ignore outputs from Empirical | ||
.empiricalrun | ||
db_files |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
# Scoring Text-to-SQL outputs using Typescript | ||
|
||
LLMs are good at converting natural language questions to SQL queries. This example uses that | ||
scenario to demo Empirical. | ||
This example is based on the [Spider](https://github.com/taoyds/spider) dataset and uses a TypeScript config to score the outputs. | ||
|
||
In this example, we generate SQL queries, and score them on | ||
|
||
1. SQL syntax (with the `sql-syntax` scorer): Checks if the output syntax is valid SQL. For example, if the output is in | ||
markdown syntax (with backticks), it is not a valid SQL query. | ||
2. Execution accuracy: We run the generated SQL query against a test database, and check | ||
if the query returns a result. This scorer cleans query outputs that have backticks | ||
([see code](./empiricalrc.ts)). | ||
|
||
This example requires Typescript. | ||
|
||
## Usage | ||
|
||
1. Install npm dependencies | ||
```sh | ||
npm i | ||
``` | ||
|
||
1. Review the `empiricalrc.ts` configuration, and make changes if needed. The current configuration runs models | ||
from OpenAI and Fireworks (Llama 3), and thus requires the [relevant environment variables](https://docs.empirical.run/models/basic). | ||
```sh | ||
cat empiricalrc.ts | ||
``` | ||
|
||
1. Run with Empirical | ||
```sh | ||
npx empiricalrun | ||
``` | ||
|
||
1. See results on the Empirical web reporter | ||
```sh | ||
npx empiricalrun ui | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
import { Config, loadDataset, JSScriptScorer } from "empiricalrun"; | ||
import { executeQuery, getConnection, getSchema } from "./src/db"; | ||
|
||
/**
 * Loads the example dataset from the hosted spreadsheet and attaches a
 * database schema to every sample.
 *
 * Each sample is updated in place: `inputs.schema` is set to whatever
 * `getSchema` returns for the sample's `inputs.database_name`. The dataset's
 * `samples` property is then replaced with a new array holding those samples.
 *
 * @returns The loaded dataset with schema-enriched samples.
 */
async function datasetLoader() {
  const loaded = await loadDataset({
    path: "https://docs.google.com/spreadsheets/d/1x_p0lX2pJEyGkFoe1A9nY3q87qOJUd547f2lz99ugiM/edit#gid=1000015421",
  });

  const enriched: typeof loaded.samples = [];
  for (const entry of loaded.samples) {
    // Look up the schema for the database this sample refers to.
    entry.inputs.schema = getSchema(entry.inputs.database_name);
    enriched.push(entry);
  }
  loaded.samples = enriched;

  return loaded;
}
|
||
/**
 * Removes Markdown code formatting that a model may wrap around a SQL query.
 *
 * Handles two shapes:
 *  - a fenced block (three backticks, optional language tag, newline, body,
 *    three backticks), possibly surrounded by other text;
 *  - a query wrapped in the same number of backticks at both ends.
 *
 * Backticks elsewhere in the text are left untouched so that backtick-quoted
 * identifiers inside the query survive.
 *
 * @param raw Text produced by the model.
 * @returns The text with surrounding Markdown code formatting removed and
 *          outer whitespace trimmed.
 */
function stripMarkdownFences(raw: string): string {
  const text = raw.trim();

  // Fenced block: require a newline after the opening fence so an inline
  // query is not mistaken for a language tag.
  const fenced = /```[^\n`]*\n([\s\S]*?)```/.exec(text);
  if (fenced) {
    return (fenced[1] ?? "").trim();
  }

  // Inline code: only strip when the text both starts and ends with the same
  // run of backticks; a query that merely ends in a quoted identifier is kept.
  const inline = /^(`+)([\s\S]*?)\1$/.exec(text);
  if (inline) {
    return (inline[2] ?? "").trim();
  }

  return text;
}

/**
 * Scores a generated SQL query by executing it against the sample's database.
 *
 * The model output is first cleaned of Markdown code fences/backticks, then
 * run through `executeQuery` on the connection returned by `getConnection`
 * for `inputs.database_name`.
 *
 * Scoring:
 *  - 1   when the query returns at least one row (message previews the row);
 *  - 0.5 when the query runs but returns no rows;
 *  - 0   when the output is missing/empty or anything throws (message is the
 *        stringified error).
 *
 * @param output Model output; `output.value` is expected to hold the SQL text.
 * @param inputs Sample inputs; `inputs.database_name` selects the database.
 * @returns An object with the numeric `score` and an explanatory `message`.
 */
async function execAccuracy ({ output, inputs }) {
  let score = 0;
  let message: string;
  try {
    const rawQuery = output.value;
    if (typeof rawQuery !== "string" || rawQuery.trim() === "") {
      // Fail explicitly instead of handing an empty/undefined query to the DB.
      throw new Error("Model output is empty; no SQL query to execute");
    }
    const dbName = inputs.database_name;
    const con = await getConnection(dbName);
    const res = await executeQuery(con, stripMarkdownFences(rawQuery));
    // Only the first row matters for scoring; presumably each row is an
    // array of column values, since it is joined for the preview below.
    const [firstRow] = res;
    score = firstRow ? 1 : 0.5;
    message = firstRow ? "Result preview: " + firstRow.join(", ") : "No results found";
  } catch (e) {
    score = 0;
    message = String(e);
  }
  return {
    score,
    message
  };
}
|
||
// Prompt template shared by every model run below; `{{schema}}` and
// `{{question}}` are placeholders filled from each sample's inputs.
const textToSqlPrompt =
  "You are an SQLite expert who can convert natural language questions to SQL queries for the database schema given below.\n\nDatabase schema:\n{{schema}}\n\nAnswer the following question with only the SQL query.\n\nQuestion: {{question}}";

/**
 * Empirical run configuration: two model runs using the same prompt, the
 * dataset loaded by `datasetLoader`, and two scorers (the built-in
 * `sql-syntax` scorer plus the `execAccuracy` function).
 */
const config: Config = {
  runs: [
    {
      provider: "openai",
      type: "model",
      model: "gpt-3.5-turbo",
      prompt: textToSqlPrompt,
    },
    {
      type: "model",
      provider: "fireworks",
      model: "llama-v3-8b-instruct",
      prompt: textToSqlPrompt,
    },
  ],
  dataset: datasetLoader,
  scorers: [{ type: "sql-syntax" }, execAccuracy],
};
|
||
export default config; |
Oops, something went wrong.